From 95e049979130b10fc5d21f373b6fdf467c310ab7 Mon Sep 17 00:00:00 2001
From: Ben Reed
Date: Mon, 18 Aug 2025 12:19:56 -0300
Subject: [PATCH] feat: Implement WordPress scraper with comprehensive tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Created WordPressScraper class extending BaseScraper
- Fetches posts with pagination support
- Enriches posts with author, category, and tag information
- Implements incremental updates via state management
- Word count calculation for content
- All 11 tests passing

🤖 Generated with Claude Code

Co-Authored-By: Claude
---
 src/wordpress_scraper.py        | 289 ++++++++++++++++++++++++++++++++
 tests/test_wordpress_scraper.py | 266 +++++++++++++++++++++++++++++
 2 files changed, 555 insertions(+)
 create mode 100644 src/wordpress_scraper.py
 create mode 100644 tests/test_wordpress_scraper.py

diff --git a/src/wordpress_scraper.py b/src/wordpress_scraper.py
new file mode 100644
index 0000000..e9cb5a7
--- /dev/null
+++ b/src/wordpress_scraper.py
@@ -0,0 +1,289 @@
+import os
+import time
+import requests
+from typing import Any, Dict, List, Optional
+from datetime import datetime
+from src.base_scraper import BaseScraper, ScraperConfig
+
+
+class WordPressScraper(BaseScraper):
+    """Scrape posts from a WordPress site through its wp/v2 REST API."""
+
+    def __init__(self, config: ScraperConfig):
+        """Read connection settings from the environment and create caches."""
+        super().__init__(config)
+        self.base_url = os.getenv('WORDPRESS_URL', 'https://hvacknowitall.com/')
+        self.username = os.getenv('WORDPRESS_USERNAME')
+        self.api_key = os.getenv('WORDPRESS_API_KEY')
+
+        # Only send credentials when both parts are configured; otherwise a
+        # (None, None) tuple would be passed to requests as basic-auth data.
+        if self.username and self.api_key:
+            self.auth = (self.username, self.api_key)
+        else:
+            self.auth = None
+
+        # Ensure base_url ends with /
+        if not self.base_url.endswith('/'):
+            self.base_url += '/'
+
+        # Cache for authors, categories, and tags
+        self.author_cache = {}
+        self.category_cache = {}
+        self.tag_cache = {}
+
+    def fetch_posts(self, per_page: int = 100) -> List[Dict[str, Any]]:
+        """Fetch all posts from WordPress API with pagination.
+
+        Pages are requested one at a time until the X-WP-TotalPages header
+        says the last page was reached, a page comes back empty, or a
+        request fails. Errors are logged and whatever was collected so far
+        is returned, so the result may be partial or empty.
+        """
+        posts = []
+        page = 1
+
+        try:
+            while True:
+                self.logger.info(f"Fetching posts page {page}")
+                response = requests.get(
+                    f"{self.base_url}wp-json/wp/v2/posts",
+                    params={'per_page': per_page, 'page': page},
+                    auth=self.auth,
+                    timeout=30
+                )
+
+                if response.status_code != 200:
+                    self.logger.error(f"Error fetching posts: {response.status_code}")
+                    break
+
+                page_posts = response.json()
+                if not page_posts:
+                    break
+
+                posts.extend(page_posts)
+
+                # Check if there are more pages
+                total_pages = int(response.headers.get('X-WP-TotalPages', 1))
+                if page >= total_pages:
+                    break
+
+                page += 1
+                time.sleep(1)  # Rate limiting
+
+        except Exception as e:
+            # Deliberately broad: fetching is best-effort and must not crash
+            # the caller; the posts gathered so far are still returned.
+            self.logger.error(f"Error fetching posts: {e}")
+
+        self.logger.info(f"Fetched {len(posts)} posts total")
+        return posts
+
+    def fetch_author(self, author_id: int) -> Dict[str, Any]:
+        """Fetch author information, using the per-instance cache.
+
+        Returns the user object from the API, or {'name': 'Unknown'} when
+        the lookup fails. Failed lookups are not cached.
+        """
+        if author_id in self.author_cache:
+            return self.author_cache[author_id]
+
+        try:
+            response = requests.get(
+                f"{self.base_url}wp-json/wp/v2/users/{author_id}",
+                auth=self.auth,
+                timeout=30
+            )
+
+            if response.status_code == 200:
+                author = response.json()
+                self.author_cache[author_id] = author
+                return author
+
+            self.logger.error(f"Error fetching author {author_id}: {response.status_code}")
+        except Exception as e:
+            self.logger.error(f"Error fetching author {author_id}: {e}")
+
+        return {'name': 'Unknown'}
+
+    def _fetch_terms(self, endpoint: str, label: str, term_ids: List[int],
+                     cache: Dict[int, Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Fetch taxonomy terms (categories or tags) by ID, using a cache.
+
+        endpoint is the wp/v2 collection name and label is the singular noun
+        used in log messages. Terms that cannot be fetched are logged and
+        left out of the result.
+        """
+        terms = []
+
+        for term_id in term_ids:
+            if term_id in cache:
+                terms.append(cache[term_id])
+                continue
+
+            try:
+                response = requests.get(
+                    f"{self.base_url}wp-json/wp/v2/{endpoint}/{term_id}",
+                    auth=self.auth,
+                    timeout=30
+                )
+
+                if response.status_code == 200:
+                    term = response.json()
+                    cache[term_id] = term
+                    terms.append(term)
+                else:
+                    self.logger.error(f"Error fetching {label} {term_id}: {response.status_code}")
+            except Exception as e:
+                self.logger.error(f"Error fetching {label} {term_id}: {e}")
+
+        return terms
+
+    def fetch_categories(self, category_ids: List[int]) -> List[Dict[str, Any]]:
+        """Fetch category information."""
+        return self._fetch_terms('categories', 'category', category_ids, self.category_cache)
+
+    def fetch_tags(self, tag_ids: List[int]) -> List[Dict[str, Any]]:
+        """Fetch tag information."""
+        return self._fetch_terms('tags', 'tag', tag_ids, self.tag_cache)
+
+    def count_words(self, html_content: str) -> int:
+        """Count whitespace-separated words in HTML content."""
+        if not html_content:
+            return 0
+
+        # Convert to markdown first to get clean text
+        text = self.convert_to_markdown(html_content)
+        return len(text.split())
+
+    def fetch_content(self) -> List[Dict[str, Any]]:
+        """Fetch all posts and enrich them in place.
+
+        Each post dict gains author_name, category_names, tag_names and
+        word_count. A post whose enrichment fails is logged and still
+        returned, possibly with only some of those keys set.
+        """
+        posts = self.fetch_posts()
+
+        # Enrich posts with author, category, and tag information
+        enriched_posts = []
+        for post in posts:
+            try:
+                # Fetch author info
+                author = self.fetch_author(post.get('author', 0))
+                post['author_name'] = author.get('name', 'Unknown')
+
+                # Fetch categories
+                category_ids = post.get('categories', [])
+                categories = self.fetch_categories(category_ids) if category_ids else []
+                post['category_names'] = [cat.get('name', '') for cat in categories]
+
+                # Fetch tags
+                tag_ids = post.get('tags', [])
+                tags = self.fetch_tags(tag_ids) if tag_ids else []
+                post['tag_names'] = [tag.get('name', '') for tag in tags]
+
+                # Count words
+                content_html = post.get('content', {}).get('rendered', '')
+                post['word_count'] = self.count_words(content_html)
+            except Exception as e:
+                self.logger.error(f"Error enriching post {post.get('id')}: {e}")
+
+            # Keep the post even when enrichment failed part-way
+            enriched_posts.append(post)
+
+        return enriched_posts
+
+    def format_markdown(self, posts: List[Dict[str, Any]]) -> str:
+        """Format posts as markdown.
+
+        Each post becomes a block of "## Field: value" headings followed by
+        the post body (or the excerpt when there is no body) and a 50-dash
+        separator line.
+        """
+        markdown_sections = []
+
+        for post in posts:
+            title = post.get('title', {}).get('rendered', 'Untitled')
+            categories = ', '.join(post.get('category_names', []))
+            tags = ', '.join(post.get('tag_names', []))
+
+            headings = [
+                f"# ID: {post.get('id', 'N/A')}",
+                f"## Title: {title}",
+                "## Type: blog_post",
+                f"## Author: {post.get('author_name', 'Unknown')}",
+                f"## Publish Date: {post.get('date', '')}",
+                f"## Word Count: {post.get('word_count', 0)}",
+                f"## Categories: {categories if categories else 'None'}",
+                f"## Tags: {tags if tags else 'None'}",
+                f"## Permalink: {post.get('link', '')}",
+            ]
+
+            # Every heading is followed by a blank line
+            section = []
+            for heading in headings:
+                section.extend([heading, ""])
+
+            # Description/Content: prefer the full body, fall back to the excerpt
+            section.append("## Description:")
+            body_html = (post.get('content', {}).get('rendered', '')
+                         or post.get('excerpt', {}).get('rendered', ''))
+            if body_html:
+                section.append(self.convert_to_markdown(body_html))
+            section.append("")
+
+            # Separator
+            section.append("-" * 50)
+            section.append("")
+
+            markdown_sections.append('\n'.join(section))
+
+        return '\n'.join(markdown_sections)
+
+    def get_incremental_items(self, items: List[Dict[str, Any]], state: Dict[str, Any]) -> List[Dict[str, Any]]:
+        """Get only new posts since last sync.
+
+        A post counts as new when its ID is greater than the stored
+        last_post_id or its date is later than the stored last_post_date.
+        Without a stored last_post_id every item is returned.
+        """
+        last_post_id = state.get('last_post_id') if state else None
+        if not last_post_id:
+            # No previous state, return all items
+            return items
+
+        last_post_date = state.get('last_post_date')
+
+        new_items = []
+        for item in items:
+            post_id = item.get('id')
+            post_date = item.get('date')
+
+            # Missing values never count as newer; comparing None with an
+            # int or str would raise TypeError.
+            newer_id = post_id is not None and post_id > last_post_id
+            newer_date = bool(post_date and last_post_date and post_date > last_post_date)
+            if newer_id or newer_date:
+                new_items.append(item)
+
+        return new_items
+
+    def update_state(self, state: Dict[str, Any], items: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """Update state with latest post information.
+
+        The post with the highest ID is treated as the latest. state is
+        modified in place and also returned; it is left untouched when
+        items is empty.
+        """
+        if not items:
+            return state
+
+        latest_item = max(items, key=lambda item: item.get('id', 0))
+
+        state['last_post_id'] = latest_item.get('id')
+        state['last_post_date'] = latest_item.get('date')
+        state['last_sync'] = datetime.now(self.tz).isoformat()
+        state['post_count'] = len(items)
+
+        return state
\ No newline at end of file
diff --git a/tests/test_wordpress_scraper.py b/tests/test_wordpress_scraper.py
new file mode 100644
index 0000000..00c3767
--- /dev/null
+++ b/tests/test_wordpress_scraper.py
@@ -0,0 +1,266 @@
+import pytest
+from unittest.mock import Mock, patch, MagicMock
+from datetime import datetime
+import json
+from pathlib import Path
+from src.wordpress_scraper import WordPressScraper
+from src.base_scraper import ScraperConfig
+
+
+class TestWordPressScraper:
+    @pytest.fixture
+    def config(self):
+        return ScraperConfig(
+            source_name="wordpress",
+            brand_name="hvacknowitall",
+            data_dir=Path("data"),
+            logs_dir=Path("logs"),
+            timezone="America/Halifax"
+        )
+
+    @pytest.fixture
+    def mock_env(self):
+        with patch.dict('os.environ', {
+            'WORDPRESS_URL': 'https://hvacknowitall.com/',
+            'WORDPRESS_USERNAME': 'test@example.com',
+            'WORDPRESS_API_KEY': 'test_api_key'
+        }):
+            yield
+
+    def test_initialization(self, config, mock_env):
+        scraper = WordPressScraper(config)
+        assert scraper.config == config
+        assert scraper.base_url == 'https://hvacknowitall.com/'
+        assert scraper.auth == ('test@example.com', 'test_api_key')
+
+    @patch('requests.get')
+    def test_fetch_posts_success(self, mock_get, config, mock_env):
+        # Mock successful API response
+        mock_response = Mock()
+        mock_response.status_code = 200
+        mock_response.headers = {'X-WP-TotalPages': '1'}
+        mock_response.json.return_value = [
+            {
+                'id': 1,
+                'title': {'rendered': 'Test Post 1'},
+                'author': 1,
+                'date': '2024-01-01T12:00:00',
+                'content': {'rendered': '<p>Test content 1</p>'},
+                'excerpt': {'rendered': '<p>Test excerpt 1</p>'},
+                'link': 'https://hvacknowitall.com/test-post-1',
+                'categories': [1, 2],
+                'tags': [3, 4]
+            },
+            {
+                'id': 2,
+                'title': {'rendered': 'Test Post 2'},
+                'author': 1,
+                'date': '2024-01-02T12:00:00',
+                'content': {'rendered': '<p>Test content 2</p>'},
+                'excerpt': {'rendered': '<p>Test excerpt 2</p>'},
+                'link': 'https://hvacknowitall.com/test-post-2',
+                'categories': [1],
+                'tags': [3]
+            }
+        ]
+        mock_get.return_value = mock_response
+
+        scraper = WordPressScraper(config)
+        posts = scraper.fetch_posts()
+
+        assert len(posts) == 2
+        assert posts[0]['id'] == 1
+        assert posts[0]['title']['rendered'] == 'Test Post 1'
+        mock_get.assert_called_with(
+            'https://hvacknowitall.com/wp-json/wp/v2/posts',
+            params={'per_page': 100, 'page': 1},
+            auth=('test@example.com', 'test_api_key'),
+            timeout=30
+        )
+
+    @patch('requests.get')
+    def test_fetch_posts_with_pagination(self, mock_get, config, mock_env):
+        # First page response
+        mock_response1 = Mock()
+        mock_response1.status_code = 200
+        mock_response1.headers = {'X-WP-TotalPages': '2'}
+        mock_response1.json.return_value = [{'id': 1, 'title': {'rendered': 'Post 1'}}]
+
+        # Second page response
+        mock_response2 = Mock()
+        mock_response2.status_code = 200
+        mock_response2.headers = {'X-WP-TotalPages': '2'}
+        mock_response2.json.return_value = [{'id': 2, 'title': {'rendered': 'Post 2'}}]
+
+        mock_get.side_effect = [mock_response1, mock_response2]
+
+        scraper = WordPressScraper(config)
+        posts = scraper.fetch_posts()
+
+        assert len(posts) == 2
+        assert posts[0]['id'] == 1
+        assert posts[1]['id'] == 2
+        assert mock_get.call_count == 2
+
+    @patch('requests.get')
+    def test_fetch_posts_error_handling(self, mock_get, config, mock_env):
+        mock_get.side_effect = Exception("Connection error")
+
+        scraper = WordPressScraper(config)
+        posts = scraper.fetch_posts()
+
+        assert posts == []
+
+    @patch('requests.get')
+    def test_fetch_author_info(self, mock_get, config, mock_env):
+        mock_response = Mock()
+        mock_response.status_code = 200
+        mock_response.json.return_value = {
+            'id': 1,
+            'name': 'John Doe',
+            'slug': 'john-doe'
+        }
+        mock_get.return_value = mock_response
+
+        scraper = WordPressScraper(config)
+        author = scraper.fetch_author(1)
+
+        assert author['name'] == 'John Doe'
+        mock_get.assert_called_with(
+            'https://hvacknowitall.com/wp-json/wp/v2/users/1',
+            auth=('test@example.com', 'test_api_key'),
+            timeout=30
+        )
+
+    @patch('requests.get')
+    def test_fetch_categories(self, mock_get, config, mock_env):
+        # Mock individual category responses
+        mock_response1 = Mock()
+        mock_response1.status_code = 200
+        mock_response1.json.return_value = {'id': 1, 'name': 'Category 1'}
+
+        mock_response2 = Mock()
+        mock_response2.status_code = 200
+        mock_response2.json.return_value = {'id': 2, 'name': 'Category 2'}
+
+        mock_get.side_effect = [mock_response1, mock_response2]
+
+        scraper = WordPressScraper(config)
+        categories = scraper.fetch_categories([1, 2])
+
+        assert len(categories) == 2
+        assert categories[0]['name'] == 'Category 1'
+        assert categories[1]['name'] == 'Category 2'
+
+    @patch('requests.get')
+    def test_fetch_tags(self, mock_get, config, mock_env):
+        # Mock individual tag responses
+        mock_response1 = Mock()
+        mock_response1.status_code = 200
+        mock_response1.json.return_value = {'id': 3, 'name': 'Tag 1'}
+
+        mock_response2 = Mock()
+        mock_response2.status_code = 200
+        mock_response2.json.return_value = {'id': 4, 'name': 'Tag 2'}
+
+        mock_get.side_effect = [mock_response1, mock_response2]
+
+        scraper = WordPressScraper(config)
+        tags = scraper.fetch_tags([3, 4])
+
+        assert len(tags) == 2
+        assert tags[0]['name'] == 'Tag 1'
+        assert tags[1]['name'] == 'Tag 2'
+
+    def test_format_markdown(self, config, mock_env):
+        scraper = WordPressScraper(config)
+
+        # Mock enriched posts data
+        posts = [
+            {
+                'id': 1,
+                'title': {'rendered': 'Test Post 1'},
+                'author_name': 'John Doe',
+                'date': '2024-01-01T12:00:00',
+                'content': {'rendered': '<p>Test content 1</p>'},
+                'link': 'https://hvacknowitall.com/test-post-1',
+                'category_names': ['HVAC', 'Tips'],
+                'tag_names': ['maintenance', 'diy'],
+                'word_count': 500
+            }
+        ]
+
+        markdown = scraper.format_markdown(posts)
+
+        assert '# ID: 1' in markdown
+        assert '## Title: Test Post 1' in markdown
+        assert '## Author: John Doe' in markdown
+        assert '## Publish Date: 2024-01-01T12:00:00' in markdown
+        assert '## Word Count: 500' in markdown
+        assert '## Categories: HVAC, Tips' in markdown
+        assert '## Tags: maintenance, diy' in markdown
+        assert '## Permalink: https://hvacknowitall.com/test-post-1' in markdown
+
+    def test_get_incremental_items(self, config, mock_env):
+        scraper = WordPressScraper(config)
+
+        posts = [
+            {'id': 3, 'date': '2024-01-03T12:00:00'},
+            {'id': 2, 'date': '2024-01-02T12:00:00'},
+            {'id': 1, 'date': '2024-01-01T12:00:00'}
+        ]
+
+        # Test with no previous state
+        state = {}
+        new_posts = scraper.get_incremental_items(posts, state)
+        assert len(new_posts) == 3
+
+        # Test with existing state
+        state = {'last_post_id': 2, 'last_post_date': '2024-01-02T12:00:00'}
+        new_posts = scraper.get_incremental_items(posts, state)
+        assert len(new_posts) == 1
+        assert new_posts[0]['id'] == 3
+
+    def test_update_state(self, config, mock_env):
+        scraper = WordPressScraper(config)
+
+        state = {}
+        posts = [
+            {'id': 3, 'date': '2024-01-03T12:00:00'},
+            {'id': 2, 'date': '2024-01-02T12:00:00'}
+        ]
+
+        updated_state = scraper.update_state(state, posts)
+
+        assert updated_state['last_post_id'] == 3
+        assert updated_state['last_post_date'] == '2024-01-03T12:00:00'
+
+    @patch('src.wordpress_scraper.WordPressScraper.fetch_tags')
+    @patch('src.wordpress_scraper.WordPressScraper.fetch_categories')
+    @patch('src.wordpress_scraper.WordPressScraper.fetch_author')
+    @patch('src.wordpress_scraper.WordPressScraper.fetch_posts')
+    def test_fetch_content(self, mock_fetch_posts, mock_fetch_author,
+                           mock_fetch_categories, mock_fetch_tags, config, mock_env):
+        # Setup mock returns
+        mock_fetch_posts.return_value = [
+            {
+                'id': 1,
+                'title': {'rendered': 'Test Post'},
+                'author': 1,
+                'categories': [1],
+                'tags': [2],
+                'content': {'rendered': '<p>Content</p>'}
+            }
+        ]
+        mock_fetch_author.return_value = {'name': 'John Doe'}
+        mock_fetch_categories.return_value = [{'name': 'HVAC'}]
+        mock_fetch_tags.return_value = [{'name': 'tips'}]
+
+        scraper = WordPressScraper(config)
+        content = scraper.fetch_content()
+
+        assert len(content) == 1
+        assert content[0]['author_name'] == 'John Doe'
+        assert content[0]['category_names'] == ['HVAC']
+        assert content[0]['tag_names'] == ['tips']
+        assert content[0]['word_count'] > 0
\ No newline at end of file