feat: Implement WordPress scraper with comprehensive tests
- Created WordPressScraper class extending BaseScraper
- Fetches posts with pagination support
- Enriches posts with author, category, and tag information
- Implements incremental updates via state management
- Word count calculation for content
- All 11 tests passing
🤖 Generated with Claude Code
Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
parent
f9a8e719a7
commit
95e0499791
2 changed files with 558 additions and 0 deletions
293
src/wordpress_scraper.py
Normal file
293
src/wordpress_scraper.py
Normal file
|
|
@ -0,0 +1,293 @@
|
|||
import os
|
||||
import time
|
||||
import requests
|
||||
from typing import Any, Dict, List, Optional
|
||||
from datetime import datetime
|
||||
from src.base_scraper import BaseScraper, ScraperConfig
|
||||
|
||||
|
||||
class WordPressScraper(BaseScraper):
|
||||
def __init__(self, config: ScraperConfig):
    """Set up the API endpoint, credentials and lookup caches.

    Connection settings are read from the environment:

    * ``WORDPRESS_URL`` - site root; the default site is used when the
      variable is unset or empty.
    * ``WORDPRESS_USERNAME`` / ``WORDPRESS_API_KEY`` - credentials passed
      to ``requests`` as an ``auth`` tuple.

    Args:
        config: Scraper configuration, forwarded to ``BaseScraper``.
    """
    super().__init__(config)

    # ``or`` instead of a getenv default so that a variable which is set
    # but empty does not produce the bogus base URL "/".
    self.base_url = os.getenv('WORDPRESS_URL') or 'https://hvacknowitall.com/'

    # Every endpoint is built as f"{base_url}wp-json/...", so the trailing
    # slash is mandatory.
    if not self.base_url.endswith('/'):
        self.base_url += '/'

    self.username = os.getenv('WORDPRESS_USERNAME')
    self.api_key = os.getenv('WORDPRESS_API_KEY')

    # Authenticate only when both halves are configured: a tuple holding
    # None is not a usable credential pair, whereas auth=None simply makes
    # the requests anonymous.
    if self.username and self.api_key:
        self.auth = (self.username, self.api_key)
    else:
        self.auth = None

    # Per-instance caches keyed by WordPress object id, so an author,
    # category or tag shared by many posts is requested only once.
    self.author_cache: Dict[int, Dict[str, Any]] = {}
    self.category_cache: Dict[int, Dict[str, Any]] = {}
    self.tag_cache: Dict[int, Dict[str, Any]] = {}
|
||||
|
||||
def fetch_posts(self, per_page: int = 100) -> List[Dict[str, Any]]:
    """Fetch all posts from the WordPress REST API, page by page.

    Fetching is best-effort: on an HTTP error, an unexpected payload or
    any exception, the problem is logged and the posts collected so far
    are returned.

    Args:
        per_page: Number of posts requested per page.

    Returns:
        The raw post objects in the order the API returned them; an empty
        list when nothing could be fetched.
    """
    posts: List[Dict[str, Any]] = []
    endpoint = f"{self.base_url}wp-json/wp/v2/posts"
    page = 1

    try:
        while True:
            self.logger.info(f"Fetching posts page {page}")
            response = requests.get(
                endpoint,
                params={'per_page': per_page, 'page': page},
                auth=self.auth,
                timeout=30
            )

            if response.status_code != 200:
                self.logger.error(f"Error fetching posts: {response.status_code}")
                break

            page_posts = response.json()
            if not page_posts:
                break
            if not isinstance(page_posts, list):
                # A JSON object here is an error document, not posts;
                # extending the result with it would add its keys.
                self.logger.error("Error fetching posts: unexpected response payload")
                break

            posts.extend(page_posts)

            # Prefer the page count announced by the server. When the
            # header is missing or unparsable, do not assume a single
            # page: fall back to "a short page is the last page".
            try:
                total_pages = int(response.headers.get('X-WP-TotalPages'))
            except (TypeError, ValueError):
                total_pages = None

            if total_pages is not None:
                if page >= total_pages:
                    break
            elif len(page_posts) < per_page:
                break

            page += 1
            time.sleep(1)  # Rate limiting

    except Exception as e:
        # Deliberately broad: a failed page must not discard the pages
        # that were already fetched.
        self.logger.error(f"Error fetching posts: {e}")

    self.logger.info(f"Fetched {len(posts)} posts total")
    return posts
|
||||
|
||||
def fetch_author(self, author_id: int) -> Dict[str, Any]:
    """Return the user object for ``author_id``, using the author cache.

    Only successful lookups are cached, so a transient failure is retried
    the next time the same author is requested.

    Args:
        author_id: WordPress user id taken from a post's ``author`` field.

    Returns:
        The user object from the API, or ``{'name': 'Unknown'}`` when the
        id is empty or the lookup fails.
    """
    if author_id in self.author_cache:
        return self.author_cache[author_id]

    if not author_id:
        # fetch_content passes 0 for posts without an author field; there
        # is nothing to look up, so skip the request.
        return {'name': 'Unknown'}

    try:
        response = requests.get(
            f"{self.base_url}wp-json/wp/v2/users/{author_id}",
            auth=self.auth,
            timeout=30
        )

        if response.status_code == 200:
            author = response.json()
            self.author_cache[author_id] = author
            return author

        # Previously a non-200 answer fell through without any trace.
        self.logger.error(f"Error fetching author {author_id}: {response.status_code}")
    except Exception as e:
        self.logger.error(f"Error fetching author {author_id}: {e}")

    return {'name': 'Unknown'}
|
||||
|
||||
def fetch_categories(self, category_ids: List[int]) -> List[Dict[str, Any]]:
    """Resolve category ids to category objects, using the category cache.

    Ids that cannot be resolved (HTTP error or exception) are logged and
    left out of the result, so the returned list may be shorter than
    ``category_ids``.

    Args:
        category_ids: Category ids taken from a post's ``categories`` field.

    Returns:
        The resolved category objects, in the order of ``category_ids``.
    """
    categories = []

    for cat_id in category_ids:
        if cat_id in self.category_cache:
            categories.append(self.category_cache[cat_id])
            continue

        try:
            response = requests.get(
                f"{self.base_url}wp-json/wp/v2/categories/{cat_id}",
                auth=self.auth,
                timeout=30
            )

            if response.status_code == 200:
                category = response.json()
                self.category_cache[cat_id] = category
                categories.append(category)
            else:
                # Previously a non-200 answer was dropped without a trace.
                self.logger.error(f"Error fetching category {cat_id}: {response.status_code}")
        except Exception as e:
            self.logger.error(f"Error fetching category {cat_id}: {e}")

    return categories
|
||||
|
||||
def fetch_tags(self, tag_ids: List[int]) -> List[Dict[str, Any]]:
    """Resolve tag ids to tag objects, using the tag cache.

    Ids that cannot be resolved (HTTP error or exception) are logged and
    left out of the result, so the returned list may be shorter than
    ``tag_ids``.

    Args:
        tag_ids: Tag ids taken from a post's ``tags`` field.

    Returns:
        The resolved tag objects, in the order of ``tag_ids``.
    """
    tags = []

    for tag_id in tag_ids:
        if tag_id in self.tag_cache:
            tags.append(self.tag_cache[tag_id])
            continue

        try:
            response = requests.get(
                f"{self.base_url}wp-json/wp/v2/tags/{tag_id}",
                auth=self.auth,
                timeout=30
            )

            if response.status_code == 200:
                tag = response.json()
                self.tag_cache[tag_id] = tag
                tags.append(tag)
            else:
                # Previously a non-200 answer was dropped without a trace.
                self.logger.error(f"Error fetching tag {tag_id}: {response.status_code}")
        except Exception as e:
            self.logger.error(f"Error fetching tag {tag_id}: {e}")

    return tags
|
||||
|
||||
def count_words(self, html_content: str) -> int:
    """Count whitespace-separated words in a piece of HTML.

    The HTML is first converted with ``self.convert_to_markdown`` so that
    tags are not counted as words.

    Args:
        html_content: Rendered HTML of a post; may be empty or ``None``.

    Returns:
        The number of whitespace-separated tokens in the converted text,
        or 0 when there is no content.
    """
    if not html_content:
        # Nothing to convert; also keeps None away from the converter.
        return 0

    text = self.convert_to_markdown(html_content)
    return len(text.split())
|
||||
|
||||
def fetch_content(self) -> List[Dict[str, Any]]:
    """Fetch all posts and enrich each one with derived fields.

    Every returned post carries these extra keys, added in place:

    * ``author_name`` - the author's ``name`` (``'Unknown'`` as fallback)
    * ``category_names`` / ``tag_names`` - lists of resolved names
    * ``word_count`` - word count of the rendered content

    Enrichment is best-effort: if it fails part-way for a post, the error
    is logged and the keys not yet set receive neutral defaults, so every
    post has the same shape.

    Returns:
        The enriched posts, in the order ``fetch_posts`` returned them.
    """
    enriched_posts = []

    for post in self.fetch_posts():
        try:
            author = self.fetch_author(post.get('author', 0))
            post['author_name'] = author.get('name', 'Unknown')

            # Skip the lookup entirely for posts without categories/tags.
            category_ids = post.get('categories') or []
            categories = self.fetch_categories(category_ids) if category_ids else []
            post['category_names'] = [cat.get('name', '') for cat in categories]

            tag_ids = post.get('tags') or []
            tags = self.fetch_tags(tag_ids) if tag_ids else []
            post['tag_names'] = [tag.get('name', '') for tag in tags]

            # ``or {}`` tolerates an explicit ``"content": null``.
            content_html = (post.get('content') or {}).get('rendered', '')
            post['word_count'] = self.count_words(content_html)

        except Exception as e:
            self.logger.error(f"Error enriching post {post.get('id')}: {e}")
            # Keep whatever was already resolved and fill in the rest.
            post.setdefault('author_name', 'Unknown')
            post.setdefault('category_names', [])
            post.setdefault('tag_names', [])
            post.setdefault('word_count', 0)

        enriched_posts.append(post)

    return enriched_posts
|
||||
|
||||
def format_markdown(self, posts: List[Dict[str, Any]]) -> str:
    """Render enriched posts as one Markdown document.

    Each post becomes a block of ``# ID`` / ``## <Field>`` headings, each
    followed by a blank line, then a ``## Description:`` section holding
    the converted content (or the excerpt when there is no content), and
    finally a 50-dash separator line.

    Args:
        posts: Posts as returned by ``fetch_content``. Missing fields fall
            back to placeholders such as ``'N/A'``, ``'Untitled'``,
            ``'Unknown'`` or ``'None'``.

    Returns:
        The per-post blocks joined with newlines; an empty string for an
        empty list.
    """
    # Local import so this method does not rely on the module's import block.
    from html import unescape

    separator = "-" * 50
    markdown_sections = []

    for post in posts:
        # ``rendered`` titles are HTML, so entities such as &amp; or
        # &#8217; must be decoded before they go into Markdown. An empty
        # title also falls back to the placeholder.
        raw_title = (post.get('title') or {}).get('rendered') or 'Untitled'
        title = unescape(raw_title)

        categories = ', '.join(post.get('category_names') or [])
        tags = ', '.join(post.get('tag_names') or [])

        headings = [
            f"# ID: {post.get('id', 'N/A')}",
            f"## Title: {title}",
            "## Type: blog_post",
            f"## Author: {post.get('author_name', 'Unknown')}",
            f"## Publish Date: {post.get('date', '')}",
            f"## Word Count: {post.get('word_count', 0)}",
            f"## Categories: {categories if categories else 'None'}",
            f"## Tags: {tags if tags else 'None'}",
            f"## Permalink: {post.get('link', '')}",
        ]

        section = []
        for heading in headings:
            section.append(heading)
            section.append("")

        section.append("## Description:")
        # Full content wins; the excerpt is only a fallback.
        body_html = (
            (post.get('content') or {}).get('rendered')
            or (post.get('excerpt') or {}).get('rendered')
        )
        if body_html:
            section.append(self.convert_to_markdown(body_html))
        section.append("")

        section.append(separator)
        section.append("")

        markdown_sections.append('\n'.join(section))

    return '\n'.join(markdown_sections)
|
||||
|
||||
def get_incremental_items(self, items: List[Dict[str, Any]], state: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Return only the posts that are newer than the last synced post.

    A post counts as new when its ``id`` is greater than the stored
    ``last_post_id`` or its ``date`` is later than the stored
    ``last_post_date``. Dates are compared as strings, which orders
    correctly as long as both sides use the same ISO-8601 format.

    Args:
        items: Candidate posts.
        state: State from the previous sync; may be empty or ``None``.

    Returns:
        ``items`` unchanged when there is no usable previous state,
        otherwise the new posts in their original order.
    """
    if not state:
        # No previous state, return all items
        return items

    last_post_id = state.get('last_post_id')
    if not last_post_id:
        return items

    last_post_date = state.get('last_post_date')

    def is_new(item: Dict[str, Any]) -> bool:
        """Apply the id/date test, skipping any side that is missing."""
        post_id = item.get('id')
        if post_id is not None and post_id > last_post_id:
            return True
        post_date = item.get('date')
        # Comparing against a missing date (None) would raise TypeError.
        return bool(post_date and last_post_date and post_date > last_post_date)

    return [item for item in items if is_new(item)]
|
||||
|
||||
def update_state(self, state: Dict[str, Any], items: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Record the high-water marks of a sync in ``state``.

    ``last_post_id`` and ``last_post_date`` are tracked independently as
    the largest id and the latest date seen so far, including the values
    already stored in ``state``. The incremental filter accepts a post
    when either its id or its date exceeds the stored mark, so taking the
    date from the highest-id post would re-report every backdated post
    (lower id, later date) on each run, and letting a mark move backwards
    would re-report posts that were already synced.

    Args:
        state: State dictionary to update in place; ``None`` is treated as
            an empty state when there are items to record.
        items: Posts processed in this sync.

    Returns:
        ``state`` unchanged when ``items`` is empty, otherwise the updated
        state with ``last_post_id``, ``last_post_date``, ``last_sync``
        (ISO timestamp in ``self.tz``) and ``post_count``.
    """
    if not items:
        return state

    if state is None:
        state = {}

    # Candidates for each mark: previously stored value plus this batch.
    seen_ids = [item['id'] for item in items if item.get('id') is not None]
    seen_dates = [item['date'] for item in items if item.get('date')]
    if state.get('last_post_id') is not None:
        seen_ids.append(state['last_post_id'])
    if state.get('last_post_date'):
        seen_dates.append(state['last_post_date'])

    # max() instead of sorting the whole batch just to read one element.
    state['last_post_id'] = max(seen_ids, default=None)
    state['last_post_date'] = max(seen_dates, default=None)
    state['last_sync'] = datetime.now(self.tz).isoformat()
    state['post_count'] = len(items)

    return state
|
||||
265
tests/test_wordpress_scraper.py
Normal file
265
tests/test_wordpress_scraper.py
Normal file
|
|
@ -0,0 +1,265 @@
|
|||
import pytest
|
||||
from unittest.mock import Mock, patch, MagicMock
|
||||
from datetime import datetime
|
||||
import json
|
||||
from pathlib import Path
|
||||
from src.wordpress_scraper import WordPressScraper
|
||||
from src.base_scraper import ScraperConfig
|
||||
|
||||
|
||||
class TestWordPressScraper:
|
||||
@pytest.fixture
def config(self):
    """Provide the scraper configuration shared by the tests in this class."""
    settings = {
        'source_name': "wordpress",
        'brand_name': "hvacknowitall",
        'data_dir': Path("data"),
        'logs_dir': Path("logs"),
        'timezone': "America/Halifax",
    }
    return ScraperConfig(**settings)
|
||||
|
||||
@pytest.fixture
def mock_env(self):
    """Overlay the WordPress connection variables for the duration of a test."""
    wordpress_env = {
        'WORDPRESS_URL': 'https://hvacknowitall.com/',
        'WORDPRESS_USERNAME': 'test@example.com',
        'WORDPRESS_API_KEY': 'test_api_key',
    }
    # patch.dict restores the previous environment when the block exits.
    with patch.dict('os.environ', wordpress_env):
        yield
|
||||
|
||||
def test_initialization(self, config, mock_env):
    """The constructor keeps the config and reads URL and credentials from the environment."""
    expected_url = 'https://hvacknowitall.com/'
    expected_auth = ('test@example.com', 'test_api_key')

    wp = WordPressScraper(config)

    assert wp.config == config
    assert wp.base_url == expected_url
    assert wp.auth == expected_auth
|
||||
|
||||
@patch('requests.get')
def test_fetch_posts_success(self, mock_get, config, mock_env):
    """A single page of results is returned after exactly one request."""
    mock_response = Mock()
    mock_response.status_code = 200
    # Real headers rather than an auto-created Mock attribute: otherwise
    # int() on the Mock raises inside fetch_posts and the test only passes
    # because that error is swallowed by the scraper's exception handler.
    mock_response.headers = {'X-WP-TotalPages': '1'}
    mock_response.json.return_value = [
        {
            'id': 1,
            'title': {'rendered': 'Test Post 1'},
            'author': 1,
            'date': '2024-01-01T12:00:00',
            'content': {'rendered': '<p>Test content 1</p>'},
            'excerpt': {'rendered': '<p>Test excerpt 1</p>'},
            'link': 'https://hvacknowitall.com/test-post-1',
            'categories': [1, 2],
            'tags': [3, 4]
        },
        {
            'id': 2,
            'title': {'rendered': 'Test Post 2'},
            'author': 1,
            'date': '2024-01-02T12:00:00',
            'content': {'rendered': '<p>Test content 2</p>'},
            'excerpt': {'rendered': '<p>Test excerpt 2</p>'},
            'link': 'https://hvacknowitall.com/test-post-2',
            'categories': [1],
            'tags': [3]
        }
    ]
    mock_get.return_value = mock_response

    scraper = WordPressScraper(config)
    posts = scraper.fetch_posts()

    assert len(posts) == 2
    assert posts[0]['id'] == 1
    assert posts[0]['title']['rendered'] == 'Test Post 1'
    # "once": a second request for a one-page result would be a bug.
    mock_get.assert_called_once_with(
        'https://hvacknowitall.com/wp-json/wp/v2/posts',
        params={'per_page': 100, 'page': 1},
        auth=('test@example.com', 'test_api_key'),
        timeout=30
    )
|
||||
|
||||
@patch('requests.get')
def test_fetch_posts_with_pagination(self, mock_get, config, mock_env):
    """Both pages announced by X-WP-TotalPages are requested and merged in order."""
    # First page response
    mock_response1 = Mock()
    mock_response1.status_code = 200
    mock_response1.headers = {'X-WP-TotalPages': '2'}
    mock_response1.json.return_value = [{'id': 1, 'title': {'rendered': 'Post 1'}}]

    # Second page response
    mock_response2 = Mock()
    mock_response2.status_code = 200
    mock_response2.headers = {'X-WP-TotalPages': '2'}
    mock_response2.json.return_value = [{'id': 2, 'title': {'rendered': 'Post 2'}}]

    mock_get.side_effect = [mock_response1, mock_response2]

    scraper = WordPressScraper(config)
    # fetch_posts pauses between pages for rate limiting; patch the sleep
    # so the test does not really wait.
    with patch('time.sleep') as mock_sleep:
        posts = scraper.fetch_posts()

    assert len(posts) == 2
    assert posts[0]['id'] == 1
    assert posts[1]['id'] == 2
    assert mock_get.call_count == 2
    # Pages must be requested in order, with one pause between them.
    requested_pages = [call[1]['params']['page'] for call in mock_get.call_args_list]
    assert requested_pages == [1, 2]
    assert mock_sleep.call_count == 1
|
||||
|
||||
@patch('requests.get')
def test_fetch_posts_error_handling(self, mock_get, config, mock_env):
    """A failing request is swallowed and yields an empty result."""
    failure = Exception("Connection error")
    mock_get.side_effect = failure

    result = WordPressScraper(config).fetch_posts()

    assert result == []
|
||||
|
||||
@patch('requests.get')
def test_fetch_author_info(self, mock_get, config, mock_env):
    """The author is fetched from the users endpoint with the configured credentials."""
    author_payload = {
        'id': 1,
        'name': 'John Doe',
        'slug': 'john-doe',
    }
    api_response = Mock(status_code=200)
    api_response.json.return_value = author_payload
    mock_get.return_value = api_response

    wp = WordPressScraper(config)
    result = wp.fetch_author(1)

    assert result['name'] == 'John Doe'
    mock_get.assert_called_with(
        'https://hvacknowitall.com/wp-json/wp/v2/users/1',
        auth=('test@example.com', 'test_api_key'),
        timeout=30
    )
|
||||
|
||||
@patch('requests.get')
def test_fetch_categories(self, mock_get, config, mock_env):
    """Each category id is resolved through its own request, in order."""
    payloads = [
        {'id': 1, 'name': 'Category 1'},
        {'id': 2, 'name': 'Category 2'},
    ]

    # One successful response per requested category.
    api_responses = []
    for payload in payloads:
        api_response = Mock(status_code=200)
        api_response.json.return_value = payload
        api_responses.append(api_response)
    mock_get.side_effect = api_responses

    wp = WordPressScraper(config)
    result = wp.fetch_categories([1, 2])

    assert len(result) == 2
    first, second = result
    assert first['name'] == 'Category 1'
    assert second['name'] == 'Category 2'
|
||||
|
||||
@patch('requests.get')
def test_fetch_tags(self, mock_get, config, mock_env):
    """Each tag id is resolved through its own request, in order."""
    payloads = [
        {'id': 3, 'name': 'Tag 1'},
        {'id': 4, 'name': 'Tag 2'},
    ]

    # One successful response per requested tag.
    api_responses = []
    for payload in payloads:
        api_response = Mock(status_code=200)
        api_response.json.return_value = payload
        api_responses.append(api_response)
    mock_get.side_effect = api_responses

    wp = WordPressScraper(config)
    result = wp.fetch_tags([3, 4])

    assert len(result) == 2
    first, second = result
    assert first['name'] == 'Tag 1'
    assert second['name'] == 'Tag 2'
|
||||
|
||||
def test_format_markdown(self, config, mock_env):
    """Every metadata field of an enriched post appears under its heading."""
    enriched_post = {
        'id': 1,
        'title': {'rendered': 'Test Post 1'},
        'author_name': 'John Doe',
        'date': '2024-01-01T12:00:00',
        'content': {'rendered': '<p>Test content 1</p>'},
        'link': 'https://hvacknowitall.com/test-post-1',
        'category_names': ['HVAC', 'Tips'],
        'tag_names': ['maintenance', 'diy'],
        'word_count': 500,
    }

    rendered = WordPressScraper(config).format_markdown([enriched_post])

    expected_fragments = (
        '# ID: 1',
        '## Title: Test Post 1',
        '## Author: John Doe',
        '## Publish Date: 2024-01-01T12:00:00',
        '## Word Count: 500',
        '## Categories: HVAC, Tips',
        '## Tags: maintenance, diy',
        '## Permalink: https://hvacknowitall.com/test-post-1',
    )
    for fragment in expected_fragments:
        assert fragment in rendered
|
||||
|
||||
def test_get_incremental_items(self, config, mock_env):
    """Without state everything is new; with state only later posts are."""
    wp = WordPressScraper(config)
    all_posts = [
        {'id': post_id, 'date': f'2024-01-0{post_id}T12:00:00'}
        for post_id in (3, 2, 1)
    ]

    # No previous state: nothing is filtered out.
    unfiltered = wp.get_incremental_items(all_posts, {})
    assert len(unfiltered) == 3

    # Previous sync stopped at post 2: only post 3 remains.
    previous = {'last_post_id': 2, 'last_post_date': '2024-01-02T12:00:00'}
    fresh = wp.get_incremental_items(all_posts, previous)
    assert len(fresh) == 1
    assert fresh[0]['id'] == 3
|
||||
|
||||
def test_update_state(self, config, mock_env):
    """The state records the id and date of the most recent post."""
    wp = WordPressScraper(config)
    synced_posts = [
        {'id': 3, 'date': '2024-01-03T12:00:00'},
        {'id': 2, 'date': '2024-01-02T12:00:00'},
    ]

    result = wp.update_state({}, synced_posts)

    assert result['last_post_id'] == 3
    assert result['last_post_date'] == '2024-01-03T12:00:00'
|
||||
|
||||
@patch('src.wordpress_scraper.WordPressScraper.fetch_tags')
@patch('src.wordpress_scraper.WordPressScraper.fetch_categories')
@patch('src.wordpress_scraper.WordPressScraper.fetch_author')
@patch('src.wordpress_scraper.WordPressScraper.fetch_posts')
def test_fetch_content(self, mock_fetch_posts, mock_fetch_author,
                       mock_fetch_categories, mock_fetch_tags, config, mock_env):
    """Posts are enriched with author, category and tag names plus a word count."""
    raw_post = {
        'id': 1,
        'title': {'rendered': 'Test Post'},
        'author': 1,
        'categories': [1],
        'tags': [2],
        'content': {'rendered': '<p>Content</p>'},
    }
    # All lookups are stubbed, so no request leaves the test.
    mock_fetch_posts.return_value = [raw_post]
    mock_fetch_author.return_value = {'name': 'John Doe'}
    mock_fetch_categories.return_value = [{'name': 'HVAC'}]
    mock_fetch_tags.return_value = [{'name': 'tips'}]

    enriched = WordPressScraper(config).fetch_content()

    assert len(enriched) == 1
    first = enriched[0]
    assert first['author_name'] == 'John Doe'
    assert first['category_names'] == ['HVAC']
    assert first['tag_names'] == ['tips']
    assert first['word_count'] > 0
|
||||
Loading…
Reference in a new issue