- Created WordPressScraper class extending BaseScraper
- Fetches posts with pagination support
- Enriches posts with author, category, and tag information
- Implements incremental updates via state management
- Word count calculation for content
- All 11 tests passing
🤖 Generated with Claude Code
Co-Authored-By: Claude <noreply@anthropic.com>
265 lines
No EOL
9.5 KiB
Python
265 lines
No EOL
9.5 KiB
Python
import pytest
|
|
from unittest.mock import Mock, patch, MagicMock
|
|
from datetime import datetime
|
|
import json
|
|
from pathlib import Path
|
|
from src.wordpress_scraper import WordPressScraper
|
|
from src.base_scraper import ScraperConfig
|
|
|
|
|
|
class TestWordPressScraper:
    """Unit tests for WordPressScraper.

    HTTP access is replaced by patching ``requests.get`` and the WordPress
    credentials are supplied through a patched ``os.environ``, so no test
    here touches the network.
    """

    @staticmethod
    def _fake_response(payload, headers=None):
        """Build a Mock standing in for a 200 response whose ``json()`` yields *payload*.

        ``headers`` is assigned only when given; otherwise the attribute is
        left as the auto-created Mock attribute.
        """
        response = Mock()
        response.status_code = 200
        if headers is not None:
            response.headers = headers
        response.json.return_value = payload
        return response

    @pytest.fixture
    def config(self):
        """Provide the ScraperConfig shared by the tests."""
        settings = {
            'source_name': "wordpress",
            'brand_name': "hvacknowitall",
            'data_dir': Path("data"),
            'logs_dir': Path("logs"),
            'timezone': "America/Halifax",
        }
        return ScraperConfig(**settings)

    @pytest.fixture
    def mock_env(self):
        """Patch ``os.environ`` with WordPress URL and credentials for the test's duration."""
        fake_environment = {
            'WORDPRESS_URL': 'https://hvacknowitall.com/',
            'WORDPRESS_USERNAME': 'test@example.com',
            'WORDPRESS_API_KEY': 'test_api_key',
        }
        with patch.dict('os.environ', fake_environment):
            yield

    def test_initialization(self, config, mock_env):
        """The scraper keeps its config and exposes base URL and auth from the environment."""
        expected_auth = ('test@example.com', 'test_api_key')

        scraper = WordPressScraper(config)

        assert scraper.config == config
        assert scraper.base_url == 'https://hvacknowitall.com/'
        assert scraper.auth == expected_auth

    @patch('requests.get')
    def test_fetch_posts_success(self, mock_get, config, mock_env):
        """fetch_posts returns the API payload and requests the posts endpoint."""
        first_post = {
            'id': 1,
            'title': {'rendered': 'Test Post 1'},
            'author': 1,
            'date': '2024-01-01T12:00:00',
            'content': {'rendered': '<p>Test content 1</p>'},
            'excerpt': {'rendered': '<p>Test excerpt 1</p>'},
            'link': 'https://hvacknowitall.com/test-post-1',
            'categories': [1, 2],
            'tags': [3, 4],
        }
        second_post = {
            'id': 2,
            'title': {'rendered': 'Test Post 2'},
            'author': 1,
            'date': '2024-01-02T12:00:00',
            'content': {'rendered': '<p>Test content 2</p>'},
            'excerpt': {'rendered': '<p>Test excerpt 2</p>'},
            'link': 'https://hvacknowitall.com/test-post-2',
            'categories': [1],
            'tags': [3],
        }
        # Single successful API response carrying both posts.
        mock_get.return_value = self._fake_response([first_post, second_post])

        fetched = WordPressScraper(config).fetch_posts()

        assert len(fetched) == 2
        head = fetched[0]
        assert head['id'] == 1
        assert head['title']['rendered'] == 'Test Post 1'
        mock_get.assert_called_with(
            'https://hvacknowitall.com/wp-json/wp/v2/posts',
            params={'per_page': 100, 'page': 1},
            auth=('test@example.com', 'test_api_key'),
            timeout=30,
        )

    @patch('requests.get')
    def test_fetch_posts_with_pagination(self, mock_get, config, mock_env):
        """With two pages advertised, fetch_posts issues two requests and joins the results."""
        page_one = self._fake_response(
            [{'id': 1, 'title': {'rendered': 'Post 1'}}],
            headers={'X-WP-TotalPages': '2'},
        )
        page_two = self._fake_response(
            [{'id': 2, 'title': {'rendered': 'Post 2'}}],
            headers={'X-WP-TotalPages': '2'},
        )
        # Each call to requests.get consumes the next page in order.
        mock_get.side_effect = [page_one, page_two]

        fetched = WordPressScraper(config).fetch_posts()

        assert len(fetched) == 2
        assert fetched[0]['id'] == 1
        assert fetched[1]['id'] == 2
        assert mock_get.call_count == 2

    @patch('requests.get')
    def test_fetch_posts_error_handling(self, mock_get, config, mock_env):
        """A failing request makes fetch_posts return an empty list."""
        mock_get.side_effect = Exception("Connection error")

        fetched = WordPressScraper(config).fetch_posts()

        assert fetched == []

    @patch('requests.get')
    def test_fetch_author_info(self, mock_get, config, mock_env):
        """fetch_author returns the user payload and requests the users endpoint."""
        author_payload = {
            'id': 1,
            'name': 'John Doe',
            'slug': 'john-doe',
        }
        mock_get.return_value = self._fake_response(author_payload)

        author = WordPressScraper(config).fetch_author(1)

        assert author['name'] == 'John Doe'
        mock_get.assert_called_with(
            'https://hvacknowitall.com/wp-json/wp/v2/users/1',
            auth=('test@example.com', 'test_api_key'),
            timeout=30,
        )

    @patch('requests.get')
    def test_fetch_categories(self, mock_get, config, mock_env):
        """fetch_categories returns one entry per requested category id, in order."""
        # One mocked response per individual category lookup.
        mock_get.side_effect = [
            self._fake_response({'id': 1, 'name': 'Category 1'}),
            self._fake_response({'id': 2, 'name': 'Category 2'}),
        ]

        found = WordPressScraper(config).fetch_categories([1, 2])

        assert len(found) == 2
        assert found[0]['name'] == 'Category 1'
        assert found[1]['name'] == 'Category 2'

    @patch('requests.get')
    def test_fetch_tags(self, mock_get, config, mock_env):
        """fetch_tags returns one entry per requested tag id, in order."""
        # One mocked response per individual tag lookup.
        mock_get.side_effect = [
            self._fake_response({'id': 3, 'name': 'Tag 1'}),
            self._fake_response({'id': 4, 'name': 'Tag 2'}),
        ]

        found = WordPressScraper(config).fetch_tags([3, 4])

        assert len(found) == 2
        assert found[0]['name'] == 'Tag 1'
        assert found[1]['name'] == 'Tag 2'

    def test_format_markdown(self, config, mock_env):
        """format_markdown emits the expected heading line for each post field."""
        # Already-enriched post, as format_markdown is given in this test.
        enriched_post = {
            'id': 1,
            'title': {'rendered': 'Test Post 1'},
            'author_name': 'John Doe',
            'date': '2024-01-01T12:00:00',
            'content': {'rendered': '<p>Test content 1</p>'},
            'link': 'https://hvacknowitall.com/test-post-1',
            'category_names': ['HVAC', 'Tips'],
            'tag_names': ['maintenance', 'diy'],
            'word_count': 500,
        }

        rendered = WordPressScraper(config).format_markdown([enriched_post])

        expected_fragments = (
            '# ID: 1',
            '## Title: Test Post 1',
            '## Author: John Doe',
            '## Publish Date: 2024-01-01T12:00:00',
            '## Word Count: 500',
            '## Categories: HVAC, Tips',
            '## Tags: maintenance, diy',
            '## Permalink: https://hvacknowitall.com/test-post-1',
        )
        for fragment in expected_fragments:
            assert fragment in rendered

    def test_get_incremental_items(self, config, mock_env):
        """get_incremental_items returns everything without state, else only the newer post."""
        scraper = WordPressScraper(config)
        all_posts = [
            {'id': 3, 'date': '2024-01-03T12:00:00'},
            {'id': 2, 'date': '2024-01-02T12:00:00'},
            {'id': 1, 'date': '2024-01-01T12:00:00'},
        ]

        # No previous state: every post is expected back.
        fresh = scraper.get_incremental_items(all_posts, {})
        assert len(fresh) == 3

        # Existing state pointing at post 2: only post 3 is expected back.
        previous = {'last_post_id': 2, 'last_post_date': '2024-01-02T12:00:00'}
        fresh = scraper.get_incremental_items(all_posts, previous)
        assert len(fresh) == 1
        assert fresh[0]['id'] == 3

    def test_update_state(self, config, mock_env):
        """update_state records the id and date of the first post in the list."""
        scraper = WordPressScraper(config)
        recent_posts = [
            {'id': 3, 'date': '2024-01-03T12:00:00'},
            {'id': 2, 'date': '2024-01-02T12:00:00'},
        ]

        new_state = scraper.update_state({}, recent_posts)

        assert new_state['last_post_id'] == 3
        assert new_state['last_post_date'] == '2024-01-03T12:00:00'

    @patch('src.wordpress_scraper.WordPressScraper.fetch_tags')
    @patch('src.wordpress_scraper.WordPressScraper.fetch_categories')
    @patch('src.wordpress_scraper.WordPressScraper.fetch_author')
    @patch('src.wordpress_scraper.WordPressScraper.fetch_posts')
    def test_fetch_content(self, posts_stub, author_stub,
                           categories_stub, tags_stub, config, mock_env):
        """fetch_content adds author, category, tag names and a positive word count."""
        # Patch decorators inject bottom-up: posts, author, categories, tags.
        raw_post = {
            'id': 1,
            'title': {'rendered': 'Test Post'},
            'author': 1,
            'categories': [1],
            'tags': [2],
            'content': {'rendered': '<p>Content</p>'},
        }
        posts_stub.return_value = [raw_post]
        author_stub.return_value = {'name': 'John Doe'}
        categories_stub.return_value = [{'name': 'HVAC'}]
        tags_stub.return_value = [{'name': 'tips'}]

        enriched = WordPressScraper(config).fetch_content()

        assert len(enriched) == 1
        item = enriched[0]
        assert item['author_name'] == 'John Doe'
        assert item['category_names'] == ['HVAC']
        assert item['tag_names'] == ['tips']
        assert item['word_count'] > 0