"""Unit tests for ``WordPressScraper``.

All HTTP traffic is replaced by ``unittest.mock`` patches of ``requests.get``
and the WordPress credentials are injected through a patched ``os.environ``,
so no network access or real credentials are needed.
"""

import json
from datetime import datetime
from pathlib import Path
from unittest.mock import Mock, patch, MagicMock

import pytest

from src.base_scraper import ScraperConfig
from src.wordpress_scraper import WordPressScraper


def _json_response(payload, headers=None):
    """Return a Mock standing in for a 200 response whose ``.json()`` yields *payload*.

    ``headers`` is only assigned when given, so responses built without it
    keep Mock's default auto-created ``headers`` attribute.
    """
    response = Mock()
    response.status_code = 200
    if headers is not None:
        response.headers = headers
    response.json.return_value = payload
    return response


class TestWordPressScraper:
    """Tests for fetching, enriching, formatting and state tracking of posts."""

    @pytest.fixture
    def config(self):
        """Provide the scraper configuration shared by every test."""
        return ScraperConfig(
            source_name="wordpress",
            brand_name="hvacknowitall",
            data_dir=Path("data"),
            logs_dir=Path("logs"),
            timezone="America/Halifax"
        )

    @pytest.fixture
    def mock_env(self):
        """Patch ``os.environ`` with the WordPress URL and credentials for the test's duration."""
        with patch.dict('os.environ', {
            'WORDPRESS_URL': 'https://hvacknowitall.com/',
            'WORDPRESS_USERNAME': 'test@example.com',
            'WORDPRESS_API_KEY': 'test_api_key'
        }):
            yield

    def test_initialization(self, config, mock_env):
        """The scraper keeps its config and exposes the patched URL and credentials."""
        scraper = WordPressScraper(config)

        assert scraper.config == config
        assert scraper.base_url == 'https://hvacknowitall.com/'
        assert scraper.auth == ('test@example.com', 'test_api_key')

    @patch('requests.get')
    def test_fetch_posts_success(self, mock_get, config, mock_env):
        """``fetch_posts`` returns the JSON payload and queries the posts endpoint."""
        # Mock successful API response.
        # NOTE(review): the 'rendered' fields presumably held HTML markup
        # originally; only the visible text and line breaks are kept - confirm.
        mock_get.return_value = _json_response([
            {
                'id': 1,
                'title': {'rendered': 'Test Post 1'},
                'author': 1,
                'date': '2024-01-01T12:00:00',
                'content': {'rendered': '\nTest content 1\n'},
                'excerpt': {'rendered': 'Test excerpt 1\n'},
                'link': 'https://hvacknowitall.com/test-post-1',
                'categories': [1, 2],
                'tags': [3, 4]
            },
            {
                'id': 2,
                'title': {'rendered': 'Test Post 2'},
                'author': 1,
                'date': '2024-01-02T12:00:00',
                'content': {'rendered': 'Test content 2\n'},
                'excerpt': {'rendered': 'Test excerpt 2\n'},
                'link': 'https://hvacknowitall.com/test-post-2',
                'categories': [1],
                'tags': [3]
            }
        ])

        scraper = WordPressScraper(config)
        posts = scraper.fetch_posts()

        assert len(posts) == 2
        assert posts[0]['id'] == 1
        assert posts[0]['title']['rendered'] == 'Test Post 1'
        mock_get.assert_called_with(
            'https://hvacknowitall.com/wp-json/wp/v2/posts',
            params={'per_page': 100, 'page': 1},
            auth=('test@example.com', 'test_api_key'),
            timeout=30
        )

    @patch('requests.get')
    def test_fetch_posts_with_pagination(self, mock_get, config, mock_env):
        """Two pages advertised via ``X-WP-TotalPages`` yield two requests and both posts."""
        page_headers = {'X-WP-TotalPages': '2'}
        # First page response
        mock_response1 = _json_response(
            [{'id': 1, 'title': {'rendered': 'Post 1'}}],
            headers=dict(page_headers),
        )
        # Second page response
        mock_response2 = _json_response(
            [{'id': 2, 'title': {'rendered': 'Post 2'}}],
            headers=dict(page_headers),
        )
        mock_get.side_effect = [mock_response1, mock_response2]

        scraper = WordPressScraper(config)
        posts = scraper.fetch_posts()

        assert len(posts) == 2
        assert posts[0]['id'] == 1
        assert posts[1]['id'] == 2
        assert mock_get.call_count == 2

    @patch('requests.get')
    def test_fetch_posts_error_handling(self, mock_get, config, mock_env):
        """An exception raised by ``requests.get`` results in an empty list."""
        mock_get.side_effect = Exception("Connection error")

        scraper = WordPressScraper(config)
        posts = scraper.fetch_posts()

        assert posts == []

    @patch('requests.get')
    def test_fetch_author_info(self, mock_get, config, mock_env):
        """``fetch_author`` returns the user JSON and queries the users endpoint."""
        mock_get.return_value = _json_response({
            'id': 1,
            'name': 'John Doe',
            'slug': 'john-doe'
        })

        scraper = WordPressScraper(config)
        author = scraper.fetch_author(1)

        assert author['name'] == 'John Doe'
        mock_get.assert_called_with(
            'https://hvacknowitall.com/wp-json/wp/v2/users/1',
            auth=('test@example.com', 'test_api_key'),
            timeout=30
        )

    @patch('requests.get')
    def test_fetch_categories(self, mock_get, config, mock_env):
        """``fetch_categories`` returns one entry per requested id, in order."""
        # Mock individual category responses
        mock_get.side_effect = [
            _json_response({'id': 1, 'name': 'Category 1'}),
            _json_response({'id': 2, 'name': 'Category 2'}),
        ]

        scraper = WordPressScraper(config)
        categories = scraper.fetch_categories([1, 2])

        assert len(categories) == 2
        assert categories[0]['name'] == 'Category 1'
        assert categories[1]['name'] == 'Category 2'

    @patch('requests.get')
    def test_fetch_tags(self, mock_get, config, mock_env):
        """``fetch_tags`` returns one entry per requested id, in order."""
        # Mock individual tag responses
        mock_get.side_effect = [
            _json_response({'id': 3, 'name': 'Tag 1'}),
            _json_response({'id': 4, 'name': 'Tag 2'}),
        ]

        scraper = WordPressScraper(config)
        tags = scraper.fetch_tags([3, 4])

        assert len(tags) == 2
        assert tags[0]['name'] == 'Tag 1'
        assert tags[1]['name'] == 'Tag 2'

    def test_format_markdown(self, config, mock_env):
        """``format_markdown`` emits the expected heading line for each post field."""
        scraper = WordPressScraper(config)

        # Mock enriched posts data
        posts = [
            {
                'id': 1,
                'title': {'rendered': 'Test Post 1'},
                'author_name': 'John Doe',
                'date': '2024-01-01T12:00:00',
                'content': {'rendered': 'Test content 1\n'},
                'link': 'https://hvacknowitall.com/test-post-1',
                'category_names': ['HVAC', 'Tips'],
                'tag_names': ['maintenance', 'diy'],
                'word_count': 500
            }
        ]

        markdown = scraper.format_markdown(posts)

        assert '# ID: 1' in markdown
        assert '## Title: Test Post 1' in markdown
        assert '## Author: John Doe' in markdown
        assert '## Publish Date: 2024-01-01T12:00:00' in markdown
        assert '## Word Count: 500' in markdown
        assert '## Categories: HVAC, Tips' in markdown
        assert '## Tags: maintenance, diy' in markdown
        assert '## Permalink: https://hvacknowitall.com/test-post-1' in markdown

    def test_get_incremental_items(self, config, mock_env):
        """Empty state returns every post; a saved state returns only the newer post."""
        scraper = WordPressScraper(config)
        posts = [
            {'id': 3, 'date': '2024-01-03T12:00:00'},
            {'id': 2, 'date': '2024-01-02T12:00:00'},
            {'id': 1, 'date': '2024-01-01T12:00:00'}
        ]

        # Test with no previous state
        state = {}
        new_posts = scraper.get_incremental_items(posts, state)
        assert len(new_posts) == 3

        # Test with existing state
        state = {'last_post_id': 2, 'last_post_date': '2024-01-02T12:00:00'}
        new_posts = scraper.get_incremental_items(posts, state)
        assert len(new_posts) == 1
        assert new_posts[0]['id'] == 3

    def test_update_state(self, config, mock_env):
        """``update_state`` records the id and date of the first post in the list."""
        scraper = WordPressScraper(config)
        state = {}
        posts = [
            {'id': 3, 'date': '2024-01-03T12:00:00'},
            {'id': 2, 'date': '2024-01-02T12:00:00'}
        ]

        updated_state = scraper.update_state(state, posts)

        assert updated_state['last_post_id'] == 3
        assert updated_state['last_post_date'] == '2024-01-03T12:00:00'

    # Stacked patches are applied bottom-up, so the mock arguments arrive in
    # the order posts, author, categories, tags.
    @patch('src.wordpress_scraper.WordPressScraper.fetch_tags')
    @patch('src.wordpress_scraper.WordPressScraper.fetch_categories')
    @patch('src.wordpress_scraper.WordPressScraper.fetch_author')
    @patch('src.wordpress_scraper.WordPressScraper.fetch_posts')
    def test_fetch_content(self, mock_fetch_posts, mock_fetch_author,
                           mock_fetch_categories, mock_fetch_tags,
                           config, mock_env):
        """``fetch_content`` enriches each post with author, category, tag and word-count fields."""
        # Setup mock returns
        mock_fetch_posts.return_value = [
            {
                'id': 1,
                'title': {'rendered': 'Test Post'},
                'author': 1,
                'categories': [1],
                'tags': [2],
                'content': {'rendered': 'Content\n'}
            }
        ]
        mock_fetch_author.return_value = {'name': 'John Doe'}
        mock_fetch_categories.return_value = [{'name': 'HVAC'}]
        mock_fetch_tags.return_value = [{'name': 'tips'}]

        scraper = WordPressScraper(config)
        content = scraper.fetch_content()

        assert len(content) == 1
        assert content[0]['author_name'] == 'John Doe'
        assert content[0]['category_names'] == ['HVAC']
        assert content[0]['tag_names'] == ['tips']
        assert content[0]['word_count'] > 0