- Created unit tests for BaseScraper with mocking
- Added integration tests for parallel processing
- Created end-to-end tests with realistic mock data
- Fixed initialization order in BaseScraper (logger before user agent)
- Fixed orchestrator method name (archive_current_file)
- Added tenacity dependency for retry logic
- Validated parallel processing performance and overlap detection
- Confirmed spec-compliant markdown formatting in tests

Tests cover:
- Base scraper functionality (state, markdown, retry logic, media downloads)
- Parallel vs sequential execution timing
- Error isolation between scrapers
- Directory structure creation
- State management across runs
- Full workflow with realistic data

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
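The retry logic mentioned above is built on tenacity's decorator pattern. A minimal sketch, assuming the scrapers guard their HTTP fetches roughly like this (the class name, backoff values, and exception choice are illustrative, not taken from the actual BaseScraper):

```python
# Illustrative sketch only; the real BaseScraper wiring may differ.
import requests
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential


class RetryingFetcher:
    """Hypothetical stand-in showing the retry pattern used by the scrapers."""

    @retry(
        stop=stop_after_attempt(3),                          # give up after three tries
        wait=wait_exponential(multiplier=1, min=2, max=30),  # 2s, 4s, 8s... capped at 30s
        retry=retry_if_exception_type(requests.RequestException),
    )
    def fetch(self, url: str) -> requests.Response:
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # HTTP errors raise, which triggers a retry
        return response
```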
#!/usr/bin/env python3
"""
End-to-end tests with mock data for full workflow validation
"""

import pytest
import json
import time
import tempfile
from pathlib import Path
from unittest.mock import Mock, patch, MagicMock
import requests

# Add project to path
import sys
sys.path.insert(0, str(Path(__file__).parent.parent))

from src.orchestrator import ContentOrchestrator
from src.base_scraper import BaseScraper, ScraperConfig
from src.rss_scraper import BaseRSSScraper


class MockEndToEndScraper(BaseScraper):
    """End-to-end mock scraper with realistic data"""

    def __init__(self, config: ScraperConfig, mock_data: list):
        super().__init__(config)
        self.mock_data = mock_data

    def fetch_content(self):
        return self.mock_data

    def get_incremental_items(self, items, state):
        if not state.get('last_id'):
            return items

        # Find items after last_id
        last_seen = False
        new_items = []
        for item in items:
            if last_seen:
                new_items.append(item)
            elif item['id'] == state['last_id']:
                last_seen = True
        return new_items

    def update_state(self, state, items):
        if items:
            state['last_id'] = items[-1]['id']
        return state
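    # Worked example of the incremental contract above (illustrative values):
    #   items = [{'id': 'a'}, {'id': 'b'}, {'id': 'c'}], state = {'last_id': 'b'}
    #   get_incremental_items -> [{'id': 'c'}]; update_state then sets last_id to 'c'.
    # A state without 'last_id' means a first run, so all items are returned.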


class TestEndToEnd:
    """End-to-end workflow tests"""

    @pytest.fixture
    def temp_config(self):
        """Create temporary config for testing"""
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)
            config = ScraperConfig(
                source_name="e2e_test",
                brand_name="hvacknowitall",
                data_dir=temp_path / "data",
                logs_dir=temp_path / "logs",
                timezone="America/Halifax"
            )
            yield config

    @pytest.fixture
    def mock_wordpress_data(self):
        """Mock WordPress blog post data"""
        return [
            {
                'id': 'wp_1001',
                'title': 'Understanding HVAC System Efficiency',
                'url': 'https://hvacknowitall.com/hvac-efficiency/',
                'description': 'A comprehensive guide to improving your HVAC system efficiency and reducing energy costs.',
                'author': 'HVAC Expert',
                'publish_date': '2024-01-15T10:30:00',
                'word_count': 1250,
                'tags': ['efficiency', 'energy-saving', 'maintenance'],
                'categories': ['HVAC Tips', 'Energy Efficiency']
            },
            {
                'id': 'wp_1002',
                'title': 'Common HVAC Problems in Winter',
                'url': 'https://hvacknowitall.com/winter-hvac-problems/',
                'description': 'Identify and troubleshoot the most common HVAC issues during cold weather.',
                'author': 'HVAC Expert',
                'publish_date': '2024-01-20T14:15:00',
                'word_count': 980,
                'tags': ['troubleshooting', 'winter', 'maintenance'],
                'categories': ['Troubleshooting', 'Seasonal Tips']
            }
        ]

    @pytest.fixture
    def mock_youtube_data(self):
        """Mock YouTube video data"""
        return [
            {
                'id': 'yt_abc123',
                'title': 'How to Replace Your Air Filter - DIY HVAC',
                'url': 'https://youtube.com/watch?v=abc123',
                'description': 'Step-by-step guide to replacing your HVAC air filter. Save money and improve air quality!',
                'author': 'HVAC Know It All',
                'type': 'video',
                'views': 15420,
                'likes': 247,
                'comments': 18,
                'shares': 12,
                'duration': '8:45',
                'tags': ['DIY', 'air filter', 'maintenance']
            },
            {
                'id': 'yt_def456',
                'title': 'HVAC Short: Quick Thermostat Tip',
                'url': 'https://youtube.com/shorts/def456',
                'description': 'Quick tip for optimizing your thermostat settings.',
                'author': 'HVAC Know It All',
                'type': 'short',
                'views': 8934,
                'likes': 156,
                'comments': 7,
                'shares': 23,
                'duration': '0:58',
                'tags': ['thermostat', 'tips', 'energy-saving']
            }
        ]

    @pytest.fixture
    def mock_podcast_data(self):
        """Mock podcast episode data"""
        return [
            {
                'id': 'pod_ep101',
                'title': 'Episode 101: Heat Pump vs Furnace Debate',
                'url': 'https://hvacknowitall.com/podcast/ep101/',
                'description': 'We dive deep into the pros and cons of heat pumps versus traditional furnaces.',
                'author': 'HVAC Know It All Podcast',
                'audio_link': 'https://hvacknowitall.com/podcast/ep101.mp3',
                'duration': '45:32',
                'publish_date': '2024-01-18T09:00:00',
                'image': 'https://hvacknowitall.com/podcast/ep101-cover.jpg',
                'tags': ['heat pump', 'furnace', 'comparison']
            }
        ]

    @pytest.fixture
    def orchestrator_with_mock_data(self, temp_config, mock_wordpress_data, mock_youtube_data, mock_podcast_data):
        """Create orchestrator with realistic mock data"""
        orchestrator = ContentOrchestrator(temp_config.data_dir, temp_config.logs_dir)

        # Replace scrapers with mock versions
        orchestrator.scrapers = {
            'wordpress': MockEndToEndScraper(
                ScraperConfig(
                    source_name="wordpress",
                    brand_name=temp_config.brand_name,
                    data_dir=temp_config.data_dir,
                    logs_dir=temp_config.logs_dir,
                    timezone=temp_config.timezone
                ),
                mock_wordpress_data
            ),
            'youtube': MockEndToEndScraper(
                ScraperConfig(
                    source_name="youtube",
                    brand_name=temp_config.brand_name,
                    data_dir=temp_config.data_dir,
                    logs_dir=temp_config.logs_dir,
                    timezone=temp_config.timezone
                ),
                mock_youtube_data
            ),
            'podcast': MockEndToEndScraper(
                ScraperConfig(
                    source_name="podcast",
                    brand_name=temp_config.brand_name,
                    data_dir=temp_config.data_dir,
                    logs_dir=temp_config.logs_dir,
                    timezone=temp_config.timezone
                ),
                mock_podcast_data
            )
        }

        return orchestrator

    def test_full_workflow_execution(self, orchestrator_with_mock_data):
        """Test complete workflow from start to finish"""
        orchestrator = orchestrator_with_mock_data

        # Run full workflow
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)

        # Verify markdown files were created
        markdown_dir = orchestrator.config.data_dir / "markdown_current"
        markdown_files = list(markdown_dir.glob("*.md"))
        assert len(markdown_files) == 3

        # Verify file naming convention
        for file_path in markdown_files:
            filename = file_path.name
            assert filename.startswith("hvacknowitall_")
            assert any(source in filename for source in ['wordpress', 'youtube', 'podcast'])
            assert ".md" in filename
            # Check timestamp format (YYYY-MM-DD-THHMMSS)
            assert len(filename.split('_')) >= 3

    def test_markdown_format_compliance(self, orchestrator_with_mock_data):
        """Test that generated markdown follows specification exactly"""
        orchestrator = orchestrator_with_mock_data
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)

        # Check each markdown file
        markdown_files = list((orchestrator.config.data_dir / "markdown_current").glob("*.md"))

        for file_path in markdown_files:
            content = file_path.read_text(encoding='utf-8')

            # Verify spec format for each item
            assert "# ID:" in content
            assert "## Title:" in content
            assert "## Type:" in content
            assert "## Permalink:" in content
            assert "## Description:" in content
            assert "## Metadata:" in content
            assert "### Comments:" in content
            assert "### Likes:" in content
            assert "### Tags:" in content

            # Verify separator between items
            if content.count("# ID:") > 1:
                assert "--------------" in content

            # Verify specific content based on source
            if "wordpress" in file_path.name:
                assert "Understanding HVAC System Efficiency" in content
                assert "energy-saving" in content
                assert "1250" in content  # word count should be preserved
            elif "youtube" in file_path.name:
                assert "How to Replace Your Air Filter" in content
                assert "15420" in content  # view count
                assert "247" in content  # like count
            elif "podcast" in file_path.name:
                assert "Heat Pump vs Furnace Debate" in content
                assert "45:32" in content  # duration

    def test_directory_structure_creation(self, orchestrator_with_mock_data):
        """Test that proper directory structure is created"""
        orchestrator = orchestrator_with_mock_data
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)

        base_dir = orchestrator.config.data_dir
        logs_dir = orchestrator.config.logs_dir

        # Check main directories
        assert (base_dir / "markdown_current").exists()
        assert (base_dir / "markdown_archives").exists()
        assert (base_dir / "media").exists()
        assert (base_dir / ".state").exists()

        # Check source-specific directories
        sources = ['Wordpress', 'Youtube', 'Podcast']
        for source in sources:
            assert (base_dir / "markdown_archives" / source).exists()
            assert (base_dir / "media" / source).exists()
            assert (logs_dir / source).exists()

    def test_state_persistence_workflow(self, orchestrator_with_mock_data):
        """Test incremental updates with state persistence"""
        orchestrator = orchestrator_with_mock_data

        # First run - should process all items
        orchestrator.run_all_scrapers(parallel=False)

        # Check state files were created
        state_files = list((orchestrator.config.data_dir / ".state").glob("*_state.json"))
        assert len(state_files) == 3

        # Verify state content
        wordpress_state_file = orchestrator.config.data_dir / ".state" / "wordpress_state.json"
        assert wordpress_state_file.exists()

        state_data = json.loads(wordpress_state_file.read_text())
        assert 'last_id' in state_data
        assert 'last_update' in state_data
        assert 'last_item_count' in state_data
        assert state_data['last_id'] == 'wp_1002'  # Last item ID
        assert state_data['last_item_count'] == 2  # Both items processed

        # Add new item to WordPress scraper
        new_item = {
            'id': 'wp_1003',
            'title': 'New HVAC Article',
            'url': 'https://hvacknowitall.com/new-article/',
            'description': 'Brand new article about HVAC.',
            'author': 'HVAC Expert',
            'tags': ['new'],
            'categories': ['News']
        }
        orchestrator.scrapers['wordpress'].mock_data.append(new_item)

        # Archive existing files to simulate next run
        for scraper in orchestrator.scrapers.values():
            scraper.archive_current_file()

        # Second run - should only process new item
        orchestrator.run_all_scrapers(parallel=False)

        # Check that only incremental content was processed
        updated_state = json.loads(wordpress_state_file.read_text())
        assert updated_state['last_id'] == 'wp_1003'
        assert updated_state['last_item_count'] == 1  # Only new item

        # Verify new markdown contains only new item
        new_markdown_files = list((orchestrator.config.data_dir / "markdown_current").glob("*wordpress*.md"))
        assert len(new_markdown_files) == 1

        new_content = new_markdown_files[0].read_text()
        assert "New HVAC Article" in new_content
        assert "Understanding HVAC System Efficiency" not in new_content  # Old content not repeated

    @patch('src.orchestrator.ContentOrchestrator.sync_to_nas')
    def test_nas_sync_integration(self, mock_sync, orchestrator_with_mock_data):
        """Test NAS sync is called with correct parameters"""
        orchestrator = orchestrator_with_mock_data
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)

        # Verify sync was called
        mock_sync.assert_called_once()

    def test_error_recovery_workflow(self, orchestrator_with_mock_data):
        """Test that workflow continues when one source fails"""
        orchestrator = orchestrator_with_mock_data

        # Make YouTube scraper fail
        def failing_run():
            raise Exception("YouTube API error")

        orchestrator.scrapers['youtube'].run = failing_run

        # Run workflow - should not crash
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)

        # Verify other sources still completed
        markdown_files = list((orchestrator.config.data_dir / "markdown_current").glob("*.md"))

        # Should have 2 files (WordPress and Podcast), not 3
        assert len(markdown_files) == 2

        source_names = [f.name for f in markdown_files]
        assert any('wordpress' in name for name in source_names)
        assert any('podcast' in name for name in source_names)
        assert not any('youtube' in name for name in source_names)

    def test_logging_integration(self, orchestrator_with_mock_data):
        """Test that comprehensive logging works throughout workflow"""
        orchestrator = orchestrator_with_mock_data
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)

        # Check log files exist for each source
        sources = ['wordpress', 'youtube', 'podcast']
        for source in sources:
            log_file = orchestrator.config.logs_dir / source.title() / f"{source}.log"
            assert log_file.exists()

            log_content = log_file.read_text()
            assert f"Starting {source} scraper" in log_content
            assert "Successfully processed" in log_content
            assert "Saved markdown to" in log_content

    @patch('requests.Session.request')
    def test_media_download_workflow(self, mock_request, orchestrator_with_mock_data):
        """Test media downloading integration"""
        # Mock successful media download
        mock_response = Mock()
        mock_response.status_code = 200
        mock_response.iter_content.return_value = [b'fake image data']
        mock_request.return_value = mock_response

        # Add media URLs to mock data
        orchestrator = orchestrator_with_mock_data
        podcast_scraper = orchestrator.scrapers['podcast']
        podcast_scraper.mock_data[0]['image'] = 'https://example.com/podcast-cover.jpg'

        # Wrap the scraper's run() so it also exercises download_media
        original_run = podcast_scraper.run

        def run_with_media():
            original_run()
            # Simulate media download
            for item in podcast_scraper.mock_data:
                if 'image' in item:
                    podcast_scraper.download_media(item['image'], item['id'], 'image')

        podcast_scraper.run = run_with_media

        orchestrator.run_all_scrapers(parallel=False)

        # Verify media directory and file
        media_dir = orchestrator.config.data_dir / "media" / "Podcast"
        assert media_dir.exists()

        # Check if media file was created
        media_files = list(media_dir.glob("*"))
        if media_files:  # Media download attempted
            assert len(media_files) > 0

    def test_archive_workflow(self, orchestrator_with_mock_data):
        """Test that archiving works correctly in full workflow"""
        orchestrator = orchestrator_with_mock_data

        # Create some existing files to archive
        current_dir = orchestrator.config.data_dir / "markdown_current"
        current_dir.mkdir(parents=True, exist_ok=True)

        old_file = current_dir / "hvacknowitall_wordpress_2024-01-01-T120000.md"
        old_file.write_text("Old content")

        # Run workflow
        orchestrator.run_all_scrapers(parallel=False)

        # Check that old file was archived
        archive_dir = orchestrator.config.data_dir / "markdown_archives" / "Wordpress"
        archived_files = list(archive_dir.glob("*.md"))

        # Should have archived the old file
        assert any("2024-01-01" in f.name for f in archived_files)

        # Current directory should have new files
        current_files = list(current_dir.glob("*.md"))
        assert len(current_files) == 3  # One per source
        assert not any("2024-01-01" in f.name for f in current_files)


if __name__ == '__main__':
    pytest.main([__file__, '-v'])