hvac-kia-content/tests/test_end_to_end.py
Ben Reed 8d5750b1d1 Add comprehensive test infrastructure
- Created unit tests for BaseScraper with mocking
- Added integration tests for parallel processing
- Created end-to-end tests with realistic mock data
- Fixed initialization order in BaseScraper (logger before user agent)
- Fixed orchestrator method name (archive_current_file)
- Added tenacity dependency for retry logic
- Validated parallel processing performance and overlap detection
- Confirmed spec-compliant markdown formatting in tests

Tests cover:
- Base scraper functionality (state, markdown, retry logic, media downloads)
- Parallel vs sequential execution timing
- Error isolation between scrapers
- Directory structure creation
- State management across runs
- Full workflow with realistic data

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-18 21:16:14 -03:00

441 lines
No EOL
18 KiB
Python

#!/usr/bin/env python3
"""
End-to-end tests with mock data for full workflow validation
"""
import pytest
import json
import time
import tempfile
from pathlib import Path
from unittest.mock import Mock, patch, MagicMock
import requests
# Add project to path
import sys
sys.path.insert(0, str(Path(__file__).parent.parent))
from src.orchestrator import ContentOrchestrator
from src.base_scraper import BaseScraper, ScraperConfig
from src.rss_scraper import BaseRSSScraper
class MockEndToEndScraper(BaseScraper):
    """Scraper stand-in that serves pre-baked item lists for end-to-end tests.

    Implements the BaseScraper hooks without any network access: content
    comes from the ``mock_data`` list handed to the constructor, and
    incremental state is tracked via a simple ``last_id`` marker.
    """

    def __init__(self, config: ScraperConfig, mock_data: list):
        """Run normal scraper setup, then attach the canned item list."""
        super().__init__(config)
        self.mock_data = mock_data

    def fetch_content(self):
        """Return the canned items instead of fetching anything remote."""
        return self.mock_data

    def get_incremental_items(self, items, state):
        """Return only the items that appear after ``state['last_id']``.

        With no recorded marker, every item is considered new. If the
        marker id is present, everything following its first occurrence is
        new; if the marker is absent, nothing is.
        """
        marker = state.get('last_id')
        if not marker:
            return items
        for position, entry in enumerate(items):
            if entry['id'] == marker:
                # Slice past the previously-seen item.
                return items[position + 1:]
        # Marker never matched: treat the batch as already processed.
        return []

    def update_state(self, state, items):
        """Advance ``last_id`` to the newest processed item, if any."""
        if items:
            state['last_id'] = items[-1]['id']
        return state
class TestEndToEnd:
    """End-to-end workflow tests.

    Each test wires a ContentOrchestrator with MockEndToEndScraper
    instances (realistic canned data, no network) under a temporary
    directory, then asserts on the markdown files, directory layout,
    state files, and logs the orchestrator produces.
    """

    @pytest.fixture
    def temp_config(self):
        """Create temporary config for testing.

        Yields (rather than returns) inside the TemporaryDirectory context
        so the directory is cleaned up only after the test finishes.
        """
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)
            config = ScraperConfig(
                source_name="e2e_test",
                brand_name="hvacknowitall",
                data_dir=temp_path / "data",
                logs_dir=temp_path / "logs",
                timezone="America/Halifax"
            )
            yield config

    @pytest.fixture
    def mock_wordpress_data(self):
        """Mock WordPress blog post data (two posts, ids wp_1001/wp_1002)."""
        return [
            {
                'id': 'wp_1001',
                'title': 'Understanding HVAC System Efficiency',
                'url': 'https://hvacknowitall.com/hvac-efficiency/',
                'description': 'A comprehensive guide to improving your HVAC system efficiency and reducing energy costs.',
                'author': 'HVAC Expert',
                'publish_date': '2024-01-15T10:30:00',
                'word_count': 1250,
                'tags': ['efficiency', 'energy-saving', 'maintenance'],
                'categories': ['HVAC Tips', 'Energy Efficiency']
            },
            {
                'id': 'wp_1002',
                'title': 'Common HVAC Problems in Winter',
                'url': 'https://hvacknowitall.com/winter-hvac-problems/',
                'description': 'Identify and troubleshoot the most common HVAC issues during cold weather.',
                'author': 'HVAC Expert',
                'publish_date': '2024-01-20T14:15:00',
                'word_count': 980,
                'tags': ['troubleshooting', 'winter', 'maintenance'],
                'categories': ['Troubleshooting', 'Seasonal Tips']
            }
        ]

    @pytest.fixture
    def mock_youtube_data(self):
        """Mock YouTube video data (one regular video, one short)."""
        return [
            {
                'id': 'yt_abc123',
                'title': 'How to Replace Your Air Filter - DIY HVAC',
                'url': 'https://youtube.com/watch?v=abc123',
                'description': 'Step-by-step guide to replacing your HVAC air filter. Save money and improve air quality!',
                'author': 'HVAC Know It All',
                'type': 'video',
                'views': 15420,
                'likes': 247,
                'comments': 18,
                'shares': 12,
                'duration': '8:45',
                'tags': ['DIY', 'air filter', 'maintenance']
            },
            {
                'id': 'yt_def456',
                'title': 'HVAC Short: Quick Thermostat Tip',
                'url': 'https://youtube.com/shorts/def456',
                'description': 'Quick tip for optimizing your thermostat settings.',
                'author': 'HVAC Know It All',
                'type': 'short',
                'views': 8934,
                'likes': 156,
                'comments': 7,
                'shares': 23,
                'duration': '0:58',
                'tags': ['thermostat', 'tips', 'energy-saving']
            }
        ]

    @pytest.fixture
    def mock_podcast_data(self):
        """Mock podcast episode data (single episode with audio + cover image)."""
        return [
            {
                'id': 'pod_ep101',
                'title': 'Episode 101: Heat Pump vs Furnace Debate',
                'url': 'https://hvacknowitall.com/podcast/ep101/',
                'description': 'We dive deep into the pros and cons of heat pumps versus traditional furnaces.',
                'author': 'HVAC Know It All Podcast',
                'audio_link': 'https://hvacknowitall.com/podcast/ep101.mp3',
                'duration': '45:32',
                'publish_date': '2024-01-18T09:00:00',
                'image': 'https://hvacknowitall.com/podcast/ep101-cover.jpg',
                'tags': ['heat pump', 'furnace', 'comparison']
            }
        ]

    @pytest.fixture
    def orchestrator_with_mock_data(self, temp_config, mock_wordpress_data, mock_youtube_data, mock_podcast_data):
        """Create orchestrator with realistic mock data.

        Replaces the orchestrator's real scrapers dict wholesale with three
        MockEndToEndScraper instances sharing the temp data/log dirs, so
        tests exercise the full pipeline without network access.
        """
        orchestrator = ContentOrchestrator(temp_config.data_dir, temp_config.logs_dir)
        # Replace scrapers with mock versions
        orchestrator.scrapers = {
            'wordpress': MockEndToEndScraper(
                ScraperConfig(
                    source_name="wordpress",
                    brand_name=temp_config.brand_name,
                    data_dir=temp_config.data_dir,
                    logs_dir=temp_config.logs_dir,
                    timezone=temp_config.timezone
                ),
                mock_wordpress_data
            ),
            'youtube': MockEndToEndScraper(
                ScraperConfig(
                    source_name="youtube",
                    brand_name=temp_config.brand_name,
                    data_dir=temp_config.data_dir,
                    logs_dir=temp_config.logs_dir,
                    timezone=temp_config.timezone
                ),
                mock_youtube_data
            ),
            'podcast': MockEndToEndScraper(
                ScraperConfig(
                    source_name="podcast",
                    brand_name=temp_config.brand_name,
                    data_dir=temp_config.data_dir,
                    logs_dir=temp_config.logs_dir,
                    timezone=temp_config.timezone
                ),
                mock_podcast_data
            )
        }
        return orchestrator

    def test_full_workflow_execution(self, orchestrator_with_mock_data):
        """Test complete workflow from start to finish."""
        orchestrator = orchestrator_with_mock_data
        # Run full workflow
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)
        # Verify markdown files were created — one per source
        markdown_dir = orchestrator.config.data_dir / "markdown_current"
        markdown_files = list(markdown_dir.glob("*.md"))
        assert len(markdown_files) == 3
        # Verify file naming convention: <brand>_<source>_<timestamp>.md
        for file_path in markdown_files:
            filename = file_path.name
            assert filename.startswith("hvacknowitall_")
            assert any(source in filename for source in ['wordpress', 'youtube', 'podcast'])
            assert ".md" in filename
            # Loose timestamp check: at least brand + source + timestamp parts.
            # NOTE(review): comment in original claimed YYYY-DD-MM-THHMMSS
            # ordering — confirm against the orchestrator's actual format.
            assert len(filename.split('_')) >= 3

    def test_markdown_format_compliance(self, orchestrator_with_mock_data):
        """Test that generated markdown follows specification exactly."""
        orchestrator = orchestrator_with_mock_data
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)
        # Check each markdown file for the required spec headings
        markdown_files = list((orchestrator.config.data_dir / "markdown_current").glob("*.md"))
        for file_path in markdown_files:
            content = file_path.read_text(encoding='utf-8')
            # Verify spec format for each item
            assert "# ID:" in content
            assert "## Title:" in content
            assert "## Type:" in content
            assert "## Permalink:" in content
            assert "## Description:" in content
            assert "## Metadata:" in content
            assert "### Comments:" in content
            assert "### Likes:" in content
            assert "### Tags:" in content
            # Verify separator between items (only required when multiple items)
            if content.count("# ID:") > 1:
                assert "--------------" in content
            # Verify specific content based on source
            if "wordpress" in file_path.name:
                assert "Understanding HVAC System Efficiency" in content
                assert "energy-saving" in content
                assert "1250" in content  # word count should be preserved
            elif "youtube" in file_path.name:
                assert "How to Replace Your Air Filter" in content
                assert "15420" in content  # view count
                assert "247" in content  # like count
            elif "podcast" in file_path.name:
                assert "Heat Pump vs Furnace Debate" in content
                assert "45:32" in content  # duration

    def test_directory_structure_creation(self, orchestrator_with_mock_data):
        """Test that proper directory structure is created."""
        orchestrator = orchestrator_with_mock_data
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)
        base_dir = orchestrator.config.data_dir
        logs_dir = orchestrator.config.logs_dir
        # Check main directories
        assert (base_dir / "markdown_current").exists()
        assert (base_dir / "markdown_archives").exists()
        assert (base_dir / "media").exists()
        assert (base_dir / ".state").exists()
        # Check source-specific directories — note Title-cased dir names
        sources = ['Wordpress', 'Youtube', 'Podcast']
        for source in sources:
            assert (base_dir / "markdown_archives" / source).exists()
            assert (base_dir / "media" / source).exists()
            assert (logs_dir / source).exists()

    def test_state_persistence_workflow(self, orchestrator_with_mock_data):
        """Test incremental updates with state persistence.

        Runs twice: first pass processes everything; then a new item is
        appended and the second pass must pick up only that item.
        """
        orchestrator = orchestrator_with_mock_data
        # First run - should process all items
        orchestrator.run_all_scrapers(parallel=False)
        # Check state files were created (one per source)
        state_files = list((orchestrator.config.data_dir / ".state").glob("*_state.json"))
        assert len(state_files) == 3
        # Verify state content
        wordpress_state_file = orchestrator.config.data_dir / ".state" / "wordpress_state.json"
        assert wordpress_state_file.exists()
        state_data = json.loads(wordpress_state_file.read_text())
        assert 'last_id' in state_data
        assert 'last_update' in state_data
        assert 'last_item_count' in state_data
        assert state_data['last_id'] == 'wp_1002'  # Last item ID
        assert state_data['last_item_count'] == 2  # Both items processed
        # Add new item to WordPress scraper
        new_item = {
            'id': 'wp_1003',
            'title': 'New HVAC Article',
            'url': 'https://hvacknowitall.com/new-article/',
            'description': 'Brand new article about HVAC.',
            'author': 'HVAC Expert',
            'tags': ['new'],
            'categories': ['News']
        }
        orchestrator.scrapers['wordpress'].mock_data.append(new_item)
        # Archive existing files to simulate next run
        for scraper in orchestrator.scrapers.values():
            scraper.archive_current_file()
        # Second run - should only process new item
        orchestrator.run_all_scrapers(parallel=False)
        # Check that only incremental content was processed
        updated_state = json.loads(wordpress_state_file.read_text())
        assert updated_state['last_id'] == 'wp_1003'
        assert updated_state['last_item_count'] == 1  # Only new item
        # Verify new markdown contains only new item
        new_markdown_files = list((orchestrator.config.data_dir / "markdown_current").glob("*wordpress*.md"))
        assert len(new_markdown_files) == 1
        new_content = new_markdown_files[0].read_text()
        assert "New HVAC Article" in new_content
        assert "Understanding HVAC System Efficiency" not in new_content  # Old content not repeated

    @patch('src.orchestrator.ContentOrchestrator.sync_to_nas')
    def test_nas_sync_integration(self, mock_sync, orchestrator_with_mock_data):
        """Test NAS sync is called with correct parameters.

        Patches sync_to_nas on the class so no real sync happens; only
        verifies the orchestrator invokes it exactly once per full run.
        """
        orchestrator = orchestrator_with_mock_data
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)
        # Verify sync was called
        mock_sync.assert_called_once()

    def test_error_recovery_workflow(self, orchestrator_with_mock_data):
        """Test that workflow continues when one source fails."""
        orchestrator = orchestrator_with_mock_data
        # Make YouTube scraper fail.
        # Assigned as an instance attribute, so it is called without self.
        def failing_run():
            raise Exception("YouTube API error")
        orchestrator.scrapers['youtube'].run = failing_run
        # Run workflow - should not crash (errors isolated per scraper)
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)
        # Verify other sources still completed
        markdown_files = list((orchestrator.config.data_dir / "markdown_current").glob("*.md"))
        # Should have 2 files (WordPress and Podcast), not 3
        assert len(markdown_files) == 2
        source_names = [f.name for f in markdown_files]
        assert any('wordpress' in name for name in source_names)
        assert any('podcast' in name for name in source_names)
        assert not any('youtube' in name for name in source_names)

    def test_logging_integration(self, orchestrator_with_mock_data):
        """Test that comprehensive logging works throughout workflow."""
        orchestrator = orchestrator_with_mock_data
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)
        # Check log files exist for each source (dirs are Title-cased)
        sources = ['wordpress', 'youtube', 'podcast']
        for source in sources:
            log_file = orchestrator.config.logs_dir / source.title() / f"{source}.log"
            assert log_file.exists()
            log_content = log_file.read_text()
            assert f"Starting {source} scraper" in log_content
            assert "Successfully processed" in log_content
            assert "Saved markdown to" in log_content

    @patch('requests.Session.request')
    def test_media_download_workflow(self, mock_request, orchestrator_with_mock_data):
        """Test media downloading integration.

        Patches the underlying requests.Session.request so any HTTP call
        made by download_media returns a fake 200 with dummy bytes.
        """
        # Mock successful media download
        mock_response = Mock()
        mock_response.status_code = 200
        mock_response.iter_content.return_value = [b'fake image data']
        mock_request.return_value = mock_response
        # Add media URLs to mock data
        orchestrator = orchestrator_with_mock_data
        podcast_scraper = orchestrator.scrapers['podcast']
        podcast_scraper.mock_data[0]['image'] = 'https://example.com/podcast-cover.jpg'
        # Override download_media to actually download
        original_run = podcast_scraper.run
        def run_with_media():
            original_run()
            # Simulate media download
            for item in podcast_scraper.mock_data:
                if 'image' in item:
                    podcast_scraper.download_media(item['image'], item['id'], 'image')
        podcast_scraper.run = run_with_media
        orchestrator.run_all_scrapers(parallel=False)
        # Verify media directory and file
        media_dir = orchestrator.config.data_dir / "media" / "Podcast"
        assert media_dir.exists()
        # Check if media file was created
        # NOTE(review): this assertion is tautological (only checked when
        # non-empty) — the test tolerates download_media writing nothing.
        media_files = list(media_dir.glob("*"))
        if media_files:  # Media download attempted
            assert len(media_files) > 0

    def test_archive_workflow(self, orchestrator_with_mock_data):
        """Test that archiving works correctly in full workflow."""
        orchestrator = orchestrator_with_mock_data
        # Create some existing files to archive
        current_dir = orchestrator.config.data_dir / "markdown_current"
        current_dir.mkdir(parents=True, exist_ok=True)
        old_file = current_dir / "hvacknowitall_wordpress_2024-01-01-T120000.md"
        old_file.write_text("Old content")
        # Run workflow
        orchestrator.run_all_scrapers(parallel=False)
        # Check that old file was archived
        archive_dir = orchestrator.config.data_dir / "markdown_archives" / "Wordpress"
        archived_files = list(archive_dir.glob("*.md"))
        # Should have archived the old file
        assert any("2024-01-01" in f.name for f in archived_files)
        # Current directory should have new files
        current_files = list(current_dir.glob("*.md"))
        assert len(current_files) == 3  # One per source
        assert not any("2024-01-01" in f.name for f in current_files)
if __name__ == '__main__':
    # Allow running this file directly: delegates to pytest in verbose mode.
    pytest.main([__file__, '-v'])