#!/usr/bin/env python3
"""
End-to-end tests with mock data for full workflow validation
"""
import pytest
import json
import time
import tempfile
from pathlib import Path
from unittest.mock import Mock, patch, MagicMock
import requests

# Add project to path
import sys
sys.path.insert(0, str(Path(__file__).parent.parent))

from src.orchestrator import ContentOrchestrator
from src.base_scraper import BaseScraper, ScraperConfig
from src.rss_scraper import BaseRSSScraper


class MockEndToEndScraper(BaseScraper):
    """End-to-end mock scraper with realistic data.

    Replaces a real network-backed scraper: `fetch_content` returns the
    injected `mock_data` list, while the incremental/state methods mimic
    the cursor-style behavior the orchestrator expects.
    """

    def __init__(self, config: ScraperConfig, mock_data: list):
        super().__init__(config)
        self.mock_data = mock_data

    def fetch_content(self):
        # No network: just hand back the canned items.
        return self.mock_data

    def get_incremental_items(self, items, state):
        """Return only the items that appear after state['last_id'].

        If no cursor is stored yet, everything is new. If the stored id is
        not found in `items`, an empty list is returned (nothing after it).
        """
        if not state.get('last_id'):
            return items
        # Find items after last_id
        last_seen = False
        new_items = []
        for item in items:
            if last_seen:
                new_items.append(item)
            elif item['id'] == state['last_id']:
                last_seen = True
        return new_items

    def update_state(self, state, items):
        """Advance the cursor to the id of the last processed item."""
        if items:
            state['last_id'] = items[-1]['id']
        return state


class TestEndToEnd:
    """End-to-end workflow tests"""

    @pytest.fixture
    def temp_config(self):
        """Create temporary config for testing"""
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)
            config = ScraperConfig(
                source_name="e2e_test",
                brand_name="hvacknowitall",
                data_dir=temp_path / "data",
                logs_dir=temp_path / "logs",
                timezone="America/Halifax"
            )
            # yield (not return) so the TemporaryDirectory survives the test.
            yield config

    @pytest.fixture
    def mock_wordpress_data(self):
        """Mock WordPress blog post data"""
        return [
            {
                'id': 'wp_1001',
                'title': 'Understanding HVAC System Efficiency',
                'url': 'https://hvacknowitall.com/hvac-efficiency/',
                'description': 'A comprehensive guide to improving your HVAC system efficiency and reducing energy costs.',
                'author': 'HVAC Expert',
                'publish_date': '2024-01-15T10:30:00',
                'word_count': 1250,
                'tags': ['efficiency', 'energy-saving', 'maintenance'],
                'categories': ['HVAC Tips', 'Energy Efficiency']
            },
            {
                'id': 'wp_1002',
                'title': 'Common HVAC Problems in Winter',
                'url': 'https://hvacknowitall.com/winter-hvac-problems/',
                'description': 'Identify and troubleshoot the most common HVAC issues during cold weather.',
                'author': 'HVAC Expert',
                'publish_date': '2024-01-20T14:15:00',
                'word_count': 980,
                'tags': ['troubleshooting', 'winter', 'maintenance'],
                'categories': ['Troubleshooting', 'Seasonal Tips']
            }
        ]

    @pytest.fixture
    def mock_youtube_data(self):
        """Mock YouTube video data"""
        return [
            {
                'id': 'yt_abc123',
                'title': 'How to Replace Your Air Filter - DIY HVAC',
                'url': 'https://youtube.com/watch?v=abc123',
                'description': 'Step-by-step guide to replacing your HVAC air filter. Save money and improve air quality!',
                'author': 'HVAC Know It All',
                'type': 'video',
                'views': 15420,
                'likes': 247,
                'comments': 18,
                'shares': 12,
                'duration': '8:45',
                'tags': ['DIY', 'air filter', 'maintenance']
            },
            {
                'id': 'yt_def456',
                'title': 'HVAC Short: Quick Thermostat Tip',
                'url': 'https://youtube.com/shorts/def456',
                'description': 'Quick tip for optimizing your thermostat settings.',
                'author': 'HVAC Know It All',
                'type': 'short',
                'views': 8934,
                'likes': 156,
                'comments': 7,
                'shares': 23,
                'duration': '0:58',
                'tags': ['thermostat', 'tips', 'energy-saving']
            }
        ]

    @pytest.fixture
    def mock_podcast_data(self):
        """Mock podcast episode data"""
        return [
            {
                'id': 'pod_ep101',
                'title': 'Episode 101: Heat Pump vs Furnace Debate',
                'url': 'https://hvacknowitall.com/podcast/ep101/',
                'description': 'We dive deep into the pros and cons of heat pumps versus traditional furnaces.',
                'author': 'HVAC Know It All Podcast',
                'audio_link': 'https://hvacknowitall.com/podcast/ep101.mp3',
                'duration': '45:32',
                'publish_date': '2024-01-18T09:00:00',
                'image': 'https://hvacknowitall.com/podcast/ep101-cover.jpg',
                'tags': ['heat pump', 'furnace', 'comparison']
            }
        ]

    @pytest.fixture
    def orchestrator_with_mock_data(self, temp_config, mock_wordpress_data,
                                    mock_youtube_data, mock_podcast_data):
        """Create orchestrator with realistic mock data"""
        orchestrator = ContentOrchestrator(temp_config.data_dir, temp_config.logs_dir)

        # Replace scrapers with mock versions
        orchestrator.scrapers = {
            'wordpress': MockEndToEndScraper(
                ScraperConfig(
                    source_name="wordpress",
                    brand_name=temp_config.brand_name,
                    data_dir=temp_config.data_dir,
                    logs_dir=temp_config.logs_dir,
                    timezone=temp_config.timezone
                ),
                mock_wordpress_data
            ),
            'youtube': MockEndToEndScraper(
                ScraperConfig(
                    source_name="youtube",
                    brand_name=temp_config.brand_name,
                    data_dir=temp_config.data_dir,
                    logs_dir=temp_config.logs_dir,
                    timezone=temp_config.timezone
                ),
                mock_youtube_data
            ),
            'podcast': MockEndToEndScraper(
                ScraperConfig(
                    source_name="podcast",
                    brand_name=temp_config.brand_name,
                    data_dir=temp_config.data_dir,
                    logs_dir=temp_config.logs_dir,
                    timezone=temp_config.timezone
                ),
                mock_podcast_data
            )
        }
        return orchestrator

    def test_full_workflow_execution(self, orchestrator_with_mock_data):
        """Test complete workflow from start to finish"""
        orchestrator = orchestrator_with_mock_data

        # Run full workflow
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)

        # Verify markdown files were created
        markdown_dir = orchestrator.config.data_dir / "markdown_current"
        markdown_files = list(markdown_dir.glob("*.md"))
        assert len(markdown_files) == 3

        # Verify file naming convention
        for file_path in markdown_files:
            filename = file_path.name
            assert filename.startswith("hvacknowitall_")
            assert any(source in filename for source in ['wordpress', 'youtube', 'podcast'])
            assert ".md" in filename
            # Check timestamp format (YYYY-MM-DD-THHMMSS)
            assert len(filename.split('_')) >= 3

    def test_markdown_format_compliance(self, orchestrator_with_mock_data):
        """Test that generated markdown follows specification exactly"""
        orchestrator = orchestrator_with_mock_data
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)

        # Check each markdown file
        markdown_files = list((orchestrator.config.data_dir / "markdown_current").glob("*.md"))
        for file_path in markdown_files:
            content = file_path.read_text(encoding='utf-8')

            # Verify spec format for each item
            assert "# ID:" in content
            assert "## Title:" in content
            assert "## Type:" in content
            assert "## Permalink:" in content
            assert "## Description:" in content
            assert "## Metadata:" in content
            assert "### Comments:" in content
            assert "### Likes:" in content
            assert "### Tags:" in content

            # Verify separator between items
            if content.count("# ID:") > 1:
                assert "--------------" in content

            # Verify specific content based on source
            if "wordpress" in file_path.name:
                assert "Understanding HVAC System Efficiency" in content
                assert "energy-saving" in content
                assert "1250" in content  # word count should be preserved
            elif "youtube" in file_path.name:
                assert "How to Replace Your Air Filter" in content
                assert "15420" in content  # view count
                assert "247" in content  # like count
            elif "podcast" in file_path.name:
                assert "Heat Pump vs Furnace Debate" in content
                assert "45:32" in content  # duration

    def test_directory_structure_creation(self, orchestrator_with_mock_data):
        """Test that proper directory structure is created"""
        orchestrator = orchestrator_with_mock_data
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)

        base_dir = orchestrator.config.data_dir
        logs_dir = orchestrator.config.logs_dir

        # Check main directories
        assert (base_dir / "markdown_current").exists()
        assert (base_dir / "markdown_archives").exists()
        assert (base_dir / "media").exists()
        assert (base_dir / ".state").exists()

        # Check source-specific directories
        sources = ['Wordpress', 'Youtube', 'Podcast']
        for source in sources:
            assert (base_dir / "markdown_archives" / source).exists()
            assert (base_dir / "media" / source).exists()
            assert (logs_dir / source).exists()

    def test_state_persistence_workflow(self, orchestrator_with_mock_data):
        """Test incremental updates with state persistence"""
        orchestrator = orchestrator_with_mock_data

        # First run - should process all items
        orchestrator.run_all_scrapers(parallel=False)

        # Check state files were created
        state_files = list((orchestrator.config.data_dir / ".state").glob("*_state.json"))
        assert len(state_files) == 3

        # Verify state content
        wordpress_state_file = orchestrator.config.data_dir / ".state" / "wordpress_state.json"
        assert wordpress_state_file.exists()

        state_data = json.loads(wordpress_state_file.read_text())
        assert 'last_id' in state_data
        assert 'last_update' in state_data
        assert 'last_item_count' in state_data
        assert state_data['last_id'] == 'wp_1002'  # Last item ID
        assert state_data['last_item_count'] == 2  # Both items processed

        # Add new item to WordPress scraper
        new_item = {
            'id': 'wp_1003',
            'title': 'New HVAC Article',
            'url': 'https://hvacknowitall.com/new-article/',
            'description': 'Brand new article about HVAC.',
            'author': 'HVAC Expert',
            'tags': ['new'],
            'categories': ['News']
        }
        orchestrator.scrapers['wordpress'].mock_data.append(new_item)

        # Archive existing files to simulate next run
        for scraper in orchestrator.scrapers.values():
            scraper.archive_current_file()

        # Second run - should only process new item
        orchestrator.run_all_scrapers(parallel=False)

        # Check that only incremental content was processed
        updated_state = json.loads(wordpress_state_file.read_text())
        assert updated_state['last_id'] == 'wp_1003'
        assert updated_state['last_item_count'] == 1  # Only new item

        # Verify new markdown contains only new item
        new_markdown_files = list((orchestrator.config.data_dir / "markdown_current").glob("*wordpress*.md"))
        assert len(new_markdown_files) == 1
        new_content = new_markdown_files[0].read_text()
        assert "New HVAC Article" in new_content
        assert "Understanding HVAC System Efficiency" not in new_content  # Old content not repeated

    @patch('src.orchestrator.ContentOrchestrator.sync_to_nas')
    def test_nas_sync_integration(self, mock_sync, orchestrator_with_mock_data):
        """Test NAS sync is called with correct parameters"""
        orchestrator = orchestrator_with_mock_data
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)

        # Verify sync was called
        mock_sync.assert_called_once()

    def test_error_recovery_workflow(self, orchestrator_with_mock_data):
        """Test that workflow continues when one source fails"""
        orchestrator = orchestrator_with_mock_data

        # Make YouTube scraper fail
        def failing_run():
            raise Exception("YouTube API error")

        orchestrator.scrapers['youtube'].run = failing_run

        # Run workflow - should not crash
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)

        # Verify other sources still completed
        markdown_files = list((orchestrator.config.data_dir / "markdown_current").glob("*.md"))
        # Should have 2 files (WordPress and Podcast), not 3
        assert len(markdown_files) == 2

        source_names = [f.name for f in markdown_files]
        assert any('wordpress' in name for name in source_names)
        assert any('podcast' in name for name in source_names)
        assert not any('youtube' in name for name in source_names)

    def test_logging_integration(self, orchestrator_with_mock_data):
        """Test that comprehensive logging works throughout workflow"""
        orchestrator = orchestrator_with_mock_data
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)

        # Check log files exist for each source
        sources = ['wordpress', 'youtube', 'podcast']
        for source in sources:
            log_file = orchestrator.config.logs_dir / source.title() / f"{source}.log"
            assert log_file.exists()

            log_content = log_file.read_text()
            assert f"Starting {source} scraper" in log_content
            assert "Successfully processed" in log_content
            assert "Saved markdown to" in log_content

    @patch('requests.Session.request')
    def test_media_download_workflow(self, mock_request, orchestrator_with_mock_data):
        """Test media downloading integration"""
        # Mock successful media download
        mock_response = Mock()
        mock_response.status_code = 200
        mock_response.iter_content.return_value = [b'fake image data']
        mock_request.return_value = mock_response

        # Add media URLs to mock data
        orchestrator = orchestrator_with_mock_data
        podcast_scraper = orchestrator.scrapers['podcast']
        podcast_scraper.mock_data[0]['image'] = 'https://example.com/podcast-cover.jpg'

        # Override download_media to actually download
        original_run = podcast_scraper.run

        def run_with_media():
            original_run()
            # Simulate media download
            for item in podcast_scraper.mock_data:
                if 'image' in item:
                    podcast_scraper.download_media(item['image'], item['id'], 'image')

        podcast_scraper.run = run_with_media
        orchestrator.run_all_scrapers(parallel=False)

        # Verify media directory and file
        media_dir = orchestrator.config.data_dir / "media" / "Podcast"
        assert media_dir.exists()

        # Check if media file was created
        media_files = list(media_dir.glob("*"))
        if media_files:  # Media download attempted
            assert len(media_files) > 0

    def test_archive_workflow(self, orchestrator_with_mock_data):
        """Test that archiving works correctly in full workflow"""
        orchestrator = orchestrator_with_mock_data

        # Create some existing files to archive
        current_dir = orchestrator.config.data_dir / "markdown_current"
        current_dir.mkdir(parents=True, exist_ok=True)

        old_file = current_dir / "hvacknowitall_wordpress_2024-01-01-T120000.md"
        old_file.write_text("Old content")

        # Run workflow
        orchestrator.run_all_scrapers(parallel=False)

        # Check that old file was archived
        archive_dir = orchestrator.config.data_dir / "markdown_archives" / "Wordpress"
        archived_files = list(archive_dir.glob("*.md"))

        # Should have archived the old file
        assert any("2024-01-01" in f.name for f in archived_files)

        # Current directory should have new files
        current_files = list(current_dir.glob("*.md"))
        assert len(current_files) == 3  # One per source
        assert not any("2024-01-01" in f.name for f in current_files)


if __name__ == '__main__':
    pytest.main([__file__, '-v'])