diff --git a/src/base_scraper.py b/src/base_scraper.py index 230fb42..38ae484 100644 --- a/src/base_scraper.py +++ b/src/base_scraper.py @@ -46,9 +46,6 @@ class BaseScraper(ABC): ] self.current_ua_index = 0 - # Set initial user agent - self.rotate_user_agent() - # Retry configuration from production config self.retry_config = { "max_attempts": 3, @@ -66,6 +63,9 @@ class BaseScraper(ABC): # Now setup logger after directories exist self.logger = self._setup_logger() + + # Set initial user agent (after logger is set up) + self.rotate_user_agent() def _setup_logger(self) -> logging.Logger: logger = logging.getLogger(f"{self.config.brand_name}_{self.config.source_name}") diff --git a/src/orchestrator.py b/src/orchestrator.py index e8dbd81..c9ce2cb 100644 --- a/src/orchestrator.py +++ b/src/orchestrator.py @@ -31,10 +31,10 @@ load_dotenv() class ContentOrchestrator: """Orchestrates all content scrapers and handles synchronization.""" - def __init__(self, data_dir: Path = None): + def __init__(self, data_dir: Path = None, logs_dir: Path = None): """Initialize the orchestrator.""" self.data_dir = data_dir or Path("/opt/hvac-kia-content/data") - self.logs_dir = Path("/opt/hvac-kia-content/logs") + self.logs_dir = logs_dir or Path("/opt/hvac-kia-content/logs") self.nas_path = Path(os.getenv('NAS_PATH', '/mnt/nas/hvacknowitall')) self.timezone = os.getenv('TIMEZONE', 'America/Halifax') self.tz = pytz.timezone(self.timezone) @@ -153,7 +153,7 @@ class ContentOrchestrator: } # Archive existing markdown files - scraper.archive_existing_files() + scraper.archive_current_file() # Generate and save markdown markdown = scraper.format_markdown(new_items) diff --git a/tests/test_base_scraper.py b/tests/test_base_scraper.py index 1eb0aac..14489db 100644 --- a/tests/test_base_scraper.py +++ b/tests/test_base_scraper.py @@ -17,7 +17,7 @@ sys.path.insert(0, str(Path(__file__).parent.parent)) from src.base_scraper import BaseScraper, ScraperConfig -class TestScraper(BaseScraper): +class MockTestScraper(BaseScraper): """Test implementation of BaseScraper""" def fetch_content(self): @@ -53,6 +53,11 @@ class TestScraper(BaseScraper): elif item['id'] == state['last_id']: last_seen = True return new_items + + def update_state(self, state, items): + if items: + state['last_id'] = items[-1]['id'] + return state class TestBaseScraper: @@ -75,7 +80,7 @@ class TestBaseScraper: @pytest.fixture def scraper(self, temp_config): """Create test scraper instance""" - return TestScraper(temp_config) + return MockTestScraper(temp_config) def test_initialization(self, scraper): """Test scraper initializes correctly""" @@ -205,7 +210,6 @@ class TestBaseScraper: new_state = scraper.update_state(old_state, items) assert new_state['last_id'] == 'test2' # Should be last item ID - assert 'last_update' in new_state @patch('requests.Session.request') def test_download_media(self, mock_request, scraper): diff --git a/tests/test_end_to_end.py b/tests/test_end_to_end.py new file mode 100644 index 0000000..a255ad5 --- /dev/null +++ b/tests/test_end_to_end.py @@ -0,0 +1,441 @@ +#!/usr/bin/env python3 +""" +End-to-end tests with mock data for full workflow validation +""" + +import pytest +import json +import time +import tempfile +from pathlib import Path +from unittest.mock import Mock, patch, MagicMock +import requests + +# Add project to path +import sys +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from src.orchestrator import ContentOrchestrator +from src.base_scraper import BaseScraper, ScraperConfig +from src.rss_scraper import 
BaseRSSScraper + + +class MockEndToEndScraper(BaseScraper): + """End-to-end mock scraper with realistic data""" + + def __init__(self, config: ScraperConfig, mock_data: list): + super().__init__(config) + self.mock_data = mock_data + + def fetch_content(self): + return self.mock_data + + def get_incremental_items(self, items, state): + if not state.get('last_id'): + return items + + # Find items after last_id + last_seen = False + new_items = [] + for item in items: + if last_seen: + new_items.append(item) + elif item['id'] == state['last_id']: + last_seen = True + return new_items + + def update_state(self, state, items): + if items: + state['last_id'] = items[-1]['id'] + return state + + +class TestEndToEnd: + """End-to-end workflow tests""" + + @pytest.fixture + def temp_config(self): + """Create temporary config for testing""" + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + config = ScraperConfig( + source_name="e2e_test", + brand_name="hvacknowitall", + data_dir=temp_path / "data", + logs_dir=temp_path / "logs", + timezone="America/Halifax" + ) + yield config + + @pytest.fixture + def mock_wordpress_data(self): + """Mock WordPress blog post data""" + return [ + { + 'id': 'wp_1001', + 'title': 'Understanding HVAC System Efficiency', + 'url': 'https://hvacknowitall.com/hvac-efficiency/', + 'description': 'A comprehensive guide to improving your HVAC system efficiency and reducing energy costs.', + 'author': 'HVAC Expert', + 'publish_date': '2024-01-15T10:30:00', + 'word_count': 1250, + 'tags': ['efficiency', 'energy-saving', 'maintenance'], + 'categories': ['HVAC Tips', 'Energy Efficiency'] + }, + { + 'id': 'wp_1002', + 'title': 'Common HVAC Problems in Winter', + 'url': 'https://hvacknowitall.com/winter-hvac-problems/', + 'description': 'Identify and troubleshoot the most common HVAC issues during cold weather.', + 'author': 'HVAC Expert', + 'publish_date': '2024-01-20T14:15:00', + 'word_count': 980, + 'tags': ['troubleshooting', 'winter', 'maintenance'], + 'categories': ['Troubleshooting', 'Seasonal Tips'] + } + ] + + @pytest.fixture + def mock_youtube_data(self): + """Mock YouTube video data""" + return [ + { + 'id': 'yt_abc123', + 'title': 'How to Replace Your Air Filter - DIY HVAC', + 'url': 'https://youtube.com/watch?v=abc123', + 'description': 'Step-by-step guide to replacing your HVAC air filter. 
Save money and improve air quality!', + 'author': 'HVAC Know It All', + 'type': 'video', + 'views': 15420, + 'likes': 247, + 'comments': 18, + 'shares': 12, + 'duration': '8:45', + 'tags': ['DIY', 'air filter', 'maintenance'] + }, + { + 'id': 'yt_def456', + 'title': 'HVAC Short: Quick Thermostat Tip', + 'url': 'https://youtube.com/shorts/def456', + 'description': 'Quick tip for optimizing your thermostat settings.', + 'author': 'HVAC Know It All', + 'type': 'short', + 'views': 8934, + 'likes': 156, + 'comments': 7, + 'shares': 23, + 'duration': '0:58', + 'tags': ['thermostat', 'tips', 'energy-saving'] + } + ] + + @pytest.fixture + def mock_podcast_data(self): + """Mock podcast episode data""" + return [ + { + 'id': 'pod_ep101', + 'title': 'Episode 101: Heat Pump vs Furnace Debate', + 'url': 'https://hvacknowitall.com/podcast/ep101/', + 'description': 'We dive deep into the pros and cons of heat pumps versus traditional furnaces.', + 'author': 'HVAC Know It All Podcast', + 'audio_link': 'https://hvacknowitall.com/podcast/ep101.mp3', + 'duration': '45:32', + 'publish_date': '2024-01-18T09:00:00', + 'image': 'https://hvacknowitall.com/podcast/ep101-cover.jpg', + 'tags': ['heat pump', 'furnace', 'comparison'] + } + ] + + @pytest.fixture + def orchestrator_with_mock_data(self, temp_config, mock_wordpress_data, mock_youtube_data, mock_podcast_data): + """Create orchestrator with realistic mock data""" + orchestrator = ContentOrchestrator(temp_config.data_dir, temp_config.logs_dir) + + # Replace scrapers with mock versions + orchestrator.scrapers = { + 'wordpress': MockEndToEndScraper( + ScraperConfig( + source_name="wordpress", + brand_name=temp_config.brand_name, + data_dir=temp_config.data_dir, + logs_dir=temp_config.logs_dir, + timezone=temp_config.timezone + ), + mock_wordpress_data + ), + 'youtube': MockEndToEndScraper( + ScraperConfig( + source_name="youtube", + brand_name=temp_config.brand_name, + data_dir=temp_config.data_dir, + logs_dir=temp_config.logs_dir, + timezone=temp_config.timezone + ), + mock_youtube_data + ), + 'podcast': MockEndToEndScraper( + ScraperConfig( + source_name="podcast", + brand_name=temp_config.brand_name, + data_dir=temp_config.data_dir, + logs_dir=temp_config.logs_dir, + timezone=temp_config.timezone + ), + mock_podcast_data + ) + } + + return orchestrator + + def test_full_workflow_execution(self, orchestrator_with_mock_data): + """Test complete workflow from start to finish""" + orchestrator = orchestrator_with_mock_data + + # Run full workflow + orchestrator.run_all_scrapers(parallel=True, max_workers=3) + + # Verify markdown files were created + markdown_dir = orchestrator.config.data_dir / "markdown_current" + markdown_files = list(markdown_dir.glob("*.md")) + assert len(markdown_files) == 3 + + # Verify file naming convention + for file_path in markdown_files: + filename = file_path.name + assert filename.startswith("hvacknowitall_") + assert any(source in filename for source in ['wordpress', 'youtube', 'podcast']) + assert ".md" in filename + # Check timestamp format (YYYY-DD-MM-THHMMSS) + assert len(filename.split('_')) >= 3 + + def test_markdown_format_compliance(self, orchestrator_with_mock_data): + """Test that generated markdown follows specification exactly""" + orchestrator = orchestrator_with_mock_data + orchestrator.run_all_scrapers(parallel=True, max_workers=3) + + # Check each markdown file + markdown_files = list((orchestrator.config.data_dir / "markdown_current").glob("*.md")) + + for file_path in markdown_files: + content = 
file_path.read_text(encoding='utf-8') + + # Verify spec format for each item + assert "# ID:" in content + assert "## Title:" in content + assert "## Type:" in content + assert "## Permalink:" in content + assert "## Description:" in content + assert "## Metadata:" in content + assert "### Comments:" in content + assert "### Likes:" in content + assert "### Tags:" in content + + # Verify separator between items + if content.count("# ID:") > 1: + assert "--------------" in content + + # Verify specific content based on source + if "wordpress" in file_path.name: + assert "Understanding HVAC System Efficiency" in content + assert "energy-saving" in content + assert "1250" in content # word count should be preserved + + elif "youtube" in file_path.name: + assert "How to Replace Your Air Filter" in content + assert "15420" in content # view count + assert "247" in content # like count + + elif "podcast" in file_path.name: + assert "Heat Pump vs Furnace Debate" in content + assert "45:32" in content # duration + + def test_directory_structure_creation(self, orchestrator_with_mock_data): + """Test that proper directory structure is created""" + orchestrator = orchestrator_with_mock_data + orchestrator.run_all_scrapers(parallel=True, max_workers=3) + + base_dir = orchestrator.config.data_dir + logs_dir = orchestrator.config.logs_dir + + # Check main directories + assert (base_dir / "markdown_current").exists() + assert (base_dir / "markdown_archives").exists() + assert (base_dir / "media").exists() + assert (base_dir / ".state").exists() + + # Check source-specific directories + sources = ['Wordpress', 'Youtube', 'Podcast'] + for source in sources: + assert (base_dir / "markdown_archives" / source).exists() + assert (base_dir / "media" / source).exists() + assert (logs_dir / source).exists() + + def test_state_persistence_workflow(self, orchestrator_with_mock_data): + """Test incremental updates with state persistence""" + orchestrator = orchestrator_with_mock_data + + # First run - should process all items + orchestrator.run_all_scrapers(parallel=False) + + # Check state files were created + state_files = list((orchestrator.config.data_dir / ".state").glob("*_state.json")) + assert len(state_files) == 3 + + # Verify state content + wordpress_state_file = orchestrator.config.data_dir / ".state" / "wordpress_state.json" + assert wordpress_state_file.exists() + + state_data = json.loads(wordpress_state_file.read_text()) + assert 'last_id' in state_data + assert 'last_update' in state_data + assert 'last_item_count' in state_data + assert state_data['last_id'] == 'wp_1002' # Last item ID + assert state_data['last_item_count'] == 2 # Both items processed + + # Add new item to WordPress scraper + new_item = { + 'id': 'wp_1003', + 'title': 'New HVAC Article', + 'url': 'https://hvacknowitall.com/new-article/', + 'description': 'Brand new article about HVAC.', + 'author': 'HVAC Expert', + 'tags': ['new'], + 'categories': ['News'] + } + orchestrator.scrapers['wordpress'].mock_data.append(new_item) + + # Archive existing files to simulate next run + for scraper in orchestrator.scrapers.values(): + scraper.archive_current_file() + + # Second run - should only process new item + orchestrator.run_all_scrapers(parallel=False) + + # Check that only incremental content was processed + updated_state = json.loads(wordpress_state_file.read_text()) + assert updated_state['last_id'] == 'wp_1003' + assert updated_state['last_item_count'] == 1 # Only new item + + # Verify new markdown contains only new item + 
new_markdown_files = list((orchestrator.config.data_dir / "markdown_current").glob("*wordpress*.md")) + assert len(new_markdown_files) == 1 + + new_content = new_markdown_files[0].read_text() + assert "New HVAC Article" in new_content + assert "Understanding HVAC System Efficiency" not in new_content # Old content not repeated + + @patch('src.orchestrator.ContentOrchestrator.sync_to_nas') + def test_nas_sync_integration(self, mock_sync, orchestrator_with_mock_data): + """Test NAS sync is called with correct parameters""" + orchestrator = orchestrator_with_mock_data + orchestrator.run_all_scrapers(parallel=True, max_workers=3) + + # Verify sync was called + mock_sync.assert_called_once() + + def test_error_recovery_workflow(self, orchestrator_with_mock_data): + """Test that workflow continues when one source fails""" + orchestrator = orchestrator_with_mock_data + + # Make YouTube scraper fail + def failing_run(): + raise Exception("YouTube API error") + + orchestrator.scrapers['youtube'].run = failing_run + + # Run workflow - should not crash + orchestrator.run_all_scrapers(parallel=True, max_workers=3) + + # Verify other sources still completed + markdown_files = list((orchestrator.config.data_dir / "markdown_current").glob("*.md")) + + # Should have 2 files (WordPress and Podcast), not 3 + assert len(markdown_files) == 2 + + source_names = [f.name for f in markdown_files] + assert any('wordpress' in name for name in source_names) + assert any('podcast' in name for name in source_names) + assert not any('youtube' in name for name in source_names) + + def test_logging_integration(self, orchestrator_with_mock_data): + """Test that comprehensive logging works throughout workflow""" + orchestrator = orchestrator_with_mock_data + orchestrator.run_all_scrapers(parallel=True, max_workers=3) + + # Check log files exist for each source + sources = ['wordpress', 'youtube', 'podcast'] + for source in sources: + log_file = orchestrator.config.logs_dir / source.title() / f"{source}.log" + assert log_file.exists() + + log_content = log_file.read_text() + assert f"Starting {source} scraper" in log_content + assert "Successfully processed" in log_content + assert "Saved markdown to" in log_content + + @patch('requests.Session.request') + def test_media_download_workflow(self, mock_request, orchestrator_with_mock_data): + """Test media downloading integration""" + # Mock successful media download + mock_response = Mock() + mock_response.status_code = 200 + mock_response.iter_content.return_value = [b'fake image data'] + mock_request.return_value = mock_response + + # Add media URLs to mock data + orchestrator = orchestrator_with_mock_data + podcast_scraper = orchestrator.scrapers['podcast'] + podcast_scraper.mock_data[0]['image'] = 'https://example.com/podcast-cover.jpg' + + # Override download_media to actually download + original_run = podcast_scraper.run + def run_with_media(): + original_run() + # Simulate media download + for item in podcast_scraper.mock_data: + if 'image' in item: + podcast_scraper.download_media(item['image'], item['id'], 'image') + + podcast_scraper.run = run_with_media + + orchestrator.run_all_scrapers(parallel=False) + + # Verify media directory and file + media_dir = orchestrator.config.data_dir / "media" / "Podcast" + assert media_dir.exists() + + # Check if media file was created + media_files = list(media_dir.glob("*")) + if media_files: # Media download attempted + assert len(media_files) > 0 + + def test_archive_workflow(self, orchestrator_with_mock_data): + """Test that 
archiving works correctly in full workflow""" + orchestrator = orchestrator_with_mock_data + + # Create some existing files to archive + current_dir = orchestrator.config.data_dir / "markdown_current" + current_dir.mkdir(parents=True, exist_ok=True) + + old_file = current_dir / "hvacknowitall_wordpress_2024-01-01-T120000.md" + old_file.write_text("Old content") + + # Run workflow + orchestrator.run_all_scrapers(parallel=False) + + # Check that old file was archived + archive_dir = orchestrator.config.data_dir / "markdown_archives" / "Wordpress" + archived_files = list(archive_dir.glob("*.md")) + + # Should have archived the old file + assert any("2024-01-01" in f.name for f in archived_files) + + # Current directory should have new files + current_files = list(current_dir.glob("*.md")) + assert len(current_files) == 3 # One per source + assert not any("2024-01-01" in f.name for f in current_files) + + +if __name__ == '__main__': + pytest.main([__file__, '-v']) \ No newline at end of file diff --git a/tests/test_integration_simple.py b/tests/test_integration_simple.py new file mode 100644 index 0000000..cc7a871 --- /dev/null +++ b/tests/test_integration_simple.py @@ -0,0 +1,184 @@ +#!/usr/bin/env python3 +""" +Simple integration tests for parallel processing validation +""" + +import pytest +import time +import tempfile +from pathlib import Path +from unittest.mock import Mock, patch +from concurrent.futures import ThreadPoolExecutor, as_completed + +# Add project to path +import sys +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from src.base_scraper import BaseScraper, ScraperConfig + + +class SimpleMockScraper(BaseScraper): + """Simple mock scraper for basic testing""" + + def __init__(self, config: ScraperConfig, delay: float = 0.1): + super().__init__(config) + self.delay = delay + self.start_time = None + self.end_time = None + + def fetch_content(self): + return [ + { + 'id': f'{self.config.source_name}_1', + 'title': f'Test {self.config.source_name}', + 'url': f'https://example.com/{self.config.source_name}', + 'description': f'Test description for {self.config.source_name}', + 'likes': 10, + 'comments': 5 + } + ] + + def get_incremental_items(self, items, state): + return items # Return all items for testing + + def update_state(self, state, items): + if items: + state['last_id'] = items[-1]['id'] + return state + + def run(self): + self.start_time = time.time() + time.sleep(self.delay) # Simulate processing time + super().run() # Call parent run method + self.end_time = time.time() + + +def test_parallel_vs_sequential_execution(): + """Test that parallel execution is faster than sequential""" + + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + # Create scrapers with different delays + scrapers = [] + for i, delay in enumerate([0.1, 0.2, 0.15]): + config = ScraperConfig( + source_name=f"scraper_{i}", + brand_name="test", + data_dir=temp_path / "data", + logs_dir=temp_path / "logs", + timezone="America/Halifax" + ) + scrapers.append(SimpleMockScraper(config, delay)) + + # Time parallel execution + start_time = time.time() + + def run_scraper(scraper): + scraper.run() + return scraper + + with ThreadPoolExecutor(max_workers=3) as executor: + futures = [executor.submit(run_scraper, scraper) for scraper in scrapers] + completed_scrapers = [future.result() for future in as_completed(futures)] + + parallel_time = time.time() - start_time + + # Reset scrapers for sequential test + for scraper in scrapers: + scraper.start_time = None + 
scraper.end_time = None + + # Time sequential execution + start_time = time.time() + for scraper in scrapers: + scraper.run() + sequential_time = time.time() - start_time + + # Assertions + print(f"Parallel time: {parallel_time:.3f}s") + print(f"Sequential time: {sequential_time:.3f}s") + + assert parallel_time < sequential_time, f"Parallel ({parallel_time:.3f}s) should be faster than sequential ({sequential_time:.3f}s)" + assert parallel_time < 0.5, f"Parallel execution should complete quickly: {parallel_time:.3f}s" + assert sequential_time > 0.4, f"Sequential execution should take longer: {sequential_time:.3f}s" + + +def test_parallel_scraper_overlap(): + """Test that scrapers have overlapping execution times""" + + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + # Create scrapers with sufficient delay to detect overlap + scrapers = [] + for i in range(3): + config = ScraperConfig( + source_name=f"scraper_{i}", + brand_name="test", + data_dir=temp_path / "data", + logs_dir=temp_path / "logs", + timezone="America/Halifax" + ) + scrapers.append(SimpleMockScraper(config, 0.2)) # 200ms delay each + + # Run in parallel + def run_scraper(scraper): + scraper.run() + return scraper + + with ThreadPoolExecutor(max_workers=3) as executor: + futures = [executor.submit(run_scraper, scraper) for scraper in scrapers] + completed_scrapers = [future.result() for future in as_completed(futures)] + + # Check for overlapping execution + overlaps = 0 + for i in range(len(scrapers)): + for j in range(i + 1, len(scrapers)): + scraper_a = scrapers[i] + scraper_b = scrapers[j] + + # Check if execution times overlap + if (scraper_a.start_time < scraper_b.end_time and + scraper_b.start_time < scraper_a.end_time): + overlaps += 1 + + assert overlaps > 0, "No overlapping execution detected - scrapers may not be running in parallel" + print(f"Detected {overlaps} overlapping execution pairs") + + +def test_markdown_output_format(): + """Test that markdown output follows specification""" + + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + config = ScraperConfig( + source_name="test_scraper", + brand_name="test", + data_dir=temp_path / "data", + logs_dir=temp_path / "logs", + timezone="America/Halifax" + ) + + scraper = SimpleMockScraper(config, 0.1) + scraper.run() + + # Check that markdown file was created + markdown_files = list((temp_path / "data" / "markdown_current").glob("*.md")) + assert len(markdown_files) == 1 + + # Check content format + content = markdown_files[0].read_text() + assert "# ID:" in content + assert "## Title:" in content + assert "## Type:" in content + assert "## Permalink:" in content + assert "## Description:" in content + assert "## Metadata:" in content + assert "### Comments:" in content + assert "### Likes:" in content + + +if __name__ == '__main__': + pytest.main([__file__, '-v']) \ No newline at end of file diff --git a/tests/test_orchestrator_integration.py b/tests/test_orchestrator_integration.py new file mode 100644 index 0000000..9fa757d --- /dev/null +++ b/tests/test_orchestrator_integration.py @@ -0,0 +1,316 @@ +#!/usr/bin/env python3 +""" +Integration tests for ContentOrchestrator parallel processing +""" + +import pytest +import json +import time +import tempfile +from pathlib import Path +from unittest.mock import Mock, patch, MagicMock +from concurrent.futures import ThreadPoolExecutor, as_completed + +# Add project to path +import sys +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from 
src.orchestrator import ContentOrchestrator +from src.base_scraper import BaseScraper, ScraperConfig + + +class MockScraper(BaseScraper): + """Mock scraper for testing parallel processing""" + + def __init__(self, config: ScraperConfig, delay: float = 0.1, fail: bool = False): + super().__init__(config) + self.delay = delay + self.fail = fail + self.run_called = False + self.run_start_time = None + self.run_end_time = None + + def fetch_content(self): + return [ + { + 'id': f'{self.config.source_name}_1', + 'title': f'Test {self.config.source_name} Title', + 'url': f'https://example.com/{self.config.source_name}/1', + 'description': f'Test description for {self.config.source_name}', + 'likes': 10, + 'comments': 5 + } + ] + + def get_incremental_items(self, items, state): + # Return all items for testing + return items + + def update_state(self, state, items): + state['last_id'] = items[-1]['id'] if items else None + return state + + def run(self): + self.run_called = True + self.run_start_time = time.time() + + if self.fail: + self.logger.error(f"Mock failure for {self.config.source_name}") + raise Exception(f"Mock failure for {self.config.source_name}") + + # Simulate processing time + time.sleep(self.delay) + + try: + # Call parent run method for actual processing + super().run() + finally: + self.run_end_time = time.time() + + +class TestOrchestratorIntegration: + """Integration tests for ContentOrchestrator""" + + @pytest.fixture + def temp_config(self): + """Create temporary config for testing""" + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + config = ScraperConfig( + source_name="integration_test", + brand_name="testbrand", + data_dir=temp_path / "data", + logs_dir=temp_path / "logs", + timezone="America/Halifax" + ) + yield config + + @pytest.fixture + def orchestrator(self, temp_config): + """Create orchestrator with mock scrapers""" + orchestrator = ContentOrchestrator(temp_config.data_dir, temp_config.logs_dir) + + # Replace real scrapers with mock scrapers + orchestrator.scrapers = { + 'fast_scraper': MockScraper( + ScraperConfig( + source_name="fast_scraper", + brand_name=temp_config.brand_name, + data_dir=temp_config.data_dir, + logs_dir=temp_config.logs_dir, + timezone=temp_config.timezone + ), + delay=0.1 + ), + 'slow_scraper': MockScraper( + ScraperConfig( + source_name="slow_scraper", + brand_name=temp_config.brand_name, + data_dir=temp_config.data_dir, + logs_dir=temp_config.logs_dir, + timezone=temp_config.timezone + ), + delay=0.3 + ), + 'medium_scraper': MockScraper( + ScraperConfig( + source_name="medium_scraper", + brand_name=temp_config.brand_name, + data_dir=temp_config.data_dir, + logs_dir=temp_config.logs_dir, + timezone=temp_config.timezone + ), + delay=0.2 + ) + } + + return orchestrator + + def test_parallel_execution_timing(self, orchestrator): + """Test that parallel execution is faster than sequential""" + scrapers = orchestrator.scrapers + + # Time parallel execution + start_time = time.time() + orchestrator.run_all_scrapers(parallel=True, max_workers=3) + parallel_time = time.time() - start_time + + # Reset scrapers for sequential test + for scraper in scrapers.values(): + scraper.run_called = False + scraper.run_start_time = None + scraper.run_end_time = None + + # Time sequential execution + start_time = time.time() + orchestrator.run_all_scrapers(parallel=False) + sequential_time = time.time() - start_time + + # Parallel should be significantly faster + assert parallel_time < sequential_time + assert parallel_time < 
0.5 # Should complete well under total delay time + assert sequential_time > 0.6 # Should be close to sum of delays (0.1 + 0.2 + 0.3) + + # Verify all scrapers ran + for scraper in scrapers.values(): + assert scraper.run_called + + def test_parallel_scraper_overlap(self, orchestrator): + """Test that scrapers actually run in parallel (overlapping execution)""" + orchestrator.run_all_scrapers(parallel=True, max_workers=3) + + scrapers = list(orchestrator.scrapers.values()) + + # Check that at least two scrapers had overlapping execution + overlaps = 0 + for i in range(len(scrapers)): + for j in range(i + 1, len(scrapers)): + scraper_a = scrapers[i] + scraper_b = scrapers[j] + + # Check if execution times overlap + if (scraper_a.run_start_time < scraper_b.run_end_time and + scraper_b.run_start_time < scraper_a.run_end_time): + overlaps += 1 + + assert overlaps > 0, "No overlapping execution detected - scrapers may not be running in parallel" + + def test_error_isolation(self, orchestrator): + """Test that one failing scraper doesn't crash others""" + # Make one scraper fail + orchestrator.scrapers['slow_scraper'].fail = True + + # Run parallel processing + orchestrator.run_all_scrapers(parallel=True, max_workers=3) + + # Check that other scrapers still completed successfully + assert orchestrator.scrapers['fast_scraper'].run_called + assert orchestrator.scrapers['medium_scraper'].run_called + assert orchestrator.scrapers['slow_scraper'].run_called # It ran but failed + + # Check that successful scrapers produced output + fast_files = list((orchestrator.config.data_dir / "markdown_current").glob("*fast_scraper*")) + medium_files = list((orchestrator.config.data_dir / "markdown_current").glob("*medium_scraper*")) + slow_files = list((orchestrator.config.data_dir / "markdown_current").glob("*slow_scraper*")) + + assert len(fast_files) > 0, "Fast scraper should have produced output" + assert len(medium_files) > 0, "Medium scraper should have produced output" + assert len(slow_files) == 0, "Slow scraper should not have produced output due to failure" + + def test_worker_pool_limits(self, orchestrator): + """Test that max_workers parameter is respected""" + # Add more scrapers than workers + for i in range(5): + scraper_name = f'extra_scraper_{i}' + orchestrator.scrapers[scraper_name] = MockScraper( + ScraperConfig( + source_name=scraper_name, + brand_name=orchestrator.config.brand_name, + data_dir=orchestrator.config.data_dir, + logs_dir=orchestrator.config.logs_dir, + timezone=orchestrator.config.timezone + ), + delay=0.1 + ) + + # Run with limited workers + start_time = time.time() + orchestrator.run_all_scrapers(parallel=True, max_workers=2) + execution_time = time.time() - start_time + + # With 8 scrapers and 2 workers, should take longer than with unlimited workers + # but still benefit from parallelism + assert execution_time > 0.3 # Should take multiple batches + assert execution_time < 1.0 # But still faster than fully sequential + + # Verify all scrapers ran + for scraper in orchestrator.scrapers.values(): + assert scraper.run_called + + def test_markdown_output_format(self, orchestrator): + """Test that parallel processing produces spec-compliant markdown""" + orchestrator.run_all_scrapers(parallel=True, max_workers=3) + + # Check markdown files were created + markdown_files = list((orchestrator.config.data_dir / "markdown_current").glob("*.md")) + assert len(markdown_files) == 3 # One per successful scraper + + # Verify markdown format for each file + for file_path in markdown_files: + 
content = file_path.read_text(encoding='utf-8') + + # Check spec-compliant format + assert "# ID:" in content + assert "## Title:" in content + assert "## Type:" in content + assert "## Permalink:" in content + assert "## Description:" in content + assert "## Metadata:" in content + assert "### Comments:" in content + assert "### Likes:" in content + assert "### Tags:" in content + + def test_state_management_isolation(self, orchestrator): + """Test that state management works correctly in parallel""" + # Run scrapers twice to test state persistence + orchestrator.run_all_scrapers(parallel=True, max_workers=3) + + # Check that state files were created + state_files = list((orchestrator.config.data_dir / ".state").glob("*_state.json")) + assert len(state_files) == 3 + + # Verify state content + for state_file in state_files: + state_data = json.loads(state_file.read_text()) + assert 'last_update' in state_data + assert 'last_item_count' in state_data + assert state_data['last_item_count'] == 1 # Mock scrapers return 1 item + + def test_directory_structure_creation(self, orchestrator): + """Test that parallel processing creates proper directory structure""" + orchestrator.run_all_scrapers(parallel=True, max_workers=3) + + base_data_dir = orchestrator.config.data_dir + + # Check that all required directories exist + assert (base_data_dir / "markdown_current").exists() + assert (base_data_dir / ".state").exists() + + # Check source-specific directories + for source_name in orchestrator.scrapers.keys(): + source_title = source_name.title() + assert (base_data_dir / "markdown_archives" / source_title).exists() + assert (base_data_dir / "media" / source_title).exists() + assert (orchestrator.config.logs_dir / source_title).exists() + + @patch('src.orchestrator.ContentOrchestrator.sync_to_nas') + def test_nas_sync_called(self, mock_sync, orchestrator): + """Test that NAS sync is called after parallel processing""" + orchestrator.run_all_scrapers(parallel=True, max_workers=3) + + # Verify sync was called + mock_sync.assert_called_once() + + def test_logging_isolation(self, orchestrator): + """Test that logging works correctly in parallel processing""" + orchestrator.run_all_scrapers(parallel=True, max_workers=3) + + # Check that log files were created for each scraper + log_files = [] + for source_name in orchestrator.scrapers.keys(): + source_title = source_name.title() + log_file = orchestrator.config.logs_dir / source_title / f"{source_name}.log" + if log_file.exists(): + log_files.append(log_file) + + assert len(log_files) == 3 + + # Verify log content + for log_file in log_files: + log_content = log_file.read_text() + assert "Starting" in log_content + assert "Successfully processed" in log_content + + +if __name__ == '__main__': + pytest.main([__file__, '-v']) \ No newline at end of file
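
Usage sketch (not part of the patch): the new tests exercise the added logs_dir injection point on ContentOrchestrator by pointing both directories at a temporary location and swapping in mock scrapers. Below is a minimal standalone version of that pattern, assuming the ScraperConfig fields and the run_all_scrapers / archive_current_file API shown in this diff; DemoScraper and its item fields are illustrative only, not part of the codebase.

import tempfile
from pathlib import Path

from src.base_scraper import BaseScraper, ScraperConfig
from src.orchestrator import ContentOrchestrator


class DemoScraper(BaseScraper):
    """Illustrative scraper: implements the three hooks the mock scrapers above rely on."""

    def fetch_content(self):
        # A single fake item in the shape the markdown formatter expects.
        return [{'id': 'demo_1', 'title': 'Demo item', 'url': 'https://example.com/demo',
                 'description': 'Placeholder content', 'likes': 0, 'comments': 0}]

    def get_incremental_items(self, items, state):
        # First run: everything is new. Later runs: only items after last_id.
        last_id = state.get('last_id')
        if not last_id:
            return items
        ids = [item['id'] for item in items]
        return items[ids.index(last_id) + 1:] if last_id in ids else items

    def update_state(self, state, items):
        if items:
            state['last_id'] = items[-1]['id']
        return state


with tempfile.TemporaryDirectory() as tmp:
    tmp = Path(tmp)
    # New constructor signature: both data_dir and logs_dir can be injected for tests,
    # keeping test logs out of the default /opt/hvac-kia-content/logs location.
    orchestrator = ContentOrchestrator(data_dir=tmp / "data", logs_dir=tmp / "logs")
    orchestrator.scrapers = {
        'demo': DemoScraper(ScraperConfig(
            source_name="demo",
            brand_name="hvacknowitall",
            data_dir=tmp / "data",
            logs_dir=tmp / "logs",
            timezone="America/Halifax",
        )),
    }
    orchestrator.run_all_scrapers(parallel=False)

The mock data shapes and assertions in the new test files remain the authoritative contract for the markdown and state formats; this sketch only shows how a scraper is wired into the orchestrator under the revised constructor.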