Add comprehensive test infrastructure

- Created unit tests for BaseScraper with mocking - Added integration tests for parallel processing - Created end-to-end tests with realistic mock data - Fixed initialization order in BaseScraper (logger before user agent) - Fixed orchestrator method name (archive_current_file) - Added tenacity dependency for retry logic - Validated parallel processing performance and overlap detection - Confirmed spec-compliant markdown formatting in tests Tests cover: - Base scraper functionality (state, markdown, retry logic, media downloads) - Parallel vs sequential execution timing - Error isolation between scrapers - Directory structure creation - State management across runs - Full workflow with realistic data 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-18 21:16:14 -03:00 · 2025-08-18 21:16:14 -03:00 · 8d5750b1d1
commit 8d5750b1d1
parent b6273ca934
6 changed files with 954 additions and 9 deletions
--- a/src/base_scraper.py
+++ b/src/base_scraper.py
@ -46,9 +46,6 @@ class BaseScraper(ABC):
        ]
        self.current_ua_index = 0
        # Set initial user agent
        self.rotate_user_agent()
        # Retry configuration from production config
        self.retry_config = {
            "max_attempts": 3,
@ -66,6 +63,9 @@ class BaseScraper(ABC):
        # Now setup logger after directories exist
        self.logger = self._setup_logger()
        # Set initial user agent (after logger is set up)
        self.rotate_user_agent()
    def _setup_logger(self) -> logging.Logger:
        logger = logging.getLogger(f"{self.config.brand_name}_{self.config.source_name}")
--- a/src/orchestrator.py
+++ b/src/orchestrator.py
@ -31,10 +31,10 @@ load_dotenv()
 class ContentOrchestrator:
    """Orchestrates all content scrapers and handles synchronization."""
-    def __init__(self, data_dir: Path = None):
+    def __init__(self, data_dir: Path = None, logs_dir: Path = None):
        """Initialize the orchestrator."""
        self.data_dir = data_dir or Path("/opt/hvac-kia-content/data")
-        self.logs_dir = Path("/opt/hvac-kia-content/logs")
+        self.logs_dir = logs_dir or Path("/opt/hvac-kia-content/logs")
        self.nas_path = Path(os.getenv('NAS_PATH', '/mnt/nas/hvacknowitall'))
        self.timezone = os.getenv('TIMEZONE', 'America/Halifax')
        self.tz = pytz.timezone(self.timezone)
@ -153,7 +153,7 @@ class ContentOrchestrator:
                }
            # Archive existing markdown files
-            scraper.archive_existing_files()
+            scraper.archive_current_file()
            # Generate and save markdown
            markdown = scraper.format_markdown(new_items)
--- a/tests/test_base_scraper.py
+++ b/tests/test_base_scraper.py
@ -17,7 +17,7 @@ sys.path.insert(0, str(Path(__file__).parent.parent))
 from src.base_scraper import BaseScraper, ScraperConfig
-class TestScraper(BaseScraper):
+class MockTestScraper(BaseScraper):
    """Test implementation of BaseScraper"""
    def fetch_content(self):
@ -53,6 +53,11 @@ class TestScraper(BaseScraper):
            elif item['id'] == state['last_id']:
                last_seen = True
        return new_items
    def update_state(self, state, items):
        if items:
            state['last_id'] = items[-1]['id']
        return state
 class TestBaseScraper:
@ -75,7 +80,7 @@ class TestBaseScraper:
    @pytest.fixture
    def scraper(self, temp_config):
        """Create test scraper instance"""
-        return TestScraper(temp_config)
+        return MockTestScraper(temp_config)
    def test_initialization(self, scraper):
        """Test scraper initializes correctly"""
@ -205,7 +210,6 @@ class TestBaseScraper:
        new_state = scraper.update_state(old_state, items)
        assert new_state['last_id'] == 'test2'  # Should be last item ID
        assert 'last_update' in new_state
    @patch('requests.Session.request')
    def test_download_media(self, mock_request, scraper):
--- a/tests/test_end_to_end.py
+++ b/tests/test_end_to_end.py
@ -0,0 +1,441 @@
 #!/usr/bin/env python3
 """
 End-to-end tests with mock data for full workflow validation
 """
 import pytest
 import json
 import time
 import tempfile
 from pathlib import Path
 from unittest.mock import Mock, patch, MagicMock
 import requests
 # Add project to path
 import sys
 sys.path.insert(0, str(Path(__file__).parent.parent))
 from src.orchestrator import ContentOrchestrator
 from src.base_scraper import BaseScraper, ScraperConfig
 from src.rss_scraper import BaseRSSScraper
 class MockEndToEndScraper(BaseScraper):
    """End-to-end mock scraper with realistic data"""
    def __init__(self, config: ScraperConfig, mock_data: list):
        super().__init__(config)
        self.mock_data = mock_data
    def fetch_content(self):
        return self.mock_data
    def get_incremental_items(self, items, state):
        if not state.get('last_id'):
            return items
        # Find items after last_id
        last_seen = False
        new_items = []
        for item in items:
            if last_seen:
                new_items.append(item)
            elif item['id'] == state['last_id']:
                last_seen = True
        return new_items
    def update_state(self, state, items):
        if items:
            state['last_id'] = items[-1]['id']
        return state
 class TestEndToEnd:
    """End-to-end workflow tests"""
    @pytest.fixture
    def temp_config(self):
        """Create temporary config for testing"""
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)
            config = ScraperConfig(
                source_name="e2e_test",
                brand_name="hvacknowitall",
                data_dir=temp_path / "data",
                logs_dir=temp_path / "logs",
                timezone="America/Halifax"
            )
            yield config
    @pytest.fixture
    def mock_wordpress_data(self):
        """Mock WordPress blog post data"""
        return [
            {
                'id': 'wp_1001',
                'title': 'Understanding HVAC System Efficiency',
                'url': 'https://hvacknowitall.com/hvac-efficiency/',
                'description': 'A comprehensive guide to improving your HVAC system efficiency and reducing energy costs.',
                'author': 'HVAC Expert',
                'publish_date': '2024-01-15T10:30:00',
                'word_count': 1250,
                'tags': ['efficiency', 'energy-saving', 'maintenance'],
                'categories': ['HVAC Tips', 'Energy Efficiency']
            },
            {
                'id': 'wp_1002', 
                'title': 'Common HVAC Problems in Winter',
                'url': 'https://hvacknowitall.com/winter-hvac-problems/',
                'description': 'Identify and troubleshoot the most common HVAC issues during cold weather.',
                'author': 'HVAC Expert',
                'publish_date': '2024-01-20T14:15:00',
                'word_count': 980,
                'tags': ['troubleshooting', 'winter', 'maintenance'],
                'categories': ['Troubleshooting', 'Seasonal Tips']
            }
        ]
    @pytest.fixture
    def mock_youtube_data(self):
        """Mock YouTube video data"""
        return [
            {
                'id': 'yt_abc123',
                'title': 'How to Replace Your Air Filter - DIY HVAC',
                'url': 'https://youtube.com/watch?v=abc123',
                'description': 'Step-by-step guide to replacing your HVAC air filter. Save money and improve air quality!',
                'author': 'HVAC Know It All',
                'type': 'video',
                'views': 15420,
                'likes': 247,
                'comments': 18,
                'shares': 12,
                'duration': '8:45',
                'tags': ['DIY', 'air filter', 'maintenance']
            },
            {
                'id': 'yt_def456',
                'title': 'HVAC Short: Quick Thermostat Tip',
                'url': 'https://youtube.com/shorts/def456',
                'description': 'Quick tip for optimizing your thermostat settings.',
                'author': 'HVAC Know It All',
                'type': 'short',
                'views': 8934,
                'likes': 156,
                'comments': 7,
                'shares': 23,
                'duration': '0:58',
                'tags': ['thermostat', 'tips', 'energy-saving']
            }
        ]
    @pytest.fixture
    def mock_podcast_data(self):
        """Mock podcast episode data"""
        return [
            {
                'id': 'pod_ep101',
                'title': 'Episode 101: Heat Pump vs Furnace Debate',
                'url': 'https://hvacknowitall.com/podcast/ep101/',
                'description': 'We dive deep into the pros and cons of heat pumps versus traditional furnaces.',
                'author': 'HVAC Know It All Podcast',
                'audio_link': 'https://hvacknowitall.com/podcast/ep101.mp3',
                'duration': '45:32',
                'publish_date': '2024-01-18T09:00:00',
                'image': 'https://hvacknowitall.com/podcast/ep101-cover.jpg',
                'tags': ['heat pump', 'furnace', 'comparison']
            }
        ]
    @pytest.fixture
    def orchestrator_with_mock_data(self, temp_config, mock_wordpress_data, mock_youtube_data, mock_podcast_data):
        """Create orchestrator with realistic mock data"""
        orchestrator = ContentOrchestrator(temp_config.data_dir, temp_config.logs_dir)
        # Replace scrapers with mock versions
        orchestrator.scrapers = {
            'wordpress': MockEndToEndScraper(
                ScraperConfig(
                    source_name="wordpress",
                    brand_name=temp_config.brand_name,
                    data_dir=temp_config.data_dir,
                    logs_dir=temp_config.logs_dir,
                    timezone=temp_config.timezone
                ),
                mock_wordpress_data
            ),
            'youtube': MockEndToEndScraper(
                ScraperConfig(
                    source_name="youtube",
                    brand_name=temp_config.brand_name,
                    data_dir=temp_config.data_dir,
                    logs_dir=temp_config.logs_dir,
                    timezone=temp_config.timezone
                ),
                mock_youtube_data
            ),
            'podcast': MockEndToEndScraper(
                ScraperConfig(
                    source_name="podcast",
                    brand_name=temp_config.brand_name,
                    data_dir=temp_config.data_dir,
                    logs_dir=temp_config.logs_dir,
                    timezone=temp_config.timezone
                ),
                mock_podcast_data
            )
        }
        return orchestrator
    def test_full_workflow_execution(self, orchestrator_with_mock_data):
        """Test complete workflow from start to finish"""
        orchestrator = orchestrator_with_mock_data
        # Run full workflow
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)
        # Verify markdown files were created
        markdown_dir = orchestrator.config.data_dir / "markdown_current"
        markdown_files = list(markdown_dir.glob("*.md"))
        assert len(markdown_files) == 3
        # Verify file naming convention
        for file_path in markdown_files:
            filename = file_path.name
            assert filename.startswith("hvacknowitall_")
            assert any(source in filename for source in ['wordpress', 'youtube', 'podcast'])
            assert ".md" in filename
            # Check timestamp format (YYYY-DD-MM-THHMMSS)
            assert len(filename.split('_')) >= 3
    def test_markdown_format_compliance(self, orchestrator_with_mock_data):
        """Test that generated markdown follows specification exactly"""
        orchestrator = orchestrator_with_mock_data
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)
        # Check each markdown file
        markdown_files = list((orchestrator.config.data_dir / "markdown_current").glob("*.md"))
        for file_path in markdown_files:
            content = file_path.read_text(encoding='utf-8')
            # Verify spec format for each item
            assert "# ID:" in content
            assert "## Title:" in content
            assert "## Type:" in content
            assert "## Permalink:" in content
            assert "## Description:" in content
            assert "## Metadata:" in content
            assert "### Comments:" in content
            assert "### Likes:" in content
            assert "### Tags:" in content
            # Verify separator between items
            if content.count("# ID:") > 1:
                assert "--------------" in content
            # Verify specific content based on source
            if "wordpress" in file_path.name:
                assert "Understanding HVAC System Efficiency" in content
                assert "energy-saving" in content
                assert "1250" in content  # word count should be preserved
            elif "youtube" in file_path.name:
                assert "How to Replace Your Air Filter" in content
                assert "15420" in content  # view count
                assert "247" in content   # like count
            elif "podcast" in file_path.name:
                assert "Heat Pump vs Furnace Debate" in content
                assert "45:32" in content  # duration
    def test_directory_structure_creation(self, orchestrator_with_mock_data):
        """Test that proper directory structure is created"""
        orchestrator = orchestrator_with_mock_data
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)
        base_dir = orchestrator.config.data_dir
        logs_dir = orchestrator.config.logs_dir
        # Check main directories
        assert (base_dir / "markdown_current").exists()
        assert (base_dir / "markdown_archives").exists()
        assert (base_dir / "media").exists()
        assert (base_dir / ".state").exists()
        # Check source-specific directories
        sources = ['Wordpress', 'Youtube', 'Podcast']
        for source in sources:
            assert (base_dir / "markdown_archives" / source).exists()
            assert (base_dir / "media" / source).exists()
            assert (logs_dir / source).exists()
    def test_state_persistence_workflow(self, orchestrator_with_mock_data):
        """Test incremental updates with state persistence"""
        orchestrator = orchestrator_with_mock_data
        # First run - should process all items
        orchestrator.run_all_scrapers(parallel=False)
        # Check state files were created
        state_files = list((orchestrator.config.data_dir / ".state").glob("*_state.json"))
        assert len(state_files) == 3
        # Verify state content
        wordpress_state_file = orchestrator.config.data_dir / ".state" / "wordpress_state.json"
        assert wordpress_state_file.exists()
        state_data = json.loads(wordpress_state_file.read_text())
        assert 'last_id' in state_data
        assert 'last_update' in state_data
        assert 'last_item_count' in state_data
        assert state_data['last_id'] == 'wp_1002'  # Last item ID
        assert state_data['last_item_count'] == 2  # Both items processed
        # Add new item to WordPress scraper
        new_item = {
            'id': 'wp_1003',
            'title': 'New HVAC Article',
            'url': 'https://hvacknowitall.com/new-article/',
            'description': 'Brand new article about HVAC.',
            'author': 'HVAC Expert',
            'tags': ['new'],
            'categories': ['News']
        }
        orchestrator.scrapers['wordpress'].mock_data.append(new_item)
        # Archive existing files to simulate next run
        for scraper in orchestrator.scrapers.values():
            scraper.archive_current_file()
        # Second run - should only process new item
        orchestrator.run_all_scrapers(parallel=False)
        # Check that only incremental content was processed
        updated_state = json.loads(wordpress_state_file.read_text())
        assert updated_state['last_id'] == 'wp_1003'
        assert updated_state['last_item_count'] == 1  # Only new item
        # Verify new markdown contains only new item
        new_markdown_files = list((orchestrator.config.data_dir / "markdown_current").glob("*wordpress*.md"))
        assert len(new_markdown_files) == 1
        new_content = new_markdown_files[0].read_text()
        assert "New HVAC Article" in new_content
        assert "Understanding HVAC System Efficiency" not in new_content  # Old content not repeated
    @patch('src.orchestrator.ContentOrchestrator.sync_to_nas')
    def test_nas_sync_integration(self, mock_sync, orchestrator_with_mock_data):
        """Test NAS sync is called with correct parameters"""
        orchestrator = orchestrator_with_mock_data
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)
        # Verify sync was called
        mock_sync.assert_called_once()
    def test_error_recovery_workflow(self, orchestrator_with_mock_data):
        """Test that workflow continues when one source fails"""
        orchestrator = orchestrator_with_mock_data
        # Make YouTube scraper fail
        def failing_run():
            raise Exception("YouTube API error")
        orchestrator.scrapers['youtube'].run = failing_run
        # Run workflow - should not crash
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)
        # Verify other sources still completed
        markdown_files = list((orchestrator.config.data_dir / "markdown_current").glob("*.md"))
        # Should have 2 files (WordPress and Podcast), not 3
        assert len(markdown_files) == 2
        source_names = [f.name for f in markdown_files]
        assert any('wordpress' in name for name in source_names)
        assert any('podcast' in name for name in source_names)
        assert not any('youtube' in name for name in source_names)
    def test_logging_integration(self, orchestrator_with_mock_data):
        """Test that comprehensive logging works throughout workflow"""
        orchestrator = orchestrator_with_mock_data
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)
        # Check log files exist for each source
        sources = ['wordpress', 'youtube', 'podcast']
        for source in sources:
            log_file = orchestrator.config.logs_dir / source.title() / f"{source}.log"
            assert log_file.exists()
            log_content = log_file.read_text()
            assert f"Starting {source} scraper" in log_content
            assert "Successfully processed" in log_content
            assert "Saved markdown to" in log_content
    @patch('requests.Session.request')
    def test_media_download_workflow(self, mock_request, orchestrator_with_mock_data):
        """Test media downloading integration"""
        # Mock successful media download
        mock_response = Mock()
        mock_response.status_code = 200
        mock_response.iter_content.return_value = [b'fake image data']
        mock_request.return_value = mock_response
        # Add media URLs to mock data
        orchestrator = orchestrator_with_mock_data
        podcast_scraper = orchestrator.scrapers['podcast']
        podcast_scraper.mock_data[0]['image'] = 'https://example.com/podcast-cover.jpg'
        # Override download_media to actually download
        original_run = podcast_scraper.run
        def run_with_media():
            original_run()
            # Simulate media download
            for item in podcast_scraper.mock_data:
                if 'image' in item:
                    podcast_scraper.download_media(item['image'], item['id'], 'image')
        podcast_scraper.run = run_with_media
        orchestrator.run_all_scrapers(parallel=False)
        # Verify media directory and file
        media_dir = orchestrator.config.data_dir / "media" / "Podcast"
        assert media_dir.exists()
        # Check if media file was created
        media_files = list(media_dir.glob("*"))
        if media_files:  # Media download attempted
            assert len(media_files) > 0
    def test_archive_workflow(self, orchestrator_with_mock_data):
        """Test that archiving works correctly in full workflow"""
        orchestrator = orchestrator_with_mock_data
        # Create some existing files to archive
        current_dir = orchestrator.config.data_dir / "markdown_current"
        current_dir.mkdir(parents=True, exist_ok=True)
        old_file = current_dir / "hvacknowitall_wordpress_2024-01-01-T120000.md"
        old_file.write_text("Old content")
        # Run workflow
        orchestrator.run_all_scrapers(parallel=False)
        # Check that old file was archived
        archive_dir = orchestrator.config.data_dir / "markdown_archives" / "Wordpress"
        archived_files = list(archive_dir.glob("*.md"))
        # Should have archived the old file
        assert any("2024-01-01" in f.name for f in archived_files)
        # Current directory should have new files
        current_files = list(current_dir.glob("*.md"))
        assert len(current_files) == 3  # One per source
        assert not any("2024-01-01" in f.name for f in current_files)
 if __name__ == '__main__':
    pytest.main([__file__, '-v'])
--- a/tests/test_integration_simple.py
+++ b/tests/test_integration_simple.py
@ -0,0 +1,184 @@
 #!/usr/bin/env python3
 """
 Simple integration tests for parallel processing validation
 """
 import pytest
 import time
 import tempfile
 from pathlib import Path
 from unittest.mock import Mock, patch
 from concurrent.futures import ThreadPoolExecutor, as_completed
 # Add project to path
 import sys
 sys.path.insert(0, str(Path(__file__).parent.parent))
 from src.base_scraper import BaseScraper, ScraperConfig
 class SimpleMockScraper(BaseScraper):
    """Simple mock scraper for basic testing"""
    def __init__(self, config: ScraperConfig, delay: float = 0.1):
        super().__init__(config)
        self.delay = delay
        self.start_time = None
        self.end_time = None
    def fetch_content(self):
        return [
            {
                'id': f'{self.config.source_name}_1',
                'title': f'Test {self.config.source_name}',
                'url': f'https://example.com/{self.config.source_name}',
                'description': f'Test description for {self.config.source_name}',
                'likes': 10,
                'comments': 5
            }
        ]
    def get_incremental_items(self, items, state):
        return items  # Return all items for testing
    def update_state(self, state, items):
        if items:
            state['last_id'] = items[-1]['id']
        return state
    def run(self):
        self.start_time = time.time()
        time.sleep(self.delay)  # Simulate processing time
        super().run()  # Call parent run method
        self.end_time = time.time()
 def test_parallel_vs_sequential_execution():
    """Test that parallel execution is faster than sequential"""
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)
        # Create scrapers with different delays
        scrapers = []
        for i, delay in enumerate([0.1, 0.2, 0.15]):
            config = ScraperConfig(
                source_name=f"scraper_{i}",
                brand_name="test",
                data_dir=temp_path / "data",
                logs_dir=temp_path / "logs",
                timezone="America/Halifax"
            )
            scrapers.append(SimpleMockScraper(config, delay))
        # Time parallel execution
        start_time = time.time()
        def run_scraper(scraper):
            scraper.run()
            return scraper
        with ThreadPoolExecutor(max_workers=3) as executor:
            futures = [executor.submit(run_scraper, scraper) for scraper in scrapers]
            completed_scrapers = [future.result() for future in as_completed(futures)]
        parallel_time = time.time() - start_time
        # Reset scrapers for sequential test
        for scraper in scrapers:
            scraper.start_time = None
            scraper.end_time = None
        # Time sequential execution
        start_time = time.time()
        for scraper in scrapers:
            scraper.run()
        sequential_time = time.time() - start_time
        # Assertions
        print(f"Parallel time: {parallel_time:.3f}s")
        print(f"Sequential time: {sequential_time:.3f}s")
        assert parallel_time < sequential_time, f"Parallel ({parallel_time:.3f}s) should be faster than sequential ({sequential_time:.3f}s)"
        assert parallel_time < 0.5, f"Parallel execution should complete quickly: {parallel_time:.3f}s"
        assert sequential_time > 0.4, f"Sequential execution should take longer: {sequential_time:.3f}s"
 def test_parallel_scraper_overlap():
    """Test that scrapers have overlapping execution times"""
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)
        # Create scrapers with sufficient delay to detect overlap
        scrapers = []
        for i in range(3):
            config = ScraperConfig(
                source_name=f"scraper_{i}",
                brand_name="test",
                data_dir=temp_path / "data",
                logs_dir=temp_path / "logs",
                timezone="America/Halifax"
            )
            scrapers.append(SimpleMockScraper(config, 0.2))  # 200ms delay each
        # Run in parallel
        def run_scraper(scraper):
            scraper.run()
            return scraper
        with ThreadPoolExecutor(max_workers=3) as executor:
            futures = [executor.submit(run_scraper, scraper) for scraper in scrapers]
            completed_scrapers = [future.result() for future in as_completed(futures)]
        # Check for overlapping execution
        overlaps = 0
        for i in range(len(scrapers)):
            for j in range(i + 1, len(scrapers)):
                scraper_a = scrapers[i]
                scraper_b = scrapers[j]
                # Check if execution times overlap
                if (scraper_a.start_time < scraper_b.end_time and 
                    scraper_b.start_time < scraper_a.end_time):
                    overlaps += 1
        assert overlaps > 0, "No overlapping execution detected - scrapers may not be running in parallel"
        print(f"Detected {overlaps} overlapping execution pairs")
 def test_markdown_output_format():
    """Test that markdown output follows specification"""
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)
        config = ScraperConfig(
            source_name="test_scraper",
            brand_name="test",
            data_dir=temp_path / "data",
            logs_dir=temp_path / "logs",
            timezone="America/Halifax"
        )
        scraper = SimpleMockScraper(config, 0.1)
        scraper.run()
        # Check that markdown file was created
        markdown_files = list((temp_path / "data" / "markdown_current").glob("*.md"))
        assert len(markdown_files) == 1
        # Check content format
        content = markdown_files[0].read_text()
        assert "# ID:" in content
        assert "## Title:" in content
        assert "## Type:" in content
        assert "## Permalink:" in content
        assert "## Description:" in content
        assert "## Metadata:" in content
        assert "### Comments:" in content
        assert "### Likes:" in content
 if __name__ == '__main__':
    pytest.main([__file__, '-v'])
--- a/tests/test_orchestrator_integration.py
+++ b/tests/test_orchestrator_integration.py
@ -0,0 +1,316 @@
 #!/usr/bin/env python3
 """
 Integration tests for ContentOrchestrator parallel processing
 """
 import pytest
 import json
 import time
 import tempfile
 from pathlib import Path
 from unittest.mock import Mock, patch, MagicMock
 from concurrent.futures import ThreadPoolExecutor, as_completed
 # Add project to path
 import sys
 sys.path.insert(0, str(Path(__file__).parent.parent))
 from src.orchestrator import ContentOrchestrator
 from src.base_scraper import BaseScraper, ScraperConfig
 class MockScraper(BaseScraper):
    """Mock scraper for testing parallel processing"""
    def __init__(self, config: ScraperConfig, delay: float = 0.1, fail: bool = False):
        super().__init__(config)
        self.delay = delay
        self.fail = fail
        self.run_called = False
        self.run_start_time = None
        self.run_end_time = None
    def fetch_content(self):
        return [
            {
                'id': f'{self.config.source_name}_1',
                'title': f'Test {self.config.source_name} Title',
                'url': f'https://example.com/{self.config.source_name}/1',
                'description': f'Test description for {self.config.source_name}',
                'likes': 10,
                'comments': 5
            }
        ]
    def get_incremental_items(self, items, state):
        # Return all items for testing
        return items
    def update_state(self, state, items):
        state['last_id'] = items[-1]['id'] if items else None
        return state
    def run(self):
        self.run_called = True
        self.run_start_time = time.time()
        if self.fail:
            self.logger.error(f"Mock failure for {self.config.source_name}")
            raise Exception(f"Mock failure for {self.config.source_name}")
        # Simulate processing time
        time.sleep(self.delay)
        try:
            # Call parent run method for actual processing
            super().run()
        finally:
            self.run_end_time = time.time()
 class TestOrchestratorIntegration:
    """Integration tests for ContentOrchestrator"""
    @pytest.fixture
    def temp_config(self):
        """Create temporary config for testing"""
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)
            config = ScraperConfig(
                source_name="integration_test",
                brand_name="testbrand",
                data_dir=temp_path / "data",
                logs_dir=temp_path / "logs",
                timezone="America/Halifax"
            )
            yield config
    @pytest.fixture
    def orchestrator(self, temp_config):
        """Create orchestrator with mock scrapers"""
        orchestrator = ContentOrchestrator(temp_config.data_dir, temp_config.logs_dir)
        # Replace real scrapers with mock scrapers
        orchestrator.scrapers = {
            'fast_scraper': MockScraper(
                ScraperConfig(
                    source_name="fast_scraper",
                    brand_name=temp_config.brand_name,
                    data_dir=temp_config.data_dir,
                    logs_dir=temp_config.logs_dir,
                    timezone=temp_config.timezone
                ),
                delay=0.1
            ),
            'slow_scraper': MockScraper(
                ScraperConfig(
                    source_name="slow_scraper", 
                    brand_name=temp_config.brand_name,
                    data_dir=temp_config.data_dir,
                    logs_dir=temp_config.logs_dir,
                    timezone=temp_config.timezone
                ),
                delay=0.3
            ),
            'medium_scraper': MockScraper(
                ScraperConfig(
                    source_name="medium_scraper",
                    brand_name=temp_config.brand_name,
                    data_dir=temp_config.data_dir,
                    logs_dir=temp_config.logs_dir,
                    timezone=temp_config.timezone
                ),
                delay=0.2
            )
        }
        return orchestrator
    def test_parallel_execution_timing(self, orchestrator):
        """Test that parallel execution is faster than sequential"""
        scrapers = orchestrator.scrapers
        # Time parallel execution
        start_time = time.time()
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)
        parallel_time = time.time() - start_time
        # Reset scrapers for sequential test
        for scraper in scrapers.values():
            scraper.run_called = False
            scraper.run_start_time = None
            scraper.run_end_time = None
        # Time sequential execution
        start_time = time.time()
        orchestrator.run_all_scrapers(parallel=False)
        sequential_time = time.time() - start_time
        # Parallel should be significantly faster
        assert parallel_time < sequential_time
        assert parallel_time < 0.5  # Should complete well under total delay time
        assert sequential_time > 0.6  # Should be close to sum of delays (0.1 + 0.2 + 0.3)
        # Verify all scrapers ran
        for scraper in scrapers.values():
            assert scraper.run_called
    def test_parallel_scraper_overlap(self, orchestrator):
        """Test that scrapers actually run in parallel (overlapping execution)"""
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)
        scrapers = list(orchestrator.scrapers.values())
        # Check that at least two scrapers had overlapping execution
        overlaps = 0
        for i in range(len(scrapers)):
            for j in range(i + 1, len(scrapers)):
                scraper_a = scrapers[i]
                scraper_b = scrapers[j]
                # Check if execution times overlap
                if (scraper_a.run_start_time < scraper_b.run_end_time and 
                    scraper_b.run_start_time < scraper_a.run_end_time):
                    overlaps += 1
        assert overlaps > 0, "No overlapping execution detected - scrapers may not be running in parallel"
    def test_error_isolation(self, orchestrator):
        """Test that one failing scraper doesn't crash others"""
        # Make one scraper fail
        orchestrator.scrapers['slow_scraper'].fail = True
        # Run parallel processing
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)
        # Check that other scrapers still completed successfully
        assert orchestrator.scrapers['fast_scraper'].run_called
        assert orchestrator.scrapers['medium_scraper'].run_called
        assert orchestrator.scrapers['slow_scraper'].run_called  # It ran but failed
        # Check that successful scrapers produced output
        fast_files = list((orchestrator.config.data_dir / "markdown_current").glob("*fast_scraper*"))
        medium_files = list((orchestrator.config.data_dir / "markdown_current").glob("*medium_scraper*"))
        slow_files = list((orchestrator.config.data_dir / "markdown_current").glob("*slow_scraper*"))
        assert len(fast_files) > 0, "Fast scraper should have produced output"
        assert len(medium_files) > 0, "Medium scraper should have produced output"
        assert len(slow_files) == 0, "Slow scraper should not have produced output due to failure"
    def test_worker_pool_limits(self, orchestrator):
        """Test that max_workers parameter is respected"""
        # Add more scrapers than workers
        for i in range(5):
            scraper_name = f'extra_scraper_{i}'
            orchestrator.scrapers[scraper_name] = MockScraper(
                ScraperConfig(
                    source_name=scraper_name,
                    brand_name=orchestrator.config.brand_name,
                    data_dir=orchestrator.config.data_dir,
                    logs_dir=orchestrator.config.logs_dir,
                    timezone=orchestrator.config.timezone
                ),
                delay=0.1
            )
        # Run with limited workers
        start_time = time.time()
        orchestrator.run_all_scrapers(parallel=True, max_workers=2)
        execution_time = time.time() - start_time
        # With 8 scrapers and 2 workers, should take longer than with unlimited workers
        # but still benefit from parallelism
        assert execution_time > 0.3  # Should take multiple batches
        assert execution_time < 1.0   # But still faster than fully sequential
        # Verify all scrapers ran
        for scraper in orchestrator.scrapers.values():
            assert scraper.run_called
    def test_markdown_output_format(self, orchestrator):
        """Test that parallel processing produces spec-compliant markdown"""
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)
        # Check markdown files were created
        markdown_files = list((orchestrator.config.data_dir / "markdown_current").glob("*.md"))
        assert len(markdown_files) == 3  # One per successful scraper
        # Verify markdown format for each file
        for file_path in markdown_files:
            content = file_path.read_text(encoding='utf-8')
            # Check spec-compliant format
            assert "# ID:" in content
            assert "## Title:" in content
            assert "## Type:" in content
            assert "## Permalink:" in content
            assert "## Description:" in content
            assert "## Metadata:" in content
            assert "### Comments:" in content
            assert "### Likes:" in content
            assert "### Tags:" in content
    def test_state_management_isolation(self, orchestrator):
        """Test that state management works correctly in parallel"""
        # Run scrapers twice to test state persistence
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)
        # Check that state files were created
        state_files = list((orchestrator.config.data_dir / ".state").glob("*_state.json"))
        assert len(state_files) == 3
        # Verify state content
        for state_file in state_files:
            state_data = json.loads(state_file.read_text())
            assert 'last_update' in state_data
            assert 'last_item_count' in state_data
            assert state_data['last_item_count'] == 1  # Mock scrapers return 1 item
    def test_directory_structure_creation(self, orchestrator):
        """Test that parallel processing creates proper directory structure"""
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)
        base_data_dir = orchestrator.config.data_dir
        # Check that all required directories exist
        assert (base_data_dir / "markdown_current").exists()
        assert (base_data_dir / ".state").exists()
        # Check source-specific directories
        for source_name in orchestrator.scrapers.keys():
            source_title = source_name.title()
            assert (base_data_dir / "markdown_archives" / source_title).exists()
            assert (base_data_dir / "media" / source_title).exists()
            assert (orchestrator.config.logs_dir / source_title).exists()
    @patch('src.orchestrator.ContentOrchestrator.sync_to_nas')
    def test_nas_sync_called(self, mock_sync, orchestrator):
        """Test that NAS sync is called after parallel processing"""
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)
        # Verify sync was called
        mock_sync.assert_called_once()
    def test_logging_isolation(self, orchestrator):
        """Test that logging works correctly in parallel processing"""
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)
        # Check that log files were created for each scraper
        log_files = []
        for source_name in orchestrator.scrapers.keys():
            source_title = source_name.title()
            log_file = orchestrator.config.logs_dir / source_title / f"{source_name}.log"
            if log_file.exists():
                log_files.append(log_file)
        assert len(log_files) == 3
        # Verify log content
        for log_file in log_files:
            log_content = log_file.read_text()
            assert "Starting" in log_content
            assert "Successfully processed" in log_content
 if __name__ == '__main__':
    pytest.main([__file__, '-v'])