- Created unit tests for BaseScraper with mocking
- Added integration tests for parallel processing
- Created end-to-end tests with realistic mock data
- Fixed initialization order in BaseScraper (logger before user agent)
- Fixed orchestrator method name (archive_current_file)
- Added tenacity dependency for retry logic (see the sketch below)
- Validated parallel processing performance and overlap detection
- Confirmed spec-compliant markdown formatting in tests

Tests cover:
- Base scraper functionality (state, markdown, retry logic, media downloads)
- Parallel vs sequential execution timing
- Error isolation between scrapers
- Directory structure creation
- State management across runs
- Full workflow with realistic data

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
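The retry logic itself lives in BaseScraper and is not shown in the test file below. As a rough illustration of what a tenacity-based retry can look like (the decorator arguments and the `fetch_with_retry` helper are hypothetical, not taken from this project):

```python
# Minimal sketch of tenacity-based retry logic. The stop/wait policy and the
# helper name are illustrative assumptions, not the project's actual code.
import requests
from tenacity import retry, stop_after_attempt, wait_exponential


@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=10))
def fetch_with_retry(url: str) -> str:
    """Fetch a URL, retrying up to 3 times with exponential backoff."""
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # raising here re-triggers the retry
    return response.text
```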
#!/usr/bin/env python3
"""
Integration tests for ContentOrchestrator parallel processing
"""

import pytest
import json
import time
import tempfile
from pathlib import Path
from unittest.mock import patch

# Add project to path
import sys
sys.path.insert(0, str(Path(__file__).parent.parent))

from src.orchestrator import ContentOrchestrator
from src.base_scraper import BaseScraper, ScraperConfig


class MockScraper(BaseScraper):
    """Mock scraper for testing parallel processing"""

    def __init__(self, config: ScraperConfig, delay: float = 0.1, fail: bool = False):
        super().__init__(config)
        self.delay = delay
        self.fail = fail
        self.run_called = False
        self.run_start_time = None
        self.run_end_time = None

    def fetch_content(self):
        return [
            {
                'id': f'{self.config.source_name}_1',
                'title': f'Test {self.config.source_name} Title',
                'url': f'https://example.com/{self.config.source_name}/1',
                'description': f'Test description for {self.config.source_name}',
                'likes': 10,
                'comments': 5
            }
        ]

    def get_incremental_items(self, items, state):
        # Return all items for testing
        return items

    def update_state(self, state, items):
        state['last_id'] = items[-1]['id'] if items else None
        return state

    def run(self):
        self.run_called = True
        self.run_start_time = time.time()

        # Simulated failure: raise before doing any work. Note that
        # run_end_time stays None in this case.
        if self.fail:
            self.logger.error(f"Mock failure for {self.config.source_name}")
            raise Exception(f"Mock failure for {self.config.source_name}")

        # Simulate processing time
        time.sleep(self.delay)

        try:
            # Call parent run method for actual processing
            super().run()
        finally:
            self.run_end_time = time.time()


class TestOrchestratorIntegration:
    """Integration tests for ContentOrchestrator"""

    @pytest.fixture
    def temp_config(self):
        """Create temporary config for testing"""
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)
            config = ScraperConfig(
                source_name="integration_test",
                brand_name="testbrand",
                data_dir=temp_path / "data",
                logs_dir=temp_path / "logs",
                timezone="America/Halifax"
            )
            yield config

    @pytest.fixture
    def orchestrator(self, temp_config):
        """Create orchestrator with mock scrapers"""
        orchestrator = ContentOrchestrator(temp_config.data_dir, temp_config.logs_dir)

        def make_scraper(name: str, delay: float) -> MockScraper:
            """Build a MockScraper that shares directories with the test config."""
            return MockScraper(
                ScraperConfig(
                    source_name=name,
                    brand_name=temp_config.brand_name,
                    data_dir=temp_config.data_dir,
                    logs_dir=temp_config.logs_dir,
                    timezone=temp_config.timezone
                ),
                delay=delay
            )

        # Replace real scrapers with mock scrapers of varying speeds
        orchestrator.scrapers = {
            'fast_scraper': make_scraper('fast_scraper', delay=0.1),
            'slow_scraper': make_scraper('slow_scraper', delay=0.3),
            'medium_scraper': make_scraper('medium_scraper', delay=0.2),
        }

        return orchestrator

    def test_parallel_execution_timing(self, orchestrator):
        """Test that parallel execution is faster than sequential"""
        scrapers = orchestrator.scrapers

        # Time parallel execution
        start_time = time.time()
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)
        parallel_time = time.time() - start_time

        # Reset scrapers for sequential test
        for scraper in scrapers.values():
            scraper.run_called = False
            scraper.run_start_time = None
            scraper.run_end_time = None

        # Time sequential execution
        start_time = time.time()
        orchestrator.run_all_scrapers(parallel=False)
        sequential_time = time.time() - start_time

        # Parallel should be significantly faster
        assert parallel_time < sequential_time
        assert parallel_time < 0.5  # Should complete well under the 0.6 s total delay
        assert sequential_time > 0.6  # Should be close to the sum of delays (0.1 + 0.2 + 0.3)

        # Verify all scrapers ran
        for scraper in scrapers.values():
            assert scraper.run_called

    def test_parallel_scraper_overlap(self, orchestrator):
        """Test that scrapers actually run in parallel (overlapping execution)"""
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)

        scrapers = list(orchestrator.scrapers.values())

        # Check that at least two scrapers had overlapping execution
        overlaps = 0
        for i in range(len(scrapers)):
            for j in range(i + 1, len(scrapers)):
                scraper_a = scrapers[i]
                scraper_b = scrapers[j]

                # Two intervals [a_start, a_end] and [b_start, b_end] overlap
                # iff each one starts before the other ends
                if (scraper_a.run_start_time < scraper_b.run_end_time and
                    scraper_b.run_start_time < scraper_a.run_end_time):
                    overlaps += 1

        assert overlaps > 0, "No overlapping execution detected - scrapers may not be running in parallel"

    def test_error_isolation(self, orchestrator):
        """Test that one failing scraper doesn't crash others"""
        # Make one scraper fail
        orchestrator.scrapers['slow_scraper'].fail = True

        # Run parallel processing
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)

        # Check that other scrapers still completed successfully
        assert orchestrator.scrapers['fast_scraper'].run_called
        assert orchestrator.scrapers['medium_scraper'].run_called
        assert orchestrator.scrapers['slow_scraper'].run_called  # It ran but failed

        # Check that successful scrapers produced output
        fast_files = list((orchestrator.config.data_dir / "markdown_current").glob("*fast_scraper*"))
        medium_files = list((orchestrator.config.data_dir / "markdown_current").glob("*medium_scraper*"))
        slow_files = list((orchestrator.config.data_dir / "markdown_current").glob("*slow_scraper*"))

        assert len(fast_files) > 0, "Fast scraper should have produced output"
        assert len(medium_files) > 0, "Medium scraper should have produced output"
        assert len(slow_files) == 0, "Slow scraper should not have produced output due to failure"

    def test_worker_pool_limits(self, orchestrator):
        """Test that max_workers parameter is respected"""
        # Add more scrapers than workers
        for i in range(5):
            scraper_name = f'extra_scraper_{i}'
            orchestrator.scrapers[scraper_name] = MockScraper(
                ScraperConfig(
                    source_name=scraper_name,
                    brand_name=orchestrator.config.brand_name,
                    data_dir=orchestrator.config.data_dir,
                    logs_dir=orchestrator.config.logs_dir,
                    timezone=orchestrator.config.timezone
                ),
                delay=0.1
            )

        # Run with limited workers
        start_time = time.time()
        orchestrator.run_all_scrapers(parallel=True, max_workers=2)
        execution_time = time.time() - start_time

        # 8 scrapers carry 0.1 + 0.3 + 0.2 + 5 * 0.1 = 1.1 s of simulated delay,
        # so 2 workers need at least ~0.55 s: slower than an unlimited pool,
        # but still well under the fully sequential 1.1 s
        assert execution_time > 0.3  # Should take multiple batches
        assert execution_time < 1.0  # But still faster than fully sequential

        # Verify all scrapers ran
        for scraper in orchestrator.scrapers.values():
            assert scraper.run_called

    def test_markdown_output_format(self, orchestrator):
        """Test that parallel processing produces spec-compliant markdown"""
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)

        # Check markdown files were created
        markdown_files = list((orchestrator.config.data_dir / "markdown_current").glob("*.md"))
        assert len(markdown_files) == 3  # One per successful scraper

        # Verify markdown format for each file
        for file_path in markdown_files:
            content = file_path.read_text(encoding='utf-8')

            # Check spec-compliant format
            assert "# ID:" in content
            assert "## Title:" in content
            assert "## Type:" in content
            assert "## Permalink:" in content
            assert "## Description:" in content
            assert "## Metadata:" in content
            assert "### Comments:" in content
            assert "### Likes:" in content
            assert "### Tags:" in content

    def test_state_management_isolation(self, orchestrator):
        """Test that state management works correctly in parallel"""
        # Run scrapers and check that each one persisted its own state
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)

        # Check that state files were created
        state_files = list((orchestrator.config.data_dir / ".state").glob("*_state.json"))
        assert len(state_files) == 3

        # Verify state content
        for state_file in state_files:
            state_data = json.loads(state_file.read_text())
            assert 'last_update' in state_data
            assert 'last_item_count' in state_data
            assert state_data['last_item_count'] == 1  # Mock scrapers return 1 item

    def test_directory_structure_creation(self, orchestrator):
        """Test that parallel processing creates proper directory structure"""
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)

        base_data_dir = orchestrator.config.data_dir

        # Check that all required directories exist
        assert (base_data_dir / "markdown_current").exists()
        assert (base_data_dir / ".state").exists()

        # Check source-specific directories
        for source_name in orchestrator.scrapers.keys():
            source_title = source_name.title()
            assert (base_data_dir / "markdown_archives" / source_title).exists()
            assert (base_data_dir / "media" / source_title).exists()
            assert (orchestrator.config.logs_dir / source_title).exists()

    @patch('src.orchestrator.ContentOrchestrator.sync_to_nas')
    def test_nas_sync_called(self, mock_sync, orchestrator):
        """Test that NAS sync is called after parallel processing"""
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)

        # Verify sync was called
        mock_sync.assert_called_once()

    def test_logging_isolation(self, orchestrator):
        """Test that logging works correctly in parallel processing"""
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)

        # Check that log files were created for each scraper
        log_files = []
        for source_name in orchestrator.scrapers.keys():
            source_title = source_name.title()
            log_file = orchestrator.config.logs_dir / source_title / f"{source_name}.log"
            if log_file.exists():
                log_files.append(log_file)

        assert len(log_files) == 3

        # Verify log content
        for log_file in log_files:
            log_content = log_file.read_text()
            assert "Starting" in log_content
            assert "Successfully processed" in log_content


if __name__ == '__main__':
    pytest.main([__file__, '-v'])
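
# Tip: a single test can also be run from the CLI with pytest's -k filter, e.g.
#   pytest -k test_parallel_execution_timing -v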