- Created unit tests for BaseScraper with mocking - Added integration tests for parallel processing - Created end-to-end tests with realistic mock data - Fixed initialization order in BaseScraper (logger before user agent) - Fixed orchestrator method name (archive_current_file) - Added tenacity dependency for retry logic - Validated parallel processing performance and overlap detection - Confirmed spec-compliant markdown formatting in tests Tests cover: - Base scraper functionality (state, markdown, retry logic, media downloads) - Parallel vs sequential execution timing - Error isolation between scrapers - Directory structure creation - State management across runs - Full workflow with realistic data 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
184 lines
No EOL
6.2 KiB
Python
184 lines
No EOL
6.2 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Simple integration tests for parallel processing validation
|
|
"""
|
|
|
|
import pytest
|
|
import time
|
|
import tempfile
|
|
from pathlib import Path
|
|
from unittest.mock import Mock, patch
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
|
|
# Add project to path
|
|
import sys
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
from src.base_scraper import BaseScraper, ScraperConfig
|
|
|
|
|
|
class SimpleMockScraper(BaseScraper):
    """Minimal BaseScraper subclass used to exercise the pipeline in tests.

    run() sleeps for a configurable delay and records wall-clock start/end
    timestamps so the tests can reason about parallel vs. sequential timing.
    """

    def __init__(self, config: ScraperConfig, delay: float = 0.1):
        super().__init__(config)
        # Artificial processing time injected into run().
        self.delay = delay
        # Populated by run(); tests inspect these to detect overlap.
        self.start_time = None
        self.end_time = None

    def fetch_content(self):
        """Return a single deterministic fake item keyed to this source."""
        src = self.config.source_name
        item = {
            'id': f'{src}_1',
            'title': f'Test {src}',
            'url': f'https://example.com/{src}',
            'description': f'Test description for {src}',
            'likes': 10,
            'comments': 5,
        }
        return [item]

    def get_incremental_items(self, items, state):
        """No incremental filtering in tests — treat every item as new."""
        return items

    def update_state(self, state, items):
        """Record the id of the last processed item, if any."""
        if not items:
            return state
        state['last_id'] = items[-1]['id']
        return state

    def run(self):
        """Wrap the parent run() with a sleep and start/end timestamps."""
        self.start_time = time.time()
        time.sleep(self.delay)  # Simulate processing time
        super().run()
        self.end_time = time.time()
|
|
|
|
|
|
def test_parallel_vs_sequential_execution():
    """Test that parallel execution is faster than sequential.

    Three mock scrapers with delays of 0.1s, 0.2s and 0.15s are run first
    through a ThreadPoolExecutor and then back-to-back; parallel wall time
    should approach max(delays) while sequential time approaches sum(delays).
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)

        # Create scrapers with different delays
        scrapers = []
        for i, delay in enumerate([0.1, 0.2, 0.15]):
            config = ScraperConfig(
                source_name=f"scraper_{i}",
                brand_name="test",
                data_dir=temp_path / "data",
                logs_dir=temp_path / "logs",
                timezone="America/Halifax"
            )
            scrapers.append(SimpleMockScraper(config, delay))

        # Time parallel execution.  Submit the bound run() method directly —
        # no wrapper function or unused result list needed.
        start_time = time.time()
        with ThreadPoolExecutor(max_workers=3) as executor:
            futures = [executor.submit(scraper.run) for scraper in scrapers]
            # result() re-raises any exception from a worker thread so
            # failures inside a scraper are not silently swallowed.
            for future in as_completed(futures):
                future.result()
        parallel_time = time.time() - start_time

        # Reset recorded timestamps before the sequential pass
        for scraper in scrapers:
            scraper.start_time = None
            scraper.end_time = None

        # Time sequential execution
        start_time = time.time()
        for scraper in scrapers:
            scraper.run()
        sequential_time = time.time() - start_time

        print(f"Parallel time: {parallel_time:.3f}s")
        print(f"Sequential time: {sequential_time:.3f}s")

        assert parallel_time < sequential_time, f"Parallel ({parallel_time:.3f}s) should be faster than sequential ({sequential_time:.3f}s)"
        assert parallel_time < 0.5, f"Parallel execution should complete quickly: {parallel_time:.3f}s"
        assert sequential_time > 0.4, f"Sequential execution should take longer: {sequential_time:.3f}s"
|
|
|
|
|
|
def test_parallel_scraper_overlap():
    """Test that scrapers have overlapping execution times.

    Each scraper records start/end wall-clock times in run(); if any two
    intervals intersect, the scrapers genuinely ran concurrently.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)

        # Create scrapers with sufficient delay to detect overlap
        scrapers = []
        for i in range(3):
            config = ScraperConfig(
                source_name=f"scraper_{i}",
                brand_name="test",
                data_dir=temp_path / "data",
                logs_dir=temp_path / "logs",
                timezone="America/Halifax"
            )
            scrapers.append(SimpleMockScraper(config, 0.2))  # 200ms delay each

        # Run in parallel; result() re-raises worker-thread exceptions so a
        # failing scraper fails the test instead of being silently dropped.
        with ThreadPoolExecutor(max_workers=3) as executor:
            futures = [executor.submit(scraper.run) for scraper in scrapers]
            for future in as_completed(futures):
                future.result()

        # Count pairs whose [start, end] intervals intersect
        overlaps = 0
        for i in range(len(scrapers)):
            for j in range(i + 1, len(scrapers)):
                scraper_a = scrapers[i]
                scraper_b = scrapers[j]
                # Standard interval-intersection test: each starts before
                # the other one ends.
                if (scraper_a.start_time < scraper_b.end_time and
                        scraper_b.start_time < scraper_a.end_time):
                    overlaps += 1

        assert overlaps > 0, "No overlapping execution detected - scrapers may not be running in parallel"
        print(f"Detected {overlaps} overlapping execution pairs")
|
|
|
|
|
|
def test_markdown_output_format():
    """Test that markdown output follows specification"""
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)

        config = ScraperConfig(
            source_name="test_scraper",
            brand_name="test",
            data_dir=temp_path / "data",
            logs_dir=temp_path / "logs",
            timezone="America/Halifax"
        )

        scraper = SimpleMockScraper(config, 0.1)
        scraper.run()

        # Exactly one markdown file should have been written
        markdown_files = list((temp_path / "data" / "markdown_current").glob("*.md"))
        assert len(markdown_files) == 1

        # Every spec-mandated section header must be present in the output
        content = markdown_files[0].read_text()
        required_sections = (
            "# ID:",
            "## Title:",
            "## Type:",
            "## Permalink:",
            "## Description:",
            "## Metadata:",
            "### Comments:",
            "### Likes:",
        )
        for section in required_sections:
            assert section in content
|
|
|
|
|
|
if __name__ == '__main__':
    # Propagate pytest's exit status: pytest.main() returns the exit code,
    # and discarding it would make this script exit 0 even when tests fail.
    sys.exit(pytest.main([__file__, '-v']))