#!/usr/bin/env python3
"""
Simple integration tests for parallel processing validation
"""
import pytest
import time
import tempfile
from pathlib import Path
from unittest.mock import Mock, patch
from concurrent.futures import ThreadPoolExecutor, as_completed

# Add project to path
import sys
sys.path.insert(0, str(Path(__file__).parent.parent))

from src.base_scraper import BaseScraper, ScraperConfig


class SimpleMockScraper(BaseScraper):
    """Simple mock scraper for basic testing.

    Records wall-clock start/end timestamps around each run() so the tests
    below can measure duration and detect overlapping (parallel) execution.
    """

    def __init__(self, config: ScraperConfig, delay: float = 0.1):
        super().__init__(config)
        self.delay = delay        # simulated per-run processing time, in seconds
        self.start_time = None    # wall-clock time when run() began (None until first run)
        self.end_time = None      # wall-clock time when run() finished

    def fetch_content(self):
        """Return a single deterministic fake item derived from the source name."""
        return [
            {
                'id': f'{self.config.source_name}_1',
                'title': f'Test {self.config.source_name}',
                'url': f'https://example.com/{self.config.source_name}',
                'description': f'Test description for {self.config.source_name}',
                'likes': 10,
                'comments': 5
            }
        ]

    def get_incremental_items(self, items, state):
        return items  # Return all items for testing

    def update_state(self, state, items):
        if items:
            state['last_id'] = items[-1]['id']
        return state

    def run(self):
        """Run the scraper, sleeping first to simulate real processing time."""
        self.start_time = time.time()
        time.sleep(self.delay)  # Simulate processing time
        super().run()  # Call parent run method
        self.end_time = time.time()


def test_parallel_vs_sequential_execution():
    """Test that parallel execution is faster than sequential"""
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)

        # Create scrapers with different delays (total sequential cost: 0.45s)
        scrapers = []
        for i, delay in enumerate([0.1, 0.2, 0.15]):
            config = ScraperConfig(
                source_name=f"scraper_{i}",
                brand_name="test",
                data_dir=temp_path / "data",
                logs_dir=temp_path / "logs",
                timezone="America/Halifax"
            )
            scrapers.append(SimpleMockScraper(config, delay))

        # Time parallel execution
        start_time = time.time()

        def run_scraper(scraper):
            scraper.run()
            return scraper

        with ThreadPoolExecutor(max_workers=3) as executor:
            futures = [executor.submit(run_scraper, scraper) for scraper in scrapers]
            # Drain the futures; .result() re-raises any exception from a worker.
            for future in as_completed(futures):
                future.result()

        parallel_time = time.time() - start_time

        # Reset scrapers for sequential test
        for scraper in scrapers:
            scraper.start_time = None
            scraper.end_time = None

        # Time sequential execution
        start_time = time.time()
        for scraper in scrapers:
            scraper.run()
        sequential_time = time.time() - start_time

        # Assertions: parallel wall time should approach max(delays) ~ 0.2s,
        # while sequential should approach sum(delays) ~ 0.45s.
        print(f"Parallel time: {parallel_time:.3f}s")
        print(f"Sequential time: {sequential_time:.3f}s")

        assert parallel_time < sequential_time, \
            f"Parallel ({parallel_time:.3f}s) should be faster than sequential ({sequential_time:.3f}s)"
        assert parallel_time < 0.5, \
            f"Parallel execution should complete quickly: {parallel_time:.3f}s"
        assert sequential_time > 0.4, \
            f"Sequential execution should take longer: {sequential_time:.3f}s"


def test_parallel_scraper_overlap():
    """Test that scrapers have overlapping execution times"""
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)

        # Create scrapers with sufficient delay to detect overlap
        scrapers = []
        for i in range(3):
            config = ScraperConfig(
                source_name=f"scraper_{i}",
                brand_name="test",
                data_dir=temp_path / "data",
                logs_dir=temp_path / "logs",
                timezone="America/Halifax"
            )
            scrapers.append(SimpleMockScraper(config, 0.2))  # 200ms delay each

        # Run in parallel
        def run_scraper(scraper):
            scraper.run()
            return scraper

        with ThreadPoolExecutor(max_workers=3) as executor:
            futures = [executor.submit(run_scraper, scraper) for scraper in scrapers]
            # Drain the futures; .result() re-raises any exception from a worker.
            for future in as_completed(futures):
                future.result()

        # Check for overlapping execution: two intervals [a.start, a.end] and
        # [b.start, b.end] overlap iff each starts before the other ends.
        overlaps = 0
        for i in range(len(scrapers)):
            for j in range(i + 1, len(scrapers)):
                scraper_a = scrapers[i]
                scraper_b = scrapers[j]

                # Check if execution times overlap
                if (scraper_a.start_time < scraper_b.end_time and
                        scraper_b.start_time < scraper_a.end_time):
                    overlaps += 1

        assert overlaps > 0, "No overlapping execution detected - scrapers may not be running in parallel"
        print(f"Detected {overlaps} overlapping execution pairs")


def test_markdown_output_format():
    """Test that markdown output follows specification"""
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)

        config = ScraperConfig(
            source_name="test_scraper",
            brand_name="test",
            data_dir=temp_path / "data",
            logs_dir=temp_path / "logs",
            timezone="America/Halifax"
        )

        scraper = SimpleMockScraper(config, 0.1)
        scraper.run()

        # Check that markdown file was created
        markdown_files = list((temp_path / "data" / "markdown_current").glob("*.md"))
        assert len(markdown_files) == 1

        # Check content format against the expected section headings
        content = markdown_files[0].read_text()
        assert "# ID:" in content
        assert "## Title:" in content
        assert "## Type:" in content
        assert "## Permalink:" in content
        assert "## Description:" in content
        assert "## Metadata:" in content
        assert "### Comments:" in content
        assert "### Likes:" in content


if __name__ == '__main__':
    pytest.main([__file__, '-v'])