hvac-kia-content/tests/test_integration_simple.py
Ben Reed 8d5750b1d1 Add comprehensive test infrastructure
- Created unit tests for BaseScraper with mocking
- Added integration tests for parallel processing
- Created end-to-end tests with realistic mock data
- Fixed initialization order in BaseScraper (logger before user agent)
- Fixed orchestrator method name (archive_current_file)
- Added tenacity dependency for retry logic
- Validated parallel processing performance and overlap detection
- Confirmed spec-compliant markdown formatting in tests

Tests cover:
- Base scraper functionality (state, markdown, retry logic, media downloads)
- Parallel vs sequential execution timing
- Error isolation between scrapers
- Directory structure creation
- State management across runs
- Full workflow with realistic data

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-18 21:16:14 -03:00

184 lines
No EOL
6.2 KiB
Python

#!/usr/bin/env python3
"""
Simple integration tests for parallel processing validation
"""
import pytest
import time
import tempfile
from pathlib import Path
from unittest.mock import Mock, patch
from concurrent.futures import ThreadPoolExecutor, as_completed
# Add project to path
import sys
sys.path.insert(0, str(Path(__file__).parent.parent))
from src.base_scraper import BaseScraper, ScraperConfig
class SimpleMockScraper(BaseScraper):
    """Lightweight BaseScraper subclass for exercising parallel execution.

    Records wall-clock start/end timestamps around each run and sleeps for a
    configurable delay so the tests can measure timing and detect overlap.
    """

    def __init__(self, config: ScraperConfig, delay: float = 0.1):
        super().__init__(config)
        self.delay = delay        # simulated per-run processing time (seconds)
        self.start_time = None    # stamped when run() begins
        self.end_time = None      # stamped when run() finishes

    def fetch_content(self):
        """Return a single deterministic fake item derived from the source name."""
        source = self.config.source_name
        item = {
            'id': f'{source}_1',
            'title': f'Test {source}',
            'url': f'https://example.com/{source}',
            'description': f'Test description for {source}',
            'likes': 10,
            'comments': 5,
        }
        return [item]

    def get_incremental_items(self, items, state):
        """No incremental filtering in tests — every item is considered new."""
        return items

    def update_state(self, state, items):
        """Record the id of the most recent item (if any) and return the state."""
        if not items:
            return state
        state['last_id'] = items[-1]['id']
        return state

    def run(self):
        """Timestamped wrapper around BaseScraper.run with an artificial delay."""
        self.start_time = time.time()
        time.sleep(self.delay)  # simulate processing work
        super().run()
        self.end_time = time.time()
def test_parallel_vs_sequential_execution():
    """Test that parallel execution is faster than sequential.

    Three mock scrapers with delays of 0.1/0.2/0.15s are run first through a
    ThreadPoolExecutor and then one after another: parallel wall time should
    approach max(delays) while sequential time approaches sum(delays).
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)

        # Create scrapers with different delays
        scrapers = []
        for i, delay in enumerate([0.1, 0.2, 0.15]):
            config = ScraperConfig(
                source_name=f"scraper_{i}",
                brand_name="test",
                data_dir=temp_path / "data",
                logs_dir=temp_path / "logs",
                timezone="America/Halifax",
            )
            scrapers.append(SimpleMockScraper(config, delay))

        def run_scraper(scraper):
            scraper.run()
            return scraper

        # Time parallel execution
        start_time = time.time()
        with ThreadPoolExecutor(max_workers=3) as executor:
            futures = [executor.submit(run_scraper, scraper) for scraper in scrapers]
            # Drain the futures (without binding an unused result list) so any
            # exception raised inside a worker is re-raised here.
            for future in as_completed(futures):
                future.result()
        parallel_time = time.time() - start_time

        # Reset timestamps so the sequential pass records fresh values
        for scraper in scrapers:
            scraper.start_time = None
            scraper.end_time = None

        # Time sequential execution
        start_time = time.time()
        for scraper in scrapers:
            scraper.run()
        sequential_time = time.time() - start_time

        print(f"Parallel time: {parallel_time:.3f}s")
        print(f"Sequential time: {sequential_time:.3f}s")
        assert parallel_time < sequential_time, f"Parallel ({parallel_time:.3f}s) should be faster than sequential ({sequential_time:.3f}s)"
        assert parallel_time < 0.5, f"Parallel execution should complete quickly: {parallel_time:.3f}s"
        assert sequential_time > 0.4, f"Sequential execution should take longer: {sequential_time:.3f}s"
def test_parallel_scraper_overlap():
    """Test that scrapers have overlapping execution times.

    Each scraper sleeps 200ms; if the pool truly runs them concurrently, at
    least one pair of [start_time, end_time] intervals must intersect.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)

        # Create scrapers with sufficient delay to detect overlap
        scrapers = []
        for i in range(3):
            config = ScraperConfig(
                source_name=f"scraper_{i}",
                brand_name="test",
                data_dir=temp_path / "data",
                logs_dir=temp_path / "logs",
                timezone="America/Halifax",
            )
            scrapers.append(SimpleMockScraper(config, 0.2))  # 200ms delay each

        def run_scraper(scraper):
            scraper.run()
            return scraper

        # Run in parallel; drain futures (without binding an unused result
        # list) so worker exceptions propagate to the test.
        with ThreadPoolExecutor(max_workers=3) as executor:
            futures = [executor.submit(run_scraper, scraper) for scraper in scrapers]
            for future in as_completed(futures):
                future.result()

        # Count pairs whose [start, end] intervals intersect
        overlaps = 0
        for i in range(len(scrapers)):
            for j in range(i + 1, len(scrapers)):
                scraper_a = scrapers[i]
                scraper_b = scrapers[j]
                # Standard interval-intersection check
                if (scraper_a.start_time < scraper_b.end_time and
                        scraper_b.start_time < scraper_a.end_time):
                    overlaps += 1

        assert overlaps > 0, "No overlapping execution detected - scrapers may not be running in parallel"
        print(f"Detected {overlaps} overlapping execution pairs")
def test_markdown_output_format():
    """Test that markdown output follows specification."""
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)
        config = ScraperConfig(
            source_name="test_scraper",
            brand_name="test",
            data_dir=temp_path / "data",
            logs_dir=temp_path / "logs",
            timezone="America/Halifax",
        )
        scraper = SimpleMockScraper(config, 0.1)
        scraper.run()

        # Exactly one markdown file should have been written
        markdown_files = list((temp_path / "data" / "markdown_current").glob("*.md"))
        assert len(markdown_files) == 1

        # Every spec-mandated heading must appear in the rendered content
        content = markdown_files[0].read_text()
        required_headings = (
            "# ID:",
            "## Title:",
            "## Type:",
            "## Permalink:",
            "## Description:",
            "## Metadata:",
            "### Comments:",
            "### Likes:",
        )
        for heading in required_headings:
            assert heading in content
# Allow running this file directly: delegate to pytest's CLI in verbose mode.
if __name__ == '__main__':
    pytest.main([__file__, '-v'])