- Created unit tests for BaseScraper with mocking
- Added integration tests for parallel processing
- Created end-to-end tests with realistic mock data
- Fixed initialization order in BaseScraper (logger before user agent)
- Fixed orchestrator method name (archive_current_file)
- Added tenacity dependency for retry logic (see the sketch below)
- Validated parallel processing performance and overlap detection
- Confirmed spec-compliant markdown formatting in tests

Tests cover:
- Base scraper functionality (state, markdown, retry logic, media downloads)
- Parallel vs sequential execution timing
- Error isolation between scrapers
- Directory structure creation
- State management across runs
- Full workflow with realistic data

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
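The retry logic itself lives in BaseScraper and is not shown in the test file below. As a rough illustration of what a tenacity-based retry can look like (the decorator arguments and the `fetch_with_retry` helper are hypothetical, not taken from this project):

```python
# Minimal sketch of tenacity-based retry logic. The stop/wait policy and the
# helper name are illustrative assumptions, not the project's actual code.
import requests
from tenacity import retry, stop_after_attempt, wait_exponential


@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=10))
def fetch_with_retry(url: str) -> str:
    """Fetch a URL, retrying up to 3 times with exponential backoff."""
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # raising here re-triggers the retry
    return response.text
```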
#!/usr/bin/env python3
"""
Integration tests for ContentOrchestrator parallel processing
"""

import pytest
import json
import time
import tempfile
from pathlib import Path
from unittest.mock import patch

# Add project to path
import sys
sys.path.insert(0, str(Path(__file__).parent.parent))

from src.orchestrator import ContentOrchestrator
from src.base_scraper import BaseScraper, ScraperConfig


class MockScraper(BaseScraper):
    """Mock scraper for testing parallel processing"""

    def __init__(self, config: ScraperConfig, delay: float = 0.1, fail: bool = False):
        super().__init__(config)
        self.delay = delay
        self.fail = fail
        self.run_called = False
        self.run_start_time = None
        self.run_end_time = None

    def fetch_content(self):
        return [
            {
                'id': f'{self.config.source_name}_1',
                'title': f'Test {self.config.source_name} Title',
                'url': f'https://example.com/{self.config.source_name}/1',
                'description': f'Test description for {self.config.source_name}',
                'likes': 10,
                'comments': 5
            }
        ]

    def get_incremental_items(self, items, state):
        # Return all items for testing
        return items

    def update_state(self, state, items):
        state['last_id'] = items[-1]['id'] if items else None
        return state

    def run(self):
        self.run_called = True
        self.run_start_time = time.time()

        # Simulated failure: raise before doing any work. Note that
        # run_end_time stays None in this case.
        if self.fail:
            self.logger.error(f"Mock failure for {self.config.source_name}")
            raise Exception(f"Mock failure for {self.config.source_name}")

        # Simulate processing time
        time.sleep(self.delay)

        try:
            # Call parent run method for actual processing
            super().run()
        finally:
            self.run_end_time = time.time()


class TestOrchestratorIntegration:
    """Integration tests for ContentOrchestrator"""

    @pytest.fixture
    def temp_config(self):
        """Create temporary config for testing"""
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)
            config = ScraperConfig(
                source_name="integration_test",
                brand_name="testbrand",
                data_dir=temp_path / "data",
                logs_dir=temp_path / "logs",
                timezone="America/Halifax"
            )
            yield config

    @pytest.fixture
    def orchestrator(self, temp_config):
        """Create orchestrator with mock scrapers"""
        orchestrator = ContentOrchestrator(temp_config.data_dir, temp_config.logs_dir)

        def make_scraper(name: str, delay: float) -> MockScraper:
            """Build a MockScraper that shares directories with the test config."""
            return MockScraper(
                ScraperConfig(
                    source_name=name,
                    brand_name=temp_config.brand_name,
                    data_dir=temp_config.data_dir,
                    logs_dir=temp_config.logs_dir,
                    timezone=temp_config.timezone
                ),
                delay=delay
            )

        # Replace real scrapers with mock scrapers of varying speeds
        orchestrator.scrapers = {
            'fast_scraper': make_scraper('fast_scraper', delay=0.1),
            'slow_scraper': make_scraper('slow_scraper', delay=0.3),
            'medium_scraper': make_scraper('medium_scraper', delay=0.2),
        }

        return orchestrator

    def test_parallel_execution_timing(self, orchestrator):
        """Test that parallel execution is faster than sequential"""
        scrapers = orchestrator.scrapers

        # Time parallel execution
        start_time = time.time()
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)
        parallel_time = time.time() - start_time

        # Reset scrapers for sequential test
        for scraper in scrapers.values():
            scraper.run_called = False
            scraper.run_start_time = None
            scraper.run_end_time = None

        # Time sequential execution
        start_time = time.time()
        orchestrator.run_all_scrapers(parallel=False)
        sequential_time = time.time() - start_time

        # Parallel should be significantly faster
        assert parallel_time < sequential_time
        assert parallel_time < 0.5  # Should complete well under the 0.6 s total delay
        assert sequential_time > 0.6  # Should be close to the sum of delays (0.1 + 0.2 + 0.3)

        # Verify all scrapers ran
        for scraper in scrapers.values():
            assert scraper.run_called

    def test_parallel_scraper_overlap(self, orchestrator):
        """Test that scrapers actually run in parallel (overlapping execution)"""
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)

        scrapers = list(orchestrator.scrapers.values())

        # Check that at least two scrapers had overlapping execution
        overlaps = 0
        for i in range(len(scrapers)):
            for j in range(i + 1, len(scrapers)):
                scraper_a = scrapers[i]
                scraper_b = scrapers[j]

                # Two intervals [a_start, a_end] and [b_start, b_end] overlap
                # iff each one starts before the other ends
                if (scraper_a.run_start_time < scraper_b.run_end_time and
                    scraper_b.run_start_time < scraper_a.run_end_time):
                    overlaps += 1

        assert overlaps > 0, "No overlapping execution detected - scrapers may not be running in parallel"

    def test_error_isolation(self, orchestrator):
        """Test that one failing scraper doesn't crash others"""
        # Make one scraper fail
        orchestrator.scrapers['slow_scraper'].fail = True

        # Run parallel processing
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)

        # Check that other scrapers still completed successfully
        assert orchestrator.scrapers['fast_scraper'].run_called
        assert orchestrator.scrapers['medium_scraper'].run_called
        assert orchestrator.scrapers['slow_scraper'].run_called  # It ran but failed

        # Check that successful scrapers produced output
        fast_files = list((orchestrator.config.data_dir / "markdown_current").glob("*fast_scraper*"))
        medium_files = list((orchestrator.config.data_dir / "markdown_current").glob("*medium_scraper*"))
        slow_files = list((orchestrator.config.data_dir / "markdown_current").glob("*slow_scraper*"))

        assert len(fast_files) > 0, "Fast scraper should have produced output"
        assert len(medium_files) > 0, "Medium scraper should have produced output"
        assert len(slow_files) == 0, "Slow scraper should not have produced output due to failure"

    def test_worker_pool_limits(self, orchestrator):
        """Test that max_workers parameter is respected"""
        # Add more scrapers than workers
        for i in range(5):
            scraper_name = f'extra_scraper_{i}'
            orchestrator.scrapers[scraper_name] = MockScraper(
                ScraperConfig(
                    source_name=scraper_name,
                    brand_name=orchestrator.config.brand_name,
                    data_dir=orchestrator.config.data_dir,
                    logs_dir=orchestrator.config.logs_dir,
                    timezone=orchestrator.config.timezone
                ),
                delay=0.1
            )

        # Run with limited workers
        start_time = time.time()
        orchestrator.run_all_scrapers(parallel=True, max_workers=2)
        execution_time = time.time() - start_time

        # 8 scrapers carry 0.1 + 0.3 + 0.2 + 5 * 0.1 = 1.1 s of simulated delay,
        # so 2 workers need at least ~0.55 s: slower than an unlimited pool,
        # but still well under the fully sequential 1.1 s
        assert execution_time > 0.3  # Should take multiple batches
        assert execution_time < 1.0  # But still faster than fully sequential

        # Verify all scrapers ran
        for scraper in orchestrator.scrapers.values():
            assert scraper.run_called

    def test_markdown_output_format(self, orchestrator):
        """Test that parallel processing produces spec-compliant markdown"""
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)

        # Check markdown files were created
        markdown_files = list((orchestrator.config.data_dir / "markdown_current").glob("*.md"))
        assert len(markdown_files) == 3  # One per successful scraper

        # Verify markdown format for each file
        for file_path in markdown_files:
            content = file_path.read_text(encoding='utf-8')

            # Check spec-compliant format
            assert "# ID:" in content
            assert "## Title:" in content
            assert "## Type:" in content
            assert "## Permalink:" in content
            assert "## Description:" in content
            assert "## Metadata:" in content
            assert "### Comments:" in content
            assert "### Likes:" in content
            assert "### Tags:" in content

    def test_state_management_isolation(self, orchestrator):
        """Test that state management works correctly in parallel"""
        # Run scrapers and check that each one persisted its own state
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)

        # Check that state files were created
        state_files = list((orchestrator.config.data_dir / ".state").glob("*_state.json"))
        assert len(state_files) == 3

        # Verify state content
        for state_file in state_files:
            state_data = json.loads(state_file.read_text())
            assert 'last_update' in state_data
            assert 'last_item_count' in state_data
            assert state_data['last_item_count'] == 1  # Mock scrapers return 1 item

    def test_directory_structure_creation(self, orchestrator):
        """Test that parallel processing creates proper directory structure"""
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)

        base_data_dir = orchestrator.config.data_dir

        # Check that all required directories exist
        assert (base_data_dir / "markdown_current").exists()
        assert (base_data_dir / ".state").exists()

        # Check source-specific directories
        for source_name in orchestrator.scrapers.keys():
            source_title = source_name.title()
            assert (base_data_dir / "markdown_archives" / source_title).exists()
            assert (base_data_dir / "media" / source_title).exists()
            assert (orchestrator.config.logs_dir / source_title).exists()

    @patch('src.orchestrator.ContentOrchestrator.sync_to_nas')
    def test_nas_sync_called(self, mock_sync, orchestrator):
        """Test that NAS sync is called after parallel processing"""
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)

        # Verify sync was called
        mock_sync.assert_called_once()

    def test_logging_isolation(self, orchestrator):
        """Test that logging works correctly in parallel processing"""
        orchestrator.run_all_scrapers(parallel=True, max_workers=3)

        # Check that log files were created for each scraper
        log_files = []
        for source_name in orchestrator.scrapers.keys():
            source_title = source_name.title()
            log_file = orchestrator.config.logs_dir / source_title / f"{source_name}.log"
            if log_file.exists():
                log_files.append(log_file)

        assert len(log_files) == 3

        # Verify log content
        for log_file in log_files:
            log_content = log_file.read_text()
            assert "Starting" in log_content
            assert "Successfully processed" in log_content


if __name__ == '__main__':
    pytest.main([__file__, '-v'])
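
# Tip: a single test can also be run from the CLI with pytest's -k filter, e.g.
#   pytest -k test_parallel_execution_timing -v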