- Created unit tests for BaseScraper with mocking - Added integration tests for parallel processing - Created end-to-end tests with realistic mock data - Fixed initialization order in BaseScraper (logger before user agent) - Fixed orchestrator method name (archive_current_file) - Added tenacity dependency for retry logic - Validated parallel processing performance and overlap detection - Confirmed spec-compliant markdown formatting in tests Tests cover: - Base scraper functionality (state, markdown, retry logic, media downloads) - Parallel vs sequential execution timing - Error isolation between scrapers - Directory structure creation - State management across runs - Full workflow with realistic data 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
184 lines
No EOL
6.2 KiB
Python
184 lines
No EOL
6.2 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Simple integration tests for parallel processing validation
|
|
"""
|
|
|
|
import pytest
|
|
import time
|
|
import tempfile
|
|
from pathlib import Path
|
|
from unittest.mock import Mock, patch
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
|
|
# Add project to path
|
|
import sys
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
from src.base_scraper import BaseScraper, ScraperConfig
|
|
|
|
|
|
class SimpleMockScraper(BaseScraper):
    """Minimal BaseScraper subclass used to exercise the pipeline in tests.

    run() sleeps for a configurable delay and records wall-clock start/end
    timestamps so the tests can reason about parallel vs. sequential timing.
    """

    def __init__(self, config: ScraperConfig, delay: float = 0.1):
        super().__init__(config)
        # Artificial processing time injected into run().
        self.delay = delay
        # Populated by run(); tests inspect these to detect overlap.
        self.start_time = None
        self.end_time = None

    def fetch_content(self):
        """Return a single deterministic fake item keyed to this source."""
        src = self.config.source_name
        item = {
            'id': f'{src}_1',
            'title': f'Test {src}',
            'url': f'https://example.com/{src}',
            'description': f'Test description for {src}',
            'likes': 10,
            'comments': 5,
        }
        return [item]

    def get_incremental_items(self, items, state):
        """No incremental filtering in tests — treat every item as new."""
        return items

    def update_state(self, state, items):
        """Record the id of the last processed item, if any."""
        if not items:
            return state
        state['last_id'] = items[-1]['id']
        return state

    def run(self):
        """Wrap the parent run() with a sleep and start/end timestamps."""
        self.start_time = time.time()
        time.sleep(self.delay)  # Simulate processing time
        super().run()
        self.end_time = time.time()
|
|
|
|
|
|
def test_parallel_vs_sequential_execution():
    """Test that parallel execution is faster than sequential.

    Three mock scrapers with delays of 0.1s, 0.2s and 0.15s are run first
    through a ThreadPoolExecutor and then back-to-back; parallel wall time
    should approach max(delays) while sequential time approaches sum(delays).
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)

        # Create scrapers with different delays
        scrapers = []
        for i, delay in enumerate([0.1, 0.2, 0.15]):
            config = ScraperConfig(
                source_name=f"scraper_{i}",
                brand_name="test",
                data_dir=temp_path / "data",
                logs_dir=temp_path / "logs",
                timezone="America/Halifax"
            )
            scrapers.append(SimpleMockScraper(config, delay))

        # Time parallel execution.  Submit the bound run() method directly —
        # no wrapper function or unused result list needed.
        start_time = time.time()
        with ThreadPoolExecutor(max_workers=3) as executor:
            futures = [executor.submit(scraper.run) for scraper in scrapers]
            # result() re-raises any exception from a worker thread so
            # failures inside a scraper are not silently swallowed.
            for future in as_completed(futures):
                future.result()
        parallel_time = time.time() - start_time

        # Reset recorded timestamps before the sequential pass
        for scraper in scrapers:
            scraper.start_time = None
            scraper.end_time = None

        # Time sequential execution
        start_time = time.time()
        for scraper in scrapers:
            scraper.run()
        sequential_time = time.time() - start_time

        print(f"Parallel time: {parallel_time:.3f}s")
        print(f"Sequential time: {sequential_time:.3f}s")

        assert parallel_time < sequential_time, f"Parallel ({parallel_time:.3f}s) should be faster than sequential ({sequential_time:.3f}s)"
        assert parallel_time < 0.5, f"Parallel execution should complete quickly: {parallel_time:.3f}s"
        assert sequential_time > 0.4, f"Sequential execution should take longer: {sequential_time:.3f}s"
|
|
|
|
|
|
def test_parallel_scraper_overlap():
    """Test that scrapers have overlapping execution times.

    Each scraper records start/end wall-clock times in run(); if any two
    intervals intersect, the scrapers genuinely ran concurrently.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)

        # Create scrapers with sufficient delay to detect overlap
        scrapers = []
        for i in range(3):
            config = ScraperConfig(
                source_name=f"scraper_{i}",
                brand_name="test",
                data_dir=temp_path / "data",
                logs_dir=temp_path / "logs",
                timezone="America/Halifax"
            )
            scrapers.append(SimpleMockScraper(config, 0.2))  # 200ms delay each

        # Run in parallel; result() re-raises worker-thread exceptions so a
        # failing scraper fails the test instead of being silently dropped.
        with ThreadPoolExecutor(max_workers=3) as executor:
            futures = [executor.submit(scraper.run) for scraper in scrapers]
            for future in as_completed(futures):
                future.result()

        # Count pairs whose [start, end] intervals intersect
        overlaps = 0
        for i in range(len(scrapers)):
            for j in range(i + 1, len(scrapers)):
                scraper_a = scrapers[i]
                scraper_b = scrapers[j]
                # Standard interval-intersection test: each starts before
                # the other one ends.
                if (scraper_a.start_time < scraper_b.end_time and
                        scraper_b.start_time < scraper_a.end_time):
                    overlaps += 1

        assert overlaps > 0, "No overlapping execution detected - scrapers may not be running in parallel"
        print(f"Detected {overlaps} overlapping execution pairs")
|
|
|
|
|
|
def test_markdown_output_format():
    """Test that markdown output follows specification"""
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)

        config = ScraperConfig(
            source_name="test_scraper",
            brand_name="test",
            data_dir=temp_path / "data",
            logs_dir=temp_path / "logs",
            timezone="America/Halifax"
        )

        scraper = SimpleMockScraper(config, 0.1)
        scraper.run()

        # Exactly one markdown file should have been written
        markdown_files = list((temp_path / "data" / "markdown_current").glob("*.md"))
        assert len(markdown_files) == 1

        # Every spec-mandated section header must be present in the output
        content = markdown_files[0].read_text()
        required_sections = (
            "# ID:",
            "## Title:",
            "## Type:",
            "## Permalink:",
            "## Description:",
            "## Metadata:",
            "### Comments:",
            "### Likes:",
        )
        for section in required_sections:
            assert section in content
|
|
|
|
|
|
if __name__ == '__main__':
    # Propagate pytest's exit status: pytest.main() returns the exit code,
    # and discarding it would make this script exit 0 even when tests fail.
    sys.exit(pytest.main([__file__, '-v']))