hvac-kia-content/tests/test_base_scraper.py
Ben Reed 8d5750b1d1 Add comprehensive test infrastructure
- Created unit tests for BaseScraper with mocking
- Added integration tests for parallel processing
- Created end-to-end tests with realistic mock data
- Fixed initialization order in BaseScraper (logger before user agent)
- Fixed orchestrator method name (archive_current_file)
- Added tenacity dependency for retry logic
- Validated parallel processing performance and overlap detection
- Confirmed spec-compliant markdown formatting in tests

Tests cover:
- Base scraper functionality (state, markdown, retry logic, media downloads)
- Parallel vs sequential execution timing
- Error isolation between scrapers
- Directory structure creation
- State management across runs
- Full workflow with realistic data

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-18 21:16:14 -03:00

254 lines
No EOL
8.8 KiB
Python

#!/usr/bin/env python3
"""
Unit tests for BaseScraper
"""
import pytest
import json
import tempfile
from pathlib import Path
from unittest.mock import Mock, patch, MagicMock
import requests
# Put the directory two levels above this file at the front of sys.path so
# the ``src`` package can be imported regardless of the working directory.
import sys
sys.path.insert(0, str(Path(__file__).parent.parent))
# Kept below the sys.path tweak so the entry added above is available when
# ``src`` is resolved.
from src.base_scraper import BaseScraper, ScraperConfig
class MockTestScraper(BaseScraper):
    """Minimal concrete BaseScraper used as a test double."""

    def fetch_content(self):
        """Return two fixed items.

        The first item carries likes, comments and tags; the second only
        carries a view count.
        """
        first = {
            'id': 'test1',
            'title': 'Test Title 1',
            'url': 'https://example.com/1',
            'description': 'Test description 1',
            'likes': 10,
            'comments': 5,
            'tags': ['tag1', 'tag2']
        }
        second = {
            'id': 'test2',
            'title': 'Test Title 2',
            'url': 'https://example.com/2',
            'description': 'Test description 2',
            'views': 100
        }
        return [first, second]

    def get_incremental_items(self, items, state):
        """Return the items that come after ``state['last_id']``.

        When the state has no (or a falsy) ``last_id`` the input is returned
        unchanged. When the stored id is never encountered the result is an
        empty list.
        """
        if not state.get('last_id'):
            return items

        remaining = iter(items)
        for candidate in remaining:
            if candidate['id'] == state['last_id']:
                # Whatever is still left in the iterator follows the match.
                return list(remaining)
        return []

    def update_state(self, state, items):
        """Record the id of the last item in ``state`` and return ``state``.

        ``state`` is mutated in place; it is returned untouched when
        ``items`` is empty.
        """
        if not items:
            return state
        state['last_id'] = items[-1]['id']
        return state
class TestBaseScraper:
    """Unit tests exercising BaseScraper through MockTestScraper."""

    @pytest.fixture
    def temp_config(self):
        """Yield a ScraperConfig rooted in a throwaway temporary directory."""
        with tempfile.TemporaryDirectory() as temp_dir:
            root = Path(temp_dir)
            # Yield inside the ``with`` so the directory outlives the test
            # body and is removed once the fixture is torn down.
            yield ScraperConfig(
                source_name="test",
                brand_name="testbrand",
                data_dir=root / "data",
                logs_dir=root / "logs",
                timezone="America/Halifax"
            )

    @pytest.fixture
    def scraper(self, temp_config):
        """Build a MockTestScraper from the temporary config."""
        return MockTestScraper(temp_config)

    def test_initialization(self, scraper):
        """The scraper exposes its config, session, user agents and retry settings."""
        assert scraper.config.source_name == "test"
        assert scraper.config.brand_name == "testbrand"
        assert scraper.session is not None
        assert len(scraper.user_agents) > 0
        assert scraper.retry_config['max_attempts'] == 3

    def test_directory_creation(self, scraper):
        """Constructing the scraper creates the expected directory layout."""
        assert scraper.config.data_dir.exists()
        assert (scraper.config.data_dir / "markdown_current").exists()
        # Source-specific folders are checked under the name "Test".
        assert (scraper.config.data_dir / "markdown_archives" / "Test").exists()
        assert (scraper.config.data_dir / "media" / "Test").exists()
        assert (scraper.config.logs_dir / "Test").exists()
        assert scraper.state_file.parent.exists()

    def test_user_agent_rotation(self, scraper):
        """Rotating the user agent changes the session's User-Agent header."""
        # NOTE(review): if rotate_user_agent picks at random from
        # user_agents, it could re-select the current value and make this
        # assertion flaky - confirm against BaseScraper.
        ua_before = scraper.session.headers['User-Agent']
        scraper.rotate_user_agent()
        ua_after = scraper.session.headers['User-Agent']
        assert ua_after != ua_before

    def test_state_management(self, scraper):
        """State loads as an empty dict at first and round-trips after saving."""
        # Nothing has been saved yet.
        assert scraper.load_state() == {}

        # Save, then read back.
        expected = {'last_id': 'test123', 'last_update': '2024-01-01'}
        scraper.save_state(expected)
        assert scraper.load_state() == expected

    def test_markdown_formatting(self, scraper):
        """format_markdown emits every spec-mandated fragment for the mock items."""
        markdown = scraper.format_markdown(scraper.fetch_content())

        expected_fragments = (
            "# ID: test1",
            "## Title: Test Title 1",
            "## Type: test",
            "## Permalink: https://example.com/1",
            "## Description:",
            "## Metadata:",
            "### Comments: 5",
            "### Likes: 10",
            "### Tags:",
            "- tag1",
            "- tag2",
            "### Views: 100",
            "--------------",
        )
        for fragment in expected_fragments:
            assert fragment in markdown

    def test_format_item_to_spec(self, scraper):
        """format_item_to_spec renders one item as the expected set of lines."""
        formatted = scraper.format_item_to_spec({
            'id': 'test123',
            'title': 'Test Item',
            'url': 'https://test.com',
            'description': 'Test description',
            'likes': 15,
            'comments': 3,
            'tags': ['test']
        })
        lines = formatted.split('\n')

        # Each expected entry must appear as a complete line, not a substring.
        expected_lines = (
            "# ID: test123",
            "## Title: Test Item",
            "## Type: test",
            "## Permalink: https://test.com",
            "### Comments: 3",
            "### Likes: 15",
            "- test",
        )
        for expected in expected_lines:
            assert expected in lines

    @patch('requests.Session.request')
    def test_make_request_with_retry(self, mock_request, scraper):
        """A request that succeeds immediately is issued exactly once."""
        ok_response = Mock()
        ok_response.status_code = 200
        mock_request.return_value = ok_response

        response = scraper.make_request('GET', 'https://test.com')

        assert response == ok_response
        mock_request.assert_called_once()

    @patch('requests.Session.request')
    def test_make_request_retry_on_failure(self, mock_request, scraper):
        """Two failures followed by a success result in three attempts."""
        outcomes = [
            requests.RequestException("Connection failed"),
            requests.RequestException("Still failing"),
            Mock(status_code=200)  # Third attempt succeeds
        ]
        mock_request.side_effect = outcomes

        response = scraper.make_request('GET', 'https://test.com')

        assert response.status_code == 200
        assert mock_request.call_count == 3

    def test_incremental_items(self, scraper):
        """Incremental filtering returns everything, or only items after last_id."""
        items = scraper.fetch_content()

        # No stored last_id: nothing is filtered out.
        everything = scraper.get_incremental_items(items, {})
        assert len(everything) == 2

        # With last_id set to the first item only the second remains.
        newer = scraper.get_incremental_items(items, {'last_id': 'test1'})
        assert len(newer) == 1
        assert newer[0]['id'] == 'test2'

    def test_update_state(self, scraper):
        """update_state stores the id of the final item as last_id."""
        items = scraper.fetch_content()
        updated = scraper.update_state({'last_id': 'old'}, items)
        assert updated['last_id'] == 'test2'  # id of the last item

    @patch('requests.Session.request')
    def test_download_media(self, mock_request, scraper):
        """download_media writes the streamed bytes to a file and returns its path."""
        payload = b'fake image data'
        fake_download = Mock()
        fake_download.status_code = 200
        fake_download.iter_content.return_value = [payload]
        mock_request.return_value = fake_download

        result = scraper.download_media(
            'https://example.com/image.jpg', 'test_item', 'image'
        )

        assert result is not None
        assert 'test_item_image.jpg' in result

        # The returned path must point at a real file holding the payload.
        saved = Path(result)
        assert saved.exists()
        assert saved.read_bytes() == b'fake image data'

    def test_sanitize_filename(self, scraper):
        """Characters unsafe in filenames are each replaced by an underscore."""
        unsafe = 'test<>:"/\\|?*file.jpg'
        cleaned = scraper._sanitize_filename(unsafe)

        for forbidden in ('<', '>', ':'):
            assert forbidden not in cleaned
        assert cleaned == 'test_________file.jpg'

    def test_guess_extension(self, scraper):
        """The extension comes from the name, else from the media type, else .bin."""
        cases = (
            ('test.jpg', 'image', '.jpg'),
            ('test.mp4', 'video', '.mp4'),
            ('test', 'image', '.jpg'),
            ('test', 'video', '.mp4'),
            ('test', 'unknown', '.bin'),
        )
        for name, media_type, expected in cases:
            assert scraper._guess_extension(name, media_type) == expected
if __name__ == '__main__':
    # pytest.main() returns the run's exit code; hand it to sys.exit so that
    # running this file directly reports failures through the process status
    # instead of always exiting with 0.
    sys.exit(pytest.main([__file__]))