#!/usr/bin/env python3
"""
Unit tests for BaseScraper
"""
import pytest
import json
import tempfile
from pathlib import Path
from unittest.mock import Mock, patch, MagicMock
import requests

# Add project to path
import sys
sys.path.insert(0, str(Path(__file__).parent.parent))

from src.base_scraper import BaseScraper, ScraperConfig


class TestScraper(BaseScraper):
    """Test implementation of BaseScraper"""

    def fetch_content(self):
        return [
            {
                'id': 'test1',
                'title': 'Test Title 1',
                'url': 'https://example.com/1',
                'description': 'Test description 1',
                'likes': 10,
                'comments': 5,
                'tags': ['tag1', 'tag2']
            },
            {
                'id': 'test2',
                'title': 'Test Title 2',
                'url': 'https://example.com/2',
                'description': 'Test description 2',
                'views': 100
            }
        ]

    def get_incremental_items(self, items, state):
        if not state.get('last_id'):
            return items
        # Return items after last_id
        last_seen = False
        new_items = []
        for item in items:
            if last_seen:
                new_items.append(item)
            elif item['id'] == state['last_id']:
                last_seen = True
        return new_items


class TestBaseScraper:
    """Test cases for BaseScraper"""

    @pytest.fixture
    def temp_config(self):
        """Create temporary config for testing"""
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)
            config = ScraperConfig(
                source_name="test",
                brand_name="testbrand",
                data_dir=temp_path / "data",
                logs_dir=temp_path / "logs",
                timezone="America/Halifax"
            )
            yield config

    @pytest.fixture
    def scraper(self, temp_config):
        """Create test scraper instance"""
        return TestScraper(temp_config)

    def test_initialization(self, scraper):
        """Test scraper initializes correctly"""
        assert scraper.config.source_name == "test"
        assert scraper.config.brand_name == "testbrand"
        assert scraper.session is not None
        assert len(scraper.user_agents) > 0
        assert scraper.retry_config['max_attempts'] == 3

    def test_directory_creation(self, scraper):
        """Test required directories are created"""
        assert scraper.config.data_dir.exists()
        assert (scraper.config.data_dir / "markdown_current").exists()
        assert (scraper.config.data_dir / "markdown_archives" / "Test").exists()
        assert (scraper.config.data_dir / "media" / "Test").exists()
        assert (scraper.config.logs_dir / "Test").exists()
        assert scraper.state_file.parent.exists()

    def test_user_agent_rotation(self, scraper):
        """Test user agent rotation works"""
        initial_ua = scraper.session.headers['User-Agent']
        scraper.rotate_user_agent()
        new_ua = scraper.session.headers['User-Agent']
        assert new_ua != initial_ua

    def test_state_management(self, scraper):
        """Test state save/load functionality"""
        # Test loading non-existent state
        state = scraper.load_state()
        assert state == {}

        # Test saving and loading state
        test_state = {'last_id': 'test123', 'last_update': '2024-01-01'}
        scraper.save_state(test_state)
        loaded_state = scraper.load_state()
        assert loaded_state == test_state

    def test_markdown_formatting(self, scraper):
        """Test markdown formatting matches specification"""
        items = scraper.fetch_content()
        markdown = scraper.format_markdown(items)

        # Check for spec-compliant format
        assert "# ID: test1" in markdown
        assert "## Title: Test Title 1" in markdown
        assert "## Type: test" in markdown
        assert "## Permalink: https://example.com/1" in markdown
        assert "## Description:" in markdown
        assert "## Metadata:" in markdown
        assert "### Comments: 5" in markdown
        assert "### Likes: 10" in markdown
        assert "### Tags:" in markdown
        assert "- tag1" in markdown
        assert "- tag2" in markdown
        assert "### Views: 100" in markdown
        assert "--------------" in markdown

    def test_format_item_to_spec(self, scraper):
        """Test individual item formatting"""
        item = {
            'id': 'test123',
            'title': 'Test Item',
            'url': 'https://test.com',
            'description': 'Test description',
            'likes': 15,
            'comments': 3,
            'tags': ['test']
        }
        formatted = scraper.format_item_to_spec(item)
        lines = formatted.split('\n')

        assert "# ID: test123" in lines
        assert "## Title: Test Item" in lines
        assert "## Type: test" in lines
        assert "## Permalink: https://test.com" in lines
        assert "### Comments: 3" in lines
        assert "### Likes: 15" in lines
        assert "- test" in lines

    @patch('requests.Session.request')
    def test_make_request_with_retry(self, mock_request, scraper):
        """Test make_request method with retry logic"""
        # Mock successful response
        mock_response = Mock()
        mock_response.status_code = 200
        mock_request.return_value = mock_response

        response = scraper.make_request('GET', 'https://test.com')

        assert response == mock_response
        mock_request.assert_called_once()

    @patch('requests.Session.request')
    def test_make_request_retry_on_failure(self, mock_request, scraper):
        """Test retry logic on request failure"""
        # Mock failure then success
        mock_request.side_effect = [
            requests.RequestException("Connection failed"),
            requests.RequestException("Still failing"),
            Mock(status_code=200)  # Success on third try
        ]

        response = scraper.make_request('GET', 'https://test.com')

        assert response.status_code == 200
        assert mock_request.call_count == 3

    def test_incremental_items(self, scraper):
        """Test incremental item filtering"""
        items = scraper.fetch_content()

        # Empty state should return all items
        empty_state = {}
        incremental = scraper.get_incremental_items(items, empty_state)
        assert len(incremental) == 2

        # State with last_id should filter items
        state_with_last = {'last_id': 'test1'}
        incremental = scraper.get_incremental_items(items, state_with_last)
        assert len(incremental) == 1
        assert incremental[0]['id'] == 'test2'

    def test_update_state(self, scraper):
        """Test state update logic"""
        items = scraper.fetch_content()
        old_state = {'last_id': 'old'}

        new_state = scraper.update_state(old_state, items)

        assert new_state['last_id'] == 'test2'  # Should be last item ID
        assert 'last_update' in new_state

    @patch('requests.Session.request')
    def test_download_media(self, mock_request, scraper):
        """Test media downloading functionality"""
        # Mock successful download
        mock_response = Mock()
        mock_response.status_code = 200
        mock_response.iter_content.return_value = [b'fake image data']
        mock_request.return_value = mock_response

        # Test download
        url = 'https://example.com/image.jpg'
        result = scraper.download_media(url, 'test_item', 'image')

        assert result is not None
        assert 'test_item_image.jpg' in result

        # Verify file was created
        file_path = Path(result)
        assert file_path.exists()
        assert file_path.read_bytes() == b'fake image data'

    def test_sanitize_filename(self, scraper):
        """Test filename sanitization"""
        dangerous_name = 'test<>:"/\\|?*file.jpg'
        safe_name = scraper._sanitize_filename(dangerous_name)

        assert '<' not in safe_name
        assert '>' not in safe_name
        assert ':' not in safe_name
        assert safe_name == 'test_________file.jpg'

    def test_guess_extension(self, scraper):
        """Test file extension guessing"""
        assert scraper._guess_extension('test.jpg', 'image') == '.jpg'
        assert scraper._guess_extension('test.mp4', 'video') == '.mp4'
        assert scraper._guess_extension('test', 'image') == '.jpg'
        assert scraper._guess_extension('test', 'video') == '.mp4'
        assert scraper._guess_extension('test', 'unknown') == '.bin'


if __name__ == '__main__':
    pytest.main([__file__])