hvac-kia-content/tests/test_base_scraper.py

#!/usr/bin/env python3
"""
Unit tests for BaseScraper
"""

import pytest
import json
import tempfile
from pathlib import Path
from unittest.mock import Mock, patch, MagicMock
import requests

# Add project to path
import sys
sys.path.insert(0, str(Path(__file__).parent.parent))

from src.base_scraper import BaseScraper, ScraperConfig


class TestScraper(BaseScraper):
    """Test implementation of BaseScraper"""

    def fetch_content(self):
        return [
            {
                'id': 'test1',
                'title': 'Test Title 1',
                'url': 'https://example.com/1',
                'description': 'Test description 1',
                'likes': 10,
                'comments': 5,
                'tags': ['tag1', 'tag2']
            },
            {
                'id': 'test2',
                'title': 'Test Title 2',
                'url': 'https://example.com/2',
                'description': 'Test description 2',
                'views': 100
            }
        ]

    def get_incremental_items(self, items, state):
        if not state.get('last_id'):
            return items

        # Return items after last_id
        last_seen = False
        new_items = []
        for item in items:
            if last_seen:
                new_items.append(item)
            elif item['id'] == state['last_id']:
                last_seen = True
        return new_items


class TestBaseScraper:
    """Test cases for BaseScraper"""

    @pytest.fixture
    def temp_config(self):
        """Create temporary config for testing"""
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)
            config = ScraperConfig(
                source_name="test",
                brand_name="testbrand",
                data_dir=temp_path / "data",
                logs_dir=temp_path / "logs",
                timezone="America/Halifax"
            )
            yield config

    @pytest.fixture
    def scraper(self, temp_config):
        """Create test scraper instance"""
        return TestScraper(temp_config)

    def test_initialization(self, scraper):
        """Test scraper initializes correctly"""
        assert scraper.config.source_name == "test"
        assert scraper.config.brand_name == "testbrand"
        assert scraper.session is not None
        assert len(scraper.user_agents) > 0
        assert scraper.retry_config['max_attempts'] == 3

    def test_directory_creation(self, scraper):
        """Test required directories are created"""
        assert scraper.config.data_dir.exists()
        assert (scraper.config.data_dir / "markdown_current").exists()
        assert (scraper.config.data_dir / "markdown_archives" / "Test").exists()
        assert (scraper.config.data_dir / "media" / "Test").exists()
        assert (scraper.config.logs_dir / "Test").exists()
        assert scraper.state_file.parent.exists()

    def test_user_agent_rotation(self, scraper):
        """Test user agent rotation works"""
        initial_ua = scraper.session.headers['User-Agent']
        scraper.rotate_user_agent()
        new_ua = scraper.session.headers['User-Agent']
        assert new_ua != initial_ua

    def test_state_management(self, scraper):
        """Test state save/load functionality"""
        # Test loading non-existent state
        state = scraper.load_state()
        assert state == {}

        # Test saving and loading state
        test_state = {'last_id': 'test123', 'last_update': '2024-01-01'}
        scraper.save_state(test_state)

        loaded_state = scraper.load_state()
        assert loaded_state == test_state

    def test_markdown_formatting(self, scraper):
        """Test markdown formatting matches specification"""
        items = scraper.fetch_content()
        markdown = scraper.format_markdown(items)

        # Check for spec-compliant format
        assert "# ID: test1" in markdown
        assert "## Title: Test Title 1" in markdown
        assert "## Type: test" in markdown
        assert "## Permalink: https://example.com/1" in markdown
        assert "## Description:" in markdown
        assert "## Metadata:" in markdown
        assert "### Comments: 5" in markdown
        assert "### Likes: 10" in markdown
        assert "### Tags:" in markdown
        assert "- tag1" in markdown
        assert "- tag2" in markdown
        assert "### Views: 100" in markdown
        assert "--------------" in markdown

    def test_format_item_to_spec(self, scraper):
        """Test individual item formatting"""
        item = {
            'id': 'test123',
            'title': 'Test Item',
            'url': 'https://test.com',
            'description': 'Test description',
            'likes': 15,
            'comments': 3,
            'tags': ['test']
        }

        formatted = scraper.format_item_to_spec(item)
        lines = formatted.split('\n')

        assert "# ID: test123" in lines
        assert "## Title: Test Item" in lines
        assert "## Type: test" in lines
        assert "## Permalink: https://test.com" in lines
        assert "### Comments: 3" in lines
        assert "### Likes: 15" in lines
        assert "- test" in lines

    @patch('requests.Session.request')
    def test_make_request_with_retry(self, mock_request, scraper):
        """Test make_request method with retry logic"""
        # Mock successful response
        mock_response = Mock()
        mock_response.status_code = 200
        mock_request.return_value = mock_response

        response = scraper.make_request('GET', 'https://test.com')
        assert response == mock_response
        mock_request.assert_called_once()

    @patch('requests.Session.request')
    def test_make_request_retry_on_failure(self, mock_request, scraper):
        """Test retry logic on request failure"""
        # Mock failure then success
        mock_request.side_effect = [
            requests.RequestException("Connection failed"),
            requests.RequestException("Still failing"),
            Mock(status_code=200)  # Success on third try
        ]

        response = scraper.make_request('GET', 'https://test.com')
        assert response.status_code == 200
        assert mock_request.call_count == 3

    def test_incremental_items(self, scraper):
        """Test incremental item filtering"""
        items = scraper.fetch_content()

        # Empty state should return all items
        empty_state = {}
        incremental = scraper.get_incremental_items(items, empty_state)
        assert len(incremental) == 2

        # State with last_id should filter items
        state_with_last = {'last_id': 'test1'}
        incremental = scraper.get_incremental_items(items, state_with_last)
        assert len(incremental) == 1
        assert incremental[0]['id'] == 'test2'

    def test_update_state(self, scraper):
        """Test state update logic"""
        items = scraper.fetch_content()
        old_state = {'last_id': 'old'}

        new_state = scraper.update_state(old_state, items)
        assert new_state['last_id'] == 'test2'  # Should be last item ID
        assert 'last_update' in new_state

    @patch('requests.Session.request')
    def test_download_media(self, mock_request, scraper):
        """Test media downloading functionality"""
        # Mock successful download
        mock_response = Mock()
        mock_response.status_code = 200
        mock_response.iter_content.return_value = [b'fake image data']
        mock_request.return_value = mock_response

        # Test download
        url = 'https://example.com/image.jpg'
        result = scraper.download_media(url, 'test_item', 'image')

        assert result is not None
        assert 'test_item_image.jpg' in result

        # Verify file was created
        file_path = Path(result)
        assert file_path.exists()
        assert file_path.read_bytes() == b'fake image data'

    def test_sanitize_filename(self, scraper):
        """Test filename sanitization"""
        dangerous_name = 'test<>:"/\\|?*file.jpg'
        safe_name = scraper._sanitize_filename(dangerous_name)
        assert '<' not in safe_name
        assert '>' not in safe_name
        assert ':' not in safe_name
        assert safe_name == 'test_________file.jpg'

    def test_guess_extension(self, scraper):
        """Test file extension guessing"""
        assert scraper._guess_extension('test.jpg', 'image') == '.jpg'
        assert scraper._guess_extension('test.mp4', 'video') == '.mp4'
        assert scraper._guess_extension('test', 'image') == '.jpg'
        assert scraper._guess_extension('test', 'video') == '.mp4'
        assert scraper._guess_extension('test', 'unknown') == '.bin'


if __name__ == '__main__':
    pytest.main([__file__])