hvac-kia-content/tests/test_hvacrschool_scraper.py
Ben Reed 34fd853874 feat: Add HVACRSchool scraper and fix all source connectivity
- Add new HVACRSchool scraper for technical articles (6th source)
- Fix WordPress API connectivity (corrected URL to hvacknowitall.com)
- Fix MailChimp RSS processing after environment consolidation
- Implement YouTube hybrid scraper (API + yt-dlp) with PO token support
- Disable YouTube transcripts due to platform restrictions (Aug 2025)
- Update orchestrator to use all 6 active sources
- Consolidate environment variables into single .env file
- Full system sync completed with all sources updating successfully
- Update documentation with current system status and capabilities

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-27 18:11:00 -03:00

288 lines
No EOL
12 KiB
Python

import pytest
from unittest.mock import Mock, patch, MagicMock
from datetime import datetime
import json
from pathlib import Path
from src.hvacrschool_scraper import HVACRSchoolScraper
from src.base_scraper import ScraperConfig
class TestHVACRSchoolScraper:
    """Unit tests for HVACRSchoolScraper.

    Covers sitemap fetching, article-URL filtering, article extraction,
    deterministic ID generation, incremental sync/state handling, markdown
    formatting, rate limiting, and the max_items limit of fetch_content.
    All network access is mocked; no real HTTP requests are made.
    """

    @pytest.fixture
    def config(self):
        """Minimal ScraperConfig pointing at throwaway test directories."""
        return ScraperConfig(
            source_name="hvacrschool",
            brand_name="hkia",
            data_dir=Path("test_data"),
            logs_dir=Path("test_logs"),
            timezone="America/Halifax"
        )

    @pytest.fixture
    def mock_scraper(self, config):
        """HVACRSchoolScraper with its StealthyFetcher replaced by a MagicMock.

        The class is patched before construction so __init__ never touches the
        network; the instance attribute is overwritten as well for safety.
        """
        with patch('src.hvacrschool_scraper.StealthyFetcher') as mock_scraper_class:
            mock_scraper_instance = MagicMock()
            mock_scraper_class.return_value = mock_scraper_instance
            scraper = HVACRSchoolScraper(config)
            scraper.scraper = mock_scraper_instance
            return scraper

    @pytest.fixture
    def sample_sitemap_xml(self):
        """Sitemap with two article URLs and one /page/ URL that must be filtered out."""
        return '''<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
    <url>
        <loc>http://www.hvacrschool.com/understanding-heat-transfer/</loc>
        <lastmod>2024-01-15T10:30:00Z</lastmod>
    </url>
    <url>
        <loc>http://www.hvacrschool.com/refrigeration-basics/</loc>
        <lastmod>2024-01-10T14:20:00Z</lastmod>
    </url>
    <url>
        <loc>http://www.hvacrschool.com/page/about/</loc>
        <lastmod>2024-01-01T12:00:00Z</lastmod>
    </url>
</urlset>'''

    @pytest.fixture
    def sample_article_html(self):
        """Article page with JSON-LD metadata plus an <article> body."""
        return '''
<html>
<head>
    <title>Understanding Heat Transfer - HVACR School</title>
    <meta name="description" content="Learn the basics of heat transfer in HVAC systems">
    <script type="application/ld+json">
    {
        "@context": "https://schema.org",
        "@type": "Article",
        "headline": "Understanding Heat Transfer",
        "description": "Learn the basics of heat transfer in HVAC systems",
        "author": {"@type": "Person", "name": "Bryan Orr"},
        "datePublished": "2024-01-15T10:30:00Z"
    }
    </script>
</head>
<body>
    <article>
        <h1>Understanding Heat Transfer</h1>
        <div class="entry-content">
            <p>Heat transfer is fundamental to HVAC systems...</p>
            <p>There are three main types: conduction, convection, and radiation.</p>
        </div>
    </article>
</body>
</html>
'''

    def test_initialization(self, config):
        """Test scraper initialization."""
        with patch('src.hvacrschool_scraper.StealthyFetcher'):
            scraper = HVACRSchoolScraper(config)
            assert scraper.base_url == "http://www.hvacrschool.com/"
            assert scraper.sitemap_url == "http://www.hvacrschool.com/sitemap-1.xml"
            assert scraper.request_delay == 2.0
            assert scraper.article_cache == {}

    @patch('src.hvacrschool_scraper.HVACRSchoolScraper.make_request')
    def test_fetch_sitemap_urls(self, mock_request, mock_scraper, sample_sitemap_xml):
        """Test fetching URLs from sitemap."""
        mock_response = Mock()
        mock_response.content = sample_sitemap_xml.encode()
        mock_response.raise_for_status.return_value = None
        mock_request.return_value = mock_response
        urls = mock_scraper.fetch_sitemap_urls()
        assert len(urls) == 2  # Should exclude the /page/ URL
        assert urls[0]['url'] == 'http://www.hvacrschool.com/understanding-heat-transfer/'
        assert urls[0]['lastmod'] == '2024-01-15T10:30:00Z'
        assert urls[1]['url'] == 'http://www.hvacrschool.com/refrigeration-basics/'

    def test_is_article_url(self, mock_scraper):
        """Test URL filtering logic."""
        # Valid article URLs
        assert mock_scraper._is_article_url('http://www.hvacrschool.com/understanding-heat-transfer/')
        assert mock_scraper._is_article_url('http://www.hvacrschool.com/refrigeration-basics/')
        # Invalid URLs: structural pages, feeds, the site root, and foreign domains
        assert not mock_scraper._is_article_url('http://www.hvacrschool.com/page/about/')
        assert not mock_scraper._is_article_url('http://www.hvacrschool.com/category/hvac/')
        assert not mock_scraper._is_article_url('http://www.hvacrschool.com/feed/')
        assert not mock_scraper._is_article_url('http://www.hvacrschool.com/')
        assert not mock_scraper._is_article_url('http://otherdomain.com/article/')

    def test_extract_article_data(self, mock_scraper, sample_article_html):
        """Test article data extraction."""
        mock_response = Mock()
        mock_response.css.side_effect = self._mock_css_selector(sample_article_html)
        url = 'http://www.hvacrschool.com/understanding-heat-transfer/'
        article_data = mock_scraper._extract_article_data(mock_response, url)
        assert article_data is not None
        assert article_data['title'] == 'Understanding Heat Transfer'
        assert article_data['author'] == 'Bryan Orr'
        assert article_data['publish_date'] == '2024-01-15T10:30:00Z'
        assert article_data['description'] == 'Learn the basics of heat transfer in HVAC systems'
        assert article_data['url'] == url
        assert article_data['type'] == 'blog_post'
        assert article_data['source'] == 'hvacrschool'

    def _mock_css_selector(self, html_content):
        """Helper to mock CSS selector responses.

        Returns a side_effect callable for response.css(selector) that serves
        canned elements for the selectors the extractor is expected to use.
        """
        def css_side_effect(selector):
            # MagicMock (not plain Mock) is required here: iter() looks up
            # __iter__ on the type, so assigning __iter__ to a plain Mock
            # instance is silently ignored and iteration raises TypeError.
            # MagicMock explicitly supports attaching magic methods.
            mock_elements = MagicMock()
            if selector == 'script[type="application/ld+json"]':
                mock_script = Mock()
                mock_script.text = '''
                {
                    "@context": "https://schema.org",
                    "@type": "Article",
                    "headline": "Understanding Heat Transfer",
                    "description": "Learn the basics of heat transfer in HVAC systems",
                    "author": {"@type": "Person", "name": "Bryan Orr"},
                    "datePublished": "2024-01-15T10:30:00Z"
                }
                '''
                # NOTE: iter(...) is single-use; the element list can only be
                # iterated once per css() call, which matches a one-pass extractor.
                mock_elements.__iter__ = Mock(return_value=iter([mock_script]))
                return mock_elements
            elif selector == 'article':
                mock_article = Mock()
                mock_article.html = '<div><p>Heat transfer is fundamental...</p></div>'
                mock_elements.first = mock_article
                return mock_elements
            elif selector == 'h1':
                mock_title = Mock()
                mock_title.text = 'Understanding Heat Transfer'
                mock_elements.first = mock_title
                return mock_elements
            else:
                # Unknown selectors yield "no match" so the extractor falls back.
                mock_elements.first = None
                return mock_elements
        return css_side_effect

    def test_generate_article_id(self, mock_scraper):
        """Test article ID generation."""
        url1 = 'http://www.hvacrschool.com/understanding-heat-transfer/'
        url2 = 'http://www.hvacrschool.com/refrigeration-basics/'
        id1 = mock_scraper._generate_article_id(url1)
        id2 = mock_scraper._generate_article_id(url2)
        # IDs are fixed-length and collision-distinct across URLs
        assert len(id1) == 12
        assert len(id2) == 12
        assert id1 != id2
        # Same URL should generate same ID (deterministic)
        assert id1 == mock_scraper._generate_article_id(url1)

    def test_get_incremental_items(self, mock_scraper):
        """Test incremental item filtering."""
        items = [
            {'publish_date': '2024-01-15T10:30:00Z', 'title': 'New Article'},
            {'publish_date': '2024-01-10T14:20:00Z', 'title': 'Old Article'},
            {'publish_date': '2024-01-20T08:00:00Z', 'title': 'Newer Article'},
        ]
        # Test with no state (should return all items)
        state = {}
        result = mock_scraper.get_incremental_items(items, state)
        assert len(result) == 3
        # Test with last sync date
        state = {'last_sync_date': '2024-01-12T00:00:00Z'}
        result = mock_scraper.get_incremental_items(items, state)
        assert len(result) == 2  # Should return items newer than 2024-01-12
        assert result[0]['title'] == 'New Article'
        assert result[1]['title'] == 'Newer Article'

    def test_update_state(self, mock_scraper):
        """Test state update logic."""
        items = [
            {'publish_date': '2024-01-10T14:20:00Z', 'title': 'Article 1'},
            {'publish_date': '2024-01-20T08:00:00Z', 'title': 'Article 2'},
            {'publish_date': '2024-01-15T10:30:00Z', 'title': 'Article 3'},
        ]
        state = {}
        updated_state = mock_scraper.update_state(state, items)
        assert updated_state['last_sync_date'] == '2024-01-20T08:00:00Z'  # Latest date
        assert updated_state['article_count'] == 3
        assert 'last_sync' in updated_state

    def test_format_markdown(self, mock_scraper):
        """Test markdown formatting."""
        articles = [
            {
                'id': 'test123',
                'title': 'Test Article',
                'author': 'Bryan Orr',
                'publish_date': '2024-01-15T10:30:00Z',
                'word_count': 250,
                'categories': ['HVAC', 'Heat Transfer'],
                'url': 'http://www.hvacrschool.com/test-article/',
                'content': '<p>Test content</p>',
                'description': 'Test description'
            }
        ]
        markdown = mock_scraper.format_markdown(articles)
        assert '# ID: test123' in markdown
        assert '## Title: Test Article' in markdown
        assert '## Author: Bryan Orr' in markdown
        assert '## Type: blog_post' in markdown
        assert '## Word Count: 250' in markdown
        assert '## Categories: HVAC, Heat Transfer' in markdown
        assert '## Permalink: http://www.hvacrschool.com/test-article/' in markdown
        assert '## Description:' in markdown

    @patch('time.sleep')
    def test_rate_limiting(self, mock_sleep, mock_scraper):
        """Test rate limiting functionality.

        NOTE(review): assumes _apply_rate_limit records the current time in
        last_request_time on each call — confirm against the implementation.
        """
        mock_scraper.last_request_time = 0
        mock_scraper.request_delay = 2.0
        # First call should not sleep (10s >> 2s delay since last request)
        with patch('time.time', return_value=10.0):
            mock_scraper._apply_rate_limit()
            mock_sleep.assert_not_called()
        # Second call within delay period should sleep
        with patch('time.time', return_value=11.0):  # 1 second later
            mock_scraper._apply_rate_limit()
            mock_sleep.assert_called_once_with(1.0)  # Should sleep for 1 more second

    @patch('src.hvacrschool_scraper.HVACRSchoolScraper.fetch_sitemap_urls')
    @patch('src.hvacrschool_scraper.HVACRSchoolScraper.scrape_article')
    def test_fetch_content(self, mock_scrape_article, mock_fetch_sitemap, mock_scraper):
        """Test content fetching with max_items limit."""
        # Mock sitemap URLs (three available; only two should be scraped)
        mock_fetch_sitemap.return_value = [
            {'url': 'http://www.hvacrschool.com/article1/', 'lastmod': '2024-01-20T10:00:00Z'},
            {'url': 'http://www.hvacrschool.com/article2/', 'lastmod': '2024-01-15T10:00:00Z'},
            {'url': 'http://www.hvacrschool.com/article3/', 'lastmod': '2024-01-10T10:00:00Z'},
        ]
        # Mock article scraping; only two results are provided, so a third
        # call would raise StopIteration and fail the test loudly.
        mock_scrape_article.side_effect = [
            {'title': 'Article 1', 'url': 'http://www.hvacrschool.com/article1/'},
            {'title': 'Article 2', 'url': 'http://www.hvacrschool.com/article2/'},
        ]
        # Test with max_items limit
        articles = mock_scraper.fetch_content(max_items=2)
        assert len(articles) == 2
        assert articles[0]['title'] == 'Article 1'
        assert articles[1]['title'] == 'Article 2'
        # Should have called scrape_article twice (limited by max_items)
        assert mock_scrape_article.call_count == 2