import json
from datetime import datetime
from pathlib import Path
from unittest.mock import MagicMock, Mock, patch

import pytest

from src.base_scraper import ScraperConfig
from src.hvacrschool_scraper import HVACRSchoolScraper


class TestHVACRSchoolScraper:
    """Unit tests for ``HVACRSchoolScraper``.

    Every test replaces the fetcher / request layer with mocks, so nothing
    here is expected to touch the network.
    """

    @pytest.fixture
    def config(self):
        """Return the ``ScraperConfig`` shared by all tests in this class."""
        return ScraperConfig(
            source_name="hvacrschool",
            brand_name="hkia",
            data_dir=Path("test_data"),
            logs_dir=Path("test_logs"),
            timezone="America/Halifax",
        )

    @pytest.fixture
    def mock_scraper(self, config):
        """Yield a scraper built while ``StealthyFetcher`` is patched out.

        The fixture yields from inside the ``with`` block (instead of
        returning) so the patch stays active for the whole test body and is
        only undone during fixture teardown.
        """
        with patch('src.hvacrschool_scraper.StealthyFetcher') as mock_scraper_class:
            mock_scraper_instance = MagicMock()
            mock_scraper_class.return_value = mock_scraper_instance
            scraper = HVACRSchoolScraper(config)
            scraper.scraper = mock_scraper_instance
            yield scraper

    @pytest.fixture
    def sample_sitemap_xml(self):
        """Return a sitemap with two article URLs and one ``/page/`` URL.

        The XML declaration must be the very first characters of the string;
        leading whitespace before it makes the document unparseable.
        """
        return '''<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
    <url>
        <loc>http://www.hvacrschool.com/understanding-heat-transfer/</loc>
        <lastmod>2024-01-15T10:30:00Z</lastmod>
    </url>
    <url>
        <loc>http://www.hvacrschool.com/refrigeration-basics/</loc>
        <lastmod>2024-01-10T14:20:00Z</lastmod>
    </url>
    <url>
        <loc>http://www.hvacrschool.com/page/about/</loc>
        <lastmod>2024-01-01T12:00:00Z</lastmod>
    </url>
</urlset>'''

    @pytest.fixture
    def sample_article_html(self):
        """Return a small article page.

        The markup is only handed to ``_mock_css_selector``, which does not
        parse it, so its exact structure does not influence any assertion.
        """
        return '''
        <html>
        <head>
            <title>Understanding Heat Transfer - HVACR School</title>
        </head>
        <body>
            <article>
                <h1>Understanding Heat Transfer</h1>
                <div class="entry-content">
                    <p>Heat transfer is fundamental to HVAC systems...</p>
                    <p>There are three main types: conduction, convection, and radiation.</p>
                </div>
            </article>
        </body>
        </html>
        '''

    def test_initialization(self, config):
        """Test scraper initialization."""
        with patch('src.hvacrschool_scraper.StealthyFetcher'):
            scraper = HVACRSchoolScraper(config)

            assert scraper.base_url == "http://www.hvacrschool.com/"
            assert scraper.sitemap_url == "http://www.hvacrschool.com/sitemap-1.xml"
            assert scraper.request_delay == 2.0
            assert scraper.article_cache == {}

    @patch('src.hvacrschool_scraper.HVACRSchoolScraper.make_request')
    def test_fetch_sitemap_urls(self, mock_request, mock_scraper, sample_sitemap_xml):
        """Test fetching URLs from sitemap."""
        mock_response = Mock()
        mock_response.content = sample_sitemap_xml.encode()
        mock_response.raise_for_status.return_value = None
        mock_request.return_value = mock_response

        urls = mock_scraper.fetch_sitemap_urls()

        assert len(urls) == 2  # Should exclude the /page/ URL
        assert urls[0]['url'] == 'http://www.hvacrschool.com/understanding-heat-transfer/'
        assert urls[0]['lastmod'] == '2024-01-15T10:30:00Z'
        assert urls[1]['url'] == 'http://www.hvacrschool.com/refrigeration-basics/'

    def test_is_article_url(self, mock_scraper):
        """Test URL filtering logic."""
        # Valid article URLs
        assert mock_scraper._is_article_url('http://www.hvacrschool.com/understanding-heat-transfer/')
        assert mock_scraper._is_article_url('http://www.hvacrschool.com/refrigeration-basics/')

        # Invalid URLs
        assert not mock_scraper._is_article_url('http://www.hvacrschool.com/page/about/')
        assert not mock_scraper._is_article_url('http://www.hvacrschool.com/category/hvac/')
        assert not mock_scraper._is_article_url('http://www.hvacrschool.com/feed/')
        assert not mock_scraper._is_article_url('http://www.hvacrschool.com/')
        assert not mock_scraper._is_article_url('http://otherdomain.com/article/')

    def test_extract_article_data(self, mock_scraper, sample_article_html):
        """Test article data extraction."""
        mock_response = Mock()
        mock_response.css.side_effect = self._mock_css_selector(sample_article_html)

        url = 'http://www.hvacrschool.com/understanding-heat-transfer/'
        article_data = mock_scraper._extract_article_data(mock_response, url)

        assert article_data is not None
        assert article_data['title'] == 'Understanding Heat Transfer'
        assert article_data['author'] == 'Bryan Orr'
        assert article_data['publish_date'] == '2024-01-15T10:30:00Z'
        assert article_data['description'] == 'Learn the basics of heat transfer in HVAC systems'
        assert article_data['url'] == url
        assert article_data['type'] == 'blog_post'
        assert article_data['source'] == 'hvacrschool'

    def _mock_css_selector(self, html_content):
        """Helper to mock CSS selector responses.

        Returns a callable suitable for ``mock_response.css.side_effect``.
        The results are canned per selector; ``html_content`` is accepted for
        call-site readability but is not parsed.
        """
        def css_side_effect(selector):
            mock_elements = Mock()

            if selector == 'script[type="application/ld+json"]':
                mock_script = Mock()
                mock_script.text = '''
                {
                    "@context": "https://schema.org",
                    "@type": "Article",
                    "headline": "Understanding Heat Transfer",
                    "description": "Learn the basics of heat transfer in HVAC systems",
                    "author": {"@type": "Person", "name": "Bryan Orr"},
                    "datePublished": "2024-01-15T10:30:00Z"
                }
                '''
                # side_effect builds a fresh iterator on every iter() call.
                # A fixed return_value=iter([...]) would be exhausted after
                # the first pass and silently yield nothing afterwards.
                mock_elements.__iter__ = Mock(side_effect=lambda: iter([mock_script]))
                return mock_elements

            elif selector == 'article':
                mock_article = Mock()
                mock_article.html = '<div><p>Heat transfer is fundamental...</p></div>'
                mock_elements.first = mock_article
                return mock_elements

            elif selector == 'h1':
                mock_title = Mock()
                mock_title.text = 'Understanding Heat Transfer'
                mock_elements.first = mock_title
                return mock_elements

            else:
                mock_elements.first = None
                return mock_elements

        return css_side_effect

    def test_generate_article_id(self, mock_scraper):
        """Test article ID generation."""
        url1 = 'http://www.hvacrschool.com/understanding-heat-transfer/'
        url2 = 'http://www.hvacrschool.com/refrigeration-basics/'

        id1 = mock_scraper._generate_article_id(url1)
        id2 = mock_scraper._generate_article_id(url2)

        assert len(id1) == 12
        assert len(id2) == 12
        assert id1 != id2

        # Same URL should generate same ID
        assert id1 == mock_scraper._generate_article_id(url1)

    def test_get_incremental_items(self, mock_scraper):
        """Test incremental item filtering."""
        items = [
            {'publish_date': '2024-01-15T10:30:00Z', 'title': 'New Article'},
            {'publish_date': '2024-01-10T14:20:00Z', 'title': 'Old Article'},
            {'publish_date': '2024-01-20T08:00:00Z', 'title': 'Newer Article'},
        ]

        # Test with no state (should return all items)
        state = {}
        result = mock_scraper.get_incremental_items(items, state)
        assert len(result) == 3

        # Test with last sync date
        state = {'last_sync_date': '2024-01-12T00:00:00Z'}
        result = mock_scraper.get_incremental_items(items, state)
        assert len(result) == 2  # Should return items newer than 2024-01-12
        assert result[0]['title'] == 'New Article'
        assert result[1]['title'] == 'Newer Article'

    def test_update_state(self, mock_scraper):
        """Test state update logic."""
        items = [
            {'publish_date': '2024-01-10T14:20:00Z', 'title': 'Article 1'},
            {'publish_date': '2024-01-20T08:00:00Z', 'title': 'Article 2'},
            {'publish_date': '2024-01-15T10:30:00Z', 'title': 'Article 3'},
        ]

        state = {}
        updated_state = mock_scraper.update_state(state, items)

        assert updated_state['last_sync_date'] == '2024-01-20T08:00:00Z'  # Latest date
        assert updated_state['article_count'] == 3
        assert 'last_sync' in updated_state

    def test_format_markdown(self, mock_scraper):
        """Test markdown formatting."""
        articles = [
            {
                'id': 'test123',
                'title': 'Test Article',
                'author': 'Bryan Orr',
                'publish_date': '2024-01-15T10:30:00Z',
                'word_count': 250,
                'categories': ['HVAC', 'Heat Transfer'],
                'url': 'http://www.hvacrschool.com/test-article/',
                'content': '<p>Test content</p>',
                'description': 'Test description'
            }
        ]

        markdown = mock_scraper.format_markdown(articles)

        assert '# ID: test123' in markdown
        assert '## Title: Test Article' in markdown
        assert '## Author: Bryan Orr' in markdown
        assert '## Type: blog_post' in markdown
        assert '## Word Count: 250' in markdown
        assert '## Categories: HVAC, Heat Transfer' in markdown
        assert '## Permalink: http://www.hvacrschool.com/test-article/' in markdown
        assert '## Description:' in markdown

    @patch('time.sleep')
    def test_rate_limiting(self, mock_sleep, mock_scraper):
        """Test rate limiting functionality."""
        mock_scraper.last_request_time = 0
        mock_scraper.request_delay = 2.0

        # First call should not sleep
        with patch('time.time', return_value=10.0):
            mock_scraper._apply_rate_limit()
            mock_sleep.assert_not_called()

        # Second call within delay period should sleep
        with patch('time.time', return_value=11.0):  # 1 second later
            mock_scraper._apply_rate_limit()
            mock_sleep.assert_called_once_with(1.0)  # Should sleep for 1 more second

    @patch('src.hvacrschool_scraper.HVACRSchoolScraper.fetch_sitemap_urls')
    @patch('src.hvacrschool_scraper.HVACRSchoolScraper.scrape_article')
    def test_fetch_content(self, mock_scrape_article, mock_fetch_sitemap, mock_scraper):
        """Test content fetching with max_items limit."""
        # Mock sitemap URLs
        mock_fetch_sitemap.return_value = [
            {'url': 'http://www.hvacrschool.com/article1/', 'lastmod': '2024-01-20T10:00:00Z'},
            {'url': 'http://www.hvacrschool.com/article2/', 'lastmod': '2024-01-15T10:00:00Z'},
            {'url': 'http://www.hvacrschool.com/article3/', 'lastmod': '2024-01-10T10:00:00Z'},
        ]

        # Mock article scraping
        mock_scrape_article.side_effect = [
            {'title': 'Article 1', 'url': 'http://www.hvacrschool.com/article1/'},
            {'title': 'Article 2', 'url': 'http://www.hvacrschool.com/article2/'},
        ]

        # Test with max_items limit
        articles = mock_scraper.fetch_content(max_items=2)

        assert len(articles) == 2
        assert articles[0]['title'] == 'Article 1'
        assert articles[1]['title'] == 'Article 2'

        # Should have called scrape_article twice (limited by max_items)
        assert mock_scrape_article.call_count == 2