hvac-kia-content/tests/test_hvacrschool_scraper.py
Ben Reed 34fd853874 feat: Add HVACRSchool scraper and fix all source connectivity
- Add new HVACRSchool scraper for technical articles (6th source)
- Fix WordPress API connectivity (corrected URL to hvacknowitall.com)
- Fix MailChimp RSS processing after environment consolidation
- Implement YouTube hybrid scraper (API + yt-dlp) with PO token support
- Disable YouTube transcripts due to platform restrictions (Aug 2025)
- Update orchestrator to use all 6 active sources
- Consolidate environment variables into single .env file
- Full system sync completed with all sources updating successfully
- Update documentation with current system status and capabilities

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-27 18:11:00 -03:00

288 lines
No EOL
12 KiB
Python

import pytest
from unittest.mock import Mock, patch, MagicMock
from datetime import datetime
import json
from pathlib import Path
from src.hvacrschool_scraper import HVACRSchoolScraper
from src.base_scraper import ScraperConfig
class TestHVACRSchoolScraper:
    """Unit tests for HVACRSchoolScraper.

    Covers sitemap fetching, article-URL filtering, article extraction,
    deterministic ID generation, incremental sync/state handling, markdown
    formatting, rate limiting, and the max_items limit of fetch_content.
    All network access is mocked; no real HTTP requests are made.
    """

    @pytest.fixture
    def config(self):
        """Minimal ScraperConfig pointing at throwaway test directories."""
        return ScraperConfig(
            source_name="hvacrschool",
            brand_name="hkia",
            data_dir=Path("test_data"),
            logs_dir=Path("test_logs"),
            timezone="America/Halifax"
        )

    @pytest.fixture
    def mock_scraper(self, config):
        """HVACRSchoolScraper with its StealthyFetcher replaced by a MagicMock.

        The class is patched before construction so __init__ never touches the
        network; the instance attribute is overwritten as well for safety.
        """
        with patch('src.hvacrschool_scraper.StealthyFetcher') as mock_scraper_class:
            mock_scraper_instance = MagicMock()
            mock_scraper_class.return_value = mock_scraper_instance
            scraper = HVACRSchoolScraper(config)
            scraper.scraper = mock_scraper_instance
            return scraper

    @pytest.fixture
    def sample_sitemap_xml(self):
        """Sitemap with two article URLs and one /page/ URL that must be filtered out."""
        return '''<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
    <url>
        <loc>http://www.hvacrschool.com/understanding-heat-transfer/</loc>
        <lastmod>2024-01-15T10:30:00Z</lastmod>
    </url>
    <url>
        <loc>http://www.hvacrschool.com/refrigeration-basics/</loc>
        <lastmod>2024-01-10T14:20:00Z</lastmod>
    </url>
    <url>
        <loc>http://www.hvacrschool.com/page/about/</loc>
        <lastmod>2024-01-01T12:00:00Z</lastmod>
    </url>
</urlset>'''

    @pytest.fixture
    def sample_article_html(self):
        """Article page with JSON-LD metadata plus an <article> body."""
        return '''
<html>
<head>
    <title>Understanding Heat Transfer - HVACR School</title>
    <meta name="description" content="Learn the basics of heat transfer in HVAC systems">
    <script type="application/ld+json">
    {
        "@context": "https://schema.org",
        "@type": "Article",
        "headline": "Understanding Heat Transfer",
        "description": "Learn the basics of heat transfer in HVAC systems",
        "author": {"@type": "Person", "name": "Bryan Orr"},
        "datePublished": "2024-01-15T10:30:00Z"
    }
    </script>
</head>
<body>
    <article>
        <h1>Understanding Heat Transfer</h1>
        <div class="entry-content">
            <p>Heat transfer is fundamental to HVAC systems...</p>
            <p>There are three main types: conduction, convection, and radiation.</p>
        </div>
    </article>
</body>
</html>
'''

    def test_initialization(self, config):
        """Test scraper initialization."""
        with patch('src.hvacrschool_scraper.StealthyFetcher'):
            scraper = HVACRSchoolScraper(config)
            assert scraper.base_url == "http://www.hvacrschool.com/"
            assert scraper.sitemap_url == "http://www.hvacrschool.com/sitemap-1.xml"
            assert scraper.request_delay == 2.0
            assert scraper.article_cache == {}

    @patch('src.hvacrschool_scraper.HVACRSchoolScraper.make_request')
    def test_fetch_sitemap_urls(self, mock_request, mock_scraper, sample_sitemap_xml):
        """Test fetching URLs from sitemap."""
        mock_response = Mock()
        mock_response.content = sample_sitemap_xml.encode()
        mock_response.raise_for_status.return_value = None
        mock_request.return_value = mock_response
        urls = mock_scraper.fetch_sitemap_urls()
        assert len(urls) == 2  # Should exclude the /page/ URL
        assert urls[0]['url'] == 'http://www.hvacrschool.com/understanding-heat-transfer/'
        assert urls[0]['lastmod'] == '2024-01-15T10:30:00Z'
        assert urls[1]['url'] == 'http://www.hvacrschool.com/refrigeration-basics/'

    def test_is_article_url(self, mock_scraper):
        """Test URL filtering logic."""
        # Valid article URLs
        assert mock_scraper._is_article_url('http://www.hvacrschool.com/understanding-heat-transfer/')
        assert mock_scraper._is_article_url('http://www.hvacrschool.com/refrigeration-basics/')
        # Invalid URLs: structural pages, feeds, the site root, and foreign domains
        assert not mock_scraper._is_article_url('http://www.hvacrschool.com/page/about/')
        assert not mock_scraper._is_article_url('http://www.hvacrschool.com/category/hvac/')
        assert not mock_scraper._is_article_url('http://www.hvacrschool.com/feed/')
        assert not mock_scraper._is_article_url('http://www.hvacrschool.com/')
        assert not mock_scraper._is_article_url('http://otherdomain.com/article/')

    def test_extract_article_data(self, mock_scraper, sample_article_html):
        """Test article data extraction."""
        mock_response = Mock()
        mock_response.css.side_effect = self._mock_css_selector(sample_article_html)
        url = 'http://www.hvacrschool.com/understanding-heat-transfer/'
        article_data = mock_scraper._extract_article_data(mock_response, url)
        assert article_data is not None
        assert article_data['title'] == 'Understanding Heat Transfer'
        assert article_data['author'] == 'Bryan Orr'
        assert article_data['publish_date'] == '2024-01-15T10:30:00Z'
        assert article_data['description'] == 'Learn the basics of heat transfer in HVAC systems'
        assert article_data['url'] == url
        assert article_data['type'] == 'blog_post'
        assert article_data['source'] == 'hvacrschool'

    def _mock_css_selector(self, html_content):
        """Helper to mock CSS selector responses.

        Returns a side_effect callable for response.css(selector) that serves
        canned elements for the selectors the extractor is expected to use.
        """
        def css_side_effect(selector):
            # MagicMock (not plain Mock) is required here: iter() looks up
            # __iter__ on the type, so assigning __iter__ to a plain Mock
            # instance is silently ignored and iteration raises TypeError.
            # MagicMock explicitly supports attaching magic methods.
            mock_elements = MagicMock()
            if selector == 'script[type="application/ld+json"]':
                mock_script = Mock()
                mock_script.text = '''
                {
                    "@context": "https://schema.org",
                    "@type": "Article",
                    "headline": "Understanding Heat Transfer",
                    "description": "Learn the basics of heat transfer in HVAC systems",
                    "author": {"@type": "Person", "name": "Bryan Orr"},
                    "datePublished": "2024-01-15T10:30:00Z"
                }
                '''
                # NOTE: iter(...) is single-use; the element list can only be
                # iterated once per css() call, which matches a one-pass extractor.
                mock_elements.__iter__ = Mock(return_value=iter([mock_script]))
                return mock_elements
            elif selector == 'article':
                mock_article = Mock()
                mock_article.html = '<div><p>Heat transfer is fundamental...</p></div>'
                mock_elements.first = mock_article
                return mock_elements
            elif selector == 'h1':
                mock_title = Mock()
                mock_title.text = 'Understanding Heat Transfer'
                mock_elements.first = mock_title
                return mock_elements
            else:
                # Unknown selectors yield "no match" so the extractor falls back.
                mock_elements.first = None
                return mock_elements
        return css_side_effect

    def test_generate_article_id(self, mock_scraper):
        """Test article ID generation."""
        url1 = 'http://www.hvacrschool.com/understanding-heat-transfer/'
        url2 = 'http://www.hvacrschool.com/refrigeration-basics/'
        id1 = mock_scraper._generate_article_id(url1)
        id2 = mock_scraper._generate_article_id(url2)
        # IDs are fixed-length and collision-distinct across URLs
        assert len(id1) == 12
        assert len(id2) == 12
        assert id1 != id2
        # Same URL should generate same ID (deterministic)
        assert id1 == mock_scraper._generate_article_id(url1)

    def test_get_incremental_items(self, mock_scraper):
        """Test incremental item filtering."""
        items = [
            {'publish_date': '2024-01-15T10:30:00Z', 'title': 'New Article'},
            {'publish_date': '2024-01-10T14:20:00Z', 'title': 'Old Article'},
            {'publish_date': '2024-01-20T08:00:00Z', 'title': 'Newer Article'},
        ]
        # Test with no state (should return all items)
        state = {}
        result = mock_scraper.get_incremental_items(items, state)
        assert len(result) == 3
        # Test with last sync date
        state = {'last_sync_date': '2024-01-12T00:00:00Z'}
        result = mock_scraper.get_incremental_items(items, state)
        assert len(result) == 2  # Should return items newer than 2024-01-12
        assert result[0]['title'] == 'New Article'
        assert result[1]['title'] == 'Newer Article'

    def test_update_state(self, mock_scraper):
        """Test state update logic."""
        items = [
            {'publish_date': '2024-01-10T14:20:00Z', 'title': 'Article 1'},
            {'publish_date': '2024-01-20T08:00:00Z', 'title': 'Article 2'},
            {'publish_date': '2024-01-15T10:30:00Z', 'title': 'Article 3'},
        ]
        state = {}
        updated_state = mock_scraper.update_state(state, items)
        assert updated_state['last_sync_date'] == '2024-01-20T08:00:00Z'  # Latest date
        assert updated_state['article_count'] == 3
        assert 'last_sync' in updated_state

    def test_format_markdown(self, mock_scraper):
        """Test markdown formatting."""
        articles = [
            {
                'id': 'test123',
                'title': 'Test Article',
                'author': 'Bryan Orr',
                'publish_date': '2024-01-15T10:30:00Z',
                'word_count': 250,
                'categories': ['HVAC', 'Heat Transfer'],
                'url': 'http://www.hvacrschool.com/test-article/',
                'content': '<p>Test content</p>',
                'description': 'Test description'
            }
        ]
        markdown = mock_scraper.format_markdown(articles)
        assert '# ID: test123' in markdown
        assert '## Title: Test Article' in markdown
        assert '## Author: Bryan Orr' in markdown
        assert '## Type: blog_post' in markdown
        assert '## Word Count: 250' in markdown
        assert '## Categories: HVAC, Heat Transfer' in markdown
        assert '## Permalink: http://www.hvacrschool.com/test-article/' in markdown
        assert '## Description:' in markdown

    @patch('time.sleep')
    def test_rate_limiting(self, mock_sleep, mock_scraper):
        """Test rate limiting functionality.

        NOTE(review): assumes _apply_rate_limit records the current time in
        last_request_time on each call — confirm against the implementation.
        """
        mock_scraper.last_request_time = 0
        mock_scraper.request_delay = 2.0
        # First call should not sleep (10s >> 2s delay since last request)
        with patch('time.time', return_value=10.0):
            mock_scraper._apply_rate_limit()
            mock_sleep.assert_not_called()
        # Second call within delay period should sleep
        with patch('time.time', return_value=11.0):  # 1 second later
            mock_scraper._apply_rate_limit()
            mock_sleep.assert_called_once_with(1.0)  # Should sleep for 1 more second

    @patch('src.hvacrschool_scraper.HVACRSchoolScraper.fetch_sitemap_urls')
    @patch('src.hvacrschool_scraper.HVACRSchoolScraper.scrape_article')
    def test_fetch_content(self, mock_scrape_article, mock_fetch_sitemap, mock_scraper):
        """Test content fetching with max_items limit."""
        # Mock sitemap URLs (three available; only two should be scraped)
        mock_fetch_sitemap.return_value = [
            {'url': 'http://www.hvacrschool.com/article1/', 'lastmod': '2024-01-20T10:00:00Z'},
            {'url': 'http://www.hvacrschool.com/article2/', 'lastmod': '2024-01-15T10:00:00Z'},
            {'url': 'http://www.hvacrschool.com/article3/', 'lastmod': '2024-01-10T10:00:00Z'},
        ]
        # Mock article scraping; only two results are provided, so a third
        # call would raise StopIteration and fail the test loudly.
        mock_scrape_article.side_effect = [
            {'title': 'Article 1', 'url': 'http://www.hvacrschool.com/article1/'},
            {'title': 'Article 2', 'url': 'http://www.hvacrschool.com/article2/'},
        ]
        # Test with max_items limit
        articles = mock_scraper.fetch_content(max_items=2)
        assert len(articles) == 2
        assert articles[0]['title'] == 'Article 1'
        assert articles[1]['title'] == 'Article 2'
        # Should have called scrape_article twice (limited by max_items)
        assert mock_scrape_article.call_count == 2