- Add new HVACRSchool scraper for technical articles (6th source) - Fix WordPress API connectivity (corrected URL to hvacknowitall.com) - Fix MailChimp RSS processing after environment consolidation - Implement YouTube hybrid scraper (API + yt-dlp) with PO token support - Disable YouTube transcripts due to platform restrictions (Aug 2025) - Update orchestrator to use all 6 active sources - Consolidate environment variables into single .env file - Full system sync completed with all sources updating successfully - Update documentation with current system status and capabilities 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
288 lines
No EOL
12 KiB
Python
288 lines
No EOL
12 KiB
Python
import pytest
|
|
from unittest.mock import Mock, patch, MagicMock
|
|
from datetime import datetime
|
|
import json
|
|
from pathlib import Path
|
|
from src.hvacrschool_scraper import HVACRSchoolScraper
|
|
from src.base_scraper import ScraperConfig
|
|
|
|
|
|
class TestHVACRSchoolScraper:
    """Unit tests for ``HVACRSchoolScraper``.

    All tests run against mocks: the ``StealthyFetcher`` name in
    ``src.hvacrschool_scraper`` is patched and HTTP/sitemap helpers are
    replaced per test, so no network access is expected.
    """

    @pytest.fixture
    def config(self):
        """Return a ``ScraperConfig`` pointing at throwaway test directories."""
        return ScraperConfig(
            source_name="hvacrschool",
            brand_name="hkia",
            data_dir=Path("test_data"),
            logs_dir=Path("test_logs"),
            timezone="America/Halifax"
        )

    @pytest.fixture
    def mock_scraper(self, config):
        """Yield an ``HVACRSchoolScraper`` whose fetcher is a ``MagicMock``.

        The fixture yields instead of returning so that the
        ``StealthyFetcher`` patch stays active for the whole test body;
        with a plain ``return`` the ``with`` block exits (and the patch is
        undone) before the test starts running.
        """
        with patch('src.hvacrschool_scraper.StealthyFetcher') as mock_scraper_class:
            mock_scraper_instance = MagicMock()
            mock_scraper_class.return_value = mock_scraper_instance

            scraper = HVACRSchoolScraper(config)
            scraper.scraper = mock_scraper_instance
            yield scraper

    @pytest.fixture
    def sample_sitemap_xml(self):
        """Return a sitemap with two article URLs and one ``/page/`` URL."""
        return '''<?xml version="1.0" encoding="UTF-8"?>
        <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
            <url>
                <loc>http://www.hvacrschool.com/understanding-heat-transfer/</loc>
                <lastmod>2024-01-15T10:30:00Z</lastmod>
            </url>
            <url>
                <loc>http://www.hvacrschool.com/refrigeration-basics/</loc>
                <lastmod>2024-01-10T14:20:00Z</lastmod>
            </url>
            <url>
                <loc>http://www.hvacrschool.com/page/about/</loc>
                <lastmod>2024-01-01T12:00:00Z</lastmod>
            </url>
        </urlset>'''

    @pytest.fixture
    def sample_article_html(self):
        """Return a minimal article page with JSON-LD metadata and body text."""
        return '''
        <html>
        <head>
            <title>Understanding Heat Transfer - HVACR School</title>
            <meta name="description" content="Learn the basics of heat transfer in HVAC systems">
            <script type="application/ld+json">
            {
                "@context": "https://schema.org",
                "@type": "Article",
                "headline": "Understanding Heat Transfer",
                "description": "Learn the basics of heat transfer in HVAC systems",
                "author": {"@type": "Person", "name": "Bryan Orr"},
                "datePublished": "2024-01-15T10:30:00Z"
            }
            </script>
        </head>
        <body>
            <article>
                <h1>Understanding Heat Transfer</h1>
                <div class="entry-content">
                    <p>Heat transfer is fundamental to HVAC systems...</p>
                    <p>There are three main types: conduction, convection, and radiation.</p>
                </div>
            </article>
        </body>
        </html>
        '''

    def test_initialization(self, config):
        """Test scraper initialization."""
        with patch('src.hvacrschool_scraper.StealthyFetcher'):
            scraper = HVACRSchoolScraper(config)
            assert scraper.base_url == "http://www.hvacrschool.com/"
            assert scraper.sitemap_url == "http://www.hvacrschool.com/sitemap-1.xml"
            assert scraper.request_delay == 2.0
            assert scraper.article_cache == {}

    @patch('src.hvacrschool_scraper.HVACRSchoolScraper.make_request')
    def test_fetch_sitemap_urls(self, mock_request, mock_scraper, sample_sitemap_xml):
        """Test fetching URLs from sitemap."""
        mock_response = Mock()
        mock_response.content = sample_sitemap_xml.encode()
        mock_response.raise_for_status.return_value = None
        mock_request.return_value = mock_response

        urls = mock_scraper.fetch_sitemap_urls()

        assert len(urls) == 2  # Should exclude the /page/ URL
        assert urls[0]['url'] == 'http://www.hvacrschool.com/understanding-heat-transfer/'
        assert urls[0]['lastmod'] == '2024-01-15T10:30:00Z'
        assert urls[1]['url'] == 'http://www.hvacrschool.com/refrigeration-basics/'

    def test_is_article_url(self, mock_scraper):
        """Test URL filtering logic."""
        # Valid article URLs
        assert mock_scraper._is_article_url('http://www.hvacrschool.com/understanding-heat-transfer/')
        assert mock_scraper._is_article_url('http://www.hvacrschool.com/refrigeration-basics/')

        # Invalid URLs
        assert not mock_scraper._is_article_url('http://www.hvacrschool.com/page/about/')
        assert not mock_scraper._is_article_url('http://www.hvacrschool.com/category/hvac/')
        assert not mock_scraper._is_article_url('http://www.hvacrschool.com/feed/')
        assert not mock_scraper._is_article_url('http://www.hvacrschool.com/')
        assert not mock_scraper._is_article_url('http://otherdomain.com/article/')

    def test_extract_article_data(self, mock_scraper, sample_article_html):
        """Test article data extraction."""
        mock_response = Mock()
        mock_response.css.side_effect = self._mock_css_selector(sample_article_html)

        url = 'http://www.hvacrschool.com/understanding-heat-transfer/'
        article_data = mock_scraper._extract_article_data(mock_response, url)

        assert article_data is not None
        assert article_data['title'] == 'Understanding Heat Transfer'
        assert article_data['author'] == 'Bryan Orr'
        assert article_data['publish_date'] == '2024-01-15T10:30:00Z'
        assert article_data['description'] == 'Learn the basics of heat transfer in HVAC systems'
        assert article_data['url'] == url
        assert article_data['type'] == 'blog_post'
        assert article_data['source'] == 'hvacrschool'

    def _mock_css_selector(self, html_content):
        """Helper to mock CSS selector responses.

        Returns a callable suitable for ``mock_response.css.side_effect``.
        The callable hands back canned results for the JSON-LD script,
        ``article`` and ``h1`` selectors; every other selector gets a
        result whose ``first`` is ``None``.

        ``html_content`` is accepted for call-site compatibility but is not
        parsed: the canned values below are returned regardless of it.
        """
        def css_side_effect(selector):
            mock_elements = Mock()

            if selector == 'script[type="application/ld+json"]':
                mock_script = Mock()
                mock_script.text = '''
                {
                    "@context": "https://schema.org",
                    "@type": "Article",
                    "headline": "Understanding Heat Transfer",
                    "description": "Learn the basics of heat transfer in HVAC systems",
                    "author": {"@type": "Person", "name": "Bryan Orr"},
                    "datePublished": "2024-01-15T10:30:00Z"
                }
                '''
                # Build a fresh iterator on every iteration. A single
                # pre-built iterator passed as return_value is exhausted
                # after the first loop, so a second pass over the same
                # result would silently see no script tags.
                mock_elements.__iter__ = Mock(
                    side_effect=lambda *_args: iter([mock_script])
                )
                return mock_elements

            elif selector == 'article':
                mock_article = Mock()
                mock_article.html = '<div><p>Heat transfer is fundamental...</p></div>'
                mock_elements.first = mock_article
                return mock_elements

            elif selector == 'h1':
                mock_title = Mock()
                mock_title.text = 'Understanding Heat Transfer'
                mock_elements.first = mock_title
                return mock_elements

            else:
                mock_elements.first = None
                return mock_elements

        return css_side_effect

    def test_generate_article_id(self, mock_scraper):
        """Test article ID generation."""
        url1 = 'http://www.hvacrschool.com/understanding-heat-transfer/'
        url2 = 'http://www.hvacrschool.com/refrigeration-basics/'

        id1 = mock_scraper._generate_article_id(url1)
        id2 = mock_scraper._generate_article_id(url2)

        assert len(id1) == 12
        assert len(id2) == 12
        assert id1 != id2
        # Same URL should generate same ID
        assert id1 == mock_scraper._generate_article_id(url1)

    def test_get_incremental_items(self, mock_scraper):
        """Test incremental item filtering."""
        items = [
            {'publish_date': '2024-01-15T10:30:00Z', 'title': 'New Article'},
            {'publish_date': '2024-01-10T14:20:00Z', 'title': 'Old Article'},
            {'publish_date': '2024-01-20T08:00:00Z', 'title': 'Newer Article'},
        ]

        # Test with no state (should return all items)
        state = {}
        result = mock_scraper.get_incremental_items(items, state)
        assert len(result) == 3

        # Test with last sync date
        state = {'last_sync_date': '2024-01-12T00:00:00Z'}
        result = mock_scraper.get_incremental_items(items, state)
        assert len(result) == 2  # Should return items newer than 2024-01-12
        assert result[0]['title'] == 'New Article'
        assert result[1]['title'] == 'Newer Article'

    def test_update_state(self, mock_scraper):
        """Test state update logic."""
        items = [
            {'publish_date': '2024-01-10T14:20:00Z', 'title': 'Article 1'},
            {'publish_date': '2024-01-20T08:00:00Z', 'title': 'Article 2'},
            {'publish_date': '2024-01-15T10:30:00Z', 'title': 'Article 3'},
        ]

        state = {}
        updated_state = mock_scraper.update_state(state, items)

        assert updated_state['last_sync_date'] == '2024-01-20T08:00:00Z'  # Latest date
        assert updated_state['article_count'] == 3
        assert 'last_sync' in updated_state

    def test_format_markdown(self, mock_scraper):
        """Test markdown formatting."""
        articles = [
            {
                'id': 'test123',
                'title': 'Test Article',
                'author': 'Bryan Orr',
                'publish_date': '2024-01-15T10:30:00Z',
                'word_count': 250,
                'categories': ['HVAC', 'Heat Transfer'],
                'url': 'http://www.hvacrschool.com/test-article/',
                'content': '<p>Test content</p>',
                'description': 'Test description'
            }
        ]

        markdown = mock_scraper.format_markdown(articles)

        assert '# ID: test123' in markdown
        assert '## Title: Test Article' in markdown
        assert '## Author: Bryan Orr' in markdown
        assert '## Type: blog_post' in markdown
        assert '## Word Count: 250' in markdown
        assert '## Categories: HVAC, Heat Transfer' in markdown
        assert '## Permalink: http://www.hvacrschool.com/test-article/' in markdown
        assert '## Description:' in markdown

    @patch('time.sleep')
    def test_rate_limiting(self, mock_sleep, mock_scraper):
        """Test rate limiting functionality."""
        mock_scraper.last_request_time = 0
        mock_scraper.request_delay = 2.0

        # First call should not sleep
        with patch('time.time', return_value=10.0):
            mock_scraper._apply_rate_limit()
            mock_sleep.assert_not_called()

        # Second call within delay period should sleep
        with patch('time.time', return_value=11.0):  # 1 second later
            mock_scraper._apply_rate_limit()
            mock_sleep.assert_called_once_with(1.0)  # Should sleep for 1 more second

    @patch('src.hvacrschool_scraper.HVACRSchoolScraper.fetch_sitemap_urls')
    @patch('src.hvacrschool_scraper.HVACRSchoolScraper.scrape_article')
    def test_fetch_content(self, mock_scrape_article, mock_fetch_sitemap, mock_scraper):
        """Test content fetching with max_items limit."""
        # Mock sitemap URLs
        mock_fetch_sitemap.return_value = [
            {'url': 'http://www.hvacrschool.com/article1/', 'lastmod': '2024-01-20T10:00:00Z'},
            {'url': 'http://www.hvacrschool.com/article2/', 'lastmod': '2024-01-15T10:00:00Z'},
            {'url': 'http://www.hvacrschool.com/article3/', 'lastmod': '2024-01-10T10:00:00Z'},
        ]

        # Mock article scraping
        mock_scrape_article.side_effect = [
            {'title': 'Article 1', 'url': 'http://www.hvacrschool.com/article1/'},
            {'title': 'Article 2', 'url': 'http://www.hvacrschool.com/article2/'},
        ]

        # Test with max_items limit
        articles = mock_scraper.fetch_content(max_items=2)

        assert len(articles) == 2
        assert articles[0]['title'] == 'Article 1'
        assert articles[1]['title'] == 'Article 2'

        # Should have called scrape_article twice (limited by max_items)
        assert mock_scrape_article.call_count == 2