- Created base RSS scraper class with common functionality
- Implemented MailChimp RSS scraper for newsletters
- Implemented Podcast RSS scraper with audio/image extraction
- State management for incremental updates
- All 9 tests passing for RSS scrapers
🤖 Generated with Claude Code
Co-Authored-By: Claude <noreply@anthropic.com>
235 lines
No EOL
8.7 KiB
Python
235 lines
No EOL
8.7 KiB
Python
import pytest
|
|
from unittest.mock import Mock, patch, MagicMock
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from src.rss_scraper import RSSScraperMailChimp, RSSScraperPodcast
|
|
from src.base_scraper import ScraperConfig
|
|
|
|
|
|
class TestRSSScraperMailChimp:
    """Unit tests for RSSScraperMailChimp (newsletter RSS feed)."""

    @pytest.fixture
    def config(self):
        """Build the ScraperConfig handed to every MailChimp scraper under test."""
        settings = {
            'source_name': "mailchimp",
            'brand_name': "hvacknowitall",
            'data_dir': Path("data"),
            'logs_dir': Path("logs"),
            'timezone': "America/Halifax",
        }
        return ScraperConfig(**settings)

    @pytest.fixture
    def mock_env(self):
        """Set MAILCHIMP_RSS_URL in os.environ for the duration of a test."""
        feed_env = {'MAILCHIMP_RSS_URL': 'https://hvacknowitall.com/feed/'}
        with patch.dict('os.environ', feed_env):
            yield

    @pytest.fixture
    def sample_rss_feed(self):
        """Return a two-item RSS 2.0 document as a string."""
        xml_lines = (
            '<?xml version="1.0" encoding="UTF-8"?>',
            '<rss version="2.0">',
            '<channel>',
            '<title>HVAC Know It All Newsletter</title>',
            '<link>https://hvacknowitall.com</link>',
            '<description>HVAC Tips and Tricks</description>',
            '<item>',
            '<title>Newsletter Issue 1</title>',
            '<link>https://hvacknowitall.com/newsletter/1</link>',
            '<description>Newsletter content 1</description>',
            '<pubDate>Mon, 01 Jan 2024 12:00:00 GMT</pubDate>',
            '<guid>newsletter-1</guid>',
            '</item>',
            '<item>',
            '<title>Newsletter Issue 2</title>',
            '<link>https://hvacknowitall.com/newsletter/2</link>',
            '<description>Newsletter content 2</description>',
            '<pubDate>Tue, 02 Jan 2024 12:00:00 GMT</pubDate>',
            '<guid>newsletter-2</guid>',
            '</item>',
            '</channel>',
            '</rss>',
        )
        return "\n".join(xml_lines)

    def test_initialization(self, config, mock_env):
        """A new scraper exposes the given config and the URL set by mock_env."""
        expected_url = 'https://hvacknowitall.com/feed/'

        newsletter_scraper = RSSScraperMailChimp(config)

        assert newsletter_scraper.config == config
        assert newsletter_scraper.feed_url == expected_url

    @patch('feedparser.parse')
    def test_fetch_feed(self, mock_parse, config, mock_env, sample_rss_feed):
        """fetch_feed yields the entries of the mocked feedparser.parse result."""
        # NOTE(review): sample_rss_feed is requested but never read here; the
        # parsed result is supplied directly through mock_parse instead.
        first_issue = {
            'title': 'Newsletter Issue 1',
            'link': 'https://hvacknowitall.com/newsletter/1',
            'description': 'Newsletter content 1',
            'published': 'Mon, 01 Jan 2024 12:00:00 GMT',
            'id': 'newsletter-1',
        }
        second_issue = {
            'title': 'Newsletter Issue 2',
            'link': 'https://hvacknowitall.com/newsletter/2',
            'description': 'Newsletter content 2',
            'published': 'Tue, 02 Jan 2024 12:00:00 GMT',
            'id': 'newsletter-2',
        }
        mock_parse.return_value = {'entries': [first_issue, second_issue]}

        newsletter_scraper = RSSScraperMailChimp(config)
        fetched = newsletter_scraper.fetch_feed()

        assert len(fetched) == 2
        assert fetched[0]['title'] == 'Newsletter Issue 1'
        # The parser must be invoked exactly once, with the scraper's own URL.
        mock_parse.assert_called_once_with(newsletter_scraper.feed_url)

    def test_format_markdown(self, config, mock_env):
        """format_markdown emits every expected header line and the body text."""
        newsletter_scraper = RSSScraperMailChimp(config)
        single_issue = {
            'id': 'newsletter-1',
            'title': 'Newsletter Issue 1',
            'link': 'https://hvacknowitall.com/newsletter/1',
            'description': 'Newsletter content 1',
            'published': 'Mon, 01 Jan 2024 12:00:00 GMT',
        }

        markdown = newsletter_scraper.format_markdown([single_issue])

        expected_fragments = (
            '# ID: newsletter-1',
            '## Title: Newsletter Issue 1',
            '## Type: newsletter',
            '## Link: https://hvacknowitall.com/newsletter/1',
            '## Publish Date: Mon, 01 Jan 2024 12:00:00 GMT',
            'Newsletter content 1',
        )
        for fragment in expected_fragments:
            assert fragment in markdown

    def test_get_incremental_items(self, config, mock_env):
        """get_incremental_items returns all items without state, else only newer ones."""
        newsletter_scraper = RSSScraperMailChimp(config)
        # Items are listed newest first.
        feed_items = [
            {'id': 'newsletter-3', 'published': 'Wed, 03 Jan 2024 12:00:00 GMT'},
            {'id': 'newsletter-2', 'published': 'Tue, 02 Jan 2024 12:00:00 GMT'},
            {'id': 'newsletter-1', 'published': 'Mon, 01 Jan 2024 12:00:00 GMT'},
        ]

        # Empty state: every item counts as new.
        fresh = newsletter_scraper.get_incremental_items(feed_items, {})
        assert len(fresh) == 3

        # State pointing at newsletter-2: only newsletter-3 is expected back.
        previous_state = {'last_item_id': 'newsletter-2'}
        fresh = newsletter_scraper.get_incremental_items(feed_items, previous_state)
        assert len(fresh) == 1
        assert fresh[0]['id'] == 'newsletter-3'
|
|
|
|
|
|
class TestRSSScraperPodcast:
    """Unit tests for RSSScraperPodcast (podcast RSS feed)."""

    @pytest.fixture
    def config(self):
        """Build the ScraperConfig handed to every podcast scraper under test."""
        settings = {
            'source_name': "podcast",
            'brand_name': "hvacknowitall",
            'data_dir': Path("data"),
            'logs_dir': Path("logs"),
            'timezone': "America/Halifax",
        }
        return ScraperConfig(**settings)

    @pytest.fixture
    def mock_env(self):
        """Set PODCAST_RSS_URL in os.environ for the duration of a test."""
        feed_env = {'PODCAST_RSS_URL': 'https://hvacknowitall.com/podcast/feed/'}
        with patch.dict('os.environ', feed_env):
            yield

    @pytest.fixture
    def sample_podcast_feed(self):
        """Return a dict with one podcast entry, used as the mocked parse result."""
        episode = {
            'title': 'Episode 1: HVAC Basics',
            'subtitle': 'Learn the basics',
            'link': 'https://hvacknowitall.com/podcast/1',
            'description': 'In this episode we discuss HVAC basics',
            'published': 'Mon, 01 Jan 2024 12:00:00 GMT',
            'id': 'episode-1',
            'author': 'John Doe',
            'enclosures': [{'href': 'https://hvacknowitall.com/audio/ep1.mp3'}],
            'itunes_duration': '45:30',
            'image': {'href': 'https://hvacknowitall.com/images/ep1.jpg'},
        }
        return {'entries': [episode]}

    def test_initialization(self, config, mock_env):
        """A new scraper exposes the given config and the URL set by mock_env."""
        expected_url = 'https://hvacknowitall.com/podcast/feed/'

        podcast_scraper = RSSScraperPodcast(config)

        assert podcast_scraper.config == config
        assert podcast_scraper.feed_url == expected_url

    @patch('feedparser.parse')
    def test_fetch_feed_with_podcast_fields(self, mock_parse, config, mock_env, sample_podcast_feed):
        """fetch_feed returns the mocked entry with its podcast-specific fields."""
        mock_parse.return_value = sample_podcast_feed

        podcast_scraper = RSSScraperPodcast(config)
        fetched = podcast_scraper.fetch_feed()

        assert len(fetched) == 1
        episode = fetched[0]
        assert episode['title'] == 'Episode 1: HVAC Basics'
        assert episode.get('subtitle') == 'Learn the basics'
        assert episode.get('author') == 'John Doe'
        assert episode.get('itunes_duration') == '45:30'

    def test_extract_audio_link(self, config, mock_env):
        """extract_audio_link returns the href of the item's enclosure."""
        podcast_scraper = RSSScraperPodcast(config)
        enclosure = {
            'href': 'https://hvacknowitall.com/audio/ep1.mp3',
            'type': 'audio/mpeg',
        }
        episode = {'enclosures': [enclosure]}

        audio_link = podcast_scraper.extract_audio_link(episode)

        assert audio_link == 'https://hvacknowitall.com/audio/ep1.mp3'

    def test_extract_image_link(self, config, mock_env):
        """extract_image_link returns the href stored under the item's image key."""
        podcast_scraper = RSSScraperPodcast(config)
        artwork = {'href': 'https://hvacknowitall.com/images/ep1.jpg'}
        episode = {'image': artwork}

        image_link = podcast_scraper.extract_image_link(episode)

        assert image_link == 'https://hvacknowitall.com/images/ep1.jpg'

    def test_format_markdown_podcast(self, config, mock_env):
        """format_markdown emits every expected podcast header line."""
        podcast_scraper = RSSScraperPodcast(config)
        episode = {
            'id': 'episode-1',
            'title': 'Episode 1: HVAC Basics',
            'subtitle': 'Learn the basics',
            'link': 'https://hvacknowitall.com/podcast/1',
            'description': 'In this episode we discuss HVAC basics',
            'published': 'Mon, 01 Jan 2024 12:00:00 GMT',
            'author': 'John Doe',
            'audio_link': 'https://hvacknowitall.com/audio/ep1.mp3',
            'duration': '45:30',
            'image_link': 'https://hvacknowitall.com/images/ep1.jpg',
        }

        markdown = podcast_scraper.format_markdown([episode])

        expected_fragments = (
            '# ID: episode-1',
            '## Title: Episode 1: HVAC Basics',
            '## Subtitle: Learn the basics',
            '## Type: podcast',
            '## Author: John Doe',
            '## Duration: 45:30',
            '## Audio Link: https://hvacknowitall.com/audio/ep1.mp3',
            '## Image: https://hvacknowitall.com/images/ep1.jpg',
            '## Episode Link: https://hvacknowitall.com/podcast/1',
        )
        for fragment in expected_fragments:
            assert fragment in markdown