"""Unit tests for the MailChimp newsletter and podcast RSS scrapers."""

import pytest
from unittest.mock import Mock, patch, MagicMock
from datetime import datetime
from pathlib import Path

from src.rss_scraper import RSSScraperMailChimp, RSSScraperPodcast
from src.base_scraper import ScraperConfig


class TestRSSScraperMailChimp:
    """Tests exercising ``RSSScraperMailChimp``."""

    @pytest.fixture
    def config(self):
        """Return a ``ScraperConfig`` describing the mailchimp source."""
        return ScraperConfig(
            source_name="mailchimp",
            brand_name="hvacknowitall",
            data_dir=Path("data"),
            logs_dir=Path("logs"),
            timezone="America/Halifax",
        )

    @pytest.fixture
    def mock_env(self):
        """Patch ``os.environ`` with ``MAILCHIMP_RSS_URL`` while a test runs."""
        env_overrides = {'MAILCHIMP_RSS_URL': 'https://hvacknowitall.com/feed/'}
        with patch.dict('os.environ', env_overrides):
            yield

    @pytest.fixture
    def sample_rss_feed(self):
        """Return sample newsletter feed text.

        The text is requested by ``test_fetch_feed`` but is not referenced in
        any assertion there.
        """
        return """ HVAC Know It All Newsletter https://hvacknowitall.com HVAC Tips and Tricks Newsletter Issue 1 https://hvacknowitall.com/newsletter/1 Newsletter content 1 Mon, 01 Jan 2024 12:00:00 GMT newsletter-1 Newsletter Issue 2 https://hvacknowitall.com/newsletter/2 Newsletter content 2 Tue, 02 Jan 2024 12:00:00 GMT newsletter-2 """

    def test_initialization(self, config, mock_env):
        """The scraper keeps the config and exposes the feed URL from the env."""
        mailchimp = RSSScraperMailChimp(config)

        assert mailchimp.config == config
        assert mailchimp.feed_url == 'https://hvacknowitall.com/feed/'

    def test_fetch_feed(self, config, mock_env, sample_rss_feed):
        """``fetch_feed`` returns the entries of the parsed (mocked) feed."""
        first_entry = {
            'title': 'Newsletter Issue 1',
            'link': 'https://hvacknowitall.com/newsletter/1',
            'description': 'Newsletter content 1',
            'published': 'Mon, 01 Jan 2024 12:00:00 GMT',
            'id': 'newsletter-1',
        }
        second_entry = {
            'title': 'Newsletter Issue 2',
            'link': 'https://hvacknowitall.com/newsletter/2',
            'description': 'Newsletter content 2',
            'published': 'Tue, 02 Jan 2024 12:00:00 GMT',
            'id': 'newsletter-2',
        }
        parsed_feed = {'entries': [first_entry, second_entry]}

        # feedparser.parse stays patched for the whole test body, exactly as
        # a @patch decorator on the test would do.
        with patch('feedparser.parse') as mock_parse:
            mock_parse.return_value = parsed_feed

            mailchimp = RSSScraperMailChimp(config)
            fetched = mailchimp.fetch_feed()

            assert len(fetched) == 2
            assert fetched[0]['title'] == 'Newsletter Issue 1'
            mock_parse.assert_called_once_with(mailchimp.feed_url)

    def test_format_markdown(self, config, mock_env):
        """``format_markdown`` output contains each expected newsletter field."""
        mailchimp = RSSScraperMailChimp(config)
        newsletter = {
            'id': 'newsletter-1',
            'title': 'Newsletter Issue 1',
            'link': 'https://hvacknowitall.com/newsletter/1',
            'description': 'Newsletter content 1',
            'published': 'Mon, 01 Jan 2024 12:00:00 GMT',
        }

        markdown = mailchimp.format_markdown([newsletter])

        expected_fragments = (
            '# ID: newsletter-1',
            '## Title: Newsletter Issue 1',
            '## Type: newsletter',
            '## Link: https://hvacknowitall.com/newsletter/1',
            '## Publish Date: Mon, 01 Jan 2024 12:00:00 GMT',
            'Newsletter content 1',
        )
        for fragment in expected_fragments:
            assert fragment in markdown

    def test_get_incremental_items(self, config, mock_env):
        """``get_incremental_items`` filters items using ``last_item_id``."""
        mailchimp = RSSScraperMailChimp(config)
        items = [
            {'id': 'newsletter-3', 'published': 'Wed, 03 Jan 2024 12:00:00 GMT'},
            {'id': 'newsletter-2', 'published': 'Tue, 02 Jan 2024 12:00:00 GMT'},
            {'id': 'newsletter-1', 'published': 'Mon, 01 Jan 2024 12:00:00 GMT'},
        ]

        # Test with no previous state
        without_state = mailchimp.get_incremental_items(items, {})
        assert len(without_state) == 3

        # Test with existing state
        previous_state = {'last_item_id': 'newsletter-2'}
        with_state = mailchimp.get_incremental_items(items, previous_state)
        assert len(with_state) == 1
        assert with_state[0]['id'] == 'newsletter-3'


class TestRSSScraperPodcast:
    """Tests exercising ``RSSScraperPodcast``."""

    @pytest.fixture
    def config(self):
        """Return a ``ScraperConfig`` describing the podcast source."""
        return ScraperConfig(
            source_name="podcast",
            brand_name="hvacknowitall",
            data_dir=Path("data"),
            logs_dir=Path("logs"),
            timezone="America/Halifax",
        )

    @pytest.fixture
    def mock_env(self):
        """Patch ``os.environ`` with ``PODCAST_RSS_URL`` while a test runs."""
        env_overrides = {
            'PODCAST_RSS_URL': 'https://hvacknowitall.com/podcast/feed/'
        }
        with patch.dict('os.environ', env_overrides):
            yield

    @pytest.fixture
    def sample_podcast_feed(self):
        """Return a dict with one podcast entry, used as the mocked parse result."""
        episode = {
            'title': 'Episode 1: HVAC Basics',
            'subtitle': 'Learn the basics',
            'link': 'https://hvacknowitall.com/podcast/1',
            'description': 'In this episode we discuss HVAC basics',
            'published': 'Mon, 01 Jan 2024 12:00:00 GMT',
            'id': 'episode-1',
            'author': 'John Doe',
            'enclosures': [{'href': 'https://hvacknowitall.com/audio/ep1.mp3'}],
            'itunes_duration': '45:30',
            'image': {'href': 'https://hvacknowitall.com/images/ep1.jpg'},
        }
        return {'entries': [episode]}

    def test_initialization(self, config, mock_env):
        """The scraper keeps the config and exposes the feed URL from the env."""
        podcast = RSSScraperPodcast(config)

        assert podcast.config == config
        assert podcast.feed_url == 'https://hvacknowitall.com/podcast/feed/'

    def test_fetch_feed_with_podcast_fields(self, config, mock_env, sample_podcast_feed):
        """``fetch_feed`` returns entries that still carry podcast-specific keys."""
        # feedparser.parse stays patched for the whole test body, exactly as
        # a @patch decorator on the test would do.
        with patch('feedparser.parse') as mock_parse:
            mock_parse.return_value = sample_podcast_feed

            podcast = RSSScraperPodcast(config)
            fetched = podcast.fetch_feed()

            assert len(fetched) == 1
            episode = fetched[0]
            assert episode['title'] == 'Episode 1: HVAC Basics'
            assert episode.get('subtitle') == 'Learn the basics'
            assert episode.get('author') == 'John Doe'
            assert episode.get('itunes_duration') == '45:30'

    def test_extract_audio_link(self, config, mock_env):
        """``extract_audio_link`` returns the enclosure ``href``."""
        podcast = RSSScraperPodcast(config)
        enclosure = {
            'href': 'https://hvacknowitall.com/audio/ep1.mp3',
            'type': 'audio/mpeg',
        }
        item = {'enclosures': [enclosure]}

        audio_link = podcast.extract_audio_link(item)

        assert audio_link == 'https://hvacknowitall.com/audio/ep1.mp3'

    def test_extract_image_link(self, config, mock_env):
        """``extract_image_link`` returns the image ``href``."""
        podcast = RSSScraperPodcast(config)
        item = {'image': {'href': 'https://hvacknowitall.com/images/ep1.jpg'}}

        image_link = podcast.extract_image_link(item)

        assert image_link == 'https://hvacknowitall.com/images/ep1.jpg'

    def test_format_markdown_podcast(self, config, mock_env):
        """``format_markdown`` output contains each expected podcast field."""
        podcast = RSSScraperPodcast(config)
        episode = {
            'id': 'episode-1',
            'title': 'Episode 1: HVAC Basics',
            'subtitle': 'Learn the basics',
            'link': 'https://hvacknowitall.com/podcast/1',
            'description': 'In this episode we discuss HVAC basics',
            'published': 'Mon, 01 Jan 2024 12:00:00 GMT',
            'author': 'John Doe',
            'audio_link': 'https://hvacknowitall.com/audio/ep1.mp3',
            'duration': '45:30',
            'image_link': 'https://hvacknowitall.com/images/ep1.jpg',
        }

        markdown = podcast.format_markdown([episode])

        expected_fragments = (
            '# ID: episode-1',
            '## Title: Episode 1: HVAC Basics',
            '## Subtitle: Learn the basics',
            '## Type: podcast',
            '## Author: John Doe',
            '## Duration: 45:30',
            '## Audio Link: https://hvacknowitall.com/audio/ep1.mp3',
            '## Image: https://hvacknowitall.com/images/ep1.jpg',
            '## Episode Link: https://hvacknowitall.com/podcast/1',
        )
        for fragment in expected_fragments:
            assert fragment in markdown