"""Unit tests for ``YouTubeScraper``.

yt-dlp, the process environment and sleeping are replaced with mocks in
these tests.
"""

import random
from datetime import datetime
from pathlib import Path
from unittest.mock import MagicMock, Mock, call, patch

import pytest

from src.youtube_scraper import YouTubeScraper
from src.base_scraper import ScraperConfig


class TestYouTubeScraper:
    """Tests for the YouTube scraper's setup, fetching, formatting and state."""

    @pytest.fixture
    def config(self):
        """Return the ``ScraperConfig`` shared by every test in this class."""
        return ScraperConfig(
            source_name="youtube",
            brand_name="hvacknowitall",
            data_dir=Path("data"),
            logs_dir=Path("logs"),
            timezone="America/Halifax",
        )

    @pytest.fixture
    def mock_env(self):
        """Patch ``os.environ`` with test credentials and a channel URL.

        The patch is active for the duration of the requesting test and is
        undone when the fixture is torn down.
        """
        with patch.dict('os.environ', {
            'YOUTUBE_USERNAME': 'test@example.com',
            'YOUTUBE_PASSWORD': 'test_password',
            'YOUTUBE_CHANNEL_URL': 'https://www.youtube.com/@HVACKnowItAll',
        }):
            yield

    @pytest.fixture
    def sample_video_info(self):
        """Return a representative video-info dict used as mocked yt-dlp output."""
        return {
            'id': 'abc123',
            'title': 'HVAC Maintenance Tips',
            'description': 'Learn how to maintain your HVAC system',
            'uploader': 'HVAC Know It All',
            'upload_date': '20240101',
            'view_count': 1500,
            'like_count': 100,
            'comment_count': 25,
            'duration': 600,
            'webpage_url': 'https://www.youtube.com/watch?v=abc123',
            'thumbnail': 'https://i.ytimg.com/vi/abc123/maxresdefault.jpg',
            'tags': ['hvac', 'maintenance', 'tips'],
        }

    def test_initialization(self, config, mock_env):
        """The scraper exposes its config and the values from the patched env."""
        scraper = YouTubeScraper(config)

        assert scraper.config == config
        assert scraper.username == 'test@example.com'
        assert scraper.password == 'test_password'
        assert scraper.channel_url == 'https://www.youtube.com/@HVACKnowItAll'

    @patch('yt_dlp.YoutubeDL')
    def test_setup_ydl_options(self, mock_ydl_class, config, mock_env):
        """``_get_ydl_options`` returns the expected flags and keys."""
        scraper = YouTubeScraper(config)
        options = scraper._get_ydl_options()

        # Check key options. Identity checks require real booleans, so a
        # merely truthy/falsy value (e.g. 1 or 0) does not pass.
        assert options['quiet'] is True
        assert options['no_warnings'] is True
        assert options['extract_flat'] is False
        assert 'username' in options
        assert 'password' in options
        assert 'cookiefile' in options
        assert 'ratelimit' in options

    @patch('yt_dlp.YoutubeDL')
    def test_fetch_channel_videos(self, mock_ydl_class, config, mock_env,
                                  sample_video_info):
        """``fetch_channel_videos`` returns the entries from one extract call."""
        mock_ydl = MagicMock()
        # YoutubeDL is used as a context manager, so mock what ``with`` yields.
        mock_ydl_class.return_value.__enter__.return_value = mock_ydl

        # Mock channel info with videos
        mock_ydl.extract_info.return_value = {
            'entries': [
                sample_video_info,
                {**sample_video_info, 'id': 'def456', 'title': 'Another Video'},
            ]
        }

        scraper = YouTubeScraper(config)
        videos = scraper.fetch_channel_videos()

        assert len(videos) == 2
        assert videos[0]['id'] == 'abc123'
        assert videos[1]['id'] == 'def456'
        mock_ydl.extract_info.assert_called_once()

    @patch('yt_dlp.YoutubeDL')
    def test_fetch_video_details(self, mock_ydl_class, config, mock_env,
                                 sample_video_info):
        """``fetch_video_details`` queries the watch URL without downloading."""
        mock_ydl = MagicMock()
        mock_ydl_class.return_value.__enter__.return_value = mock_ydl
        mock_ydl.extract_info.return_value = sample_video_info

        scraper = YouTubeScraper(config)
        video_info = scraper.fetch_video_details('abc123')

        assert video_info['id'] == 'abc123'
        assert video_info['title'] == 'HVAC Maintenance Tips'
        mock_ydl.extract_info.assert_called_with(
            'https://www.youtube.com/watch?v=abc123',
            download=False,
        )

    @patch('time.sleep')
    @patch('random.uniform')
    def test_humanized_delay(self, mock_uniform, mock_sleep, config, mock_env):
        """``_humanized_delay`` sleeps for the value drawn from ``uniform(2, 5)``."""
        # Decorators apply bottom-up: the first mock argument is random.uniform.
        mock_uniform.return_value = 3.5

        scraper = YouTubeScraper(config)
        scraper._humanized_delay()

        mock_uniform.assert_called_with(2, 5)
        mock_sleep.assert_called_with(3.5)

    def test_format_video_type(self, config, mock_env):
        """``_get_video_type`` classifies short, regular and live videos."""
        scraper = YouTubeScraper(config)

        # Test short video
        assert scraper._get_video_type({'duration': 50}) == 'short'
        # Test regular video
        assert scraper._get_video_type({'duration': 600}) == 'video'
        # Test live stream
        assert scraper._get_video_type({'is_live': True}) == 'live'
        # Test missing duration
        assert scraper._get_video_type({}) == 'video'

    def test_format_markdown(self, config, mock_env):
        """``format_markdown`` emits one labelled heading per video field."""
        scraper = YouTubeScraper(config)
        videos = [
            {
                'id': 'abc123',
                'title': 'HVAC Tips',
                'description': 'Learn HVAC basics',
                'uploader': 'HVAC Know It All',
                'upload_date': '20240101',
                'view_count': 1500,
                'like_count': 100,
                'comment_count': 25,
                'duration': 600,
                'webpage_url': 'https://www.youtube.com/watch?v=abc123',
                'tags': ['hvac', 'tips'],
                'type': 'video',
            }
        ]

        markdown = scraper.format_markdown(videos)

        assert '# ID: abc123' in markdown
        assert '## Title: HVAC Tips' in markdown
        assert '## Type: video' in markdown
        assert '## Author: HVAC Know It All' in markdown
        assert '## Link: https://www.youtube.com/watch?v=abc123' in markdown
        assert '## Views: 1500' in markdown
        assert '## Likes: 100' in markdown
        assert '## Comments: 25' in markdown
        assert '## Duration: 600 seconds' in markdown
        assert '## Upload Date: 2024-01-01' in markdown
        assert '## Tags: hvac, tips' in markdown

    def test_get_incremental_items(self, config, mock_env):
        """``get_incremental_items`` returns only videos newer than the state."""
        scraper = YouTubeScraper(config)
        videos = [
            {'id': 'video3', 'upload_date': '20240103'},
            {'id': 'video2', 'upload_date': '20240102'},
            {'id': 'video1', 'upload_date': '20240101'},
        ]

        # Test with no previous state
        state = {}
        new_videos = scraper.get_incremental_items(videos, state)
        assert len(new_videos) == 3

        # Test with existing state
        state = {'last_video_id': 'video2', 'last_video_date': '20240102'}
        new_videos = scraper.get_incremental_items(videos, state)
        assert len(new_videos) == 1
        assert new_videos[0]['id'] == 'video3'

    def test_update_state(self, config, mock_env):
        """``update_state`` records the first video's id/date and the count."""
        scraper = YouTubeScraper(config)
        state = {}
        videos = [
            {'id': 'video2', 'upload_date': '20240102'},
            {'id': 'video1', 'upload_date': '20240101'},
        ]

        updated_state = scraper.update_state(state, videos)

        assert updated_state['last_video_id'] == 'video2'
        assert updated_state['last_video_date'] == '20240102'
        assert updated_state['video_count'] == 2

    @patch('yt_dlp.YoutubeDL')
    def test_error_handling(self, mock_ydl_class, config, mock_env):
        """``fetch_channel_videos`` returns ``[]`` when extraction raises."""
        mock_ydl = MagicMock()
        mock_ydl_class.return_value.__enter__.return_value = mock_ydl
        mock_ydl.extract_info.side_effect = Exception("Network error")

        scraper = YouTubeScraper(config)
        videos = scraper.fetch_channel_videos()

        assert videos == []

    @patch('yt_dlp.YoutubeDL')
    @patch('time.sleep')
    def test_fetch_content_with_rate_limiting(self, mock_sleep, mock_ydl_class,
                                              config, mock_env,
                                              sample_video_info):
        """``fetch_content`` returns every video and pauses between fetches."""
        mock_ydl = MagicMock()
        mock_ydl_class.return_value.__enter__.return_value = mock_ydl

        # Mock channel with multiple videos: the first call yields the channel
        # listing, the following calls yield the per-video details.
        mock_ydl.extract_info.side_effect = [
            {'entries': [
                {'id': 'video1', 'title': 'Video 1'},
                {'id': 'video2', 'title': 'Video 2'},
            ]},
            {**sample_video_info, 'id': 'video1'},
            {**sample_video_info, 'id': 'video2'},
        ]

        scraper = YouTubeScraper(config)
        with patch.object(scraper, '_humanized_delay') as mock_delay:
            videos = scraper.fetch_content()

        assert len(videos) == 2
        # Check that delay was called between video fetches (once for second video)
        assert mock_delay.call_count >= 1