#!/usr/bin/env python3 """ Comprehensive test suite for YouTube API scraper with quota management Following TDD principles for robust implementation validation """ import pytest import json import os from unittest.mock import Mock, patch, MagicMock, call from datetime import datetime import pytz from pathlib import Path # Import the scraper import sys sys.path.insert(0, str(Path(__file__).parent.parent)) from src.youtube_api_scraper import YouTubeAPIScraper from src.base_scraper import ScraperConfig class TestYouTubeAPIScraper: """Test suite for YouTube API scraper with quota management""" @pytest.fixture def config(self, tmp_path): """Create test configuration""" return ScraperConfig( source_name='youtube', brand_name='test_brand', data_dir=tmp_path / 'data', logs_dir=tmp_path / 'logs', timezone='America/Halifax' ) @pytest.fixture def mock_env_vars(self, monkeypatch): """Mock environment variables""" monkeypatch.setenv('YOUTUBE_API_KEY', 'test-youtube-api-key') monkeypatch.setenv('YOUTUBE_CHANNEL_URL', 'https://www.youtube.com/@TestChannel') @pytest.fixture def scraper(self, config, mock_env_vars): """Create scraper instance with mocked environment""" with patch('src.youtube_api_scraper.build'): return YouTubeAPIScraper(config) @pytest.fixture def sample_channel_response(self): """Sample channel details response""" return { 'items': [{ 'id': 'UC_test_channel_id', 'snippet': { 'title': 'Test Channel', 'description': 'Test channel description' }, 'statistics': { 'subscriberCount': '10000', 'viewCount': '1000000', 'videoCount': '370' }, 'contentDetails': { 'relatedPlaylists': { 'uploads': 'UU_test_channel_id' } } }] } @pytest.fixture def sample_playlist_response(self): """Sample playlist items response""" return { 'items': [ {'contentDetails': {'videoId': 'video1'}}, {'contentDetails': {'videoId': 'video2'}}, {'contentDetails': {'videoId': 'video3'}} ], 'nextPageToken': None } @pytest.fixture def sample_videos_response(self): """Sample videos details response""" return { 'items': [ { 'id': 'video1', 'snippet': { 'title': 'HVAC Maintenance Tips', 'description': 'Complete guide to maintaining your HVAC system for optimal performance and longevity.', 'publishedAt': '2025-08-15T10:00:00Z', 'channelId': 'UC_test_channel_id', 'channelTitle': 'Test Channel', 'tags': ['hvac', 'maintenance', 'tips', 'guide'], 'thumbnails': { 'maxres': {'url': 'https://thumbnail.url/maxres.jpg'} } }, 'statistics': { 'viewCount': '50000', 'likeCount': '1500', 'commentCount': '200' }, 'contentDetails': { 'duration': 'PT10M30S', 'definition': 'hd' } }, { 'id': 'video2', 'snippet': { 'title': 'Heat Pump Installation', 'description': 'Step by step heat pump installation tutorial.', 'publishedAt': '2025-08-10T10:00:00Z', 'channelId': 'UC_test_channel_id', 'channelTitle': 'Test Channel', 'tags': ['heat pump', 'installation'], 'thumbnails': { 'high': {'url': 'https://thumbnail.url/high.jpg'} } }, 'statistics': { 'viewCount': '30000', 'likeCount': '800', 'commentCount': '150' }, 'contentDetails': { 'duration': 'PT15M45S', 'definition': 'hd' } } ] } @pytest.fixture def sample_transcript(self): """Sample transcript data""" return [ {'text': 'Welcome to this HVAC maintenance guide.', 'start': 0.0, 'duration': 3.0}, {'text': 'Today we will cover essential maintenance tips.', 'start': 3.0, 'duration': 4.0}, {'text': 'Regular maintenance extends system life.', 'start': 7.0, 'duration': 3.5} ] def test_initialization(self, config, mock_env_vars): """Test scraper initialization""" with patch('src.youtube_api_scraper.build') as mock_build: scraper = YouTubeAPIScraper(config) assert scraper.api_key == 'test-youtube-api-key' assert scraper.channel_url == 'https://www.youtube.com/@TestChannel' assert scraper.daily_quota_limit == 10000 assert scraper.quota_used == 0 assert scraper.max_transcripts_per_run == 50 mock_build.assert_called_once_with('youtube', 'v3', developerKey='test-youtube-api-key') def test_missing_api_key(self, config, monkeypatch): """Test initialization fails without API key""" monkeypatch.delenv('YOUTUBE_API_KEY', raising=False) with pytest.raises(ValueError, match="YOUTUBE_API_KEY not found"): YouTubeAPIScraper(config) def test_quota_tracking(self, scraper): """Test quota tracking mechanism""" # Test successful quota allocation assert scraper._track_quota('channels_list') is True assert scraper.quota_used == 1 assert scraper._track_quota('playlist_items', 5) is True assert scraper.quota_used == 6 assert scraper._track_quota('search') is True assert scraper.quota_used == 106 # Test quota limit prevention scraper.quota_used = 9999 assert scraper._track_quota('search') is False # Would exceed limit assert scraper.quota_used == 9999 # Unchanged def test_get_channel_info_by_handle(self, scraper, sample_channel_response): """Test getting channel info by handle""" scraper.youtube = Mock() mock_channels = Mock() scraper.youtube.channels.return_value = mock_channels mock_channels.list.return_value.execute.return_value = sample_channel_response result = scraper._get_channel_info() assert result is True assert scraper.channel_id == 'UC_test_channel_id' assert scraper.uploads_playlist_id == 'UU_test_channel_id' assert scraper.quota_used == 1 mock_channels.list.assert_called_once_with( part='snippet,statistics,contentDetails', forHandle='TestChannel' ) def test_get_channel_info_fallback_search(self, scraper): """Test channel search fallback when handle lookup fails""" scraper.youtube = Mock() # First attempt fails mock_channels = Mock() scraper.youtube.channels.return_value = mock_channels mock_channels.list.return_value.execute.return_value = {'items': []} # Search succeeds mock_search = Mock() scraper.youtube.search.return_value = mock_search search_response = { 'items': [{ 'snippet': {'channelId': 'UC_found_channel'} }] } mock_search.list.return_value.execute.return_value = search_response # Second channel lookup succeeds channel_response = { 'items': [{ 'id': 'UC_found_channel', 'snippet': {'title': 'Found Channel'}, 'statistics': {'subscriberCount': '5000', 'videoCount': '100'}, 'contentDetails': {'relatedPlaylists': {'uploads': 'UU_found_channel'}} }] } mock_channels.list.return_value.execute.side_effect = [{'items': []}, channel_response] result = scraper._get_channel_info() assert result is True assert scraper.channel_id == 'UC_found_channel' assert scraper.quota_used == 102 # 1 (failed) + 100 (search) + 1 (success) def test_fetch_all_video_ids(self, scraper, sample_playlist_response): """Test fetching all video IDs from channel""" scraper.channel_id = 'UC_test_channel_id' scraper.uploads_playlist_id = 'UU_test_channel_id' scraper.youtube = Mock() mock_playlist_items = Mock() scraper.youtube.playlistItems.return_value = mock_playlist_items mock_playlist_items.list.return_value.execute.return_value = sample_playlist_response video_ids = scraper._fetch_all_video_ids() assert len(video_ids) == 3 assert video_ids == ['video1', 'video2', 'video3'] assert scraper.quota_used == 1 def test_fetch_all_video_ids_with_pagination(self, scraper): """Test fetching video IDs with pagination""" scraper.channel_id = 'UC_test_channel_id' scraper.uploads_playlist_id = 'UU_test_channel_id' scraper.youtube = Mock() mock_playlist_items = Mock() scraper.youtube.playlistItems.return_value = mock_playlist_items # Simulate 2 pages of results page1 = { 'items': [{'contentDetails': {'videoId': f'video{i}'}} for i in range(1, 51)], 'nextPageToken': 'token2' } page2 = { 'items': [{'contentDetails': {'videoId': f'video{i}'}} for i in range(51, 71)], 'nextPageToken': None } mock_playlist_items.list.return_value.execute.side_effect = [page1, page2] video_ids = scraper._fetch_all_video_ids(max_videos=60) assert len(video_ids) == 60 assert scraper.quota_used == 2 # 2 API calls def test_fetch_video_details_batch(self, scraper, sample_videos_response): """Test fetching video details in batches""" scraper.youtube = Mock() mock_videos = Mock() scraper.youtube.videos.return_value = mock_videos mock_videos.list.return_value.execute.return_value = sample_videos_response video_ids = ['video1', 'video2'] videos = scraper._fetch_video_details_batch(video_ids) assert len(videos) == 2 assert videos[0]['id'] == 'video1' assert videos[0]['title'] == 'HVAC Maintenance Tips' assert videos[0]['view_count'] == 50000 assert videos[0]['engagement_rate'] > 0 assert scraper.quota_used == 1 @patch('src.youtube_api_scraper.YouTubeTranscriptApi') def test_fetch_transcript_success(self, mock_transcript_api, scraper, sample_transcript): """Test successful transcript fetching""" # Mock the class method get_transcript mock_transcript_api.get_transcript.return_value = sample_transcript transcript = scraper._fetch_transcript('video1') assert transcript is not None assert 'Welcome to this HVAC maintenance guide' in transcript assert 'Regular maintenance extends system life' in transcript mock_transcript_api.get_transcript.assert_called_once_with('video1') @patch('src.youtube_api_scraper.YouTubeTranscriptApi') def test_fetch_transcript_failure(self, mock_transcript_api, scraper): """Test transcript fetching when unavailable""" # Mock the class method to raise an exception mock_transcript_api.get_transcript.side_effect = Exception("No transcript available") transcript = scraper._fetch_transcript('video_no_transcript') assert transcript is None @patch.object(YouTubeAPIScraper, '_fetch_transcript') @patch.object(YouTubeAPIScraper, '_fetch_video_details_batch') @patch.object(YouTubeAPIScraper, '_fetch_all_video_ids') @patch.object(YouTubeAPIScraper, '_get_channel_info') def test_fetch_content_full_flow(self, mock_channel_info, mock_video_ids, mock_details, mock_transcript, scraper): """Test complete content fetching flow""" # Setup mocks mock_channel_info.return_value = True mock_video_ids.return_value = ['video1', 'video2', 'video3'] mock_details.return_value = [ {'id': 'video1', 'title': 'Video 1', 'view_count': 50000}, {'id': 'video2', 'title': 'Video 2', 'view_count': 30000}, {'id': 'video3', 'title': 'Video 3', 'view_count': 10000} ] mock_transcript.return_value = 'Sample transcript text' videos = scraper.fetch_content(max_posts=3, fetch_transcripts=True) assert len(videos) == 3 assert mock_video_ids.called assert mock_details.called # Should fetch transcripts for top 3 videos (or max_transcripts_per_run) assert mock_transcript.call_count == 3 def test_quota_limit_enforcement(self, scraper): """Test that quota limits are enforced""" scraper.quota_used = 9950 # This should succeed (costs 1 unit) assert scraper._track_quota('videos_list') is True assert scraper.quota_used == 9951 # This should fail (would cost 100 units) assert scraper._track_quota('search') is False assert scraper.quota_used == 9951 # Unchanged def test_get_video_type(self, scraper): """Test video type determination based on duration""" # Short video (< 60 seconds) assert scraper._get_video_type({'duration': 'PT30S'}) == 'short' # Regular video assert scraper._get_video_type({'duration': 'PT5M30S'}) == 'video' # Long video (> 10 minutes) assert scraper._get_video_type({'duration': 'PT15M0S'}) == 'video' assert scraper._get_video_type({'duration': 'PT1H30M0S'}) == 'video' def test_format_markdown(self, scraper): """Test markdown formatting with enhanced data""" videos = [{ 'id': 'test_video', 'title': 'Test Video Title', 'published_at': '2025-08-15T10:00:00Z', 'channel_title': 'Test Channel', 'duration': 'PT10M30S', 'view_count': 50000, 'like_count': 1500, 'comment_count': 200, 'engagement_rate': 3.4, 'like_ratio': 3.0, 'tags': ['tag1', 'tag2', 'tag3'], 'thumbnail': 'https://thumbnail.url', 'description': 'Full untruncated description of the video', 'transcript': 'This is the transcript text' }] markdown = scraper.format_markdown(videos) assert '# ID: test_video' in markdown assert '## Title: Test Video Title' in markdown assert '## Type: video' in markdown assert '## Views: 50,000' in markdown assert '## Likes: 1,500' in markdown assert '## Comments: 200' in markdown assert '## Engagement Rate: 3.40%' in markdown assert '## Like Ratio: 3.00%' in markdown assert '## Tags: tag1, tag2, tag3' in markdown assert '## Description:' in markdown assert 'Full untruncated description' in markdown assert '## Transcript:' in markdown assert 'This is the transcript text' in markdown def test_incremental_items(self, scraper): """Test getting incremental items since last sync""" items = [ {'id': 'new_video', 'published_at': '2025-08-20'}, {'id': 'last_video', 'published_at': '2025-08-15'}, {'id': 'old_video', 'published_at': '2025-08-10'} ] # No state - return all new_items = scraper.get_incremental_items(items, {}) assert len(new_items) == 3 # With state - return only new state = { 'last_video_id': 'last_video', 'last_published': '2025-08-15' } new_items = scraper.get_incremental_items(items, state) assert len(new_items) == 1 assert new_items[0]['id'] == 'new_video' def test_update_state(self, scraper): """Test state update with latest video info""" items = [ {'id': 'latest_video', 'title': 'Latest Video', 'published_at': '2025-08-20'}, {'id': 'older_video', 'title': 'Older Video', 'published_at': '2025-08-15'} ] state = scraper.update_state({}, items) assert state['last_video_id'] == 'latest_video' assert state['last_published'] == '2025-08-20' assert state['last_video_title'] == 'Latest Video' assert state['video_count'] == 2 assert state['quota_used'] == 0 assert 'last_sync' in state def test_efficient_quota_usage_for_370_videos(self, scraper): """Test that fetching 370 videos uses minimal quota""" scraper.channel_id = 'UC_test' scraper.uploads_playlist_id = 'UU_test' # Simulate fetching 370 videos # 370 videos / 50 per page = 8 pages for playlist items for _ in range(8): scraper._track_quota('playlist_items') # 370 videos / 50 per batch = 8 batches for video details for _ in range(8): scraper._track_quota('videos_list') # Total quota should be very low assert scraper.quota_used == 16 # 8 + 8 assert scraper.quota_used < 20 # Well under daily limit # We can afford many transcripts with remaining quota remaining = scraper.daily_quota_limit - scraper.quota_used assert remaining > 9900 # Plenty of quota left if __name__ == "__main__": pytest.main([__file__, "-v"])