hvac-kia-content/tests/test_youtube_api_scraper.py
Ben Reed daab901e35 refactor: Update naming convention from hvacknowitall to hkia
Major Changes:
- Updated all code references from hvacknowitall/hvacnkowitall to hkia
- Renamed all existing markdown files to use hkia_ prefix
- Updated configuration files, scrapers, and production scripts
- Modified systemd service descriptions to use HKIA
- Changed NAS sync path to /mnt/nas/hkia

Files Updated:
- 20+ source files updated with new naming convention
- 34 markdown files renamed to hkia_* format
- All ScraperConfig brand_name parameters now use 'hkia'
- Documentation updated to reflect new naming

Rationale:
- Shorter, cleaner filenames
- Consistent branding across all outputs
- Easier to type and reference
- Maintains same functionality with improved naming

Next Steps:
- Deploy updated services to production
- Update any external references to old naming
- Monitor scrapers to ensure proper operation

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-19 13:35:23 -03:00

462 lines
No EOL
18 KiB
Python

#!/usr/bin/env python3
"""
Comprehensive test suite for YouTube API scraper with quota management
Following TDD principles for robust implementation validation
"""
import pytest
import json
import os
from unittest.mock import Mock, patch, MagicMock, call
from datetime import datetime
import pytz
from pathlib import Path
# Import the scraper
import sys
sys.path.insert(0, str(Path(__file__).parent.parent))
from src.youtube_api_scraper import YouTubeAPIScraper
from src.base_scraper import ScraperConfig
class TestYouTubeAPIScraper:
    """Test suite for YouTube API scraper with quota management.

    All tests mock the googleapiclient service object (``scraper.youtube``)
    and/or the transcript API, so no network access or real API key is needed.
    Quota numbers asserted below follow the scraper's internal cost table
    (observed from the tests themselves: ordinary list calls cost 1 unit,
    ``search`` costs 100 units, daily limit 10000 — TODO confirm against
    ``_track_quota``'s implementation in src/youtube_api_scraper.py).
    """

    @pytest.fixture
    def config(self, tmp_path):
        """Create test configuration"""
        # tmp_path gives each test an isolated data/logs directory tree.
        return ScraperConfig(
            source_name='youtube',
            brand_name='test_brand',
            data_dir=tmp_path / 'data',
            logs_dir=tmp_path / 'logs',
            timezone='America/Halifax'
        )

    @pytest.fixture
    def mock_env_vars(self, monkeypatch):
        """Mock environment variables"""
        # monkeypatch restores the environment automatically after each test.
        monkeypatch.setenv('YOUTUBE_API_KEY', 'test-youtube-api-key')
        monkeypatch.setenv('YOUTUBE_CHANNEL_URL', 'https://www.youtube.com/@TestChannel')

    @pytest.fixture
    def scraper(self, config, mock_env_vars):
        """Create scraper instance with mocked environment"""
        # Patch googleapiclient's `build` factory so the constructor never
        # creates a real API client; tests replace `scraper.youtube` as needed.
        with patch('src.youtube_api_scraper.build'):
            return YouTubeAPIScraper(config)

    @pytest.fixture
    def sample_channel_response(self):
        """Sample channel details response"""
        # Shape mirrors youtube.channels().list(part='snippet,statistics,contentDetails').
        return {
            'items': [{
                'id': 'UC_test_channel_id',
                'snippet': {
                    'title': 'Test Channel',
                    'description': 'Test channel description'
                },
                'statistics': {
                    'subscriberCount': '10000',
                    'viewCount': '1000000',
                    'videoCount': '370'
                },
                'contentDetails': {
                    'relatedPlaylists': {
                        # The "uploads" playlist id is the channel id with the
                        # UC prefix swapped for UU, per YouTube convention.
                        'uploads': 'UU_test_channel_id'
                    }
                }
            }]
        }

    @pytest.fixture
    def sample_playlist_response(self):
        """Sample playlist items response"""
        # Single page of playlistItems results; nextPageToken=None ends paging.
        return {
            'items': [
                {'contentDetails': {'videoId': 'video1'}},
                {'contentDetails': {'videoId': 'video2'}},
                {'contentDetails': {'videoId': 'video3'}}
            ],
            'nextPageToken': None
        }

    @pytest.fixture
    def sample_videos_response(self):
        """Sample videos details response"""
        # Two videos with differing thumbnail tiers (maxres vs high) so the
        # scraper's thumbnail-selection fallback can be exercised.
        return {
            'items': [
                {
                    'id': 'video1',
                    'snippet': {
                        'title': 'HVAC Maintenance Tips',
                        'description': 'Complete guide to maintaining your HVAC system for optimal performance and longevity.',
                        'publishedAt': '2025-08-15T10:00:00Z',
                        'channelId': 'UC_test_channel_id',
                        'channelTitle': 'Test Channel',
                        'tags': ['hvac', 'maintenance', 'tips', 'guide'],
                        'thumbnails': {
                            'maxres': {'url': 'https://thumbnail.url/maxres.jpg'}
                        }
                    },
                    'statistics': {
                        'viewCount': '50000',
                        'likeCount': '1500',
                        'commentCount': '200'
                    },
                    'contentDetails': {
                        # ISO-8601 duration: 10 minutes 30 seconds.
                        'duration': 'PT10M30S',
                        'definition': 'hd'
                    }
                },
                {
                    'id': 'video2',
                    'snippet': {
                        'title': 'Heat Pump Installation',
                        'description': 'Step by step heat pump installation tutorial.',
                        'publishedAt': '2025-08-10T10:00:00Z',
                        'channelId': 'UC_test_channel_id',
                        'channelTitle': 'Test Channel',
                        'tags': ['heat pump', 'installation'],
                        'thumbnails': {
                            'high': {'url': 'https://thumbnail.url/high.jpg'}
                        }
                    },
                    'statistics': {
                        'viewCount': '30000',
                        'likeCount': '800',
                        'commentCount': '150'
                    },
                    'contentDetails': {
                        'duration': 'PT15M45S',
                        'definition': 'hd'
                    }
                }
            ]
        }

    @pytest.fixture
    def sample_transcript(self):
        """Sample transcript data"""
        # Matches youtube_transcript_api's list-of-dict segment format.
        return [
            {'text': 'Welcome to this HVAC maintenance guide.', 'start': 0.0, 'duration': 3.0},
            {'text': 'Today we will cover essential maintenance tips.', 'start': 3.0, 'duration': 4.0},
            {'text': 'Regular maintenance extends system life.', 'start': 7.0, 'duration': 3.5}
        ]

    def test_initialization(self, config, mock_env_vars):
        """Test scraper initialization"""
        with patch('src.youtube_api_scraper.build') as mock_build:
            scraper = YouTubeAPIScraper(config)
            # Credentials and channel URL come from the mocked environment.
            assert scraper.api_key == 'test-youtube-api-key'
            assert scraper.channel_url == 'https://www.youtube.com/@TestChannel'
            # Defaults: standard YouTube Data API daily quota, fresh counter,
            # and a cap on transcripts fetched per run.
            assert scraper.daily_quota_limit == 10000
            assert scraper.quota_used == 0
            assert scraper.max_transcripts_per_run == 50
            mock_build.assert_called_once_with('youtube', 'v3', developerKey='test-youtube-api-key')

    def test_missing_api_key(self, config, monkeypatch):
        """Test initialization fails without API key"""
        # raising=False: do not error if the variable was never set.
        monkeypatch.delenv('YOUTUBE_API_KEY', raising=False)
        with pytest.raises(ValueError, match="YOUTUBE_API_KEY not found"):
            YouTubeAPIScraper(config)

    def test_quota_tracking(self, scraper):
        """Test quota tracking mechanism"""
        # Test successful quota allocation
        assert scraper._track_quota('channels_list') is True
        assert scraper.quota_used == 1
        # Second argument is a call count/multiplier: 5 playlist pages = 5 units.
        assert scraper._track_quota('playlist_items', 5) is True
        assert scraper.quota_used == 6
        # search.list is the expensive operation: 100 units per call.
        assert scraper._track_quota('search') is True
        assert scraper.quota_used == 106
        # Test quota limit prevention
        scraper.quota_used = 9999
        assert scraper._track_quota('search') is False  # Would exceed limit
        assert scraper.quota_used == 9999  # Unchanged

    def test_get_channel_info_by_handle(self, scraper, sample_channel_response):
        """Test getting channel info by handle"""
        scraper.youtube = Mock()
        mock_channels = Mock()
        scraper.youtube.channels.return_value = mock_channels
        mock_channels.list.return_value.execute.return_value = sample_channel_response
        result = scraper._get_channel_info()
        assert result is True
        assert scraper.channel_id == 'UC_test_channel_id'
        assert scraper.uploads_playlist_id == 'UU_test_channel_id'
        # A single channels.list lookup costs 1 unit.
        assert scraper.quota_used == 1
        # The handle is extracted from the @TestChannel URL set in mock_env_vars.
        mock_channels.list.assert_called_once_with(
            part='snippet,statistics,contentDetails',
            forHandle='TestChannel'
        )

    def test_get_channel_info_fallback_search(self, scraper):
        """Test channel search fallback when handle lookup fails"""
        scraper.youtube = Mock()
        # First attempt fails
        mock_channels = Mock()
        scraper.youtube.channels.return_value = mock_channels
        # NOTE(review): this return_value assignment is superseded by the
        # side_effect sequence set a few lines below; kept for clarity only.
        mock_channels.list.return_value.execute.return_value = {'items': []}
        # Search succeeds
        mock_search = Mock()
        scraper.youtube.search.return_value = mock_search
        search_response = {
            'items': [{
                'snippet': {'channelId': 'UC_found_channel'}
            }]
        }
        mock_search.list.return_value.execute.return_value = search_response
        # Second channel lookup succeeds
        channel_response = {
            'items': [{
                'id': 'UC_found_channel',
                'snippet': {'title': 'Found Channel'},
                'statistics': {'subscriberCount': '5000', 'videoCount': '100'},
                'contentDetails': {'relatedPlaylists': {'uploads': 'UU_found_channel'}}
            }]
        }
        # side_effect sequences the two channels.list calls: empty, then found.
        mock_channels.list.return_value.execute.side_effect = [{'items': []}, channel_response]
        result = scraper._get_channel_info()
        assert result is True
        assert scraper.channel_id == 'UC_found_channel'
        assert scraper.quota_used == 102  # 1 (failed) + 100 (search) + 1 (success)

    def test_fetch_all_video_ids(self, scraper, sample_playlist_response):
        """Test fetching all video IDs from channel"""
        scraper.channel_id = 'UC_test_channel_id'
        scraper.uploads_playlist_id = 'UU_test_channel_id'
        scraper.youtube = Mock()
        mock_playlist_items = Mock()
        scraper.youtube.playlistItems.return_value = mock_playlist_items
        mock_playlist_items.list.return_value.execute.return_value = sample_playlist_response
        video_ids = scraper._fetch_all_video_ids()
        assert len(video_ids) == 3
        assert video_ids == ['video1', 'video2', 'video3']
        # One playlistItems.list page = 1 quota unit.
        assert scraper.quota_used == 1

    def test_fetch_all_video_ids_with_pagination(self, scraper):
        """Test fetching video IDs with pagination"""
        scraper.channel_id = 'UC_test_channel_id'
        scraper.uploads_playlist_id = 'UU_test_channel_id'
        scraper.youtube = Mock()
        mock_playlist_items = Mock()
        scraper.youtube.playlistItems.return_value = mock_playlist_items
        # Simulate 2 pages of results
        # Page 1: videos 1-50 (the API's max page size) plus a continuation token.
        page1 = {
            'items': [{'contentDetails': {'videoId': f'video{i}'}} for i in range(1, 51)],
            'nextPageToken': 'token2'
        }
        # Page 2: videos 51-70; None token terminates the pagination loop.
        page2 = {
            'items': [{'contentDetails': {'videoId': f'video{i}'}} for i in range(51, 71)],
            'nextPageToken': None
        }
        mock_playlist_items.list.return_value.execute.side_effect = [page1, page2]
        # max_videos truncates the 70 available ids down to 60.
        video_ids = scraper._fetch_all_video_ids(max_videos=60)
        assert len(video_ids) == 60
        assert scraper.quota_used == 2  # 2 API calls

    def test_fetch_video_details_batch(self, scraper, sample_videos_response):
        """Test fetching video details in batches"""
        scraper.youtube = Mock()
        mock_videos = Mock()
        scraper.youtube.videos.return_value = mock_videos
        mock_videos.list.return_value.execute.return_value = sample_videos_response
        video_ids = ['video1', 'video2']
        videos = scraper._fetch_video_details_batch(video_ids)
        assert len(videos) == 2
        # The scraper flattens the API response into a simpler dict and
        # derives engagement metrics from the raw statistics.
        assert videos[0]['id'] == 'video1'
        assert videos[0]['title'] == 'HVAC Maintenance Tips'
        assert videos[0]['view_count'] == 50000
        assert videos[0]['engagement_rate'] > 0
        # Up to 50 ids fit in one videos.list call, so this batch costs 1 unit.
        assert scraper.quota_used == 1

    @patch('src.youtube_api_scraper.YouTubeTranscriptApi')
    def test_fetch_transcript_success(self, mock_transcript_api, scraper, sample_transcript):
        """Test successful transcript fetching"""
        # Mock the class method get_transcript
        mock_transcript_api.get_transcript.return_value = sample_transcript
        transcript = scraper._fetch_transcript('video1')
        assert transcript is not None
        # Segments are joined into one text blob (punctuation handling aside).
        assert 'Welcome to this HVAC maintenance guide' in transcript
        assert 'Regular maintenance extends system life' in transcript
        mock_transcript_api.get_transcript.assert_called_once_with('video1')

    @patch('src.youtube_api_scraper.YouTubeTranscriptApi')
    def test_fetch_transcript_failure(self, mock_transcript_api, scraper):
        """Test transcript fetching when unavailable"""
        # Mock the class method to raise an exception
        mock_transcript_api.get_transcript.side_effect = Exception("No transcript available")
        # Failures must degrade gracefully to None, not propagate.
        transcript = scraper._fetch_transcript('video_no_transcript')
        assert transcript is None

    # Stacked @patch decorators apply bottom-up, so the mock arguments arrive
    # in reverse order: _get_channel_info first, _fetch_transcript last.
    @patch.object(YouTubeAPIScraper, '_fetch_transcript')
    @patch.object(YouTubeAPIScraper, '_fetch_video_details_batch')
    @patch.object(YouTubeAPIScraper, '_fetch_all_video_ids')
    @patch.object(YouTubeAPIScraper, '_get_channel_info')
    def test_fetch_content_full_flow(self, mock_channel_info, mock_video_ids,
                                     mock_details, mock_transcript, scraper):
        """Test complete content fetching flow"""
        # Setup mocks
        mock_channel_info.return_value = True
        mock_video_ids.return_value = ['video1', 'video2', 'video3']
        mock_details.return_value = [
            {'id': 'video1', 'title': 'Video 1', 'view_count': 50000},
            {'id': 'video2', 'title': 'Video 2', 'view_count': 30000},
            {'id': 'video3', 'title': 'Video 3', 'view_count': 10000}
        ]
        mock_transcript.return_value = 'Sample transcript text'
        videos = scraper.fetch_content(max_posts=3, fetch_transcripts=True)
        assert len(videos) == 3
        assert mock_video_ids.called
        assert mock_details.called
        # Should fetch transcripts for top 3 videos (or max_transcripts_per_run)
        assert mock_transcript.call_count == 3

    def test_quota_limit_enforcement(self, scraper):
        """Test that quota limits are enforced"""
        scraper.quota_used = 9950
        # This should succeed (costs 1 unit)
        assert scraper._track_quota('videos_list') is True
        assert scraper.quota_used == 9951
        # This should fail (would cost 100 units)
        assert scraper._track_quota('search') is False
        assert scraper.quota_used == 9951  # Unchanged

    def test_get_video_type(self, scraper):
        """Test video type determination based on duration"""
        # Durations are ISO-8601 strings as returned by the API.
        # Short video (< 60 seconds)
        assert scraper._get_video_type({'duration': 'PT30S'}) == 'short'
        # Regular video
        assert scraper._get_video_type({'duration': 'PT5M30S'}) == 'video'
        # Long video (> 10 minutes)
        assert scraper._get_video_type({'duration': 'PT15M0S'}) == 'video'
        assert scraper._get_video_type({'duration': 'PT1H30M0S'}) == 'video'

    def test_format_markdown(self, scraper):
        """Test markdown formatting with enhanced data"""
        videos = [{
            'id': 'test_video',
            'title': 'Test Video Title',
            'published_at': '2025-08-15T10:00:00Z',
            'channel_title': 'Test Channel',
            'duration': 'PT10M30S',
            'view_count': 50000,
            'like_count': 1500,
            'comment_count': 200,
            'engagement_rate': 3.4,
            'like_ratio': 3.0,
            'tags': ['tag1', 'tag2', 'tag3'],
            'thumbnail': 'https://thumbnail.url',
            'description': 'Full untruncated description of the video',
            'transcript': 'This is the transcript text'
        }]
        markdown = scraper.format_markdown(videos)
        # Heading hierarchy and formatting conventions:
        # counts use thousands separators, rates are rendered to 2 decimals.
        assert '# ID: test_video' in markdown
        assert '## Title: Test Video Title' in markdown
        assert '## Type: video' in markdown
        assert '## Views: 50,000' in markdown
        assert '## Likes: 1,500' in markdown
        assert '## Comments: 200' in markdown
        assert '## Engagement Rate: 3.40%' in markdown
        assert '## Like Ratio: 3.00%' in markdown
        assert '## Tags: tag1, tag2, tag3' in markdown
        assert '## Description:' in markdown
        assert 'Full untruncated description' in markdown
        assert '## Transcript:' in markdown
        assert 'This is the transcript text' in markdown

    def test_incremental_items(self, scraper):
        """Test getting incremental items since last sync"""
        # Items are newest-first; the state marks where the last sync stopped.
        items = [
            {'id': 'new_video', 'published_at': '2025-08-20'},
            {'id': 'last_video', 'published_at': '2025-08-15'},
            {'id': 'old_video', 'published_at': '2025-08-10'}
        ]
        # No state - return all
        new_items = scraper.get_incremental_items(items, {})
        assert len(new_items) == 3
        # With state - return only new
        state = {
            'last_video_id': 'last_video',
            'last_published': '2025-08-15'
        }
        new_items = scraper.get_incremental_items(items, state)
        assert len(new_items) == 1
        assert new_items[0]['id'] == 'new_video'

    def test_update_state(self, scraper):
        """Test state update with latest video info"""
        items = [
            {'id': 'latest_video', 'title': 'Latest Video', 'published_at': '2025-08-20'},
            {'id': 'older_video', 'title': 'Older Video', 'published_at': '2025-08-15'}
        ]
        state = scraper.update_state({}, items)
        # State records the newest item (first in the list) plus run metadata.
        assert state['last_video_id'] == 'latest_video'
        assert state['last_published'] == '2025-08-20'
        assert state['last_video_title'] == 'Latest Video'
        assert state['video_count'] == 2
        assert state['quota_used'] == 0
        assert 'last_sync' in state

    def test_efficient_quota_usage_for_370_videos(self, scraper):
        """Test that fetching 370 videos uses minimal quota"""
        scraper.channel_id = 'UC_test'
        scraper.uploads_playlist_id = 'UU_test'
        # Simulate fetching 370 videos
        # 370 videos / 50 per page = 8 pages for playlist items
        for _ in range(8):
            scraper._track_quota('playlist_items')
        # 370 videos / 50 per batch = 8 batches for video details
        for _ in range(8):
            scraper._track_quota('videos_list')
        # Total quota should be very low
        assert scraper.quota_used == 16  # 8 + 8
        assert scraper.quota_used < 20  # Well under daily limit
        # We can afford many transcripts with remaining quota
        remaining = scraper.daily_quota_limit - scraper.quota_used
        assert remaining > 9900  # Plenty of quota left
# Allow running this test module directly, outside a pytest invocation.
if __name__ == "__main__":
    pytest.main(["-v", __file__])