hvac-kia-content/tests/test_youtube_api_scraper.py
Ben Reed daab901e35 refactor: Update naming convention from hvacknowitall to hkia
Major Changes:
- Updated all code references from hvacknowitall/hvacnkowitall to hkia
- Renamed all existing markdown files to use hkia_ prefix
- Updated configuration files, scrapers, and production scripts
- Modified systemd service descriptions to use HKIA
- Changed NAS sync path to /mnt/nas/hkia

Files Updated:
- 20+ source files updated with new naming convention
- 34 markdown files renamed to hkia_* format
- All ScraperConfig brand_name parameters now use 'hkia'
- Documentation updated to reflect new naming

Rationale:
- Shorter, cleaner filenames
- Consistent branding across all outputs
- Easier to type and reference
- Maintains same functionality with improved naming

Next Steps:
- Deploy updated services to production
- Update any external references to old naming
- Monitor scrapers to ensure proper operation

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-19 13:35:23 -03:00

462 lines
No EOL
18 KiB
Python

#!/usr/bin/env python3
"""
Comprehensive test suite for YouTube API scraper with quota management
Following TDD principles for robust implementation validation
"""
import pytest
import json
import os
from unittest.mock import Mock, patch, MagicMock, call
from datetime import datetime
import pytz
from pathlib import Path
# Import the scraper
import sys
sys.path.insert(0, str(Path(__file__).parent.parent))
from src.youtube_api_scraper import YouTubeAPIScraper
from src.base_scraper import ScraperConfig
class TestYouTubeAPIScraper:
    """Test suite for YouTube API scraper with quota management.

    All tests mock the googleapiclient service object (``scraper.youtube``)
    and/or the transcript API, so no network access or real API key is needed.
    Quota numbers asserted below follow the scraper's internal cost table
    (observed from the tests themselves: ordinary list calls cost 1 unit,
    ``search`` costs 100 units, daily limit 10000 — TODO confirm against
    ``_track_quota``'s implementation in src/youtube_api_scraper.py).
    """

    @pytest.fixture
    def config(self, tmp_path):
        """Create test configuration"""
        # tmp_path gives each test an isolated data/logs directory tree.
        return ScraperConfig(
            source_name='youtube',
            brand_name='test_brand',
            data_dir=tmp_path / 'data',
            logs_dir=tmp_path / 'logs',
            timezone='America/Halifax'
        )

    @pytest.fixture
    def mock_env_vars(self, monkeypatch):
        """Mock environment variables"""
        # monkeypatch restores the environment automatically after each test.
        monkeypatch.setenv('YOUTUBE_API_KEY', 'test-youtube-api-key')
        monkeypatch.setenv('YOUTUBE_CHANNEL_URL', 'https://www.youtube.com/@TestChannel')

    @pytest.fixture
    def scraper(self, config, mock_env_vars):
        """Create scraper instance with mocked environment"""
        # Patch googleapiclient's `build` factory so the constructor never
        # creates a real API client; tests replace `scraper.youtube` as needed.
        with patch('src.youtube_api_scraper.build'):
            return YouTubeAPIScraper(config)

    @pytest.fixture
    def sample_channel_response(self):
        """Sample channel details response"""
        # Shape mirrors youtube.channels().list(part='snippet,statistics,contentDetails').
        return {
            'items': [{
                'id': 'UC_test_channel_id',
                'snippet': {
                    'title': 'Test Channel',
                    'description': 'Test channel description'
                },
                'statistics': {
                    'subscriberCount': '10000',
                    'viewCount': '1000000',
                    'videoCount': '370'
                },
                'contentDetails': {
                    'relatedPlaylists': {
                        # The "uploads" playlist id is the channel id with the
                        # UC prefix swapped for UU, per YouTube convention.
                        'uploads': 'UU_test_channel_id'
                    }
                }
            }]
        }

    @pytest.fixture
    def sample_playlist_response(self):
        """Sample playlist items response"""
        # Single page of playlistItems results; nextPageToken=None ends paging.
        return {
            'items': [
                {'contentDetails': {'videoId': 'video1'}},
                {'contentDetails': {'videoId': 'video2'}},
                {'contentDetails': {'videoId': 'video3'}}
            ],
            'nextPageToken': None
        }

    @pytest.fixture
    def sample_videos_response(self):
        """Sample videos details response"""
        # Two videos with differing thumbnail tiers (maxres vs high) so the
        # scraper's thumbnail-selection fallback can be exercised.
        return {
            'items': [
                {
                    'id': 'video1',
                    'snippet': {
                        'title': 'HVAC Maintenance Tips',
                        'description': 'Complete guide to maintaining your HVAC system for optimal performance and longevity.',
                        'publishedAt': '2025-08-15T10:00:00Z',
                        'channelId': 'UC_test_channel_id',
                        'channelTitle': 'Test Channel',
                        'tags': ['hvac', 'maintenance', 'tips', 'guide'],
                        'thumbnails': {
                            'maxres': {'url': 'https://thumbnail.url/maxres.jpg'}
                        }
                    },
                    'statistics': {
                        'viewCount': '50000',
                        'likeCount': '1500',
                        'commentCount': '200'
                    },
                    'contentDetails': {
                        # ISO-8601 duration: 10 minutes 30 seconds.
                        'duration': 'PT10M30S',
                        'definition': 'hd'
                    }
                },
                {
                    'id': 'video2',
                    'snippet': {
                        'title': 'Heat Pump Installation',
                        'description': 'Step by step heat pump installation tutorial.',
                        'publishedAt': '2025-08-10T10:00:00Z',
                        'channelId': 'UC_test_channel_id',
                        'channelTitle': 'Test Channel',
                        'tags': ['heat pump', 'installation'],
                        'thumbnails': {
                            'high': {'url': 'https://thumbnail.url/high.jpg'}
                        }
                    },
                    'statistics': {
                        'viewCount': '30000',
                        'likeCount': '800',
                        'commentCount': '150'
                    },
                    'contentDetails': {
                        'duration': 'PT15M45S',
                        'definition': 'hd'
                    }
                }
            ]
        }

    @pytest.fixture
    def sample_transcript(self):
        """Sample transcript data"""
        # Matches youtube_transcript_api's list-of-dict segment format.
        return [
            {'text': 'Welcome to this HVAC maintenance guide.', 'start': 0.0, 'duration': 3.0},
            {'text': 'Today we will cover essential maintenance tips.', 'start': 3.0, 'duration': 4.0},
            {'text': 'Regular maintenance extends system life.', 'start': 7.0, 'duration': 3.5}
        ]

    def test_initialization(self, config, mock_env_vars):
        """Test scraper initialization"""
        with patch('src.youtube_api_scraper.build') as mock_build:
            scraper = YouTubeAPIScraper(config)
            # Credentials and channel URL come from the mocked environment.
            assert scraper.api_key == 'test-youtube-api-key'
            assert scraper.channel_url == 'https://www.youtube.com/@TestChannel'
            # Defaults: standard YouTube Data API daily quota, fresh counter,
            # and a cap on transcripts fetched per run.
            assert scraper.daily_quota_limit == 10000
            assert scraper.quota_used == 0
            assert scraper.max_transcripts_per_run == 50
            mock_build.assert_called_once_with('youtube', 'v3', developerKey='test-youtube-api-key')

    def test_missing_api_key(self, config, monkeypatch):
        """Test initialization fails without API key"""
        # raising=False: do not error if the variable was never set.
        monkeypatch.delenv('YOUTUBE_API_KEY', raising=False)
        with pytest.raises(ValueError, match="YOUTUBE_API_KEY not found"):
            YouTubeAPIScraper(config)

    def test_quota_tracking(self, scraper):
        """Test quota tracking mechanism"""
        # Test successful quota allocation
        assert scraper._track_quota('channels_list') is True
        assert scraper.quota_used == 1
        # Second argument is a call count/multiplier: 5 playlist pages = 5 units.
        assert scraper._track_quota('playlist_items', 5) is True
        assert scraper.quota_used == 6
        # search.list is the expensive operation: 100 units per call.
        assert scraper._track_quota('search') is True
        assert scraper.quota_used == 106
        # Test quota limit prevention
        scraper.quota_used = 9999
        assert scraper._track_quota('search') is False  # Would exceed limit
        assert scraper.quota_used == 9999  # Unchanged

    def test_get_channel_info_by_handle(self, scraper, sample_channel_response):
        """Test getting channel info by handle"""
        scraper.youtube = Mock()
        mock_channels = Mock()
        scraper.youtube.channels.return_value = mock_channels
        mock_channels.list.return_value.execute.return_value = sample_channel_response
        result = scraper._get_channel_info()
        assert result is True
        assert scraper.channel_id == 'UC_test_channel_id'
        assert scraper.uploads_playlist_id == 'UU_test_channel_id'
        # A single channels.list lookup costs 1 unit.
        assert scraper.quota_used == 1
        # The handle is extracted from the @TestChannel URL set in mock_env_vars.
        mock_channels.list.assert_called_once_with(
            part='snippet,statistics,contentDetails',
            forHandle='TestChannel'
        )

    def test_get_channel_info_fallback_search(self, scraper):
        """Test channel search fallback when handle lookup fails"""
        scraper.youtube = Mock()
        # First attempt fails
        mock_channels = Mock()
        scraper.youtube.channels.return_value = mock_channels
        # NOTE(review): this return_value assignment is superseded by the
        # side_effect sequence set a few lines below; kept for clarity only.
        mock_channels.list.return_value.execute.return_value = {'items': []}
        # Search succeeds
        mock_search = Mock()
        scraper.youtube.search.return_value = mock_search
        search_response = {
            'items': [{
                'snippet': {'channelId': 'UC_found_channel'}
            }]
        }
        mock_search.list.return_value.execute.return_value = search_response
        # Second channel lookup succeeds
        channel_response = {
            'items': [{
                'id': 'UC_found_channel',
                'snippet': {'title': 'Found Channel'},
                'statistics': {'subscriberCount': '5000', 'videoCount': '100'},
                'contentDetails': {'relatedPlaylists': {'uploads': 'UU_found_channel'}}
            }]
        }
        # side_effect sequences the two channels.list calls: empty, then found.
        mock_channels.list.return_value.execute.side_effect = [{'items': []}, channel_response]
        result = scraper._get_channel_info()
        assert result is True
        assert scraper.channel_id == 'UC_found_channel'
        assert scraper.quota_used == 102  # 1 (failed) + 100 (search) + 1 (success)

    def test_fetch_all_video_ids(self, scraper, sample_playlist_response):
        """Test fetching all video IDs from channel"""
        scraper.channel_id = 'UC_test_channel_id'
        scraper.uploads_playlist_id = 'UU_test_channel_id'
        scraper.youtube = Mock()
        mock_playlist_items = Mock()
        scraper.youtube.playlistItems.return_value = mock_playlist_items
        mock_playlist_items.list.return_value.execute.return_value = sample_playlist_response
        video_ids = scraper._fetch_all_video_ids()
        assert len(video_ids) == 3
        assert video_ids == ['video1', 'video2', 'video3']
        # One playlistItems.list page = 1 quota unit.
        assert scraper.quota_used == 1

    def test_fetch_all_video_ids_with_pagination(self, scraper):
        """Test fetching video IDs with pagination"""
        scraper.channel_id = 'UC_test_channel_id'
        scraper.uploads_playlist_id = 'UU_test_channel_id'
        scraper.youtube = Mock()
        mock_playlist_items = Mock()
        scraper.youtube.playlistItems.return_value = mock_playlist_items
        # Simulate 2 pages of results
        # Page 1: videos 1-50 (the API's max page size) plus a continuation token.
        page1 = {
            'items': [{'contentDetails': {'videoId': f'video{i}'}} for i in range(1, 51)],
            'nextPageToken': 'token2'
        }
        # Page 2: videos 51-70; None token terminates the pagination loop.
        page2 = {
            'items': [{'contentDetails': {'videoId': f'video{i}'}} for i in range(51, 71)],
            'nextPageToken': None
        }
        mock_playlist_items.list.return_value.execute.side_effect = [page1, page2]
        # max_videos truncates the 70 available ids down to 60.
        video_ids = scraper._fetch_all_video_ids(max_videos=60)
        assert len(video_ids) == 60
        assert scraper.quota_used == 2  # 2 API calls

    def test_fetch_video_details_batch(self, scraper, sample_videos_response):
        """Test fetching video details in batches"""
        scraper.youtube = Mock()
        mock_videos = Mock()
        scraper.youtube.videos.return_value = mock_videos
        mock_videos.list.return_value.execute.return_value = sample_videos_response
        video_ids = ['video1', 'video2']
        videos = scraper._fetch_video_details_batch(video_ids)
        assert len(videos) == 2
        # The scraper flattens the API response into a simpler dict and
        # derives engagement metrics from the raw statistics.
        assert videos[0]['id'] == 'video1'
        assert videos[0]['title'] == 'HVAC Maintenance Tips'
        assert videos[0]['view_count'] == 50000
        assert videos[0]['engagement_rate'] > 0
        # Up to 50 ids fit in one videos.list call, so this batch costs 1 unit.
        assert scraper.quota_used == 1

    @patch('src.youtube_api_scraper.YouTubeTranscriptApi')
    def test_fetch_transcript_success(self, mock_transcript_api, scraper, sample_transcript):
        """Test successful transcript fetching"""
        # Mock the class method get_transcript
        mock_transcript_api.get_transcript.return_value = sample_transcript
        transcript = scraper._fetch_transcript('video1')
        assert transcript is not None
        # Segments are joined into one text blob (punctuation handling aside).
        assert 'Welcome to this HVAC maintenance guide' in transcript
        assert 'Regular maintenance extends system life' in transcript
        mock_transcript_api.get_transcript.assert_called_once_with('video1')

    @patch('src.youtube_api_scraper.YouTubeTranscriptApi')
    def test_fetch_transcript_failure(self, mock_transcript_api, scraper):
        """Test transcript fetching when unavailable"""
        # Mock the class method to raise an exception
        mock_transcript_api.get_transcript.side_effect = Exception("No transcript available")
        # Failures must degrade gracefully to None, not propagate.
        transcript = scraper._fetch_transcript('video_no_transcript')
        assert transcript is None

    # Stacked @patch decorators apply bottom-up, so the mock arguments arrive
    # in reverse order: _get_channel_info first, _fetch_transcript last.
    @patch.object(YouTubeAPIScraper, '_fetch_transcript')
    @patch.object(YouTubeAPIScraper, '_fetch_video_details_batch')
    @patch.object(YouTubeAPIScraper, '_fetch_all_video_ids')
    @patch.object(YouTubeAPIScraper, '_get_channel_info')
    def test_fetch_content_full_flow(self, mock_channel_info, mock_video_ids,
                                     mock_details, mock_transcript, scraper):
        """Test complete content fetching flow"""
        # Setup mocks
        mock_channel_info.return_value = True
        mock_video_ids.return_value = ['video1', 'video2', 'video3']
        mock_details.return_value = [
            {'id': 'video1', 'title': 'Video 1', 'view_count': 50000},
            {'id': 'video2', 'title': 'Video 2', 'view_count': 30000},
            {'id': 'video3', 'title': 'Video 3', 'view_count': 10000}
        ]
        mock_transcript.return_value = 'Sample transcript text'
        videos = scraper.fetch_content(max_posts=3, fetch_transcripts=True)
        assert len(videos) == 3
        assert mock_video_ids.called
        assert mock_details.called
        # Should fetch transcripts for top 3 videos (or max_transcripts_per_run)
        assert mock_transcript.call_count == 3

    def test_quota_limit_enforcement(self, scraper):
        """Test that quota limits are enforced"""
        scraper.quota_used = 9950
        # This should succeed (costs 1 unit)
        assert scraper._track_quota('videos_list') is True
        assert scraper.quota_used == 9951
        # This should fail (would cost 100 units)
        assert scraper._track_quota('search') is False
        assert scraper.quota_used == 9951  # Unchanged

    def test_get_video_type(self, scraper):
        """Test video type determination based on duration"""
        # Durations are ISO-8601 strings as returned by the API.
        # Short video (< 60 seconds)
        assert scraper._get_video_type({'duration': 'PT30S'}) == 'short'
        # Regular video
        assert scraper._get_video_type({'duration': 'PT5M30S'}) == 'video'
        # Long video (> 10 minutes)
        assert scraper._get_video_type({'duration': 'PT15M0S'}) == 'video'
        assert scraper._get_video_type({'duration': 'PT1H30M0S'}) == 'video'

    def test_format_markdown(self, scraper):
        """Test markdown formatting with enhanced data"""
        videos = [{
            'id': 'test_video',
            'title': 'Test Video Title',
            'published_at': '2025-08-15T10:00:00Z',
            'channel_title': 'Test Channel',
            'duration': 'PT10M30S',
            'view_count': 50000,
            'like_count': 1500,
            'comment_count': 200,
            'engagement_rate': 3.4,
            'like_ratio': 3.0,
            'tags': ['tag1', 'tag2', 'tag3'],
            'thumbnail': 'https://thumbnail.url',
            'description': 'Full untruncated description of the video',
            'transcript': 'This is the transcript text'
        }]
        markdown = scraper.format_markdown(videos)
        # Heading hierarchy and formatting conventions:
        # counts use thousands separators, rates are rendered to 2 decimals.
        assert '# ID: test_video' in markdown
        assert '## Title: Test Video Title' in markdown
        assert '## Type: video' in markdown
        assert '## Views: 50,000' in markdown
        assert '## Likes: 1,500' in markdown
        assert '## Comments: 200' in markdown
        assert '## Engagement Rate: 3.40%' in markdown
        assert '## Like Ratio: 3.00%' in markdown
        assert '## Tags: tag1, tag2, tag3' in markdown
        assert '## Description:' in markdown
        assert 'Full untruncated description' in markdown
        assert '## Transcript:' in markdown
        assert 'This is the transcript text' in markdown

    def test_incremental_items(self, scraper):
        """Test getting incremental items since last sync"""
        # Items are newest-first; the state marks where the last sync stopped.
        items = [
            {'id': 'new_video', 'published_at': '2025-08-20'},
            {'id': 'last_video', 'published_at': '2025-08-15'},
            {'id': 'old_video', 'published_at': '2025-08-10'}
        ]
        # No state - return all
        new_items = scraper.get_incremental_items(items, {})
        assert len(new_items) == 3
        # With state - return only new
        state = {
            'last_video_id': 'last_video',
            'last_published': '2025-08-15'
        }
        new_items = scraper.get_incremental_items(items, state)
        assert len(new_items) == 1
        assert new_items[0]['id'] == 'new_video'

    def test_update_state(self, scraper):
        """Test state update with latest video info"""
        items = [
            {'id': 'latest_video', 'title': 'Latest Video', 'published_at': '2025-08-20'},
            {'id': 'older_video', 'title': 'Older Video', 'published_at': '2025-08-15'}
        ]
        state = scraper.update_state({}, items)
        # State records the newest item (first in the list) plus run metadata.
        assert state['last_video_id'] == 'latest_video'
        assert state['last_published'] == '2025-08-20'
        assert state['last_video_title'] == 'Latest Video'
        assert state['video_count'] == 2
        assert state['quota_used'] == 0
        assert 'last_sync' in state

    def test_efficient_quota_usage_for_370_videos(self, scraper):
        """Test that fetching 370 videos uses minimal quota"""
        scraper.channel_id = 'UC_test'
        scraper.uploads_playlist_id = 'UU_test'
        # Simulate fetching 370 videos
        # 370 videos / 50 per page = 8 pages for playlist items
        for _ in range(8):
            scraper._track_quota('playlist_items')
        # 370 videos / 50 per batch = 8 batches for video details
        for _ in range(8):
            scraper._track_quota('videos_list')
        # Total quota should be very low
        assert scraper.quota_used == 16  # 8 + 8
        assert scraper.quota_used < 20  # Well under daily limit
        # We can afford many transcripts with remaining quota
        remaining = scraper.daily_quota_limit - scraper.quota_used
        assert remaining > 9900  # Plenty of quota left
# Allow running this test module directly, outside a pytest invocation.
if __name__ == "__main__":
    pytest.main(["-v", __file__])