Major Changes: - Updated all code references from hvacknowitall/hvacnkowitall to hkia - Renamed all existing markdown files to use hkia_ prefix - Updated configuration files, scrapers, and production scripts - Modified systemd service descriptions to use HKIA - Changed NAS sync path to /mnt/nas/hkia Files Updated: - 20+ source files updated with new naming convention - 34 markdown files renamed to hkia_* format - All ScraperConfig brand_name parameters now use 'hkia' - Documentation updated to reflect new naming Rationale: - Shorter, cleaner filenames - Consistent branding across all outputs - Easier to type and reference - Maintains same functionality with improved naming Next Steps: - Deploy updated services to production - Update any external references to old naming - Monitor scrapers to ensure proper operation 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
462 lines
No EOL
18 KiB
Python
#!/usr/bin/env python3
"""
Comprehensive test suite for YouTube API scraper with quota management
Following TDD principles for robust implementation validation
"""

import pytest
import json
import os
from unittest.mock import Mock, patch, MagicMock, call
from datetime import datetime
import pytz
from pathlib import Path

# Import the scraper: make the project root importable so `src.*` resolves
# when the tests are run from the tests/ directory.
import sys
sys.path.insert(0, str(Path(__file__).parent.parent))
from src.youtube_api_scraper import YouTubeAPIScraper
from src.base_scraper import ScraperConfig
|
class TestYouTubeAPIScraper:
    """Test suite for YouTube API scraper with quota management"""

    @pytest.fixture
    def config(self, tmp_path):
        """Create test configuration"""
        return ScraperConfig(
            source_name='youtube',
            brand_name='test_brand',
            data_dir=tmp_path / 'data',
            logs_dir=tmp_path / 'logs',
            timezone='America/Halifax'
        )

    @pytest.fixture
    def mock_env_vars(self, monkeypatch):
        """Mock environment variables"""
        monkeypatch.setenv('YOUTUBE_API_KEY', 'test-youtube-api-key')
        monkeypatch.setenv('YOUTUBE_CHANNEL_URL', 'https://www.youtube.com/@TestChannel')

    @pytest.fixture
    def scraper(self, config, mock_env_vars):
        """Create scraper instance with mocked environment"""
        # Patch the googleapiclient `build` factory so no real API client is created.
        with patch('src.youtube_api_scraper.build'):
            return YouTubeAPIScraper(config)

    @pytest.fixture
    def sample_channel_response(self):
        """Sample channel details response"""
        return {
            'items': [{
                'id': 'UC_test_channel_id',
                'snippet': {
                    'title': 'Test Channel',
                    'description': 'Test channel description'
                },
                'statistics': {
                    'subscriberCount': '10000',
                    'viewCount': '1000000',
                    'videoCount': '370'
                },
                'contentDetails': {
                    'relatedPlaylists': {
                        'uploads': 'UU_test_channel_id'
                    }
                }
            }]
        }

    @pytest.fixture
    def sample_playlist_response(self):
        """Sample playlist items response"""
        return {
            'items': [
                {'contentDetails': {'videoId': 'video1'}},
                {'contentDetails': {'videoId': 'video2'}},
                {'contentDetails': {'videoId': 'video3'}}
            ],
            'nextPageToken': None
        }

    @pytest.fixture
    def sample_videos_response(self):
        """Sample videos details response"""
        return {
            'items': [
                {
                    'id': 'video1',
                    'snippet': {
                        'title': 'HVAC Maintenance Tips',
                        'description': 'Complete guide to maintaining your HVAC system for optimal performance and longevity.',
                        'publishedAt': '2025-08-15T10:00:00Z',
                        'channelId': 'UC_test_channel_id',
                        'channelTitle': 'Test Channel',
                        'tags': ['hvac', 'maintenance', 'tips', 'guide'],
                        'thumbnails': {
                            'maxres': {'url': 'https://thumbnail.url/maxres.jpg'}
                        }
                    },
                    'statistics': {
                        'viewCount': '50000',
                        'likeCount': '1500',
                        'commentCount': '200'
                    },
                    'contentDetails': {
                        'duration': 'PT10M30S',
                        'definition': 'hd'
                    }
                },
                {
                    'id': 'video2',
                    'snippet': {
                        'title': 'Heat Pump Installation',
                        'description': 'Step by step heat pump installation tutorial.',
                        'publishedAt': '2025-08-10T10:00:00Z',
                        'channelId': 'UC_test_channel_id',
                        'channelTitle': 'Test Channel',
                        'tags': ['heat pump', 'installation'],
                        'thumbnails': {
                            'high': {'url': 'https://thumbnail.url/high.jpg'}
                        }
                    },
                    'statistics': {
                        'viewCount': '30000',
                        'likeCount': '800',
                        'commentCount': '150'
                    },
                    'contentDetails': {
                        'duration': 'PT15M45S',
                        'definition': 'hd'
                    }
                }
            ]
        }

    @pytest.fixture
    def sample_transcript(self):
        """Sample transcript data"""
        return [
            {'text': 'Welcome to this HVAC maintenance guide.', 'start': 0.0, 'duration': 3.0},
            {'text': 'Today we will cover essential maintenance tips.', 'start': 3.0, 'duration': 4.0},
            {'text': 'Regular maintenance extends system life.', 'start': 7.0, 'duration': 3.5}
        ]

    def test_initialization(self, config, mock_env_vars):
        """Test scraper initialization"""
        with patch('src.youtube_api_scraper.build') as mock_build:
            scraper = YouTubeAPIScraper(config)

            assert scraper.api_key == 'test-youtube-api-key'
            assert scraper.channel_url == 'https://www.youtube.com/@TestChannel'
            assert scraper.daily_quota_limit == 10000
            assert scraper.quota_used == 0
            assert scraper.max_transcripts_per_run == 50
            mock_build.assert_called_once_with('youtube', 'v3', developerKey='test-youtube-api-key')

    def test_missing_api_key(self, config, monkeypatch):
        """Test initialization fails without API key"""
        monkeypatch.delenv('YOUTUBE_API_KEY', raising=False)
        with pytest.raises(ValueError, match="YOUTUBE_API_KEY not found"):
            YouTubeAPIScraper(config)

    def test_quota_tracking(self, scraper):
        """Test quota tracking mechanism"""
        # Test successful quota allocation
        assert scraper._track_quota('channels_list') is True
        assert scraper.quota_used == 1

        assert scraper._track_quota('playlist_items', 5) is True
        assert scraper.quota_used == 6

        # search is the expensive operation: 100 quota units per call
        assert scraper._track_quota('search') is True
        assert scraper.quota_used == 106

        # Test quota limit prevention
        scraper.quota_used = 9999
        assert scraper._track_quota('search') is False  # Would exceed limit
        assert scraper.quota_used == 9999  # Unchanged

    def test_get_channel_info_by_handle(self, scraper, sample_channel_response):
        """Test getting channel info by handle"""
        scraper.youtube = Mock()
        mock_channels = Mock()
        scraper.youtube.channels.return_value = mock_channels
        mock_channels.list.return_value.execute.return_value = sample_channel_response

        result = scraper._get_channel_info()

        assert result is True
        assert scraper.channel_id == 'UC_test_channel_id'
        assert scraper.uploads_playlist_id == 'UU_test_channel_id'
        assert scraper.quota_used == 1

        # Handle is extracted from the @TestChannel channel URL
        mock_channels.list.assert_called_once_with(
            part='snippet,statistics,contentDetails',
            forHandle='TestChannel'
        )

    def test_get_channel_info_fallback_search(self, scraper):
        """Test channel search fallback when handle lookup fails"""
        scraper.youtube = Mock()

        # First attempt fails
        mock_channels = Mock()
        scraper.youtube.channels.return_value = mock_channels
        mock_channels.list.return_value.execute.return_value = {'items': []}

        # Search succeeds
        mock_search = Mock()
        scraper.youtube.search.return_value = mock_search
        search_response = {
            'items': [{
                'snippet': {'channelId': 'UC_found_channel'}
            }]
        }
        mock_search.list.return_value.execute.return_value = search_response

        # Second channel lookup succeeds (side_effect supersedes the
        # return_value set above: first call empty, second call succeeds)
        channel_response = {
            'items': [{
                'id': 'UC_found_channel',
                'snippet': {'title': 'Found Channel'},
                'statistics': {'subscriberCount': '5000', 'videoCount': '100'},
                'contentDetails': {'relatedPlaylists': {'uploads': 'UU_found_channel'}}
            }]
        }
        mock_channels.list.return_value.execute.side_effect = [{'items': []}, channel_response]

        result = scraper._get_channel_info()

        assert result is True
        assert scraper.channel_id == 'UC_found_channel'
        assert scraper.quota_used == 102  # 1 (failed) + 100 (search) + 1 (success)

    def test_fetch_all_video_ids(self, scraper, sample_playlist_response):
        """Test fetching all video IDs from channel"""
        scraper.channel_id = 'UC_test_channel_id'
        scraper.uploads_playlist_id = 'UU_test_channel_id'

        scraper.youtube = Mock()
        mock_playlist_items = Mock()
        scraper.youtube.playlistItems.return_value = mock_playlist_items
        mock_playlist_items.list.return_value.execute.return_value = sample_playlist_response

        video_ids = scraper._fetch_all_video_ids()

        assert len(video_ids) == 3
        assert video_ids == ['video1', 'video2', 'video3']
        assert scraper.quota_used == 1

    def test_fetch_all_video_ids_with_pagination(self, scraper):
        """Test fetching video IDs with pagination"""
        scraper.channel_id = 'UC_test_channel_id'
        scraper.uploads_playlist_id = 'UU_test_channel_id'

        scraper.youtube = Mock()
        mock_playlist_items = Mock()
        scraper.youtube.playlistItems.return_value = mock_playlist_items

        # Simulate 2 pages of results (API returns at most 50 items per page)
        page1 = {
            'items': [{'contentDetails': {'videoId': f'video{i}'}} for i in range(1, 51)],
            'nextPageToken': 'token2'
        }
        page2 = {
            'items': [{'contentDetails': {'videoId': f'video{i}'}} for i in range(51, 71)],
            'nextPageToken': None
        }
        mock_playlist_items.list.return_value.execute.side_effect = [page1, page2]

        video_ids = scraper._fetch_all_video_ids(max_videos=60)

        assert len(video_ids) == 60
        assert scraper.quota_used == 2  # 2 API calls

    def test_fetch_video_details_batch(self, scraper, sample_videos_response):
        """Test fetching video details in batches"""
        scraper.youtube = Mock()
        mock_videos = Mock()
        scraper.youtube.videos.return_value = mock_videos
        mock_videos.list.return_value.execute.return_value = sample_videos_response

        video_ids = ['video1', 'video2']
        videos = scraper._fetch_video_details_batch(video_ids)

        assert len(videos) == 2
        assert videos[0]['id'] == 'video1'
        assert videos[0]['title'] == 'HVAC Maintenance Tips'
        assert videos[0]['view_count'] == 50000
        assert videos[0]['engagement_rate'] > 0
        assert scraper.quota_used == 1

    @patch('src.youtube_api_scraper.YouTubeTranscriptApi')
    def test_fetch_transcript_success(self, mock_transcript_api, scraper, sample_transcript):
        """Test successful transcript fetching"""
        # Mock the class method get_transcript
        mock_transcript_api.get_transcript.return_value = sample_transcript

        transcript = scraper._fetch_transcript('video1')

        assert transcript is not None
        assert 'Welcome to this HVAC maintenance guide' in transcript
        assert 'Regular maintenance extends system life' in transcript
        mock_transcript_api.get_transcript.assert_called_once_with('video1')

    @patch('src.youtube_api_scraper.YouTubeTranscriptApi')
    def test_fetch_transcript_failure(self, mock_transcript_api, scraper):
        """Test transcript fetching when unavailable"""
        # Mock the class method to raise an exception
        mock_transcript_api.get_transcript.side_effect = Exception("No transcript available")

        transcript = scraper._fetch_transcript('video_no_transcript')

        assert transcript is None

    @patch.object(YouTubeAPIScraper, '_fetch_transcript')
    @patch.object(YouTubeAPIScraper, '_fetch_video_details_batch')
    @patch.object(YouTubeAPIScraper, '_fetch_all_video_ids')
    @patch.object(YouTubeAPIScraper, '_get_channel_info')
    def test_fetch_content_full_flow(self, mock_channel_info, mock_video_ids,
                                     mock_details, mock_transcript, scraper):
        """Test complete content fetching flow"""
        # Setup mocks (decorators apply bottom-up, so args are in reverse order)
        mock_channel_info.return_value = True
        mock_video_ids.return_value = ['video1', 'video2', 'video3']
        mock_details.return_value = [
            {'id': 'video1', 'title': 'Video 1', 'view_count': 50000},
            {'id': 'video2', 'title': 'Video 2', 'view_count': 30000},
            {'id': 'video3', 'title': 'Video 3', 'view_count': 10000}
        ]
        mock_transcript.return_value = 'Sample transcript text'

        videos = scraper.fetch_content(max_posts=3, fetch_transcripts=True)

        assert len(videos) == 3
        assert mock_video_ids.called
        assert mock_details.called
        # Should fetch transcripts for top 3 videos (or max_transcripts_per_run)
        assert mock_transcript.call_count == 3

    def test_quota_limit_enforcement(self, scraper):
        """Test that quota limits are enforced"""
        scraper.quota_used = 9950

        # This should succeed (costs 1 unit)
        assert scraper._track_quota('videos_list') is True
        assert scraper.quota_used == 9951

        # This should fail (would cost 100 units)
        assert scraper._track_quota('search') is False
        assert scraper.quota_used == 9951  # Unchanged

    def test_get_video_type(self, scraper):
        """Test video type determination based on duration"""
        # Short video (< 60 seconds)
        assert scraper._get_video_type({'duration': 'PT30S'}) == 'short'

        # Regular video
        assert scraper._get_video_type({'duration': 'PT5M30S'}) == 'video'

        # Long video (> 10 minutes)
        assert scraper._get_video_type({'duration': 'PT15M0S'}) == 'video'
        assert scraper._get_video_type({'duration': 'PT1H30M0S'}) == 'video'

    def test_format_markdown(self, scraper):
        """Test markdown formatting with enhanced data"""
        videos = [{
            'id': 'test_video',
            'title': 'Test Video Title',
            'published_at': '2025-08-15T10:00:00Z',
            'channel_title': 'Test Channel',
            'duration': 'PT10M30S',
            'view_count': 50000,
            'like_count': 1500,
            'comment_count': 200,
            'engagement_rate': 3.4,
            'like_ratio': 3.0,
            'tags': ['tag1', 'tag2', 'tag3'],
            'thumbnail': 'https://thumbnail.url',
            'description': 'Full untruncated description of the video',
            'transcript': 'This is the transcript text'
        }]

        markdown = scraper.format_markdown(videos)

        assert '# ID: test_video' in markdown
        assert '## Title: Test Video Title' in markdown
        assert '## Type: video' in markdown
        assert '## Views: 50,000' in markdown
        assert '## Likes: 1,500' in markdown
        assert '## Comments: 200' in markdown
        assert '## Engagement Rate: 3.40%' in markdown
        assert '## Like Ratio: 3.00%' in markdown
        assert '## Tags: tag1, tag2, tag3' in markdown
        assert '## Description:' in markdown
        assert 'Full untruncated description' in markdown
        assert '## Transcript:' in markdown
        assert 'This is the transcript text' in markdown

    def test_incremental_items(self, scraper):
        """Test getting incremental items since last sync"""
        items = [
            {'id': 'new_video', 'published_at': '2025-08-20'},
            {'id': 'last_video', 'published_at': '2025-08-15'},
            {'id': 'old_video', 'published_at': '2025-08-10'}
        ]

        # No state - return all
        new_items = scraper.get_incremental_items(items, {})
        assert len(new_items) == 3

        # With state - return only new
        state = {
            'last_video_id': 'last_video',
            'last_published': '2025-08-15'
        }
        new_items = scraper.get_incremental_items(items, state)
        assert len(new_items) == 1
        assert new_items[0]['id'] == 'new_video'

    def test_update_state(self, scraper):
        """Test state update with latest video info"""
        items = [
            {'id': 'latest_video', 'title': 'Latest Video', 'published_at': '2025-08-20'},
            {'id': 'older_video', 'title': 'Older Video', 'published_at': '2025-08-15'}
        ]

        state = scraper.update_state({}, items)

        assert state['last_video_id'] == 'latest_video'
        assert state['last_published'] == '2025-08-20'
        assert state['last_video_title'] == 'Latest Video'
        assert state['video_count'] == 2
        assert state['quota_used'] == 0
        assert 'last_sync' in state

    def test_efficient_quota_usage_for_370_videos(self, scraper):
        """Test that fetching 370 videos uses minimal quota"""
        scraper.channel_id = 'UC_test'
        scraper.uploads_playlist_id = 'UU_test'

        # Simulate fetching 370 videos
        # 370 videos / 50 per page = 8 pages for playlist items
        for _ in range(8):
            scraper._track_quota('playlist_items')

        # 370 videos / 50 per batch = 8 batches for video details
        for _ in range(8):
            scraper._track_quota('videos_list')

        # Total quota should be very low
        assert scraper.quota_used == 16  # 8 + 8
        assert scraper.quota_used < 20  # Well under daily limit

        # We can afford many transcripts with remaining quota
        remaining = scraper.daily_quota_limit - scraper.quota_used
        assert remaining > 9900  # Plenty of quota left
|
if __name__ == "__main__":
    # Allow running this suite directly: python test_youtube_api_scraper.py
    pytest.main([__file__, "-v"])