#!/usr/bin/env python3
"""
Continue YouTube caption fetching using remaining quota

Fetches captions for videos 50-188 (next 139 videos by view count).
Uses up to 95% of daily quota (9,500 units).
"""

import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent))

from src.youtube_api_scraper_v2 import YouTubeAPIScraper
from src.base_scraper import ScraperConfig
from datetime import datetime
import pytz
import time
import json
import logging

# Ensure the log directory exists before attaching a FileHandler —
# logging.basicConfig raises FileNotFoundError otherwise on a fresh checkout.
Path('logs').mkdir(parents=True, exist_ok=True)

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('logs/youtube_caption_continue.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger('youtube_captions')

# Quota accounting constants (units per YouTube Data API v3 billing).
QUOTA_USED_PREVIOUS_RUN = 2519   # carried over from the earlier run
DAILY_QUOTA_LIMIT = 10000
QUOTA_TARGET_FRACTION = 0.95     # stop at 95% to leave headroom
UNITS_PER_CAPTION = 50           # approximate cost per caption fetch
CAPTIONS_PER_DAY = 190           # DAILY_QUOTA_LIMIT * 0.95 / UNITS_PER_CAPTION


def load_existing_videos():
    """Load existing video data from the latest markdown file.

    Parses the hard-coded snapshot markdown, splitting it into per-video
    sections on the '# ID: ' delimiter and extracting id, title, view count
    and whether a '## Caption Status:' section is already present.

    Returns:
        list[dict]: one dict per video with keys 'id', and optionally
        'title', 'view_count', 'has_caption_info'. Empty list if the
        snapshot file is missing.
    """
    latest_file = Path('data/markdown_current/hvacnkowitall_YouTube_2025-08-19T100336.md')

    if not latest_file.exists():
        logger.error(f"Latest YouTube file not found: {latest_file}")
        return []

    content = latest_file.read_text(encoding='utf-8')
    videos = []

    # Simple parsing - split by video sections; first chunk precedes the
    # first delimiter and holds no video, so it is skipped.
    sections = content.split('# ID: ')
    for section in sections[1:]:
        lines = section.strip().split('\n')
        if not lines:
            continue

        video_data = {'id': lines[0].strip()}

        for line in lines:
            if line.startswith('## Title: '):
                video_data['title'] = line.replace('## Title: ', '')
            elif line.startswith('## Views: '):
                # Views are rendered with thousands separators; strip them
                # before the int conversion. Non-numeric falls back to 0.
                views_str = line.replace('## Views: ', '').replace(',', '')
                video_data['view_count'] = int(views_str) if views_str.isdigit() else 0
            elif line.startswith('## Caption Status:'):
                video_data['has_caption_info'] = True

        videos.append(video_data)

    logger.info(f"Loaded {len(videos)} videos from existing file")
    return videos


def continue_caption_fetching():
    """Continue fetching captions from where we left off.

    Picks the highest-viewed videos that still lack caption info, fetches
    captions for as many as the remaining daily quota allows (up to 95% of
    the limit), and writes an updated markdown snapshot.

    Returns:
        bool: True on success (including "nothing left to do"), False when
        no existing video data could be loaded.
    """
    logger.info("=" * 60)
    logger.info("CONTINUING YOUTUBE CAPTION FETCHING")
    logger.info("=" * 60)

    videos = load_existing_videos()
    if not videos:
        logger.error("No existing videos found to continue from")
        return False

    # Highest view counts first — captions are fetched in popularity order.
    videos.sort(key=lambda x: x.get('view_count', 0), reverse=True)

    with_captions = sum(1 for v in videos if v.get('has_caption_info'))
    without_captions = [v for v in videos if not v.get('has_caption_info')]

    logger.info("Current status:")
    logger.info(f"  Total videos: {len(videos)}")
    logger.info(f"  Already have captions: {with_captions}")
    logger.info(f"  Need captions: {len(without_captions)}")

    # Quota budget for this run.
    quota_used_so_far = QUOTA_USED_PREVIOUS_RUN
    daily_limit = DAILY_QUOTA_LIMIT
    target_usage = int(daily_limit * QUOTA_TARGET_FRACTION)  # 95% = 9,500 units
    available_quota = target_usage - quota_used_so_far

    logger.info("Quota analysis:")
    logger.info(f"  Daily limit: {daily_limit:,} units")
    logger.info(f"  Already used: {quota_used_so_far:,} units")
    logger.info(f"  Target (95%): {target_usage:,} units")
    logger.info(f"  Available: {available_quota:,} units")

    # Clamp at zero: a negative available_quota would otherwise produce a
    # negative slice bound, and without_captions[:-n] selects almost ALL
    # remaining videos instead of none.
    max_additional_captions = max(0, available_quota) // UNITS_PER_CAPTION
    videos_to_caption = without_captions[:max_additional_captions]

    logger.info("Caption plan:")
    logger.info(f"  Videos to caption now: {len(videos_to_caption)}")
    logger.info(f"  Estimated quota cost: {len(videos_to_caption) * UNITS_PER_CAPTION:,} units")
    logger.info(f"  Will use: {quota_used_so_far + (len(videos_to_caption) * UNITS_PER_CAPTION):,} units total")

    if not videos_to_caption:
        logger.info("No additional videos to caption within quota limits")
        return True

    config = ScraperConfig(
        source_name='YouTube',
        brand_name='hvacnkowitall',
        data_dir=Path('data/markdown_current'),
        logs_dir=Path('logs/YouTube'),
        timezone='America/Halifax'
    )
    scraper = YouTubeAPIScraper(config)
    scraper.quota_used = quota_used_so_far  # seed with units spent by the previous run

    logger.info(f"Starting caption fetching for {len(videos_to_caption)} videos...")
    start_time = time.time()
    captions_found = 0

    for i, video in enumerate(videos_to_caption, 1):
        video_id = video['id']
        title = video.get('title', 'Unknown')[:50]

        logger.info(f"[{i}/{len(videos_to_caption)}] Fetching caption for: {title}...")

        # NOTE(review): relies on a private method of the scraper —
        # presumably returns caption text or a falsy value; confirm contract.
        caption_info = scraper._fetch_caption_text(video_id)

        if caption_info:
            # video is the same dict object that lives in `videos`, so this
            # mutation is visible when the markdown is regenerated below.
            video['caption_text'] = caption_info
            captions_found += 1
            logger.info("  ✅ Caption found")
        else:
            logger.info("  ❌ No caption available")

        # Add delay to be respectful
        time.sleep(0.5)

        # Stop early if the scraper's own accounting hits the 95% ceiling.
        if scraper.quota_used >= target_usage:
            logger.warning(f"Reached 95% quota limit at video {i}")
            break

    elapsed = time.time() - start_time
    logger.info("Caption fetching complete!")
    logger.info(f"  Duration: {elapsed:.1f} seconds")
    logger.info(f"  Captions found: {captions_found}")
    logger.info(f"  Quota used: {scraper.quota_used:,}/{daily_limit:,} units")
    logger.info(f"  Quota percentage: {(scraper.quota_used/daily_limit)*100:.1f}%")

    # No re-merge step is needed: videos_to_caption holds references to the
    # same dicts as `videos`, so caption_text was set in place above.

    timestamp = datetime.now(pytz.timezone('America/Halifax')).strftime('%Y-%m-%dT%H%M%S')
    updated_filename = f"hvacnkowitall_YouTube_{timestamp}_captions.md"

    # Generate updated markdown (simplified version)
    markdown_sections = []
    for video in videos:
        section = []
        section.append(f"# ID: {video['id']}")
        section.append("")
        section.append(f"## Title: {video.get('title', 'Unknown')}")
        section.append("")
        section.append(f"## Views: {video.get('view_count', 0):,}")
        section.append("")

        # Freshly fetched caption text wins; otherwise keep a placeholder
        # marking that captions existed in the source snapshot.
        if video.get('caption_text'):
            section.append("## Caption Status:")
            section.append(video['caption_text'])
            section.append("")
        elif video.get('has_caption_info'):
            section.append("## Caption Status:")
            section.append("[Captions available - ]")
            section.append("")

        section.append("-" * 50)
        section.append("")
        markdown_sections.append('\n'.join(section))

    output_file = Path(f'data/markdown_current/{updated_filename}')
    output_file.write_text('\n'.join(markdown_sections), encoding='utf-8')
    logger.info(f"Updated file saved: {output_file}")

    total_with_captions = with_captions + captions_found
    remaining_videos = len(videos) - total_with_captions

    logger.info("Progress summary:")
    logger.info(f"  Total videos: {len(videos)}")
    logger.info(f"  Captioned: {total_with_captions}")
    logger.info(f"  Remaining: {remaining_videos}")
    logger.info(f"  Progress: {(total_with_captions/len(videos))*100:.1f}%")

    if remaining_videos > 0:
        # Ceiling division: ~190 captions fit in a full day's 95% budget.
        days_needed = (remaining_videos + CAPTIONS_PER_DAY - 1) // CAPTIONS_PER_DAY
        logger.info(f"  Estimated days to complete: {days_needed}")

    return True


if __name__ == "__main__":
    success = continue_caption_fetching()
    sys.exit(0 if success else 1)