hvac-kia-content/continue_youtube_captions.py
Ben Reed 8ceb858026 Implement cumulative markdown system and API integrations
Major improvements:
- Add CumulativeMarkdownManager for intelligent content merging
- Implement YouTube Data API v3 integration with caption support
- Add MailChimp API integration with content cleaning
- Create single source-of-truth files that grow with updates
- Smart merging: updates existing entries with better data
- Properly combines backlog + incremental daily updates

Features:
- 179/444 YouTube videos now have captions (40.3%)
- MailChimp content cleaned of headers/footers
- All sources consolidated to single files
- Archive management with timestamped versions
- Test suite and documentation included

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-19 10:53:40 -03:00

#!/usr/bin/env python3
"""
Continue YouTube caption fetching using remaining quota
Fetches captions for videos 50-188 (next 139 videos by view count)
Uses up to 95% of daily quota (9,500 units)
"""
import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent))

from src.youtube_api_scraper_v2 import YouTubeAPIScraper
from src.base_scraper import ScraperConfig
from datetime import datetime
import pytz
import time
import logging
# Set up logging; create the logs directory first so the FileHandler
# doesn't raise FileNotFoundError on a fresh checkout.
Path('logs').mkdir(parents=True, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('logs/youtube_caption_continue.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger('youtube_captions')
def load_existing_videos():
    """Load existing video data from the latest markdown file."""
    latest_file = Path('data/markdown_current/hvacnkowitall_YouTube_2025-08-19T100336.md')
    if not latest_file.exists():
        logger.error(f"Latest YouTube file not found: {latest_file}")
        return []

    # Parse the markdown to extract video data
    content = latest_file.read_text(encoding='utf-8')
    videos = []

    # Simple parsing: each video entry starts with '# ID: '
    sections = content.split('# ID: ')
    for section in sections[1:]:  # sections[0] is any preamble before the first entry
        lines = section.strip().split('\n')
        if not lines:
            continue
        video_id = lines[0].strip()
        video_data = {'id': video_id}

        # Parse basic info
        for line in lines:
            if line.startswith('## Title: '):
                video_data['title'] = line.replace('## Title: ', '')
            elif line.startswith('## Views: '):
                views_str = line.replace('## Views: ', '').replace(',', '')
                video_data['view_count'] = int(views_str) if views_str.isdigit() else 0
            elif line.startswith('## Caption Status:'):
                video_data['has_caption_info'] = True
        videos.append(video_data)

    logger.info(f"Loaded {len(videos)} videos from existing file")
    return videos
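
# For reference, load_existing_videos() expects entries shaped like the ones
# this script writes back out further down (the video ID here is a made-up
# example, not a real record):
#
#   # ID: abc123XYZ00
#
#   ## Title: Some video title
#
#   ## Views: 12,345
#
#   ## Caption Status:
#   <caption text, if fetched>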
def continue_caption_fetching():
    """Continue fetching captions from where we left off."""
    logger.info("=" * 60)
    logger.info("CONTINUING YOUTUBE CAPTION FETCHING")
    logger.info("=" * 60)

    # Load existing video data
    videos = load_existing_videos()
    if not videos:
        logger.error("No existing videos found to continue from")
        return False

    # Sort by view count (descending)
    videos.sort(key=lambda x: x.get('view_count', 0), reverse=True)

    # Count how many already have captions
    with_captions = sum(1 for v in videos if v.get('has_caption_info'))
    without_captions = [v for v in videos if not v.get('has_caption_info')]

    logger.info("Current status:")
    logger.info(f"  Total videos: {len(videos)}")
    logger.info(f"  Already have captions: {with_captions}")
    logger.info(f"  Need captions: {len(without_captions)}")
    # Calculate quota
    quota_used_so_far = 2519  # From previous run
    daily_limit = 10000
    target_usage = int(daily_limit * 0.95)  # 95% = 9,500 units
    available_quota = target_usage - quota_used_so_far

    logger.info("Quota analysis:")
    logger.info(f"  Daily limit: {daily_limit:,} units")
    logger.info(f"  Already used: {quota_used_so_far:,} units")
    logger.info(f"  Target (95%): {target_usage:,} units")
    logger.info(f"  Available: {available_quota:,} units")

    # Calculate how many more videos we can caption
    max_additional_captions = available_quota // 50  # 50 units per video
    videos_to_caption = without_captions[:max_additional_captions]
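
    # Worked example of the budget above, assuming the 50-units-per-video
    # estimate holds: 9,500 target - 2,519 already used = 6,981 units free,
    # and 6,981 // 50 = 139 more videos -- which is where the "next 139
    # videos" figure in the module docstring comes from.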
logger.info(f"Caption plan:")
logger.info(f" Videos to caption now: {len(videos_to_caption)}")
logger.info(f" Estimated quota cost: {len(videos_to_caption) * 50:,} units")
logger.info(f" Will use: {quota_used_so_far + (len(videos_to_caption) * 50):,} units total")
    if not videos_to_caption:
        logger.info("No additional videos to caption within quota limits")
        return True

    # Set up scraper
    config = ScraperConfig(
        source_name='YouTube',
        brand_name='hvacnkowitall',
        data_dir=Path('data/markdown_current'),
        logs_dir=Path('logs/YouTube'),
        timezone='America/Halifax'
    )
    scraper = YouTubeAPIScraper(config)
    scraper.quota_used = quota_used_so_far  # Set initial quota usage
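
    # Pre-seeding scraper.quota_used means the 95% check inside the loop
    # below accounts for units spent by the earlier run, not just this one.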
logger.info(f"Starting caption fetching for {len(videos_to_caption)} videos...")
start_time = time.time()
captions_found = 0
for i, video in enumerate(videos_to_caption, 1):
video_id = video['id']
title = video.get('title', 'Unknown')[:50]
logger.info(f"[{i}/{len(videos_to_caption)}] Fetching caption for: {title}...")
# Fetch caption info
caption_info = scraper._fetch_caption_text(video_id)
if caption_info:
video['caption_text'] = caption_info
captions_found += 1
logger.info(f" ✅ Caption found")
else:
logger.info(f" ❌ No caption available")
# Add delay to be respectful
time.sleep(0.5)
# Check if we're approaching quota limit
if scraper.quota_used >= target_usage:
logger.warning(f"Reached 95% quota limit at video {i}")
break
    elapsed = time.time() - start_time
    logger.info("Caption fetching complete!")
    logger.info(f"  Duration: {elapsed:.1f} seconds")
    logger.info(f"  Captions found: {captions_found}")
    logger.info(f"  Quota used: {scraper.quota_used:,}/{daily_limit:,} units")
    logger.info(f"  Quota percentage: {(scraper.quota_used/daily_limit)*100:.1f}%")

    # Update the video data with new caption info
    video_lookup = {v['id']: v for v in videos}
    for video in videos_to_caption:
        if video['id'] in video_lookup and video.get('caption_text'):
            video_lookup[video['id']]['caption_text'] = video['caption_text']
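
    # Note: videos_to_caption holds references to the same dicts as videos,
    # so the fetched captions are already attached to the master list; the
    # pass above is defensive and only matters if the lists ever diverge.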
    # Save updated data
    timestamp = datetime.now(pytz.timezone('America/Halifax')).strftime('%Y-%m-%dT%H%M%S')
    updated_filename = f"hvacnkowitall_YouTube_{timestamp}_captions.md"

    # Generate updated markdown (simplified version)
    markdown_sections = []
    for video in videos:
        section = []
        section.append(f"# ID: {video['id']}")
        section.append("")
        section.append(f"## Title: {video.get('title', 'Unknown')}")
        section.append("")
        section.append(f"## Views: {video.get('view_count', 0):,}")
        section.append("")

        # Caption status
        if video.get('caption_text'):
            section.append("## Caption Status:")
            section.append(video['caption_text'])
            section.append("")
        elif video.get('has_caption_info'):
            section.append("## Caption Status:")
            section.append("[Captions available - ]")
            section.append("")

        section.append("-" * 50)
        section.append("")
        markdown_sections.append('\n'.join(section))

    # Save updated file
    output_file = Path(f'data/markdown_current/{updated_filename}')
    output_file.write_text('\n'.join(markdown_sections), encoding='utf-8')
    logger.info(f"Updated file saved: {output_file}")
    # Calculate remaining work
    total_with_captions = with_captions + captions_found
    remaining_videos = len(videos) - total_with_captions

    logger.info("Progress summary:")
    logger.info(f"  Total videos: {len(videos)}")
    logger.info(f"  Captioned: {total_with_captions}")
    logger.info(f"  Remaining: {remaining_videos}")
    logger.info(f"  Progress: {(total_with_captions/len(videos))*100:.1f}%")
    if remaining_videos > 0:
        days_needed = (remaining_videos // 190) + (1 if remaining_videos % 190 else 0)
        logger.info(f"  Estimated days to complete: {days_needed}")

    return True
if __name__ == "__main__":
    success = continue_caption_fetching()
    sys.exit(0 if success else 1)