Major improvements:
- Add CumulativeMarkdownManager for intelligent content merging
- Implement YouTube Data API v3 integration with caption support
- Add MailChimp API integration with content cleaning
- Create single source-of-truth files that grow with updates
- Smart merging: updates existing entries with better data
- Properly combines backlog + incremental daily updates

Features:
- 179/444 YouTube videos now have captions (40.3%)
- MailChimp content cleaned of headers/footers
- All sources consolidated to single files
- Archive management with timestamped versions
- Test suite and documentation included

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
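The "smart merging" rule described above can be pictured with a short sketch. `CumulativeMarkdownManager` itself is not part of this commit's file below, so the names and merge policy here (`merge_entries`, `richer`) are illustrative assumptions, not the shipped implementation:

```python
# Hedged sketch of "updates existing entries with better data": keep one
# entry per ID and only overwrite a field when the incoming value is richer.
# merge_entries/richer are hypothetical names, not the real manager's API.
def richer(new_value, old_value):
    """Prefer the new value only when it carries at least as much data."""
    if not new_value:
        return old_value
    if not old_value:
        return new_value
    return new_value if len(str(new_value)) >= len(str(old_value)) else old_value


def merge_entries(existing: dict, updates: dict) -> dict:
    """Merge {video_id: fields} maps so the source-of-truth file only grows."""
    merged = {vid: dict(fields) for vid, fields in existing.items()}
    for vid, fields in updates.items():
        target = merged.setdefault(vid, {})
        for key, value in fields.items():
            target[key] = richer(value, target.get(key))
    return merged
```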
229 lines | No EOL | 8 KiB | Python
#!/usr/bin/env python3
"""
Continue YouTube caption fetching using remaining quota.

Fetches captions for videos 50-188 (the next 139 videos by view count).
Uses up to 95% of the daily quota (9,500 units).
"""

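# NOTE: the data/ and logs/ paths below are relative, so run this script from
# the project root; the sys.path tweak below keeps the src/ imports working
# regardless of the current working directory.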
import sys
import time
import logging
from datetime import datetime
from pathlib import Path

import pytz

sys.path.insert(0, str(Path(__file__).parent))

from src.youtube_api_scraper_v2 import YouTubeAPIScraper
from src.base_scraper import ScraperConfig

# Set up logging (create logs/ first so the FileHandler does not fail on a
# fresh checkout)
Path('logs').mkdir(parents=True, exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('logs/youtube_caption_continue.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger('youtube_captions')

def load_existing_videos():
    """Load existing video data from the latest markdown file."""
    latest_file = Path('data/markdown_current/hvacnkowitall_YouTube_2025-08-19T100336.md')

    if not latest_file.exists():
        logger.error(f"Latest YouTube file not found: {latest_file}")
        return []

    # Parse the markdown to extract video data
    content = latest_file.read_text(encoding='utf-8')
    videos = []

    # Simple parsing - split by video sections
    sections = content.split('# ID: ')
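    # The split above assumes sections shaped like the markdown this pipeline
    # writes (the video ID below is illustrative, not real data):
    #
    #   # ID: dQw4w9WgXcQ
    #
    #   ## Title: Some video title
    #
    #   ## Views: 12,345
    #
    #   ## Caption Status:
    #   ...caption text...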
    for section in sections[1:]:  # Skip first empty section
        lines = section.strip().split('\n')
        if not lines:
            continue

        video_id = lines[0].strip()
        video_data = {'id': video_id}

        # Parse basic info
        for line in lines:
            if line.startswith('## Title: '):
                video_data['title'] = line.replace('## Title: ', '')
            elif line.startswith('## Views: '):
                views_str = line.replace('## Views: ', '').replace(',', '')
                video_data['view_count'] = int(views_str) if views_str.isdigit() else 0
            elif line.startswith('## Caption Status:'):
                video_data['has_caption_info'] = True

        videos.append(video_data)

    logger.info(f"Loaded {len(videos)} videos from existing file")
    return videos

def continue_caption_fetching():
    """Continue fetching captions from where we left off."""
    logger.info("=" * 60)
    logger.info("CONTINUING YOUTUBE CAPTION FETCHING")
    logger.info("=" * 60)

    # Load existing video data
    videos = load_existing_videos()

    if not videos:
        logger.error("No existing videos found to continue from")
        return False

    # Sort by view count (descending)
    videos.sort(key=lambda x: x.get('view_count', 0), reverse=True)

    # Count how many already have captions
    with_captions = sum(1 for v in videos if v.get('has_caption_info'))
    without_captions = [v for v in videos if not v.get('has_caption_info')]

    logger.info("Current status:")
    logger.info(f"  Total videos: {len(videos)}")
    logger.info(f"  Already have captions: {with_captions}")
    logger.info(f"  Need captions: {len(without_captions)}")

    # Calculate quota
    quota_used_so_far = 2519  # From previous run
    daily_limit = 10000
    target_usage = int(daily_limit * 0.95)  # 95% = 9,500 units
    available_quota = target_usage - quota_used_so_far

    logger.info("Quota analysis:")
    logger.info(f"  Daily limit: {daily_limit:,} units")
    logger.info(f"  Already used: {quota_used_so_far:,} units")
    logger.info(f"  Target (95%): {target_usage:,} units")
    logger.info(f"  Available: {available_quota:,} units")

    # Calculate how many more videos we can caption
    max_additional_captions = available_quota // 50  # 50 units per video
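    # Worked example with the numbers above: 9,500 target - 2,519 already used
    # = 6,981 units available; 6,981 // 50 = 139 more videos, which is the
    # "next 139 videos" figure in the module docstring.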
    videos_to_caption = without_captions[:max_additional_captions]

    logger.info("Caption plan:")
    logger.info(f"  Videos to caption now: {len(videos_to_caption)}")
    logger.info(f"  Estimated quota cost: {len(videos_to_caption) * 50:,} units")
    logger.info(f"  Will use: {quota_used_so_far + (len(videos_to_caption) * 50):,} units total")

    if not videos_to_caption:
        logger.info("No additional videos to caption within quota limits")
        return True

    # Set up scraper
    config = ScraperConfig(
        source_name='YouTube',
        brand_name='hvacnkowitall',
        data_dir=Path('data/markdown_current'),
        logs_dir=Path('logs/YouTube'),
        timezone='America/Halifax'
    )

    scraper = YouTubeAPIScraper(config)
    scraper.quota_used = quota_used_so_far  # Set initial quota usage
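    # Seeding quota_used above means the 95% cutoff check inside the loop
    # below counts the 2,519 units spent earlier today, not just this run.
    # (This assumes YouTubeAPIScraper increments quota_used on each API call,
    # which the cutoff check relies on.)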
logger.info(f"Starting caption fetching for {len(videos_to_caption)} videos...")
|
|
start_time = time.time()
|
|
|
|
captions_found = 0
|
|
for i, video in enumerate(videos_to_caption, 1):
|
|
video_id = video['id']
|
|
title = video.get('title', 'Unknown')[:50]
|
|
|
|
logger.info(f"[{i}/{len(videos_to_caption)}] Fetching caption for: {title}...")
|
|
|
|
# Fetch caption info
|
|
caption_info = scraper._fetch_caption_text(video_id)
|
|
|
|
if caption_info:
|
|
video['caption_text'] = caption_info
|
|
captions_found += 1
|
|
logger.info(f" ✅ Caption found")
|
|
else:
|
|
logger.info(f" ❌ No caption available")
|
|
|
|
# Add delay to be respectful
|
|
time.sleep(0.5)
|
|
|
|
# Check if we're approaching quota limit
|
|
if scraper.quota_used >= target_usage:
|
|
logger.warning(f"Reached 95% quota limit at video {i}")
|
|
break
|
|
|
|
    elapsed = time.time() - start_time

    logger.info("Caption fetching complete!")
    logger.info(f"  Duration: {elapsed:.1f} seconds")
    logger.info(f"  Captions found: {captions_found}")
    logger.info(f"  Quota used: {scraper.quota_used:,}/{daily_limit:,} units")
    logger.info(f"  Quota percentage: {(scraper.quota_used / daily_limit) * 100:.1f}%")
    # Update the video data with new caption info. The entries in
    # videos_to_caption are the same dict objects held in videos, so this
    # lookup pass is belt-and-braces rather than strictly required.
    video_lookup = {v['id']: v for v in videos}
    for video in videos_to_caption:
        if video['id'] in video_lookup and video.get('caption_text'):
            video_lookup[video['id']]['caption_text'] = video['caption_text']

    # Save updated data
    timestamp = datetime.now(pytz.timezone('America/Halifax')).strftime('%Y-%m-%dT%H%M%S')
    updated_filename = f"hvacnkowitall_YouTube_{timestamp}_captions.md"
    # Generate updated markdown (simplified version)
    markdown_sections = []
    for video in videos:
        section = []
        section.append(f"# ID: {video['id']}")
        section.append("")
        section.append(f"## Title: {video.get('title', 'Unknown')}")
        section.append("")
        section.append(f"## Views: {video.get('view_count', 0):,}")
        section.append("")

        # Caption status
        if video.get('caption_text'):
            section.append("## Caption Status:")
            section.append(video['caption_text'])
            section.append("")
        elif video.get('has_caption_info'):
            section.append("## Caption Status:")
            section.append("[Captions available - ]")
            section.append("")

        section.append("-" * 50)
        section.append("")
        markdown_sections.append('\n'.join(section))
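    # The sections above deliberately mirror the "# ID:" / "## Title:" /
    # "## Views:" / "## Caption Status:" layout that load_existing_videos()
    # parses, so the next day's run can resume from this file.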
    # Save updated file
    output_file = Path(f'data/markdown_current/{updated_filename}')
    output_file.write_text('\n'.join(markdown_sections), encoding='utf-8')

    logger.info(f"Updated file saved: {output_file}")

    # Calculate remaining work
    total_with_captions = with_captions + captions_found
    remaining_videos = len(videos) - total_with_captions

    logger.info("Progress summary:")
    logger.info(f"  Total videos: {len(videos)}")
    logger.info(f"  Captioned: {total_with_captions}")
    logger.info(f"  Remaining: {remaining_videos}")
    logger.info(f"  Progress: {(total_with_captions / len(videos)) * 100:.1f}%")
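    # The 190/day rate below is the quota math again: 9,500 usable units per
    # day // 50 units per caption fetch = 190 videos per day. For example,
    # with 179 of 444 videos captioned (the commit's current figures),
    # 444 - 179 = 265 remain, and ceil(265 / 190) = 2 days.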
    if remaining_videos > 0:
        # Ceiling division: a partial batch still costs a full day of quota
        days_needed = (remaining_videos // 190) + (1 if remaining_videos % 190 else 0)
        logger.info(f"  Estimated days to complete: {days_needed}")

    return True


if __name__ == "__main__":
    success = continue_caption_fetching()
    sys.exit(0 if success else 1)