Major improvements:
- Add CumulativeMarkdownManager for intelligent content merging
- Implement YouTube Data API v3 integration with caption support
- Add MailChimp API integration with content cleaning
- Create single source-of-truth files that grow with updates
- Smart merging: updates existing entries with better data
- Properly combines backlog + incremental daily updates

Features:
- 179/444 YouTube videos now have captions (40.3%)
- MailChimp content cleaned of headers/footers
- All sources consolidated to single files
- Archive management with timestamped versions
- Test suite and documentation included

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
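The "smart merging" rule described above can be pictured with a short sketch. `CumulativeMarkdownManager` itself is not part of this commit's file below, so the names and merge policy here (`merge_entries`, `richer`) are illustrative assumptions, not the shipped implementation:

```python
# Hedged sketch of "updates existing entries with better data": keep one
# entry per ID and only overwrite a field when the incoming value is richer.
# merge_entries/richer are hypothetical names, not the real manager's API.
def richer(new_value, old_value):
    """Prefer the new value only when it carries at least as much data."""
    if not new_value:
        return old_value
    if not old_value:
        return new_value
    return new_value if len(str(new_value)) >= len(str(old_value)) else old_value


def merge_entries(existing: dict, updates: dict) -> dict:
    """Merge {video_id: fields} maps so the source-of-truth file only grows."""
    merged = {vid: dict(fields) for vid, fields in existing.items()}
    for vid, fields in updates.items():
        target = merged.setdefault(vid, {})
        for key, value in fields.items():
            target[key] = richer(value, target.get(key))
    return merged
```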
229 lines | No EOL | 8 KiB | Python
#!/usr/bin/env python3
"""
Continue YouTube caption fetching using remaining quota.

Fetches captions for videos 50-188 (the next 139 videos by view count).
Uses up to 95% of the daily quota (9,500 units).
"""

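# NOTE: the data/ and logs/ paths below are relative, so run this script from
# the project root; the sys.path tweak below keeps the src/ imports working
# regardless of the current working directory.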
import sys
import time
import logging
from datetime import datetime
from pathlib import Path

import pytz

sys.path.insert(0, str(Path(__file__).parent))

from src.youtube_api_scraper_v2 import YouTubeAPIScraper
from src.base_scraper import ScraperConfig

# Set up logging (create logs/ first so the FileHandler does not fail on a
# fresh checkout)
Path('logs').mkdir(parents=True, exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('logs/youtube_caption_continue.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger('youtube_captions')

def load_existing_videos():
    """Load existing video data from the latest markdown file."""
    latest_file = Path('data/markdown_current/hvacnkowitall_YouTube_2025-08-19T100336.md')

    if not latest_file.exists():
        logger.error(f"Latest YouTube file not found: {latest_file}")
        return []

    # Parse the markdown to extract video data
    content = latest_file.read_text(encoding='utf-8')
    videos = []

    # Simple parsing - split by video sections
    sections = content.split('# ID: ')
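    # The split above assumes sections shaped like the markdown this pipeline
    # writes (the video ID below is illustrative, not real data):
    #
    #   # ID: dQw4w9WgXcQ
    #
    #   ## Title: Some video title
    #
    #   ## Views: 12,345
    #
    #   ## Caption Status:
    #   ...caption text...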
    for section in sections[1:]:  # Skip first empty section
        lines = section.strip().split('\n')
        if not lines:
            continue

        video_id = lines[0].strip()
        video_data = {'id': video_id}

        # Parse basic info
        for line in lines:
            if line.startswith('## Title: '):
                video_data['title'] = line.replace('## Title: ', '')
            elif line.startswith('## Views: '):
                views_str = line.replace('## Views: ', '').replace(',', '')
                video_data['view_count'] = int(views_str) if views_str.isdigit() else 0
            elif line.startswith('## Caption Status:'):
                video_data['has_caption_info'] = True

        videos.append(video_data)

    logger.info(f"Loaded {len(videos)} videos from existing file")
    return videos

def continue_caption_fetching():
    """Continue fetching captions from where we left off."""
    logger.info("=" * 60)
    logger.info("CONTINUING YOUTUBE CAPTION FETCHING")
    logger.info("=" * 60)

    # Load existing video data
    videos = load_existing_videos()

    if not videos:
        logger.error("No existing videos found to continue from")
        return False

    # Sort by view count (descending)
    videos.sort(key=lambda x: x.get('view_count', 0), reverse=True)

    # Count how many already have captions
    with_captions = sum(1 for v in videos if v.get('has_caption_info'))
    without_captions = [v for v in videos if not v.get('has_caption_info')]

    logger.info("Current status:")
    logger.info(f"  Total videos: {len(videos)}")
    logger.info(f"  Already have captions: {with_captions}")
    logger.info(f"  Need captions: {len(without_captions)}")

    # Calculate quota
    quota_used_so_far = 2519  # From previous run
    daily_limit = 10000
    target_usage = int(daily_limit * 0.95)  # 95% = 9,500 units
    available_quota = target_usage - quota_used_so_far

    logger.info("Quota analysis:")
    logger.info(f"  Daily limit: {daily_limit:,} units")
    logger.info(f"  Already used: {quota_used_so_far:,} units")
    logger.info(f"  Target (95%): {target_usage:,} units")
    logger.info(f"  Available: {available_quota:,} units")

    # Calculate how many more videos we can caption
    max_additional_captions = available_quota // 50  # 50 units per video
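    # Worked example with the numbers above: 9,500 target - 2,519 already used
    # = 6,981 units available; 6,981 // 50 = 139 more videos, which is the
    # "next 139 videos" figure in the module docstring.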
    videos_to_caption = without_captions[:max_additional_captions]

    logger.info("Caption plan:")
    logger.info(f"  Videos to caption now: {len(videos_to_caption)}")
    logger.info(f"  Estimated quota cost: {len(videos_to_caption) * 50:,} units")
    logger.info(f"  Will use: {quota_used_so_far + (len(videos_to_caption) * 50):,} units total")

    if not videos_to_caption:
        logger.info("No additional videos to caption within quota limits")
        return True

    # Set up scraper
    config = ScraperConfig(
        source_name='YouTube',
        brand_name='hvacnkowitall',
        data_dir=Path('data/markdown_current'),
        logs_dir=Path('logs/YouTube'),
        timezone='America/Halifax'
    )

    scraper = YouTubeAPIScraper(config)
    scraper.quota_used = quota_used_so_far  # Set initial quota usage
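    # Seeding quota_used above means the 95% cutoff check inside the loop
    # below counts the 2,519 units spent earlier today, not just this run.
    # (This assumes YouTubeAPIScraper increments quota_used on each API call,
    # which the cutoff check relies on.)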
logger.info(f"Starting caption fetching for {len(videos_to_caption)} videos...")
|
|
start_time = time.time()
|
|
|
|
captions_found = 0
|
|
for i, video in enumerate(videos_to_caption, 1):
|
|
video_id = video['id']
|
|
title = video.get('title', 'Unknown')[:50]
|
|
|
|
logger.info(f"[{i}/{len(videos_to_caption)}] Fetching caption for: {title}...")
|
|
|
|
# Fetch caption info
|
|
caption_info = scraper._fetch_caption_text(video_id)
|
|
|
|
if caption_info:
|
|
video['caption_text'] = caption_info
|
|
captions_found += 1
|
|
logger.info(f" ✅ Caption found")
|
|
else:
|
|
logger.info(f" ❌ No caption available")
|
|
|
|
# Add delay to be respectful
|
|
time.sleep(0.5)
|
|
|
|
# Check if we're approaching quota limit
|
|
if scraper.quota_used >= target_usage:
|
|
logger.warning(f"Reached 95% quota limit at video {i}")
|
|
break
|
|
|
|
    elapsed = time.time() - start_time

    logger.info("Caption fetching complete!")
    logger.info(f"  Duration: {elapsed:.1f} seconds")
    logger.info(f"  Captions found: {captions_found}")
    logger.info(f"  Quota used: {scraper.quota_used:,}/{daily_limit:,} units")
    logger.info(f"  Quota percentage: {(scraper.quota_used / daily_limit) * 100:.1f}%")
    # Update the video data with new caption info. The entries in
    # videos_to_caption are the same dict objects held in videos, so this
    # lookup pass is belt-and-braces rather than strictly required.
    video_lookup = {v['id']: v for v in videos}
    for video in videos_to_caption:
        if video['id'] in video_lookup and video.get('caption_text'):
            video_lookup[video['id']]['caption_text'] = video['caption_text']

    # Save updated data
    timestamp = datetime.now(pytz.timezone('America/Halifax')).strftime('%Y-%m-%dT%H%M%S')
    updated_filename = f"hvacnkowitall_YouTube_{timestamp}_captions.md"
    # Generate updated markdown (simplified version)
    markdown_sections = []
    for video in videos:
        section = []
        section.append(f"# ID: {video['id']}")
        section.append("")
        section.append(f"## Title: {video.get('title', 'Unknown')}")
        section.append("")
        section.append(f"## Views: {video.get('view_count', 0):,}")
        section.append("")

        # Caption status
        if video.get('caption_text'):
            section.append("## Caption Status:")
            section.append(video['caption_text'])
            section.append("")
        elif video.get('has_caption_info'):
            section.append("## Caption Status:")
            section.append("[Captions available - ]")
            section.append("")

        section.append("-" * 50)
        section.append("")
        markdown_sections.append('\n'.join(section))
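    # The sections above deliberately mirror the "# ID:" / "## Title:" /
    # "## Views:" / "## Caption Status:" layout that load_existing_videos()
    # parses, so the next day's run can resume from this file.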
    # Save updated file
    output_file = Path(f'data/markdown_current/{updated_filename}')
    output_file.write_text('\n'.join(markdown_sections), encoding='utf-8')

    logger.info(f"Updated file saved: {output_file}")

    # Calculate remaining work
    total_with_captions = with_captions + captions_found
    remaining_videos = len(videos) - total_with_captions

    logger.info("Progress summary:")
    logger.info(f"  Total videos: {len(videos)}")
    logger.info(f"  Captioned: {total_with_captions}")
    logger.info(f"  Remaining: {remaining_videos}")
    logger.info(f"  Progress: {(total_with_captions / len(videos)) * 100:.1f}%")
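    # The 190/day rate below is the quota math again: 9,500 usable units per
    # day // 50 units per caption fetch = 190 videos per day. For example,
    # with 179 of 444 videos captioned (the commit's current figures),
    # 444 - 179 = 265 remain, and ceil(265 / 190) = 2 days.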
    if remaining_videos > 0:
        # Ceiling division: a partial batch still costs a full day of quota
        days_needed = (remaining_videos // 190) + (1 if remaining_videos % 190 else 0)
        logger.info(f"  Estimated days to complete: {days_needed}")

    return True


if __name__ == "__main__":
    success = continue_caption_fetching()
    sys.exit(0 if success else 1)