hvac-kia-content/youtube_backlog_with_transcripts_slow.py

#!/usr/bin/env python3
"""
YouTube Backlog Capture with Transcripts - Slow Rate Limited Version

This script captures the complete YouTube channel backlog with transcripts
using extended delays to avoid YouTube's rate limiting on transcript fetching.

Designed for overnight/extended processing with minimal intervention required.
"""

import time
import random
import logging
from pathlib import Path
from src.base_scraper import ScraperConfig
from src.youtube_scraper import YouTubeScraper

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('logs_backlog_transcripts/youtube_slow_backlog.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

def main():
    """Execute slow YouTube backlog capture with transcripts."""

    print("=" * 80)
    print("YouTube Backlog Capture with Transcripts - SLOW VERSION")
    print("=" * 80)
    print()
    print("This script will:")
    print("- Capture ALL available YouTube videos (~370 videos)")
    print("- Download transcripts for each video")
    print("- Use extended delays (60-120 seconds between videos)")
    print("- Take 5-10 minute breaks every 5 videos")
    print("- Estimated completion time: 8-12 hours")
    print()

    # Get user confirmation
    confirm = input("This is a very long process. Continue? (y/N): ").strip().lower()
    if confirm != 'y':
        print("Cancelled.")
        return

    # Setup configuration for backlog processing
    config = ScraperConfig(
        source_name='youtube',
        brand_name='hvacknowitall',
        data_dir=Path('data_backlog_with_transcripts'),
        logs_dir=Path('logs_backlog_transcripts'),
        timezone='America/Halifax'
    )

    # Create directories
    config.data_dir.mkdir(parents=True, exist_ok=True)
    config.logs_dir.mkdir(parents=True, exist_ok=True)

    # Initialize scraper
    scraper = YouTubeScraper(config)

    # Clear any existing state to ensure full backlog
    if scraper.state_file.exists():
        scraper.state_file.unlink()
        logger.info("Cleared existing state for full backlog capture")

    # Override the backlog delay method with even more conservative delays
    original_backlog_delay = scraper._backlog_delay

    def ultra_conservative_delay(transcript_mode=False):
        """Ultra-conservative delays for transcript fetching."""
        if transcript_mode:
            # 60-120 seconds for transcript requests (much longer than original 30-90)
            base_delay = random.uniform(60, 120)
        else:
            # 30-60 seconds for basic video info (longer than original 10-30)
            base_delay = random.uniform(30, 60)

        # Add extra randomization
        jitter = random.uniform(0.9, 1.1)
        final_delay = base_delay * jitter

        logger.info(f"Ultra-conservative delay: {final_delay:.1f} seconds...")
        time.sleep(final_delay)

    # Replace the delay method
    scraper._backlog_delay = ultra_conservative_delay

    print("Starting YouTube backlog capture...")
    print("Monitor progress in logs_backlog_transcripts/youtube_slow_backlog.log")
    print()

    start_time = time.time()

    try:
        # Fetch content with transcripts (no max_posts = full backlog)
        videos = scraper.fetch_content(
            max_posts=None,  # Get all videos
            fetch_transcripts=True
        )

        # Format and save markdown
        if videos:
            markdown_content = scraper.format_markdown(videos)

            # Save to file
            output_file = config.data_dir / "youtube_backlog_with_transcripts.md"
            output_file.write_text(markdown_content, encoding='utf-8')

            logger.info(f"Saved {len(videos)} videos with transcripts to {output_file}")

            # Statistics
            total_duration = time.time() - start_time
            with_transcripts = sum(1 for v in videos if v.get('transcript'))
            total_views = sum(v.get('view_count', 0) for v in videos)

            print()
            print("=" * 80)
            print("YOUTUBE BACKLOG CAPTURE COMPLETED")
            print("=" * 80)
            print(f"Total videos captured: {len(videos)}")
            print(f"Videos with transcripts: {with_transcripts}")
            print(f"Success rate: {with_transcripts/len(videos)*100:.1f}%")
            print(f"Total views: {total_views:,}")
            print(f"Processing time: {total_duration/3600:.1f} hours")
            print(f"Output file: {output_file}")
            print("=" * 80)

        else:
            logger.error("No videos were captured")

    except KeyboardInterrupt:
        logger.info("Process interrupted by user")
        print("\nProcess interrupted. Partial results may be available.")

    except Exception as e:
        logger.error(f"Error during backlog capture: {e}")
        print(f"\nError occurred: {e}")

    finally:
        # Restore original delay method
        scraper._backlog_delay = original_backlog_delay

        total_time = time.time() - start_time
        print(f"\nTotal execution time: {total_time/3600:.1f} hours")

if __name__ == "__main__":
    main()