hvac-kia-content/youtube_slow_backlog_with_transcripts.py

#!/usr/bin/env python3
"""
YouTube Slow Backlog Capture: ALL VIDEOS with Transcripts
Extended delays to avoid rate limiting - expected duration: 6-8 hours
"""

import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent))

from src.base_scraper import ScraperConfig
from src.youtube_scraper import YouTubeScraper
from datetime import datetime, timedelta
import logging
import time

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('youtube_slow_backlog_transcripts.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

def estimate_completion_time(total_videos: int):
    """Estimate completion time with extended delays."""
    # Per video: 30-90 seconds delay + 3-5 seconds processing = ~60 seconds average
    avg_time_per_video = 60  # seconds

    # Extra breaks: every 5 videos, 2-5 minutes (3.5 min average)
    breaks_count = total_videos // 5
    break_time = breaks_count * 3.5 * 60  # seconds

    total_seconds = (total_videos * avg_time_per_video) + break_time
    total_hours = total_seconds / 3600

    estimated_completion = datetime.now() + timedelta(seconds=total_seconds)

    logger.info(f"📊 TIME ESTIMATION:")
    logger.info(f"  Videos to process: {total_videos}")
    logger.info(f"  Average time per video: {avg_time_per_video} seconds")
    logger.info(f"  Extended breaks: {breaks_count} breaks x 3.5 min = {break_time/60:.0f} minutes")
    logger.info(f"  Total estimated time: {total_hours:.1f} hours")
    logger.info(f"  Estimated completion: {estimated_completion.strftime('%Y-%m-%d %H:%M:%S')}")

    return total_hours

def test_authentication_with_retry():
    """Test authentication with retry after rate limiting."""
    logger.info("🔐 Testing YouTube authentication with rate limit recovery...")

    config = ScraperConfig(
        source_name="youtube_test",
        brand_name="hvacknowitall",
        data_dir=Path("test_data/auth_retry_test"),
        logs_dir=Path("test_logs/auth_retry_test"),
        timezone="America/Halifax"
    )

    scraper = YouTubeScraper(config)
    max_retries = 3

    for attempt in range(max_retries):
        try:
            # Test with single video
            logger.info(f"Authentication test attempt {attempt + 1}/{max_retries}...")
            test_video = scraper.fetch_video_details("TpdYT_itu9U", fetch_transcript=True)

            if test_video and test_video.get('transcript'):
                logger.info(f"✅ Authentication and transcript test passed (attempt {attempt + 1})")
                return True
            elif test_video:
                logger.info(f"✅ Authentication passed, but no transcript (rate limited)")
                logger.info("This is expected - transcript fetching will resume with delays")
                return True
            else:
                logger.warning(f"❌ Authentication test failed (attempt {attempt + 1})")

        except Exception as e:
            logger.warning(f"Authentication test error (attempt {attempt + 1}): {e}")

        if attempt < max_retries - 1:
            retry_delay = (attempt + 1) * 60  # 1, 2, 3 minutes
            logger.info(f"Waiting {retry_delay} seconds before retry...")
            time.sleep(retry_delay)

    logger.error("❌ All authentication attempts failed")
    return False

def fetch_slow_backlog_with_transcripts():
    """Fetch ALL YouTube videos with transcripts using extended delays."""
    logger.info("🐌 YOUTUBE SLOW BACKLOG: All videos with transcripts and extended delays")
    logger.info("This process is designed to avoid rate limiting over 6-8 hours")
    logger.info("=" * 75)

    # Create config for production backlog
    config = ScraperConfig(
        source_name="youtube",
        brand_name="hvacknowitall",
        data_dir=Path("data_production_backlog"),
        logs_dir=Path("logs_production_backlog"),
        timezone="America/Halifax"
    )

    # Initialize scraper
    scraper = YouTubeScraper(config)

    # First get video count for estimation
    logger.info("Getting video count for time estimation...")
    video_list = scraper.fetch_channel_videos()
    if not video_list:
        logger.error("❌ Could not fetch video list")
        return False

    # Show time estimation
    estimate_completion_time(len(video_list))

    # Clear any existing state for full backlog
    if scraper.state_file.exists():
        scraper.state_file.unlink()
        logger.info("Cleared existing state for full backlog capture")

    start_time = time.time()

    try:
        # Fetch ALL videos with transcripts using slow mode (no max_posts = backlog mode)
        logger.info("\nStarting slow backlog capture with transcripts...")
        logger.info("Using extended delays: 30-90 seconds between videos + 2-5 minute breaks every 5 videos")

        videos = scraper.fetch_content(fetch_transcripts=True)  # No max_posts = slow backlog mode

        if not videos:
            logger.error("❌ No videos fetched")
            return False

        # Count videos with transcripts
        transcript_count = sum(1 for video in videos if video.get('transcript'))
        total_transcript_chars = sum(len(video.get('transcript', '')) for video in videos)

        # Generate markdown
        logger.info("\nGenerating comprehensive markdown with transcripts...")
        markdown = scraper.format_markdown(videos)

        # Save with timestamp
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"hvacknowitall_youtube_slow_backlog_transcripts_{timestamp}.md"

        output_dir = config.data_dir / "markdown_current"
        output_dir.mkdir(parents=True, exist_ok=True)
        output_file = output_dir / filename

        output_file.write_text(markdown, encoding='utf-8')

        # Calculate final stats
        duration = time.time() - start_time
        avg_time_per_video = duration / len(videos)

        # Final statistics
        logger.info("\n" + "=" * 75)
        logger.info("🎉 SLOW YOUTUBE BACKLOG CAPTURE COMPLETE")
        logger.info(f"📊 FINAL STATISTICS:")
        logger.info(f"  Total videos processed: {len(videos)}")
        logger.info(f"  Videos with transcripts: {transcript_count}")
        logger.info(f"  Transcript success rate: {transcript_count/len(videos)*100:.1f}%")
        logger.info(f"  Total transcript characters: {total_transcript_chars:,}")
        logger.info(f"  Average transcript length: {total_transcript_chars/transcript_count if transcript_count > 0 else 0:,.0f} chars")
        logger.info(f"  Total processing time: {duration/3600:.1f} hours")
        logger.info(f"  Average time per video: {avg_time_per_video:.0f} seconds")
        logger.info(f"  Markdown file size: {output_file.stat().st_size / 1024 / 1024:.1f} MB")
        logger.info(f"📄 Saved to: {output_file}")

        # Success validation
        if len(videos) >= 300:  # Expect at least 300 videos
            logger.info(f"✅ SUCCESS: Captured {len(videos)} videos - full backlog complete")
        else:
            logger.warning(f"⚠️  Only {len(videos)} videos captured, expected ~370")

        if transcript_count >= len(videos) * 0.8:  # Expect 80%+ transcript success
            logger.info(f"✅ SUCCESS: {transcript_count/len(videos)*100:.1f}% transcript success rate")
        else:
            logger.warning(f"⚠️  Only {transcript_count/len(videos)*100:.1f}% transcript success")

        # Show transcript samples
        logger.info(f"\n📝 TRANSCRIPT SAMPLES:")
        transcript_videos = [v for v in videos if v.get('transcript')][:3]
        for i, video in enumerate(transcript_videos):
            title = video.get('title', 'Unknown')[:40] + "..."
            transcript = video.get('transcript', '')
            logger.info(f"  {i+1}. {title}")
            logger.info(f"     Length: {len(transcript):,} chars")
            preview = transcript[:80] + "..." if len(transcript) > 80 else transcript
            logger.info(f"     Preview: {preview}")

        return True

    except Exception as e:
        logger.error(f"❌ Slow backlog capture failed: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return False

def main():
    """Main execution with slow processing and time estimation."""
    print("\n🐌 YouTube Slow Backlog Capture with Transcripts")
    print("=" * 55)
    print("Extended delays to avoid rate limiting")
    print("Expected duration: 6-8 hours")
    print("Processing ~370 videos with 30-90 second delays + breaks")

    # Step 1: Test authentication with retry
    print("\nStep 1: Testing authentication with rate limit recovery...")
    if not test_authentication_with_retry():
        print("❌ Authentication failed after retries. Cannot proceed.")
        return False

    print("✅ Authentication validated")

    # Step 2: Show time commitment warning
    print(f"\nStep 2: Time commitment warning")
    print("⚠️  This process will take 6-8 hours to complete")
    print("⚠️  The process will run with 30-90 second delays between videos")
    print("⚠️  Extended 2-5 minute breaks every 5 videos")
    print("⚠️  This is necessary to avoid YouTube rate limiting")

    print("\nPress Enter to start slow backlog capture or Ctrl+C to cancel...")

    try:
        input()
    except KeyboardInterrupt:
        print("\nCancelled by user")
        return False

    # Step 3: Execute slow backlog
    return fetch_slow_backlog_with_transcripts()

if __name__ == "__main__":
    try:
        success = main()
        sys.exit(0 if success else 1)
    except KeyboardInterrupt:
        logger.info("\nSlow backlog capture interrupted by user")
        sys.exit(1)
    except Exception as e:
        logger.critical(f"Slow backlog capture failed: {e}")
        sys.exit(2)