Major Changes:
- Updated all code references from hvacknowitall/hvacnkowitall to hkia
- Renamed all existing markdown files to use the hkia_ prefix
- Updated configuration files, scrapers, and production scripts
- Modified systemd service descriptions to use HKIA
- Changed the NAS sync path to /mnt/nas/hkia

Files Updated:
- 20+ source files updated with the new naming convention
- 34 markdown files renamed to the hkia_* format
- All ScraperConfig brand_name parameters now use 'hkia'
- Documentation updated to reflect the new naming

Rationale:
- Shorter, cleaner filenames
- Consistent branding across all outputs
- Easier to type and reference
- Maintains the same functionality with improved naming

Next Steps:
- Deploy the updated services to production
- Update any external references to the old naming
- Monitor the scrapers to ensure proper operation

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
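For reference, the brand_name rename is a one-field change per scraper config. The sketch below is illustrative rather than a diff from the actual commit: it assumes brand_name is what the scrapers use to prefix their output files, and it reuses the field values from the script that follows, which may differ for other scrapers.

from pathlib import Path

from src.base_scraper import ScraperConfig

# Before the rename: outputs carried the long brand prefix,
# e.g. hvacknowitall_youtube_transcripts_<timestamp>.md
old_config = ScraperConfig(
    source_name="youtube",
    brand_name="hvacknowitall",
    data_dir=Path("data_production_backlog"),
    logs_dir=Path("logs_production_backlog"),
    timezone="America/Halifax"
)

# After the rename: outputs use the shorter hkia prefix,
# e.g. hkia_youtube_transcripts_<timestamp>.md
new_config = ScraperConfig(
    source_name="youtube",
    brand_name="hkia",
    data_dir=Path("data_production_backlog"),
    logs_dir=Path("logs_production_backlog"),
    timezone="America/Halifax"
)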
#!/usr/bin/env python3
"""
Fetch YouTube videos with transcripts.

This will take longer as it needs to fetch each video individually.
"""

import logging
import sys
import time
from datetime import datetime
from pathlib import Path

# Make the project root importable before pulling in the src modules
sys.path.insert(0, str(Path(__file__).parent))

from src.base_scraper import ScraperConfig
from src.youtube_scraper import YouTubeScraper

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('youtube_transcripts.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)


def fetch_with_transcripts(max_videos: int = 10):
    """Fetch YouTube videos with transcripts"""
    logger.info("🎥 Fetching YouTube videos WITH TRANSCRIPTS")
    logger.info(f"This will fetch detailed info and transcripts for {max_videos} videos")
    logger.info("Note: This is slower as each video requires individual API calls")
    logger.info("=" * 60)

    # Create config (brand_name 'hkia' matches the naming convention
    # described in the commit message above)
    config = ScraperConfig(
        source_name="youtube",
        brand_name="hkia",
        data_dir=Path("data_production_backlog"),
        logs_dir=Path("logs_production_backlog"),
        timezone="America/Halifax"
    )

    # Initialize scraper
    scraper = YouTubeScraper(config)

    # First get video list (fast)
    logger.info("Step 1: Fetching video list from channel...")
    videos = scraper.fetch_channel_videos(max_videos=max_videos)

    if not videos:
        logger.error("No videos found")
        return False

    logger.info(f"Found {len(videos)} videos")

    # Now fetch detailed info with transcripts for each video
    logger.info("\nStep 2: Fetching transcripts for each video...")
    logger.info("This will take approximately 3-5 seconds per video")

    videos_with_transcripts = []
    transcript_count = 0

    for i, video in enumerate(videos):
        video_id = video.get('id')
        if not video_id:
            continue

        logger.info(f"\n[{i+1}/{len(videos)}] Processing: {video.get('title', 'Unknown')[:60]}...")

        # Add delay to avoid rate limiting
        if i > 0:
            scraper._humanized_delay(2, 4)

        # Fetch with transcript
        detailed_info = scraper.fetch_video_details(video_id, fetch_transcript=True)

        if detailed_info:
            if detailed_info.get('transcript'):
                transcript_count += 1
                logger.info("  ✅ Transcript found!")
            else:
                logger.info("  ⚠️ No transcript available")

            videos_with_transcripts.append(detailed_info)
        else:
            logger.warning("  ❌ Failed to fetch details")
            # Use basic info if detailed fetch fails
            videos_with_transcripts.append(video)

        # Extra delay every 10 videos
        if (i + 1) % 10 == 0:
            logger.info("Taking extended break after 10 videos...")
            time.sleep(10)

    # Generate markdown
    logger.info("\nStep 3: Generating markdown...")
    markdown = scraper.format_markdown(videos_with_transcripts)

    # Save with timestamp, using the hkia_ filename prefix
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"hkia_youtube_transcripts_{timestamp}.md"

    output_dir = config.data_dir / "markdown_current"
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / filename

    output_file.write_text(markdown, encoding='utf-8')
    logger.info(f"📄 Saved to: {output_file}")

    # Statistics
    logger.info("\n" + "=" * 60)
    logger.info("📊 YOUTUBE TRANSCRIPT CAPTURE COMPLETE")
    logger.info(f"Total videos: {len(videos_with_transcripts)}")
    logger.info(f"Videos with transcripts: {transcript_count}")
    if videos_with_transcripts:
        # Guard against division by zero if every video was skipped
        logger.info(f"Success rate: {transcript_count / len(videos_with_transcripts) * 100:.1f}%")

    return True


def main():
    """Main execution"""
    print("\n⚠️ WARNING: Fetching transcripts requires individual API calls for each video")
    print("This will take approximately 3-5 seconds per video")
    print("Estimated time for 370 videos: 20-30 minutes")
    print("\nOptions:")
    print("1. Test with 5 videos first")
    print("2. Fetch first 50 videos with transcripts")
    print("3. Fetch all 370 videos with transcripts (20-30 mins)")
    print("4. Cancel")

    choice = input("\nEnter choice (1-4): ")

    if choice == "1":
        return fetch_with_transcripts(5)
    elif choice == "2":
        return fetch_with_transcripts(50)
    elif choice == "3":
        return fetch_with_transcripts(370)
    else:
        print("Cancelled")
        return False


if __name__ == "__main__":
    try:
        success = main()
        sys.exit(0 if success else 1)
    except KeyboardInterrupt:
        logger.info("\nCapture interrupted by user")
        sys.exit(1)
    except Exception as e:
        logger.critical(f"Capture failed: {e}")
        sys.exit(2)
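As a usage note: the listing above does not name the script file, so the module name below is an assumption. A quick non-interactive smoke test after the rename could bypass the menu in main() and call the worker function directly:

# Hypothetical smoke test; assumes the script above is saved as
# fetch_youtube_transcripts.py in the project root, next to src/.
from fetch_youtube_transcripts import fetch_with_transcripts

if fetch_with_transcripts(max_videos=5):
    print("Smoke test passed: markdown written under data_production_backlog/markdown_current/")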