Major Changes: - Updated all code references from hvacknowitall/hvacnkowitall to hkia - Renamed all existing markdown files to use hkia_ prefix - Updated configuration files, scrapers, and production scripts - Modified systemd service descriptions to use HKIA - Changed NAS sync path to /mnt/nas/hkia Files Updated: - 20+ source files updated with new naming convention - 34 markdown files renamed to hkia_* format - All ScraperConfig brand_name parameters now use 'hkia' - Documentation updated to reflect new naming Rationale: - Shorter, cleaner filenames - Consistent branding across all outputs - Easier to type and reference - Maintains same functionality with improved naming Next Steps: - Deploy updated services to production - Update any external references to old naming - Monitor scrapers to ensure proper operation 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
144 lines
No EOL
5.2 KiB
Python
144 lines
No EOL
5.2 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Fetch 100 YouTube videos with transcripts for backlog processing
|
|
This will capture the first 100 videos with full transcript extraction
|
|
"""
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
sys.path.insert(0, str(Path(__file__).parent))
|
|
|
|
from src.base_scraper import ScraperConfig
|
|
from src.youtube_scraper import YouTubeScraper
|
|
from datetime import datetime
|
|
import logging
|
|
import time
|
|
|
|
# Logging configuration: mirror every message to a dedicated log file
# and to the console so progress is visible while a full record is kept.
LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

logging.basicConfig(
    level=logging.INFO,
    format=LOG_FORMAT,
    handlers=[
        logging.FileHandler('youtube_100_transcripts.log'),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(__name__)
|
|
|
|
def fetch_100_with_transcripts():
    """Fetch 100 YouTube videos with transcripts for the backlog.

    Builds a backlog-specific ScraperConfig, verifies YouTube cookie
    authentication, fetches up to 100 videos with transcript extraction,
    writes the combined markdown into
    ``data_production_backlog/markdown_current/``, and logs summary
    statistics plus a preview of the first three transcripts.

    Returns:
        bool: True on success; False if authentication is missing, no
        videos were fetched, or an unexpected error occurred.
    """
    logger.info("🎥 YOUTUBE BACKLOG: Fetching 100 videos WITH TRANSCRIPTS")
    logger.info("This will take approximately 5-8 minutes (3-5 seconds per video)")
    logger.info("=" * 70)

    # Create config for backlog processing.
    # Brand was renamed repo-wide from 'hvacknowitall' to 'hkia' (see the
    # rename changelog at the top of this file); this script had been missed.
    config = ScraperConfig(
        source_name="youtube",
        brand_name="hkia",
        data_dir=Path("data_production_backlog"),
        logs_dir=Path("logs_production_backlog"),
        timezone="America/Halifax",
    )

    # Initialize scraper
    scraper = YouTubeScraper(config)

    # Test authentication first — transcript fetching requires valid cookies.
    auth_status = scraper.auth_handler.get_status()
    if not auth_status['has_valid_cookies']:
        logger.error("❌ No valid YouTube authentication found")
        logger.error("Please ensure you're logged into YouTube in Firefox")
        return False

    logger.info(f"✅ Authentication validated: {auth_status['cookie_path']}")

    # Fetch 100 videos with transcripts using the enhanced method.
    logger.info("Fetching 100 videos with transcripts...")
    start_time = time.time()

    try:
        videos = scraper.fetch_content(max_posts=100, fetch_transcripts=True)

        if not videos:
            logger.error("❌ No videos fetched")
            return False

        # Transcript coverage statistics. NOTE(review): assumes each video
        # is a dict with optional 'transcript'/'title' string keys — TODO
        # confirm against YouTubeScraper.fetch_content.
        transcript_count = sum(1 for video in videos if video.get('transcript'))
        total_transcript_chars = sum(len(video.get('transcript', '')) for video in videos)

        # Generate markdown
        logger.info("\nGenerating markdown with transcripts...")
        markdown = scraper.format_markdown(videos)

        # Save with a timestamped filename using the new 'hkia' prefix
        # (was 'hvacknowitall_...', inconsistent with the repo-wide rename).
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"hkia_youtube_backlog_100_transcripts_{timestamp}.md"

        output_dir = config.data_dir / "markdown_current"
        output_dir.mkdir(parents=True, exist_ok=True)
        output_file = output_dir / filename

        output_file.write_text(markdown, encoding='utf-8')

        # Calculate duration
        duration = time.time() - start_time

        # Final statistics. len(videos) is non-zero here (guarded above),
        # and the average-length division is guarded against zero transcripts.
        logger.info("\n" + "=" * 70)
        logger.info("🎉 YOUTUBE BACKLOG CAPTURE COMPLETE")
        logger.info("📊 STATISTICS:")
        logger.info(f"   Total videos fetched: {len(videos)}")
        logger.info(f"   Videos with transcripts: {transcript_count}")
        logger.info(f"   Transcript success rate: {transcript_count/len(videos)*100:.1f}%")
        logger.info(f"   Total transcript characters: {total_transcript_chars:,}")
        logger.info(f"   Average transcript length: {total_transcript_chars/transcript_count if transcript_count > 0 else 0:,.0f} chars")
        logger.info(f"   Processing time: {duration/60:.1f} minutes")
        logger.info(f"   Average time per video: {duration/len(videos):.1f} seconds")
        logger.info(f"📄 Saved to: {output_file}")

        # Show sample transcript info for a quick sanity check.
        logger.info("\n📝 SAMPLE TRANSCRIPT DATA:")
        for i, video in enumerate(videos[:3]):
            title = video.get('title', 'Unknown')[:50] + "..."
            transcript = video.get('transcript', '')
            if transcript:
                logger.info(f"   {i+1}. {title} - {len(transcript):,} chars")
                preview = transcript[:100] + "..." if len(transcript) > 100 else transcript
                logger.info(f"      Preview: {preview}")
            else:
                logger.info(f"   {i+1}. {title} - No transcript")

        return True

    except Exception as e:
        # Broad catch is intentional for a one-shot CLI script: log the
        # failure and signal it via the return value instead of a traceback.
        logger.error(f"❌ Failed to fetch videos: {e}")
        return False
|
|
|
|
def main():
    """Show the run summary, wait for confirmation, then run the capture.

    Returns False if the user cancels at the prompt (Ctrl+C); otherwise
    returns whatever fetch_100_with_transcripts() returns.
    """
    banner = (
        "\n🎥 YouTube Backlog Capture with Transcripts",
        "=" * 50,
        "This will fetch 100 YouTube videos with full transcripts",
        "Estimated time: 5-8 minutes",
        "Output: Markdown file with videos and complete transcripts",
        "\nPress Enter to continue or Ctrl+C to cancel...",
    )
    for line in banner:
        print(line)

    try:
        input()
    except KeyboardInterrupt:
        print("\nCancelled by user")
        return False

    return fetch_100_with_transcripts()
|
|
|
|
if __name__ == "__main__":
    # Exit codes: 0 = success, 1 = failure or user interrupt, 2 = crash.
    try:
        sys.exit(0 if main() else 1)
    except KeyboardInterrupt:
        logger.info("\nCapture interrupted by user")
        sys.exit(1)
    except Exception as e:
        # SystemExit is not an Exception subclass, so the exits above
        # pass through this handler untouched.
        logger.critical(f"Capture failed: {e}")
        sys.exit(2)