#!/usr/bin/env python3
"""
YouTube Backlog Capture with Transcripts - Slow Rate-Limited Version

This script captures the complete YouTube channel backlog with transcripts,
using extended delays to avoid YouTube's rate limiting on transcript fetching.
Designed for overnight/extended processing with minimal intervention required.
"""

import logging
import random
import time
from pathlib import Path

from src.base_scraper import ScraperConfig
from src.youtube_scraper import YouTubeScraper

# The log directory must exist before logging.basicConfig attaches the
# FileHandler below; otherwise the handler raises FileNotFoundError at
# import time, before main() has a chance to create any directories.
Path('logs_backlog_transcripts').mkdir(parents=True, exist_ok=True)

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('logs_backlog_transcripts/youtube_slow_backlog.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)


def main():
    """Execute slow YouTube backlog capture with transcripts."""
    print("=" * 80)
    print("YouTube Backlog Capture with Transcripts - SLOW VERSION")
    print("=" * 80)
    print()
    print("This script will:")
    print("- Capture ALL available YouTube videos (~370 videos)")
    print("- Download transcripts for each video")
    print("- Use extended delays (60-120 seconds between videos)")
    print("- Take 5-10 minute breaks every 5 videos")
    print("- Estimated completion time: 8-12 hours")
    print()

    # Get user confirmation
    confirm = input("This is a very long process. Continue? (y/N): ").strip().lower()
    if confirm != 'y':
        print("Cancelled.")
        return

    # Set up configuration for backlog processing
    config = ScraperConfig(
        source_name='youtube',
        brand_name='hvacknowitall',
        data_dir=Path('data_backlog_with_transcripts'),
        logs_dir=Path('logs_backlog_transcripts'),
        timezone='America/Halifax'
    )

    # Create output directories
    config.data_dir.mkdir(parents=True, exist_ok=True)
    config.logs_dir.mkdir(parents=True, exist_ok=True)

    # Initialize scraper
    scraper = YouTubeScraper(config)

    # Clear any existing state to ensure a full backlog capture
    if scraper.state_file.exists():
        scraper.state_file.unlink()
        logger.info("Cleared existing state for full backlog capture")

    # Override the backlog delay method with even more conservative delays.
    # This monkey-patch assumes the scraper calls _backlog_delay with the
    # same transcript_mode keyword the replacement defines below.
    original_backlog_delay = scraper._backlog_delay

    def ultra_conservative_delay(transcript_mode=False):
        """Ultra-conservative delays for transcript fetching."""
        if transcript_mode:
            # 60-120 seconds for transcript requests (much longer than the original 30-90)
            base_delay = random.uniform(60, 120)
        else:
            # 30-60 seconds for basic video info (longer than the original 10-30)
            base_delay = random.uniform(30, 60)

        # Add extra randomization
        jitter = random.uniform(0.9, 1.1)
        final_delay = base_delay * jitter

        logger.info(f"Ultra-conservative delay: {final_delay:.1f} seconds...")
        time.sleep(final_delay)

    # Replace the delay method
    scraper._backlog_delay = ultra_conservative_delay

    print("Starting YouTube backlog capture...")
    print("Monitor progress in logs_backlog_transcripts/youtube_slow_backlog.log")
    print()

    start_time = time.time()

    try:
        # Fetch content with transcripts (max_posts=None means full backlog)
        videos = scraper.fetch_content(
            max_posts=None,  # Get all videos
            fetch_transcripts=True
        )

        # Format and save markdown
        if videos:
            markdown_content = scraper.format_markdown(videos)

            # Save to file
            output_file = config.data_dir / "youtube_backlog_with_transcripts.md"
            output_file.write_text(markdown_content, encoding='utf-8')
            logger.info(f"Saved {len(videos)} videos with transcripts to {output_file}")

            # Statistics
            total_duration = time.time() - start_time
            with_transcripts = sum(1 for v in videos if v.get('transcript'))
            total_views = sum(v.get('view_count', 0) for v in videos)

            print()
            print("=" * 80)
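            # Run summary. This block only executes when videos is non-empty,
            # so the success-rate division below cannot divide by zero.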
print("YOUTUBE BACKLOG CAPTURE COMPLETED") print("=" * 80) print(f"Total videos captured: {len(videos)}") print(f"Videos with transcripts: {with_transcripts}") print(f"Success rate: {with_transcripts/len(videos)*100:.1f}%") print(f"Total views: {total_views:,}") print(f"Processing time: {total_duration/3600:.1f} hours") print(f"Output file: {output_file}") print("=" * 80) else: logger.error("No videos were captured") except KeyboardInterrupt: logger.info("Process interrupted by user") print("\nProcess interrupted. Partial results may be available.") except Exception as e: logger.error(f"Error during backlog capture: {e}") print(f"\nError occurred: {e}") finally: # Restore original delay method scraper._backlog_delay = original_backlog_delay total_time = time.time() - start_time print(f"\nTotal execution time: {total_time/3600:.1f} hours") if __name__ == "__main__": main()