#!/usr/bin/env python3
"""
YouTube Slow Backlog Capture: ALL VIDEOS with Transcripts

Extended delays to avoid rate limiting - expected duration: 6-8 hours
"""

import logging
import sys
import time
from datetime import datetime, timedelta
from pathlib import Path

# Put this script's directory on sys.path before importing from ``src`` so the
# imports below resolve when the file is executed directly.
sys.path.insert(0, str(Path(__file__).parent))

from src.base_scraper import ScraperConfig
from src.youtube_scraper import YouTubeScraper

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        # Explicit UTF-8: most messages below contain non-ASCII symbols, which
        # a platform-default (non-UTF-8) encoding could fail to write.
        logging.FileHandler('youtube_slow_backlog_transcripts.log', encoding='utf-8'),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(__name__)


def estimate_completion_time(total_videos: int) -> float:
    """Log and return a rough runtime estimate for processing *total_videos*.

    The estimate assumes 60 seconds per video plus one 3.5-minute break for
    every 5 videos.

    Args:
        total_videos: Number of videos that will be processed.

    Returns:
        The estimated total duration in hours.
    """
    # Per video: 30-90 seconds delay + 3-5 seconds processing = ~60 seconds average
    avg_time_per_video = 60  # seconds

    # Extra breaks: every 5 videos, 2-5 minutes (3.5 min average)
    breaks_count = total_videos // 5
    break_time = breaks_count * 3.5 * 60  # seconds

    total_seconds = (total_videos * avg_time_per_video) + break_time
    total_hours = total_seconds / 3600
    estimated_completion = datetime.now() + timedelta(seconds=total_seconds)

    logger.info("šŸ“Š TIME ESTIMATION:")
    logger.info(f" Videos to process: {total_videos}")
    logger.info(f" Average time per video: {avg_time_per_video} seconds")
    logger.info(f" Extended breaks: {breaks_count} breaks x 3.5 min = {break_time/60:.0f} minutes")
    logger.info(f" Total estimated time: {total_hours:.1f} hours")
    logger.info(f" Estimated completion: {estimated_completion.strftime('%Y-%m-%d %H:%M:%S')}")

    return total_hours


def test_authentication_with_retry():
    """Test authentication with retry after rate limiting.

    Fetches the details of one fixed video (with transcript) up to three
    times. An attempt counts as a pass as soon as video details come back,
    with or without a transcript.

    Returns:
        True if any attempt returned video details, False if every attempt
        returned nothing or raised.
    """
    logger.info("šŸ” Testing YouTube authentication with rate limit recovery...")

    config = ScraperConfig(
        source_name="youtube_test",
        brand_name="hvacknowitall",
        data_dir=Path("test_data/auth_retry_test"),
        logs_dir=Path("test_logs/auth_retry_test"),
        timezone="America/Halifax"
    )

    scraper = YouTubeScraper(config)

    max_retries = 3
    for attempt in range(max_retries):
        try:
            # Test with single video
            logger.info(f"Authentication test attempt {attempt + 1}/{max_retries}...")
            test_video = scraper.fetch_video_details("TpdYT_itu9U", fetch_transcript=True)

            if test_video and test_video.get('transcript'):
                logger.info(f"āœ… Authentication and transcript test passed (attempt {attempt + 1})")
                return True
            elif test_video:
                logger.info("āœ… Authentication passed, but no transcript (rate limited)")
                logger.info("This is expected - transcript fetching will resume with delays")
                return True
            else:
                logger.warning(f"āŒ Authentication test failed (attempt {attempt + 1})")

        except Exception as e:
            # Any failure here is treated as retryable; log it and fall
            # through to the back-off below.
            logger.warning(f"Authentication test error (attempt {attempt + 1}): {e}")

        if attempt < max_retries - 1:
            # Linear back-off: 60 s after the first failure, 120 s after the
            # second. No wait follows the final attempt.
            retry_delay = (attempt + 1) * 60
            logger.info(f"Waiting {retry_delay} seconds before retry...")
            time.sleep(retry_delay)

    logger.error("āŒ All authentication attempts failed")
    return False


def fetch_slow_backlog_with_transcripts():
    """Fetch ALL YouTube videos with transcripts using extended delays.

    Steps: list the channel's videos to produce a time estimate, delete any
    existing scraper state file, fetch all content with transcripts, write
    the generated markdown to ``data_production_backlog/markdown_current/``
    and log summary statistics.

    NOTE(review): the inter-video delays and breaks mentioned in the log
    messages are presumably implemented inside ``YouTubeScraper.fetch_content``;
    nothing in this function sleeps.

    Returns:
        True when videos were fetched and the markdown file was written;
        False if the video list or the content fetch came back empty, or if
        any exception was raised during capture.
    """
    logger.info("🐌 YOUTUBE SLOW BACKLOG: All videos with transcripts and extended delays")
    logger.info("This process is designed to avoid rate limiting over 6-8 hours")
    logger.info("=" * 75)

    # Create config for production backlog
    config = ScraperConfig(
        source_name="youtube",
        brand_name="hvacknowitall",
        data_dir=Path("data_production_backlog"),
        logs_dir=Path("logs_production_backlog"),
        timezone="America/Halifax"
    )

    # Initialize scraper
    scraper = YouTubeScraper(config)

    # First get video count for estimation
    logger.info("Getting video count for time estimation...")
    video_list = scraper.fetch_channel_videos()

    if not video_list:
        logger.error("āŒ Could not fetch video list")
        return False

    # Show time estimation
    estimate_completion_time(len(video_list))

    # Clear any existing state for full backlog
    if scraper.state_file.exists():
        scraper.state_file.unlink()
        logger.info("Cleared existing state for full backlog capture")

    start_time = time.time()

    try:
        # Fetch ALL videos with transcripts using slow mode (no max_posts = backlog mode)
        logger.info("\nStarting slow backlog capture with transcripts...")
        logger.info("Using extended delays: 30-90 seconds between videos + 2-5 minute breaks every 5 videos")

        videos = scraper.fetch_content(fetch_transcripts=True)  # No max_posts = slow backlog mode

        if not videos:
            logger.error("āŒ No videos fetched")
            return False

        # Count videos with transcripts. ``or ''`` guards against a
        # 'transcript' key that is present but None, which would otherwise
        # make len() raise TypeError after hours of fetching.
        transcript_count = sum(1 for video in videos if video.get('transcript'))
        total_transcript_chars = sum(len(video.get('transcript') or '') for video in videos)

        # Generate markdown
        logger.info("\nGenerating comprehensive markdown with transcripts...")
        markdown = scraper.format_markdown(videos)

        # Save with timestamp
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"hvacknowitall_youtube_slow_backlog_transcripts_{timestamp}.md"

        output_dir = config.data_dir / "markdown_current"
        output_dir.mkdir(parents=True, exist_ok=True)
        output_file = output_dir / filename

        output_file.write_text(markdown, encoding='utf-8')

        # Calculate final stats (videos is non-empty here, so no zero division)
        duration = time.time() - start_time
        avg_time_per_video = duration / len(videos)
        success_pct = transcript_count / len(videos) * 100
        avg_transcript_chars = (
            total_transcript_chars / transcript_count if transcript_count > 0 else 0
        )

        # Final statistics
        logger.info("\n" + "=" * 75)
        logger.info("šŸŽ‰ SLOW YOUTUBE BACKLOG CAPTURE COMPLETE")
        logger.info("šŸ“Š FINAL STATISTICS:")
        logger.info(f" Total videos processed: {len(videos)}")
        logger.info(f" Videos with transcripts: {transcript_count}")
        logger.info(f" Transcript success rate: {success_pct:.1f}%")
        logger.info(f" Total transcript characters: {total_transcript_chars:,}")
        logger.info(f" Average transcript length: {avg_transcript_chars:,.0f} chars")
        logger.info(f" Total processing time: {duration/3600:.1f} hours")
        logger.info(f" Average time per video: {avg_time_per_video:.0f} seconds")
        logger.info(f" Markdown file size: {output_file.stat().st_size / 1024 / 1024:.1f} MB")
        logger.info(f"šŸ“„ Saved to: {output_file}")

        # Success validation
        if len(videos) >= 300:  # Expect at least 300 videos
            logger.info(f"āœ… SUCCESS: Captured {len(videos)} videos - full backlog complete")
        else:
            logger.warning(f"āš ļø Only {len(videos)} videos captured, expected ~370")

        if transcript_count >= len(videos) * 0.8:  # Expect 80%+ transcript success
            logger.info(f"āœ… SUCCESS: {success_pct:.1f}% transcript success rate")
        else:
            logger.warning(f"āš ļø Only {success_pct:.1f}% transcript success")

        # Show transcript samples (first three videos that have a transcript)
        logger.info("\nšŸ“ TRANSCRIPT SAMPLES:")
        transcript_videos = [v for v in videos if v.get('transcript')][:3]
        for i, video in enumerate(transcript_videos):
            # ``or 'Unknown'`` also covers a title that is present but None.
            title = (video.get('title') or 'Unknown')[:40] + "..."
            transcript = video.get('transcript', '')
            logger.info(f" {i+1}. {title}")
            logger.info(f" Length: {len(transcript):,} chars")
            preview = transcript[:80] + "..." if len(transcript) > 80 else transcript
            logger.info(f" Preview: {preview}")

        return True

    except Exception as e:
        # Top-level boundary for the capture: record the message together
        # with the full traceback and report failure to the caller.
        logger.exception(f"āŒ Slow backlog capture failed: {e}")
        return False


def main():
    """Main execution with slow processing and time estimation.

    Runs the authentication check, prints the time-commitment warning, waits
    for the user to press Enter, then starts the backlog capture.

    Returns:
        False if authentication fails or the user cancels with Ctrl+C at the
        prompt; otherwise the result of
        ``fetch_slow_backlog_with_transcripts()``.
    """
    print("\n🐌 YouTube Slow Backlog Capture with Transcripts")
    print("=" * 55)
    print("Extended delays to avoid rate limiting")
    print("Expected duration: 6-8 hours")
    print("Processing ~370 videos with 30-90 second delays + breaks")

    # Step 1: Test authentication with retry
    print("\nStep 1: Testing authentication with rate limit recovery...")
    if not test_authentication_with_retry():
        print("āŒ Authentication failed after retries. Cannot proceed.")
        return False

    print("āœ… Authentication validated")

    # Step 2: Show time commitment warning
    print("\nStep 2: Time commitment warning")
    print("āš ļø This process will take 6-8 hours to complete")
    print("āš ļø The process will run with 30-90 second delays between videos")
    print("āš ļø Extended 2-5 minute breaks every 5 videos")
    print("āš ļø This is necessary to avoid YouTube rate limiting")

    print("\nPress Enter to start slow backlog capture or Ctrl+C to cancel...")

    try:
        input()
    except KeyboardInterrupt:
        print("\nCancelled by user")
        return False

    # Step 3: Execute slow backlog
    return fetch_slow_backlog_with_transcripts()


if __name__ == "__main__":
    # Exit codes: 0 success, 1 failure or user interrupt, 2 unexpected error.
    try:
        success = main()
        sys.exit(0 if success else 1)
    except KeyboardInterrupt:
        logger.info("\nSlow backlog capture interrupted by user")
        sys.exit(1)
    except Exception as e:
        logger.critical(f"Slow backlog capture failed: {e}")
        sys.exit(2)