#!/usr/bin/env python3
"""
Fetch 100 YouTube videos with transcripts for backlog processing.

This will capture the first 100 videos with full transcript extraction.
"""

import logging
import sys
import time
from datetime import datetime
from pathlib import Path

# The project imports below rely on this script's directory being on sys.path,
# so this must run before them.
sys.path.insert(0, str(Path(__file__).parent))

from src.base_scraper import ScraperConfig
from src.youtube_scraper import YouTubeScraper

# Set up logging: everything goes both to a log file and to the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('youtube_100_transcripts.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)


def _transcript_stats(videos):
    """Return ``(videos_with_transcript, total_transcript_chars)`` for *videos*.

    A missing, empty or ``None`` transcript counts as "no transcript" and
    contributes zero characters.
    """
    # `or ''` (rather than a .get default) also covers a key that is present
    # but set to None, which would otherwise make len() raise TypeError.
    lengths = [len(video.get('transcript') or '') for video in videos]
    return sum(1 for length in lengths if length), sum(lengths)


def fetch_100_with_transcripts():
    """Fetch 100 YouTube videos with transcripts and save them as markdown.

    Builds a backlog ``ScraperConfig``, checks the scraper's authentication
    status, fetches up to 100 videos with transcripts, writes the formatted
    markdown to a timestamped file under
    ``data_production_backlog/markdown_current`` and logs summary statistics.

    Returns:
        bool: True when the videos were fetched and the markdown file was
        written; False when authentication is missing, no videos were
        returned, or any exception occurred (the exception is logged with
        its traceback).
    """
    logger.info("šŸŽ„ YOUTUBE BACKLOG: Fetching 100 videos WITH TRANSCRIPTS")
    logger.info("This will take approximately 5-8 minutes (3-5 seconds per video)")
    logger.info("=" * 70)

    # Create config for backlog processing
    config = ScraperConfig(
        source_name="youtube",
        brand_name="hvacknowitall",
        data_dir=Path("data_production_backlog"),
        logs_dir=Path("logs_production_backlog"),
        timezone="America/Halifax"
    )

    # Initialize scraper
    scraper = YouTubeScraper(config)

    # Test authentication first so we fail fast instead of part-way through.
    auth_status = scraper.auth_handler.get_status()
    if not auth_status['has_valid_cookies']:
        logger.error("āŒ No valid YouTube authentication found")
        logger.error("Please ensure you're logged into YouTube in Firefox")
        return False

    logger.info(f"āœ… Authentication validated: {auth_status['cookie_path']}")

    # Fetch 100 videos with transcripts using the enhanced method
    logger.info("Fetching 100 videos with transcripts...")
    start_time = time.time()

    try:
        videos = scraper.fetch_content(max_posts=100, fetch_transcripts=True)

        if not videos:
            logger.error("āŒ No videos fetched")
            return False

        # Count videos with transcripts
        transcript_count, total_transcript_chars = _transcript_stats(videos)

        # Generate markdown
        logger.info("\nGenerating markdown with transcripts...")
        markdown = scraper.format_markdown(videos)

        # Save with timestamp
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"hvacknowitall_youtube_backlog_100_transcripts_{timestamp}.md"

        output_dir = config.data_dir / "markdown_current"
        output_dir.mkdir(parents=True, exist_ok=True)
        output_file = output_dir / filename

        output_file.write_text(markdown, encoding='utf-8')

        # Calculate duration
        duration = time.time() - start_time

        # `videos` is non-empty here, so dividing by len(videos) is safe; the
        # average transcript length still needs a guard for zero transcripts.
        average_chars = (
            total_transcript_chars / transcript_count if transcript_count > 0 else 0
        )

        # Final statistics
        logger.info("\n" + "=" * 70)
        logger.info("šŸŽ‰ YOUTUBE BACKLOG CAPTURE COMPLETE")
        logger.info("šŸ“Š STATISTICS:")
        logger.info(f" Total videos fetched: {len(videos)}")
        logger.info(f" Videos with transcripts: {transcript_count}")
        logger.info(f" Transcript success rate: {transcript_count/len(videos)*100:.1f}%")
        logger.info(f" Total transcript characters: {total_transcript_chars:,}")
        logger.info(f" Average transcript length: {average_chars:,.0f} chars")
        logger.info(f" Processing time: {duration/60:.1f} minutes")
        logger.info(f" Average time per video: {duration/len(videos):.1f} seconds")
        logger.info(f"šŸ“„ Saved to: {output_file}")

        # Show sample transcript info for the first three videos
        logger.info("\nšŸ“ SAMPLE TRANSCRIPT DATA:")
        for position, video in enumerate(videos[:3], start=1):
            # Tolerate a missing or None title, and only add an ellipsis when
            # the title was actually shortened (same rule as the preview).
            full_title = video.get('title') or 'Unknown'
            title = full_title[:50] + "..." if len(full_title) > 50 else full_title
            transcript = video.get('transcript') or ''
            if transcript:
                logger.info(f" {position}. {title} - {len(transcript):,} chars")
                preview = transcript[:100] + "..." if len(transcript) > 100 else transcript
                logger.info(f" Preview: {preview}")
            else:
                logger.info(f" {position}. {title} - No transcript")

        return True

    except Exception as e:
        # logger.exception keeps the traceback; the bare message alone is
        # rarely enough to diagnose a failure several minutes into a run.
        logger.exception(f"āŒ Failed to fetch videos: {e}")
        return False


def main():
    """Show the run summary, wait for confirmation, then start the capture.

    Returns:
        bool: False if the user cancels with Ctrl+C at the prompt, otherwise
        the result of ``fetch_100_with_transcripts()``.
    """
    print("\nšŸŽ„ YouTube Backlog Capture with Transcripts")
    print("=" * 50)
    print("This will fetch 100 YouTube videos with full transcripts")
    print("Estimated time: 5-8 minutes")
    print("Output: Markdown file with videos and complete transcripts")
    print("\nPress Enter to continue or Ctrl+C to cancel...")

    try:
        input()
    except KeyboardInterrupt:
        print("\nCancelled by user")
        return False

    return fetch_100_with_transcripts()


if __name__ == "__main__":
    # Exit codes: 0 success, 1 failure or user interrupt, 2 unexpected error.
    try:
        success = main()
        sys.exit(0 if success else 1)
    except KeyboardInterrupt:
        logger.info("\nCapture interrupted by user")
        sys.exit(1)
    except Exception as e:
        logger.critical(f"Capture failed: {e}", exc_info=True)
        sys.exit(2)