#!/usr/bin/env python3
"""
Fetch YouTube videos with transcripts.

This takes longer than a plain listing because each video's details and
transcript are fetched individually.
"""

import logging
import sys
import time
from datetime import datetime
from pathlib import Path

# Make the sibling ``src`` package importable when this file is run as a script.
sys.path.insert(0, str(Path(__file__).parent))

from src.base_scraper import ScraperConfig
from src.youtube_scraper import YouTubeScraper

# Set up logging. The log file is opened as UTF-8 because the messages
# contain emoji, which a non-UTF-8 platform default could fail to encode.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('youtube_transcripts.log', encoding='utf-8'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)


def fetch_with_transcripts(max_videos: int = 10):
    """Fetch channel videos with transcripts and save them as markdown.

    The function lists up to ``max_videos`` videos, requests detailed info
    (with ``fetch_transcript=True``) for every entry that has an ``id``,
    formats the collected entries with ``scraper.format_markdown`` and
    writes the result to a timestamped file under
    ``data_production_backlog/markdown_current``.

    Args:
        max_videos: Value passed as ``max_videos`` to
            ``scraper.fetch_channel_videos``.

    Returns:
        ``True`` when a markdown file was written; ``False`` when the
        listing was empty or no listed entry had an ``id``.
    """
    logger.info("🎥 Fetching YouTube videos WITH TRANSCRIPTS")
    logger.info(f"This will fetch detailed info and transcripts for {max_videos} videos")
    logger.info("Note: This is slower as each video requires individual API calls")
    logger.info("=" * 60)

    # Create config
    config = ScraperConfig(
        source_name="youtube",
        brand_name="hvacknowitall",
        data_dir=Path("data_production_backlog"),
        logs_dir=Path("logs_production_backlog"),
        timezone="America/Halifax"
    )

    # Initialize scraper
    scraper = YouTubeScraper(config)

    # First get video list (fast)
    logger.info("Step 1: Fetching video list from channel...")
    videos = scraper.fetch_channel_videos(max_videos=max_videos)

    if not videos:
        logger.error("No videos found")
        return False

    total = len(videos)
    logger.info(f"Found {total} videos")

    # Now fetch detailed info with transcripts for each video
    logger.info("\nStep 2: Fetching transcripts for each video...")
    logger.info("This will take approximately 3-5 seconds per video")

    videos_with_transcripts = []
    transcript_count = 0

    for i, video in enumerate(videos):
        video_id = video.get('id')
        if not video_id:
            logger.warning(f"[{i+1}/{total}] Skipping entry without a video id")
            continue

        # ``or`` also covers a title that is present but None/empty, which
        # a plain ``.get(key, default)`` would pass through to the slice.
        title = str(video.get('title') or 'Unknown')
        logger.info(f"\n[{i+1}/{total}] Processing: {title[:60]}...")

        # Add delay to avoid rate limiting.
        # NOTE(review): presumably a randomized 2-4 second sleep; confirm
        # against YouTubeScraper._humanized_delay.
        if i > 0:
            scraper._humanized_delay(2, 4)

        # Fetch with transcript
        detailed_info = scraper.fetch_video_details(video_id, fetch_transcript=True)

        if detailed_info:
            if detailed_info.get('transcript'):
                transcript_count += 1
                logger.info(" ✅ Transcript found!")
            else:
                logger.info(" ⚠️ No transcript available")
            videos_with_transcripts.append(detailed_info)
        else:
            logger.warning(" ❌ Failed to fetch details")
            # Use basic info if detailed fetch fails
            videos_with_transcripts.append(video)

        # Extra delay every 10 videos; pointless after the final one
        # because no further request follows.
        if (i + 1) % 10 == 0 and (i + 1) < total:
            logger.info("Taking extended break after 10 videos...")
            time.sleep(10)

    # Every listed entry lacked an id: nothing to write, and the success
    # rate below would otherwise divide by zero.
    if not videos_with_transcripts:
        logger.error("No videos with a valid id to process")
        return False

    # Generate markdown
    logger.info("\nStep 3: Generating markdown...")
    markdown = scraper.format_markdown(videos_with_transcripts)

    # Save with timestamp (local wall-clock time of the running process)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"hvacknowitall_youtube_transcripts_{timestamp}.md"

    output_dir = config.data_dir / "markdown_current"
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / filename

    output_file.write_text(markdown, encoding='utf-8')

    logger.info(f"📄 Saved to: {output_file}")

    # Statistics
    processed = len(videos_with_transcripts)
    logger.info("\n" + "=" * 60)
    logger.info("📊 YOUTUBE TRANSCRIPT CAPTURE COMPLETE")
    logger.info(f"Total videos: {processed}")
    logger.info(f"Videos with transcripts: {transcript_count}")
    logger.info(f"Success rate: {transcript_count/processed*100:.1f}%")

    return True


def main():
    """Show the capture menu and run the selected option.

    Returns:
        The result of ``fetch_with_transcripts`` for choices 1-3, or
        ``False`` when any other input is entered (treated as cancel).
    """
    print("\n⚠️ WARNING: Fetching transcripts requires individual API calls for each video")
    print("This will take approximately 3-5 seconds per video")
    print("Estimated time for 370 videos: 20-30 minutes")
    print("\nOptions:")
    print("1. Test with 5 videos first")
    print("2. Fetch first 50 videos with transcripts")
    print("3. Fetch all 370 videos with transcripts (20-30 mins)")
    print("4. Cancel")

    # Strip so that stray whitespace around the digit is not read as cancel.
    choice = input("\nEnter choice (1-4): ").strip()

    # Menu choice -> number of videos to fetch.
    video_counts = {"1": 5, "2": 50, "3": 370}
    if choice in video_counts:
        return fetch_with_transcripts(video_counts[choice])

    print("Cancelled")
    return False


if __name__ == "__main__":
    # Exit codes: 0 success, 1 failure/cancel/interrupt, 2 unexpected error.
    try:
        success = main()
    except KeyboardInterrupt:
        logger.info("\nCapture interrupted by user")
        sys.exit(1)
    except Exception as e:
        # exc_info keeps the traceback in the log for post-mortem debugging.
        logger.critical(f"Capture failed: {e}", exc_info=True)
        sys.exit(2)
    sys.exit(0 if success else 1)