#!/usr/bin/env python3
"""
Test YouTube transcript extraction
"""

import json

import yt_dlp


def _print_track_url(tracks, label):
    """Print the truncated URL of the first json3 entry in *tracks*.

    Args:
        tracks: Iterable of per-format dicts for one language, as found under
            a language key of the ``subtitles`` / ``automatic_captions``
            mappings. ``None`` is treated as empty.
        label: Word used in the printed line ("Subtitle" or "Caption").

    Prints nothing when no entry has ``ext == 'json3'``.
    """
    for track in tracks or []:
        if track.get('ext') == 'json3':
            # A key that is present but None would break the slice below.
            url = track.get('url') or 'N/A'
            print(f" {label} URL: {url[:100]}...")
            return


def test_transcript(video_id: str = "TpdYT_itu9U") -> bool:
    """Test fetching transcript for a YouTube video.

    Extracts the video's metadata with yt-dlp (nothing is downloaded) and
    prints which manual subtitles and auto-generated captions are listed,
    plus the json3 track URL for English when one is present.

    Args:
        video_id: The ID placed in the ``watch?v=`` URL.

    Returns:
        True when metadata extraction completed (even if no English
        transcript was listed), False when any exception was raised.
    """
    print(f"Testing transcript extraction for video: {video_id}")
    print("=" * 60)

    ydl_opts = {
        'quiet': False,
        'no_warnings': False,
        'writesubtitles': True,  # Download subtitles
        'writeautomaticsub': True,  # Download auto-generated subtitles if no manual ones
        'subtitlesformat': 'json3',  # Format for subtitles
        'skip_download': True,  # Don't download the video
        'extract_flat': False,
        'cookiefile': 'data_production_backlog/.cookies/youtube_cookies.txt',  # Use existing cookies
    }

    # Broad catch is deliberate: this is a diagnostic script and any failure
    # (network, extractor, unexpected metadata shape) is reported as False.
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            video_url = f"https://www.youtube.com/watch?v={video_id}"
            info = ydl.extract_info(video_url, download=False)

            # Check for subtitles. `or {}` also covers a key that is present
            # but set to None, which `.get(key, {})` alone would not.
            subtitles = info.get('subtitles') or {}
            auto_captions = info.get('automatic_captions') or {}

            print(f"\n📝 Video: {info.get('title', 'Unknown')}")
            print(f"Duration: {info.get('duration', 0)} seconds")
            print("\n📋 Available subtitles:")

            if subtitles:
                print(f" Manual subtitles: {list(subtitles.keys())}")
            else:
                print(" No manual subtitles")

            if auto_captions:
                print(f" Auto-generated captions: {list(auto_captions.keys())}")
            else:
                print(" No auto-generated captions")

            # Try to get English transcript: manual subtitles take priority
            # over auto-generated captions.
            if 'en' in subtitles:
                print("\n✅ English subtitles available!")
                _print_track_url(subtitles['en'], "Subtitle")
            elif 'en' in auto_captions:
                print("\n✅ English auto-generated captions available!")
                _print_track_url(auto_captions['en'], "Caption")
            else:
                print("\n❌ No English transcripts available")

            return True

    except Exception as e:
        print(f"❌ Error: {e}")
        return False


if __name__ == "__main__":
    # Test with a recent video
    succeeded = test_transcript("TpdYT_itu9U")

    print("\n" + "=" * 60)
    if succeeded:
        print("Transcript extraction is POSSIBLE with yt-dlp!")
        print("We can add this feature to the YouTube scraper.")
    else:
        # Do not claim success when the extraction itself failed.
        print("Transcript extraction FAILED; see the error above.")
        raise SystemExit(1)