#!/usr/bin/env python3
"""
Test YouTube transcript extraction with authenticated cookies.

Run as a script. It first tries the project's ``YouTubeAuthHandler``; if that
does not yield a downloadable English auto-caption track, it falls back to
calling yt-dlp directly with the cookie file.
"""

import sys
import urllib.request
from pathlib import Path

# The project module imported below lives in the sibling ``src`` directory,
# so that directory must be on ``sys.path`` before the import runs.
sys.path.append(str(Path(__file__).parent / 'src'))

from youtube_auth_handler import YouTubeAuthHandler
import yt_dlp

# Video exercised by both checks.
TEST_VIDEO_URL = "https://www.youtube.com/watch?v=TpdYT_itu9U"  # Update this to actual HVAC video

# Cookie file handed to yt-dlp; a relative path, so it is resolved against the
# current working directory.
COOKIE_PATH = Path("data_production_backlog/.cookies/youtube_cookies.txt")

# Upper bound (seconds) for the caption download so the script cannot hang.
FETCH_TIMEOUT_SECONDS = 30

# Number of transcript characters echoed to the console.
PREVIEW_CHARS = 500


def test_hvac_video():
    """Test with actual HVAC Know It All video.

    Extracts the video info through ``YouTubeAuthHandler``, lists the
    available subtitle tracks, then downloads the first English auto-caption
    format and prints a preview of it.

    Returns:
        bool: True only when the transcript content was actually downloaded.
        False when extraction fails, when no English auto-captions exist, or
        when the download raises, so the caller can try its fallback.
    """
    video_url = TEST_VIDEO_URL

    print("🎥 Testing YouTube transcript extraction")
    print("=" * 60)
    print(f"Video: {video_url}")

    handler = YouTubeAuthHandler()

    # Test authentication status
    status = handler.get_status()
    print("\n📊 Auth Status:")
    print(f" Has valid cookies: {status['has_valid_cookies']}")
    print(f" Cookie path: {status['cookie_path']}")

    # Extract video info with transcripts
    print("\n🔍 Extracting video information...")
    video_info = handler.extract_video_info(video_url)

    if not video_info:
        print("❌ Video extraction failed")
        return False

    print("✅ Video extraction successful!")
    print(f" Title: {video_info.get('title', 'Unknown')}")
    print(f" Duration: {video_info.get('duration', 0)} seconds")
    print(f" Views: {video_info.get('view_count', 'Unknown')}")

    # Check for transcripts. ``or {}`` also covers a key that is present
    # but set to None.
    subtitles = video_info.get('subtitles') or {}
    auto_captions = video_info.get('automatic_captions') or {}

    print("\n📝 Transcript Availability:")

    if subtitles:
        print(f" Manual subtitles: {list(subtitles.keys())}")

    if not auto_captions:
        print(" ❌ No auto-captions available")
        return False

    print(f" Auto-captions: {list(auto_captions.keys())}")

    if 'en' not in auto_captions:
        print(" ❌ No English auto-captions available")
        return False

    print("\n✅ English auto-captions found!")
    captions = auto_captions['en'] or []

    print(" Available formats:")
    for position, cap in enumerate(captions[:3], start=1):  # Show first 3 formats
        ext = cap.get('ext', 'unknown')
        url = cap.get('url', '')
        print(f" {position}. {ext}: {url[:50]}...")

    # Try to fetch actual transcript content
    print("\n📥 Fetching transcript content...")
    try:
        # Use first format (usually JSON, per the original author's note).
        # An empty track list or a missing 'url' key lands in the handler
        # below and is reported as a fetch failure.
        caption_url = captions[0]['url']

        with urllib.request.urlopen(
            caption_url, timeout=FETCH_TIMEOUT_SECONDS
        ) as response:
            content = response.read().decode('utf-8')
    except Exception as e:
        # Deliberately broad: this is a diagnostic script, so any failure is
        # reported and turned into a False result instead of a traceback.
        print(f" ❌ Failed to fetch transcript: {e}")
        return False

    # Show preview
    if len(content) > PREVIEW_CHARS:
        preview = content[:PREVIEW_CHARS] + "..."
    else:
        preview = content
    print(f" Content preview ({len(content)} chars):")
    print(f" {preview}")
    return True


def test_direct_yt_dlp():
    """Test direct yt-dlp with cookies.

    Calls ``yt_dlp.YoutubeDL.extract_info`` with ``download=False`` and checks
    whether the returned info lists English automatic captions.

    Returns:
        bool: True when English automatic captions are listed; False when
        they are not, when no info is returned, or when yt-dlp raises.
    """
    print("\n🧪 Testing direct yt-dlp with authenticated cookies")
    print("=" * 60)

    ydl_opts = {
        'cookiefile': str(COOKIE_PATH),
        'quiet': False,
        'writesubtitles': True,
        'writeautomaticsub': True,
        'subtitleslangs': ['en'],
        'skip_download': True,
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            print("Extracting with direct yt-dlp...")
            info = ydl.extract_info(TEST_VIDEO_URL, download=False)

            if info:
                print("✅ Direct yt-dlp successful!")

                # ``or {}`` keeps a None value from being misreported as a
                # yt-dlp failure by the handler below.
                auto_captions = info.get('automatic_captions') or {}
                if 'en' in auto_captions:
                    print("✅ Transcripts available via direct yt-dlp!")
                    return True
                print("❌ No transcripts in direct yt-dlp")
    except Exception as e:
        # Broad on purpose: report whatever yt-dlp raised and signal failure.
        print(f"❌ Direct yt-dlp failed: {e}")

    return False


def main():
    """Run the handler-based check, falling back to direct yt-dlp.

    Returns:
        bool: True when either check found a working transcript path.
    """
    success = test_hvac_video()

    if not success:
        print("\n" + "=" * 60)
        success = test_direct_yt_dlp()

    if success:
        print("\n🎉 YouTube transcript extraction is working!")
        print("Ready to update YouTube scraper with transcript support.")
    else:
        print("\n❌ YouTube transcript extraction not working")
        print("May need additional authentication or different approach.")

    return success


if __name__ == "__main__":
    main()