Major Changes:
- Updated all code references from hvacknowitall/hvacnkowitall to hkia
- Renamed all existing markdown files to use the hkia_ prefix
- Updated configuration files, scrapers, and production scripts
- Modified systemd service descriptions to use HKIA
- Changed NAS sync path to /mnt/nas/hkia

Files Updated:
- 20+ source files updated with the new naming convention
- 34 markdown files renamed to the hkia_* format
- All ScraperConfig brand_name parameters now use 'hkia' (see the sketch after this list)
- Documentation updated to reflect the new naming

Rationale:
- Shorter, cleaner filenames
- Consistent branding across all outputs
- Easier to type and reference
- Maintains the same functionality with improved naming

Next Steps:
- Deploy updated services to production
- Update any external references to the old naming
- Monitor scrapers to ensure proper operation

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
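For reference, the new convention in scraper code looks roughly like the sketch below. This is hypothetical: the stand-in ScraperConfig dataclass and its output_dir field are illustrative assumptions, not the repo's actual class; only brand_name='hkia' and the /mnt/nas/hkia path come from the change list above.

```python
from dataclasses import dataclass


@dataclass
class ScraperConfig:  # stand-in for illustration; the real class lives in the repo
    brand_name: str
    output_dir: str


# Old: ScraperConfig(brand_name='hvacknowitall', ...)
config = ScraperConfig(
    brand_name='hkia',            # new, shorter brand name
    output_dir='/mnt/nas/hkia',   # new NAS sync path
)
```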
#!/usr/bin/env python3
"""
Test YouTube transcript extraction
"""

import yt_dlp


def test_transcript(video_id: str = "TpdYT_itu9U"):
    """Test fetching transcript for a YouTube video"""

    print(f"Testing transcript extraction for video: {video_id}")
    print("=" * 60)

    ydl_opts = {
        'quiet': False,
        'no_warnings': False,
        'writesubtitles': True,       # Download subtitles
        'writeautomaticsub': True,    # Download auto-generated subtitles if no manual ones
        'subtitlesformat': 'json3',   # Format for subtitles
        'skip_download': True,        # Don't download the video
        'extract_flat': False,
        'cookiefile': 'data_production_backlog/.cookies/youtube_cookies.txt',  # Use existing cookies
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            video_url = f"https://www.youtube.com/watch?v={video_id}"
            info = ydl.extract_info(video_url, download=False)

            # Check for subtitles
            subtitles = info.get('subtitles', {})
            auto_captions = info.get('automatic_captions', {})

            print(f"\n📝 Video: {info.get('title', 'Unknown')}")
            print(f"Duration: {info.get('duration', 0)} seconds")

            print("\n📋 Available subtitles:")
            if subtitles:
                print(f"  Manual subtitles: {list(subtitles.keys())}")
            else:
                print("  No manual subtitles")

            if auto_captions:
                print(f"  Auto-generated captions: {list(auto_captions.keys())}")
            else:
                print("  No auto-generated captions")

            # Try to get an English transcript: first manual subtitles
            if 'en' in subtitles:
                print("\n✅ English subtitles available!")
                # Get the subtitle URL
                for sub in subtitles['en']:
                    if sub.get('ext') == 'json3':
                        print(f"  Subtitle URL: {sub.get('url', 'N/A')[:100]}...")
                        break

            # Then try auto-generated captions
            elif 'en' in auto_captions:
                print("\n✅ English auto-generated captions available!")
                # Get the caption URL
                for cap in auto_captions['en']:
                    if cap.get('ext') == 'json3':
                        print(f"  Caption URL: {cap.get('url', 'N/A')[:100]}...")
                        break
            else:
                print("\n❌ No English transcripts available")

            return True

    except Exception as e:
        print(f"❌ Error: {e}")
        return False


if __name__ == "__main__":
    # Test with a recent video
    test_transcript("TpdYT_itu9U")

    print("\n" + "=" * 60)
    print("Transcript extraction is POSSIBLE with yt-dlp!")
    print("We can add this feature to the YouTube scraper.")