hvac-kia-content/test_youtube_transcripts.py
Ben Reed daab901e35 refactor: Update naming convention from hvacknowitall to hkia
Major Changes:
- Updated all code references from hvacknowitall/hvacnkowitall to hkia
- Renamed all existing markdown files to use hkia_ prefix
- Updated configuration files, scrapers, and production scripts
- Modified systemd service descriptions to use HKIA
- Changed NAS sync path to /mnt/nas/hkia

Files Updated:
- 20+ source files updated with new naming convention
- 34 markdown files renamed to hkia_* format
- All ScraperConfig brand_name parameters now use 'hkia'
- Documentation updated to reflect new naming

Rationale:
- Shorter, cleaner filenames
- Consistent branding across all outputs
- Easier to type and reference
- Maintains same functionality with improved naming

Next Steps:
- Deploy updated services to production
- Update any external references to old naming
- Monitor scrapers to ensure proper operation

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-19 13:35:23 -03:00

145 lines
No EOL
5 KiB
Python

#!/usr/bin/env python3
"""
Test YouTube transcript extraction with authenticated cookies
"""
import sys
from pathlib import Path
sys.path.append(str(Path(__file__).parent / 'src'))
from youtube_auth_handler import YouTubeAuthHandler
import yt_dlp
def test_hvac_video():
    """Test transcript extraction on an HVAC Know It All video via YouTubeAuthHandler.

    Returns:
        bool: True only when an English auto-caption track was found and its
        content successfully fetched; False on any failure, so the caller in
        ``__main__`` can fall back to the direct yt-dlp test.
    """
    # NOTE(review): hard-coded video ID — update to a current HVAC video if stale.
    video_url = "https://www.youtube.com/watch?v=TpdYT_itu9U"  # Update this to actual HVAC video
    print("🎥 Testing YouTube transcript extraction")
    print("=" * 60)
    print(f"Video: {video_url}")

    handler = YouTubeAuthHandler()

    # Report authentication status before attempting extraction.
    status = handler.get_status()
    print(f"\n📊 Auth Status:")
    print(f" Has valid cookies: {status['has_valid_cookies']}")
    print(f" Cookie path: {status['cookie_path']}")

    # Extract video info (metadata + caption listings).
    print(f"\n🔍 Extracting video information...")
    video_info = handler.extract_video_info(video_url)
    if not video_info:
        print(f"❌ Video extraction failed")
        return False

    print(f"✅ Video extraction successful!")
    print(f" Title: {video_info.get('title', 'Unknown')}")
    print(f" Duration: {video_info.get('duration', 0)} seconds")
    print(f" Views: {video_info.get('view_count', 'Unknown')}")

    # Check for transcripts (manual subtitles vs. auto-generated captions).
    subtitles = video_info.get('subtitles', {})
    auto_captions = video_info.get('automatic_captions', {})
    print(f"\n📝 Transcript Availability:")
    if subtitles:
        print(f" Manual subtitles: {list(subtitles.keys())}")

    if not auto_captions:
        print(f" ❌ No auto-captions available")
        # BUGFIX: previously fell through to `return True`, masking the failure.
        return False

    print(f" Auto-captions: {list(auto_captions.keys())}")
    if 'en' not in auto_captions:
        print(f" ❌ No English auto-captions available")
        # BUGFIX: previously fell through to `return True`, masking the failure.
        return False

    print(f"\n✅ English auto-captions found!")
    captions = auto_captions['en']
    print(f" Available formats:")
    for i, cap in enumerate(captions[:3]):  # Show first 3 formats
        ext = cap.get('ext', 'unknown')
        url = cap.get('url', '')
        print(f" {i+1}. {ext}: {url[:50]}...")

    # Fetch the actual caption payload to prove the URL is usable.
    print(f"\n📥 Fetching transcript content...")
    try:
        # Use first format (usually JSON)
        caption_url = captions[0]['url']
        import urllib.request
        with urllib.request.urlopen(caption_url) as response:
            content = response.read().decode('utf-8')
        # Show a bounded preview of the transcript content.
        preview = content[:500] + "..." if len(content) > 500 else content
        print(f" Content preview ({len(content)} chars):")
        print(f" {preview}")
        return True
    except Exception as e:
        print(f" ❌ Failed to fetch transcript: {e}")
        # BUGFIX: a failed fetch previously still reported overall success.
        return False
def test_direct_yt_dlp():
    """Probe transcript availability by invoking yt-dlp directly with the cookie file.

    Returns:
        bool: True when English automatic captions are reported for the test
        video; False on extraction failure or when no English captions exist.
        (BUGFIX: all non-success paths now return False explicitly instead of
        an implicit None.)
    """
    print(f"\n🧪 Testing direct yt-dlp with authenticated cookies")
    print("=" * 60)

    # NOTE(review): path is relative to the CWD — confirm the script is run
    # from the repository root, or the cookie file will not be found.
    cookie_path = Path("data_production_backlog/.cookies/youtube_cookies.txt")
    ydl_opts = {
        'cookiefile': str(cookie_path),
        'quiet': False,
        'writesubtitles': True,
        'writeautomaticsub': True,
        'subtitleslangs': ['en'],
        'skip_download': True,  # metadata only — never download the media
    }
    test_video = "https://www.youtube.com/watch?v=TpdYT_itu9U"
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            print(f"Extracting with direct yt-dlp...")
            info = ydl.extract_info(test_video, download=False)
        if info:
            print(f"✅ Direct yt-dlp successful!")
            auto_captions = info.get('automatic_captions', {})
            if 'en' in auto_captions:
                print(f"✅ Transcripts available via direct yt-dlp!")
                return True
            else:
                print(f"❌ No transcripts in direct yt-dlp")
    except Exception as e:
        print(f"❌ Direct yt-dlp failed: {e}")
    return False
if __name__ == "__main__":
    # Primary path: authenticated handler. If it fails, fall back to
    # driving yt-dlp directly with the cookie file.
    success = test_hvac_video()
    if not success:
        print(f"\n" + "="*60)
        success = test_direct_yt_dlp()

    # Final verdict for the operator running this script.
    if success:
        print(f"\n🎉 YouTube transcript extraction is working!")
        print(f"Ready to update YouTube scraper with transcript support.")
    else:
        print(f"\n❌ YouTube transcript extraction not working")
        print(f"May need additional authentication or different approach.")