Major Changes:
- Updated all code references from hvacknowitall/hvacnkowitall to hkia
- Renamed all existing markdown files to use hkia_ prefix
- Updated configuration files, scrapers, and production scripts
- Modified systemd service descriptions to use HKIA
- Changed NAS sync path to /mnt/nas/hkia

Files Updated:
- 20+ source files updated with new naming convention
- 34 markdown files renamed to hkia_* format
- All ScraperConfig brand_name parameters now use 'hkia'
- Documentation updated to reflect new naming

Rationale:
- Shorter, cleaner filenames
- Consistent branding across all outputs
- Easier to type and reference
- Maintains same functionality with improved naming

Next Steps:
- Deploy updated services to production
- Update any external references to old naming
- Monitor scrapers to ensure proper operation

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
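For context, the brand_name change called out above looks roughly like the sketch below. The ScraperConfig field names, defaults, and module layout shown here are illustrative assumptions, not the repo's actual definitions; only the 'hkia' value and the /mnt/nas/hkia path come from this change.

```python
# Hypothetical sketch of the rename; field names and defaults are
# illustrative, not the repo's actual ScraperConfig definition.
from dataclasses import dataclass

@dataclass
class ScraperConfig:
    brand_name: str = "hkia"               # was "hvacknowitall"
    nas_sync_path: str = "/mnt/nas/hkia"   # was the old hvacknowitall path
    output_prefix: str = "hkia_"           # markdown outputs now use the hkia_ prefix

config = ScraperConfig()
print(config.brand_name)  # -> "hkia"
```

Any scraper that accepts a brand_name parameter would be constructed with this config, so the same 'hkia' value flows through filenames, service descriptions, and the NAS sync path.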
#!/usr/bin/env python3
"""
Test YouTube transcript extraction with authenticated cookies
"""

import sys
from pathlib import Path

sys.path.append(str(Path(__file__).parent / 'src'))

from youtube_auth_handler import YouTubeAuthHandler
import yt_dlp


def test_hvac_video():
    """Test with actual HVAC Know It All video"""

    # Use a real HVAC video URL
    video_url = "https://www.youtube.com/watch?v=TpdYT_itu9U"  # Update this to actual HVAC video

    print("🎥 Testing YouTube transcript extraction")
    print("=" * 60)
    print(f"Video: {video_url}")

    handler = YouTubeAuthHandler()

    # Test authentication status
    status = handler.get_status()
    print(f"\n📊 Auth Status:")
    print(f" Has valid cookies: {status['has_valid_cookies']}")
    print(f" Cookie path: {status['cookie_path']}")

    # Extract video info with transcripts
    print(f"\n🔍 Extracting video information...")
    video_info = handler.extract_video_info(video_url)

    if video_info:
        print(f"✅ Video extraction successful!")
        print(f" Title: {video_info.get('title', 'Unknown')}")
        print(f" Duration: {video_info.get('duration', 0)} seconds")
        print(f" Views: {video_info.get('view_count', 'Unknown')}")

        # Check for transcripts
        subtitles = video_info.get('subtitles', {})
        auto_captions = video_info.get('automatic_captions', {})

        print(f"\n📝 Transcript Availability:")

        if subtitles:
            print(f" Manual subtitles: {list(subtitles.keys())}")

        if auto_captions:
            print(f" Auto-captions: {list(auto_captions.keys())}")

            if 'en' in auto_captions:
                print(f"\n✅ English auto-captions found!")
                captions = auto_captions['en']

                print(f" Available formats:")
                for i, cap in enumerate(captions[:3]):  # Show first 3 formats
                    ext = cap.get('ext', 'unknown')
                    url = cap.get('url', '')
                    print(f" {i+1}. {ext}: {url[:50]}...")

                # Try to fetch actual transcript content
                print(f"\n📥 Fetching transcript content...")
                try:
                    # Use first format (usually JSON)
                    caption_url = captions[0]['url']

                    # Download caption content
                    import urllib.request
                    with urllib.request.urlopen(caption_url) as response:
                        content = response.read().decode('utf-8')

                    # Show preview
                    preview = content[:500] + "..." if len(content) > 500 else content
                    print(f" Content preview ({len(content)} chars):")
                    print(f" {preview}")

                    return True

                except Exception as e:
                    print(f" ❌ Failed to fetch transcript: {e}")
            else:
                print(f" ❌ No English auto-captions available")
        else:
            print(f" ❌ No auto-captions available")

    else:
        print(f"❌ Video extraction failed")
        return False

    return True


def test_direct_yt_dlp():
    """Test direct yt-dlp with cookies"""

    print(f"\n🧪 Testing direct yt-dlp with authenticated cookies")
    print("=" * 60)

    cookie_path = Path("data_production_backlog/.cookies/youtube_cookies.txt")

    ydl_opts = {
        'cookiefile': str(cookie_path),
        'quiet': False,
        'writesubtitles': True,
        'writeautomaticsub': True,
        'subtitleslangs': ['en'],
        'skip_download': True,
    }

    test_video = "https://www.youtube.com/watch?v=TpdYT_itu9U"

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            print(f"Extracting with direct yt-dlp...")
            info = ydl.extract_info(test_video, download=False)

            if info:
                print(f"✅ Direct yt-dlp successful!")

                auto_captions = info.get('automatic_captions', {})
                if 'en' in auto_captions:
                    print(f"✅ Transcripts available via direct yt-dlp!")
                    return True
                else:
                    print(f"❌ No transcripts in direct yt-dlp")

    except Exception as e:
        print(f"❌ Direct yt-dlp failed: {e}")

    return False


if __name__ == "__main__":
    success = test_hvac_video()

    if not success:
        print(f"\n" + "=" * 60)
        success = test_direct_yt_dlp()

    if success:
        print(f"\n🎉 YouTube transcript extraction is working!")
        print(f"Ready to update YouTube scraper with transcript support.")
    else:
        print(f"\n❌ YouTube transcript extraction not working")
        print(f"May need additional authentication or different approach.")