Major Changes: - Updated all code references from hvacknowitall/hvacnkowitall to hkia - Renamed all existing markdown files to use hkia_ prefix - Updated configuration files, scrapers, and production scripts - Modified systemd service descriptions to use HKIA - Changed NAS sync path to /mnt/nas/hkia Files Updated: - 20+ source files updated with new naming convention - 34 markdown files renamed to hkia_* format - All ScraperConfig brand_name parameters now use 'hkia' - Documentation updated to reflect new naming Rationale: - Shorter, cleaner filenames - Consistent branding across all outputs - Easier to type and reference - Maintains same functionality with improved naming Next Steps: - Deploy updated services to production - Update any external references to old naming - Monitor scrapers to ensure proper operation 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
131 lines
No EOL
4.4 KiB
Python
131 lines
No EOL
4.4 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Test YouTube authentication with various methods
|
|
"""
|
|
|
|
import yt_dlp
|
|
from pathlib import Path
|
|
import json
|
|
|
|
def test_direct_extraction():
    """Try extracting video metadata from YouTube without any cookies.

    Builds a ``yt_dlp.YoutubeDL`` instance with ``skip_download`` enabled,
    calls ``extract_info(..., download=False)`` on a fixed test video and
    prints the title, the duration, and which manual-subtitle and
    auto-caption languages are listed in the returned info dict.

    Returns:
        bool: True when ``extract_info`` returned a non-empty result,
        False when it returned nothing or when any exception was raised.
    """
    print("Testing direct YouTube access...")
    print("=" * 60)

    test_video = "https://www.youtube.com/watch?v=TpdYT_itu9U"

    # Basic options without authentication
    ydl_opts = {
        'quiet': False,
        'no_warnings': False,
        'extract_flat': False,
        'skip_download': True,
        'writesubtitles': True,
        'writeautomaticsub': True,
        'subtitleslangs': ['en'],
        # Add user agent and headers
        # NOTE(review): custom request headers are normally handed to
        # YoutubeDL through the 'http_headers' dict; confirm that the
        # 'user_agent' / 'referer' keys below actually have an effect.
        'user_agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'referer': 'https://www.youtube.com/',
        # Try age gate bypass
        'age_limit': None,
        # Format selection - try to avoid age-gated formats
        'format': 'best[height<=720]',
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            print("Extracting video info...")
            info = ydl.extract_info(test_video, download=False)

            # An empty result is a failure; report it explicitly instead of
            # silently falling off the end of the function.
            if not info:
                print("❌ No video info returned")
                return False

            print("✅ Successfully extracted video info!")
            print(f"Title: {info.get('title', 'Unknown')}")
            print(f"Duration: {info.get('duration', 0)} seconds")

            # Check for transcripts. ``or {}`` also covers keys that are
            # present but set to None, which ``.get(key, {})`` does not.
            subtitles = info.get('subtitles') or {}
            auto_captions = info.get('automatic_captions') or {}

            print("\nTranscript availability:")
            if subtitles:
                print(f"  Manual subtitles: {list(subtitles.keys())}")
            if auto_captions:
                print(f"  Auto-captions: {list(auto_captions.keys())[:5]}...")  # Show first 5

                if 'en' in auto_captions:
                    print("\n  ✅ English auto-captions available!")
                    caption_urls = auto_captions['en']
                    for cap in caption_urls[:2]:  # Show first 2 formats
                        url_preview = (cap.get('url') or '')[:80]
                        print(f"    - {cap.get('ext', 'unknown')}: {url_preview}...")

            return True

    except Exception as e:
        # Diagnostic script boundary: every failure (network, extractor,
        # blocked access) is reported and turned into a False result.
        print(f"❌ Error: {e}")
        return False
|
|
|
|
def test_with_cookie_file():
    """Try extracting video metadata from YouTube using a saved cookie file.

    Looks for ``data_production_backlog/.cookies/youtube_cookies.txt``
    relative to the current working directory. When the file exists, it is
    passed to ``yt_dlp.YoutubeDL`` through the ``cookiefile`` option and
    ``extract_info(..., download=False)`` is called on a fixed test video.

    Returns:
        bool: True when ``extract_info`` returned a non-empty result,
        False when the cookie file is missing, when nothing was returned,
        or when any exception was raised.
    """
    cookie_file = Path("data_production_backlog/.cookies/youtube_cookies.txt")

    if not cookie_file.exists():
        print(f"Cookie file not found: {cookie_file}")
        return False

    print(f"\nTesting with cookie file: {cookie_file}")
    print("=" * 60)

    test_video = "https://www.youtube.com/watch?v=TpdYT_itu9U"

    ydl_opts = {
        'cookiefile': str(cookie_file),
        'quiet': False,
        'no_warnings': False,
        'skip_download': True,
        'writesubtitles': True,
        'writeautomaticsub': True,
        'subtitleslangs': ['en'],
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            print("Extracting with cookies...")
            info = ydl.extract_info(test_video, download=False)

            # An empty result is a failure; return False explicitly rather
            # than falling through to an implicit None.
            if not info:
                print("❌ No video info returned with cookies")
                return False

            print("✅ Success with cookies!")

            # Check transcripts. ``or {}`` keeps the membership test safe
            # when the key is present but set to None.
            auto_captions = info.get('automatic_captions') or {}
            if 'en' in auto_captions:
                print("✅ Transcripts available with cookies!")

            return True

    except Exception as e:
        # Diagnostic script boundary: report the failure and signal False.
        print(f"❌ Error with cookies: {e}")
        return False
|
|
|
|
if __name__ == "__main__":
    # Unauthenticated access is attempted first; the cookie-based check
    # only runs when that attempt does not report success.
    access_ok = test_direct_extraction()

    if not access_ok:
        print("\n" + "=" * 60)
        print("Direct extraction failed. Trying with cookies...")
        access_ok = test_with_cookie_file()

    # Pick the closing report for the final outcome, then print it line by
    # line so the output matches one print() call per message.
    if access_ok:
        report_lines = (
            "\n✅ YouTube access working!",
            "Transcripts can be fetched.",
        )
    else:
        report_lines = (
            "\n❌ YouTube access blocked",
            "\nYouTube is blocking automated access.",
            "This is a known issue with YouTube's anti-bot measures.",
            "\nPossible solutions:",
            "1. Use a proxy/VPN to change IP",
            "2. Wait and retry later",
            "3. Use authenticated browser session",
            "4. Use YouTube API with API key",
        )

    for report_line in report_lines:
        print(report_line)