Major Changes: - Updated all code references from hvacknowitall/hvacnkowitall to hkia - Renamed all existing markdown files to use hkia_ prefix - Updated configuration files, scrapers, and production scripts - Modified systemd service descriptions to use HKIA - Changed NAS sync path to /mnt/nas/hkia Files Updated: - 20+ source files updated with new naming convention - 34 markdown files renamed to hkia_* format - All ScraperConfig brand_name parameters now use 'hkia' - Documentation updated to reflect new naming Rationale: - Shorter, cleaner filenames - Consistent branding across all outputs - Easier to type and reference - Maintains same functionality with improved naming Next Steps: - Deploy updated services to production - Update any external references to old naming - Monitor scrapers to ensure proper operation 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
135 lines
No EOL
4.6 KiB
Python
135 lines
No EOL
4.6 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Test the enhanced YouTube scraper with transcript support
|
|
"""
|
|
|
|
import sys
|
|
import json
|
|
from pathlib import Path
|
|
sys.path.append(str(Path(__file__).parent / 'src'))
|
|
|
|
from youtube_scraper import YouTubeScraper
|
|
from base_scraper import ScraperConfig
|
|
|
|
def test_single_video_with_transcript():
    """Fetch one known video together with its transcript and report on it.

    Progress is printed to stdout. When a transcript is present, the full
    video record is also written as JSON to
    ``test_video_with_transcript.json`` inside the configured data directory
    so it can be inspected by hand.

    Returns:
        True if video details with a non-empty transcript were obtained,
        False if the details could not be fetched or carried no transcript.
    """
    banner = "=" * 60
    print("🎥 Testing single video with transcript extraction")
    print(banner)

    # Scraper configuration pointing at dedicated test directories.
    config = ScraperConfig(
        source_name='youtube_test',
        brand_name='hvacknowitall',
        data_dir=Path('test_data/youtube_transcript'),
        logs_dir=Path('test_logs/youtube_transcript'),
        timezone='America/Halifax',
    )
    scraper = YouTubeScraper(config)

    # Fixed video ID used as the probe.
    video_id = "TpdYT_itu9U"  # HVAC video we tested before

    print(f"Fetching video details with transcript: {video_id}")
    details = scraper.fetch_video_details(video_id, fetch_transcript=True)

    # Guard: nothing came back for the video at all.
    if not details:
        print("❌ Failed to extract video info")
        return False

    print("✅ Video info extracted successfully!")
    print(f" Title: {details.get('title', 'Unknown')}")
    print(f" Duration: {details.get('duration', 0)} seconds")
    print(f" Views: {details.get('view_count', 'Unknown')}")

    # Guard: details were fetched but the transcript is missing or empty.
    text = details.get('transcript')
    if not text:
        print(" ❌ No transcript extracted")
        return False

    print(f" ✅ Transcript extracted: {len(text)} characters")

    # Preview at most the first 200 characters, marking any truncation.
    if len(text) > 200:
        snippet = text[:200] + "..."
    else:
        snippet = text
    print(f" Preview: {snippet}")

    # Persist the complete record for manual inspection.
    output_file = config.data_dir / 'test_video_with_transcript.json'
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as handle:
        json.dump(details, handle, indent=2, ensure_ascii=False)

    print(f" Saved full data to: {output_file}")
    return True
|
|
|
|
def test_multiple_videos_with_transcripts():
|
|
"""Test fetching multiple videos with transcripts"""
|
|
|
|
print(f"\n🎬 Testing multiple videos with transcripts")
|
|
print("=" * 60)
|
|
|
|
# Setup config
|
|
config = ScraperConfig(
|
|
source_name='youtube_test_multi',
|
|
brand_name='hvacknowitall',
|
|
data_dir=Path('test_data/youtube_multi_transcript'),
|
|
logs_dir=Path('test_logs/youtube_multi_transcript'),
|
|
timezone='America/Halifax'
|
|
)
|
|
|
|
scraper = YouTubeScraper(config)
|
|
|
|
# Fetch 3 videos with transcripts
|
|
print(f"Fetching 3 videos with transcripts...")
|
|
videos = scraper.fetch_content(max_posts=3, fetch_transcripts=True)
|
|
|
|
if videos:
|
|
print(f"✅ Fetched {len(videos)} videos!")
|
|
|
|
transcript_count = 0
|
|
total_transcript_chars = 0
|
|
|
|
for i, video in enumerate(videos):
|
|
title = video.get('title', 'Unknown')[:50] + "..."
|
|
transcript = video.get('transcript')
|
|
|
|
if transcript:
|
|
transcript_count += 1
|
|
total_transcript_chars += len(transcript)
|
|
print(f" {i+1}. {title} - ✅ Transcript ({len(transcript)} chars)")
|
|
else:
|
|
print(f" {i+1}. {title} - ❌ No transcript")
|
|
|
|
print(f"\nSummary:")
|
|
print(f" Videos with transcripts: {transcript_count}/{len(videos)}")
|
|
print(f" Total transcript characters: {total_transcript_chars:,}")
|
|
|
|
# Save to markdown
|
|
markdown = scraper.format_markdown(videos)
|
|
output_file = config.data_dir / 'youtube_with_transcripts.md'
|
|
output_file.parent.mkdir(parents=True, exist_ok=True)
|
|
output_file.write_text(markdown, encoding='utf-8')
|
|
|
|
print(f" Saved markdown to: {output_file}")
|
|
|
|
return transcript_count > 0
|
|
else:
|
|
print(f"❌ Failed to fetch videos")
|
|
return False
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run both transcript checks and report the result.
    print("🧪 Testing Enhanced YouTube Scraper")
    print("=" * 60)

    # Both checks always run, so a failure in the first still produces a
    # report for the second.
    success1 = test_single_video_with_transcript()
    success2 = test_multiple_videos_with_transcripts()

    if success1 and success2:
        print("\n🎉 All tests passed!")
        print("YouTube scraper with transcript support is working!")
    else:
        print("\n❌ Some tests failed")
        print(f"Single video: {'✅' if success1 else '❌'}")
        print(f"Multiple videos: {'✅' if success2 else '❌'}")
        # Exit non-zero so shells and CI pipelines can detect the failure.
        sys.exit(1)