hvac-kia-content/test_youtube_scraper_enhanced.py
Ben Reed daab901e35 refactor: Update naming convention from hvacknowitall to hkia
Major Changes:
- Updated all code references from hvacknowitall/hvacnkowitall to hkia
- Renamed all existing markdown files to use hkia_ prefix
- Updated configuration files, scrapers, and production scripts
- Modified systemd service descriptions to use HKIA
- Changed NAS sync path to /mnt/nas/hkia

Files Updated:
- 20+ source files updated with new naming convention
- 34 markdown files renamed to hkia_* format
- All ScraperConfig brand_name parameters now use 'hkia'
- Documentation updated to reflect new naming

Rationale:
- Shorter, cleaner filenames
- Consistent branding across all outputs
- Easier to type and reference
- Maintains same functionality with improved naming

Next Steps:
- Deploy updated services to production
- Update any external references to old naming
- Monitor scrapers to ensure proper operation

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-19 13:35:23 -03:00

135 lines
No EOL
4.6 KiB
Python

#!/usr/bin/env python3
"""
Test the enhanced YouTube scraper with transcript support
"""
import sys
import json
from pathlib import Path
sys.path.append(str(Path(__file__).parent / 'src'))
from youtube_scraper import YouTubeScraper
from base_scraper import ScraperConfig
def test_single_video_with_transcript():
"""Test transcript extraction on a single video"""
print("🎥 Testing single video with transcript extraction")
print("=" * 60)
# Setup config
config = ScraperConfig(
source_name='youtube_test',
brand_name='hvacknowitall',
data_dir=Path('test_data/youtube_transcript'),
logs_dir=Path('test_logs/youtube_transcript'),
timezone='America/Halifax'
)
scraper = YouTubeScraper(config)
# Test with a specific video ID
video_id = "TpdYT_itu9U" # HVAC video we tested before
print(f"Fetching video details with transcript: {video_id}")
video_info = scraper.fetch_video_details(video_id, fetch_transcript=True)
if video_info:
print(f"✅ Video info extracted successfully!")
print(f" Title: {video_info.get('title', 'Unknown')}")
print(f" Duration: {video_info.get('duration', 0)} seconds")
print(f" Views: {video_info.get('view_count', 'Unknown')}")
transcript = video_info.get('transcript')
if transcript:
print(f" ✅ Transcript extracted: {len(transcript)} characters")
# Show preview
preview = transcript[:200] + "..." if len(transcript) > 200 else transcript
print(f" Preview: {preview}")
# Save to file for inspection
output_file = config.data_dir / 'test_video_with_transcript.json'
output_file.parent.mkdir(parents=True, exist_ok=True)
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(video_info, f, indent=2, ensure_ascii=False)
print(f" Saved full data to: {output_file}")
return True
else:
print(f" ❌ No transcript extracted")
return False
else:
print(f"❌ Failed to extract video info")
return False
def test_multiple_videos_with_transcripts():
"""Test fetching multiple videos with transcripts"""
print(f"\n🎬 Testing multiple videos with transcripts")
print("=" * 60)
# Setup config
config = ScraperConfig(
source_name='youtube_test_multi',
brand_name='hvacknowitall',
data_dir=Path('test_data/youtube_multi_transcript'),
logs_dir=Path('test_logs/youtube_multi_transcript'),
timezone='America/Halifax'
)
scraper = YouTubeScraper(config)
# Fetch 3 videos with transcripts
print(f"Fetching 3 videos with transcripts...")
videos = scraper.fetch_content(max_posts=3, fetch_transcripts=True)
if videos:
print(f"✅ Fetched {len(videos)} videos!")
transcript_count = 0
total_transcript_chars = 0
for i, video in enumerate(videos):
title = video.get('title', 'Unknown')[:50] + "..."
transcript = video.get('transcript')
if transcript:
transcript_count += 1
total_transcript_chars += len(transcript)
print(f" {i+1}. {title} - ✅ Transcript ({len(transcript)} chars)")
else:
print(f" {i+1}. {title} - ❌ No transcript")
print(f"\nSummary:")
print(f" Videos with transcripts: {transcript_count}/{len(videos)}")
print(f" Total transcript characters: {total_transcript_chars:,}")
# Save to markdown
markdown = scraper.format_markdown(videos)
output_file = config.data_dir / 'youtube_with_transcripts.md'
output_file.parent.mkdir(parents=True, exist_ok=True)
output_file.write_text(markdown, encoding='utf-8')
print(f" Saved markdown to: {output_file}")
return transcript_count > 0
else:
print(f"❌ Failed to fetch videos")
return False
if __name__ == "__main__":
print("🧪 Testing Enhanced YouTube Scraper")
print("=" * 60)
success1 = test_single_video_with_transcript()
success2 = test_multiple_videos_with_transcripts()
if success1 and success2:
print(f"\n🎉 All tests passed!")
print(f"YouTube scraper with transcript support is working!")
else:
print(f"\n❌ Some tests failed")
print(f"Single video: {'' if success1 else ''}")
print(f"Multiple videos: {'' if success2 else ''}")