#!/usr/bin/env python3
"""
Test the enhanced YouTube scraper with transcript support.

Manual smoke test meant to be run as a script. It fetches one known video
and then a small batch of videos through ``YouTubeScraper``, reports whether
transcripts came back, and writes the results under ``test_data/`` for
inspection. The process exit status is 0 when both checks pass and 1
otherwise.
"""

import json
import sys
from pathlib import Path

# Make the project's src/ directory importable when this file is run directly.
sys.path.append(str(Path(__file__).parent / 'src'))

from youtube_scraper import YouTubeScraper
from base_scraper import ScraperConfig

# Display limits for console output.
PREVIEW_CHARS = 200
TITLE_CHARS = 50


def _truncate(text, limit):
    """Return *text* cut to *limit* characters, adding "..." only if it was cut."""
    return text[:limit] + "..." if len(text) > limit else text


def _make_config(source_name, subdir):
    """Build a ScraperConfig whose data and log directories live under *subdir*."""
    return ScraperConfig(
        source_name=source_name,
        brand_name='hvacknowitall',
        data_dir=Path('test_data') / subdir,
        logs_dir=Path('test_logs') / subdir,
        timezone='America/Halifax',
    )


def test_single_video_with_transcript():
    """Test transcript extraction on a single video.

    Fetches details for one hard-coded video ID with transcript fetching
    enabled, prints a short summary, and dumps the returned data to
    ``test_video_with_transcript.json`` in the config's data directory.

    Returns:
        True if video info was returned and it contains a non-empty
        ``transcript`` entry; False otherwise.
    """
    print("🎥 Testing single video with transcript extraction")
    print("=" * 60)

    config = _make_config('youtube_test', 'youtube_transcript')
    scraper = YouTubeScraper(config)

    # Test with a specific video ID
    video_id = "TpdYT_itu9U"  # HVAC video we tested before

    print(f"Fetching video details with transcript: {video_id}")
    video_info = scraper.fetch_video_details(video_id, fetch_transcript=True)

    if not video_info:
        print("❌ Failed to extract video info")
        return False

    print("✅ Video info extracted successfully!")
    print(f" Title: {video_info.get('title', 'Unknown')}")
    print(f" Duration: {video_info.get('duration', 0)} seconds")
    print(f" Views: {video_info.get('view_count', 'Unknown')}")

    transcript = video_info.get('transcript')
    if not transcript:
        print(" ❌ No transcript extracted")
        return False

    print(f" ✅ Transcript extracted: {len(transcript)} characters")
    print(f" Preview: {_truncate(transcript, PREVIEW_CHARS)}")

    # Save to file for inspection
    # NOTE(review): assumes video_info is JSON-serializable - confirm against
    # what fetch_video_details actually returns.
    output_file = config.data_dir / 'test_video_with_transcript.json'
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(video_info, f, indent=2, ensure_ascii=False)
    print(f" Saved full data to: {output_file}")
    return True


def test_multiple_videos_with_transcripts():
    """Test fetching multiple videos with transcripts.

    Requests up to three videos with transcript fetching enabled, prints a
    per-video line and a summary, and writes the scraper's markdown rendering
    to ``youtube_with_transcripts.md`` in the config's data directory.

    Returns:
        True if at least one returned video has a non-empty ``transcript``
        entry; False if no videos were returned or none had a transcript.
    """
    print("\n🎬 Testing multiple videos with transcripts")
    print("=" * 60)

    config = _make_config('youtube_test_multi', 'youtube_multi_transcript')
    scraper = YouTubeScraper(config)

    # Fetch 3 videos with transcripts
    print("Fetching 3 videos with transcripts...")
    videos = scraper.fetch_content(max_posts=3, fetch_transcripts=True)

    if not videos:
        print("❌ Failed to fetch videos")
        return False

    print(f"✅ Fetched {len(videos)} videos!")

    transcript_count = 0
    total_transcript_chars = 0

    for number, video in enumerate(videos, start=1):
        # `or` covers a title that is present but None/empty; the ellipsis is
        # only added when the title was actually shortened.
        title = _truncate(video.get('title') or 'Unknown', TITLE_CHARS)
        transcript = video.get('transcript')

        if transcript:
            transcript_count += 1
            total_transcript_chars += len(transcript)
            print(f" {number}. {title} - ✅ Transcript ({len(transcript)} chars)")
        else:
            print(f" {number}. {title} - ❌ No transcript")

    print("\nSummary:")
    print(f" Videos with transcripts: {transcript_count}/{len(videos)}")
    print(f" Total transcript characters: {total_transcript_chars:,}")

    # Save to markdown
    markdown = scraper.format_markdown(videos)
    output_file = config.data_dir / 'youtube_with_transcripts.md'
    output_file.parent.mkdir(parents=True, exist_ok=True)
    output_file.write_text(markdown, encoding='utf-8')
    print(f" Saved markdown to: {output_file}")

    return transcript_count > 0


def main():
    """Run both checks, print a summary, and return a process exit status.

    Returns:
        0 if both checks returned True, 1 otherwise.
    """
    print("🧪 Testing Enhanced YouTube Scraper")
    print("=" * 60)

    success1 = test_single_video_with_transcript()
    success2 = test_multiple_videos_with_transcripts()

    if success1 and success2:
        print("\n🎉 All tests passed!")
        print("YouTube scraper with transcript support is working!")
        return 0

    print("\n❌ Some tests failed")
    print(f"Single video: {'✅' if success1 else '❌'}")
    print(f"Multiple videos: {'✅' if success2 else '❌'}")
    return 1


if __name__ == "__main__":
    # Propagate failure to the shell/CI instead of always exiting 0.
    sys.exit(main())