hvac-kia-content/fetch_youtube_100_with_transcripts.py
Ben Reed daab901e35 refactor: Update naming convention from hvacknowitall to hkia
Major Changes:
- Updated all code references from hvacknowitall (and the misspelled variant hvacnkowitall) to hkia; call-site sketch below
- Renamed all existing markdown files to use hkia_ prefix
- Updated configuration files, scrapers, and production scripts
- Modified systemd service descriptions to use HKIA
- Changed NAS sync path to /mnt/nas/hkia
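
The call-site change, sketched (illustrative only; ScraperConfig takes more parameters than shown here):

    config = ScraperConfig(
        source_name="youtube",
        brand_name="hkia",  # was "hvacknowitall"
    )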

Files Updated:
- 20+ source files updated with new naming convention
- 34 markdown files renamed to hkia_* format (rename sketch below)
- All ScraperConfig brand_name parameters now use 'hkia'
- Documentation updated to reflect new naming
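
The markdown rename is a straight prefix swap; a minimal sketch of an equivalent rename pass (the directory is hypothetical, and str.removeprefix needs Python 3.9+):

    from pathlib import Path

    # Swap the old brand prefix for the new one on every matching file
    for f in Path("data_production_backlog/markdown_current").glob("hvacknowitall_*.md"):
        f.rename(f.with_name("hkia_" + f.name.removeprefix("hvacknowitall_")))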

Rationale:
- Shorter, cleaner filenames
- Consistent branding across all outputs
- Easier to type and reference
- Maintains same functionality with improved naming

Next Steps:
- Deploy updated services to production
- Update any external references to old naming
- Monitor scrapers to ensure proper operation

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-19 13:35:23 -03:00

#!/usr/bin/env python3
"""
Fetch 100 YouTube videos with transcripts for backlog processing.

This captures the first 100 videos with full transcript extraction.
"""
import logging
import sys
import time
from datetime import datetime
from pathlib import Path

# Make the repo root importable so the local src package resolves
# when this script is run directly.
sys.path.insert(0, str(Path(__file__).parent))

from src.base_scraper import ScraperConfig
from src.youtube_scraper import YouTubeScraper

# Set up logging to both a file and the console
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('youtube_100_transcripts.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)


def fetch_100_with_transcripts():
    """Fetch 100 YouTube videos with transcripts for the backlog."""
    logger.info("🎥 YOUTUBE BACKLOG: Fetching 100 videos WITH TRANSCRIPTS")
    logger.info("This will take approximately 5-8 minutes (3-5 seconds per video)")
    logger.info("=" * 70)

    # Create config for backlog processing
    config = ScraperConfig(
        source_name="youtube",
        brand_name="hkia",
        data_dir=Path("data_production_backlog"),
        logs_dir=Path("logs_production_backlog"),
        timezone="America/Halifax"
    )

    # Initialize scraper
    scraper = YouTubeScraper(config)

    # Test authentication first: the scraper reuses browser cookies
    auth_status = scraper.auth_handler.get_status()
    if not auth_status['has_valid_cookies']:
        logger.error("❌ No valid YouTube authentication found")
        logger.error("Please ensure you're logged into YouTube in Firefox")
        return False

    logger.info(f"✅ Authentication validated: {auth_status['cookie_path']}")

    # Fetch 100 videos with transcripts using the enhanced method
    logger.info("Fetching 100 videos with transcripts...")
    start_time = time.time()

    try:
        videos = scraper.fetch_content(max_posts=100, fetch_transcripts=True)

        if not videos:
            logger.error("❌ No videos fetched")
            return False

        # Count videos with transcripts
        transcript_count = sum(1 for video in videos if video.get('transcript'))
        total_transcript_chars = sum(len(video.get('transcript', '')) for video in videos)

        # Generate markdown
        logger.info("\nGenerating markdown with transcripts...")
        markdown = scraper.format_markdown(videos)

        # Save with a timestamp in the hkia_ naming scheme
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"hkia_youtube_backlog_100_transcripts_{timestamp}.md"
        output_dir = config.data_dir / "markdown_current"
        output_dir.mkdir(parents=True, exist_ok=True)

        output_file = output_dir / filename
        output_file.write_text(markdown, encoding='utf-8')

        # Calculate elapsed time
        duration = time.time() - start_time

        # Final statistics
        logger.info("\n" + "=" * 70)
        logger.info("🎉 YOUTUBE BACKLOG CAPTURE COMPLETE")
        logger.info("📊 STATISTICS:")
        logger.info(f"   Total videos fetched: {len(videos)}")
        logger.info(f"   Videos with transcripts: {transcript_count}")
        logger.info(f"   Transcript success rate: {transcript_count / len(videos) * 100:.1f}%")
        logger.info(f"   Total transcript characters: {total_transcript_chars:,}")
        logger.info(f"   Average transcript length: "
                    f"{total_transcript_chars / transcript_count if transcript_count else 0:,.0f} chars")
        logger.info(f"   Processing time: {duration / 60:.1f} minutes")
        logger.info(f"   Average time per video: {duration / len(videos):.1f} seconds")
        logger.info(f"📄 Saved to: {output_file}")

        # Show sample transcript info for the first three videos
        logger.info("\n📝 SAMPLE TRANSCRIPT DATA:")
        for i, video in enumerate(videos[:3]):
            # Truncate long titles for display only
            title = video.get('title', 'Unknown')
            if len(title) > 50:
                title = title[:50] + "..."
            transcript = video.get('transcript', '')
            if transcript:
                logger.info(f"  {i + 1}. {title} - {len(transcript):,} chars")
                preview = transcript[:100] + "..." if len(transcript) > 100 else transcript
                logger.info(f"     Preview: {preview}")
            else:
                logger.info(f"  {i + 1}. {title} - No transcript")

        return True

    except Exception as e:
        logger.error(f"❌ Failed to fetch videos: {e}")
        return False


def main():
    """Main execution"""
    print("\n🎥 YouTube Backlog Capture with Transcripts")
    print("=" * 50)
    print("This will fetch 100 YouTube videos with full transcripts")
    print("Estimated time: 5-8 minutes")
    print("Output: Markdown file with videos and complete transcripts")
    print("\nPress Enter to continue or Ctrl+C to cancel...")

    try:
        input()
    except KeyboardInterrupt:
        print("\nCancelled by user")
        return False

    return fetch_100_with_transcripts()
if __name__ == "__main__":
try:
success = main()
sys.exit(0 if success else 1)
except KeyboardInterrupt:
logger.info("\nCapture interrupted by user")
sys.exit(1)
except Exception as e:
logger.critical(f"Capture failed: {e}")
sys.exit(2)