Major Changes: - Updated all code references from hvacknowitall/hvacnkowitall to hkia - Renamed all existing markdown files to use hkia_ prefix - Updated configuration files, scrapers, and production scripts - Modified systemd service descriptions to use HKIA - Changed NAS sync path to /mnt/nas/hkia Files Updated: - 20+ source files updated with new naming convention - 34 markdown files renamed to hkia_* format - All ScraperConfig brand_name parameters now use 'hkia' - Documentation updated to reflect new naming Rationale: - Shorter, cleaner filenames - Consistent branding across all outputs - Easier to type and reference - Maintains same functionality with improved naming Next Steps: - Deploy updated services to production - Update any external references to old naming - Monitor scrapers to ensure proper operation 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
144 lines
No EOL
5.2 KiB
Python
144 lines
No EOL
5.2 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Fetch 100 YouTube videos with transcripts for backlog processing
|
|
This will capture the first 100 videos with full transcript extraction
|
|
"""
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
sys.path.insert(0, str(Path(__file__).parent))
|
|
|
|
from src.base_scraper import ScraperConfig
|
|
from src.youtube_scraper import YouTubeScraper
|
|
from datetime import datetime
|
|
import logging
|
|
import time
|
|
|
|
# Logging configuration: mirror every message to a dedicated log file
# and to the console so progress is visible while a full record is kept.
LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

logging.basicConfig(
    level=logging.INFO,
    format=LOG_FORMAT,
    handlers=[
        logging.FileHandler('youtube_100_transcripts.log'),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(__name__)
|
|
|
|
def fetch_100_with_transcripts():
    """Fetch 100 YouTube videos with transcripts for the backlog.

    Builds a backlog-specific ScraperConfig, verifies YouTube cookie
    authentication, fetches up to 100 videos with transcript extraction,
    writes the combined markdown into
    ``data_production_backlog/markdown_current/``, and logs summary
    statistics plus a preview of the first three transcripts.

    Returns:
        bool: True on success; False if authentication is missing, no
        videos were fetched, or an unexpected error occurred.
    """
    logger.info("🎥 YOUTUBE BACKLOG: Fetching 100 videos WITH TRANSCRIPTS")
    logger.info("This will take approximately 5-8 minutes (3-5 seconds per video)")
    logger.info("=" * 70)

    # Create config for backlog processing.
    # Brand was renamed repo-wide from 'hvacknowitall' to 'hkia' (see the
    # rename changelog at the top of this file); this script had been missed.
    config = ScraperConfig(
        source_name="youtube",
        brand_name="hkia",
        data_dir=Path("data_production_backlog"),
        logs_dir=Path("logs_production_backlog"),
        timezone="America/Halifax",
    )

    # Initialize scraper
    scraper = YouTubeScraper(config)

    # Test authentication first — transcript fetching requires valid cookies.
    auth_status = scraper.auth_handler.get_status()
    if not auth_status['has_valid_cookies']:
        logger.error("❌ No valid YouTube authentication found")
        logger.error("Please ensure you're logged into YouTube in Firefox")
        return False

    logger.info(f"✅ Authentication validated: {auth_status['cookie_path']}")

    # Fetch 100 videos with transcripts using the enhanced method.
    logger.info("Fetching 100 videos with transcripts...")
    start_time = time.time()

    try:
        videos = scraper.fetch_content(max_posts=100, fetch_transcripts=True)

        if not videos:
            logger.error("❌ No videos fetched")
            return False

        # Transcript coverage statistics. NOTE(review): assumes each video
        # is a dict with optional 'transcript'/'title' string keys — TODO
        # confirm against YouTubeScraper.fetch_content.
        transcript_count = sum(1 for video in videos if video.get('transcript'))
        total_transcript_chars = sum(len(video.get('transcript', '')) for video in videos)

        # Generate markdown
        logger.info("\nGenerating markdown with transcripts...")
        markdown = scraper.format_markdown(videos)

        # Save with a timestamped filename using the new 'hkia' prefix
        # (was 'hvacknowitall_...', inconsistent with the repo-wide rename).
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"hkia_youtube_backlog_100_transcripts_{timestamp}.md"

        output_dir = config.data_dir / "markdown_current"
        output_dir.mkdir(parents=True, exist_ok=True)
        output_file = output_dir / filename

        output_file.write_text(markdown, encoding='utf-8')

        # Calculate duration
        duration = time.time() - start_time

        # Final statistics. len(videos) is non-zero here (guarded above),
        # and the average-length division is guarded against zero transcripts.
        logger.info("\n" + "=" * 70)
        logger.info("🎉 YOUTUBE BACKLOG CAPTURE COMPLETE")
        logger.info("📊 STATISTICS:")
        logger.info(f"   Total videos fetched: {len(videos)}")
        logger.info(f"   Videos with transcripts: {transcript_count}")
        logger.info(f"   Transcript success rate: {transcript_count/len(videos)*100:.1f}%")
        logger.info(f"   Total transcript characters: {total_transcript_chars:,}")
        logger.info(f"   Average transcript length: {total_transcript_chars/transcript_count if transcript_count > 0 else 0:,.0f} chars")
        logger.info(f"   Processing time: {duration/60:.1f} minutes")
        logger.info(f"   Average time per video: {duration/len(videos):.1f} seconds")
        logger.info(f"📄 Saved to: {output_file}")

        # Show sample transcript info for a quick sanity check.
        logger.info("\n📝 SAMPLE TRANSCRIPT DATA:")
        for i, video in enumerate(videos[:3]):
            title = video.get('title', 'Unknown')[:50] + "..."
            transcript = video.get('transcript', '')
            if transcript:
                logger.info(f"   {i+1}. {title} - {len(transcript):,} chars")
                preview = transcript[:100] + "..." if len(transcript) > 100 else transcript
                logger.info(f"      Preview: {preview}")
            else:
                logger.info(f"   {i+1}. {title} - No transcript")

        return True

    except Exception as e:
        # Broad catch is intentional for a one-shot CLI script: log the
        # failure and signal it via the return value instead of a traceback.
        logger.error(f"❌ Failed to fetch videos: {e}")
        return False
|
|
|
|
def main():
    """Show the run summary, wait for confirmation, then run the capture.

    Returns False if the user cancels at the prompt (Ctrl+C); otherwise
    returns whatever fetch_100_with_transcripts() returns.
    """
    banner = (
        "\n🎥 YouTube Backlog Capture with Transcripts",
        "=" * 50,
        "This will fetch 100 YouTube videos with full transcripts",
        "Estimated time: 5-8 minutes",
        "Output: Markdown file with videos and complete transcripts",
        "\nPress Enter to continue or Ctrl+C to cancel...",
    )
    for line in banner:
        print(line)

    try:
        input()
    except KeyboardInterrupt:
        print("\nCancelled by user")
        return False

    return fetch_100_with_transcripts()
|
|
|
|
if __name__ == "__main__":
    # Exit codes: 0 = success, 1 = failure or user interrupt, 2 = crash.
    try:
        sys.exit(0 if main() else 1)
    except KeyboardInterrupt:
        logger.info("\nCapture interrupted by user")
        sys.exit(1)
    except Exception as e:
        # SystemExit is not an Exception subclass, so the exits above
        # pass through this handler untouched.
        logger.critical(f"Capture failed: {e}")
        sys.exit(2)