Major Changes:
- Updated all code references from hvacknowitall/hvacnkowitall to hkia
- Renamed all existing markdown files to use the hkia_ prefix
- Updated configuration files, scrapers, and production scripts
- Modified systemd service descriptions to use HKIA
- Changed the NAS sync path to /mnt/nas/hkia

Files Updated:
- 20+ source files updated with the new naming convention
- 34 markdown files renamed to the hkia_* format
- All ScraperConfig brand_name parameters now use 'hkia'
- Documentation updated to reflect the new naming

Rationale:
- Shorter, cleaner filenames
- Consistent branding across all outputs
- Easier to type and reference
- Maintains the same functionality with improved naming

Next Steps:
- Deploy the updated services to production
- Update any external references to the old naming
- Monitor the scrapers to ensure proper operation

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
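For reference, the brand_name rename is a one-field change per scraper config. The sketch below is illustrative rather than a diff from the actual commit: it assumes brand_name is what the scrapers use to prefix their output files, and it reuses the field values from the script that follows, which may differ for other scrapers.

from pathlib import Path

from src.base_scraper import ScraperConfig

# Before the rename: outputs carried the long brand prefix,
# e.g. hvacknowitall_youtube_transcripts_<timestamp>.md
old_config = ScraperConfig(
    source_name="youtube",
    brand_name="hvacknowitall",
    data_dir=Path("data_production_backlog"),
    logs_dir=Path("logs_production_backlog"),
    timezone="America/Halifax"
)

# After the rename: outputs use the shorter hkia prefix,
# e.g. hkia_youtube_transcripts_<timestamp>.md
new_config = ScraperConfig(
    source_name="youtube",
    brand_name="hkia",
    data_dir=Path("data_production_backlog"),
    logs_dir=Path("logs_production_backlog"),
    timezone="America/Halifax"
)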
#!/usr/bin/env python3
"""
Fetch YouTube videos with transcripts.

This will take longer as it needs to fetch each video individually.
"""

import logging
import sys
import time
from datetime import datetime
from pathlib import Path

# Make the project root importable before pulling in the src modules
sys.path.insert(0, str(Path(__file__).parent))

from src.base_scraper import ScraperConfig
from src.youtube_scraper import YouTubeScraper

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('youtube_transcripts.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)


def fetch_with_transcripts(max_videos: int = 10):
    """Fetch YouTube videos with transcripts"""
    logger.info("🎥 Fetching YouTube videos WITH TRANSCRIPTS")
    logger.info(f"This will fetch detailed info and transcripts for {max_videos} videos")
    logger.info("Note: This is slower as each video requires individual API calls")
    logger.info("=" * 60)

    # Create config (brand_name 'hkia' matches the naming convention
    # described in the commit message above)
    config = ScraperConfig(
        source_name="youtube",
        brand_name="hkia",
        data_dir=Path("data_production_backlog"),
        logs_dir=Path("logs_production_backlog"),
        timezone="America/Halifax"
    )

    # Initialize scraper
    scraper = YouTubeScraper(config)

    # First get video list (fast)
    logger.info("Step 1: Fetching video list from channel...")
    videos = scraper.fetch_channel_videos(max_videos=max_videos)

    if not videos:
        logger.error("No videos found")
        return False

    logger.info(f"Found {len(videos)} videos")

    # Now fetch detailed info with transcripts for each video
    logger.info("\nStep 2: Fetching transcripts for each video...")
    logger.info("This will take approximately 3-5 seconds per video")

    videos_with_transcripts = []
    transcript_count = 0

    for i, video in enumerate(videos):
        video_id = video.get('id')
        if not video_id:
            continue

        logger.info(f"\n[{i+1}/{len(videos)}] Processing: {video.get('title', 'Unknown')[:60]}...")

        # Add delay to avoid rate limiting
        if i > 0:
            scraper._humanized_delay(2, 4)

        # Fetch with transcript
        detailed_info = scraper.fetch_video_details(video_id, fetch_transcript=True)

        if detailed_info:
            if detailed_info.get('transcript'):
                transcript_count += 1
                logger.info("  ✅ Transcript found!")
            else:
                logger.info("  ⚠️ No transcript available")

            videos_with_transcripts.append(detailed_info)
        else:
            logger.warning("  ❌ Failed to fetch details")
            # Use basic info if detailed fetch fails
            videos_with_transcripts.append(video)

        # Extra delay every 10 videos
        if (i + 1) % 10 == 0:
            logger.info("Taking extended break after 10 videos...")
            time.sleep(10)

    # Generate markdown
    logger.info("\nStep 3: Generating markdown...")
    markdown = scraper.format_markdown(videos_with_transcripts)

    # Save with timestamp, using the hkia_ filename prefix
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"hkia_youtube_transcripts_{timestamp}.md"

    output_dir = config.data_dir / "markdown_current"
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / filename

    output_file.write_text(markdown, encoding='utf-8')
    logger.info(f"📄 Saved to: {output_file}")

    # Statistics
    logger.info("\n" + "=" * 60)
    logger.info("📊 YOUTUBE TRANSCRIPT CAPTURE COMPLETE")
    logger.info(f"Total videos: {len(videos_with_transcripts)}")
    logger.info(f"Videos with transcripts: {transcript_count}")
    if videos_with_transcripts:
        # Guard against division by zero if every video was skipped
        logger.info(f"Success rate: {transcript_count / len(videos_with_transcripts) * 100:.1f}%")

    return True


def main():
    """Main execution"""
    print("\n⚠️ WARNING: Fetching transcripts requires individual API calls for each video")
    print("This will take approximately 3-5 seconds per video")
    print("Estimated time for 370 videos: 20-30 minutes")
    print("\nOptions:")
    print("1. Test with 5 videos first")
    print("2. Fetch first 50 videos with transcripts")
    print("3. Fetch all 370 videos with transcripts (20-30 mins)")
    print("4. Cancel")

    choice = input("\nEnter choice (1-4): ")

    if choice == "1":
        return fetch_with_transcripts(5)
    elif choice == "2":
        return fetch_with_transcripts(50)
    elif choice == "3":
        return fetch_with_transcripts(370)
    else:
        print("Cancelled")
        return False


if __name__ == "__main__":
    try:
        success = main()
        sys.exit(0 if success else 1)
    except KeyboardInterrupt:
        logger.info("\nCapture interrupted by user")
        sys.exit(1)
    except Exception as e:
        logger.critical(f"Capture failed: {e}")
        sys.exit(2)
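As a usage note: the listing above does not name the script file, so the module name below is an assumption. A quick non-interactive smoke test after the rename could bypass the menu in main() and call the worker function directly:

# Hypothetical smoke test; assumes the script above is saved as
# fetch_youtube_transcripts.py in the project root, next to src/.
from fetch_youtube_transcripts import fetch_with_transcripts

if fetch_with_transcripts(max_videos=5):
    print("Smoke test passed: markdown written under data_production_backlog/markdown_current/")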