Major Changes:
- Updated all code references from hvacknowitall/hvacnkowitall to hkia
- Renamed all existing markdown files to use hkia_ prefix
- Updated configuration files, scrapers, and production scripts
- Modified systemd service descriptions to use HKIA
- Changed NAS sync path to /mnt/nas/hkia

Files Updated:
- 20+ source files updated with new naming convention
- 34 markdown files renamed to hkia_* format
- All ScraperConfig brand_name parameters now use 'hkia'
- Documentation updated to reflect new naming

Rationale:
- Shorter, cleaner filenames
- Consistent branding across all outputs
- Easier to type and reference
- Maintains same functionality with improved naming

Next Steps:
- Deploy updated services to production
- Update any external references to old naming
- Monitor scrapers to ensure proper operation

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
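For reference, the `ScraperConfig` change takes the same shape in every scraper entry point. A minimal before/after sketch, assuming the same `src.base_scraper` import path the scrapers use; the `data`/`logs` directory values are placeholders and the exact output filename pattern is an assumption:

```python
from pathlib import Path

from src.base_scraper import ScraperConfig

# Old convention: full brand name in the config, so outputs carried the
# long prefix (e.g. hvacknowitall_youtube_*.md -- pattern assumed).
config = ScraperConfig(
    source_name='youtube',
    brand_name='hvacknowitall',
    data_dir=Path('data'),      # placeholder path
    logs_dir=Path('logs'),      # placeholder path
    timezone='America/Halifax'
)

# New convention: short brand key, so outputs pick up the hkia_ prefix
# (e.g. hkia_youtube_*.md -- pattern assumed).
config = ScraperConfig(
    source_name='youtube',
    brand_name='hkia',
    data_dir=Path('data'),
    logs_dir=Path('logs'),
    timezone='America/Halifax'
)
```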
#!/usr/bin/env python3
"""
YouTube Backlog Capture with Transcripts - Slow Rate Limited Version

This script captures the complete YouTube channel backlog with transcripts
using extended delays to avoid YouTube's rate limiting on transcript fetching.

Designed for overnight/extended processing with minimal intervention required.
"""

import time
import random
import logging
from pathlib import Path

from src.base_scraper import ScraperConfig
from src.youtube_scraper import YouTubeScraper

# Ensure the log directory exists before logging.basicConfig creates the
# FileHandler below; on a fresh checkout the handler would otherwise raise
# FileNotFoundError, since main() only creates the directories later.
Path('logs_backlog_transcripts').mkdir(parents=True, exist_ok=True)

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('logs_backlog_transcripts/youtube_slow_backlog.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)


def main():
    """Execute slow YouTube backlog capture with transcripts."""

    print("=" * 80)
    print("YouTube Backlog Capture with Transcripts - SLOW VERSION")
    print("=" * 80)
    print()
    print("This script will:")
    print("- Capture ALL available YouTube videos (~370 videos)")
    print("- Download transcripts for each video")
    print("- Use extended delays (60-120 seconds between videos)")
    print("- Take 5-10 minute breaks every 5 videos")
    print("- Estimated completion time: 8-12 hours")
    print()

    # Get user confirmation
    confirm = input("This is a very long process. Continue? (y/N): ").strip().lower()
    if confirm != 'y':
        print("Cancelled.")
        return

    # Setup configuration for backlog processing
    config = ScraperConfig(
        source_name='youtube',
        brand_name='hkia',
        data_dir=Path('data_backlog_with_transcripts'),
        logs_dir=Path('logs_backlog_transcripts'),
        timezone='America/Halifax'
    )

    # Create directories
    config.data_dir.mkdir(parents=True, exist_ok=True)
    config.logs_dir.mkdir(parents=True, exist_ok=True)

    # Initialize scraper
    scraper = YouTubeScraper(config)

    # Clear any existing state to ensure full backlog
    if scraper.state_file.exists():
        scraper.state_file.unlink()
        logger.info("Cleared existing state for full backlog capture")

    # Override the backlog delay method with even more conservative delays
    original_backlog_delay = scraper._backlog_delay

    def ultra_conservative_delay(transcript_mode=False):
        """Ultra-conservative delays for transcript fetching."""
        if transcript_mode:
            # 60-120 seconds for transcript requests (much longer than original 30-90)
            base_delay = random.uniform(60, 120)
        else:
            # 30-60 seconds for basic video info (longer than original 10-30)
            base_delay = random.uniform(30, 60)

        # Add extra randomization
        jitter = random.uniform(0.9, 1.1)
        final_delay = base_delay * jitter

        logger.info(f"Ultra-conservative delay: {final_delay:.1f} seconds...")
        time.sleep(final_delay)

    # Replace the delay method
    scraper._backlog_delay = ultra_conservative_delay

    print("Starting YouTube backlog capture...")
    print("Monitor progress in logs_backlog_transcripts/youtube_slow_backlog.log")
    print()

    start_time = time.time()

    try:
        # Fetch content with transcripts (no max_posts = full backlog)
        videos = scraper.fetch_content(
            max_posts=None,  # Get all videos
            fetch_transcripts=True
        )

        # Format and save markdown
        if videos:
            markdown_content = scraper.format_markdown(videos)

            # Save to file
            output_file = config.data_dir / "youtube_backlog_with_transcripts.md"
            output_file.write_text(markdown_content, encoding='utf-8')

            logger.info(f"Saved {len(videos)} videos with transcripts to {output_file}")

            # Statistics
            total_duration = time.time() - start_time
            with_transcripts = sum(1 for v in videos if v.get('transcript'))
            total_views = sum(v.get('view_count', 0) for v in videos)

            print()
            print("=" * 80)
            print("YOUTUBE BACKLOG CAPTURE COMPLETED")
            print("=" * 80)
            print(f"Total videos captured: {len(videos)}")
            print(f"Videos with transcripts: {with_transcripts}")
            print(f"Success rate: {with_transcripts/len(videos)*100:.1f}%")
            print(f"Total views: {total_views:,}")
            print(f"Processing time: {total_duration/3600:.1f} hours")
            print(f"Output file: {output_file}")
            print("=" * 80)

        else:
            logger.error("No videos were captured")

    except KeyboardInterrupt:
        logger.info("Process interrupted by user")
        print("\nProcess interrupted. Partial results may be available.")

    except Exception as e:
        logger.error(f"Error during backlog capture: {e}")
        print(f"\nError occurred: {e}")

    finally:
        # Restore original delay method
        scraper._backlog_delay = original_backlog_delay

        total_time = time.time() - start_time
        print(f"\nTotal execution time: {total_time/3600:.1f} hours")


if __name__ == "__main__":
    main()
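Usage note: because the script asks for interactive confirmation via `input()` and then runs for many hours, it is best launched inside a `tmux` or `screen` session; running it under `nohup` or a plain systemd unit would detach stdin and make the confirmation prompt fail with EOF.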