#!/usr/bin/env python3
"""
Fetch additional YouTube videos to reach 1000 total
"""
import sys
from pathlib import Path

# Make the project root importable so `src.*` resolves when this script
# is run directly (not as an installed package).
sys.path.insert(0, str(Path(__file__).parent))

from src.base_scraper import ScraperConfig
from src.youtube_scraper import YouTubeScraper
from datetime import datetime
import logging
import time

# Set up logging: mirror every message to a log file and the console so
# long-running captures can be watched live and audited afterwards.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('youtube_1000.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)


def main():
    """Fetch additional YouTube videos.

    Clears any saved scraper state, fetches up to 1000 videos from the
    channel, writes them to a timestamped markdown file under the backlog
    data directory, and saves fresh state for later incremental runs.

    Returns:
        bool: True if at least one video was fetched and saved,
        False on an empty result or any fetch/format/save error.
    """
    logger.info("🎥 Fetching additional YouTube videos to reach 1000 total")
    logger.info("Already have 200 videos, fetching 800 more...")
    logger.info("=" * 60)

    # Create config for backlog capture (separate dirs from production runs)
    config = ScraperConfig(
        source_name="youtube",
        brand_name="hvacknowitall",
        data_dir=Path("data_production_backlog"),
        logs_dir=Path("logs_production_backlog"),
        timezone="America/Halifax"
    )

    # Initialize scraper
    scraper = YouTubeScraper(config)

    # Clear state so the fetch starts from the beginning of the channel
    # instead of resuming incrementally.
    if scraper.state_file.exists():
        scraper.state_file.unlink()
        logger.info("Cleared state for full backlog capture")

    # Fetch 1000 videos (or all available if the channel has fewer)
    logger.info("Starting YouTube fetch - targeting 1000 videos total...")
    start_time = time.time()

    try:
        videos = scraper.fetch_channel_videos(max_videos=1000)

        if not videos:
            logger.error("No videos fetched")
            return False

        logger.info(f"✅ Fetched {len(videos)} videos")

        # Generate markdown
        markdown = scraper.format_markdown(videos)

        # Save with new timestamp
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"hvacknowitall_youtube_1000_backlog_{timestamp}.md"

        # Save to markdown directory
        output_dir = config.data_dir / "markdown_current"
        output_dir.mkdir(parents=True, exist_ok=True)
        output_file = output_dir / filename
        output_file.write_text(markdown, encoding='utf-8')

        logger.info(f"📄 Saved to: {output_file}")

        # Update state. videos[-1] is assumed to be the oldest entry
        # (fetch order appears newest-first) — TODO confirm against
        # fetch_channel_videos ordering.
        new_state = {
            'last_update': datetime.now().isoformat(),
            'last_item_count': len(videos),
            'backlog_captured': True,
            'total_videos': len(videos)
        }
        if videos:
            new_state['last_video_id'] = videos[-1].get('id')
            new_state['oldest_video_date'] = videos[-1].get('upload_date', '')
        scraper.save_state(new_state)

        # Statistics
        duration = time.time() - start_time
        logger.info("\n" + "=" * 60)
        logger.info("📊 YOUTUBE CAPTURE COMPLETE")
        logger.info(f"Total videos: {len(videos)}")
        logger.info(f"Duration: {duration:.1f} seconds")
        # Guard against ZeroDivisionError when the fetch finishes within
        # the clock resolution (e.g. a cached or mocked run).
        if duration > 0:
            logger.info(f"Rate: {len(videos)/duration:.1f} videos/second")

        # Show date range
        if videos:
            newest_date = videos[0].get('upload_date', 'Unknown')
            oldest_date = videos[-1].get('upload_date', 'Unknown')
            logger.info(f"Date range: {oldest_date} to {newest_date}")

        # Check if we got all available videos
        if len(videos) < 1000:
            logger.info(f"⚠️ Channel has {len(videos)} total videos (less than 1000 requested)")
        else:
            logger.info("✅ Successfully fetched 1000 videos!")

        return True

    except Exception as e:
        # logger.exception records the full traceback, not just the message,
        # which logger.error with an f-string would discard.
        logger.exception(f"Error fetching videos: {e}")
        return False


if __name__ == "__main__":
    try:
        success = main()
        sys.exit(0 if success else 1)
    except KeyboardInterrupt:
        logger.info("\nCapture interrupted by user")
        sys.exit(1)
    except Exception as e:
        logger.critical(f"Capture failed: {e}")
        sys.exit(2)