Major Changes: - Updated all code references from hvacknowitall/hvacnkowitall to hkia - Renamed all existing markdown files to use hkia_ prefix - Updated configuration files, scrapers, and production scripts - Modified systemd service descriptions to use HKIA - Changed NAS sync path to /mnt/nas/hkia Files Updated: - 20+ source files updated with new naming convention - 34 markdown files renamed to hkia_* format - All ScraperConfig brand_name parameters now use 'hkia' - Documentation updated to reflect new naming Rationale: - Shorter, cleaner filenames - Consistent branding across all outputs - Easier to type and reference - Maintains same functionality with improved naming Next Steps: - Deploy updated services to production - Update any external references to old naming - Monitor scrapers to ensure proper operation 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
		
			
				
	
	
		
			127 lines
		
	
	
		
			No EOL
		
	
	
		
			4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			127 lines
		
	
	
		
			No EOL
		
	
	
		
			4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| #!/usr/bin/env python3
 | |
| """
 | |
| Fetch additional YouTube videos to reach 1000 total
 | |
| """
 | |
| 
 | |
| import sys
 | |
| from pathlib import Path
 | |
| sys.path.insert(0, str(Path(__file__).parent))
 | |
| 
 | |
| from src.base_scraper import ScraperConfig
 | |
| from src.youtube_scraper import YouTubeScraper
 | |
| from datetime import datetime
 | |
| import logging
 | |
| import time
 | |
| 
 | |
# Configure logging once at import time: records at INFO and above are handed
# to a file handler (youtube_1000.log in the current working directory) and to
# a stream handler (stderr by default).
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
_log_handlers = [
    logging.FileHandler('youtube_1000.log'),
    logging.StreamHandler(),
]
logging.basicConfig(
    handlers=_log_handlers,
    format=_LOG_FORMAT,
    level=logging.INFO,
)

# Module-level logger shared by main() and the script entry point.
logger = logging.getLogger(__name__)
 | |
| 
 | |
def main() -> bool:
    """Fetch up to 1000 YouTube videos and save them as one markdown file.

    Steps, in order:

    1. Build a ``ScraperConfig`` pointing at the backlog data/log directories.
    2. Delete the scraper's state file, if it exists, so the fetch starts
       from the beginning.
    3. Call ``scraper.fetch_channel_videos`` and render the result with
       ``scraper.format_markdown``.
    4. Write the markdown to ``<data_dir>/markdown_current`` under a
       timestamped file name.
    5. Persist a fresh state dict through ``scraper.save_state`` and log
       summary statistics.

    Returns:
        True when videos were fetched and saved. False when the fetch
        returned nothing or when any exception was raised inside the
        fetch/save sequence; the exception is logged with its traceback and
        not propagated.
    """
    # Single source of truth for the fetch limit and the completeness check
    # below, so the two cannot drift apart.
    target_total = 1000

    logger.info("🎥 Fetching additional YouTube videos to reach 1000 total")
    # NOTE(review): this message says 800 more videos are fetched, but the
    # state file is cleared below and the full target is requested again.
    # Confirm which behavior is intended.
    logger.info("Already have 200 videos, fetching 800 more...")
    logger.info("=" * 60)

    # Create config for backlog
    # NOTE(review): brand_name and the output file name still use
    # 'hvacknowitall'; confirm whether they should follow the 'hkia' rename.
    config = ScraperConfig(
        source_name="youtube",
        brand_name="hvacknowitall",
        data_dir=Path("data_production_backlog"),
        logs_dir=Path("logs_production_backlog"),
        timezone="America/Halifax"
    )

    # Initialize scraper
    scraper = YouTubeScraper(config)

    # Clear state to fetch all videos from beginning
    if scraper.state_file.exists():
        scraper.state_file.unlink()
        logger.info("Cleared state for full backlog capture")

    # Fetch the target number of videos (or all available if fewer)
    logger.info("Starting YouTube fetch - targeting 1000 videos total...")
    # Monotonic clock: elapsed time stays correct even if the system clock is
    # adjusted while the fetch is running.
    start_time = time.monotonic()

    try:
        videos = scraper.fetch_channel_videos(max_videos=target_total)

        if not videos:
            logger.error("No videos fetched")
            return False

        # From here on `videos` is non-empty, so indexing [0] / [-1] is safe.
        video_count = len(videos)
        logger.info(f"✅ Fetched {video_count} videos")

        # Generate markdown
        markdown = scraper.format_markdown(videos)

        # Save with new timestamp
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"hvacknowitall_youtube_1000_backlog_{timestamp}.md"

        # Save to markdown directory
        output_dir = config.data_dir / "markdown_current"
        output_dir.mkdir(parents=True, exist_ok=True)
        output_file = output_dir / filename

        output_file.write_text(markdown, encoding='utf-8')
        logger.info(f"📄 Saved to: {output_file}")

        # Update state; the last list element is recorded as the oldest video
        # (presumably the scraper returns newest first -- verify).
        last_video = videos[-1]
        new_state = {
            'last_update': datetime.now().isoformat(),
            'last_item_count': video_count,
            'backlog_captured': True,
            'total_videos': video_count,
            'last_video_id': last_video.get('id'),
            'oldest_video_date': last_video.get('upload_date', ''),
        }
        scraper.save_state(new_state)

        # Statistics
        duration = time.monotonic() - start_time
        logger.info("\n" + "=" * 60)
        logger.info("📊 YOUTUBE CAPTURE COMPLETE")
        logger.info(f"Total videos: {video_count}")
        logger.info(f"Duration: {duration:.1f} seconds")
        # Skip the rate line rather than divide by a zero-length interval.
        if duration > 0:
            logger.info(f"Rate: {video_count/duration:.1f} videos/second")

        # Show date range
        newest_date = videos[0].get('upload_date', 'Unknown')
        oldest_date = last_video.get('upload_date', 'Unknown')
        logger.info(f"Date range: {oldest_date} to {newest_date}")

        # Check if we got all available videos
        if video_count < target_total:
            logger.info(f"⚠️ Channel has {video_count} total videos (less than {target_total} requested)")
        else:
            logger.info(f"✅ Successfully fetched {target_total} videos!")

        return True

    except Exception as e:
        # Top-level boundary for the script: report failure through the
        # return value, but keep the traceback in the log for diagnosis.
        logger.exception(f"Error fetching videos: {e}")
        return False
 | |
| 
 | |
if __name__ == "__main__":
    # Exit codes: 0 = main() reported success, 1 = main() reported failure or
    # the user interrupted the run, 2 = an unexpected exception escaped main().
    try:
        exit_code = 0 if main() else 1
    except KeyboardInterrupt:
        logger.info("\nCapture interrupted by user")
        exit_code = 1
    except Exception as e:
        logger.critical(f"Capture failed: {e}")
        exit_code = 2
    sys.exit(exit_code)