#!/usr/bin/env python3 """ Production Backlog Capture Script This script performs a comprehensive backlog download for ALL sources with full media file downloading and NAS synchronization. Features: - Downloads complete historical content from all sources - Captures all available media files (images, videos, audio) - Organizes content by source and date - Syncs everything to NAS - Provides detailed progress reporting - Handles errors gracefully with retry logic """ import os import sys import time import json from pathlib import Path from datetime import datetime import logging from typing import Dict, Any # Add project to path sys.path.insert(0, str(Path(__file__).parent)) from src.orchestrator import ContentOrchestrator from src.base_scraper import ScraperConfig # Configure logging logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', handlers=[ logging.FileHandler('production_backlog_capture.log'), logging.StreamHandler() ] ) logger = logging.getLogger(__name__) class ProductionBacklogCapture: """Handles comprehensive backlog capture for production deployment""" def __init__(self, data_dir: Path = None): self.data_dir = data_dir or Path("data_production_backlog") self.logs_dir = Path("logs_production_backlog") self.start_time = time.time() # Create directories self.data_dir.mkdir(parents=True, exist_ok=True) self.logs_dir.mkdir(parents=True, exist_ok=True) # Initialize orchestrator self.orchestrator = ContentOrchestrator(self.data_dir, self.logs_dir) # Track results self.results = {} def capture_source_backlog(self, source_name: str, max_items: int = None) -> Dict[str, Any]: """Capture complete backlog for a specific source""" logger.info(f"Starting backlog capture for {source_name}...") start_time = time.time() try: scraper = self.orchestrator.scrapers.get(source_name) if not scraper: logger.error(f"Scraper not found: {source_name}") return {"success": False, "error": "Scraper not found", "items": 0} # Clear state for full backlog if scraper.state_file.exists(): scraper.state_file.unlink() logger.info(f"Cleared state for {source_name} - full backlog mode") # Fetch content with special handling for each source if source_name == "tiktok": # TikTok with captions for first 100 videos when fetching 1000 caption_count = min(100, max_items // 10) if max_items else 50 items = scraper.fetch_content( max_posts=max_items or 200, fetch_captions=True, max_caption_fetches=caption_count ) elif source_name == "youtube": items = scraper.fetch_channel_videos(max_videos=max_items or 100) elif source_name == "instagram": items = scraper.fetch_content(max_posts=max_items or 100) else: # RSS sources items = scraper.fetch_content(max_items=max_items) if not items: logger.warning(f"No items fetched for {source_name}") return {"success": True, "items": 0, "duration": time.time() - start_time} logger.info(f"Fetched {len(items)} items for {source_name}") # Download media files for items with media media_downloaded = 0 for i, item in enumerate(items): if i % 10 == 0: logger.info(f"Processing media for {source_name}: {i}/{len(items)}") # Download media based on item type media_urls = [] # Extract media URLs from various fields if 'image' in item and item['image']: media_urls.append((item['image'], 'image')) if 'thumbnail' in item and item['thumbnail']: media_urls.append((item['thumbnail'], 'image')) if 'video_url' in item and item['video_url']: media_urls.append((item['video_url'], 'video')) if 'audio_link' in item and item['audio_link']: media_urls.append((item['audio_link'], 'audio')) # Download each media file for url, media_type in media_urls: try: local_path = scraper.download_media(url, item.get('id', f'item_{i}'), media_type) if local_path: media_downloaded += 1 # Add local path to item if 'local_media' not in item: item['local_media'] = [] item['local_media'].append(local_path) except Exception as e: logger.warning(f"Failed to download media {url}: {e}") logger.info(f"Downloaded {media_downloaded} media files for {source_name}") # Generate and save markdown markdown = scraper.format_markdown(items) timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") filename = f"hvacknowitall_{source_name}_backlog_{timestamp}.md" # Save to current directory current_dir = scraper.config.data_dir / "markdown_current" current_dir.mkdir(parents=True, exist_ok=True) output_file = current_dir / filename output_file.write_text(markdown, encoding='utf-8') # Update state new_state = { 'last_update': datetime.now().isoformat(), 'last_item_count': len(items), 'backlog_captured': True, 'backlog_timestamp': timestamp } if items: new_state['last_id'] = items[-1].get('id') scraper.save_state(new_state) duration = time.time() - start_time logger.info(f"✅ {source_name}: {len(items)} items, {media_downloaded} media files in {duration:.1f}s") return { "success": True, "items": len(items), "media_files": media_downloaded, "duration": duration, "output_file": str(output_file) } except Exception as e: duration = time.time() - start_time logger.error(f"❌ {source_name} failed after {duration:.1f}s: {e}") return { "success": False, "error": str(e), "items": 0, "duration": duration } def capture_all_backlogs(self) -> Dict[str, Any]: """Capture backlogs for all sources""" logger.info("=" * 80) logger.info("STARTING PRODUCTION BACKLOG CAPTURE") logger.info("=" * 80) # Source configurations with appropriate limits sources_config = { "wordpress": {"max_items": None}, # All posts "mailchimp": {"max_items": None}, # All available (limited by RSS) "podcast": {"max_items": None}, # All episodes "youtube": {"max_items": 200}, # Last 200 videos "instagram": {"max_items": 200}, # Last 200 posts "tiktok": {"max_items": 300} # 300 videos with captions for first 50 } total_items = 0 total_media = 0 successful_sources = 0 for source_name, config in sources_config.items(): logger.info(f"\n{'-'*60}") logger.info(f"PROCESSING: {source_name.upper()}") logger.info(f"{'-'*60}") result = self.capture_source_backlog(source_name, config["max_items"]) self.results[source_name] = result if result["success"]: successful_sources += 1 total_items += result["items"] total_media += result.get("media_files", 0) # Add delay between sources to be respectful if source_name != list(sources_config.keys())[-1]: # Not last source logger.info("Waiting 30 seconds before next source...") time.sleep(30) # Generate summary total_duration = time.time() - self.start_time summary = { "timestamp": datetime.now().isoformat(), "total_duration": total_duration, "total_items": total_items, "total_media_files": total_media, "successful_sources": successful_sources, "total_sources": len(sources_config), "results": self.results } # Save summary summary_file = self.data_dir / f"backlog_capture_summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json" with open(summary_file, 'w') as f: json.dump(summary, f, indent=2) logger.info("\n" + "=" * 80) logger.info("BACKLOG CAPTURE COMPLETE") logger.info("=" * 80) logger.info(f"Total items: {total_items:,}") logger.info(f"Total media files: {total_media:,}") logger.info(f"Successful sources: {successful_sources}/{len(sources_config)}") logger.info(f"Total duration: {total_duration/60:.1f} minutes") logger.info(f"Summary saved: {summary_file}") return summary def sync_to_nas(self) -> bool: """Sync all captured data to NAS""" logger.info("\n" + "=" * 60) logger.info("SYNCING TO NAS") logger.info("=" * 60) try: success = self.orchestrator.sync_to_nas() if success: logger.info("✅ NAS sync completed successfully") else: logger.error("❌ NAS sync failed") return success except Exception as e: logger.error(f"❌ NAS sync error: {e}") return False def main(): """Main execution function""" print("🚀 HVAC Know It All - Production Backlog Capture") print("=" * 60) print("This will download complete historical content from ALL sources") print("Including all available media files (images, videos, audio)") print("Estimated time: 2-4 hours depending on content volume") print("=" * 60) response = input("Proceed with full backlog capture? (y/N): ") if response.lower() != 'y': print("Backlog capture cancelled.") return False # Initialize capture capture = ProductionBacklogCapture() # Capture all backlogs summary = capture.capture_all_backlogs() # Sync to NAS if any content was captured if summary["total_items"] > 0: nas_success = capture.sync_to_nas() summary["nas_sync_success"] = nas_success else: logger.warning("No content captured - skipping NAS sync") summary["nas_sync_success"] = False # Final summary print(f"\n🎉 PRODUCTION BACKLOG CAPTURE COMPLETE!") print(f"📊 Summary:") print(f" • Total items captured: {summary['total_items']:,}") print(f" • Total media files: {summary['total_media_files']:,}") print(f" • Sources processed: {summary['successful_sources']}/{summary['total_sources']}") print(f" • Duration: {summary['total_duration']/60:.1f} minutes") print(f" • NAS sync: {'✅' if summary.get('nas_sync_success') else '❌'}") return summary["successful_sources"] > 0 if __name__ == "__main__": try: success = main() sys.exit(0 if success else 1) except KeyboardInterrupt: print("\n\nBacklog capture interrupted by user") sys.exit(1) except Exception as e: logger.critical(f"Backlog capture failed: {e}") sys.exit(2)