Major Changes:
- Updated all code references from hvacknowitall/hvacnkowitall to hkia
- Renamed all existing markdown files to use hkia_ prefix
- Updated configuration files, scrapers, and production scripts
- Modified systemd service descriptions to use HKIA
- Changed NAS sync path to /mnt/nas/hkia

Files Updated:
- 20+ source files updated with new naming convention
- 34 markdown files renamed to hkia_* format
- All ScraperConfig brand_name parameters now use 'hkia'
- Documentation updated to reflect new naming

Rationale:
- Shorter, cleaner filenames
- Consistent branding across all outputs
- Easier to type and reference
- Maintains same functionality with improved naming

Next Steps:
- Deploy updated services to production
- Update any external references to old naming
- Monitor scrapers to ensure proper operation

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
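For reference, this is roughly what the renamed configuration looks like in code. A minimal sketch only: `brand_name` and `data_dir` are the fields confirmed by this change set and by the script below; the rest of the `ScraperConfig` signature is assumed.

```python
# Sketch under the assumptions above -- ScraperConfig's real fields may differ.
from pathlib import Path

from src.base_scraper import ScraperConfig

config = ScraperConfig(
    brand_name="hkia",  # was "hvacknowitall" / "hvacnkowitall"
    data_dir=Path("data_production_backlog"),
)
# Markdown output then lands at <data_dir>/markdown_current/hkia_<source>_backlog_<timestamp>.md
```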
#!/usr/bin/env python3
"""
Production Backlog Capture Script

This script performs a comprehensive backlog download for ALL sources
with full media file downloading and NAS synchronization.

Features:
- Downloads complete historical content from all sources
- Captures all available media files (images, videos, audio)
- Organizes content by source and date
- Syncs everything to NAS
- Provides detailed progress reporting
- Handles errors gracefully with retry logic
"""

import os
import sys
import time
import json
from pathlib import Path
from datetime import datetime
import logging
from typing import Dict, Any, Optional

# Add project to path
sys.path.insert(0, str(Path(__file__).parent))

from src.orchestrator import ContentOrchestrator
from src.base_scraper import ScraperConfig

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('production_backlog_capture.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)


class ProductionBacklogCapture:
    """Handles comprehensive backlog capture for production deployment"""

    def __init__(self, data_dir: Optional[Path] = None):
        self.data_dir = data_dir or Path("data_production_backlog")
        self.logs_dir = Path("logs_production_backlog")
        self.start_time = time.time()

        # Create directories
        self.data_dir.mkdir(parents=True, exist_ok=True)
        self.logs_dir.mkdir(parents=True, exist_ok=True)

        # Initialize orchestrator
        self.orchestrator = ContentOrchestrator(self.data_dir, self.logs_dir)

        # Track results
        self.results = {}

    def capture_source_backlog(self, source_name: str, max_items: Optional[int] = None) -> Dict[str, Any]:
        """Capture complete backlog for a specific source"""
        logger.info(f"Starting backlog capture for {source_name}...")

        start_time = time.time()

        try:
            scraper = self.orchestrator.scrapers.get(source_name)
            if not scraper:
                logger.error(f"Scraper not found: {source_name}")
                return {"success": False, "error": "Scraper not found", "items": 0}

            # Clear state for full backlog
            if scraper.state_file.exists():
                scraper.state_file.unlink()
                logger.info(f"Cleared state for {source_name} - full backlog mode")

            # Fetch content with special handling for each source
            if source_name == "tiktok":
                # TikTok with captions for first 100 videos when fetching 1000
                caption_count = min(100, max_items // 10) if max_items else 50
                items = scraper.fetch_content(
                    max_posts=max_items or 200,
                    fetch_captions=True,
                    max_caption_fetches=caption_count
                )
            elif source_name == "youtube":
                items = scraper.fetch_channel_videos(max_videos=max_items or 100)
            elif source_name == "instagram":
                items = scraper.fetch_content(max_posts=max_items or 100)
            else:
                # RSS sources
                items = scraper.fetch_content(max_items=max_items)

            if not items:
                logger.warning(f"No items fetched for {source_name}")
                return {"success": True, "items": 0, "duration": time.time() - start_time}

            logger.info(f"Fetched {len(items)} items for {source_name}")

            # Download media files for items with media
            media_downloaded = 0
            for i, item in enumerate(items):
                if i % 10 == 0:
                    logger.info(f"Processing media for {source_name}: {i}/{len(items)}")

                # Download media based on item type
                media_urls = []

                # Extract media URLs from various fields
                if 'image' in item and item['image']:
                    media_urls.append((item['image'], 'image'))
                if 'thumbnail' in item and item['thumbnail']:
                    media_urls.append((item['thumbnail'], 'image'))
                if 'video_url' in item and item['video_url']:
                    media_urls.append((item['video_url'], 'video'))
                if 'audio_link' in item and item['audio_link']:
                    media_urls.append((item['audio_link'], 'audio'))

                # Download each media file
                for url, media_type in media_urls:
                    try:
                        local_path = scraper.download_media(url, item.get('id', f'item_{i}'), media_type)
                        if local_path:
                            media_downloaded += 1
                            # Add local path to item
                            if 'local_media' not in item:
                                item['local_media'] = []
                            item['local_media'].append(local_path)
                    except Exception as e:
                        logger.warning(f"Failed to download media {url}: {e}")

            logger.info(f"Downloaded {media_downloaded} media files for {source_name}")

            # Generate and save markdown
            markdown = scraper.format_markdown(items)
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"hkia_{source_name}_backlog_{timestamp}.md"

            # Save to current directory
            current_dir = scraper.config.data_dir / "markdown_current"
            current_dir.mkdir(parents=True, exist_ok=True)
            output_file = current_dir / filename
            output_file.write_text(markdown, encoding='utf-8')

            # Update state
            new_state = {
                'last_update': datetime.now().isoformat(),
                'last_item_count': len(items),
                'backlog_captured': True,
                'backlog_timestamp': timestamp
            }

            if items:
                new_state['last_id'] = items[-1].get('id')

            scraper.save_state(new_state)

            duration = time.time() - start_time
            logger.info(f"✅ {source_name}: {len(items)} items, {media_downloaded} media files in {duration:.1f}s")

            return {
                "success": True,
                "items": len(items),
                "media_files": media_downloaded,
                "duration": duration,
                "output_file": str(output_file)
            }

        except Exception as e:
            duration = time.time() - start_time
            logger.error(f"❌ {source_name} failed after {duration:.1f}s: {e}")
            return {
                "success": False,
                "error": str(e),
                "items": 0,
                "duration": duration
            }

    def capture_all_backlogs(self) -> Dict[str, Any]:
        """Capture backlogs for all sources"""
        logger.info("=" * 80)
        logger.info("STARTING PRODUCTION BACKLOG CAPTURE")
        logger.info("=" * 80)

        # Source configurations with appropriate limits
        sources_config = {
            "wordpress": {"max_items": None},   # All posts
            "mailchimp": {"max_items": None},   # All available (limited by RSS)
            "podcast": {"max_items": None},     # All episodes
            "youtube": {"max_items": 200},      # Last 200 videos
            "instagram": {"max_items": 200},    # Last 200 posts
            "tiktok": {"max_items": 300}        # 300 videos; captions fetched for the first 30 (min(100, 300 // 10))
        }

        total_items = 0
        total_media = 0
        successful_sources = 0

        for source_name, config in sources_config.items():
            logger.info(f"\n{'-'*60}")
            logger.info(f"PROCESSING: {source_name.upper()}")
            logger.info(f"{'-'*60}")

            result = self.capture_source_backlog(source_name, config["max_items"])
            self.results[source_name] = result

            if result["success"]:
                successful_sources += 1
                total_items += result["items"]
                total_media += result.get("media_files", 0)

            # Add delay between sources to be respectful
            if source_name != list(sources_config.keys())[-1]:  # Not last source
                logger.info("Waiting 30 seconds before next source...")
                time.sleep(30)

        # Generate summary
        total_duration = time.time() - self.start_time

        summary = {
            "timestamp": datetime.now().isoformat(),
            "total_duration": total_duration,
            "total_items": total_items,
            "total_media_files": total_media,
            "successful_sources": successful_sources,
            "total_sources": len(sources_config),
            "results": self.results
        }

        # Save summary
        summary_file = self.data_dir / f"backlog_capture_summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        with open(summary_file, 'w') as f:
            json.dump(summary, f, indent=2)

        logger.info("\n" + "=" * 80)
        logger.info("BACKLOG CAPTURE COMPLETE")
        logger.info("=" * 80)
        logger.info(f"Total items: {total_items:,}")
        logger.info(f"Total media files: {total_media:,}")
        logger.info(f"Successful sources: {successful_sources}/{len(sources_config)}")
        logger.info(f"Total duration: {total_duration/60:.1f} minutes")
        logger.info(f"Summary saved: {summary_file}")

        return summary

    def sync_to_nas(self) -> bool:
        """Sync all captured data to NAS"""
        logger.info("\n" + "=" * 60)
        logger.info("SYNCING TO NAS")
        logger.info("=" * 60)

        try:
            success = self.orchestrator.sync_to_nas()
            if success:
                logger.info("✅ NAS sync completed successfully")
            else:
                logger.error("❌ NAS sync failed")
            return success
        except Exception as e:
            logger.error(f"❌ NAS sync error: {e}")
            return False


def main():
    """Main execution function"""
    print("🚀 HKIA - Production Backlog Capture")
    print("=" * 60)
    print("This will download complete historical content from ALL sources")
    print("Including all available media files (images, videos, audio)")
    print("Estimated time: 2-4 hours depending on content volume")
    print("=" * 60)

    response = input("Proceed with full backlog capture? (y/N): ")
    if response.lower() != 'y':
        print("Backlog capture cancelled.")
        return False

    # Initialize capture
    capture = ProductionBacklogCapture()

    # Capture all backlogs
    summary = capture.capture_all_backlogs()

    # Sync to NAS if any content was captured
    if summary["total_items"] > 0:
        nas_success = capture.sync_to_nas()
        summary["nas_sync_success"] = nas_success
    else:
        logger.warning("No content captured - skipping NAS sync")
        summary["nas_sync_success"] = False

    # Final summary
    print(f"\n🎉 PRODUCTION BACKLOG CAPTURE COMPLETE!")
    print(f"📊 Summary:")
    print(f"  • Total items captured: {summary['total_items']:,}")
    print(f"  • Total media files: {summary['total_media_files']:,}")
    print(f"  • Sources processed: {summary['successful_sources']}/{summary['total_sources']}")
    print(f"  • Duration: {summary['total_duration']/60:.1f} minutes")
    print(f"  • NAS sync: {'✅' if summary.get('nas_sync_success') else '❌'}")

    return summary["successful_sources"] > 0


if __name__ == "__main__":
    try:
        success = main()
        sys.exit(0 if success else 1)
    except KeyboardInterrupt:
        print("\n\nBacklog capture interrupted by user")
        sys.exit(1)
    except Exception as e:
        logger.critical(f"Backlog capture failed: {e}")
        sys.exit(2)