Major Changes:
- Updated all code references from hvacknowitall/hvacnkowitall to hkia
- Renamed all existing markdown files to use the hkia_ prefix
- Updated configuration files, scrapers, and production scripts
- Modified systemd service descriptions to use HKIA
- Changed NAS sync path to /mnt/nas/hkia

Files Updated:
- 20+ source files updated with the new naming convention
- 34 markdown files renamed to the hkia_* format
- All ScraperConfig brand_name parameters now use 'hkia'
- Documentation updated to reflect the new naming

Rationale:
- Shorter, cleaner filenames
- Consistent branding across all outputs
- Easier to type and reference
- Maintains the same functionality with improved naming

Next Steps:
- Deploy updated services to production
- Update any external references to the old naming
- Monitor scrapers to ensure proper operation

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
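As an illustration of the brand_name change (a sketch only: the old value is reconstructed from the names listed above, and the other parameters mirror the defaults used in the orchestrator below):

    from pathlib import Path
    from src.base_scraper import ScraperConfig

    # Before (old branding, shown for contrast):
    # config = ScraperConfig(source_name="wordpress", brand_name="hvacknowitall", ...)

    # After (new branding, as used throughout the updated code):
    config = ScraperConfig(
        source_name="wordpress",
        brand_name="hkia",
        data_dir=Path("/opt/hvac-kia-content/data"),
        logs_dir=Path("/opt/hvac-kia-content/logs"),
        timezone="America/Halifax",
    )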
365 lines · No EOL · 13 KiB · Python
#!/usr/bin/env python3
"""
HKIA Content Orchestrator
Coordinates all scrapers and handles NAS synchronization.
"""

import os
import sys
import time
import argparse
import subprocess
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Any
from concurrent.futures import ThreadPoolExecutor, as_completed
import pytz
from dotenv import load_dotenv

# Import all scrapers
from src.base_scraper import ScraperConfig
from src.wordpress_scraper import WordPressScraper
from src.rss_scraper import RSSScraperMailChimp, RSSScraperPodcast
from src.youtube_scraper import YouTubeScraper
from src.instagram_scraper import InstagramScraper
from src.tiktok_scraper_advanced import TikTokScraperAdvanced

# Load environment variables
load_dotenv()


class ContentOrchestrator:
    """Orchestrates all content scrapers and handles synchronization."""

    def __init__(self, data_dir: Path = None, logs_dir: Path = None):
        """Initialize the orchestrator."""
        self.data_dir = data_dir or Path("/opt/hvac-kia-content/data")
        self.logs_dir = logs_dir or Path("/opt/hvac-kia-content/logs")
        self.nas_path = Path(os.getenv('NAS_PATH', '/mnt/nas/hkia'))
        self.timezone = os.getenv('TIMEZONE', 'America/Halifax')
        self.tz = pytz.timezone(self.timezone)

        # Ensure directories exist
        self.data_dir.mkdir(parents=True, exist_ok=True)
        self.logs_dir.mkdir(parents=True, exist_ok=True)

        # Configure scrapers
        self.scrapers = self._setup_scrapers()

        print(f"Orchestrator initialized with {len(self.scrapers)} scrapers")
        print(f"Data directory: {self.data_dir}")
        print(f"NAS path: {self.nas_path}")

    def _setup_scrapers(self) -> Dict[str, Any]:
        """Set up all scraper instances."""
        scrapers = {}

        # WordPress scraper
        config = ScraperConfig(
            source_name="wordpress",
            brand_name="hkia",
            data_dir=self.data_dir,
            logs_dir=self.logs_dir,
            timezone=self.timezone
        )
        scrapers['wordpress'] = WordPressScraper(config)

        # MailChimp RSS scraper
        config = ScraperConfig(
            source_name="mailchimp",
            brand_name="hkia",
            data_dir=self.data_dir,
            logs_dir=self.logs_dir,
            timezone=self.timezone
        )
        scrapers['mailchimp'] = RSSScraperMailChimp(config)

        # Podcast RSS scraper
        config = ScraperConfig(
            source_name="podcast",
            brand_name="hkia",
            data_dir=self.data_dir,
            logs_dir=self.logs_dir,
            timezone=self.timezone
        )
        scrapers['podcast'] = RSSScraperPodcast(config)

        # YouTube scraper
        config = ScraperConfig(
            source_name="youtube",
            brand_name="hkia",
            data_dir=self.data_dir,
            logs_dir=self.logs_dir,
            timezone=self.timezone
        )
        scrapers['youtube'] = YouTubeScraper(config)

        # Instagram scraper
        config = ScraperConfig(
            source_name="instagram",
            brand_name="hkia",
            data_dir=self.data_dir,
            logs_dir=self.logs_dir,
            timezone=self.timezone
        )
        scrapers['instagram'] = InstagramScraper(config)

        # TikTok scraper (advanced with headed browser)
        config = ScraperConfig(
            source_name="tiktok",
            brand_name="hkia",
            data_dir=self.data_dir,
            logs_dir=self.logs_dir,
            timezone=self.timezone
        )
        scrapers['tiktok'] = TikTokScraperAdvanced(config)

        return scrapers

    def run_scraper(self, name: str, scraper: Any, max_workers: int = 1) -> Dict[str, Any]:
        """Run a single scraper and return results."""
        start_time = time.time()

        try:
            print(f"Starting {name} scraper...")

            # Fetch content
            content = scraper.fetch_content()

            if not content:
                print(f"⚠️ {name}: No content fetched")
                return {
                    'name': name,
                    'success': False,
                    'error': 'No content fetched',
                    'duration': time.time() - start_time,
                    'items': 0
                }

            # Load existing state
            state = scraper.load_state()

            # Get incremental items (new items only)
            new_items = scraper.get_incremental_items(content, state)

            if not new_items:
                print(f"✅ {name}: No new items (all up to date)")
                return {
                    'name': name,
                    'success': True,
                    'duration': time.time() - start_time,
                    'items': 0,
                    'new_items': 0
                }

            # Archive existing markdown files
            scraper.archive_current_file()

            # Generate and save markdown
            markdown = scraper.format_markdown(new_items)
            timestamp = datetime.now(scraper.tz).strftime("%Y%m%d_%H%M%S")
            filename = f"hkia_{name}_{timestamp}.md"

            # Save to current markdown directory
            current_dir = scraper.config.data_dir / "markdown_current"
            current_dir.mkdir(parents=True, exist_ok=True)
            output_file = current_dir / filename
            output_file.write_text(markdown)

            # Update state
            updated_state = scraper.update_state(state, new_items)
            scraper.save_state(updated_state)

            print(f"✅ {name}: {len(new_items)} new items saved to {filename}")

            return {
                'name': name,
                'success': True,
                'duration': time.time() - start_time,
                'items': len(content),
                'new_items': len(new_items),
                'file': str(output_file)
            }

        except Exception as e:
            print(f"❌ {name}: Error - {e}")
            return {
                'name': name,
                'success': False,
                'error': str(e),
                'duration': time.time() - start_time,
                'items': 0
            }

    def run_all_scrapers(self, parallel: bool = True, max_workers: int = 3) -> List[Dict[str, Any]]:
        """Run all scrapers in parallel or sequentially."""
        print(f"Running {len(self.scrapers)} scrapers {'in parallel' if parallel else 'sequentially'}...")
        start_time = time.time()

        results = []

        if parallel:
            # Run scrapers in parallel (except TikTok which needs DISPLAY)
            non_gui_scrapers = {k: v for k, v in self.scrapers.items() if k != 'tiktok'}

            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                # Submit non-GUI scrapers
                future_to_name = {
                    executor.submit(self.run_scraper, name, scraper): name
                    for name, scraper in non_gui_scrapers.items()
                }

                # Collect results
                for future in as_completed(future_to_name):
                    result = future.result()
                    results.append(result)

            # Run TikTok separately (requires DISPLAY)
            if 'tiktok' in self.scrapers:
                print("Running TikTok scraper separately (requires GUI)...")
                tiktok_result = self.run_scraper('tiktok', self.scrapers['tiktok'])
                results.append(tiktok_result)

        else:
            # Run scrapers sequentially
            for name, scraper in self.scrapers.items():
                result = self.run_scraper(name, scraper)
                results.append(result)

        total_duration = time.time() - start_time
        successful = [r for r in results if r['success']]
        failed = [r for r in results if not r['success']]

        print(f"\n{'='*60}")
        print("ORCHESTRATOR SUMMARY")
        print(f"{'='*60}")
        print(f"Total duration: {total_duration:.2f} seconds")
        print(f"Successful: {len(successful)}/{len(results)}")
        print(f"Failed: {len(failed)}")

        for result in results:
            status = "✅" if result['success'] else "❌"
            duration = result['duration']
            items = result.get('new_items', result.get('items', 0))
            print(f"{status} {result['name']}: {items} items in {duration:.2f}s")

            if not result['success']:
                print(f" Error: {result.get('error', 'Unknown error')}")

        return results

    def sync_to_nas(self) -> bool:
        """Synchronize markdown files to NAS."""
        print(f"\nSyncing to NAS: {self.nas_path}")

        try:
            # Ensure NAS directory exists
            self.nas_path.mkdir(parents=True, exist_ok=True)

            # Sync current markdown files (--delete mirrors the source, removing stale files on the NAS)
            current_dir = self.data_dir / "markdown_current"
            if current_dir.exists():
                nas_current = self.nas_path / "current"
                nas_current.mkdir(parents=True, exist_ok=True)

                cmd = [
                    'rsync', '-av', '--delete',
                    f"{current_dir}/",
                    f"{nas_current}/"
                ]

                result = subprocess.run(cmd, capture_output=True, text=True)
                if result.returncode != 0:
                    print(f"❌ Current sync failed: {result.stderr}")
                    return False

                print(f"✅ Current files synced to {nas_current}")

            # Sync archived files (no --delete, so the NAS archive is append-only)
            archive_dir = self.data_dir / "markdown_archives"
            if archive_dir.exists():
                nas_archives = self.nas_path / "archives"
                nas_archives.mkdir(parents=True, exist_ok=True)

                cmd = [
                    'rsync', '-av',
                    f"{archive_dir}/",
                    f"{nas_archives}/"
                ]

                result = subprocess.run(cmd, capture_output=True, text=True)
                if result.returncode != 0:
                    print(f"❌ Archive sync failed: {result.stderr}")
                    return False

                print(f"✅ Archive files synced to {nas_archives}")

            # Sync media files
            media_dir = self.data_dir / "media"
            if media_dir.exists():
                nas_media = self.nas_path / "media"
                nas_media.mkdir(parents=True, exist_ok=True)

                cmd = [
                    'rsync', '-av', '--delete',
                    f"{media_dir}/",
                    f"{nas_media}/"
                ]

                result = subprocess.run(cmd, capture_output=True, text=True)
                if result.returncode != 0:
                    print(f"❌ Media sync failed: {result.stderr}")
                    return False

                print(f"✅ Media files synced to {nas_media}")

            return True

        except Exception as e:
            print(f"❌ NAS sync error: {e}")
            return False


def main():
    """Main entry point."""
    parser = argparse.ArgumentParser(description='HKIA Content Orchestrator')
    parser.add_argument('--data-dir', type=Path, help='Data directory path')
    parser.add_argument('--sync-nas', action='store_true', help='Sync to NAS after scraping')
    parser.add_argument('--nas-only', action='store_true', help='Only sync to NAS (no scraping)')
    parser.add_argument('--sequential', action='store_true', help='Run scrapers sequentially')
    parser.add_argument('--max-workers', type=int, default=3, help='Max parallel workers')
    parser.add_argument('--sources', nargs='+', help='Specific sources to run')

    args = parser.parse_args()

    # Initialize orchestrator
    orchestrator = ContentOrchestrator(data_dir=args.data_dir)

    if args.nas_only:
        # Only sync to NAS
        success = orchestrator.sync_to_nas()
        sys.exit(0 if success else 1)

    # Filter sources if specified
    if args.sources:
        filtered_scrapers = {k: v for k, v in orchestrator.scrapers.items() if k in args.sources}
        orchestrator.scrapers = filtered_scrapers
        print(f"Running only: {', '.join(args.sources)}")

    # Run scrapers
    results = orchestrator.run_all_scrapers(
        parallel=not args.sequential,
        max_workers=args.max_workers
    )

    # Sync to NAS if requested
    if args.sync_nas:
        orchestrator.sync_to_nas()

    # Exit with the number of failed scrapers (0 means all succeeded)
    failed_count = sum(1 for r in results if not r['success'])
    sys.exit(failed_count)


if __name__ == "__main__":
    main()
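For reference, the argparse flags defined in main() support invocations along these lines (the script filename is an assumption; it is not shown on this page):

    python3 orchestrator.py                       # run all scrapers in parallel (default: 3 workers)
    python3 orchestrator.py --sync-nas            # scrape, then mirror markdown and media to the NAS
    python3 orchestrator.py --sources wordpress youtube --sequential
    python3 orchestrator.py --nas-only            # sync existing output only, no scraping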