#!/usr/bin/env python3
"""
HKIA Content Orchestrator

Coordinates all scrapers and handles NAS synchronization.
"""
import os
import sys
import time
import argparse
import subprocess
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Any, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed

import pytz
from dotenv import load_dotenv

# Import all scrapers
from src.base_scraper import ScraperConfig
from src.wordpress_scraper import WordPressScraper
from src.rss_scraper import RSSScraperMailChimp, RSSScraperPodcast
from src.youtube_scraper import YouTubeScraper
from src.instagram_scraper import InstagramScraper
from src.tiktok_scraper_advanced import TikTokScraperAdvanced

# Load environment variables
load_dotenv()


class ContentOrchestrator:
    """Orchestrates all content scrapers and handles synchronization."""

    def __init__(self, data_dir: Optional[Path] = None, logs_dir: Optional[Path] = None):
        """Initialize the orchestrator."""
        self.data_dir = data_dir or Path("/opt/hvac-kia-content/data")
        self.logs_dir = logs_dir or Path("/opt/hvac-kia-content/logs")
        self.nas_path = Path(os.getenv('NAS_PATH', '/mnt/nas/hkia'))
        self.timezone = os.getenv('TIMEZONE', 'America/Halifax')
        self.tz = pytz.timezone(self.timezone)

        # Ensure directories exist
        self.data_dir.mkdir(parents=True, exist_ok=True)
        self.logs_dir.mkdir(parents=True, exist_ok=True)

        # Configure scrapers
        self.scrapers = self._setup_scrapers()

        print(f"Orchestrator initialized with {len(self.scrapers)} scrapers")
        print(f"Data directory: {self.data_dir}")
        print(f"NAS path: {self.nas_path}")

    def _setup_scrapers(self) -> Dict[str, Any]:
        """Set up all scraper instances."""
        scrapers = {}

        # WordPress scraper
        config = ScraperConfig(
            source_name="wordpress",
            brand_name="hkia",
            data_dir=self.data_dir,
            logs_dir=self.logs_dir,
            timezone=self.timezone
        )
        scrapers['wordpress'] = WordPressScraper(config)

        # MailChimp RSS scraper
        config = ScraperConfig(
            source_name="mailchimp",
            brand_name="hkia",
            data_dir=self.data_dir,
            logs_dir=self.logs_dir,
            timezone=self.timezone
        )
        scrapers['mailchimp'] = RSSScraperMailChimp(config)

        # Podcast RSS scraper
        config = ScraperConfig(
            source_name="podcast",
            brand_name="hkia",
            data_dir=self.data_dir,
            logs_dir=self.logs_dir,
            timezone=self.timezone
        )
        scrapers['podcast'] = RSSScraperPodcast(config)

        # YouTube scraper
        config = ScraperConfig(
            source_name="youtube",
            brand_name="hkia",
            data_dir=self.data_dir,
            logs_dir=self.logs_dir,
            timezone=self.timezone
        )
        scrapers['youtube'] = YouTubeScraper(config)

        # Instagram scraper
        config = ScraperConfig(
            source_name="instagram",
            brand_name="hkia",
            data_dir=self.data_dir,
            logs_dir=self.logs_dir,
            timezone=self.timezone
        )
        scrapers['instagram'] = InstagramScraper(config)

        # TikTok scraper (advanced, uses a headed browser)
        config = ScraperConfig(
            source_name="tiktok",
            brand_name="hkia",
            data_dir=self.data_dir,
            logs_dir=self.logs_dir,
            timezone=self.timezone
        )
        scrapers['tiktok'] = TikTokScraperAdvanced(config)

        return scrapers

    def run_scraper(self, name: str, scraper: Any, max_workers: int = 1) -> Dict[str, Any]:
        """Run a single scraper and return results."""
        start_time = time.time()

        try:
            print(f"Starting {name} scraper...")

            # Fetch content
            content = scraper.fetch_content()

            if not content:
                print(f"⚠️ {name}: No content fetched")
                return {
                    'name': name,
                    'success': False,
                    'error': 'No content fetched',
                    'duration': time.time() - start_time,
                    'items': 0
                }

            # Load existing state
            state = scraper.load_state()

            # Get incremental items (new items only)
            new_items = scraper.get_incremental_items(content, state)
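            # At this point `new_items` contains only entries absent from the
            # scraper's saved state, so repeated runs write fresh content only.
            # (The exact state schema is defined by src.base_scraper, not here.)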
            if not new_items:
                print(f"✅ {name}: No new items (all up to date)")
                return {
                    'name': name,
                    'success': True,
                    'duration': time.time() - start_time,
                    'items': 0,
                    'new_items': 0
                }

            # Archive existing markdown files
            scraper.archive_current_file()

            # Generate and save markdown
            markdown = scraper.format_markdown(new_items)
            timestamp = datetime.now(scraper.tz).strftime("%Y%m%d_%H%M%S")
            filename = f"hkia_{name}_{timestamp}.md"

            # Save to current markdown directory
            current_dir = scraper.config.data_dir / "markdown_current"
            current_dir.mkdir(parents=True, exist_ok=True)
            output_file = current_dir / filename
            output_file.write_text(markdown)

            # Update state
            updated_state = scraper.update_state(state, new_items)
            scraper.save_state(updated_state)

            print(f"✅ {name}: {len(new_items)} new items saved to {filename}")
            return {
                'name': name,
                'success': True,
                'duration': time.time() - start_time,
                'items': len(content),
                'new_items': len(new_items),
                'file': str(output_file)
            }

        except Exception as e:
            print(f"❌ {name}: Error - {e}")
            return {
                'name': name,
                'success': False,
                'error': str(e),
                'duration': time.time() - start_time,
                'items': 0
            }

    def run_all_scrapers(self, parallel: bool = True, max_workers: int = 3) -> List[Dict[str, Any]]:
        """Run all scrapers in parallel or sequentially."""
        print(f"Running {len(self.scrapers)} scrapers {'in parallel' if parallel else 'sequentially'}...")

        start_time = time.time()
        results = []

        if parallel:
            # Run scrapers in parallel (except TikTok, which needs DISPLAY)
            non_gui_scrapers = {k: v for k, v in self.scrapers.items() if k != 'tiktok'}

            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                # Submit non-GUI scrapers
                future_to_name = {
                    executor.submit(self.run_scraper, name, scraper): name
                    for name, scraper in non_gui_scrapers.items()
                }

                # Collect results
                for future in as_completed(future_to_name):
                    result = future.result()
                    results.append(result)

            # Run TikTok separately (requires DISPLAY)
            if 'tiktok' in self.scrapers:
                print("Running TikTok scraper separately (requires GUI)...")
                tiktok_result = self.run_scraper('tiktok', self.scrapers['tiktok'])
                results.append(tiktok_result)
        else:
            # Run scrapers sequentially
            for name, scraper in self.scrapers.items():
                result = self.run_scraper(name, scraper)
                results.append(result)

        total_duration = time.time() - start_time
        successful = [r for r in results if r['success']]
        failed = [r for r in results if not r['success']]

        print(f"\n{'='*60}")
        print("ORCHESTRATOR SUMMARY")
        print(f"{'='*60}")
        print(f"Total duration: {total_duration:.2f} seconds")
        print(f"Successful: {len(successful)}/{len(results)}")
        print(f"Failed: {len(failed)}")

        for result in results:
            status = "✅" if result['success'] else "❌"
            duration = result['duration']
            items = result.get('new_items', result.get('items', 0))
            print(f"{status} {result['name']}: {items} items in {duration:.2f}s")
            if not result['success']:
                print(f" Error: {result.get('error', 'Unknown error')}")

        return results

    def sync_to_nas(self) -> bool:
        """Synchronize markdown files to NAS."""
        print(f"\nSyncing to NAS: {self.nas_path}")

        try:
            # Ensure NAS directory exists
            self.nas_path.mkdir(parents=True, exist_ok=True)

            # Sync current markdown files
            current_dir = self.data_dir / "markdown_current"
            if current_dir.exists():
                nas_current = self.nas_path / "current"
                nas_current.mkdir(parents=True, exist_ok=True)

                cmd = [
                    'rsync', '-av', '--delete',
                    f"{current_dir}/",
                    f"{nas_current}/"
                ]
                result = subprocess.run(cmd, capture_output=True, text=True)

                if result.returncode != 0:
                    print(f"❌ Current sync failed: {result.stderr}")
                    return False
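                # A zero exit status means rsync mirrored successfully; --delete
                # above keeps the NAS "current" folder an exact mirror by removing
                # files that no longer exist locally. The archive sync below omits
                # --delete so historical snapshots are never pruned from the NAS.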
print(f"✅ Current files synced to {nas_current}") # Sync archived files archive_dir = self.data_dir / "markdown_archives" if archive_dir.exists(): nas_archives = self.nas_path / "archives" nas_archives.mkdir(parents=True, exist_ok=True) cmd = [ 'rsync', '-av', f"{archive_dir}/", f"{nas_archives}/" ] result = subprocess.run(cmd, capture_output=True, text=True) if result.returncode != 0: print(f"❌ Archive sync failed: {result.stderr}") return False print(f"✅ Archive files synced to {nas_archives}") # Sync media files media_dir = self.data_dir / "media" if media_dir.exists(): nas_media = self.nas_path / "media" nas_media.mkdir(parents=True, exist_ok=True) cmd = [ 'rsync', '-av', '--delete', f"{media_dir}/", f"{nas_media}/" ] result = subprocess.run(cmd, capture_output=True, text=True) if result.returncode != 0: print(f"❌ Media sync failed: {result.stderr}") return False print(f"✅ Media files synced to {nas_media}") return True except Exception as e: print(f"❌ NAS sync error: {e}") return False def main(): """Main entry point.""" parser = argparse.ArgumentParser(description='HKIA Content Orchestrator') parser.add_argument('--data-dir', type=Path, help='Data directory path') parser.add_argument('--sync-nas', action='store_true', help='Sync to NAS after scraping') parser.add_argument('--nas-only', action='store_true', help='Only sync to NAS (no scraping)') parser.add_argument('--sequential', action='store_true', help='Run scrapers sequentially') parser.add_argument('--max-workers', type=int, default=3, help='Max parallel workers') parser.add_argument('--sources', nargs='+', help='Specific sources to run') args = parser.parse_args() # Initialize orchestrator orchestrator = ContentOrchestrator(data_dir=args.data_dir) if args.nas_only: # Only sync to NAS success = orchestrator.sync_to_nas() sys.exit(0 if success else 1) # Filter sources if specified if args.sources: filtered_scrapers = {k: v for k, v in orchestrator.scrapers.items() if k in args.sources} orchestrator.scrapers = filtered_scrapers print(f"Running only: {', '.join(args.sources)}") # Run scrapers results = orchestrator.run_all_scrapers( parallel=not args.sequential, max_workers=args.max_workers ) # Sync to NAS if requested if args.sync_nas: orchestrator.sync_to_nas() # Exit with appropriate code failed_count = sum(1 for r in results if not r['success']) sys.exit(failed_count) if __name__ == "__main__": main()