hvac-kia-content/src/orchestrator.py
Ben Reed daab901e35 refactor: Update naming convention from hvacknowitall to hkia
Major Changes:
- Updated all code references from hvacknowitall/hvacnkowitall to hkia
- Renamed all existing markdown files to use hkia_ prefix
- Updated configuration files, scrapers, and production scripts
- Modified systemd service descriptions to use HKIA
- Changed NAS sync path to /mnt/nas/hkia

Files Updated:
- 20+ source files updated with new naming convention
- 34 markdown files renamed to hkia_* format
- All ScraperConfig brand_name parameters now use 'hkia' (see the sketch after this list)
- Documentation updated to reflect new naming
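
For reference, a sketch of the convention as it now appears in the orchestrator code below (values mirror this file; the bare variable names are placeholders, no new API is introduced):

    config = ScraperConfig(
        source_name="wordpress",
        brand_name="hkia",
        data_dir=data_dir,
        logs_dir=logs_dir,
        timezone=timezone,
    )
    # Output markdown follows the hkia_<source>_<timestamp>.md pattern,
    # e.g. hkia_wordpress_YYYYMMDD_HHMMSS.md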

Rationale:
- Shorter, cleaner filenames
- Consistent branding across all outputs
- Easier to type and reference
- Maintains same functionality with improved naming

Next Steps:
- Deploy updated services to production
- Update any external references to old naming
- Monitor scrapers to ensure proper operation

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-19 13:35:23 -03:00


#!/usr/bin/env python3
"""
HKIA Content Orchestrator
Coordinates all scrapers and handles NAS synchronization.
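
Example invocations (flags are defined in main() below; assumed to be run
from the repository root, the systemd unit may use a different entry point):

    python -m src.orchestrator --sync-nas
    python -m src.orchestrator --sources wordpress youtube --sequential
    python -m src.orchestrator --nas-only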
"""
import os
import sys
import time
import argparse
import subprocess
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Any
from concurrent.futures import ThreadPoolExecutor, as_completed
import pytz
from dotenv import load_dotenv
# Import all scrapers
from src.base_scraper import ScraperConfig
from src.wordpress_scraper import WordPressScraper
from src.rss_scraper import RSSScraperMailChimp, RSSScraperPodcast
from src.youtube_scraper import YouTubeScraper
from src.instagram_scraper import InstagramScraper
from src.tiktok_scraper_advanced import TikTokScraperAdvanced
# Load environment variables
load_dotenv()


class ContentOrchestrator:
    """Orchestrates all content scrapers and handles synchronization."""

    def __init__(self, data_dir: Path = None, logs_dir: Path = None):
        """Initialize the orchestrator."""
        self.data_dir = data_dir or Path("/opt/hvac-kia-content/data")
        self.logs_dir = logs_dir or Path("/opt/hvac-kia-content/logs")
        self.nas_path = Path(os.getenv('NAS_PATH', '/mnt/nas/hkia'))
        self.timezone = os.getenv('TIMEZONE', 'America/Halifax')
        self.tz = pytz.timezone(self.timezone)

        # Ensure directories exist
        self.data_dir.mkdir(parents=True, exist_ok=True)
        self.logs_dir.mkdir(parents=True, exist_ok=True)

        # Configure scrapers
        self.scrapers = self._setup_scrapers()

        print(f"Orchestrator initialized with {len(self.scrapers)} scrapers")
        print(f"Data directory: {self.data_dir}")
        print(f"NAS path: {self.nas_path}")

    def _setup_scrapers(self) -> Dict[str, Any]:
        """Set up all scraper instances."""
        scrapers = {}

        # WordPress scraper
        config = ScraperConfig(
            source_name="wordpress",
            brand_name="hkia",
            data_dir=self.data_dir,
            logs_dir=self.logs_dir,
            timezone=self.timezone
        )
        scrapers['wordpress'] = WordPressScraper(config)

        # MailChimp RSS scraper
        config = ScraperConfig(
            source_name="mailchimp",
            brand_name="hkia",
            data_dir=self.data_dir,
            logs_dir=self.logs_dir,
            timezone=self.timezone
        )
        scrapers['mailchimp'] = RSSScraperMailChimp(config)

        # Podcast RSS scraper
        config = ScraperConfig(
            source_name="podcast",
            brand_name="hkia",
            data_dir=self.data_dir,
            logs_dir=self.logs_dir,
            timezone=self.timezone
        )
        scrapers['podcast'] = RSSScraperPodcast(config)

        # YouTube scraper
        config = ScraperConfig(
            source_name="youtube",
            brand_name="hkia",
            data_dir=self.data_dir,
            logs_dir=self.logs_dir,
            timezone=self.timezone
        )
        scrapers['youtube'] = YouTubeScraper(config)

        # Instagram scraper
        config = ScraperConfig(
            source_name="instagram",
            brand_name="hkia",
            data_dir=self.data_dir,
            logs_dir=self.logs_dir,
            timezone=self.timezone
        )
        scrapers['instagram'] = InstagramScraper(config)

        # TikTok scraper (advanced with headed browser)
        config = ScraperConfig(
            source_name="tiktok",
            brand_name="hkia",
            data_dir=self.data_dir,
            logs_dir=self.logs_dir,
            timezone=self.timezone
        )
        scrapers['tiktok'] = TikTokScraperAdvanced(config)

        return scrapers

    def run_scraper(self, name: str, scraper: Any, max_workers: int = 1) -> Dict[str, Any]:
        """Run a single scraper and return results."""
        start_time = time.time()
        try:
            print(f"Starting {name} scraper...")

            # Fetch content
            content = scraper.fetch_content()
            if not content:
                print(f"⚠️ {name}: No content fetched")
                return {
                    'name': name,
                    'success': False,
                    'error': 'No content fetched',
                    'duration': time.time() - start_time,
                    'items': 0
                }

            # Load existing state
            state = scraper.load_state()

            # Get incremental items (new items only)
            new_items = scraper.get_incremental_items(content, state)
            if not new_items:
                print(f"{name}: No new items (all up to date)")
                return {
                    'name': name,
                    'success': True,
                    'duration': time.time() - start_time,
                    'items': 0,
                    'new_items': 0
                }

            # Archive existing markdown files
            scraper.archive_current_file()

            # Generate and save markdown
            markdown = scraper.format_markdown(new_items)
            timestamp = datetime.now(scraper.tz).strftime("%Y%m%d_%H%M%S")
            filename = f"hkia_{name}_{timestamp}.md"

            # Save to current markdown directory
            current_dir = scraper.config.data_dir / "markdown_current"
            current_dir.mkdir(parents=True, exist_ok=True)
            output_file = current_dir / filename
            output_file.write_text(markdown)

            # Update state
            updated_state = scraper.update_state(state, new_items)
            scraper.save_state(updated_state)

            print(f"{name}: {len(new_items)} new items saved to {filename}")
            return {
                'name': name,
                'success': True,
                'duration': time.time() - start_time,
                'items': len(content),
                'new_items': len(new_items),
                'file': str(output_file)
            }
        except Exception as e:
            print(f"{name}: Error - {e}")
            return {
                'name': name,
                'success': False,
                'error': str(e),
                'duration': time.time() - start_time,
                'items': 0
            }

    def run_all_scrapers(self, parallel: bool = True, max_workers: int = 3) -> List[Dict[str, Any]]:
        """Run all scrapers in parallel or sequentially."""
        print(f"Running {len(self.scrapers)} scrapers {'in parallel' if parallel else 'sequentially'}...")
        start_time = time.time()
        results = []

        if parallel:
            # Run scrapers in parallel (except TikTok which needs DISPLAY)
            non_gui_scrapers = {k: v for k, v in self.scrapers.items() if k != 'tiktok'}
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                # Submit non-GUI scrapers
                future_to_name = {
                    executor.submit(self.run_scraper, name, scraper): name
                    for name, scraper in non_gui_scrapers.items()
                }
                # Collect results
                for future in as_completed(future_to_name):
                    result = future.result()
                    results.append(result)

            # Run TikTok separately (requires DISPLAY)
            if 'tiktok' in self.scrapers:
                print("Running TikTok scraper separately (requires GUI)...")
                tiktok_result = self.run_scraper('tiktok', self.scrapers['tiktok'])
                results.append(tiktok_result)
        else:
            # Run scrapers sequentially
            for name, scraper in self.scrapers.items():
                result = self.run_scraper(name, scraper)
                results.append(result)

        total_duration = time.time() - start_time
        successful = [r for r in results if r['success']]
        failed = [r for r in results if not r['success']]

        print(f"\n{'='*60}")
        print("ORCHESTRATOR SUMMARY")
        print(f"{'='*60}")
        print(f"Total duration: {total_duration:.2f} seconds")
        print(f"Successful: {len(successful)}/{len(results)}")
        print(f"Failed: {len(failed)}")

        for result in results:
            status = "✅" if result['success'] else "❌"
            duration = result['duration']
            items = result.get('new_items', result.get('items', 0))
            print(f"{status} {result['name']}: {items} items in {duration:.2f}s")
            if not result['success']:
                print(f"  Error: {result.get('error', 'Unknown error')}")

        return results

    def sync_to_nas(self) -> bool:
        """Synchronize markdown files to NAS."""
        print(f"\nSyncing to NAS: {self.nas_path}")
        try:
            # Ensure NAS directory exists
            self.nas_path.mkdir(parents=True, exist_ok=True)

            # Sync current markdown files
            current_dir = self.data_dir / "markdown_current"
            if current_dir.exists():
                nas_current = self.nas_path / "current"
                nas_current.mkdir(parents=True, exist_ok=True)
                cmd = [
                    'rsync', '-av', '--delete',
                    f"{current_dir}/",
                    f"{nas_current}/"
                ]
                result = subprocess.run(cmd, capture_output=True, text=True)
                if result.returncode != 0:
                    print(f"❌ Current sync failed: {result.stderr}")
                    return False
                print(f"✅ Current files synced to {nas_current}")

            # Sync archived files
            archive_dir = self.data_dir / "markdown_archives"
            if archive_dir.exists():
                nas_archives = self.nas_path / "archives"
                nas_archives.mkdir(parents=True, exist_ok=True)
                cmd = [
                    'rsync', '-av',
                    f"{archive_dir}/",
                    f"{nas_archives}/"
                ]
                result = subprocess.run(cmd, capture_output=True, text=True)
                if result.returncode != 0:
                    print(f"❌ Archive sync failed: {result.stderr}")
                    return False
                print(f"✅ Archive files synced to {nas_archives}")

            # Sync media files
            media_dir = self.data_dir / "media"
            if media_dir.exists():
                nas_media = self.nas_path / "media"
                nas_media.mkdir(parents=True, exist_ok=True)
                cmd = [
                    'rsync', '-av', '--delete',
                    f"{media_dir}/",
                    f"{nas_media}/"
                ]
                result = subprocess.run(cmd, capture_output=True, text=True)
                if result.returncode != 0:
                    print(f"❌ Media sync failed: {result.stderr}")
                    return False
                print(f"✅ Media files synced to {nas_media}")

            return True
        except Exception as e:
            print(f"❌ NAS sync error: {e}")
            return False


def main():
    """Main entry point."""
    parser = argparse.ArgumentParser(description='HKIA Content Orchestrator')
    parser.add_argument('--data-dir', type=Path, help='Data directory path')
    parser.add_argument('--sync-nas', action='store_true', help='Sync to NAS after scraping')
    parser.add_argument('--nas-only', action='store_true', help='Only sync to NAS (no scraping)')
    parser.add_argument('--sequential', action='store_true', help='Run scrapers sequentially')
    parser.add_argument('--max-workers', type=int, default=3, help='Max parallel workers')
    parser.add_argument('--sources', nargs='+', help='Specific sources to run')
    args = parser.parse_args()

    # Initialize orchestrator
    orchestrator = ContentOrchestrator(data_dir=args.data_dir)

    if args.nas_only:
        # Only sync to NAS
        success = orchestrator.sync_to_nas()
        sys.exit(0 if success else 1)

    # Filter sources if specified
    if args.sources:
        filtered_scrapers = {k: v for k, v in orchestrator.scrapers.items() if k in args.sources}
        orchestrator.scrapers = filtered_scrapers
        print(f"Running only: {', '.join(args.sources)}")

    # Run scrapers
    results = orchestrator.run_all_scrapers(
        parallel=not args.sequential,
        max_workers=args.max_workers
    )

    # Sync to NAS if requested
    if args.sync_nas:
        orchestrator.sync_to_nas()

    # Exit with appropriate code
    failed_count = sum(1 for r in results if not r['success'])
    sys.exit(failed_count)

if __name__ == "__main__":
    main()
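
# Programmatic use (illustrative sketch only, not part of the deployed service).
# The scratch data directory below is an assumption, and the import path
# assumes execution from the repository root.
#
#     from pathlib import Path
#     from src.orchestrator import ContentOrchestrator
#
#     orchestrator = ContentOrchestrator(data_dir=Path("/tmp/hkia-data"))
#     orchestrator.scrapers = {
#         name: scraper for name, scraper in orchestrator.scrapers.items()
#         if name in ("wordpress", "youtube")
#     }
#     results = orchestrator.run_all_scrapers(parallel=True, max_workers=2)
#     if all(r["success"] for r in results):
#         orchestrator.sync_to_nas()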