Major Changes:
- Updated all code references from hvacknowitall/hvacnkowitall to hkia
- Renamed all existing markdown files to use the hkia_ prefix
- Updated configuration files, scrapers, and production scripts
- Modified systemd service descriptions to use HKIA
- Changed NAS sync path to /mnt/nas/hkia

Files Updated:
- 20+ source files updated with the new naming convention
- 34 markdown files renamed to the hkia_* format
- All ScraperConfig brand_name parameters now use 'hkia'
- Documentation updated to reflect the new naming

Rationale:
- Shorter, cleaner filenames
- Consistent branding across all outputs
- Easier to type and reference
- Maintains the same functionality with improved naming

Next Steps:
- Deploy updated services to production
- Update any external references to the old naming
- Monitor scrapers to ensure proper operation

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
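As an illustration of the brand_name change (a sketch only: the old value is reconstructed from the names listed above, and the other parameters mirror the defaults used in the orchestrator below):

    from pathlib import Path
    from src.base_scraper import ScraperConfig

    # Before (old branding, shown for contrast):
    # config = ScraperConfig(source_name="wordpress", brand_name="hvacknowitall", ...)

    # After (new branding, as used throughout the updated code):
    config = ScraperConfig(
        source_name="wordpress",
        brand_name="hkia",
        data_dir=Path("/opt/hvac-kia-content/data"),
        logs_dir=Path("/opt/hvac-kia-content/logs"),
        timezone="America/Halifax",
    )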
365 lines · No EOL · 13 KiB · Python
#!/usr/bin/env python3
"""
HKIA Content Orchestrator
Coordinates all scrapers and handles NAS synchronization.
"""

import os
import sys
import time
import argparse
import subprocess
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Any
from concurrent.futures import ThreadPoolExecutor, as_completed
import pytz
from dotenv import load_dotenv

# Import all scrapers
from src.base_scraper import ScraperConfig
from src.wordpress_scraper import WordPressScraper
from src.rss_scraper import RSSScraperMailChimp, RSSScraperPodcast
from src.youtube_scraper import YouTubeScraper
from src.instagram_scraper import InstagramScraper
from src.tiktok_scraper_advanced import TikTokScraperAdvanced

# Load environment variables
load_dotenv()


class ContentOrchestrator:
    """Orchestrates all content scrapers and handles synchronization."""

    def __init__(self, data_dir: Path = None, logs_dir: Path = None):
        """Initialize the orchestrator."""
        self.data_dir = data_dir or Path("/opt/hvac-kia-content/data")
        self.logs_dir = logs_dir or Path("/opt/hvac-kia-content/logs")
        self.nas_path = Path(os.getenv('NAS_PATH', '/mnt/nas/hkia'))
        self.timezone = os.getenv('TIMEZONE', 'America/Halifax')
        self.tz = pytz.timezone(self.timezone)

        # Ensure directories exist
        self.data_dir.mkdir(parents=True, exist_ok=True)
        self.logs_dir.mkdir(parents=True, exist_ok=True)

        # Configure scrapers
        self.scrapers = self._setup_scrapers()

        print(f"Orchestrator initialized with {len(self.scrapers)} scrapers")
        print(f"Data directory: {self.data_dir}")
        print(f"NAS path: {self.nas_path}")

    def _setup_scrapers(self) -> Dict[str, Any]:
        """Set up all scraper instances."""
        scrapers = {}

        # WordPress scraper
        config = ScraperConfig(
            source_name="wordpress",
            brand_name="hkia",
            data_dir=self.data_dir,
            logs_dir=self.logs_dir,
            timezone=self.timezone
        )
        scrapers['wordpress'] = WordPressScraper(config)

        # MailChimp RSS scraper
        config = ScraperConfig(
            source_name="mailchimp",
            brand_name="hkia",
            data_dir=self.data_dir,
            logs_dir=self.logs_dir,
            timezone=self.timezone
        )
        scrapers['mailchimp'] = RSSScraperMailChimp(config)

        # Podcast RSS scraper
        config = ScraperConfig(
            source_name="podcast",
            brand_name="hkia",
            data_dir=self.data_dir,
            logs_dir=self.logs_dir,
            timezone=self.timezone
        )
        scrapers['podcast'] = RSSScraperPodcast(config)

        # YouTube scraper
        config = ScraperConfig(
            source_name="youtube",
            brand_name="hkia",
            data_dir=self.data_dir,
            logs_dir=self.logs_dir,
            timezone=self.timezone
        )
        scrapers['youtube'] = YouTubeScraper(config)

        # Instagram scraper
        config = ScraperConfig(
            source_name="instagram",
            brand_name="hkia",
            data_dir=self.data_dir,
            logs_dir=self.logs_dir,
            timezone=self.timezone
        )
        scrapers['instagram'] = InstagramScraper(config)

        # TikTok scraper (advanced with headed browser)
        config = ScraperConfig(
            source_name="tiktok",
            brand_name="hkia",
            data_dir=self.data_dir,
            logs_dir=self.logs_dir,
            timezone=self.timezone
        )
        scrapers['tiktok'] = TikTokScraperAdvanced(config)

        return scrapers

    def run_scraper(self, name: str, scraper: Any, max_workers: int = 1) -> Dict[str, Any]:
        """Run a single scraper and return results."""
        start_time = time.time()

        try:
            print(f"Starting {name} scraper...")

            # Fetch content
            content = scraper.fetch_content()

            if not content:
                print(f"⚠️ {name}: No content fetched")
                return {
                    'name': name,
                    'success': False,
                    'error': 'No content fetched',
                    'duration': time.time() - start_time,
                    'items': 0
                }

            # Load existing state
            state = scraper.load_state()

            # Get incremental items (new items only)
            new_items = scraper.get_incremental_items(content, state)

            if not new_items:
                print(f"✅ {name}: No new items (all up to date)")
                return {
                    'name': name,
                    'success': True,
                    'duration': time.time() - start_time,
                    'items': 0,
                    'new_items': 0
                }

            # Archive existing markdown files
            scraper.archive_current_file()

            # Generate and save markdown
            markdown = scraper.format_markdown(new_items)
            timestamp = datetime.now(scraper.tz).strftime("%Y%m%d_%H%M%S")
            filename = f"hkia_{name}_{timestamp}.md"

            # Save to current markdown directory
            current_dir = scraper.config.data_dir / "markdown_current"
            current_dir.mkdir(parents=True, exist_ok=True)
            output_file = current_dir / filename
            output_file.write_text(markdown)

            # Update state
            updated_state = scraper.update_state(state, new_items)
            scraper.save_state(updated_state)

            print(f"✅ {name}: {len(new_items)} new items saved to {filename}")

            return {
                'name': name,
                'success': True,
                'duration': time.time() - start_time,
                'items': len(content),
                'new_items': len(new_items),
                'file': str(output_file)
            }

        except Exception as e:
            print(f"❌ {name}: Error - {e}")
            return {
                'name': name,
                'success': False,
                'error': str(e),
                'duration': time.time() - start_time,
                'items': 0
            }

    def run_all_scrapers(self, parallel: bool = True, max_workers: int = 3) -> List[Dict[str, Any]]:
        """Run all scrapers in parallel or sequentially."""
        print(f"Running {len(self.scrapers)} scrapers {'in parallel' if parallel else 'sequentially'}...")
        start_time = time.time()

        results = []

        if parallel:
            # Run scrapers in parallel (except TikTok which needs DISPLAY)
            non_gui_scrapers = {k: v for k, v in self.scrapers.items() if k != 'tiktok'}

            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                # Submit non-GUI scrapers
                future_to_name = {
                    executor.submit(self.run_scraper, name, scraper): name
                    for name, scraper in non_gui_scrapers.items()
                }

                # Collect results
                for future in as_completed(future_to_name):
                    result = future.result()
                    results.append(result)

            # Run TikTok separately (requires DISPLAY)
            if 'tiktok' in self.scrapers:
                print("Running TikTok scraper separately (requires GUI)...")
                tiktok_result = self.run_scraper('tiktok', self.scrapers['tiktok'])
                results.append(tiktok_result)

        else:
            # Run scrapers sequentially
            for name, scraper in self.scrapers.items():
                result = self.run_scraper(name, scraper)
                results.append(result)

        total_duration = time.time() - start_time
        successful = [r for r in results if r['success']]
        failed = [r for r in results if not r['success']]

        print(f"\n{'='*60}")
        print("ORCHESTRATOR SUMMARY")
        print(f"{'='*60}")
        print(f"Total duration: {total_duration:.2f} seconds")
        print(f"Successful: {len(successful)}/{len(results)}")
        print(f"Failed: {len(failed)}")

        for result in results:
            status = "✅" if result['success'] else "❌"
            duration = result['duration']
            items = result.get('new_items', result.get('items', 0))
            print(f"{status} {result['name']}: {items} items in {duration:.2f}s")

            if not result['success']:
                print(f" Error: {result.get('error', 'Unknown error')}")

        return results

    def sync_to_nas(self) -> bool:
        """Synchronize markdown files to NAS."""
        print(f"\nSyncing to NAS: {self.nas_path}")

        try:
            # Ensure NAS directory exists
            self.nas_path.mkdir(parents=True, exist_ok=True)

            # Sync current markdown files (--delete mirrors the source, removing stale files on the NAS)
            current_dir = self.data_dir / "markdown_current"
            if current_dir.exists():
                nas_current = self.nas_path / "current"
                nas_current.mkdir(parents=True, exist_ok=True)

                cmd = [
                    'rsync', '-av', '--delete',
                    f"{current_dir}/",
                    f"{nas_current}/"
                ]

                result = subprocess.run(cmd, capture_output=True, text=True)
                if result.returncode != 0:
                    print(f"❌ Current sync failed: {result.stderr}")
                    return False

                print(f"✅ Current files synced to {nas_current}")

            # Sync archived files (no --delete, so the NAS archive is append-only)
            archive_dir = self.data_dir / "markdown_archives"
            if archive_dir.exists():
                nas_archives = self.nas_path / "archives"
                nas_archives.mkdir(parents=True, exist_ok=True)

                cmd = [
                    'rsync', '-av',
                    f"{archive_dir}/",
                    f"{nas_archives}/"
                ]

                result = subprocess.run(cmd, capture_output=True, text=True)
                if result.returncode != 0:
                    print(f"❌ Archive sync failed: {result.stderr}")
                    return False

                print(f"✅ Archive files synced to {nas_archives}")

            # Sync media files
            media_dir = self.data_dir / "media"
            if media_dir.exists():
                nas_media = self.nas_path / "media"
                nas_media.mkdir(parents=True, exist_ok=True)

                cmd = [
                    'rsync', '-av', '--delete',
                    f"{media_dir}/",
                    f"{nas_media}/"
                ]

                result = subprocess.run(cmd, capture_output=True, text=True)
                if result.returncode != 0:
                    print(f"❌ Media sync failed: {result.stderr}")
                    return False

                print(f"✅ Media files synced to {nas_media}")

            return True

        except Exception as e:
            print(f"❌ NAS sync error: {e}")
            return False


def main():
    """Main entry point."""
    parser = argparse.ArgumentParser(description='HKIA Content Orchestrator')
    parser.add_argument('--data-dir', type=Path, help='Data directory path')
    parser.add_argument('--sync-nas', action='store_true', help='Sync to NAS after scraping')
    parser.add_argument('--nas-only', action='store_true', help='Only sync to NAS (no scraping)')
    parser.add_argument('--sequential', action='store_true', help='Run scrapers sequentially')
    parser.add_argument('--max-workers', type=int, default=3, help='Max parallel workers')
    parser.add_argument('--sources', nargs='+', help='Specific sources to run')

    args = parser.parse_args()

    # Initialize orchestrator
    orchestrator = ContentOrchestrator(data_dir=args.data_dir)

    if args.nas_only:
        # Only sync to NAS
        success = orchestrator.sync_to_nas()
        sys.exit(0 if success else 1)

    # Filter sources if specified
    if args.sources:
        filtered_scrapers = {k: v for k, v in orchestrator.scrapers.items() if k in args.sources}
        orchestrator.scrapers = filtered_scrapers
        print(f"Running only: {', '.join(args.sources)}")

    # Run scrapers
    results = orchestrator.run_all_scrapers(
        parallel=not args.sequential,
        max_workers=args.max_workers
    )

    # Sync to NAS if requested
    if args.sync_nas:
        orchestrator.sync_to_nas()

    # Exit with the number of failed scrapers (0 means all succeeded)
    failed_count = sum(1 for r in results if not r['success'])
    sys.exit(failed_count)


if __name__ == "__main__":
    main()
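For reference, the argparse flags defined in main() support invocations along these lines (the script filename is an assumption; it is not shown on this page):

    python3 orchestrator.py                       # run all scrapers in parallel (default: 3 workers)
    python3 orchestrator.py --sync-nas            # scrape, then mirror markdown and media to the NAS
    python3 orchestrator.py --sources wordpress youtube --sequential
    python3 orchestrator.py --nas-only            # sync existing output only, no scraping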