hvac-kia-content/run_production.py
Ben Reed daab901e35 refactor: Update naming convention from hvacknowitall to hkia
Major Changes:
- Updated all code references from hvacknowitall (and the misspelled variant hvacnkowitall) to hkia
- Renamed all existing markdown files to use hkia_ prefix
- Updated configuration files, scrapers, and production scripts
- Modified systemd service descriptions to use HKIA
- Changed NAS sync path to /mnt/nas/hkia

Files Updated:
- 20+ source files updated with new naming convention
- 34 markdown files renamed to hkia_* format
- All ScraperConfig brand_name parameters now use 'hkia'
- Documentation updated to reflect new naming

Rationale:
- Shorter, cleaner filenames
- Consistent branding across all outputs
- Easier to type and reference
- Maintains same functionality with improved naming

Next Steps:
- Deploy updated services to production
- Update any external references to old naming
- Monitor scrapers to ensure proper operation

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-19 13:35:23 -03:00


#!/usr/bin/env python3
"""
Production runner for HKIA Content Aggregator
Handles both regular scraping and special TikTok caption jobs
"""
import sys
import os
import argparse
import logging
from pathlib import Path
from datetime import datetime
import time
import json

# Add project to path
sys.path.insert(0, str(Path(__file__).parent))

from src.orchestrator import ContentOrchestrator
from src.base_scraper import ScraperConfig
from config.production import (
    SCRAPERS_CONFIG,
    PARALLEL_PROCESSING,
    OUTPUT_CONFIG,
    DATA_DIR,
    LOGS_DIR,
    TIKTOK_CAPTION_JOB
)
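
# SCRAPERS_CONFIG and the other settings imported above live in
# config/production.py. For orientation, a hypothetical per-source entry
# (illustrative only; the real values are defined in that module) looks like:
#   "tiktok": {
#       "enabled": True,
#       "incremental": True,
#       "max_posts": 25,
#       "fetch_captions": False,
#       "max_caption_fetches": 0,
#   }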


# Set up logging
def setup_logging(job_type="regular"):
    """Set up production logging"""
    log_file = LOGS_DIR / f"production_{job_type}_{datetime.now():%Y%m%d}.log"
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_file),
            logging.StreamHandler()
        ]
    )
    return logging.getLogger(__name__)


def validate_environment():
    """Validate required environment variables exist"""
    required_vars = [
        'WORDPRESS_USERNAME',
        'WORDPRESS_API_KEY',
        'YOUTUBE_CHANNEL_URL',
        'INSTAGRAM_USERNAME',
        'INSTAGRAM_PASSWORD',
        'TIKTOK_TARGET',
        'NAS_PATH'
    ]
    missing = []
    for var in required_vars:
        if not os.getenv(var):
            missing.append(var)
    if missing:
        raise ValueError(f"Missing required environment variables: {', '.join(missing)}")
    return True


def validate_config():
    """Validate configuration values are reasonable"""
    from config.production import SCRAPERS_CONFIG, RETRY_CONFIG, PARALLEL_PROCESSING

    errors = []

    # Validate scraper configs
    for source, config in SCRAPERS_CONFIG.items():
        # Check max items are positive
        for key in ['max_posts', 'max_items', 'max_videos']:
            if key in config and config[key] is not None:
                if config[key] <= 0:
                    errors.append(f"{source}: {key} must be positive (got {config[key]})")
        # Check max_caption_fetches is reasonable
        if 'max_caption_fetches' in config:
            if config['max_caption_fetches'] < 0:
                errors.append(f"{source}: max_caption_fetches cannot be negative")
            if config['max_caption_fetches'] > 100:
                errors.append(f"{source}: max_caption_fetches too high (>100)")

    # Validate retry config
    if RETRY_CONFIG['max_attempts'] < 1:
        errors.append("RETRY_CONFIG: max_attempts must be at least 1")
    if RETRY_CONFIG['initial_delay'] < 0:
        errors.append("RETRY_CONFIG: initial_delay cannot be negative")
    if RETRY_CONFIG['max_delay'] < RETRY_CONFIG['initial_delay']:
        errors.append("RETRY_CONFIG: max_delay must be >= initial_delay")

    # Validate parallel processing
    if PARALLEL_PROCESSING.get('max_workers', 1) < 1:
        errors.append("PARALLEL_PROCESSING: max_workers must be at least 1")
    if PARALLEL_PROCESSING.get('max_workers', 1) > 10:
        errors.append("PARALLEL_PROCESSING: max_workers too high (>10)")

    if errors:
        raise ValueError("Configuration validation failed:\n" + "\n".join(errors))
    return True


def run_regular_scraping():
    """Run regular incremental scraping for all sources"""
    logger = setup_logging("regular")
    logger.info("Starting regular production scraping run")

    # Validate environment and config first
    try:
        validate_environment()
        logger.info("Environment validation passed")
        validate_config()
        logger.info("Configuration validation passed")
    except ValueError as e:
        logger.error(f"Validation failed: {e}")
        return False

    start_time = time.time()
    results = {}

    try:
        # Create orchestrator config
        config = ScraperConfig(
            source_name="production",
            brand_name="hkia",
            data_dir=DATA_DIR,
            logs_dir=LOGS_DIR,
            timezone="America/Halifax"
        )

        # Initialize orchestrator
        orchestrator = ContentOrchestrator(config)
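
        # Each source gets its own fetch call below because the scraper APIs
        # differ slightly (TikTok takes caption options, YouTube counts videos,
        # Instagram counts posts); errors are caught per source so one failing
        # scraper does not abort the whole run.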
        # Configure each scraper
        for source, settings in SCRAPERS_CONFIG.items():
            if not settings.get("enabled", True):
                logger.info(f"Skipping {source} (disabled)")
                continue

            logger.info(f"Processing {source}...")
            try:
                scraper = orchestrator.scrapers.get(source)
                if not scraper:
                    logger.warning(f"Scraper not found: {source}")
                    continue

                # Set max items based on config
                max_items = settings.get("max_posts") or settings.get("max_items") or settings.get("max_videos")

                # Special handling for TikTok
                if source == "tiktok":
                    items = scraper.fetch_content(
                        max_posts=max_items,
                        fetch_captions=settings.get("fetch_captions", False),
                        max_caption_fetches=settings.get("max_caption_fetches", 0)
                    )
                elif source == "youtube":
                    items = scraper.fetch_channel_videos(max_videos=max_items)
                elif source == "instagram":
                    items = scraper.fetch_content(max_posts=max_items)
                else:
                    items = scraper.fetch_content(max_items=max_items)

                # Apply incremental logic
                if settings.get("incremental", True):
                    state = scraper.load_state()
                    new_items = scraper.get_incremental_items(items, state)
                    if new_items:
                        logger.info(f"Found {len(new_items)} new items for {source}")
                        # Update state
                        new_state = scraper.update_state(state, new_items)
                        scraper.save_state(new_state)
                        items = new_items
                    else:
                        logger.info(f"No new items for {source}")
                        items = []

                results[source] = {
                    "count": len(items),
                    "success": True,
                    "items": items
                }
            except Exception as e:
                logger.error(f"Error processing {source}: {e}")
                results[source] = {
                    "count": 0,
                    "success": False,
                    "error": str(e)
                }

        # Combine and save results
        if OUTPUT_CONFIG.get("combine_sources", True):
            combined_markdown = []
            combined_markdown.append("# HKIA Content Update")
            combined_markdown.append(f"Generated: {datetime.now():%Y-%m-%d %H:%M:%S}")
            combined_markdown.append("")

            for source, result in results.items():
                if result["success"] and result["count"] > 0:
                    combined_markdown.append(f"\n## {source.upper()} ({result['count']} new items)")
                    combined_markdown.append("")
                    # Format items
                    scraper = orchestrator.scrapers.get(source)
                    if scraper and result["items"]:
                        markdown = scraper.format_markdown(result["items"])
                        combined_markdown.append(markdown)

            # Save combined output with spec-compliant naming
            # Format: hkia_combined_YYYY-MM-DD-THHMMSS.md
            output_file = DATA_DIR / f"hkia_combined_{datetime.now():%Y-%m-%d-T%H%M%S}.md"
            output_file.write_text("\n".join(combined_markdown), encoding="utf-8")
            logger.info(f"Saved combined output to {output_file}")

        # Log summary
        duration = time.time() - start_time
        total_items = sum(r["count"] for r in results.values())
        logger.info(f"Production run complete: {total_items} total items in {duration:.1f}s")

        # Save metrics
        metrics_file = LOGS_DIR / "metrics.json"
        metrics = {
            "timestamp": datetime.now().isoformat(),
            "duration": duration,
            "results": results
        }
        with open(metrics_file, "a") as f:
            f.write(json.dumps(metrics) + "\n")
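        # One JSON object is appended per run, so metrics.json is effectively a
        # JSON Lines file and run history accumulates without rewriting the file.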

        # Sync to NAS if configured and items were found
        if total_items > 0:
            try:
                logger.info("Starting NAS synchronization...")
                if orchestrator.sync_to_nas():
                    logger.info("NAS sync completed successfully")
                else:
                    logger.warning("NAS sync failed - check configuration")
            except Exception as e:
                logger.error(f"NAS sync error: {e}")
                # Don't fail the entire run for NAS sync issues

        # Send health check ping if configured
        healthcheck_url = os.getenv("HEALTHCHECK_URL")
        if healthcheck_url:
            try:
                import requests

                # Include metrics in health check
                health_data = {
                    "status": "success",
                    "items": total_items,
                    "duration": duration,
                    "sources": len([r for r in results.values() if r["success"]])
                }
                response = requests.post(healthcheck_url, json=health_data, timeout=5)
                if response.status_code == 200:
                    logger.info("Health check ping sent successfully")
                else:
                    logger.warning(f"Health check ping failed: {response.status_code}")
            except Exception as e:
                logger.warning(f"Could not send health check: {e}")

        return True

    except Exception as e:
        logger.error(f"Production run failed: {e}")
        return False
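

# Caption fetching is separated into its own job (configured via
# TIKTOK_CAPTION_JOB in config/production.py) so the slower caption requests
# can run overnight without holding up the regular incremental scrape.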
def run_tiktok_caption_job():
    """Special overnight job for fetching TikTok captions"""
    if not TIKTOK_CAPTION_JOB.get("enabled", False):
        return True

    logger = setup_logging("tiktok_captions")
    logger.info("Starting TikTok caption fetching job")

    try:
        from src.tiktok_scraper_advanced import TikTokScraperAdvanced

        config = ScraperConfig(
            source_name="tiktok_captions",
            brand_name="hkia",
            data_dir=DATA_DIR / "tiktok_captions",
            logs_dir=LOGS_DIR / "tiktok_captions",
            timezone="America/Halifax"
        )
        scraper = TikTokScraperAdvanced(config)

        # Fetch with captions
        items = scraper.fetch_content(
            max_posts=TIKTOK_CAPTION_JOB["max_posts"],
            fetch_captions=True,
            max_caption_fetches=TIKTOK_CAPTION_JOB["max_caption_fetches"]
        )

        # Save results
        markdown = scraper.format_markdown(items)
        output_file = DATA_DIR / f"tiktok_captions_{datetime.now():%Y%m%d}.md"
        output_file.write_text(markdown, encoding="utf-8")

        logger.info(f"TikTok caption job complete: {len(items)} videos processed")
        return True

    except Exception as e:
        logger.error(f"TikTok caption job failed: {e}")
        return False


def main():
    """Main entry point"""
    parser = argparse.ArgumentParser(description="Production content aggregator")
    parser.add_argument(
        "--job",
        choices=["regular", "tiktok-captions", "all"],
        default="regular",
        help="Job type to run"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Test run without saving state"
    )
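    # NOTE: --dry-run is parsed but not yet passed through to the job runners,
    # so it currently has no effect in this script.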
    args = parser.parse_args()

    # Load environment variables
    from dotenv import load_dotenv
    load_dotenv()

    success = True
    if args.job in ["regular", "all"]:
        success = success and run_regular_scraping()
    if args.job in ["tiktok-captions", "all"]:
        success = success and run_tiktok_caption_job()

    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()