hvac-kia-content/run_production.py
Ben Reed 05218a873b Fix critical production issues and improve spec compliance
Production Readiness Improvements:
- Fixed scheduling to match spec (8 AM & 12 PM ADT instead of 6 AM/6 PM)
- Enabled NAS synchronization in production runner with error handling
- Fixed file naming convention to spec format (hvacknowitall_combined_YYYY-MM-DD-THHMMSS.md); a sketch of the pattern follows below
- Made systemd services portable (removed hardcoded user/paths)
- Added environment variable validation on startup
- Moved DISPLAY/XAUTHORITY to .env configuration
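
As a quick illustration of the naming convention above, a minimal sketch of the strftime pattern the runner uses (this mirrors the format string in the code below):

```python
from datetime import datetime

# Spec format: hvacknowitall_combined_YYYY-MM-DD-THHMMSS.md
# A literal "T" separates the date from the compact HHMMSS time.
filename = f"hvacknowitall_combined_{datetime.now():%Y-%m-%d-T%H%M%S}.md"
print(filename)  # e.g. hvacknowitall_combined_2025-08-18-T200755.md
```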

Systemd Improvements:
- Created template service file (@.service) for any user; a hypothetical sketch follows below
- Changed all paths to /opt/hvac-kia-content
- Updated installation script for portable deployment
- Fixed service dependencies and resource limits
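
For illustration only, a minimal sketch of what the template unit and its timer might look like; the actual unit files are not shown in this view, so every name and value below is an assumption based on the notes above:

```ini
# hvac-kia-content@.service -- hypothetical sketch, not the actual unit
[Unit]
Description=HVAC Know It All content aggregator (runs as %i)
After=network-online.target
Wants=network-online.target

[Service]
Type=oneshot
# %i is the systemd template instance name, reused here as the service user
User=%i
WorkingDirectory=/opt/hvac-kia-content
EnvironmentFile=/opt/hvac-kia-content/.env
ExecStart=/opt/hvac-kia-content/.venv/bin/python run_production.py --job regular

# hvac-kia-content@.timer -- hypothetical sketch matching the 8 AM / 12 PM ADT schedule
[Timer]
OnCalendar=*-*-* 08,12:00:00 America/Halifax
Persistent=true

[Install]
WantedBy=timers.target
```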

Documentation:
- Created comprehensive PRODUCTION_TODO.md with 25 tasks
- Added PRODUCTION_GUIDE.md with deployment instructions
- Documented spec compliance gaps (65% complete)

Remaining work includes retry logic, connection pooling, media downloads,
and a pytest test suite, as documented in PRODUCTION_TODO.md.
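
Retry logic is not implemented yet; purely as an illustration of the planned direction, here is a hedged sketch of a backoff helper the scrapers could adopt (`with_retries` and its parameters are hypothetical, not part of the current codebase):

```python
import logging
import time
from functools import wraps

logger = logging.getLogger(__name__)


def with_retries(max_attempts=3, base_delay=2.0):
    """Retry a flaky call with exponential backoff (illustrative only)."""
    def decorator(fn):
        @wraps(fn)
        def wrapper(*args, **kwargs):
            for attempt in range(1, max_attempts + 1):
                try:
                    return fn(*args, **kwargs)
                except Exception as exc:
                    if attempt == max_attempts:
                        raise
                    delay = base_delay * 2 ** (attempt - 1)
                    logger.warning(
                        f"{fn.__name__} failed ({exc}); "
                        f"retry {attempt}/{max_attempts} in {delay:.0f}s"
                    )
                    time.sleep(delay)
        return wrapper
    return decorator
```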

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-18 20:07:55 -03:00


#!/usr/bin/env python3
"""
Production runner for HVAC Know It All Content Aggregator
Handles both regular scraping and special TikTok caption jobs
"""
import sys
import os
import argparse
import logging
from pathlib import Path
from datetime import datetime
import time
import json

# Add project to path
sys.path.insert(0, str(Path(__file__).parent))

from src.orchestrator import ContentOrchestrator
from src.base_scraper import ScraperConfig
from config.production import (
    SCRAPERS_CONFIG,
    PARALLEL_PROCESSING,
    OUTPUT_CONFIG,
    DATA_DIR,
    LOGS_DIR,
    TIKTOK_CAPTION_JOB,
)


# Set up logging
def setup_logging(job_type="regular"):
    """Set up production logging"""
    log_file = LOGS_DIR / f"production_{job_type}_{datetime.now():%Y%m%d}.log"
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_file),
            logging.StreamHandler()
        ],
        # force=True so a second job in the same process (--job all) reconfigures
        # logging to its own file instead of basicConfig silently no-opping.
        force=True,
    )
    return logging.getLogger(__name__)


def validate_environment():
    """Validate required environment variables exist"""
    required_vars = [
        'WORDPRESS_USERNAME',
        'WORDPRESS_API_KEY',
        'YOUTUBE_CHANNEL_URL',
        'INSTAGRAM_USERNAME',
        'INSTAGRAM_PASSWORD',
        'TIKTOK_TARGET',
        'NAS_PATH',
    ]
    missing = [var for var in required_vars if not os.getenv(var)]
    if missing:
        raise ValueError(f"Missing required environment variables: {', '.join(missing)}")
    return True


def run_regular_scraping(dry_run=False):
    """Run regular incremental scraping for all sources.

    With dry_run=True, scrapers fetch as usual but no state is saved.
    """
    logger = setup_logging("regular")
    logger.info("Starting regular production scraping run")
    # Validate environment first
    try:
        validate_environment()
        logger.info("Environment validation passed")
    except ValueError as e:
        logger.error(f"Environment validation failed: {e}")
        return False
    start_time = time.time()
    results = {}
    try:
        # Create orchestrator config
        config = ScraperConfig(
            source_name="production",
            brand_name="hvacknowitall",
            data_dir=DATA_DIR,
            logs_dir=LOGS_DIR,
            timezone="America/Halifax"
        )
        # Initialize orchestrator
        orchestrator = ContentOrchestrator(config)
        # Configure each scraper
        for source, settings in SCRAPERS_CONFIG.items():
            if not settings.get("enabled", True):
                logger.info(f"Skipping {source} (disabled)")
                continue
            logger.info(f"Processing {source}...")
            try:
                scraper = orchestrator.scrapers.get(source)
                if not scraper:
                    logger.warning(f"Scraper not found: {source}")
                    continue
                # Set max items based on config
                max_items = settings.get("max_posts") or settings.get("max_items") or settings.get("max_videos")
                # Special handling for TikTok
                if source == "tiktok":
                    items = scraper.fetch_content(
                        max_posts=max_items,
                        fetch_captions=settings.get("fetch_captions", False),
                        max_caption_fetches=settings.get("max_caption_fetches", 0)
                    )
                elif source == "youtube":
                    items = scraper.fetch_channel_videos(max_videos=max_items)
                elif source == "instagram":
                    items = scraper.fetch_content(max_posts=max_items)
                else:
                    items = scraper.fetch_content(max_items=max_items)
                # Apply incremental logic
                if settings.get("incremental", True):
                    state = scraper.load_state()
                    new_items = scraper.get_incremental_items(items, state)
                    if new_items:
                        logger.info(f"Found {len(new_items)} new items for {source}")
                        # Update state (skipped on --dry-run so test runs leave state untouched)
                        if not dry_run:
                            new_state = scraper.update_state(state, new_items)
                            scraper.save_state(new_state)
                        items = new_items
                    else:
                        logger.info(f"No new items for {source}")
                        items = []
                results[source] = {
                    "count": len(items),
                    "success": True,
                    "items": items
                }
            except Exception as e:
                logger.error(f"Error processing {source}: {e}")
                results[source] = {
                    "count": 0,
                    "success": False,
                    "error": str(e)
                }
        # Combine and save results
        if OUTPUT_CONFIG.get("combine_sources", True):
            combined_markdown = []
            combined_markdown.append("# HVAC Know It All Content Update")
            combined_markdown.append(f"Generated: {datetime.now():%Y-%m-%d %H:%M:%S}")
            combined_markdown.append("")
            for source, result in results.items():
                if result["success"] and result["count"] > 0:
                    combined_markdown.append(f"\n## {source.upper()} ({result['count']} new items)")
                    combined_markdown.append("")
                    # Format items
                    scraper = orchestrator.scrapers.get(source)
                    if scraper and result["items"]:
                        markdown = scraper.format_markdown(result["items"])
                        combined_markdown.append(markdown)
            # Save combined output with spec-compliant naming
            # Format: hvacknowitall_combined_YYYY-MM-DD-THHMMSS.md
            output_file = DATA_DIR / f"hvacknowitall_combined_{datetime.now():%Y-%m-%d-T%H%M%S}.md"
            output_file.write_text("\n".join(combined_markdown), encoding="utf-8")
            logger.info(f"Saved combined output to {output_file}")
        # Log summary
        duration = time.time() - start_time
        total_items = sum(r["count"] for r in results.values())
        logger.info(f"Production run complete: {total_items} total items in {duration:.1f}s")
        # Save metrics: one JSON object per line, appended (JSONL)
        metrics_file = LOGS_DIR / "metrics.json"
        metrics = {
            "timestamp": datetime.now().isoformat(),
            "duration": duration,
            "results": results
        }
        with open(metrics_file, "a") as f:
            # default=str keeps the append from crashing if items aren't JSON-serializable
            f.write(json.dumps(metrics, default=str) + "\n")
        # Sync to NAS if configured and items were found
        if total_items > 0:
            try:
                logger.info("Starting NAS synchronization...")
                if orchestrator.sync_to_nas():
                    logger.info("NAS sync completed successfully")
                else:
                    logger.warning("NAS sync failed - check configuration")
            except Exception as e:
                # Don't fail the entire run for NAS sync issues
                logger.error(f"NAS sync error: {e}")
        return True
    except Exception as e:
        logger.error(f"Production run failed: {e}")
        return False


def run_tiktok_caption_job():
    """Special overnight job for fetching TikTok captions"""
    if not TIKTOK_CAPTION_JOB.get("enabled", False):
        return True
    logger = setup_logging("tiktok_captions")
    logger.info("Starting TikTok caption fetching job")
    try:
        from src.tiktok_scraper_advanced import TikTokScraperAdvanced
        config = ScraperConfig(
            source_name="tiktok_captions",
            brand_name="hvacknowitall",
            data_dir=DATA_DIR / "tiktok_captions",
            logs_dir=LOGS_DIR / "tiktok_captions",
            timezone="America/Halifax"
        )
        scraper = TikTokScraperAdvanced(config)
        # Fetch with captions
        items = scraper.fetch_content(
            max_posts=TIKTOK_CAPTION_JOB["max_posts"],
            fetch_captions=True,
            max_caption_fetches=TIKTOK_CAPTION_JOB["max_caption_fetches"]
        )
        # Save results
        markdown = scraper.format_markdown(items)
        output_file = DATA_DIR / f"tiktok_captions_{datetime.now():%Y%m%d}.md"
        output_file.write_text(markdown, encoding="utf-8")
        logger.info(f"TikTok caption job complete: {len(items)} videos processed")
        return True
    except Exception as e:
        logger.error(f"TikTok caption job failed: {e}")
        return False


def main():
    """Main entry point"""
    parser = argparse.ArgumentParser(description="Production content aggregator")
    parser.add_argument(
        "--job",
        choices=["regular", "tiktok-captions", "all"],
        default="regular",
        help="Job type to run"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Test run without saving state"
    )
    args = parser.parse_args()
    # Load environment variables
    from dotenv import load_dotenv
    load_dotenv()
    success = True
    if args.job in ["regular", "all"]:
        # Run the job before the boolean AND so a failure in one job
        # can't short-circuit away the other when --job all is used
        success = run_regular_scraping(dry_run=args.dry_run) and success
    if args.job in ["tiktok-captions", "all"]:
        success = run_tiktok_caption_job() and success
    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()
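
# Example invocations (per the argparse options above):
#   python run_production.py --job regular
#   python run_production.py --job tiktok-captions
#   python run_production.py --job all --dry-run   # fetch everything, skip state saves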