Production Readiness Improvements:
- Fixed scheduling to match spec (8 AM & 12 PM ADT instead of 6 AM/6 PM)
- Enabled NAS synchronization in the production runner with error handling
- Fixed file naming convention to spec format (hvacknowitall_combined_YYYY-MM-DD-THHMMSS.md); see the sketch below
- Made systemd services portable (removed hardcoded user/paths)
- Added environment variable validation on startup
- Moved DISPLAY/XAUTHORITY to .env configuration

Systemd Improvements:
- Created template service file (@.service) for any user
- Changed all paths to /opt/hvac-kia-content
- Updated installation script for portable deployment
- Fixed service dependencies and resource limits

Documentation:
- Created comprehensive PRODUCTION_TODO.md with 25 tasks
- Added PRODUCTION_GUIDE.md with deployment instructions
- Documented spec compliance gaps (65% complete)

Remaining work includes retry logic, connection pooling, media downloads, and the pytest test suite, as documented in PRODUCTION_TODO.md.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
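For reference, the spec filename above is a strftime pattern. The sketch below is illustrative only (it uses an arbitrary example date and assumes Python 3.9+ `zoneinfo` for the America/Halifax timezone used by the runner); it shows the names the 8 AM and 12 PM runs would produce:

```python
from datetime import datetime
from zoneinfo import ZoneInfo  # Python 3.9+

def combined_filename(now: datetime) -> str:
    # Spec format: hvacknowitall_combined_YYYY-MM-DD-THHMMSS.md
    return f"hvacknowitall_combined_{now:%Y-%m-%d-T%H%M%S}.md"

halifax = ZoneInfo("America/Halifax")  # ADT in summer, AST in winter
for hour in (8, 12):  # the two scheduled runs per day
    run_time = datetime(2025, 6, 2, hour, 0, 0, tzinfo=halifax)  # arbitrary example date
    print(combined_filename(run_time))
# Output:
# hvacknowitall_combined_2025-06-02-T080000.md
# hvacknowitall_combined_2025-06-02-T120000.md
```

The same pattern appears in the production runner below when it writes the combined output file.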
#!/usr/bin/env python3
"""
Production runner for HVAC Know It All Content Aggregator
Handles both regular scraping and special TikTok caption jobs
"""
import sys
import os
import argparse
import logging
from pathlib import Path
from datetime import datetime
import time
import json

# Add project to path
sys.path.insert(0, str(Path(__file__).parent))

from src.orchestrator import ContentOrchestrator
from src.base_scraper import ScraperConfig
from config.production import (
    SCRAPERS_CONFIG,
    PARALLEL_PROCESSING,
    OUTPUT_CONFIG,
    DATA_DIR,
    LOGS_DIR,
    TIKTOK_CAPTION_JOB
)


# Set up logging
def setup_logging(job_type="regular"):
    """Set up production logging"""
    log_file = LOGS_DIR / f"production_{job_type}_{datetime.now():%Y%m%d}.log"

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_file),
            logging.StreamHandler()
        ],
        force=True  # reconfigure handlers on repeated calls, e.g. --job all (Python 3.8+)
    )
    return logging.getLogger(__name__)


def validate_environment():
    """Validate required environment variables exist"""
    required_vars = [
        'WORDPRESS_USERNAME',
        'WORDPRESS_API_KEY',
        'YOUTUBE_CHANNEL_URL',
        'INSTAGRAM_USERNAME',
        'INSTAGRAM_PASSWORD',
        'TIKTOK_TARGET',
        'NAS_PATH'
    ]

    missing = []
    for var in required_vars:
        if not os.getenv(var):
            missing.append(var)

    if missing:
        raise ValueError(f"Missing required environment variables: {', '.join(missing)}")

    return True


def run_regular_scraping(dry_run=False):
    """Run regular incremental scraping for all sources"""
    logger = setup_logging("regular")
    logger.info("Starting regular production scraping run")

    # Validate environment first
    try:
        validate_environment()
        logger.info("Environment validation passed")
    except ValueError as e:
        logger.error(f"Environment validation failed: {e}")
        return False

    start_time = time.time()
    results = {}

    try:
        # Create orchestrator config
        config = ScraperConfig(
            source_name="production",
            brand_name="hvacknowitall",
            data_dir=DATA_DIR,
            logs_dir=LOGS_DIR,
            timezone="America/Halifax"
        )

        # Initialize orchestrator
        orchestrator = ContentOrchestrator(config)

        # Configure each scraper
        for source, settings in SCRAPERS_CONFIG.items():
            if not settings.get("enabled", True):
                logger.info(f"Skipping {source} (disabled)")
                continue

            logger.info(f"Processing {source}...")

            try:
                scraper = orchestrator.scrapers.get(source)
                if not scraper:
                    logger.warning(f"Scraper not found: {source}")
                    continue

                # Set max items based on config
                max_items = settings.get("max_posts") or settings.get("max_items") or settings.get("max_videos")

                # Special handling for TikTok
                if source == "tiktok":
                    items = scraper.fetch_content(
                        max_posts=max_items,
                        fetch_captions=settings.get("fetch_captions", False),
                        max_caption_fetches=settings.get("max_caption_fetches", 0)
                    )
                elif source == "youtube":
                    items = scraper.fetch_channel_videos(max_videos=max_items)
                elif source == "instagram":
                    items = scraper.fetch_content(max_posts=max_items)
                else:
                    items = scraper.fetch_content(max_items=max_items)

                # Apply incremental logic
                if settings.get("incremental", True):
                    state = scraper.load_state()
                    new_items = scraper.get_incremental_items(items, state)

                    if new_items:
                        logger.info(f"Found {len(new_items)} new items for {source}")
                        # Update state (skipped on --dry-run so the next run re-fetches)
                        if not dry_run:
                            new_state = scraper.update_state(state, new_items)
                            scraper.save_state(new_state)
                        items = new_items
                    else:
                        logger.info(f"No new items for {source}")
                        items = []

                results[source] = {
                    "count": len(items),
                    "success": True,
                    "items": items
                }

            except Exception as e:
                logger.error(f"Error processing {source}: {e}")
                results[source] = {
                    "count": 0,
                    "success": False,
                    "error": str(e)
                }

        # Combine and save results
        if OUTPUT_CONFIG.get("combine_sources", True):
            combined_markdown = []
            combined_markdown.append("# HVAC Know It All Content Update")
            combined_markdown.append(f"Generated: {datetime.now():%Y-%m-%d %H:%M:%S}")
            combined_markdown.append("")

            for source, result in results.items():
                if result["success"] and result["count"] > 0:
                    combined_markdown.append(f"\n## {source.upper()} ({result['count']} new items)")
                    combined_markdown.append("")

                    # Format items
                    scraper = orchestrator.scrapers.get(source)
                    if scraper and result["items"]:
                        markdown = scraper.format_markdown(result["items"])
                        combined_markdown.append(markdown)

            # Save combined output with spec-compliant naming
            # Format: hvacknowitall_combined_YYYY-MM-DD-THHMMSS.md
            output_file = DATA_DIR / f"hvacknowitall_combined_{datetime.now():%Y-%m-%d-T%H%M%S}.md"
            output_file.write_text("\n".join(combined_markdown), encoding="utf-8")
            logger.info(f"Saved combined output to {output_file}")

        # Log summary
        duration = time.time() - start_time
        total_items = sum(r["count"] for r in results.values())
        logger.info(f"Production run complete: {total_items} total items in {duration:.1f}s")

        # Save metrics
        metrics_file = LOGS_DIR / "metrics.json"
        metrics = {
            "timestamp": datetime.now().isoformat(),
            "duration": duration,
            "results": results
        }
        with open(metrics_file, "a") as f:
            # default=str keeps the append-only metrics log writable even if
            # scraped items contain non-JSON-serializable values (e.g. datetimes)
            f.write(json.dumps(metrics, default=str) + "\n")

        # Sync to NAS if configured and items were found
        if total_items > 0:
            try:
                logger.info("Starting NAS synchronization...")
                if orchestrator.sync_to_nas():
                    logger.info("NAS sync completed successfully")
                else:
                    logger.warning("NAS sync failed - check configuration")
            except Exception as e:
                logger.error(f"NAS sync error: {e}")
                # Don't fail the entire run for NAS sync issues

        return True

    except Exception as e:
        logger.error(f"Production run failed: {e}")
        return False


def run_tiktok_caption_job():
    """Special overnight job for fetching TikTok captions"""
    if not TIKTOK_CAPTION_JOB.get("enabled", False):
        return True

    logger = setup_logging("tiktok_captions")
    logger.info("Starting TikTok caption fetching job")

    try:
        from src.tiktok_scraper_advanced import TikTokScraperAdvanced

        config = ScraperConfig(
            source_name="tiktok_captions",
            brand_name="hvacknowitall",
            data_dir=DATA_DIR / "tiktok_captions",
            logs_dir=LOGS_DIR / "tiktok_captions",
            timezone="America/Halifax"
        )

        scraper = TikTokScraperAdvanced(config)

        # Fetch with captions
        items = scraper.fetch_content(
            max_posts=TIKTOK_CAPTION_JOB["max_posts"],
            fetch_captions=True,
            max_caption_fetches=TIKTOK_CAPTION_JOB["max_caption_fetches"]
        )

        # Save results
        markdown = scraper.format_markdown(items)
        output_file = DATA_DIR / f"tiktok_captions_{datetime.now():%Y%m%d}.md"
        output_file.write_text(markdown, encoding="utf-8")

        logger.info(f"TikTok caption job complete: {len(items)} videos processed")
        return True

    except Exception as e:
        logger.error(f"TikTok caption job failed: {e}")
        return False


def main():
    """Main entry point"""
    parser = argparse.ArgumentParser(description="Production content aggregator")
    parser.add_argument(
        "--job",
        choices=["regular", "tiktok-captions", "all"],
        default="regular",
        help="Job type to run"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Test run without saving state"
    )

    args = parser.parse_args()

    # Load environment variables
    from dotenv import load_dotenv
    load_dotenv()

    success = True

    if args.job in ["regular", "all"]:
        # Call the job first so a failure doesn't short-circuit the caption job below
        success = run_regular_scraping(dry_run=args.dry_run) and success

    if args.job in ["tiktok-captions", "all"]:
        success = run_tiktok_caption_job() and success

    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()