Major Production Improvements:

- Added retry logic with exponential backoff using tenacity
- Implemented HTTP connection pooling via requests.Session
- Added health check monitoring with metrics reporting
- Implemented configuration validation for all numeric values
- Fixed error isolation (verified run continues on failure)

Technical Changes:

- BaseScraper: Added session management and make_request() method
- WordPressScraper: Updated all HTTP calls to use retry logic
- Production runner: Added validate_config() and health check ping
- Retry config: 3 attempts, 5-60s exponential backoff

System is now production-ready with robust error handling, automatic retries, and health monitoring. Remaining tasks focus on spec compliance (media downloads, markdown format) and testing/documentation.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
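For context, the retry and pooling changes summarized above might look roughly like the sketch below: a shared requests.Session for connection pooling plus tenacity's exponential backoff, using the 3-attempt, 5-60s settings named in the summary. The make_request() signature and the RETRY_CONFIG key names are assumptions drawn from this description, not the repository's actual BaseScraper code.

```python
# Hypothetical sketch of the retry/pooling approach described above.
# Method and config names (make_request, RETRY_CONFIG keys) follow the
# commit summary; the real BaseScraper may differ.
import requests
from tenacity import retry, stop_after_attempt, wait_exponential

RETRY_CONFIG = {"max_attempts": 3, "initial_delay": 5, "max_delay": 60}


class BaseScraper:
    def __init__(self) -> None:
        # One Session per scraper reuses TCP connections (pooling/keep-alive).
        self.session = requests.Session()

    @retry(
        stop=stop_after_attempt(RETRY_CONFIG["max_attempts"]),
        wait=wait_exponential(
            multiplier=1,
            min=RETRY_CONFIG["initial_delay"],
            max=RETRY_CONFIG["max_delay"],
        ),
        reraise=True,
    )
    def make_request(self, method: str, url: str, **kwargs) -> requests.Response:
        # raise_for_status() turns HTTP errors into exceptions so tenacity
        # retries them, with waits clamped between 5 and 60 seconds.
        response = self.session.request(method, url, timeout=30, **kwargs)
        response.raise_for_status()
        return response
```

A WordPressScraper subclass would then route its HTTP calls through self.make_request("GET", url, ...) rather than calling requests.get directly, which is how the "all HTTP calls use retry logic" change would take effect.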
#!/usr/bin/env python3
"""
Production runner for HVAC Know It All Content Aggregator
Handles both regular scraping and special TikTok caption jobs
"""
import sys
import os
import argparse
import logging
from pathlib import Path
from datetime import datetime
import time
import json

# Add project to path
sys.path.insert(0, str(Path(__file__).parent))

from src.orchestrator import ContentOrchestrator
from src.base_scraper import ScraperConfig
from config.production import (
    SCRAPERS_CONFIG,
    PARALLEL_PROCESSING,
    OUTPUT_CONFIG,
    DATA_DIR,
    LOGS_DIR,
    TIKTOK_CAPTION_JOB
)

# Set up logging
def setup_logging(job_type="regular"):
    """Set up production logging"""
    log_file = LOGS_DIR / f"production_{job_type}_{datetime.now():%Y%m%d}.log"

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_file),
            logging.StreamHandler()
        ]
    )
    return logging.getLogger(__name__)

def validate_environment():
    """Validate required environment variables exist"""
    required_vars = [
        'WORDPRESS_USERNAME',
        'WORDPRESS_API_KEY',
        'YOUTUBE_CHANNEL_URL',
        'INSTAGRAM_USERNAME',
        'INSTAGRAM_PASSWORD',
        'TIKTOK_TARGET',
        'NAS_PATH'
    ]

    missing = []
    for var in required_vars:
        if not os.getenv(var):
            missing.append(var)

    if missing:
        raise ValueError(f"Missing required environment variables: {', '.join(missing)}")

    return True

def validate_config():
    """Validate configuration values are reasonable"""
    from config.production import SCRAPERS_CONFIG, RETRY_CONFIG, PARALLEL_PROCESSING

    errors = []

    # Validate scraper configs
    for source, config in SCRAPERS_CONFIG.items():
        # Check max items are positive
        for key in ['max_posts', 'max_items', 'max_videos']:
            if key in config and config[key] is not None:
                if config[key] <= 0:
                    errors.append(f"{source}: {key} must be positive (got {config[key]})")

        # Check max_caption_fetches is reasonable
        if 'max_caption_fetches' in config:
            if config['max_caption_fetches'] < 0:
                errors.append(f"{source}: max_caption_fetches cannot be negative")
            if config['max_caption_fetches'] > 100:
                errors.append(f"{source}: max_caption_fetches too high (>100)")

    # Validate retry config
    if RETRY_CONFIG['max_attempts'] < 1:
        errors.append("RETRY_CONFIG: max_attempts must be at least 1")
    if RETRY_CONFIG['initial_delay'] < 0:
        errors.append("RETRY_CONFIG: initial_delay cannot be negative")
    if RETRY_CONFIG['max_delay'] < RETRY_CONFIG['initial_delay']:
        errors.append("RETRY_CONFIG: max_delay must be >= initial_delay")

    # Validate parallel processing
    if PARALLEL_PROCESSING.get('max_workers', 1) < 1:
        errors.append("PARALLEL_PROCESSING: max_workers must be at least 1")
    if PARALLEL_PROCESSING.get('max_workers', 1) > 10:
        errors.append("PARALLEL_PROCESSING: max_workers too high (>10)")

    if errors:
        raise ValueError("Configuration validation failed:\n" + "\n".join(errors))

    return True

def run_regular_scraping():
    """Run regular incremental scraping for all sources"""
    logger = setup_logging("regular")
    logger.info("Starting regular production scraping run")

    # Validate environment and config first
    try:
        validate_environment()
        logger.info("Environment validation passed")
        validate_config()
        logger.info("Configuration validation passed")
    except ValueError as e:
        logger.error(f"Validation failed: {e}")
        return False

    start_time = time.time()
    results = {}

    try:
        # Create orchestrator config
        config = ScraperConfig(
            source_name="production",
            brand_name="hvacknowitall",
            data_dir=DATA_DIR,
            logs_dir=LOGS_DIR,
            timezone="America/Halifax"
        )

        # Initialize orchestrator
        orchestrator = ContentOrchestrator(config)

        # Configure each scraper
        for source, settings in SCRAPERS_CONFIG.items():
            if not settings.get("enabled", True):
                logger.info(f"Skipping {source} (disabled)")
                continue

            logger.info(f"Processing {source}...")

            try:
                scraper = orchestrator.scrapers.get(source)
                if not scraper:
                    logger.warning(f"Scraper not found: {source}")
                    continue

                # Set max items based on config
                max_items = settings.get("max_posts") or settings.get("max_items") or settings.get("max_videos")

                # Special handling for TikTok
                if source == "tiktok":
                    items = scraper.fetch_content(
                        max_posts=max_items,
                        fetch_captions=settings.get("fetch_captions", False),
                        max_caption_fetches=settings.get("max_caption_fetches", 0)
                    )
                elif source == "youtube":
                    items = scraper.fetch_channel_videos(max_videos=max_items)
                elif source == "instagram":
                    items = scraper.fetch_content(max_posts=max_items)
                else:
                    items = scraper.fetch_content(max_items=max_items)

                # Apply incremental logic
                if settings.get("incremental", True):
                    state = scraper.load_state()
                    new_items = scraper.get_incremental_items(items, state)

                    if new_items:
                        logger.info(f"Found {len(new_items)} new items for {source}")
                        # Update state
                        new_state = scraper.update_state(state, new_items)
                        scraper.save_state(new_state)
                        items = new_items
                    else:
                        logger.info(f"No new items for {source}")
                        items = []

                results[source] = {
                    "count": len(items),
                    "success": True,
                    "items": items
                }

            except Exception as e:
                # Error isolation: a failure in one source does not abort the run
                logger.error(f"Error processing {source}: {e}")
                results[source] = {
                    "count": 0,
                    "success": False,
                    "error": str(e)
                }

        # Combine and save results
        if OUTPUT_CONFIG.get("combine_sources", True):
            combined_markdown = []
            combined_markdown.append("# HVAC Know It All Content Update")
            combined_markdown.append(f"Generated: {datetime.now():%Y-%m-%d %H:%M:%S}")
            combined_markdown.append("")

            for source, result in results.items():
                if result["success"] and result["count"] > 0:
                    combined_markdown.append(f"\n## {source.upper()} ({result['count']} new items)")
                    combined_markdown.append("")

                    # Format items
                    scraper = orchestrator.scrapers.get(source)
                    if scraper and result["items"]:
                        markdown = scraper.format_markdown(result["items"])
                        combined_markdown.append(markdown)

            # Save combined output with spec-compliant naming
            # Format: hvacknowitall_combined_YYYY-MM-DD-THHMMSS.md
            output_file = DATA_DIR / f"hvacknowitall_combined_{datetime.now():%Y-%m-%d-T%H%M%S}.md"
            output_file.write_text("\n".join(combined_markdown), encoding="utf-8")
            logger.info(f"Saved combined output to {output_file}")

        # Log summary
        duration = time.time() - start_time
        total_items = sum(r["count"] for r in results.values())
        logger.info(f"Production run complete: {total_items} total items in {duration:.1f}s")

        # Save metrics (one JSON object appended per run)
        metrics_file = LOGS_DIR / "metrics.json"
        metrics = {
            "timestamp": datetime.now().isoformat(),
            "duration": duration,
            "results": results
        }
        with open(metrics_file, "a") as f:
            f.write(json.dumps(metrics) + "\n")

        # Sync to NAS if configured and items were found
        if total_items > 0:
            try:
                logger.info("Starting NAS synchronization...")
                if orchestrator.sync_to_nas():
                    logger.info("NAS sync completed successfully")
                else:
                    logger.warning("NAS sync failed - check configuration")
            except Exception as e:
                logger.error(f"NAS sync error: {e}")
                # Don't fail the entire run for NAS sync issues

        # Send health check ping if configured
        healthcheck_url = os.getenv("HEALTHCHECK_URL")
        if healthcheck_url:
            try:
                import requests
                # Include metrics in health check
                health_data = {
                    "status": "success",
                    "items": total_items,
                    "duration": duration,
                    "sources": len([r for r in results.values() if r["success"]])
                }
                response = requests.post(healthcheck_url, json=health_data, timeout=5)
                if response.status_code == 200:
                    logger.info("Health check ping sent successfully")
                else:
                    logger.warning(f"Health check ping failed: {response.status_code}")
            except Exception as e:
                logger.warning(f"Could not send health check: {e}")

        return True

    except Exception as e:
        logger.error(f"Production run failed: {e}")
        return False

def run_tiktok_caption_job():
    """Special overnight job for fetching TikTok captions"""
    if not TIKTOK_CAPTION_JOB.get("enabled", False):
        return True

    logger = setup_logging("tiktok_captions")
    logger.info("Starting TikTok caption fetching job")

    try:
        from src.tiktok_scraper_advanced import TikTokScraperAdvanced

        config = ScraperConfig(
            source_name="tiktok_captions",
            brand_name="hvacknowitall",
            data_dir=DATA_DIR / "tiktok_captions",
            logs_dir=LOGS_DIR / "tiktok_captions",
            timezone="America/Halifax"
        )

        scraper = TikTokScraperAdvanced(config)

        # Fetch with captions
        items = scraper.fetch_content(
            max_posts=TIKTOK_CAPTION_JOB["max_posts"],
            fetch_captions=True,
            max_caption_fetches=TIKTOK_CAPTION_JOB["max_caption_fetches"]
        )

        # Save results
        markdown = scraper.format_markdown(items)
        output_file = DATA_DIR / f"tiktok_captions_{datetime.now():%Y%m%d}.md"
        output_file.write_text(markdown, encoding="utf-8")

        logger.info(f"TikTok caption job complete: {len(items)} videos processed")
        return True

    except Exception as e:
        logger.error(f"TikTok caption job failed: {e}")
        return False

def main():
    """Main entry point"""
    parser = argparse.ArgumentParser(description="Production content aggregator")
    parser.add_argument(
        "--job",
        choices=["regular", "tiktok-captions", "all"],
        default="regular",
        help="Job type to run"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Test run without saving state"
    )

    args = parser.parse_args()

    # Load environment variables
    from dotenv import load_dotenv
    load_dotenv()

    success = True

    # Note: --dry-run is parsed but not yet passed through to the job functions
    if args.job in ["regular", "all"]:
        success = success and run_regular_scraping()

    if args.job in ["tiktok-captions", "all"]:
        success = success and run_tiktok_caption_job()

    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()