Production Readiness Improvements:
- Fixed scheduling to match spec (8 AM & 12 PM ADT instead of 6 AM/6 PM)
- Enabled NAS synchronization in the production runner with error handling
- Fixed file naming convention to spec format (hvacknowitall_combined_YYYY-MM-DD-THHMMSS.md); see the sketch below
- Made systemd services portable (removed hardcoded user/paths)
- Added environment variable validation on startup
- Moved DISPLAY/XAUTHORITY to .env configuration

Systemd Improvements:
- Created template service file (@.service) for any user
- Changed all paths to /opt/hvac-kia-content
- Updated installation script for portable deployment
- Fixed service dependencies and resource limits

Documentation:
- Created comprehensive PRODUCTION_TODO.md with 25 tasks
- Added PRODUCTION_GUIDE.md with deployment instructions
- Documented spec compliance gaps (65% complete)

Remaining work includes retry logic, connection pooling, media downloads, and the pytest test suite, as documented in PRODUCTION_TODO.md.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
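For reference, the spec filename above is a strftime pattern. The sketch below is illustrative only (it uses an arbitrary example date and assumes Python 3.9+ `zoneinfo` for the America/Halifax timezone used by the runner); it shows the names the 8 AM and 12 PM runs would produce:

```python
from datetime import datetime
from zoneinfo import ZoneInfo  # Python 3.9+

def combined_filename(now: datetime) -> str:
    # Spec format: hvacknowitall_combined_YYYY-MM-DD-THHMMSS.md
    return f"hvacknowitall_combined_{now:%Y-%m-%d-T%H%M%S}.md"

halifax = ZoneInfo("America/Halifax")  # ADT in summer, AST in winter
for hour in (8, 12):  # the two scheduled runs per day
    run_time = datetime(2025, 6, 2, hour, 0, 0, tzinfo=halifax)  # arbitrary example date
    print(combined_filename(run_time))
# Output:
# hvacknowitall_combined_2025-06-02-T080000.md
# hvacknowitall_combined_2025-06-02-T120000.md
```

The same pattern appears in the production runner below when it writes the combined output file.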
#!/usr/bin/env python3
"""
Production runner for HVAC Know It All Content Aggregator
Handles both regular scraping and special TikTok caption jobs
"""
import sys
import os
import argparse
import logging
from pathlib import Path
from datetime import datetime
import time
import json

# Add project to path
sys.path.insert(0, str(Path(__file__).parent))

from src.orchestrator import ContentOrchestrator
from src.base_scraper import ScraperConfig
from config.production import (
    SCRAPERS_CONFIG,
    PARALLEL_PROCESSING,
    OUTPUT_CONFIG,
    DATA_DIR,
    LOGS_DIR,
    TIKTOK_CAPTION_JOB
)


# Set up logging
def setup_logging(job_type="regular"):
    """Set up production logging"""
    log_file = LOGS_DIR / f"production_{job_type}_{datetime.now():%Y%m%d}.log"

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_file),
            logging.StreamHandler()
        ],
        force=True  # reconfigure handlers on repeated calls, e.g. --job all (Python 3.8+)
    )
    return logging.getLogger(__name__)


def validate_environment():
    """Validate required environment variables exist"""
    required_vars = [
        'WORDPRESS_USERNAME',
        'WORDPRESS_API_KEY',
        'YOUTUBE_CHANNEL_URL',
        'INSTAGRAM_USERNAME',
        'INSTAGRAM_PASSWORD',
        'TIKTOK_TARGET',
        'NAS_PATH'
    ]

    missing = []
    for var in required_vars:
        if not os.getenv(var):
            missing.append(var)

    if missing:
        raise ValueError(f"Missing required environment variables: {', '.join(missing)}")

    return True


def run_regular_scraping(dry_run=False):
    """Run regular incremental scraping for all sources"""
    logger = setup_logging("regular")
    logger.info("Starting regular production scraping run")

    # Validate environment first
    try:
        validate_environment()
        logger.info("Environment validation passed")
    except ValueError as e:
        logger.error(f"Environment validation failed: {e}")
        return False

    start_time = time.time()
    results = {}

    try:
        # Create orchestrator config
        config = ScraperConfig(
            source_name="production",
            brand_name="hvacknowitall",
            data_dir=DATA_DIR,
            logs_dir=LOGS_DIR,
            timezone="America/Halifax"
        )

        # Initialize orchestrator
        orchestrator = ContentOrchestrator(config)

        # Configure each scraper
        for source, settings in SCRAPERS_CONFIG.items():
            if not settings.get("enabled", True):
                logger.info(f"Skipping {source} (disabled)")
                continue

            logger.info(f"Processing {source}...")

            try:
                scraper = orchestrator.scrapers.get(source)
                if not scraper:
                    logger.warning(f"Scraper not found: {source}")
                    continue

                # Set max items based on config
                max_items = settings.get("max_posts") or settings.get("max_items") or settings.get("max_videos")

                # Special handling for TikTok
                if source == "tiktok":
                    items = scraper.fetch_content(
                        max_posts=max_items,
                        fetch_captions=settings.get("fetch_captions", False),
                        max_caption_fetches=settings.get("max_caption_fetches", 0)
                    )
                elif source == "youtube":
                    items = scraper.fetch_channel_videos(max_videos=max_items)
                elif source == "instagram":
                    items = scraper.fetch_content(max_posts=max_items)
                else:
                    items = scraper.fetch_content(max_items=max_items)

                # Apply incremental logic
                if settings.get("incremental", True):
                    state = scraper.load_state()
                    new_items = scraper.get_incremental_items(items, state)

                    if new_items:
                        logger.info(f"Found {len(new_items)} new items for {source}")
                        # Update state (skipped on --dry-run so the next run re-fetches)
                        if not dry_run:
                            new_state = scraper.update_state(state, new_items)
                            scraper.save_state(new_state)
                        items = new_items
                    else:
                        logger.info(f"No new items for {source}")
                        items = []

                results[source] = {
                    "count": len(items),
                    "success": True,
                    "items": items
                }

            except Exception as e:
                logger.error(f"Error processing {source}: {e}")
                results[source] = {
                    "count": 0,
                    "success": False,
                    "error": str(e)
                }

        # Combine and save results
        if OUTPUT_CONFIG.get("combine_sources", True):
            combined_markdown = []
            combined_markdown.append("# HVAC Know It All Content Update")
            combined_markdown.append(f"Generated: {datetime.now():%Y-%m-%d %H:%M:%S}")
            combined_markdown.append("")

            for source, result in results.items():
                if result["success"] and result["count"] > 0:
                    combined_markdown.append(f"\n## {source.upper()} ({result['count']} new items)")
                    combined_markdown.append("")

                    # Format items
                    scraper = orchestrator.scrapers.get(source)
                    if scraper and result["items"]:
                        markdown = scraper.format_markdown(result["items"])
                        combined_markdown.append(markdown)

            # Save combined output with spec-compliant naming
            # Format: hvacknowitall_combined_YYYY-MM-DD-THHMMSS.md
            output_file = DATA_DIR / f"hvacknowitall_combined_{datetime.now():%Y-%m-%d-T%H%M%S}.md"
            output_file.write_text("\n".join(combined_markdown), encoding="utf-8")
            logger.info(f"Saved combined output to {output_file}")

        # Log summary
        duration = time.time() - start_time
        total_items = sum(r["count"] for r in results.values())
        logger.info(f"Production run complete: {total_items} total items in {duration:.1f}s")

        # Save metrics
        metrics_file = LOGS_DIR / "metrics.json"
        metrics = {
            "timestamp": datetime.now().isoformat(),
            "duration": duration,
            "results": results
        }
        with open(metrics_file, "a") as f:
            # default=str keeps the append-only metrics log writable even if
            # scraped items contain non-JSON-serializable values (e.g. datetimes)
            f.write(json.dumps(metrics, default=str) + "\n")

        # Sync to NAS if configured and items were found
        if total_items > 0:
            try:
                logger.info("Starting NAS synchronization...")
                if orchestrator.sync_to_nas():
                    logger.info("NAS sync completed successfully")
                else:
                    logger.warning("NAS sync failed - check configuration")
            except Exception as e:
                logger.error(f"NAS sync error: {e}")
                # Don't fail the entire run for NAS sync issues

        return True

    except Exception as e:
        logger.error(f"Production run failed: {e}")
        return False


def run_tiktok_caption_job():
    """Special overnight job for fetching TikTok captions"""
    if not TIKTOK_CAPTION_JOB.get("enabled", False):
        return True

    logger = setup_logging("tiktok_captions")
    logger.info("Starting TikTok caption fetching job")

    try:
        from src.tiktok_scraper_advanced import TikTokScraperAdvanced

        config = ScraperConfig(
            source_name="tiktok_captions",
            brand_name="hvacknowitall",
            data_dir=DATA_DIR / "tiktok_captions",
            logs_dir=LOGS_DIR / "tiktok_captions",
            timezone="America/Halifax"
        )

        scraper = TikTokScraperAdvanced(config)

        # Fetch with captions
        items = scraper.fetch_content(
            max_posts=TIKTOK_CAPTION_JOB["max_posts"],
            fetch_captions=True,
            max_caption_fetches=TIKTOK_CAPTION_JOB["max_caption_fetches"]
        )

        # Save results
        markdown = scraper.format_markdown(items)
        output_file = DATA_DIR / f"tiktok_captions_{datetime.now():%Y%m%d}.md"
        output_file.write_text(markdown, encoding="utf-8")

        logger.info(f"TikTok caption job complete: {len(items)} videos processed")
        return True

    except Exception as e:
        logger.error(f"TikTok caption job failed: {e}")
        return False


def main():
    """Main entry point"""
    parser = argparse.ArgumentParser(description="Production content aggregator")
    parser.add_argument(
        "--job",
        choices=["regular", "tiktok-captions", "all"],
        default="regular",
        help="Job type to run"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Test run without saving state"
    )

    args = parser.parse_args()

    # Load environment variables
    from dotenv import load_dotenv
    load_dotenv()

    success = True

    if args.job in ["regular", "all"]:
        # Call the job first so a failure doesn't short-circuit the caption job below
        success = run_regular_scraping(dry_run=args.dry_run) and success

    if args.job in ["tiktok-captions", "all"]:
        success = run_tiktok_caption_job() and success

    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()