#!/usr/bin/env python3
"""
Production runner for the HVAC Know It All Content Aggregator.

Handles both the regular incremental scraping run and the special
overnight TikTok caption job.
"""

import argparse
import json
import logging
import os
import sys
import time
from datetime import datetime
from pathlib import Path

# Make the project root importable regardless of the working directory.
sys.path.insert(0, str(Path(__file__).parent))

from src.orchestrator import ContentOrchestrator
from src.base_scraper import ScraperConfig
from config.production import (
    SCRAPERS_CONFIG,
    PARALLEL_PROCESSING,
    OUTPUT_CONFIG,
    DATA_DIR,
    LOGS_DIR,
    TIKTOK_CAPTION_JOB,
)


def setup_logging(job_type="regular"):
    """Set up production logging with a per-job, per-day log file."""
    log_file = LOGS_DIR / f"production_{job_type}_{datetime.now():%Y%m%d}.log"
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        handlers=[
            logging.FileHandler(log_file),
            logging.StreamHandler(),
        ],
        # force=True (Python 3.8+) resets existing handlers so a second job
        # in the same process (e.g. --job all) logs to its own file;
        # basicConfig() is otherwise a no-op on repeat calls.
        force=True,
    )
    return logging.getLogger(__name__)


def validate_environment():
    """Validate that all required environment variables are set."""
    required_vars = [
        "WORDPRESS_USERNAME",
        "WORDPRESS_API_KEY",
        "YOUTUBE_CHANNEL_URL",
        "INSTAGRAM_USERNAME",
        "INSTAGRAM_PASSWORD",
        "TIKTOK_TARGET",
        "NAS_PATH",
    ]
    missing = [var for var in required_vars if not os.getenv(var)]
    if missing:
        raise ValueError(
            f"Missing required environment variables: {', '.join(missing)}"
        )
    return True


def run_regular_scraping(dry_run=False):
    """Run the regular incremental scrape across all configured sources."""
    logger = setup_logging("regular")
    logger.info("Starting regular production scraping run")

    # Validate the environment before doing any work.
    try:
        validate_environment()
        logger.info("Environment validation passed")
    except ValueError as e:
        logger.error(f"Environment validation failed: {e}")
        return False

    start_time = time.time()
    results = {}

    try:
        config = ScraperConfig(
            source_name="production",
            brand_name="hvacknowitall",
            data_dir=DATA_DIR,
            logs_dir=LOGS_DIR,
            timezone="America/Halifax",
        )
        orchestrator = ContentOrchestrator(config)

        # Run each enabled scraper in turn.
        for source, settings in SCRAPERS_CONFIG.items():
            if not settings.get("enabled", True):
                logger.info(f"Skipping {source} (disabled)")
                continue

            logger.info(f"Processing {source}...")
            try:
                scraper = orchestrator.scrapers.get(source)
                if not scraper:
                    logger.warning(f"Scraper not found: {source}")
                    continue

                # Sources name their item limit differently; use whichever is set.
                max_items = (
                    settings.get("max_posts")
                    or settings.get("max_items")
                    or settings.get("max_videos")
                )

                # Each scraper exposes a slightly different fetch signature.
                if source == "tiktok":
                    items = scraper.fetch_content(
                        max_posts=max_items,
                        fetch_captions=settings.get("fetch_captions", False),
                        max_caption_fetches=settings.get("max_caption_fetches", 0),
                    )
                elif source == "youtube":
                    items = scraper.fetch_channel_videos(max_videos=max_items)
                elif source == "instagram":
                    items = scraper.fetch_content(max_posts=max_items)
                else:
                    items = scraper.fetch_content(max_items=max_items)

                # Keep only items not seen in a previous run.
                if settings.get("incremental", True):
                    state = scraper.load_state()
                    new_items = scraper.get_incremental_items(items, state)
                    if new_items:
                        logger.info(f"Found {len(new_items)} new items for {source}")
                        new_state = scraper.update_state(state, new_items)
                        if dry_run:
                            logger.info("Dry run: skipping state save")
                        else:
                            scraper.save_state(new_state)
                        items = new_items
                    else:
                        logger.info(f"No new items for {source}")
                        items = []

                results[source] = {
                    "count": len(items),
                    "success": True,
                    "items": items,
                }
            except Exception as e:
                logger.error(f"Error processing {source}: {e}")
                results[source] = {"count": 0, "success": False, "error": str(e)}

        # Combine and save results.
        if OUTPUT_CONFIG.get("combine_sources", True):
            combined_markdown = [
                "# HVAC Know It All Content Update",
                f"Generated: {datetime.now():%Y-%m-%d %H:%M:%S}",
                "",
            ]
            for source, result in results.items():
                if result["success"] and result["count"] > 0:
                    combined_markdown.append(
                        f"\n## {source.upper()} ({result['count']} new items)"
                    )
                    combined_markdown.append("")
                    scraper = orchestrator.scrapers.get(source)
                    if scraper and result["items"]:
                        combined_markdown.append(
                            scraper.format_markdown(result["items"])
                        )

            # Spec-compliant naming: hvacknowitall_combined_YYYY-MM-DD-THHMMSS.md
            output_file = (
                DATA_DIR
                / f"hvacknowitall_combined_{datetime.now():%Y-%m-%d-T%H%M%S}.md"
            )
            output_file.write_text("\n".join(combined_markdown), encoding="utf-8")
            logger.info(f"Saved combined output to {output_file}")

        # Log summary.
        duration = time.time() - start_time
        total_items = sum(r["count"] for r in results.values())
        logger.info(
            f"Production run complete: {total_items} total items in {duration:.1f}s"
        )

        # Append one JSON line per run. Raw items are dropped from the
        # metrics because scraped item objects are not guaranteed to be
        # JSON-serializable.
        metrics = {
            "timestamp": datetime.now().isoformat(),
            "duration": duration,
            "results": {
                source: {k: v for k, v in result.items() if k != "items"}
                for source, result in results.items()
            },
        }
        with open(LOGS_DIR / "metrics.json", "a") as f:
            f.write(json.dumps(metrics) + "\n")

        # Sync to NAS only when new items were found; a sync failure should
        # not fail the entire run.
        if total_items > 0:
            try:
                logger.info("Starting NAS synchronization...")
                if orchestrator.sync_to_nas():
                    logger.info("NAS sync completed successfully")
                else:
                    logger.warning("NAS sync failed - check configuration")
            except Exception as e:
                logger.error(f"NAS sync error: {e}")

        return True

    except Exception as e:
        logger.error(f"Production run failed: {e}")
        return False


def run_tiktok_caption_job():
    """Special overnight job for fetching TikTok captions."""
    if not TIKTOK_CAPTION_JOB.get("enabled", False):
        # A disabled job is a successful no-op.
        return True

    logger = setup_logging("tiktok_captions")
    logger.info("Starting TikTok caption fetching job")

    try:
        from src.tiktok_scraper_advanced import TikTokScraperAdvanced

        config = ScraperConfig(
            source_name="tiktok_captions",
            brand_name="hvacknowitall",
            data_dir=DATA_DIR / "tiktok_captions",
            logs_dir=LOGS_DIR / "tiktok_captions",
            timezone="America/Halifax",
        )
        scraper = TikTokScraperAdvanced(config)

        # Fetch with captions enabled.
        items = scraper.fetch_content(
            max_posts=TIKTOK_CAPTION_JOB["max_posts"],
            fetch_captions=True,
            max_caption_fetches=TIKTOK_CAPTION_JOB["max_caption_fetches"],
        )

        # Save results.
        markdown = scraper.format_markdown(items)
        output_file = DATA_DIR / f"tiktok_captions_{datetime.now():%Y%m%d}.md"
        output_file.write_text(markdown, encoding="utf-8")

        logger.info(f"TikTok caption job complete: {len(items)} videos processed")
        return True

    except Exception as e:
        logger.error(f"TikTok caption job failed: {e}")
        return False


def main():
    """Main entry point."""
    parser = argparse.ArgumentParser(description="Production content aggregator")
    parser.add_argument(
        "--job",
        choices=["regular", "tiktok-captions", "all"],
        default="regular",
        help="Job type to run",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Test run without saving state",
    )
    args = parser.parse_args()

    # Load environment variables from a local .env file, if present.
    from dotenv import load_dotenv
    load_dotenv()

    # Track success per job rather than chaining with `and`, so a failed
    # regular run does not short-circuit and skip the caption job under
    # --job all.
    success = True
    if args.job in ("regular", "all"):
        if not run_regular_scraping(dry_run=args.dry_run):
            success = False
    if args.job in ("tiktok-captions", "all"):
        if not run_tiktok_caption_job():
            success = False

    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()
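
# ---------------------------------------------------------------------------
# Example usage (a sketch; assumes this file is saved as run_production.py at
# the project root, next to config/production.py and a .env file):
#
#   python run_production.py --job regular            # incremental scrape
#   python run_production.py --job tiktok-captions    # overnight caption job
#   python run_production.py --job all --dry-run      # both jobs, state not saved
#
# Illustrative crontab entries (the schedule times are assumptions, not
# project documentation):
#   0 6 * * *  cd /path/to/project && python3 run_production.py --job regular
#   0 2 * * *  cd /path/to/project && python3 run_production.py --job tiktok-captions
# ---------------------------------------------------------------------------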