Major Production Improvements:

- Added retry logic with exponential backoff using tenacity
- Implemented HTTP connection pooling via requests.Session
- Added health check monitoring with metrics reporting
- Implemented configuration validation for all numeric values
- Fixed error isolation (verified run continues on failure)

Technical Changes:

- BaseScraper: Added session management and make_request() method
- WordPressScraper: Updated all HTTP calls to use retry logic
- Production runner: Added validate_config() and health check ping
- Retry config: 3 attempts, 5-60s exponential backoff

System is now production-ready with robust error handling, automatic retries, and health monitoring. Remaining tasks focus on spec compliance (media downloads, markdown format) and testing/documentation.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
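For context, the retry and pooling changes summarized above might look roughly like the sketch below: a shared requests.Session for connection pooling plus tenacity's exponential backoff, using the 3-attempt, 5-60s settings named in the summary. The make_request() signature and the RETRY_CONFIG key names are assumptions drawn from this description, not the repository's actual BaseScraper code.

```python
# Hypothetical sketch of the retry/pooling approach described above.
# Method and config names (make_request, RETRY_CONFIG keys) follow the
# commit summary; the real BaseScraper may differ.
import requests
from tenacity import retry, stop_after_attempt, wait_exponential

RETRY_CONFIG = {"max_attempts": 3, "initial_delay": 5, "max_delay": 60}


class BaseScraper:
    def __init__(self) -> None:
        # One Session per scraper reuses TCP connections (pooling/keep-alive).
        self.session = requests.Session()

    @retry(
        stop=stop_after_attempt(RETRY_CONFIG["max_attempts"]),
        wait=wait_exponential(
            multiplier=1,
            min=RETRY_CONFIG["initial_delay"],
            max=RETRY_CONFIG["max_delay"],
        ),
        reraise=True,
    )
    def make_request(self, method: str, url: str, **kwargs) -> requests.Response:
        # raise_for_status() turns HTTP errors into exceptions so tenacity
        # retries them, with waits clamped between 5 and 60 seconds.
        response = self.session.request(method, url, timeout=30, **kwargs)
        response.raise_for_status()
        return response
```

A WordPressScraper subclass would then route its HTTP calls through self.make_request("GET", url, ...) rather than calling requests.get directly, which is how the "all HTTP calls use retry logic" change would take effect.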
#!/usr/bin/env python3
"""
Production runner for HVAC Know It All Content Aggregator
Handles both regular scraping and special TikTok caption jobs
"""
import sys
import os
import argparse
import logging
from pathlib import Path
from datetime import datetime
import time
import json

# Add project to path
sys.path.insert(0, str(Path(__file__).parent))

from src.orchestrator import ContentOrchestrator
from src.base_scraper import ScraperConfig
from config.production import (
    SCRAPERS_CONFIG,
    PARALLEL_PROCESSING,
    OUTPUT_CONFIG,
    DATA_DIR,
    LOGS_DIR,
    TIKTOK_CAPTION_JOB
)

# Set up logging
def setup_logging(job_type="regular"):
    """Set up production logging"""
    log_file = LOGS_DIR / f"production_{job_type}_{datetime.now():%Y%m%d}.log"

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_file),
            logging.StreamHandler()
        ]
    )
    return logging.getLogger(__name__)

def validate_environment():
    """Validate required environment variables exist"""
    required_vars = [
        'WORDPRESS_USERNAME',
        'WORDPRESS_API_KEY',
        'YOUTUBE_CHANNEL_URL',
        'INSTAGRAM_USERNAME',
        'INSTAGRAM_PASSWORD',
        'TIKTOK_TARGET',
        'NAS_PATH'
    ]

    missing = []
    for var in required_vars:
        if not os.getenv(var):
            missing.append(var)

    if missing:
        raise ValueError(f"Missing required environment variables: {', '.join(missing)}")

    return True

def validate_config():
    """Validate configuration values are reasonable"""
    from config.production import SCRAPERS_CONFIG, RETRY_CONFIG, PARALLEL_PROCESSING

    errors = []

    # Validate scraper configs
    for source, config in SCRAPERS_CONFIG.items():
        # Check max items are positive
        for key in ['max_posts', 'max_items', 'max_videos']:
            if key in config and config[key] is not None:
                if config[key] <= 0:
                    errors.append(f"{source}: {key} must be positive (got {config[key]})")

        # Check max_caption_fetches is reasonable
        if 'max_caption_fetches' in config:
            if config['max_caption_fetches'] < 0:
                errors.append(f"{source}: max_caption_fetches cannot be negative")
            if config['max_caption_fetches'] > 100:
                errors.append(f"{source}: max_caption_fetches too high (>100)")

    # Validate retry config
    if RETRY_CONFIG['max_attempts'] < 1:
        errors.append("RETRY_CONFIG: max_attempts must be at least 1")
    if RETRY_CONFIG['initial_delay'] < 0:
        errors.append("RETRY_CONFIG: initial_delay cannot be negative")
    if RETRY_CONFIG['max_delay'] < RETRY_CONFIG['initial_delay']:
        errors.append("RETRY_CONFIG: max_delay must be >= initial_delay")

    # Validate parallel processing
    if PARALLEL_PROCESSING.get('max_workers', 1) < 1:
        errors.append("PARALLEL_PROCESSING: max_workers must be at least 1")
    if PARALLEL_PROCESSING.get('max_workers', 1) > 10:
        errors.append("PARALLEL_PROCESSING: max_workers too high (>10)")

    if errors:
        raise ValueError("Configuration validation failed:\n" + "\n".join(errors))

    return True

def run_regular_scraping():
    """Run regular incremental scraping for all sources"""
    logger = setup_logging("regular")
    logger.info("Starting regular production scraping run")

    # Validate environment and config first
    try:
        validate_environment()
        logger.info("Environment validation passed")
        validate_config()
        logger.info("Configuration validation passed")
    except ValueError as e:
        logger.error(f"Validation failed: {e}")
        return False

    start_time = time.time()
    results = {}

    try:
        # Create orchestrator config
        config = ScraperConfig(
            source_name="production",
            brand_name="hvacknowitall",
            data_dir=DATA_DIR,
            logs_dir=LOGS_DIR,
            timezone="America/Halifax"
        )

        # Initialize orchestrator
        orchestrator = ContentOrchestrator(config)

        # Configure each scraper
        for source, settings in SCRAPERS_CONFIG.items():
            if not settings.get("enabled", True):
                logger.info(f"Skipping {source} (disabled)")
                continue

            logger.info(f"Processing {source}...")

            try:
                scraper = orchestrator.scrapers.get(source)
                if not scraper:
                    logger.warning(f"Scraper not found: {source}")
                    continue

                # Set max items based on config
                max_items = settings.get("max_posts") or settings.get("max_items") or settings.get("max_videos")

                # Special handling for TikTok
                if source == "tiktok":
                    items = scraper.fetch_content(
                        max_posts=max_items,
                        fetch_captions=settings.get("fetch_captions", False),
                        max_caption_fetches=settings.get("max_caption_fetches", 0)
                    )
                elif source == "youtube":
                    items = scraper.fetch_channel_videos(max_videos=max_items)
                elif source == "instagram":
                    items = scraper.fetch_content(max_posts=max_items)
                else:
                    items = scraper.fetch_content(max_items=max_items)

                # Apply incremental logic
                if settings.get("incremental", True):
                    state = scraper.load_state()
                    new_items = scraper.get_incremental_items(items, state)

                    if new_items:
                        logger.info(f"Found {len(new_items)} new items for {source}")
                        # Update state
                        new_state = scraper.update_state(state, new_items)
                        scraper.save_state(new_state)
                        items = new_items
                    else:
                        logger.info(f"No new items for {source}")
                        items = []

                results[source] = {
                    "count": len(items),
                    "success": True,
                    "items": items
                }

            except Exception as e:
                # Error isolation: a failure in one source does not abort the run
                logger.error(f"Error processing {source}: {e}")
                results[source] = {
                    "count": 0,
                    "success": False,
                    "error": str(e)
                }

        # Combine and save results
        if OUTPUT_CONFIG.get("combine_sources", True):
            combined_markdown = []
            combined_markdown.append("# HVAC Know It All Content Update")
            combined_markdown.append(f"Generated: {datetime.now():%Y-%m-%d %H:%M:%S}")
            combined_markdown.append("")

            for source, result in results.items():
                if result["success"] and result["count"] > 0:
                    combined_markdown.append(f"\n## {source.upper()} ({result['count']} new items)")
                    combined_markdown.append("")

                    # Format items
                    scraper = orchestrator.scrapers.get(source)
                    if scraper and result["items"]:
                        markdown = scraper.format_markdown(result["items"])
                        combined_markdown.append(markdown)

            # Save combined output with spec-compliant naming
            # Format: hvacknowitall_combined_YYYY-MM-DD-THHMMSS.md
            output_file = DATA_DIR / f"hvacknowitall_combined_{datetime.now():%Y-%m-%d-T%H%M%S}.md"
            output_file.write_text("\n".join(combined_markdown), encoding="utf-8")
            logger.info(f"Saved combined output to {output_file}")

        # Log summary
        duration = time.time() - start_time
        total_items = sum(r["count"] for r in results.values())
        logger.info(f"Production run complete: {total_items} total items in {duration:.1f}s")

        # Save metrics (one JSON object appended per run)
        metrics_file = LOGS_DIR / "metrics.json"
        metrics = {
            "timestamp": datetime.now().isoformat(),
            "duration": duration,
            "results": results
        }
        with open(metrics_file, "a") as f:
            f.write(json.dumps(metrics) + "\n")

        # Sync to NAS if configured and items were found
        if total_items > 0:
            try:
                logger.info("Starting NAS synchronization...")
                if orchestrator.sync_to_nas():
                    logger.info("NAS sync completed successfully")
                else:
                    logger.warning("NAS sync failed - check configuration")
            except Exception as e:
                logger.error(f"NAS sync error: {e}")
                # Don't fail the entire run for NAS sync issues

        # Send health check ping if configured
        healthcheck_url = os.getenv("HEALTHCHECK_URL")
        if healthcheck_url:
            try:
                import requests
                # Include metrics in health check
                health_data = {
                    "status": "success",
                    "items": total_items,
                    "duration": duration,
                    "sources": len([r for r in results.values() if r["success"]])
                }
                response = requests.post(healthcheck_url, json=health_data, timeout=5)
                if response.status_code == 200:
                    logger.info("Health check ping sent successfully")
                else:
                    logger.warning(f"Health check ping failed: {response.status_code}")
            except Exception as e:
                logger.warning(f"Could not send health check: {e}")

        return True

    except Exception as e:
        logger.error(f"Production run failed: {e}")
        return False

def run_tiktok_caption_job():
    """Special overnight job for fetching TikTok captions"""
    if not TIKTOK_CAPTION_JOB.get("enabled", False):
        return True

    logger = setup_logging("tiktok_captions")
    logger.info("Starting TikTok caption fetching job")

    try:
        from src.tiktok_scraper_advanced import TikTokScraperAdvanced

        config = ScraperConfig(
            source_name="tiktok_captions",
            brand_name="hvacknowitall",
            data_dir=DATA_DIR / "tiktok_captions",
            logs_dir=LOGS_DIR / "tiktok_captions",
            timezone="America/Halifax"
        )

        scraper = TikTokScraperAdvanced(config)

        # Fetch with captions
        items = scraper.fetch_content(
            max_posts=TIKTOK_CAPTION_JOB["max_posts"],
            fetch_captions=True,
            max_caption_fetches=TIKTOK_CAPTION_JOB["max_caption_fetches"]
        )

        # Save results
        markdown = scraper.format_markdown(items)
        output_file = DATA_DIR / f"tiktok_captions_{datetime.now():%Y%m%d}.md"
        output_file.write_text(markdown, encoding="utf-8")

        logger.info(f"TikTok caption job complete: {len(items)} videos processed")
        return True

    except Exception as e:
        logger.error(f"TikTok caption job failed: {e}")
        return False

def main():
    """Main entry point"""
    parser = argparse.ArgumentParser(description="Production content aggregator")
    parser.add_argument(
        "--job",
        choices=["regular", "tiktok-captions", "all"],
        default="regular",
        help="Job type to run"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Test run without saving state"
    )

    args = parser.parse_args()

    # Load environment variables
    from dotenv import load_dotenv
    load_dotenv()

    success = True

    # Note: --dry-run is parsed but not yet passed through to the job functions
    if args.job in ["regular", "all"]:
        success = success and run_regular_scraping()

    if args.job in ["tiktok-captions", "all"]:
        success = success and run_tiktok_caption_job()

    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()