#!/usr/bin/env python3
"""
Monitoring script for the HVAC Know It All Content Aggregation System.

This script provides:
1. System and application metrics collection
2. Log monitoring
3. Alert checks and persistence
4. Health report generation
5. Cleanup of old metric/alert files
"""

import json
import logging
import os
import sys
import time
from datetime import datetime, timedelta
from pathlib import Path
from typing import Any, Dict, List

import psutil

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


class SystemMonitor:
    """Monitor system health and performance metrics."""

    def __init__(self, data_dir: Path = None, logs_dir: Path = None):
        self.data_dir = data_dir or Path("/opt/hvac-kia-content/data")
        self.logs_dir = logs_dir or Path("/opt/hvac-kia-content/logs")

        # Use relative monitoring paths when custom data/logs dirs are provided
        if data_dir or logs_dir:
            base_dir = (data_dir or logs_dir).parent
            self.metrics_dir = base_dir / "monitoring" / "metrics"
            self.alerts_dir = base_dir / "monitoring" / "alerts"
        else:
            self.metrics_dir = Path("/opt/hvac-kia-content/monitoring/metrics")
            self.alerts_dir = Path("/opt/hvac-kia-content/monitoring/alerts")

        # Create monitoring directories
        self.metrics_dir.mkdir(parents=True, exist_ok=True)
        self.alerts_dir.mkdir(parents=True, exist_ok=True)

    def collect_system_metrics(self) -> Dict[str, Any]:
        """Collect system-level metrics."""
        try:
            # CPU, memory, and disk
            cpu_percent = psutil.cpu_percent(interval=1)
            memory = psutil.virtual_memory()
            disk = psutil.disk_usage('/')

            # Network (if available); counters may be unsupported on some hosts
            try:
                network = psutil.net_io_counters()
                network_stats = {
                    'bytes_sent': network.bytes_sent,
                    'bytes_recv': network.bytes_recv,
                    'packets_sent': network.packets_sent,
                    'packets_recv': network.packets_recv
                }
            except Exception:
                network_stats = None

            metrics = {
                'timestamp': datetime.now().isoformat(),
                'system': {
                    'cpu_percent': cpu_percent,
                    'memory_percent': memory.percent,
                    'memory_available_gb': memory.available / (1024**3),
                    'disk_percent': disk.percent,
                    'disk_free_gb': disk.free / (1024**3),
                    'load_average': os.getloadavg() if hasattr(os, 'getloadavg') else None,
                    'uptime_hours': (time.time() - psutil.boot_time()) / 3600
                },
                'network': network_stats
            }

            return metrics

        except Exception as e:
            logger.error(f"Error collecting system metrics: {e}")
            return {'error': str(e), 'timestamp': datetime.now().isoformat()}
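
    # Illustrative shape of a collect_system_metrics() snapshot (sample values,
    # not real measurements; 'network' is None on hosts without counters, and
    # 'load_average' is None on platforms lacking os.getloadavg):
    #
    #   {
    #       "timestamp": "2025-01-01T12:00:00",
    #       "system": {"cpu_percent": 12.5, "memory_percent": 41.2,
    #                  "memory_available_gb": 9.3, "disk_percent": 63.0,
    #                  "disk_free_gb": 112.4, "load_average": [0.4, 0.5, 0.6],
    #                  "uptime_hours": 72.1},
    #       "network": {"bytes_sent": 123456, "bytes_recv": 654321,
    #                   "packets_sent": 1000, "packets_recv": 1200}
    #   }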
    def collect_application_metrics(self) -> Dict[str, Any]:
        """Collect application-specific metrics."""
        try:
            metrics = {
                'timestamp': datetime.now().isoformat(),
                'data_directories': {},
                'log_files': {},
                'scrapers': {}
            }

            # Check data directory sizes
            if self.data_dir.exists():
                for subdir in ['markdown_current', 'markdown_archives', 'media', '.state']:
                    dir_path = self.data_dir / subdir
                    if dir_path.exists():
                        size_mb = sum(f.stat().st_size for f in dir_path.rglob('*') if f.is_file()) / (1024**2)
                        file_count = sum(1 for f in dir_path.rglob('*') if f.is_file())
                        metrics['data_directories'][subdir] = {
                            'size_mb': round(size_mb, 2),
                            'file_count': file_count
                        }

            # Check log file sizes and recent activity
            if self.logs_dir.exists():
                for source_dir in self.logs_dir.iterdir():
                    if source_dir.is_dir():
                        log_files = list(source_dir.glob('*.log'))
                        if log_files:
                            latest_log = max(log_files, key=lambda f: f.stat().st_mtime)
                            size_mb = latest_log.stat().st_size / (1024**2)
                            last_modified = datetime.fromtimestamp(latest_log.stat().st_mtime)
                            metrics['log_files'][source_dir.name] = {
                                'size_mb': round(size_mb, 2),
                                'last_modified': last_modified.isoformat(),
                                'minutes_since_update': (datetime.now() - last_modified).total_seconds() / 60
                            }

            # Check scraper state files
            state_dir = self.data_dir / '.state'
            if state_dir.exists():
                for state_file in state_dir.glob('*_state.json'):
                    try:
                        with open(state_file) as f:
                            state_data = json.load(f)

                        scraper_name = state_file.stem.replace('_state', '')
                        last_update = state_data.get('last_update')
                        if last_update:
                            # Stored timestamps may carry a trailing 'Z'; drop the
                            # timezone so the comparison against naive now() works
                            last_update_dt = datetime.fromisoformat(last_update.replace('Z', '+00:00'))
                            minutes_since = (datetime.now() - last_update_dt.replace(tzinfo=None)).total_seconds() / 60
                        else:
                            minutes_since = None

                        metrics['scrapers'][scraper_name] = {
                            'last_item_count': state_data.get('last_item_count', 0),
                            'last_update': last_update,
                            'minutes_since_update': minutes_since,
                            'last_id': state_data.get('last_id')
                        }
                    except Exception as e:
                        logger.warning(f"Error reading state file {state_file}: {e}")

            return metrics

        except Exception as e:
            logger.error(f"Error collecting application metrics: {e}")
            return {'error': str(e), 'timestamp': datetime.now().isoformat()}

    def save_metrics(self, metrics: Dict[str, Any], metric_type: str):
        """Save metrics to a timestamped JSON file."""
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = f"{metric_type}_{timestamp}.json"
        filepath = self.metrics_dir / filename

        try:
            with open(filepath, 'w') as f:
                json.dump(metrics, f, indent=2)
            logger.info(f"Saved {metric_type} metrics to {filepath}")
        except Exception as e:
            logger.error(f"Error saving metrics to {filepath}: {e}")

    def check_alerts(self, system_metrics: Dict[str, Any],
                     app_metrics: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Check for alert conditions."""
        alerts = []

        try:
            # System alerts
            if 'system' in system_metrics:
                sys_stats = system_metrics['system']

                if sys_stats.get('cpu_percent', 0) > 80:
                    alerts.append({
                        'type': 'CRITICAL',
                        'component': 'system',
                        'message': f"High CPU usage: {sys_stats['cpu_percent']:.1f}%",
                        'timestamp': datetime.now().isoformat()
                    })

                if sys_stats.get('memory_percent', 0) > 85:
                    alerts.append({
                        'type': 'CRITICAL',
                        'component': 'system',
                        'message': f"High memory usage: {sys_stats['memory_percent']:.1f}%",
                        'timestamp': datetime.now().isoformat()
                    })

                if sys_stats.get('disk_percent', 0) > 90:
                    alerts.append({
                        'type': 'CRITICAL',
                        'component': 'system',
                        'message': f"High disk usage: {sys_stats['disk_percent']:.1f}%",
                        'timestamp': datetime.now().isoformat()
                    })

            # Application alerts
            if 'scrapers' in app_metrics:
                for scraper_name, scraper_data in app_metrics['scrapers'].items():
                    minutes_since = scraper_data.get('minutes_since_update')
                    if minutes_since and minutes_since > 1440:  # 24 hours
                        alerts.append({
                            'type': 'WARNING',
                            'component': f'scraper_{scraper_name}',
                            'message': f"Scraper {scraper_name} hasn't updated in {minutes_since / 60:.1f} hours",
                            'timestamp': datetime.now().isoformat()
                        })

            # Log file alerts
            if 'log_files' in app_metrics:
                for source, log_data in app_metrics['log_files'].items():
                    if log_data.get('size_mb', 0) > 100:  # flag log files over 100 MB
                        alerts.append({
                            'type': 'WARNING',
                            'component': f'logs_{source}',
                            'message': f"Large log file for {source}: {log_data['size_mb']:.1f}MB",
                            'timestamp': datetime.now().isoformat()
                        })

        except Exception as e:
            logger.error(f"Error checking alerts: {e}")
            alerts.append({
                'type': 'ERROR',
                'component': 'monitoring',
                'message': f"Alert check failed: {e}",
                'timestamp': datetime.now().isoformat()
            })

        return alerts
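
    # The thresholds above (80% CPU, 85% memory, 90% disk, 24 h scraper
    # staleness, 100 MB logs) are hard-coded. A minimal sketch for making them
    # tunable via environment variables (these variable names are assumptions,
    # not an existing config):
    #
    #   CPU_ALERT_PCT = float(os.environ.get('HVAC_MON_CPU_PCT', '80'))
    #   MEM_ALERT_PCT = float(os.environ.get('HVAC_MON_MEM_PCT', '85'))
    #   DISK_ALERT_PCT = float(os.environ.get('HVAC_MON_DISK_PCT', '90'))
    #
    # and then compare, e.g., sys_stats.get('cpu_percent', 0) > CPU_ALERT_PCT.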
    def save_alerts(self, alerts: List[Dict[str, Any]]):
        """Save alerts to a timestamped JSON file."""
        if not alerts:
            return

        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = f"alerts_{timestamp}.json"
        filepath = self.alerts_dir / filename

        try:
            with open(filepath, 'w') as f:
                json.dump(alerts, f, indent=2)

            # Also log critical and warning alerts
            for alert in alerts:
                if alert['type'] == 'CRITICAL':
                    logger.critical(f"ALERT: {alert['message']}")
                elif alert['type'] == 'WARNING':
                    logger.warning(f"ALERT: {alert['message']}")
        except Exception as e:
            logger.error(f"Error saving alerts to {filepath}: {e}")

    def generate_health_report(self) -> Dict[str, Any]:
        """Generate a comprehensive health report."""
        logger.info("Generating health report...")

        # Collect metrics
        system_metrics = self.collect_system_metrics()
        app_metrics = self.collect_application_metrics()

        # Check alerts
        alerts = self.check_alerts(system_metrics, app_metrics)

        # Save to files
        self.save_metrics(system_metrics, 'system')
        self.save_metrics(app_metrics, 'application')
        if alerts:
            self.save_alerts(alerts)

        # Generate summary; CRITICAL outranks WARNING, which outranks ERROR
        health_status = 'HEALTHY'
        if any(alert['type'] == 'CRITICAL' for alert in alerts):
            health_status = 'CRITICAL'
        elif any(alert['type'] == 'WARNING' for alert in alerts):
            health_status = 'WARNING'
        elif any(alert['type'] == 'ERROR' for alert in alerts):
            health_status = 'ERROR'

        report = {
            'timestamp': datetime.now().isoformat(),
            'health_status': health_status,
            'system_metrics': system_metrics,
            'application_metrics': app_metrics,
            'alerts': alerts,
            'summary': {
                'total_alerts': len(alerts),
                'critical_alerts': len([a for a in alerts if a['type'] == 'CRITICAL']),
                'warning_alerts': len([a for a in alerts if a['type'] == 'WARNING']),
                'error_alerts': len([a for a in alerts if a['type'] == 'ERROR'])
            }
        }

        return report

    def cleanup_old_metrics(self, days_to_keep: int = 7):
        """Clean up metric and alert files older than days_to_keep."""
        cutoff_date = datetime.now() - timedelta(days=days_to_keep)

        for metrics_file in self.metrics_dir.glob('*.json'):
            try:
                file_date = datetime.fromtimestamp(metrics_file.stat().st_mtime)
                if file_date < cutoff_date:
                    metrics_file.unlink()
                    logger.info(f"Cleaned up old metrics file: {metrics_file}")
            except Exception as e:
                logger.warning(f"Error cleaning up {metrics_file}: {e}")

        for alerts_file in self.alerts_dir.glob('*.json'):
            try:
                file_date = datetime.fromtimestamp(alerts_file.stat().st_mtime)
                if file_date < cutoff_date:
                    alerts_file.unlink()
                    logger.info(f"Cleaned up old alerts file: {alerts_file}")
            except Exception as e:
                logger.warning(f"Error cleaning up {alerts_file}: {e}")
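
# A health check is only useful if it runs regularly. One illustrative way to
# schedule it is a cron entry (a sketch; the script path and log location are
# assumptions based on the default /opt/hvac-kia-content layout above, and the
# filename should match whatever this module is saved as):
#
#   */15 * * * * /usr/bin/python3 /opt/hvac-kia-content/monitoring_setup.py >> /opt/hvac-kia-content/logs/monitoring/cron.log 2>&1
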
def main():
    """Run one monitoring pass and print a human-readable summary."""
    logger.info("Starting monitoring system...")

    monitor = SystemMonitor()

    # Generate health report
    health_report = monitor.generate_health_report()

    # Save full health report
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    report_file = monitor.metrics_dir / f"health_report_{timestamp}.json"
    try:
        with open(report_file, 'w') as f:
            json.dump(health_report, f, indent=2)
        logger.info(f"Health report saved to {report_file}")
    except Exception as e:
        logger.error(f"Error saving health report: {e}")

    # Print summary
    print(f"\n{'=' * 60}")
    print("HVAC KNOW IT ALL - SYSTEM HEALTH REPORT")
    print(f"{'=' * 60}")
    print(f"Status: {health_report['health_status']}")
    print(f"Timestamp: {health_report['timestamp']}")
    print(f"Total Alerts: {health_report['summary']['total_alerts']}")

    if health_report['summary']['critical_alerts'] > 0:
        print(f"🔴 Critical Alerts: {health_report['summary']['critical_alerts']}")
    if health_report['summary']['warning_alerts'] > 0:
        print(f"🟡 Warning Alerts: {health_report['summary']['warning_alerts']}")
    if health_report['summary']['error_alerts'] > 0:
        print(f"🟠 Error Alerts: {health_report['summary']['error_alerts']}")

    if health_report['alerts']:
        print("\nRecent Alerts:")
        for alert in health_report['alerts'][-5:]:  # Show last 5 alerts
            emoji = "🔴" if alert['type'] == 'CRITICAL' else "🟡" if alert['type'] == 'WARNING' else "🟠"
            print(f"  {emoji} {alert['component']}: {alert['message']}")

    # System summary; format each value only when present, print N/A otherwise
    if 'system' in health_report['system_metrics']:
        sys_stats = health_report['system_metrics']['system']
        print("\nSystem Resources:")
        for label, key in [('CPU', 'cpu_percent'), ('Memory', 'memory_percent'), ('Disk', 'disk_percent')]:
            value = sys_stats.get(key)
            print(f"  {label}: {value:.1f}%" if value is not None else f"  {label}: N/A")

    # Scraper summary
    if 'scrapers' in health_report['application_metrics']:
        scrapers = health_report['application_metrics']['scrapers']
        print(f"\nScraper Status ({len(scrapers)} scrapers):")
        for name, data in scrapers.items():
            last_count = data.get('last_item_count', 0)
            minutes_since = data.get('minutes_since_update')
            if minutes_since is not None:
                hours_since = minutes_since / 60
                time_str = f"{hours_since:.1f}h ago" if hours_since > 1 else f"{minutes_since:.0f}m ago"
            else:
                time_str = "Never"
            print(f"  {name}: {last_count} items, last update {time_str}")

    print(f"{'=' * 60}\n")

    # Clean up old files
    monitor.cleanup_old_metrics()

    return health_report['health_status'] == 'HEALTHY'


if __name__ == '__main__':
    try:
        success = main()
        sys.exit(0 if success else 1)
    except Exception as e:
        logger.critical(f"Monitoring failed: {e}")
        sys.exit(2)
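
# Programmatic use against a local checkout instead of the default /opt layout
# (a sketch; the module name below is an assumption, rename it to match this
# file):
#
#   from monitoring_setup import SystemMonitor
#
#   monitor = SystemMonitor(data_dir=Path('./data'), logs_dir=Path('./logs'))
#   report = monitor.generate_health_report()
#   print(report['health_status'], report['summary']['total_alerts'])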