hvac-kia-content/monitoring/setup_monitoring.py
Ben Reed dc57ce80d5 Add comprehensive monitoring and alerting system
- Created SystemMonitor class for health check monitoring
- Implemented system metrics collection (CPU, memory, disk, network)
- Added application metrics monitoring (scrapers, logs, data sizes)
- Built alert system with configurable thresholds
- Developed HTML dashboard generator with real-time charts
- Added systemd services for automated monitoring (15-min intervals)
- Created responsive web dashboard with Bootstrap and Chart.js
- Implemented automatic cleanup of old metric files
- Added comprehensive documentation and troubleshooting guide

Features:
- Real-time system resource monitoring
- Scraper performance tracking and alerts
- Interactive dashboard with trend charts
- Email-ready alert notifications
- Systemd integration for production deployment
- Security hardening with minimal privileges
- Auto-refresh dashboard every 5 minutes
- 7-day metric retention with automatic cleanup

Alert conditions:
- Critical: CPU >80%, Memory >85%, Disk >90%
- Warning: Scraper inactive >24h, Log files >100MB
- Error: Monitoring failures, configuration issues
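
Alerts are written under monitoring/alerts/ as timestamped JSON files
(alerts_YYYYMMDD_HHMMSS.json). A representative record, with illustrative
values, looks like:

    {
      "type": "CRITICAL",
      "component": "system",
      "message": "High CPU usage: 91.5%",
      "timestamp": "2025-08-18T21:35:28"
    }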

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-18 21:35:28 -03:00

#!/usr/bin/env python3
"""
Monitoring setup script for the HVAC Know It All Content Aggregation System.

This script sets up:
1. Health check endpoints
2. Metrics collection
3. Log monitoring
4. Alert configuration
5. Dashboard generation
"""

import json
import logging
import os
import time
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

import psutil

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
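
# Running this file directly performs one monitoring pass and prints a console
# summary. Exit codes (see the __main__ block below): 0 = healthy, 1 = alerts
# were raised, 2 = the monitoring run itself failed. In production the script
# is driven by the 15-minute systemd timer noted in the commit message; a
# timer stanza at that cadence (unit names hypothetical) would be roughly
# OnUnitActiveSec=15min with a small OnBootSec delay.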


class SystemMonitor:
    """Monitor system health and performance metrics."""

    def __init__(self, data_dir: Optional[Path] = None, logs_dir: Optional[Path] = None):
        self.data_dir = data_dir or Path("/opt/hvac-kia-content/data")
        self.logs_dir = logs_dir or Path("/opt/hvac-kia-content/logs")

        # Use relative monitoring paths when custom data/logs dirs are provided
        if data_dir or logs_dir:
            base_dir = (data_dir or logs_dir).parent
            self.metrics_dir = base_dir / "monitoring" / "metrics"
            self.alerts_dir = base_dir / "monitoring" / "alerts"
        else:
            self.metrics_dir = Path("/opt/hvac-kia-content/monitoring/metrics")
            self.alerts_dir = Path("/opt/hvac-kia-content/monitoring/alerts")

        # Create monitoring directories
        self.metrics_dir.mkdir(parents=True, exist_ok=True)
        self.alerts_dir.mkdir(parents=True, exist_ok=True)

    def collect_system_metrics(self) -> Dict[str, Any]:
        """Collect system-level metrics."""
        try:
            # CPU, memory, and disk
            cpu_percent = psutil.cpu_percent(interval=1)
            memory = psutil.virtual_memory()
            disk = psutil.disk_usage('/')

            # Network (if available)
            try:
                network = psutil.net_io_counters()
                network_stats = {
                    'bytes_sent': network.bytes_sent,
                    'bytes_recv': network.bytes_recv,
                    'packets_sent': network.packets_sent,
                    'packets_recv': network.packets_recv
                }
            except Exception:  # network counters are optional; avoid a bare except
                network_stats = None

            metrics = {
                'timestamp': datetime.now().isoformat(),
                'system': {
                    'cpu_percent': cpu_percent,
                    'memory_percent': memory.percent,
                    'memory_available_gb': memory.available / (1024**3),
                    'disk_percent': disk.percent,
                    'disk_free_gb': disk.free / (1024**3),
                    'load_average': os.getloadavg() if hasattr(os, 'getloadavg') else None,
                    'uptime_hours': (time.time() - psutil.boot_time()) / 3600
                },
                'network': network_stats
            }
            return metrics
        except Exception as e:
            logger.error(f"Error collecting system metrics: {e}")
            return {'error': str(e), 'timestamp': datetime.now().isoformat()}

    def collect_application_metrics(self) -> Dict[str, Any]:
        """Collect application-specific metrics."""
        try:
            metrics = {
                'timestamp': datetime.now().isoformat(),
                'data_directories': {},
                'log_files': {},
                'scrapers': {}
            }

            # Check data directory sizes
            if self.data_dir.exists():
                for subdir in ['markdown_current', 'markdown_archives', 'media', '.state']:
                    dir_path = self.data_dir / subdir
                    if dir_path.exists():
                        size_mb = sum(f.stat().st_size for f in dir_path.rglob('*') if f.is_file()) / (1024**2)
                        file_count = sum(1 for f in dir_path.rglob('*') if f.is_file())
                        metrics['data_directories'][subdir] = {
                            'size_mb': round(size_mb, 2),
                            'file_count': file_count
                        }

            # Check log file sizes and recent activity
            if self.logs_dir.exists():
                for source_dir in self.logs_dir.iterdir():
                    if source_dir.is_dir():
                        log_files = list(source_dir.glob('*.log'))
                        if log_files:
                            latest_log = max(log_files, key=lambda f: f.stat().st_mtime)
                            size_mb = latest_log.stat().st_size / (1024**2)
                            last_modified = datetime.fromtimestamp(latest_log.stat().st_mtime)
                            metrics['log_files'][source_dir.name] = {
                                'size_mb': round(size_mb, 2),
                                'last_modified': last_modified.isoformat(),
                                'minutes_since_update': (datetime.now() - last_modified).total_seconds() / 60
                            }

            # Check scraper state files
            state_dir = self.data_dir / '.state'
            if state_dir.exists():
                for state_file in state_dir.glob('*_state.json'):
                    try:
                        with open(state_file) as f:
                            state_data = json.load(f)
                        scraper_name = state_file.stem.replace('_state', '')
                        last_update = state_data.get('last_update')
                        if last_update:
                            last_update_dt = datetime.fromisoformat(last_update.replace('Z', '+00:00'))
                            # Compare in UTC so a "Z"-suffixed timestamp is not
                            # measured against the local wall clock
                            if last_update_dt.tzinfo is None:
                                last_update_dt = last_update_dt.replace(tzinfo=timezone.utc)
                            minutes_since = (datetime.now(timezone.utc) - last_update_dt).total_seconds() / 60
                        else:
                            minutes_since = None
                        metrics['scrapers'][scraper_name] = {
                            'last_item_count': state_data.get('last_item_count', 0),
                            'last_update': last_update,
                            'minutes_since_update': minutes_since,
                            'last_id': state_data.get('last_id')
                        }
                    except Exception as e:
                        logger.warning(f"Error reading state file {state_file}: {e}")

            return metrics
        except Exception as e:
            logger.error(f"Error collecting application metrics: {e}")
            return {'error': str(e), 'timestamp': datetime.now().isoformat()}
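
    # The scraper checks above assume a state-file layout of
    # data/.state/<scraper>_state.json. A representative file, using only the
    # keys that are actually read above (values illustrative):
    #
    #   {
    #     "last_update": "2025-08-18T12:00:00Z",
    #     "last_item_count": 42,
    #     "last_id": "abc123"
    #   }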

    def save_metrics(self, metrics: Dict[str, Any], metric_type: str):
        """Save metrics to a timestamped JSON file."""
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = f"{metric_type}_{timestamp}.json"
        filepath = self.metrics_dir / filename
        try:
            with open(filepath, 'w') as f:
                json.dump(metrics, f, indent=2)
            logger.info(f"Saved {metric_type} metrics to {filepath}")
        except Exception as e:
            logger.error(f"Error saving metrics to {filepath}: {e}")

    def check_alerts(self, system_metrics: Dict[str, Any], app_metrics: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Check for alert conditions."""
        alerts = []
        try:
            # System alerts
            if 'system' in system_metrics:
                sys_stats = system_metrics['system']
                if sys_stats.get('cpu_percent', 0) > 80:
                    alerts.append({
                        'type': 'CRITICAL',
                        'component': 'system',
                        'message': f"High CPU usage: {sys_stats['cpu_percent']:.1f}%",
                        'timestamp': datetime.now().isoformat()
                    })
                if sys_stats.get('memory_percent', 0) > 85:
                    alerts.append({
                        'type': 'CRITICAL',
                        'component': 'system',
                        'message': f"High memory usage: {sys_stats['memory_percent']:.1f}%",
                        'timestamp': datetime.now().isoformat()
                    })
                if sys_stats.get('disk_percent', 0) > 90:
                    alerts.append({
                        'type': 'CRITICAL',
                        'component': 'system',
                        'message': f"High disk usage: {sys_stats['disk_percent']:.1f}%",
                        'timestamp': datetime.now().isoformat()
                    })

            # Application alerts
            if 'scrapers' in app_metrics:
                for scraper_name, scraper_data in app_metrics['scrapers'].items():
                    minutes_since = scraper_data.get('minutes_since_update')
                    if minutes_since is not None and minutes_since > 1440:  # 24 hours
                        alerts.append({
                            'type': 'WARNING',
                            'component': f'scraper_{scraper_name}',
                            'message': f"Scraper {scraper_name} hasn't updated in {minutes_since/60:.1f} hours",
                            'timestamp': datetime.now().isoformat()
                        })

            # Log file alerts
            if 'log_files' in app_metrics:
                for source, log_data in app_metrics['log_files'].items():
                    if log_data.get('size_mb', 0) > 100:  # 100 MB log files
                        alerts.append({
                            'type': 'WARNING',
                            'component': f'logs_{source}',
                            'message': f"Large log file for {source}: {log_data['size_mb']:.1f}MB",
                            'timestamp': datetime.now().isoformat()
                        })
        except Exception as e:
            logger.error(f"Error checking alerts: {e}")
            alerts.append({
                'type': 'ERROR',
                'component': 'monitoring',
                'message': f"Alert check failed: {e}",
                'timestamp': datetime.now().isoformat()
            })
        return alerts

    def save_alerts(self, alerts: List[Dict[str, Any]]):
        """Save alerts to a timestamped JSON file and log them."""
        if not alerts:
            return
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = f"alerts_{timestamp}.json"
        filepath = self.alerts_dir / filename
        try:
            with open(filepath, 'w') as f:
                json.dump(alerts, f, indent=2)
            # Also log critical and warning alerts
            for alert in alerts:
                if alert['type'] == 'CRITICAL':
                    logger.critical(f"ALERT: {alert['message']}")
                elif alert['type'] == 'WARNING':
                    logger.warning(f"ALERT: {alert['message']}")
        except Exception as e:
            logger.error(f"Error saving alerts to {filepath}: {e}")

    def generate_health_report(self) -> Dict[str, Any]:
        """Generate a comprehensive health report."""
        logger.info("Generating health report...")

        # Collect metrics
        system_metrics = self.collect_system_metrics()
        app_metrics = self.collect_application_metrics()

        # Check alerts
        alerts = self.check_alerts(system_metrics, app_metrics)

        # Save to files
        self.save_metrics(system_metrics, 'system')
        self.save_metrics(app_metrics, 'application')
        if alerts:
            self.save_alerts(alerts)

        # Generate summary
        health_status = 'HEALTHY'
        if any(alert['type'] == 'CRITICAL' for alert in alerts):
            health_status = 'CRITICAL'
        elif any(alert['type'] == 'WARNING' for alert in alerts):
            health_status = 'WARNING'
        elif any(alert['type'] == 'ERROR' for alert in alerts):
            health_status = 'ERROR'

        report = {
            'timestamp': datetime.now().isoformat(),
            'health_status': health_status,
            'system_metrics': system_metrics,
            'application_metrics': app_metrics,
            'alerts': alerts,
            'summary': {
                'total_alerts': len(alerts),
                'critical_alerts': len([a for a in alerts if a['type'] == 'CRITICAL']),
                'warning_alerts': len([a for a in alerts if a['type'] == 'WARNING']),
                'error_alerts': len([a for a in alerts if a['type'] == 'ERROR'])
            }
        }
        return report

    def cleanup_old_metrics(self, days_to_keep: int = 7):
        """Clean up metric and alert files older than the retention window."""
        cutoff_date = datetime.now() - timedelta(days=days_to_keep)
        for metrics_file in self.metrics_dir.glob('*.json'):
            try:
                file_date = datetime.fromtimestamp(metrics_file.stat().st_mtime)
                if file_date < cutoff_date:
                    metrics_file.unlink()
                    logger.info(f"Cleaned up old metrics file: {metrics_file}")
            except Exception as e:
                logger.warning(f"Error cleaning up {metrics_file}: {e}")
        for alerts_file in self.alerts_dir.glob('*.json'):
            try:
                file_date = datetime.fromtimestamp(alerts_file.stat().st_mtime)
                if file_date < cutoff_date:
                    alerts_file.unlink()
                    logger.info(f"Cleaned up old alerts file: {alerts_file}")
            except Exception as e:
                logger.warning(f"Error cleaning up {alerts_file}: {e}")


def main():
    """Run one monitoring pass and print a console summary."""
    logger.info("Starting monitoring system...")
    monitor = SystemMonitor()

    # Generate health report
    health_report = monitor.generate_health_report()

    # Save full health report
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    report_file = monitor.metrics_dir / f"health_report_{timestamp}.json"
    try:
        with open(report_file, 'w') as f:
            json.dump(health_report, f, indent=2)
        logger.info(f"Health report saved to {report_file}")
    except Exception as e:
        logger.error(f"Error saving health report: {e}")

    # Print summary
    print(f"\n{'='*60}")
    print("HVAC KNOW IT ALL - SYSTEM HEALTH REPORT")
    print(f"{'='*60}")
    print(f"Status: {health_report['health_status']}")
    print(f"Timestamp: {health_report['timestamp']}")
    print(f"Total Alerts: {health_report['summary']['total_alerts']}")
    if health_report['summary']['critical_alerts'] > 0:
        print(f"🔴 Critical Alerts: {health_report['summary']['critical_alerts']}")
    if health_report['summary']['warning_alerts'] > 0:
        print(f"🟡 Warning Alerts: {health_report['summary']['warning_alerts']}")
    if health_report['summary']['error_alerts'] > 0:
        print(f"🟠 Error Alerts: {health_report['summary']['error_alerts']}")

    if health_report['alerts']:
        print("\nRecent Alerts:")
        for alert in health_report['alerts'][-5:]:  # Show last 5 alerts
            emoji = "🔴" if alert['type'] == 'CRITICAL' else "🟡" if alert['type'] == 'WARNING' else "🟠"
            print(f"  {emoji} {alert['component']}: {alert['message']}")

    # System summary (guard missing values, which would break the :.1f format)
    if 'system' in health_report['system_metrics']:
        sys_stats = health_report['system_metrics']['system']
        print("\nSystem Resources:")
        for label, key in (('CPU', 'cpu_percent'), ('Memory', 'memory_percent'), ('Disk', 'disk_percent')):
            value = sys_stats.get(key)
            print(f"  {label}: {value:.1f}%" if value is not None else f"  {label}: N/A")

    # Scraper summary
    if 'scrapers' in health_report['application_metrics']:
        scrapers = health_report['application_metrics']['scrapers']
        print(f"\nScraper Status ({len(scrapers)} scrapers):")
        for name, data in scrapers.items():
            last_count = data.get('last_item_count', 0)
            minutes_since = data.get('minutes_since_update')
            if minutes_since is not None:
                hours_since = minutes_since / 60
                time_str = f"{hours_since:.1f}h ago" if hours_since > 1 else f"{minutes_since:.0f}m ago"
            else:
                time_str = "Never"
            print(f"  {name}: {last_count} items, last update {time_str}")
    print(f"{'='*60}\n")

    # Clean up old files
    monitor.cleanup_old_metrics()
    return health_report['health_status'] == 'HEALTHY'


if __name__ == '__main__':
    try:
        success = main()
        exit(0 if success else 1)
    except Exception as e:
        logger.critical(f"Monitoring failed: {e}")
        exit(2)
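
# A minimal programmatic-usage sketch (paths hypothetical): the constructor
# maps custom data/logs directories to a sibling monitoring/ tree, so reports
# can be generated outside the default /opt/hvac-kia-content install:
#
#   from pathlib import Path
#   from setup_monitoring import SystemMonitor
#
#   monitor = SystemMonitor(data_dir=Path("/srv/hvac/data"),
#                           logs_dir=Path("/srv/hvac/logs"))
#   report = monitor.generate_health_report()
#   print(report["health_status"], report["summary"]["total_alerts"])
#   monitor.cleanup_old_metrics(days_to_keep=7)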