- Created SystemMonitor class for health check monitoring
- Implemented system metrics collection (CPU, memory, disk, network)
- Added application metrics monitoring (scrapers, logs, data sizes)
- Built alert system with configurable thresholds
- Developed HTML dashboard generator with real-time charts
- Added systemd services for automated monitoring (15-min intervals)
- Created responsive web dashboard with Bootstrap and Chart.js
- Implemented automatic cleanup of old metric files
- Added comprehensive documentation and troubleshooting guide

Features:
- Real-time system resource monitoring
- Scraper performance tracking and alerts
- Interactive dashboard with trend charts
- Email-ready alert notifications
- Systemd integration for production deployment
- Security hardening with minimal privileges
- Auto-refresh dashboard every 5 minutes
- 7-day metric retention with automatic cleanup

Alert conditions:
- Critical: CPU >80%, Memory >85%, Disk >90%
- Warning: Scraper inactive >24h, Log files >100MB
- Error: Monitoring failures, configuration issues

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
404 lines · No EOL · 16 KiB · Python · Executable file
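The script below defines the SystemMonitor class described in the commit message; the dashboard generator and systemd units presumably live elsewhere in the commit. As a minimal usage sketch — assuming the file is importable as setup_monitoring (the actual filename is not shown on this page) and that the example paths are writable — the monitor can also be driven directly from Python instead of the 15-minute systemd timer:

# Hypothetical usage sketch; module name and paths are assumptions, not from the commit.
from pathlib import Path

from setup_monitoring import SystemMonitor  # assumed module name

# Passing custom data/logs dirs places monitoring output under their parent directory
monitor = SystemMonitor(
    data_dir=Path("/tmp/hvac-demo/data"),
    logs_dir=Path("/tmp/hvac-demo/logs"),
)

report = monitor.generate_health_report()    # collects metrics, evaluates alerts, writes JSON
print(report["health_status"])               # HEALTHY, WARNING, ERROR, or CRITICAL
monitor.cleanup_old_metrics(days_to_keep=7)  # prune metric/alert files older than 7 days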
#!/usr/bin/env python3
"""
Monitoring setup script for HVAC Know It All Content Aggregation System

This script sets up:
1. Health check endpoints
2. Metrics collection
3. Log monitoring
4. Alert configuration
5. Dashboard generation
"""

import json
import os
import time
from pathlib import Path
from typing import Dict, List, Any
from datetime import datetime, timedelta
import psutil
import logging

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

class SystemMonitor:
    """Monitor system health and performance metrics"""

    def __init__(self, data_dir: Path = None, logs_dir: Path = None):
        self.data_dir = data_dir or Path("/opt/hvac-kia-content/data")
        self.logs_dir = logs_dir or Path("/opt/hvac-kia-content/logs")

        # Use relative monitoring paths when custom data/logs dirs are provided
        if data_dir or logs_dir:
            base_dir = (data_dir or logs_dir).parent
            self.metrics_dir = base_dir / "monitoring" / "metrics"
            self.alerts_dir = base_dir / "monitoring" / "alerts"
        else:
            self.metrics_dir = Path("/opt/hvac-kia-content/monitoring/metrics")
            self.alerts_dir = Path("/opt/hvac-kia-content/monitoring/alerts")

        # Create monitoring directories
        self.metrics_dir.mkdir(parents=True, exist_ok=True)
        self.alerts_dir.mkdir(parents=True, exist_ok=True)

    def collect_system_metrics(self) -> Dict[str, Any]:
        """Collect system-level metrics"""
        try:
            # CPU and Memory
            cpu_percent = psutil.cpu_percent(interval=1)
            memory = psutil.virtual_memory()
            disk = psutil.disk_usage('/')

            # Network (if available)
            try:
                network = psutil.net_io_counters()
                network_stats = {
                    'bytes_sent': network.bytes_sent,
                    'bytes_recv': network.bytes_recv,
                    'packets_sent': network.packets_sent,
                    'packets_recv': network.packets_recv
                }
            except Exception:
                network_stats = None

            metrics = {
                'timestamp': datetime.now().isoformat(),
                'system': {
                    'cpu_percent': cpu_percent,
                    'memory_percent': memory.percent,
                    'memory_available_gb': memory.available / (1024**3),
                    'disk_percent': disk.percent,
                    'disk_free_gb': disk.free / (1024**3),
                    'load_average': os.getloadavg() if hasattr(os, 'getloadavg') else None,
                    'uptime_hours': (time.time() - psutil.boot_time()) / 3600
                },
                'network': network_stats
            }

            return metrics

        except Exception as e:
            logger.error(f"Error collecting system metrics: {e}")
            return {'error': str(e), 'timestamp': datetime.now().isoformat()}

    def collect_application_metrics(self) -> Dict[str, Any]:
        """Collect application-specific metrics"""
        try:
            metrics = {
                'timestamp': datetime.now().isoformat(),
                'data_directories': {},
                'log_files': {},
                'scrapers': {}
            }

            # Check data directory sizes
            if self.data_dir.exists():
                for subdir in ['markdown_current', 'markdown_archives', 'media', '.state']:
                    dir_path = self.data_dir / subdir
                    if dir_path.exists():
                        size_mb = sum(f.stat().st_size for f in dir_path.rglob('*') if f.is_file()) / (1024**2)
                        file_count = sum(1 for f in dir_path.rglob('*') if f.is_file())
                        metrics['data_directories'][subdir] = {
                            'size_mb': round(size_mb, 2),
                            'file_count': file_count
                        }

            # Check log file sizes and recent activity
            if self.logs_dir.exists():
                for source_dir in self.logs_dir.iterdir():
                    if source_dir.is_dir():
                        log_files = list(source_dir.glob('*.log'))
                        if log_files:
                            latest_log = max(log_files, key=lambda f: f.stat().st_mtime)
                            size_mb = latest_log.stat().st_size / (1024**2)
                            last_modified = datetime.fromtimestamp(latest_log.stat().st_mtime)

                            metrics['log_files'][source_dir.name] = {
                                'size_mb': round(size_mb, 2),
                                'last_modified': last_modified.isoformat(),
                                'minutes_since_update': (datetime.now() - last_modified).total_seconds() / 60
                            }

            # Check scraper state files
            state_dir = self.data_dir / '.state'
            if state_dir.exists():
                for state_file in state_dir.glob('*_state.json'):
                    try:
                        with open(state_file) as f:
                            state_data = json.load(f)

                        scraper_name = state_file.stem.replace('_state', '')
                        last_update = state_data.get('last_update')
                        if last_update:
                            last_update_dt = datetime.fromisoformat(last_update.replace('Z', '+00:00'))
                            minutes_since = (datetime.now() - last_update_dt.replace(tzinfo=None)).total_seconds() / 60
                        else:
                            minutes_since = None

                        metrics['scrapers'][scraper_name] = {
                            'last_item_count': state_data.get('last_item_count', 0),
                            'last_update': last_update,
                            'minutes_since_update': minutes_since,
                            'last_id': state_data.get('last_id')
                        }
                    except Exception as e:
                        logger.warning(f"Error reading state file {state_file}: {e}")

            return metrics

        except Exception as e:
            logger.error(f"Error collecting application metrics: {e}")
            return {'error': str(e), 'timestamp': datetime.now().isoformat()}

    def save_metrics(self, metrics: Dict[str, Any], metric_type: str):
        """Save metrics to file with timestamp"""
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = f"{metric_type}_{timestamp}.json"
        filepath = self.metrics_dir / filename

        try:
            with open(filepath, 'w') as f:
                json.dump(metrics, f, indent=2)
            logger.info(f"Saved {metric_type} metrics to {filepath}")
        except Exception as e:
            logger.error(f"Error saving metrics to {filepath}: {e}")

    def check_alerts(self, system_metrics: Dict[str, Any], app_metrics: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Check for alert conditions"""
        alerts = []

        try:
            # System alerts
            if 'system' in system_metrics:
                sys = system_metrics['system']

                if sys.get('cpu_percent', 0) > 80:
                    alerts.append({
                        'type': 'CRITICAL',
                        'component': 'system',
                        'message': f"High CPU usage: {sys['cpu_percent']:.1f}%",
                        'timestamp': datetime.now().isoformat()
                    })

                if sys.get('memory_percent', 0) > 85:
                    alerts.append({
                        'type': 'CRITICAL',
                        'component': 'system',
                        'message': f"High memory usage: {sys['memory_percent']:.1f}%",
                        'timestamp': datetime.now().isoformat()
                    })

                if sys.get('disk_percent', 0) > 90:
                    alerts.append({
                        'type': 'CRITICAL',
                        'component': 'system',
                        'message': f"High disk usage: {sys['disk_percent']:.1f}%",
                        'timestamp': datetime.now().isoformat()
                    })

            # Application alerts
            if 'scrapers' in app_metrics:
                for scraper_name, scraper_data in app_metrics['scrapers'].items():
                    minutes_since = scraper_data.get('minutes_since_update')
                    if minutes_since and minutes_since > 1440:  # 24 hours
                        alerts.append({
                            'type': 'WARNING',
                            'component': f'scraper_{scraper_name}',
                            'message': f"Scraper {scraper_name} hasn't updated in {minutes_since/60:.1f} hours",
                            'timestamp': datetime.now().isoformat()
                        })

            # Log file alerts
            if 'log_files' in app_metrics:
                for source, log_data in app_metrics['log_files'].items():
                    if log_data.get('size_mb', 0) > 100:  # 100MB log files
                        alerts.append({
                            'type': 'WARNING',
                            'component': f'logs_{source}',
                            'message': f"Large log file for {source}: {log_data['size_mb']:.1f}MB",
                            'timestamp': datetime.now().isoformat()
                        })

        except Exception as e:
            logger.error(f"Error checking alerts: {e}")
            alerts.append({
                'type': 'ERROR',
                'component': 'monitoring',
                'message': f"Alert check failed: {e}",
                'timestamp': datetime.now().isoformat()
            })

        return alerts

    def save_alerts(self, alerts: List[Dict[str, Any]]):
        """Save alerts to file"""
        if not alerts:
            return

        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = f"alerts_{timestamp}.json"
        filepath = self.alerts_dir / filename

        try:
            with open(filepath, 'w') as f:
                json.dump(alerts, f, indent=2)

            # Also log critical alerts
            for alert in alerts:
                if alert['type'] == 'CRITICAL':
                    logger.critical(f"ALERT: {alert['message']}")
                elif alert['type'] == 'WARNING':
                    logger.warning(f"ALERT: {alert['message']}")

        except Exception as e:
            logger.error(f"Error saving alerts to {filepath}: {e}")

    def generate_health_report(self) -> Dict[str, Any]:
        """Generate comprehensive health report"""
        logger.info("Generating health report...")

        # Collect metrics
        system_metrics = self.collect_system_metrics()
        app_metrics = self.collect_application_metrics()

        # Check alerts
        alerts = self.check_alerts(system_metrics, app_metrics)

        # Save to files
        self.save_metrics(system_metrics, 'system')
        self.save_metrics(app_metrics, 'application')
        if alerts:
            self.save_alerts(alerts)

        # Generate summary
        health_status = 'HEALTHY'
        if any(alert['type'] == 'CRITICAL' for alert in alerts):
            health_status = 'CRITICAL'
        elif any(alert['type'] == 'WARNING' for alert in alerts):
            health_status = 'WARNING'
        elif any(alert['type'] == 'ERROR' for alert in alerts):
            health_status = 'ERROR'

        report = {
            'timestamp': datetime.now().isoformat(),
            'health_status': health_status,
            'system_metrics': system_metrics,
            'application_metrics': app_metrics,
            'alerts': alerts,
            'summary': {
                'total_alerts': len(alerts),
                'critical_alerts': len([a for a in alerts if a['type'] == 'CRITICAL']),
                'warning_alerts': len([a for a in alerts if a['type'] == 'WARNING']),
                'error_alerts': len([a for a in alerts if a['type'] == 'ERROR'])
            }
        }

        return report

    def cleanup_old_metrics(self, days_to_keep: int = 7):
        """Clean up old metric files"""
        cutoff_date = datetime.now() - timedelta(days=days_to_keep)

        for metrics_file in self.metrics_dir.glob('*.json'):
            try:
                file_date = datetime.fromtimestamp(metrics_file.stat().st_mtime)
                if file_date < cutoff_date:
                    metrics_file.unlink()
                    logger.info(f"Cleaned up old metrics file: {metrics_file}")
            except Exception as e:
                logger.warning(f"Error cleaning up {metrics_file}: {e}")

        for alerts_file in self.alerts_dir.glob('*.json'):
            try:
                file_date = datetime.fromtimestamp(alerts_file.stat().st_mtime)
                if file_date < cutoff_date:
                    alerts_file.unlink()
                    logger.info(f"Cleaned up old alerts file: {alerts_file}")
            except Exception as e:
                logger.warning(f"Error cleaning up {alerts_file}: {e}")


def main():
    """Main monitoring function"""
    logger.info("Starting monitoring system...")

    monitor = SystemMonitor()

    # Generate health report
    health_report = monitor.generate_health_report()

    # Save full health report
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    report_file = monitor.metrics_dir / f"health_report_{timestamp}.json"

    try:
        with open(report_file, 'w') as f:
            json.dump(health_report, f, indent=2)
        logger.info(f"Health report saved to {report_file}")
    except Exception as e:
        logger.error(f"Error saving health report: {e}")

    # Print summary
    print(f"\n{'='*60}")
    print(f"HVAC KNOW IT ALL - SYSTEM HEALTH REPORT")
    print(f"{'='*60}")
    print(f"Status: {health_report['health_status']}")
    print(f"Timestamp: {health_report['timestamp']}")
    print(f"Total Alerts: {health_report['summary']['total_alerts']}")

    if health_report['summary']['critical_alerts'] > 0:
        print(f"🔴 Critical Alerts: {health_report['summary']['critical_alerts']}")
    if health_report['summary']['warning_alerts'] > 0:
        print(f"🟡 Warning Alerts: {health_report['summary']['warning_alerts']}")
    if health_report['summary']['error_alerts'] > 0:
        print(f"🟠 Error Alerts: {health_report['summary']['error_alerts']}")

    if health_report['alerts']:
        print(f"\nRecent Alerts:")
        for alert in health_report['alerts'][-5:]:  # Show last 5 alerts
            emoji = "🔴" if alert['type'] == 'CRITICAL' else "🟡" if alert['type'] == 'WARNING' else "🟠"
            print(f" {emoji} {alert['component']}: {alert['message']}")

    # System summary
    if 'system' in health_report['system_metrics']:
        sys = health_report['system_metrics']['system']
        print(f"\nSystem Resources:")
        # Guard against missing values: formatting the 'N/A' fallback with :.1f would raise ValueError
        for label, key in [('CPU', 'cpu_percent'), ('Memory', 'memory_percent'), ('Disk', 'disk_percent')]:
            value = sys.get(key)
            print(f" {label}: {value:.1f}%" if value is not None else f" {label}: N/A")

    # Scraper summary
    if 'scrapers' in health_report['application_metrics']:
        scrapers = health_report['application_metrics']['scrapers']
        print(f"\nScraper Status ({len(scrapers)} scrapers):")
        for name, data in scrapers.items():
            last_count = data.get('last_item_count', 0)
            minutes_since = data.get('minutes_since_update')
            if minutes_since is not None:
                hours_since = minutes_since / 60
                time_str = f"{hours_since:.1f}h ago" if hours_since > 1 else f"{minutes_since:.0f}m ago"
            else:
                time_str = "Never"
            print(f" {name}: {last_count} items, last update {time_str}")

    print(f"{'='*60}\n")

    # Cleanup old files
    monitor.cleanup_old_metrics()

    return health_report['health_status'] == 'HEALTHY'


if __name__ == '__main__':
    try:
        success = main()
        exit(0 if success else 1)
    except Exception as e:
        logger.critical(f"Monitoring failed: {e}")
        exit(2)