hvac-kia-content/monitoring/setup_monitoring.py
Ben Reed dc57ce80d5 Add comprehensive monitoring and alerting system
- Created SystemMonitor class for health check monitoring
- Implemented system metrics collection (CPU, memory, disk, network)
- Added application metrics monitoring (scrapers, logs, data sizes)
- Built alert system with configurable thresholds
- Developed HTML dashboard generator with real-time charts
- Added systemd services for automated monitoring (15-min intervals)
- Created responsive web dashboard with Bootstrap and Chart.js
- Implemented automatic cleanup of old metric files
- Added comprehensive documentation and troubleshooting guide

Features:
- Real-time system resource monitoring
- Scraper performance tracking and alerts
- Interactive dashboard with trend charts
- Email-ready alert notifications
- Systemd integration for production deployment
- Security hardening with minimal privileges
- Auto-refresh dashboard every 5 minutes
- 7-day metric retention with automatic cleanup

Alert conditions:
- Critical: CPU >80%, Memory >85%, Disk >90%
- Warning: Scraper inactive >24h, Log files >100MB
- Error: Monitoring failures, configuration issues
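
Alerts are written under monitoring/alerts/ as timestamped JSON files
(alerts_YYYYMMDD_HHMMSS.json). A representative record, with illustrative
values, looks like:

    {
      "type": "CRITICAL",
      "component": "system",
      "message": "High CPU usage: 91.5%",
      "timestamp": "2025-08-18T21:35:28"
    }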

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-18 21:35:28 -03:00

#!/usr/bin/env python3
"""
Monitoring setup script for the HVAC Know It All Content Aggregation System.

This script sets up:
1. Health check endpoints
2. Metrics collection
3. Log monitoring
4. Alert configuration
5. Dashboard generation
"""

import json
import logging
import os
import time
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

import psutil

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
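
# Running this file directly performs one monitoring pass and prints a console
# summary. Exit codes (see the __main__ block below): 0 = healthy, 1 = alerts
# were raised, 2 = the monitoring run itself failed. In production the script
# is driven by the 15-minute systemd timer noted in the commit message; a
# timer stanza at that cadence (unit names hypothetical) would be roughly
# OnUnitActiveSec=15min with a small OnBootSec delay.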


class SystemMonitor:
    """Monitor system health and performance metrics."""

    def __init__(self, data_dir: Optional[Path] = None, logs_dir: Optional[Path] = None):
        self.data_dir = data_dir or Path("/opt/hvac-kia-content/data")
        self.logs_dir = logs_dir or Path("/opt/hvac-kia-content/logs")

        # Use relative monitoring paths when custom data/logs dirs are provided
        if data_dir or logs_dir:
            base_dir = (data_dir or logs_dir).parent
            self.metrics_dir = base_dir / "monitoring" / "metrics"
            self.alerts_dir = base_dir / "monitoring" / "alerts"
        else:
            self.metrics_dir = Path("/opt/hvac-kia-content/monitoring/metrics")
            self.alerts_dir = Path("/opt/hvac-kia-content/monitoring/alerts")

        # Create monitoring directories
        self.metrics_dir.mkdir(parents=True, exist_ok=True)
        self.alerts_dir.mkdir(parents=True, exist_ok=True)

    def collect_system_metrics(self) -> Dict[str, Any]:
        """Collect system-level metrics."""
        try:
            # CPU, memory, and disk
            cpu_percent = psutil.cpu_percent(interval=1)
            memory = psutil.virtual_memory()
            disk = psutil.disk_usage('/')

            # Network (if available)
            try:
                network = psutil.net_io_counters()
                network_stats = {
                    'bytes_sent': network.bytes_sent,
                    'bytes_recv': network.bytes_recv,
                    'packets_sent': network.packets_sent,
                    'packets_recv': network.packets_recv
                }
            except Exception:  # network counters are optional; avoid a bare except
                network_stats = None

            metrics = {
                'timestamp': datetime.now().isoformat(),
                'system': {
                    'cpu_percent': cpu_percent,
                    'memory_percent': memory.percent,
                    'memory_available_gb': memory.available / (1024**3),
                    'disk_percent': disk.percent,
                    'disk_free_gb': disk.free / (1024**3),
                    'load_average': os.getloadavg() if hasattr(os, 'getloadavg') else None,
                    'uptime_hours': (time.time() - psutil.boot_time()) / 3600
                },
                'network': network_stats
            }
            return metrics
        except Exception as e:
            logger.error(f"Error collecting system metrics: {e}")
            return {'error': str(e), 'timestamp': datetime.now().isoformat()}

    def collect_application_metrics(self) -> Dict[str, Any]:
        """Collect application-specific metrics."""
        try:
            metrics = {
                'timestamp': datetime.now().isoformat(),
                'data_directories': {},
                'log_files': {},
                'scrapers': {}
            }

            # Check data directory sizes
            if self.data_dir.exists():
                for subdir in ['markdown_current', 'markdown_archives', 'media', '.state']:
                    dir_path = self.data_dir / subdir
                    if dir_path.exists():
                        size_mb = sum(f.stat().st_size for f in dir_path.rglob('*') if f.is_file()) / (1024**2)
                        file_count = sum(1 for f in dir_path.rglob('*') if f.is_file())
                        metrics['data_directories'][subdir] = {
                            'size_mb': round(size_mb, 2),
                            'file_count': file_count
                        }

            # Check log file sizes and recent activity
            if self.logs_dir.exists():
                for source_dir in self.logs_dir.iterdir():
                    if source_dir.is_dir():
                        log_files = list(source_dir.glob('*.log'))
                        if log_files:
                            latest_log = max(log_files, key=lambda f: f.stat().st_mtime)
                            size_mb = latest_log.stat().st_size / (1024**2)
                            last_modified = datetime.fromtimestamp(latest_log.stat().st_mtime)
                            metrics['log_files'][source_dir.name] = {
                                'size_mb': round(size_mb, 2),
                                'last_modified': last_modified.isoformat(),
                                'minutes_since_update': (datetime.now() - last_modified).total_seconds() / 60
                            }

            # Check scraper state files
            state_dir = self.data_dir / '.state'
            if state_dir.exists():
                for state_file in state_dir.glob('*_state.json'):
                    try:
                        with open(state_file) as f:
                            state_data = json.load(f)
                        scraper_name = state_file.stem.replace('_state', '')
                        last_update = state_data.get('last_update')
                        if last_update:
                            last_update_dt = datetime.fromisoformat(last_update.replace('Z', '+00:00'))
                            # Compare in UTC so a "Z"-suffixed timestamp is not
                            # measured against the local wall clock
                            if last_update_dt.tzinfo is None:
                                last_update_dt = last_update_dt.replace(tzinfo=timezone.utc)
                            minutes_since = (datetime.now(timezone.utc) - last_update_dt).total_seconds() / 60
                        else:
                            minutes_since = None
                        metrics['scrapers'][scraper_name] = {
                            'last_item_count': state_data.get('last_item_count', 0),
                            'last_update': last_update,
                            'minutes_since_update': minutes_since,
                            'last_id': state_data.get('last_id')
                        }
                    except Exception as e:
                        logger.warning(f"Error reading state file {state_file}: {e}")

            return metrics
        except Exception as e:
            logger.error(f"Error collecting application metrics: {e}")
            return {'error': str(e), 'timestamp': datetime.now().isoformat()}
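
    # The scraper checks above assume a state-file layout of
    # data/.state/<scraper>_state.json. A representative file, using only the
    # keys that are actually read above (values illustrative):
    #
    #   {
    #     "last_update": "2025-08-18T12:00:00Z",
    #     "last_item_count": 42,
    #     "last_id": "abc123"
    #   }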

    def save_metrics(self, metrics: Dict[str, Any], metric_type: str):
        """Save metrics to a timestamped JSON file."""
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = f"{metric_type}_{timestamp}.json"
        filepath = self.metrics_dir / filename
        try:
            with open(filepath, 'w') as f:
                json.dump(metrics, f, indent=2)
            logger.info(f"Saved {metric_type} metrics to {filepath}")
        except Exception as e:
            logger.error(f"Error saving metrics to {filepath}: {e}")

    def check_alerts(self, system_metrics: Dict[str, Any], app_metrics: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Check for alert conditions."""
        alerts = []
        try:
            # System alerts
            if 'system' in system_metrics:
                sys_stats = system_metrics['system']
                if sys_stats.get('cpu_percent', 0) > 80:
                    alerts.append({
                        'type': 'CRITICAL',
                        'component': 'system',
                        'message': f"High CPU usage: {sys_stats['cpu_percent']:.1f}%",
                        'timestamp': datetime.now().isoformat()
                    })
                if sys_stats.get('memory_percent', 0) > 85:
                    alerts.append({
                        'type': 'CRITICAL',
                        'component': 'system',
                        'message': f"High memory usage: {sys_stats['memory_percent']:.1f}%",
                        'timestamp': datetime.now().isoformat()
                    })
                if sys_stats.get('disk_percent', 0) > 90:
                    alerts.append({
                        'type': 'CRITICAL',
                        'component': 'system',
                        'message': f"High disk usage: {sys_stats['disk_percent']:.1f}%",
                        'timestamp': datetime.now().isoformat()
                    })

            # Application alerts
            if 'scrapers' in app_metrics:
                for scraper_name, scraper_data in app_metrics['scrapers'].items():
                    minutes_since = scraper_data.get('minutes_since_update')
                    if minutes_since is not None and minutes_since > 1440:  # 24 hours
                        alerts.append({
                            'type': 'WARNING',
                            'component': f'scraper_{scraper_name}',
                            'message': f"Scraper {scraper_name} hasn't updated in {minutes_since/60:.1f} hours",
                            'timestamp': datetime.now().isoformat()
                        })

            # Log file alerts
            if 'log_files' in app_metrics:
                for source, log_data in app_metrics['log_files'].items():
                    if log_data.get('size_mb', 0) > 100:  # 100 MB log files
                        alerts.append({
                            'type': 'WARNING',
                            'component': f'logs_{source}',
                            'message': f"Large log file for {source}: {log_data['size_mb']:.1f}MB",
                            'timestamp': datetime.now().isoformat()
                        })
        except Exception as e:
            logger.error(f"Error checking alerts: {e}")
            alerts.append({
                'type': 'ERROR',
                'component': 'monitoring',
                'message': f"Alert check failed: {e}",
                'timestamp': datetime.now().isoformat()
            })
        return alerts

    def save_alerts(self, alerts: List[Dict[str, Any]]):
        """Save alerts to a timestamped JSON file and log them."""
        if not alerts:
            return
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = f"alerts_{timestamp}.json"
        filepath = self.alerts_dir / filename
        try:
            with open(filepath, 'w') as f:
                json.dump(alerts, f, indent=2)
            # Also log critical and warning alerts
            for alert in alerts:
                if alert['type'] == 'CRITICAL':
                    logger.critical(f"ALERT: {alert['message']}")
                elif alert['type'] == 'WARNING':
                    logger.warning(f"ALERT: {alert['message']}")
        except Exception as e:
            logger.error(f"Error saving alerts to {filepath}: {e}")

    def generate_health_report(self) -> Dict[str, Any]:
        """Generate a comprehensive health report."""
        logger.info("Generating health report...")

        # Collect metrics
        system_metrics = self.collect_system_metrics()
        app_metrics = self.collect_application_metrics()

        # Check alerts
        alerts = self.check_alerts(system_metrics, app_metrics)

        # Save to files
        self.save_metrics(system_metrics, 'system')
        self.save_metrics(app_metrics, 'application')
        if alerts:
            self.save_alerts(alerts)

        # Generate summary
        health_status = 'HEALTHY'
        if any(alert['type'] == 'CRITICAL' for alert in alerts):
            health_status = 'CRITICAL'
        elif any(alert['type'] == 'WARNING' for alert in alerts):
            health_status = 'WARNING'
        elif any(alert['type'] == 'ERROR' for alert in alerts):
            health_status = 'ERROR'

        report = {
            'timestamp': datetime.now().isoformat(),
            'health_status': health_status,
            'system_metrics': system_metrics,
            'application_metrics': app_metrics,
            'alerts': alerts,
            'summary': {
                'total_alerts': len(alerts),
                'critical_alerts': len([a for a in alerts if a['type'] == 'CRITICAL']),
                'warning_alerts': len([a for a in alerts if a['type'] == 'WARNING']),
                'error_alerts': len([a for a in alerts if a['type'] == 'ERROR'])
            }
        }
        return report

    def cleanup_old_metrics(self, days_to_keep: int = 7):
        """Clean up metric and alert files older than the retention window."""
        cutoff_date = datetime.now() - timedelta(days=days_to_keep)
        for metrics_file in self.metrics_dir.glob('*.json'):
            try:
                file_date = datetime.fromtimestamp(metrics_file.stat().st_mtime)
                if file_date < cutoff_date:
                    metrics_file.unlink()
                    logger.info(f"Cleaned up old metrics file: {metrics_file}")
            except Exception as e:
                logger.warning(f"Error cleaning up {metrics_file}: {e}")
        for alerts_file in self.alerts_dir.glob('*.json'):
            try:
                file_date = datetime.fromtimestamp(alerts_file.stat().st_mtime)
                if file_date < cutoff_date:
                    alerts_file.unlink()
                    logger.info(f"Cleaned up old alerts file: {alerts_file}")
            except Exception as e:
                logger.warning(f"Error cleaning up {alerts_file}: {e}")


def main():
    """Run one monitoring pass and print a console summary."""
    logger.info("Starting monitoring system...")
    monitor = SystemMonitor()

    # Generate health report
    health_report = monitor.generate_health_report()

    # Save full health report
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    report_file = monitor.metrics_dir / f"health_report_{timestamp}.json"
    try:
        with open(report_file, 'w') as f:
            json.dump(health_report, f, indent=2)
        logger.info(f"Health report saved to {report_file}")
    except Exception as e:
        logger.error(f"Error saving health report: {e}")

    # Print summary
    print(f"\n{'='*60}")
    print("HVAC KNOW IT ALL - SYSTEM HEALTH REPORT")
    print(f"{'='*60}")
    print(f"Status: {health_report['health_status']}")
    print(f"Timestamp: {health_report['timestamp']}")
    print(f"Total Alerts: {health_report['summary']['total_alerts']}")
    if health_report['summary']['critical_alerts'] > 0:
        print(f"🔴 Critical Alerts: {health_report['summary']['critical_alerts']}")
    if health_report['summary']['warning_alerts'] > 0:
        print(f"🟡 Warning Alerts: {health_report['summary']['warning_alerts']}")
    if health_report['summary']['error_alerts'] > 0:
        print(f"🟠 Error Alerts: {health_report['summary']['error_alerts']}")

    if health_report['alerts']:
        print("\nRecent Alerts:")
        for alert in health_report['alerts'][-5:]:  # Show last 5 alerts
            emoji = "🔴" if alert['type'] == 'CRITICAL' else "🟡" if alert['type'] == 'WARNING' else "🟠"
            print(f"  {emoji} {alert['component']}: {alert['message']}")

    # System summary (guard missing values, which would break the :.1f format)
    if 'system' in health_report['system_metrics']:
        sys_stats = health_report['system_metrics']['system']
        print("\nSystem Resources:")
        for label, key in (('CPU', 'cpu_percent'), ('Memory', 'memory_percent'), ('Disk', 'disk_percent')):
            value = sys_stats.get(key)
            print(f"  {label}: {value:.1f}%" if value is not None else f"  {label}: N/A")

    # Scraper summary
    if 'scrapers' in health_report['application_metrics']:
        scrapers = health_report['application_metrics']['scrapers']
        print(f"\nScraper Status ({len(scrapers)} scrapers):")
        for name, data in scrapers.items():
            last_count = data.get('last_item_count', 0)
            minutes_since = data.get('minutes_since_update')
            if minutes_since is not None:
                hours_since = minutes_since / 60
                time_str = f"{hours_since:.1f}h ago" if hours_since > 1 else f"{minutes_since:.0f}m ago"
            else:
                time_str = "Never"
            print(f"  {name}: {last_count} items, last update {time_str}")
    print(f"{'='*60}\n")

    # Clean up old files
    monitor.cleanup_old_metrics()
    return health_report['health_status'] == 'HEALTHY'


if __name__ == '__main__':
    try:
        success = main()
        exit(0 if success else 1)
    except Exception as e:
        logger.critical(f"Monitoring failed: {e}")
        exit(2)
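
# A minimal programmatic-usage sketch (paths hypothetical): the constructor
# maps custom data/logs directories to a sibling monitoring/ tree, so reports
# can be generated outside the default /opt/hvac-kia-content install:
#
#   from pathlib import Path
#   from setup_monitoring import SystemMonitor
#
#   monitor = SystemMonitor(data_dir=Path("/srv/hvac/data"),
#                           logs_dir=Path("/srv/hvac/logs"))
#   report = monitor.generate_health_report()
#   print(report["health_status"], report["summary"]["total_alerts"])
#   monitor.cleanup_old_metrics(days_to_keep=7)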