---
# Prometheus alerting rules.
#
# Each group is evaluated every 30s. Rules are grouped by area: generic
# service health, license server, PostgreSQL, Redis, HTTP API, and host
# infrastructure. Several annotations read a `service` label from the
# alerting series, so the scraped targets are expected to carry one.
groups:
  - name: service_alerts
    interval: 30s
    rules:
      # Service Down Alerts
      - alert: ServiceDown
        expr: up == 0
        for: 2m
        labels:
          severity: critical
          component: "{{ $labels.service }}"
        annotations:
          summary: "Service {{ $labels.service }} is down"
          description: "{{ $labels.service }} has been down for more than 2 minutes."

      # High CPU Usage
      # rate() of CPU seconds is a fraction of one core; * 100 turns it into
      # a percentage.
      - alert: HighCPUUsage
        expr: rate(process_cpu_seconds_total[5m]) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.service }}"
          description: "CPU usage is above 80% for more than 5 minutes on {{ $labels.service }}."

      # High Memory Usage
      # Bytes / 1024 / 1024 is MiB, so the threshold is 1000 MiB.
      - alert: HighMemoryUsage
        expr: (process_resident_memory_bytes / 1024 / 1024) > 1000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on {{ $labels.service }}"
          description: "Memory usage is above 1GB for more than 5 minutes on {{ $labels.service }}."

  - name: license_alerts
    interval: 30s
    rules:
      # High License Validation Failure Rate
      # Failed validations as a fraction of all validations over 5 minutes.
      - alert: HighLicenseValidationFailureRate
        expr: |
          rate(license_validations_failed_total[5m])
            / rate(license_validations_total[5m])
            > 0.1
        for: 5m
        labels:
          severity: warning
          service: license-server
        annotations:
          summary: "High license validation failure rate"
          description: "More than 10% of license validations are failing."

      # No License Validations
      - alert: NoLicenseValidations
        expr: rate(license_validations_total[10m]) == 0
        for: 10m
        labels:
          severity: warning
          service: license-server
        annotations:
          summary: "No license validations occurring"
          description: "No license validations have been processed in the last 10 minutes."

      # High Anomaly Detection Rate
      # rate() yields a per-second value; * 60 converts it to the per-minute
      # figure that the description states.
      - alert: HighAnomalyRate
        expr: rate(anomalies_detected_total[5m]) * 60 > 10
        for: 5m
        labels:
          severity: critical
          service: license-server
        annotations:
          summary: "High anomaly detection rate"
          description: "More than 10 anomalies detected per minute."

  - name: database_alerts
    interval: 30s
    rules:
      # PostgreSQL Down
      - alert: PostgreSQLDown
        expr: pg_up == 0
        for: 1m
        labels:
          severity: critical
          service: postgres
        annotations:
          summary: "PostgreSQL is down"
          description: "PostgreSQL has been down for more than 1 minute."

      # High Database Connections
      - alert: HighDatabaseConnections
        expr: pg_stat_database_numbackends{datname="v2_adminpanel"} > 80
        for: 5m
        labels:
          severity: warning
          service: postgres
        annotations:
          summary: "High number of database connections"
          description: "More than 80 active connections to v2_adminpanel database."

      # Slow Queries
      # NOTE(review): rate() is meant for counters. If this metric is a gauge
      # holding a mean duration (as its name suggests), compare it directly
      # or use avg_over_time() instead - confirm the metric type.
      - alert: SlowQueries
        expr: rate(pg_stat_statements_mean_time_seconds[5m]) > 1
        for: 5m
        labels:
          severity: warning
          service: postgres
        annotations:
          summary: "Slow database queries detected"
          description: "Average query time is above 1 second."

  - name: redis_alerts
    interval: 30s
    rules:
      # Redis Down
      - alert: RedisDown
        expr: redis_up == 0
        for: 1m
        labels:
          severity: critical
          service: redis
        annotations:
          summary: "Redis is down"
          description: "Redis has been down for more than 1 minute."

      # High Redis Memory Usage
      # The denominator is filtered to > 0 so that an instance without a
      # memory limit (max bytes reported as 0) does not divide by zero and
      # fire permanently.
      - alert: HighRedisMemory
        expr: redis_memory_used_bytes / (redis_memory_max_bytes > 0) > 0.9
        for: 5m
        labels:
          severity: warning
          service: redis
        annotations:
          summary: "High Redis memory usage"
          description: "Redis memory usage is above 90% of max memory."

      # Low Cache Hit Rate
      # Uses rate() over 5 minutes so the ratio reflects recent traffic
      # rather than the average since the counters were last reset.
      - alert: LowCacheHitRate
        expr: |
          rate(redis_keyspace_hits_total[5m])
            / (
              rate(redis_keyspace_hits_total[5m])
              + rate(redis_keyspace_misses_total[5m])
            )
            < 0.8
        for: 10m
        labels:
          severity: warning
          service: redis
        annotations:
          summary: "Low Redis cache hit rate"
          description: "Redis cache hit rate is below 80%."

  - name: api_alerts
    interval: 30s
    rules:
      # High API Response Time
      # The quantile is computed per individual series (no aggregation).
      - alert: HighAPIResponseTime
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High API response time on {{ $labels.service }}"
          description: "95th percentile response time is above 2 seconds on {{ $labels.service }}."

      # High Error Rate
      # Share of 5xx responses among all responses, per service, so that the
      # 0.05 threshold really means 5%.
      # Assumes http_requests_total carries the `service` label that the
      # annotations below read - TODO confirm.
      - alert: HighErrorRate
        expr: |
          sum by (service) (rate(http_requests_total{status=~"5.."}[5m]))
            / sum by (service) (rate(http_requests_total[5m]))
            > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate on {{ $labels.service }}"
          description: "Error rate is above 5% on {{ $labels.service }}."

      # Rate Limit Exceeded
      # rate() yields a per-second value; * 60 converts it to the per-minute
      # figure that the description states.
      - alert: RateLimitExceeded
        expr: rate(rate_limit_exceeded_total[5m]) * 60 > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Rate limits being exceeded frequently"
          description: "Rate limits are being exceeded more than 10 times per minute."

  - name: infrastructure_alerts
    interval: 30s
    rules:
      # High Disk Usage
      # Fires when the available share of the root filesystem drops below 20%.
      - alert: HighDiskUsage
        expr: |
          (
            node_filesystem_avail_bytes{mountpoint="/"}
              / node_filesystem_size_bytes{mountpoint="/"}
          ) * 100 < 20
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Low disk space"
          description: "Disk space is below 20% on root filesystem."

      # High Load Average
      - alert: HighLoadAverage
        expr: node_load5 > 4
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High system load"
          description: "5-minute load average is above 4."

      # Certificate Expiry
      # 86400 * 7 is seven days expressed in seconds.
      - alert: CertificateExpiringSoon
        expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 7
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "SSL certificate expiring soon"
          description: "SSL certificate will expire in less than 7 days."