---
# Prometheus alerting rules.
#
# Four rule groups, each evaluated every 30s:
#   - license_server_alerts: license validation errors, abuse, availability,
#     latency and anomaly rate
#   - database_alerts: PostgreSQL connection usage and replication lag
#   - infrastructure_alerts: host CPU, memory and root-filesystem space
#   - cache_alerts: Redis connection errors and cache hit ratio
groups:
  - name: license_server_alerts
    interval: 30s
    rules:
      # High error rate: ratio of failed validations to all validations,
      # both measured as 5m rates and summed over every series.
      - alert: HighLicenseValidationErrorRate
        expr: |
          (
            sum(rate(license_validation_errors_total[5m]))
            /
            sum(rate(license_validation_total[5m]))
          ) > 0.05
        for: 5m
        labels:
          severity: warning
          service: license-server
        annotations:
          summary: "High license validation error rate ({{ $value | humanizePercentage }})"
          description: "License validation error rate is above 5% for the last 5 minutes"

      # License abuse detection: per-series rate of validations whose
      # result label is "multiple_ips".
      - alert: PossibleLicenseAbuse
        expr: |
          rate(license_validation_total{result="multiple_ips"}[5m]) > 0.1
        for: 10m
        labels:
          severity: critical
          service: license-server
        annotations:
          summary: "Possible license abuse detected"
          description: "High rate of validations from multiple IPs for same license"

      # Service down: the scrape target for job "license-server" reports up == 0.
      - alert: LicenseServerDown
        expr: up{job="license-server"} == 0
        for: 2m
        labels:
          severity: critical
          service: license-server
        annotations:
          summary: "License server is down"
          description: "License server has been down for more than 2 minutes"

      # High response time: 95th percentile of the validation duration
      # histogram, aggregated over all series by bucket boundary (le).
      - alert: HighLicenseValidationLatency
        expr: |
          histogram_quantile(0.95,
            sum(rate(license_validation_duration_seconds_bucket[5m])) by (le)
          ) > 0.5
        for: 5m
        labels:
          severity: warning
          service: license-server
        annotations:
          summary: "High license validation latency"
          description: "95th percentile latency is above 500ms"

      # Anomaly detection: combined rate of anomalies labelled severity
      # "high" or "critical".
      # NOTE(review): the annotation text says "critical" only, while the
      # selector also matches "high" - confirm which one is intended.
      - alert: HighAnomalyDetectionRate
        expr: |
          sum(rate(anomaly_detections_total{severity=~"high|critical"}[5m])) > 0.5
        for: 5m
        labels:
          severity: critical
          service: license-server
        annotations:
          summary: "High rate of critical anomalies detected"
          description: "More than 0.5 critical anomalies per second detected"

  - name: database_alerts
    interval: 30s
    rules:
      # Database connection pool exhaustion: backends connected to the
      # v2_adminpanel database as a fraction of max_connections.
      #
      # The left-hand side is selected by datname, a per-database label that
      # the pg_settings_max_connections series is not expected to carry. With
      # default one-to-one matching on all labels the division would find no
      # pair and the alert could never fire, so the left side is aggregated
      # per instance and the two sides are matched on instance only.
      # NOTE(review): only one database's backends are counted against the
      # limit; confirm whether all databases on the server should be summed.
      - alert: DatabaseConnectionPoolExhausted
        expr: |
          (
            sum by (instance) (pg_stat_database_numbackends{datname="v2_adminpanel"})
            / on (instance)
            pg_settings_max_connections
          ) > 0.9
        for: 5m
        labels:
          severity: critical
          service: postgres
        annotations:
          summary: "Database connection pool nearly exhausted"
          description: "PostgreSQL connection usage is above 90%"

      # Database replication lag: reported lag above 10 seconds.
      - alert: DatabaseReplicationLag
        expr: |
          pg_replication_lag_seconds > 10
        for: 5m
        labels:
          severity: warning
          service: postgres
        annotations:
          summary: "Database replication lag detected"
          description: "Replication lag is {{ $value }} seconds"

  - name: infrastructure_alerts
    interval: 30s
    rules:
      # High CPU usage: 100 minus the average idle percentage per instance.
      # rate() is used rather than irate(): irate() only considers the last
      # two samples in the window, which makes an alert with a "for" clause
      # reset on brief dips; rate() averages over the whole 5m window.
      - alert: HighCPUUsage
        expr: |
          (
            100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
          ) > 80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is above 80% for 10 minutes"

      # High memory usage: fraction of total memory that is not available.
      - alert: HighMemoryUsage
        expr: |
          (
            1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)
          ) > 0.9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: "Memory usage is above 90%"

      # Disk space: available fraction of the filesystem mounted at "/".
      - alert: LowDiskSpace
        expr: |
          (
            node_filesystem_avail_bytes{mountpoint="/"}
            /
            node_filesystem_size_bytes{mountpoint="/"}
          ) < 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Low disk space on {{ $labels.instance }}"
          description: "Less than 10% disk space remaining"

  - name: cache_alerts
    interval: 30s
    rules:
      # Redis connection errors: per-series 5m error rate.
      - alert: RedisConnectionErrors
        expr: |
          rate(redis_connection_errors_total[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
          service: redis
        annotations:
          summary: "Redis connection errors detected"
          description: "Redis connection error rate is {{ $value }} per second"

      # Cache hit rate: hits / (hits + misses) over the last 5 minutes.
      # The *_total metrics are cumulative counters, so dividing them directly
      # would give the lifetime ratio and barely react to a recent drop;
      # rate() turns each into a per-second rate over the window first.
      - alert: LowCacheHitRate
        expr: |
          (
            rate(redis_keyspace_hits_total[5m])
            /
            (rate(redis_keyspace_hits_total[5m]) + rate(redis_keyspace_misses_total[5m]))
          ) < 0.7
        for: 10m
        labels:
          severity: warning
          service: redis
        annotations:
          summary: "Low Redis cache hit rate"
          description: "Cache hit rate is below 70%"