174 Zeilen
5.1 KiB
YAML
174 Zeilen
5.1 KiB
YAML
groups:
# Alerts for the license server: validation error rate, possible license
# abuse, availability, validation latency, and anomaly detections.
- name: license_server_alerts
  interval: 30s
  rules:
  # High error rate: validation errors as a fraction of all validations,
  # both measured as 5-minute rates and summed across all series.
  - alert: HighLicenseValidationErrorRate
    expr: |
      (
        sum(rate(license_validation_errors_total[5m]))
        /
        sum(rate(license_validation_total[5m]))
      ) > 0.05
    for: 5m
    labels:
      severity: warning
      service: license-server
    annotations:
      summary: "High license validation error rate ({{ $value | humanizePercentage }})"
      description: "License validation error rate is above 5% for the last 5 minutes"

  # License abuse detection: per-series rate of validations whose result
  # label is "multiple_ips" stays above 0.1 per second.
  - alert: PossibleLicenseAbuse
    expr: |
      rate(license_validation_total{result="multiple_ips"}[5m]) > 0.1
    for: 10m
    labels:
      severity: critical
      service: license-server
    annotations:
      summary: "Possible license abuse detected"
      description: "High rate of validations from multiple IPs for same license"

  # Service down: the scrape of job "license-server" reports up == 0.
  # NOTE(review): this yields no series (and so no alert) if the target is
  # missing from service discovery altogether; consider adding
  # `or absent(up{job="license-server"})` if that case must page as well.
  - alert: LicenseServerDown
    expr: up{job="license-server"} == 0
    for: 2m
    labels:
      severity: critical
      service: license-server
    annotations:
      summary: "License server is down"
      description: "License server has been down for more than 2 minutes"

  # High response time: 95th percentile computed from the duration histogram
  # buckets, aggregated over everything except the bucket boundary (le).
  - alert: HighLicenseValidationLatency
    expr: |
      histogram_quantile(0.95,
        sum(rate(license_validation_duration_seconds_bucket[5m])) by (le)
      ) > 0.5
    for: 5m
    labels:
      severity: warning
      service: license-server
    annotations:
      summary: "High license validation latency"
      description: "95th percentile latency is above 500ms"

  # Anomaly detection: combined rate of detections labelled severity "high"
  # or "critical". The annotations name both severities so the alert text
  # matches what the selector actually counts.
  - alert: HighAnomalyDetectionRate
    expr: |
      sum(rate(anomaly_detections_total{severity=~"high|critical"}[5m])) > 0.5
    for: 5m
    labels:
      severity: critical
      service: license-server
    annotations:
      summary: "High rate of high/critical severity anomalies detected"
      description: "More than 0.5 high- or critical-severity anomalies per second detected"
# Alerts for the PostgreSQL database: connection usage and replication lag.
- name: database_alerts
  interval: 30s
  rules:
  # Database connection pool exhaustion: backends connected to the
  # v2_adminpanel database as a fraction of max_connections.
  #
  # The numerator is selected by its datname label, whereas the server-wide
  # max_connections setting is not expected to carry per-database labels.
  # With default one-to-one matching the two sides would never pair up and
  # the alert could never fire, so the per-database labels are ignored when
  # matching.
  # NOTE(review): only backends of this one database are counted, while
  # max_connections is a server-wide limit - confirm this is intended.
  - alert: DatabaseConnectionPoolExhausted
    expr: |
      (
        pg_stat_database_numbackends{datname="v2_adminpanel"}
        / ignoring (datid, datname)
        pg_settings_max_connections
      ) > 0.9
    for: 5m
    labels:
      severity: critical
      service: postgres
    annotations:
      summary: "Database connection pool nearly exhausted"
      description: "PostgreSQL connection usage is above 90%"

  # Database replication lag: reported lag stays above 10 for 5 minutes
  # (seconds, going by the metric name and the description below).
  - alert: DatabaseReplicationLag
    expr: |
      pg_replication_lag_seconds > 10
    for: 5m
    labels:
      severity: warning
      service: postgres
    annotations:
      summary: "Database replication lag detected"
      description: "Replication lag is {{ $value }} seconds"
# Host-level alerts built on node metrics: CPU, memory, and root disk space.
- name: infrastructure_alerts
  interval: 30s
  rules:
  # High CPU usage: 100 minus the average idle percentage per instance.
  # rate() is used rather than irate(): irate() only considers the last two
  # samples in the window, so one briefly idle scrape would drop the value
  # below the threshold and reset the 10-minute `for` timer.
  - alert: HighCPUUsage
    expr: |
      (
        100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
      ) > 80
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "High CPU usage on {{ $labels.instance }}"
      description: "CPU usage is above 80% for 10 minutes"

  # High memory usage: share of total memory that is not available.
  - alert: HighMemoryUsage
    expr: |
      (
        1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)
      ) > 0.9
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High memory usage on {{ $labels.instance }}"
      description: "Memory usage is above 90%"

  # Disk space: available bytes as a fraction of filesystem size, for the
  # filesystem mounted at "/" only.
  - alert: LowDiskSpace
    expr: |
      (
        node_filesystem_avail_bytes{mountpoint="/"}
        /
        node_filesystem_size_bytes{mountpoint="/"}
      ) < 0.1
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Low disk space on {{ $labels.instance }}"
      description: "Less than 10% disk space remaining"
# Alerts for the Redis cache: connection errors and hit rate.
- name: cache_alerts
  interval: 30s
  rules:
  # Redis connection errors: per-series error rate above 0.1 per second.
  - alert: RedisConnectionErrors
    expr: |
      rate(redis_connection_errors_total[5m]) > 0.1
    for: 5m
    labels:
      severity: warning
      service: redis
    annotations:
      summary: "Redis connection errors detected"
      description: "Redis connection error rate is {{ $value }} per second"

  # Cache hit rate: hits as a fraction of all keyspace lookups over the last
  # 5 minutes. The counters are wrapped in rate() because dividing the raw
  # cumulative totals gives the lifetime ratio since the counters started,
  # which barely moves when the recent hit rate collapses (and stays low
  # long after it recovers).
  - alert: LowCacheHitRate
    expr: |
      (
        rate(redis_keyspace_hits_total[5m])
        /
        (rate(redis_keyspace_hits_total[5m]) + rate(redis_keyspace_misses_total[5m]))
      ) < 0.7
    for: 10m
    labels:
      severity: warning
      service: redis
    annotations:
      summary: "Low Redis cache hit rate"
      description: "Cache hit rate is below 70%"