# File listing
# Hetzner-Backup/monitoring/prometheus/rules/license-server-alerts.yml
# 2025-06-18 23:22:38 +02:00
#
# 174 lines
# 5.1 KiB
# YAML

groups:
# Alerts about the license server itself: validation error ratio, suspected
# license abuse, availability, validation latency and anomaly detections.
# Rules in this group are evaluated every 30 seconds.
- name: license_server_alerts
  interval: 30s
  rules:
  # High error rate: ratio of the 5-minute error rate to the 5-minute total
  # validation rate, each summed over all series, must stay at or below 5%.
  - alert: HighLicenseValidationErrorRate
    expr: |
      (
        sum(rate(license_validation_errors_total[5m]))
        /
        sum(rate(license_validation_total[5m]))
      ) > 0.05
    for: 5m
    labels:
      severity: warning
      service: license-server
    annotations:
      summary: "High license validation error rate ({{ $value | humanizePercentage }})"
      description: "License validation error rate is above 5% for the last 5 minutes"
  # License abuse detection: per-series rate of validations whose result
  # label is "multiple_ips" exceeds 0.1 per second for 10 minutes.
  - alert: PossibleLicenseAbuse
    expr: |
      rate(license_validation_total{result="multiple_ips"}[5m]) > 0.1
    for: 10m
    labels:
      severity: critical
      service: license-server
    annotations:
      summary: "Possible license abuse detected"
      description: "High rate of validations from multiple IPs for same license"
  # Service down: `up == 0` only matches while an `up` series for the job
  # still exists; absent() additionally covers the case where no such
  # series exists at all, which would otherwise go unnoticed.
  - alert: LicenseServerDown
    expr: |
      up{job="license-server"} == 0
      or
      absent(up{job="license-server"})
    for: 2m
    labels:
      severity: critical
      service: license-server
    annotations:
      summary: "License server is down"
      description: "License server has been down for more than 2 minutes"
  # High response time: 95th percentile of the validation duration
  # histogram (5-minute bucket rates aggregated by `le`) above 0.5.
  - alert: HighLicenseValidationLatency
    expr: |
      histogram_quantile(0.95,
        sum(rate(license_validation_duration_seconds_bucket[5m])) by (le)
      ) > 0.5
    for: 5m
    labels:
      severity: warning
      service: license-server
    annotations:
      summary: "High license validation latency"
      description: "95th percentile latency is above 500ms"
  # Anomaly detection: the selector counts both "high" and "critical"
  # severities, so the annotation text names both.
  - alert: HighAnomalyDetectionRate
    expr: |
      sum(rate(anomaly_detections_total{severity=~"high|critical"}[5m])) > 0.5
    for: 5m
    labels:
      severity: critical
      service: license-server
    annotations:
      summary: "High rate of high or critical severity anomalies detected"
      description: "More than 0.5 high or critical severity anomalies per second detected"
# Alerts about the PostgreSQL database: connection usage and replication lag.
# Rules in this group are evaluated every 30 seconds.
- name: database_alerts
  interval: 30s
  rules:
  # Database connection pool exhaustion: backends of the v2_adminpanel
  # database relative to max_connections, above 90%.
  # The numerator is selected by a per-database label (datname) that the
  # server-level max_connections setting is not expected to carry, so a plain
  # `/` would find no matching label sets and the alert could never fire.
  # Aggregating to `instance` and matching `on (instance)` makes the division
  # independent of those extra labels.
  # NOTE(review): only connections to v2_adminpanel are counted while
  # max_connections is presumably a server-wide limit - confirm whether the
  # datname filter should be dropped to count all databases.
  - alert: DatabaseConnectionPoolExhausted
    expr: |
      (
        sum by (instance) (pg_stat_database_numbackends{datname="v2_adminpanel"})
        /
        on (instance) pg_settings_max_connections
      ) > 0.9
    for: 5m
    labels:
      severity: critical
      service: postgres
    annotations:
      summary: "Database connection pool nearly exhausted"
      description: "PostgreSQL connection usage is above 90%"
  # Database replication lag: lag metric above 10 for 5 minutes; the
  # current value is reported in the description.
  - alert: DatabaseReplicationLag
    expr: |
      pg_replication_lag_seconds > 10
    for: 5m
    labels:
      severity: warning
      service: postgres
    annotations:
      summary: "Database replication lag detected"
      description: "Replication lag is {{ $value }} seconds"
# Host-level alerts: CPU, memory and root filesystem usage per instance.
# Rules in this group are evaluated every 30 seconds.
- name: infrastructure_alerts
  interval: 30s
  rules:
  # High CPU usage: 100 minus the average idle percentage per instance.
  # rate() is used instead of irate(): irate() only looks at the last two
  # samples in the window, so a single short dip would reset the 10-minute
  # `for` timer; rate() averages over the whole 5-minute window.
  - alert: HighCPUUsage
    expr: |
      (
        100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
      ) > 80
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "High CPU usage on {{ $labels.instance }}"
      description: "CPU usage is above 80% for 10 minutes"
  # High memory usage: share of total memory that is not available,
  # above 90% for 5 minutes.
  - alert: HighMemoryUsage
    expr: |
      (
        1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)
      ) > 0.9
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High memory usage on {{ $labels.instance }}"
      description: "Memory usage is above 90%"
  # Disk space: available bytes over total bytes on the "/" mountpoint,
  # below 10% for 5 minutes.
  - alert: LowDiskSpace
    expr: |
      (
        node_filesystem_avail_bytes{mountpoint="/"}
        /
        node_filesystem_size_bytes{mountpoint="/"}
      ) < 0.1
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Low disk space on {{ $labels.instance }}"
      description: "Less than 10% disk space remaining"
# Redis cache alerts: connection errors and cache hit ratio.
# Rules in this group are evaluated every 30 seconds.
- name: cache_alerts
  interval: 30s
  rules:
  # Redis connection errors: 5-minute error rate above 0.1 per second;
  # the current rate is reported in the description.
  - alert: RedisConnectionErrors
    expr: |
      rate(redis_connection_errors_total[5m]) > 0.1
    for: 5m
    labels:
      severity: warning
      service: redis
    annotations:
      summary: "Redis connection errors detected"
      description: "Redis connection error rate is {{ $value }} per second"
  # Cache hit rate: hits over hits plus misses, below 70% for 10 minutes.
  # The ratio is built from 5-minute rates rather than the raw *_total
  # counters: dividing the cumulative counters yields a ratio over the whole
  # counter lifetime, which barely moves when the current hit rate degrades.
  - alert: LowCacheHitRate
    expr: |
      (
        rate(redis_keyspace_hits_total[5m])
        /
        (rate(redis_keyspace_hits_total[5m]) + rate(redis_keyspace_misses_total[5m]))
      ) < 0.7
    for: 10m
    labels:
      severity: warning
      service: redis
    annotations:
      summary: "Low Redis cache hit rate"
      description: "Cache hit rate is below 70%"