# Alerting rules (source listing: 208 lines, 6.5 KiB, YAML)
groups:
# Generic per-process health rules; this group is evaluated every 30s.
# NOTE(review): the templates below read a `service` label - confirm that
# every matched series actually carries it.
- name: service_alerts
  interval: 30s
  rules:
  # Fires for any series whose `up` value stays at 0 for 2 minutes.
  - alert: ServiceDown
    expr: up == 0
    for: 2m
    labels:
      severity: critical
      component: '{{ $labels.service }}'
    annotations:
      summary: 'Service {{ $labels.service }} is down'
      description: >-
        {{ $labels.service }} has been down for more than 2 minutes.

  # Per-second growth of process_cpu_seconds_total, scaled by 100, held
  # above 80 for 5 minutes.
  - alert: HighCPUUsage
    expr: rate(process_cpu_seconds_total[5m]) * 100 > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'High CPU usage on {{ $labels.service }}'
      description: >-
        CPU usage is above 80% for more than 5 minutes on
        {{ $labels.service }}.

  # Resident memory divided by 1024 twice (bytes -> MiB), held above 1000
  # for 5 minutes.
  - alert: HighMemoryUsage
    expr: (process_resident_memory_bytes / 1024 / 1024) > 1000
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'High memory usage on {{ $labels.service }}'
      description: >-
        Memory usage is above 1GB for more than 5 minutes on
        {{ $labels.service }}.

# License-server rules; this group is evaluated every 30s and every rule
# attaches a static `service: license-server` label.
- name: license_alerts
  interval: 30s
  rules:
  # Ratio of failed to total validations (both as 5m rates) above 0.1.
  # When the total rate is 0 the division yields NaN and the comparison
  # drops the series, so this rule stays silent with no traffic.
  - alert: HighLicenseValidationFailureRate
    expr: rate(license_validations_failed_total[5m]) / rate(license_validations_total[5m]) > 0.1
    for: 5m
    labels:
      severity: warning
      service: license-server
    annotations:
      summary: "High license validation failure rate"
      description: "More than 10% of license validations are failing."

  # Validation counter shows a zero 10m rate, sustained for 10 minutes.
  - alert: NoLicenseValidations
    expr: rate(license_validations_total[10m]) == 0
    for: 10m
    labels:
      severity: warning
      service: license-server
    annotations:
      summary: "No license validations occurring"
      description: "No license validations have been processed in the last 10 minutes."

  # rate() returns a per-second value, but the threshold is documented as
  # "per minute". Multiply by 60 so the expression matches the description.
  - alert: HighAnomalyRate
    expr: rate(anomalies_detected_total[5m]) * 60 > 10
    for: 5m
    labels:
      severity: critical
      service: license-server
    annotations:
      summary: "High anomaly detection rate"
      description: "More than 10 anomalies detected per minute."

# PostgreSQL rules; this group is evaluated every 30s and every rule
# attaches a static `service: postgres` label.
- name: database_alerts
  interval: 30s
  rules:
  # pg_up reports 0 for 1 minute.
  - alert: PostgreSQLDown
    expr: pg_up == 0
    for: 1m
    labels:
      severity: critical
      service: postgres
    annotations:
      summary: "PostgreSQL is down"
      description: "PostgreSQL has been down for more than 1 minute."

  # Backend count for the v2_adminpanel database above 80 for 5 minutes.
  - alert: HighDatabaseConnections
    expr: pg_stat_database_numbackends{datname="v2_adminpanel"} > 80
    for: 5m
    labels:
      severity: warning
      service: postgres
    annotations:
      summary: "High number of database connections"
      description: "More than 80 active connections to v2_adminpanel database."

  # The metric is already a mean query time, so rate() (meant for
  # counters) would measure how fast the mean changes, not the mean
  # itself. Average the value over the window instead.
  # NOTE(review): assumes the exporter exposes this metric as a gauge in
  # seconds - confirm against the exporter's query definition.
  - alert: SlowQueries
    expr: avg_over_time(pg_stat_statements_mean_time_seconds[5m]) > 1
    for: 5m
    labels:
      severity: warning
      service: postgres
    annotations:
      summary: "Slow database queries detected"
      description: "Average query time is above 1 second."

# Redis rules; this group is evaluated every 30s and every rule attaches
# a static `service: redis` label.
- name: redis_alerts
  interval: 30s
  rules:
  # redis_up reports 0 for 1 minute.
  - alert: RedisDown
    expr: redis_up == 0
    for: 1m
    labels:
      severity: critical
      service: redis
    annotations:
      summary: "Redis is down"
      description: "Redis has been down for more than 1 minute."

  # Used/max memory ratio above 0.9. The `and ... > 0` guard drops series
  # whose max is 0: dividing by 0 yields +Inf, which would otherwise keep
  # this alert firing permanently.
  # NOTE(review): presumably max is 0 when no memory limit is configured -
  # confirm against the exporter.
  - alert: HighRedisMemory
    expr: |
      (redis_memory_used_bytes / redis_memory_max_bytes > 0.9)
      and
      (redis_memory_max_bytes > 0)
    for: 5m
    labels:
      severity: warning
      service: redis
    annotations:
      summary: "High Redis memory usage"
      description: "Redis memory usage is above 90% of max memory."

  # Hit ratio computed from 5m rates rather than raw lifetime counters, so
  # the alert reflects recent traffic instead of the average since the
  # counters started. With no lookups the ratio is NaN and nothing fires.
  - alert: LowCacheHitRate
    expr: |
      rate(redis_keyspace_hits_total[5m])
      /
      (rate(redis_keyspace_hits_total[5m]) + rate(redis_keyspace_misses_total[5m]))
      < 0.8
    for: 10m
    labels:
      severity: warning
      service: redis
    annotations:
      summary: "Low Redis cache hit rate"
      description: "Redis cache hit rate is below 80%."

# HTTP API rules; this group is evaluated every 30s.
# NOTE(review): the templates below read a `service` label - confirm that
# the HTTP metrics carry it.
- name: api_alerts
  interval: 30s
  rules:
  # 95th percentile of request duration over 5m above 2 seconds. The
  # quantile is computed per label set, since no aggregation is applied
  # before histogram_quantile.
  - alert: HighAPIResponseTime
    expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High API response time on {{ $labels.service }}"
      description: "95th percentile response time is above 2 seconds on {{ $labels.service }}."

  # The description promises a 5% error *ratio*, so divide the 5xx rate by
  # the total request rate. A bare rate(...) > 0.05 would be an absolute
  # 0.05 requests/second threshold, independent of traffic volume.
  # Aggregated by `service` because that is the label the annotations use.
  - alert: HighErrorRate
    expr: |
      sum by (service) (rate(http_requests_total{status=~"5.."}[5m]))
      /
      sum by (service) (rate(http_requests_total[5m]))
      > 0.05
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "High error rate on {{ $labels.service }}"
      description: "Error rate is above 5% on {{ $labels.service }}."

  # rate() returns a per-second value, but the threshold is documented as
  # "per minute". Multiply by 60 so the expression matches the description.
  - alert: RateLimitExceeded
    expr: rate(rate_limit_exceeded_total[5m]) * 60 > 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Rate limits being exceeded frequently"
      description: "Rate limits are being exceeded more than 10 times per minute."

# Host-level rules; this group is evaluated every 30s.
- name: infrastructure_alerts
  interval: 30s
  rules:
  # Available/size ratio of the "/" mountpoint, as a percentage, below 20
  # for 5 minutes. Despite the alert name, this measures free space.
  - alert: HighDiskUsage
    expr: >-
      (node_filesystem_avail_bytes{mountpoint="/"} /
      node_filesystem_size_bytes{mountpoint="/"}) * 100 < 20
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Low disk space'
      description: 'Disk space is below 20% on root filesystem.'

  # node_load5 above the fixed value 4 for 5 minutes.
  - alert: HighLoadAverage
    expr: node_load5 > 4
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'High system load'
      description: '5-minute load average is above 4.'

  # Remaining certificate lifetime (expiry timestamp minus now) below
  # 86400 * 7 seconds, i.e. 7 days, sustained for 1 hour.
  - alert: CertificateExpiringSoon
    expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 7
    for: 1h
    labels:
      severity: warning
    annotations:
      summary: 'SSL certificate expiring soon'
      description: 'SSL certificate will expire in less than 7 days.'