Lizenzserver ist fertig
Dieser Commit ist enthalten in:
111
monitoring/prometheus/prometheus.yml
Normale Datei
111
monitoring/prometheus/prometheus.yml
Normale Datei
@@ -0,0 +1,111 @@
|
||||
# Defaults shared by all scrape jobs and rule evaluations in this file.
global:
  # Interval between two scrapes of the same target.
  scrape_interval: "15s"
  # Interval between two evaluations of the loaded rules.
  evaluation_interval: "15s"
  # Labels identifying this Prometheus instance and its environment.
  external_labels:
    monitor: "v2-docker-monitor"
    environment: "production"
|
||||
|
||||
# Where firing alerts are delivered: one statically configured Alertmanager.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ["alertmanager:9093"]
|
||||
|
||||
# Rule files to load; the glob matches every .yml file in the rules directory.
rule_files:
  - "/etc/prometheus/rules/*.yml"
|
||||
|
||||
# Scrape configurations
|
||||
scrape_configs:
|
||||
# Self-monitoring: Prometheus scrapes its own endpoint.
- job_name: "prometheus"
  static_configs:
    - targets:
        - "localhost:9090"
      labels:
        service: "prometheus"
|
||||
|
||||
# License server API metrics.
# NOTE(review): port 8443 is conventionally a TLS port, but no `scheme` is set,
# so Prometheus scrapes it over plain http (the default) - confirm this is
# what the service expects.
- job_name: "license-server"
  metrics_path: "/metrics"
  static_configs:
    - targets:
        - "license-server:8443"
      labels:
        service: "license-server"
        component: "api"
|
||||
|
||||
# Authentication service metrics.
- job_name: "auth-service"
  metrics_path: "/metrics"
  static_configs:
    - targets:
        - "auth-service:5001"
      labels:
        service: "auth-service"
        component: "authentication"
|
||||
|
||||
# Analytics service metrics.
- job_name: "analytics-service"
  metrics_path: "/metrics"
  static_configs:
    - targets:
        - "analytics-service:5003"
      labels:
        service: "analytics-service"
        component: "analytics"
|
||||
|
||||
# Admin API service metrics.
- job_name: "admin-api-service"
  metrics_path: "/metrics"
  static_configs:
    - targets:
        - "admin-api-service:5004"
      labels:
        service: "admin-api-service"
        component: "admin"
|
||||
|
||||
# Admin panel (UI) metrics.
- job_name: "admin-panel"
  metrics_path: "/metrics"
  static_configs:
    - targets:
        - "admin-panel:5000"
      labels:
        service: "admin-panel"
        component: "ui"
|
||||
|
||||
# PostgreSQL metrics, scraped from the postgres-exporter target.
- job_name: "postgres"
  static_configs:
    - targets:
        - "postgres-exporter:9187"
      labels:
        service: "postgres"
        component: "database"
|
||||
|
||||
# Redis metrics, scraped from the redis-exporter target.
- job_name: "redis"
  static_configs:
    - targets:
        - "redis-exporter:9121"
      labels:
        service: "redis"
        component: "cache"
|
||||
|
||||
# RabbitMQ metrics, scraped directly from the broker on port 15692.
- job_name: "rabbitmq"
  static_configs:
    - targets:
        - "rabbitmq:15692"
      labels:
        service: "rabbitmq"
        component: "messaging"
|
||||
|
||||
# Host-level metrics from the node-exporter target.
- job_name: "node"
  static_configs:
    - targets:
        - "node-exporter:9100"
      labels:
        service: "node-exporter"
        component: "infrastructure"
|
||||
|
||||
# Nginx metrics, scraped from the nginx-exporter target.
- job_name: "nginx"
  static_configs:
    - targets:
        - "nginx-exporter:9113"
      labels:
        service: "nginx"
        component: "proxy"
|
||||
174
monitoring/prometheus/rules/license-server-alerts.yml
Normale Datei
174
monitoring/prometheus/rules/license-server-alerts.yml
Normale Datei
@@ -0,0 +1,174 @@
|
||||
groups:
|
||||
- name: license_server_alerts
|
||||
interval: 30s
|
||||
rules:
|
||||
# Warns when failed validations exceed 5% of all validations.
- alert: HighLicenseValidationErrorRate
  # Error rate divided by total validation rate, both over a 5-minute window.
  expr: |
    (
      sum(rate(license_validation_errors_total[5m]))
      /
      sum(rate(license_validation_total[5m]))
    ) > 0.05
  # The condition must hold for 5 minutes before the alert fires.
  for: 5m
  labels:
    service: license-server
    severity: warning
  annotations:
    summary: 'High license validation error rate ({{ $value | humanizePercentage }})'
    description: 'License validation error rate is above 5% for the last 5 minutes'
|
||||
|
||||
# Critical alert on a sustained rate of validations labelled
# result="multiple_ips".
- alert: PossibleLicenseAbuse
  # Evaluated per series: any series above 0.1 validations/second triggers.
  expr: |
    rate(license_validation_total{result="multiple_ips"}[5m]) > 0.1
  for: 10m
  labels:
    service: license-server
    severity: critical
  annotations:
    summary: 'Possible license abuse detected'
    description: 'High rate of validations from multiple IPs for same license'
|
||||
|
||||
# Critical alert when the license-server scrape target reports as down.
- alert: LicenseServerDown
  # `up` is 0 when the last scrape of the target failed.
  expr: 'up{job="license-server"} == 0'
  for: 2m
  labels:
    service: license-server
    severity: critical
  annotations:
    summary: 'License server is down'
    description: 'License server has been down for more than 2 minutes'
|
||||
|
||||
# Warns when the 95th-percentile validation latency exceeds 0.5 seconds.
- alert: HighLicenseValidationLatency
  # p95 estimated from the duration histogram, aggregated over all series by
  # bucket boundary (`le`).
  expr: |
    histogram_quantile(0.95,
      sum(rate(license_validation_duration_seconds_bucket[5m])) by (le)
    ) > 0.5
  for: 5m
  labels:
    service: license-server
    severity: warning
  annotations:
    summary: 'High license validation latency'
    description: '95th percentile latency is above 500ms'
|
||||
|
||||
# Critical alert when anomalies of severity "high" or "critical" are detected
# at more than 0.5 per second (summed over all series) for 5 minutes.
- alert: HighAnomalyDetectionRate
  expr: |
    sum(rate(anomaly_detections_total{severity=~"high|critical"}[5m])) > 0.5
  for: 5m
  labels:
    severity: critical
    service: license-server
  annotations:
    # The selector matches both "high" and "critical", so the text names both.
    summary: "High rate of high/critical severity anomalies detected"
    description: "More than 0.5 high or critical severity anomalies per second detected (current: {{ $value }})"
|
||||
|
||||
- name: database_alerts
|
||||
interval: 30s
|
||||
rules:
|
||||
# Critical alert when PostgreSQL backends use more than 90% of max_connections.
- alert: DatabaseConnectionPoolExhausted
  # pg_stat_database_numbackends carries per-database labels that
  # pg_settings_max_connections lacks, so a plain `/` matches no series and the
  # alert could never fire. Backends are summed per instance and matched with
  # `on (instance)`.
  # max_connections is a server-wide limit, so all databases are counted here,
  # not only v2_adminpanel.
  expr: |
    (
      sum by (instance) (pg_stat_database_numbackends)
      /
      on (instance) pg_settings_max_connections
    ) > 0.9
  for: 5m
  labels:
    severity: critical
    service: postgres
  annotations:
    summary: "Database connection pool nearly exhausted"
    description: "PostgreSQL connection usage is above 90%"
|
||||
|
||||
# Warns when the reported replication lag stays above 10 seconds.
- alert: DatabaseReplicationLag
  expr: |
    pg_replication_lag_seconds > 10
  for: 5m
  labels:
    service: postgres
    severity: warning
  annotations:
    summary: 'Database replication lag detected'
    # $value is the lag reported by the series that triggered the alert.
    description: 'Replication lag is {{ $value }} seconds'
|
||||
|
||||
- name: infrastructure_alerts
|
||||
interval: 30s
|
||||
rules:
|
||||
# Warns when an instance's average CPU utilisation exceeds 80% for 10 minutes.
- alert: HighCPUUsage
  # Utilisation = 100 - idle%. `rate` averages over the whole 5m window;
  # `irate` uses only the last two samples, which makes an alert with a `for`
  # clause flap on short spikes and dips.
  expr: |
    (
      100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
    ) > 80
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: "High CPU usage on {{ $labels.instance }}"
    description: "CPU usage is above 80% for 10 minutes"
|
||||
|
||||
# Warns when less than 10% of an instance's memory is available.
- alert: HighMemoryUsage
  # Used fraction = 1 - available / total.
  expr: |
    (
      1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)
    ) > 0.9
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'High memory usage on {{ $labels.instance }}'
    description: 'Memory usage is above 90%'
|
||||
|
||||
# Critical alert when the filesystem mounted at "/" has under 10% space left.
- alert: LowDiskSpace
  # Available bytes divided by total size, for mountpoint "/" only.
  expr: |
    (
      node_filesystem_avail_bytes{mountpoint="/"}
      /
      node_filesystem_size_bytes{mountpoint="/"}
    ) < 0.1
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: 'Low disk space on {{ $labels.instance }}'
    description: 'Less than 10% disk space remaining'
|
||||
|
||||
- name: cache_alerts
|
||||
interval: 30s
|
||||
rules:
|
||||
# Warns on a sustained rate of Redis connection errors.
# NOTE(review): confirm which component exports redis_connection_errors_total;
# if no scraped target exposes it, this rule never matches.
- alert: RedisConnectionErrors
  expr: |
    rate(redis_connection_errors_total[5m]) > 0.1
  for: 5m
  labels:
    service: redis
    severity: warning
  annotations:
    summary: 'Redis connection errors detected'
    description: 'Redis connection error rate is {{ $value }} per second'
|
||||
|
||||
# Warns when the recent Redis keyspace hit ratio drops below 70%.
- alert: LowCacheHitRate
  # The hit/miss metrics are cumulative counters. Dividing the raw values gives
  # the ratio since Redis started, which barely moves when the cache degrades.
  # rate() over 5m measures the current hit ratio instead.
  # With no keyspace traffic the division is NaN and the alert stays silent.
  expr: |
    (
      rate(redis_keyspace_hits_total[5m])
      /
      (rate(redis_keyspace_hits_total[5m]) + rate(redis_keyspace_misses_total[5m]))
    ) < 0.7
  for: 10m
  labels:
    severity: warning
    service: redis
  annotations:
    summary: "Low Redis cache hit rate"
    description: "Cache hit rate is below 70%"
|
||||
In neuem Issue referenzieren
Einen Benutzer sperren