Lizenzserver ist fertig

Dieser Commit ist enthalten in:
2025-06-18 23:22:38 +02:00
Ursprung 6d1a52b7e3
Commit 7017549fcd
21 geänderte Dateien mit 1650 neuen und 5 gelöschten Zeilen

Datei anzeigen

@@ -0,0 +1,111 @@
# Server-wide defaults: how often rules are evaluated and targets are
# scraped unless a rule group or scrape job overrides the interval.
global:
  evaluation_interval: 15s
  scrape_interval: 15s
  # Labels attached to series and alerts when this server communicates
  # with external systems such as Alertmanager.
  external_labels:
    monitor: v2-docker-monitor
    environment: production
# Alertmanager configuration: firing alerts are sent to a single,
# statically configured Alertmanager endpoint.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'alertmanager:9093'
# Rule files to load. Every file matching the glob is read, and the rule
# groups it defines are evaluated periodically.
rule_files:
  - '/etc/prometheus/rules/*.yml'
# Scrape configurations: one job per monitored service. Every job uses a
# single static target and tags its series with `service` (and, for all
# jobs except `prometheus`, `component`) labels.
scrape_configs:
  # Prometheus itself
  - job_name: prometheus
    static_configs:
      - targets:
          - 'localhost:9090'
        labels:
          service: prometheus

  # License Server metrics
  # NOTE(review): no `scheme` is set, so the Prometheus default applies;
  # port 8443 looks like a TLS port - confirm the endpoint is reachable
  # with the default scheme.
  - job_name: license-server
    metrics_path: /metrics
    static_configs:
      - targets:
          - 'license-server:8443'
        labels:
          service: license-server
          component: api

  # Auth Service metrics
  - job_name: auth-service
    metrics_path: /metrics
    static_configs:
      - targets:
          - 'auth-service:5001'
        labels:
          service: auth-service
          component: authentication

  # Analytics Service metrics
  - job_name: analytics-service
    metrics_path: /metrics
    static_configs:
      - targets:
          - 'analytics-service:5003'
        labels:
          service: analytics-service
          component: analytics

  # Admin API Service metrics
  - job_name: admin-api-service
    metrics_path: /metrics
    static_configs:
      - targets:
          - 'admin-api-service:5004'
        labels:
          service: admin-api-service
          component: admin

  # Admin Panel metrics
  - job_name: admin-panel
    metrics_path: /metrics
    static_configs:
      - targets:
          - 'admin-panel:5000'
        labels:
          service: admin-panel
          component: ui

  # PostgreSQL Exporter
  - job_name: postgres
    static_configs:
      - targets:
          - 'postgres-exporter:9187'
        labels:
          service: postgres
          component: database

  # Redis Exporter
  - job_name: redis
    static_configs:
      - targets:
          - 'redis-exporter:9121'
        labels:
          service: redis
          component: cache

  # RabbitMQ metrics
  - job_name: rabbitmq
    static_configs:
      - targets:
          - 'rabbitmq:15692'
        labels:
          service: rabbitmq
          component: messaging

  # Node Exporter for host metrics
  - job_name: node
    static_configs:
      - targets:
          - 'node-exporter:9100'
        labels:
          service: node-exporter
          component: infrastructure

  # Nginx metrics
  - job_name: nginx
    static_configs:
      - targets:
          - 'nginx-exporter:9113'
        labels:
          service: nginx
          component: proxy

Datei anzeigen

@@ -0,0 +1,174 @@
groups:
# Alerts on the license server's own metrics. The group is evaluated
# every 30s.
- name: license_server_alerts
  interval: 30s
  rules:
  # High error rate: ratio of the 5m error rate to the 5m total validation
  # rate (both summed over all series) stays above 5% for 5 minutes.
  - alert: HighLicenseValidationErrorRate
    expr: |
      (
        sum(rate(license_validation_errors_total[5m]))
        /
        sum(rate(license_validation_total[5m]))
      ) > 0.05
    for: 5m
    labels:
      severity: warning
      service: license-server
    annotations:
      summary: "High license validation error rate ({{ $value | humanizePercentage }})"
      description: "License validation error rate is above 5% for the last 5 minutes"

  # License abuse detection: validations labelled result="multiple_ips"
  # arrive faster than 0.1/s (per series) for 10 minutes.
  - alert: PossibleLicenseAbuse
    expr: |
      rate(license_validation_total{result="multiple_ips"}[5m]) > 0.1
    for: 10m
    labels:
      severity: critical
      service: license-server
    annotations:
      summary: "Possible license abuse detected"
      description: "High rate of validations from multiple IPs for same license"

  # Service down: the `license-server` scrape job has reported up == 0
  # for 2 minutes.
  - alert: LicenseServerDown
    expr: up{job="license-server"} == 0
    for: 2m
    labels:
      severity: critical
      service: license-server
    annotations:
      summary: "License server is down"
      description: "License server has been down for more than 2 minutes"

  # High response time: 95th percentile of the validation duration
  # histogram (aggregated over all series, keeping only `le`) exceeds
  # 0.5 seconds for 5 minutes.
  - alert: HighLicenseValidationLatency
    expr: |
      histogram_quantile(0.95,
        sum(rate(license_validation_duration_seconds_bucket[5m])) by (le)
      ) > 0.5
    for: 5m
    labels:
      severity: warning
      service: license-server
    annotations:
      summary: "High license validation latency"
      description: "95th percentile latency is above 500ms"

  # Anomaly detection: combined rate of anomalies whose severity label is
  # "high" or "critical" exceeds 0.5/s for 5 minutes. The annotations name
  # both severities so they match what the selector actually counts.
  - alert: HighAnomalyDetectionRate
    expr: |
      sum(rate(anomaly_detections_total{severity=~"high|critical"}[5m])) > 0.5
    for: 5m
    labels:
      severity: critical
      service: license-server
    annotations:
      summary: "High rate of high/critical anomalies detected"
      description: "More than 0.5 high- or critical-severity anomalies per second detected"
# Alerts on PostgreSQL exporter metrics. The group is evaluated every 30s.
- name: database_alerts
  interval: 30s
  rules:
  # Database connection pool exhaustion: backends connected to the
  # v2_adminpanel database exceed 90% of max_connections for 5 minutes.
  #
  # The two metrics carry different label sets (the backend count is per
  # database, the setting is per server), so a bare `/` would find no
  # matching pairs and the alert could never fire. The left side is
  # aggregated down to `instance` and the division is matched on
  # `instance` only.
  # NOTE(review): assumes both metrics come from the same exporter target
  # and therefore share the `instance` label - confirm against the live
  # series. max_connections is a server-wide limit, so consider dropping
  # the datname filter to count connections to all databases.
  - alert: DatabaseConnectionPoolExhausted
    expr: |
      (
        sum by (instance) (pg_stat_database_numbackends{datname="v2_adminpanel"})
        / on (instance)
        pg_settings_max_connections
      ) > 0.9
    for: 5m
    labels:
      severity: critical
      service: postgres
    annotations:
      summary: "Database connection pool nearly exhausted"
      description: "PostgreSQL connection usage is above 90%"

  # Database replication lag: reported lag stays above 10 seconds for
  # 5 minutes.
  # NOTE(review): the metric name depends on the exporter version/queries
  # in use - verify that pg_replication_lag_seconds is actually exported.
  - alert: DatabaseReplicationLag
    expr: |
      pg_replication_lag_seconds > 10
    for: 5m
    labels:
      severity: warning
      service: postgres
    annotations:
      summary: "Database replication lag detected"
      description: "Replication lag is {{ $value }} seconds"
# Host-level alerts on node exporter metrics. The group is evaluated
# every 30s.
- name: infrastructure_alerts
  interval: 30s
  rules:
  # High CPU usage: 100 minus the average idle percentage across all CPUs
  # of an instance exceeds 80 for 10 minutes.
  # Uses rate() rather than irate(): irate() only looks at the last two
  # samples, so a brief dip would reset the `for` timer and the alert
  # would rarely survive the 10-minute hold.
  - alert: HighCPUUsage
    expr: |
      (
        100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
      ) > 80
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "High CPU usage on {{ $labels.instance }}"
      description: "CPU usage is above 80% for 10 minutes"

  # High memory usage: less than 10% of total memory is available for
  # 5 minutes.
  - alert: HighMemoryUsage
    expr: |
      (
        1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)
      ) > 0.9
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High memory usage on {{ $labels.instance }}"
      description: "Memory usage is above 90%"

  # Disk space: available bytes on the root mountpoint are below 10% of
  # the filesystem size for 5 minutes.
  - alert: LowDiskSpace
    expr: |
      (
        node_filesystem_avail_bytes{mountpoint="/"}
        /
        node_filesystem_size_bytes{mountpoint="/"}
      ) < 0.1
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Low disk space on {{ $labels.instance }}"
      description: "Less than 10% disk space remaining"
# Alerts on Redis metrics. The group is evaluated every 30s.
- name: cache_alerts
  interval: 30s
  rules:
  # Redis connection errors: error counter grows faster than 0.1/s for
  # 5 minutes.
  # NOTE(review): confirm which component exports
  # redis_connection_errors_total; it may be an application-side metric
  # rather than one from the Redis exporter.
  - alert: RedisConnectionErrors
    expr: |
      rate(redis_connection_errors_total[5m]) > 0.1
    for: 5m
    labels:
      severity: warning
      service: redis
    annotations:
      summary: "Redis connection errors detected"
      description: "Redis connection error rate is {{ $value }} per second"

  # Cache hit rate: share of keyspace lookups that hit, over the last
  # 5 minutes, stays below 70% for 10 minutes.
  # The hit/miss metrics are cumulative counters, so dividing their raw
  # values yields the ratio since Redis started and hides recent
  # degradation; rate() restricts the ratio to the recent window. With no
  # lookups at all the ratio is NaN and the alert stays silent.
  - alert: LowCacheHitRate
    expr: |
      (
        rate(redis_keyspace_hits_total[5m])
        /
        (rate(redis_keyspace_hits_total[5m]) + rate(redis_keyspace_misses_total[5m]))
      ) < 0.7
    for: 10m
    labels:
      severity: warning
      service: redis
    annotations:
      summary: "Low Redis cache hit rate"
      description: "Cache hit rate is below 70%"