diff --git a/lizenzserver/services/auth/app.py b/lizenzserver/services/auth/app.py index f156468..b560c05 100644 --- a/lizenzserver/services/auth/app.py +++ b/lizenzserver/services/auth/app.py @@ -6,6 +6,7 @@ import jwt from datetime import datetime, timedelta import logging from functools import wraps +from prometheus_flask_exporter import PrometheusMetrics # Add parent directory to path for imports sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) @@ -23,6 +24,10 @@ config = get_config() app.config.from_object(config) CORS(app) +# Initialize Prometheus metrics +metrics = PrometheusMetrics(app) +metrics.info('auth_service_info', 'Auth Service Information', version='1.0.0') + # Initialize repository db_repo = BaseRepository(config.DATABASE_URL) diff --git a/lizenzserver/services/auth/requirements.txt b/lizenzserver/services/auth/requirements.txt index 6e8b018..1c13f39 100644 --- a/lizenzserver/services/auth/requirements.txt +++ b/lizenzserver/services/auth/requirements.txt @@ -5,4 +5,5 @@ psycopg2-binary==2.9.9 redis==5.0.1 python-dotenv==1.0.0 gunicorn==21.2.0 -marshmallow==3.20.1 \ No newline at end of file +marshmallow==3.20.1 +prometheus-flask-exporter==0.23.0 \ No newline at end of file diff --git a/monitoring/README.md b/monitoring/README.md new file mode 100644 index 0000000..1c32722 --- /dev/null +++ b/monitoring/README.md @@ -0,0 +1,272 @@ +# V2 Docker Monitoring Stack + +## Übersicht + +Die Monitoring-Lösung für V2 Docker basiert auf dem Prometheus-Stack und bietet umfassende Einblicke in die Performance und Gesundheit aller Services. + +## Komponenten + +### 1. **Prometheus** (Port 9090) +- Zentrale Metrik-Sammlung +- Konfigurierte Scrape-Jobs für alle Services +- 30 Tage Datenaufbewahrung +- Alert-Rules für kritische Ereignisse + +### 2. 
**Grafana** (Port 3000) +- Visualisierung der Metriken +- Vorkonfigurierte Dashboards +- Alerting-Integration +- Standard-Login: admin/admin (beim ersten Login ändern) + +### 3. **Alertmanager** (Port 9093) +- Alert-Routing und -Gruppierung +- Email-Benachrichtigungen +- Webhook-Integration +- Alert-Silencing und -Inhibition + +### 4. **Exporters** +- **PostgreSQL Exporter**: Datenbank-Metriken +- **Redis Exporter**: Cache-Metriken +- **Node Exporter**: System-Metriken +- **Nginx Exporter**: Proxy-Metriken + +## Installation + +### 1. Monitoring-Stack starten + +```bash +cd monitoring +docker-compose -f docker-compose.monitoring.yml up -d +``` + +### 2. Services überprüfen + +```bash +docker-compose -f docker-compose.monitoring.yml ps +``` + +### 3. Grafana-Zugang + +1. Öffnen Sie https://monitoring.v2-docker.com (oder http://localhost:3000) +2. Login mit admin/admin +3. Neues Passwort setzen +4. Dashboard "License Server Overview" öffnen + +## Konfiguration + +### Environment-Variablen + +Erstellen Sie eine `.env` Datei im monitoring-Verzeichnis: + +```env +# Grafana +GRAFANA_USER=admin +GRAFANA_PASSWORD=secure-password + +# PostgreSQL Connection +POSTGRES_PASSWORD=your-postgres-password + +# Alertmanager SMTP +SMTP_USERNAME=alerts@yourdomain.com +SMTP_PASSWORD=smtp-password + +# Webhook URLs +WEBHOOK_CRITICAL=https://your-webhook-url/critical +WEBHOOK_SECURITY=https://your-webhook-url/security +``` + +### Alert-Konfiguration + +Alerts sind in `prometheus/rules/license-server-alerts.yml` definiert: + +- **HighLicenseValidationErrorRate**: Fehlerrate > 5% +- **PossibleLicenseAbuse**: Verdächtige Aktivitäten +- **LicenseServerDown**: Service nicht erreichbar +- **HighLicenseValidationLatency**: Antwortzeit > 500ms +- **DatabaseConnectionPoolExhausted**: DB-Verbindungen > 90% + +### Neue Alerts hinzufügen + +1. Editieren Sie `prometheus/rules/license-server-alerts.yml` +2. 
Fügen Sie eine neue Alert-Regel hinzu: + +```yaml +- alert: YourAlertName + expr: your_prometheus_query > threshold + for: 5m + labels: + severity: warning + service: your-service + annotations: + summary: "Alert summary" + description: "Detailed description" +``` + +3. Prometheus neu laden: + +```bash +curl -X POST http://localhost:9090/-/reload +``` + +## Dashboards + +### License Server Overview + +Zeigt wichtige Metriken: +- Aktive Lizenzen +- Validierungen pro Sekunde +- Fehlerrate +- Response Time Percentiles +- Anomalie-Erkennung +- Top 10 aktivste Lizenzen + +### Neue Dashboards erstellen + +1. In Grafana einloggen +2. Create → Dashboard +3. Panel hinzufügen +4. Prometheus-Query eingeben +5. Dashboard speichern +6. Export als JSON für Backup + +## Metriken + +### License Server Metriken + +- `license_validation_total`: Anzahl der Validierungen +- `license_validation_duration_seconds`: Validierungs-Dauer +- `active_licenses_total`: Aktive Lizenzen +- `anomaly_detections_total`: Erkannte Anomalien + +### System Metriken + +- `node_cpu_seconds_total`: Kumulierte CPU-Zeit pro Modus in Sekunden (Basis für die CPU-Auslastung) +- `node_memory_MemAvailable_bytes`: Verfügbarer Speicher +- `node_filesystem_avail_bytes`: Verfügbarer Festplattenspeicher + +### Datenbank Metriken + +- `pg_stat_database_numbackends`: Aktive DB-Verbindungen +- `pg_stat_database_tup_fetched`: Abgerufene Tupel +- `pg_stat_database_conflicts`: Konflikte + +## Troubleshooting + +### Prometheus erreicht Service nicht + +1. Netzwerk überprüfen: +```bash +docker network inspect v2_internal_net +``` + +2. Service-Discovery testen: +```bash +docker exec prometheus wget -O- http://license-server:8443/metrics +``` + +### Keine Daten in Grafana + +1. Datasource überprüfen: + - Settings → Data Sources → Prometheus + - Test Connection + +2. Prometheus Targets checken: + - http://localhost:9090/targets + - Alle Targets sollten "UP" sein + +### Alerts werden nicht gesendet + +1. Alertmanager Logs prüfen: +```bash +docker logs alertmanager +``` + +2. 
SMTP-Konfiguration verifizieren +3. Webhook-URLs testen + +## Wartung + +### Backup + +1. Prometheus-Daten: +```bash +docker exec prometheus tar czf /prometheus/backup.tar.gz /prometheus +docker cp prometheus:/prometheus/backup.tar.gz ./backups/ +``` + +2. Grafana-Dashboards: + - Export über UI als JSON + - Speichern in `grafana/dashboards/` + +### Updates + +1. Images updaten: +```bash +docker-compose -f docker-compose.monitoring.yml pull +docker-compose -f docker-compose.monitoring.yml up -d +``` + +2. Konfiguration neu laden: +```bash +# Prometheus +curl -X POST http://localhost:9090/-/reload + +# Alertmanager +curl -X POST http://localhost:9093/-/reload +``` + +## Performance-Optimierung + +### Retention anpassen + +In `docker-compose.monitoring.yml`: +```yaml +command: + - '--storage.tsdb.retention.time=15d' # Reduzieren für weniger Speicher +``` + +### Scrape-Intervalle + +In `prometheus/prometheus.yml`: +```yaml +global: + scrape_interval: 30s # Erhöhen für weniger Last +``` + +### Resource Limits + +Passen Sie die Limits in `docker-compose.monitoring.yml` an Ihre Umgebung an. + +## Sicherheit + +1. **Grafana**: Ändern Sie das Standard-Passwort sofort +2. **Prometheus**: Kein öffentlicher Zugriff (nur intern) +3. **Alertmanager**: Webhook-URLs geheim halten +4. **Exporters**: Nur im internen Netzwerk erreichbar + +## Integration + +### In CI/CD Pipeline + +```bash +# Deployment-Metriken senden +curl -X POST http://prometheus-pushgateway:9091/metrics/job/deployment \ + -d 'deployment_status{version="1.2.3",environment="production"} 1' +``` + +### Custom Metriken + +In Ihrer Anwendung: +```python +from prometheus_client import Counter, Histogram + +custom_metric = Counter('my_custom_total', 'Description') +custom_metric.inc() +``` + +## Support + +Bei Problemen: +1. Logs überprüfen: `docker-compose -f docker-compose.monitoring.yml logs [service]` +2. Dokumentation: https://prometheus.io/docs/ +3. 
Grafana Docs: https://grafana.com/docs/ \ No newline at end of file diff --git a/monitoring/alertmanager/alertmanager.yml b/monitoring/alertmanager/alertmanager.yml new file mode 100644 index 0000000..65835a0 --- /dev/null +++ b/monitoring/alertmanager/alertmanager.yml @@ -0,0 +1,94 @@ +global: + resolve_timeout: 5m + smtp_from: 'alerts@v2-docker.com' + smtp_smarthost: 'smtp.gmail.com:587' + smtp_auth_username: '${SMTP_USERNAME}' + smtp_auth_password: '${SMTP_PASSWORD}' + smtp_require_tls: true + +# Templates for notifications +templates: + - '/etc/alertmanager/templates/*.tmpl' + +# Route tree +route: + group_by: ['alertname', 'cluster', 'service'] + group_wait: 10s + group_interval: 10s + repeat_interval: 12h + receiver: 'default' + + routes: + # Critical alerts + - match: + severity: critical + receiver: 'critical' + continue: true + + # License abuse alerts + - match: + alertname: PossibleLicenseAbuse + receiver: 'security' + repeat_interval: 1h + + # Database alerts + - match: + service: postgres + receiver: 'database' + + # Infrastructure alerts + - match_re: + alertname: ^(HighCPUUsage|HighMemoryUsage|LowDiskSpace)$ + receiver: 'infrastructure' + +# Receivers +receivers: + - name: 'default' + email_configs: + - to: 'admin@v2-docker.com' + headers: + Subject: '[{{ .Status | toUpper }}] {{ .GroupLabels.alertname }}' + html: | +

Alert: {{ .GroupLabels.alertname }}

+

Status: {{ .Status }}

+ {{ range .Alerts }} +
+

Summary: {{ .Annotations.summary }}

+

Description: {{ .Annotations.description }}

+

Labels:

+ + {{ end }} + + - name: 'critical' + email_configs: + - to: 'critical-alerts@v2-docker.com' + send_resolved: true + webhook_configs: + - url: '${WEBHOOK_CRITICAL}' + send_resolved: true + + - name: 'security' + email_configs: + - to: 'security@v2-docker.com' + webhook_configs: + - url: '${WEBHOOK_SECURITY}' + + - name: 'database' + email_configs: + - to: 'dba@v2-docker.com' + + - name: 'infrastructure' + email_configs: + - to: 'ops@v2-docker.com' + +# Inhibition rules +inhibit_rules: + - source_match: + severity: 'critical' + target_match: + severity: 'warning' + equal: ['alertname', 'instance'] \ No newline at end of file diff --git a/monitoring/docker-compose.monitoring.yml b/monitoring/docker-compose.monitoring.yml new file mode 100644 index 0000000..860c8d8 --- /dev/null +++ b/monitoring/docker-compose.monitoring.yml @@ -0,0 +1,149 @@ +version: '3.8' + +services: + prometheus: + image: prom/prometheus:latest + container_name: prometheus + restart: unless-stopped + volumes: + - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml + - ./prometheus/rules:/etc/prometheus/rules + - prometheus_data:/prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--web.console.libraries=/etc/prometheus/console_libraries' + - '--web.console.templates=/etc/prometheus/consoles' + - '--storage.tsdb.retention.time=30d' + - '--web.enable-lifecycle' + networks: + - v2_internal_net + ports: + - "9090:9090" + deploy: + resources: + limits: + cpus: '1' + memory: 2g + + grafana: + image: grafana/grafana:latest + container_name: grafana + restart: unless-stopped + environment: + - GF_SECURITY_ADMIN_USER=${GRAFANA_USER:-admin} + - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_PASSWORD:-admin} + - GF_USERS_ALLOW_SIGN_UP=false + - GF_SERVER_ROOT_URL=https://monitoring.v2-docker.com + - GF_INSTALL_PLUGINS=grafana-clock-panel,grafana-simple-json-datasource,grafana-piechart-panel + volumes: + - grafana_data:/var/lib/grafana + - 
./grafana/provisioning:/etc/grafana/provisioning + - ./grafana/dashboards:/var/lib/grafana/dashboards + networks: + - v2_internal_net + ports: + - "3000:3000" + depends_on: + - prometheus + deploy: + resources: + limits: + cpus: '0.5' + memory: 512m + + alertmanager: + image: prom/alertmanager:latest + container_name: alertmanager + restart: unless-stopped + volumes: + - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml + - alertmanager_data:/alertmanager + command: + - '--config.file=/etc/alertmanager/alertmanager.yml' + - '--storage.path=/alertmanager' + networks: + - v2_internal_net + ports: + - "9093:9093" + deploy: + resources: + limits: + cpus: '0.5' + memory: 256m + + # PostgreSQL Exporter + postgres-exporter: + image: prometheuscommunity/postgres-exporter:latest + container_name: postgres-exporter + restart: unless-stopped + environment: + DATA_SOURCE_NAME: "postgresql://postgres:${POSTGRES_PASSWORD}@postgres:5432/v2_adminpanel?sslmode=disable" + networks: + - v2_internal_net + deploy: + resources: + limits: + cpus: '0.25' + memory: 128m + + # Redis Exporter + redis-exporter: + image: oliver006/redis_exporter:latest + container_name: redis-exporter + restart: unless-stopped + environment: + REDIS_ADDR: "redis://redis:6379" + networks: + - v2_internal_net + deploy: + resources: + limits: + cpus: '0.25' + memory: 128m + + # Node Exporter (for host metrics) + node-exporter: + image: prom/node-exporter:latest + container_name: node-exporter + restart: unless-stopped + volumes: + - /proc:/host/proc:ro + - /sys:/host/sys:ro + - /:/rootfs:ro + command: + - '--path.procfs=/host/proc' + - '--path.rootfs=/rootfs' + - '--path.sysfs=/host/sys' + - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)' + networks: + - v2_internal_net + deploy: + resources: + limits: + cpus: '0.25' + memory: 128m + + # Nginx Exporter + nginx-exporter: + image: nginx/nginx-prometheus-exporter:latest + container_name: nginx-exporter + restart: 
unless-stopped + command: + - '-nginx.scrape-uri=http://nginx-proxy:8080/nginx_status' + networks: + - v2_internal_net + deploy: + resources: + limits: + cpus: '0.25' + memory: 128m + +networks: + v2_internal_net: + external: true + +volumes: + prometheus_data: + grafana_data: + alertmanager_data: \ No newline at end of file diff --git a/monitoring/grafana/dashboards/license-server-dashboard.json b/monitoring/grafana/dashboards/license-server-dashboard.json new file mode 100644 index 0000000..ae21f2a --- /dev/null +++ b/monitoring/grafana/dashboards/license-server-dashboard.json @@ -0,0 +1,562 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.0.0", + "targets": [ + { + "expr": "sum(active_licenses_total)", + "refId": "A" + } + ], + "title": "Active Licenses", + "type": "stat" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 2, + 
"options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.0.0", + "targets": [ + { + "expr": "sum(rate(license_validation_total[5m]))", + "refId": "A" + } + ], + "title": "Validations/sec", + "type": "stat" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.01 + }, + { + "color": "red", + "value": 0.05 + } + ] + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 0 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.0.0", + "targets": [ + { + "expr": "sum(rate(license_validation_errors_total[5m])) / sum(rate(license_validation_total[5m]))", + "refId": "A" + } + ], + "title": "Error Rate", + "type": "stat" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 200 + }, + { + "color": "red", + "value": 500 + } + ] + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 0 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.0.0", + "targets": [ 
+ { + "expr": "histogram_quantile(0.95, sum(rate(license_validation_duration_seconds_bucket[5m])) by (le)) * 1000", + "refId": "A" + } + ], + "title": "95th Percentile Latency", + "type": "stat" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 4 + }, + "id": 5, + "options": { + "tooltip": { + "mode": "single" + }, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + } + }, + "pluginVersion": "8.0.0", + "targets": [ + { + "expr": "sum(rate(license_validation_total{result=\"success\"}[5m]))", + "legendFormat": "Success", + "refId": "A" + }, + { + "expr": "sum(rate(license_validation_total{result=\"invalid\"}[5m]))", + "legendFormat": "Invalid", + "refId": "B" + }, + { + "expr": "sum(rate(license_validation_total{result=\"expired\"}[5m]))", + "legendFormat": "Expired", + "refId": "C" + } + ], + "title": "License Validation Rate", + "type": "timeseries" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + 
}, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 4 + }, + "id": 6, + "options": { + "tooltip": { + "mode": "single" + }, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + } + }, + "pluginVersion": "8.0.0", + "targets": [ + { + "expr": "histogram_quantile(0.50, sum(rate(license_validation_duration_seconds_bucket[5m])) by (le)) * 1000", + "legendFormat": "50th percentile", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(license_validation_duration_seconds_bucket[5m])) by (le)) * 1000", + "legendFormat": "95th percentile", + "refId": "B" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(license_validation_duration_seconds_bucket[5m])) by (le)) * 1000", + "legendFormat": "99th percentile", + "refId": "C" + } + ], + "title": "Response Time Percentiles", + "type": "timeseries" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + 
"unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 12 + }, + "id": 7, + "options": { + "tooltip": { + "mode": "single" + }, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + } + }, + "pluginVersion": "8.0.0", + "targets": [ + { + "expr": "sum(rate(anomaly_detections_total{severity=\"low\"}[5m]))", + "legendFormat": "Low", + "refId": "A" + }, + { + "expr": "sum(rate(anomaly_detections_total{severity=\"medium\"}[5m]))", + "legendFormat": "Medium", + "refId": "B" + }, + { + "expr": "sum(rate(anomaly_detections_total{severity=\"high\"}[5m]))", + "legendFormat": "High", + "refId": "C" + }, + { + "expr": "sum(rate(anomaly_detections_total{severity=\"critical\"}[5m]))", + "legendFormat": "Critical", + "refId": "D" + } + ], + "title": "Anomaly Detection Rate by Severity", + "type": "timeseries" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "displayMode": "auto" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 12 + }, + "id": 8, + "options": { + "showHeader": true + }, + "pluginVersion": "8.0.0", + "targets": [ + { + "expr": "topk(10, sum by (license_id) (rate(license_validation_total[1h])))", + "format": "table", + "instant": true, + "refId": "A" + } + ], + "title": "Top 10 Most Active Licenses (Last Hour)", + "type": "table" + } + ], + "refresh": "10s", + "schemaVersion": 27, + "style": "dark", + "tags": ["license-server", "monitoring"], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "License Server Overview", + "uid": "license-server-overview", + "version": 0 +} \ No newline at end of file diff --git a/monitoring/grafana/provisioning/dashboards/dashboards.yml 
b/monitoring/grafana/provisioning/dashboards/dashboards.yml new file mode 100644 index 0000000..a64800e --- /dev/null +++ b/monitoring/grafana/provisioning/dashboards/dashboards.yml @@ -0,0 +1,12 @@ +apiVersion: 1 + +providers: + - name: 'V2 Docker Dashboards' + orgId: 1 + folder: '' + type: file + disableDeletion: false + updateIntervalSeconds: 10 + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards \ No newline at end of file diff --git a/monitoring/grafana/provisioning/datasources/prometheus.yml b/monitoring/grafana/provisioning/datasources/prometheus.yml new file mode 100644 index 0000000..a308c4c --- /dev/null +++ b/monitoring/grafana/provisioning/datasources/prometheus.yml @@ -0,0 +1,13 @@ +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true + editable: true + jsonData: + timeInterval: 15s + queryTimeout: 60s + httpMethod: POST \ No newline at end of file diff --git a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml new file mode 100644 index 0000000..e6a105a --- /dev/null +++ b/monitoring/prometheus/prometheus.yml @@ -0,0 +1,111 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + external_labels: + monitor: 'v2-docker-monitor' + environment: 'production' + +# Alertmanager configuration +alerting: + alertmanagers: + - static_configs: + - targets: + - alertmanager:9093 + +# Load rules once and periodically evaluate them +rule_files: + - '/etc/prometheus/rules/*.yml' + +# Scrape configurations +scrape_configs: + # Prometheus itself + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + labels: + service: 'prometheus' + + # License Server metrics + - job_name: 'license-server' + metrics_path: '/metrics' + static_configs: + - targets: ['license-server:8443'] + labels: + service: 'license-server' + component: 'api' + + # Auth Service metrics + - job_name: 'auth-service' + metrics_path: '/metrics' + 
static_configs: + - targets: ['auth-service:5001'] + labels: + service: 'auth-service' + component: 'authentication' + + # Analytics Service metrics + - job_name: 'analytics-service' + metrics_path: '/metrics' + static_configs: + - targets: ['analytics-service:5003'] + labels: + service: 'analytics-service' + component: 'analytics' + + # Admin API Service metrics + - job_name: 'admin-api-service' + metrics_path: '/metrics' + static_configs: + - targets: ['admin-api-service:5004'] + labels: + service: 'admin-api-service' + component: 'admin' + + # Admin Panel metrics + - job_name: 'admin-panel' + metrics_path: '/metrics' + static_configs: + - targets: ['admin-panel:5000'] + labels: + service: 'admin-panel' + component: 'ui' + + # PostgreSQL Exporter + - job_name: 'postgres' + static_configs: + - targets: ['postgres-exporter:9187'] + labels: + service: 'postgres' + component: 'database' + + # Redis Exporter + - job_name: 'redis' + static_configs: + - targets: ['redis-exporter:9121'] + labels: + service: 'redis' + component: 'cache' + + # RabbitMQ metrics + - job_name: 'rabbitmq' + static_configs: + - targets: ['rabbitmq:15692'] + labels: + service: 'rabbitmq' + component: 'messaging' + + # Node Exporter for host metrics + - job_name: 'node' + static_configs: + - targets: ['node-exporter:9100'] + labels: + service: 'node-exporter' + component: 'infrastructure' + + # Nginx metrics + - job_name: 'nginx' + static_configs: + - targets: ['nginx-exporter:9113'] + labels: + service: 'nginx' + component: 'proxy' \ No newline at end of file diff --git a/monitoring/prometheus/rules/license-server-alerts.yml b/monitoring/prometheus/rules/license-server-alerts.yml new file mode 100644 index 0000000..a2996d1 --- /dev/null +++ b/monitoring/prometheus/rules/license-server-alerts.yml @@ -0,0 +1,174 @@ +groups: + - name: license_server_alerts + interval: 30s + rules: + # High error rate + - alert: HighLicenseValidationErrorRate + expr: | + ( + 
sum(rate(license_validation_errors_total[5m])) + / + sum(rate(license_validation_total[5m])) + ) > 0.05 + for: 5m + labels: + severity: warning + service: license-server + annotations: + summary: "High license validation error rate ({{ $value | humanizePercentage }})" + description: "License validation error rate is above 5% for the last 5 minutes" + + # License abuse detection + - alert: PossibleLicenseAbuse + expr: | + rate(license_validation_total{result="multiple_ips"}[5m]) > 0.1 + for: 10m + labels: + severity: critical + service: license-server + annotations: + summary: "Possible license abuse detected" + description: "High rate of validations from multiple IPs for same license" + + # Service down + - alert: LicenseServerDown + expr: up{job="license-server"} == 0 + for: 2m + labels: + severity: critical + service: license-server + annotations: + summary: "License server is down" + description: "License server has been down for more than 2 minutes" + + # High response time + - alert: HighLicenseValidationLatency + expr: | + histogram_quantile(0.95, + sum(rate(license_validation_duration_seconds_bucket[5m])) by (le) + ) > 0.5 + for: 5m + labels: + severity: warning + service: license-server + annotations: + summary: "High license validation latency" + description: "95th percentile latency is above 500ms" + + # Anomaly detection + - alert: HighAnomalyDetectionRate + expr: | + sum(rate(anomaly_detections_total{severity=~"high|critical"}[5m])) > 0.5 + for: 5m + labels: + severity: critical + service: license-server + annotations: + summary: "High rate of critical anomalies detected" + description: "More than 0.5 critical anomalies per second detected" + + - name: database_alerts + interval: 30s + rules: + # Database connection pool exhaustion + - alert: DatabaseConnectionPoolExhausted + expr: | + ( + pg_stat_database_numbackends{datname="v2_adminpanel"} + / + pg_settings_max_connections + ) > 0.9 + for: 5m + labels: + severity: critical + service: postgres + 
annotations: + summary: "Database connection pool nearly exhausted" + description: "PostgreSQL connection usage is above 90%" + + # Database replication lag + - alert: DatabaseReplicationLag + expr: | + pg_replication_lag_seconds > 10 + for: 5m + labels: + severity: warning + service: postgres + annotations: + summary: "Database replication lag detected" + description: "Replication lag is {{ $value }} seconds" + + - name: infrastructure_alerts + interval: 30s + rules: + # High CPU usage + - alert: HighCPUUsage + expr: | + ( + 100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) + ) > 80 + for: 10m + labels: + severity: warning + annotations: + summary: "High CPU usage on {{ $labels.instance }}" + description: "CPU usage is above 80% for 10 minutes" + + # High memory usage + - alert: HighMemoryUsage + expr: | + ( + 1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) + ) > 0.9 + for: 5m + labels: + severity: warning + annotations: + summary: "High memory usage on {{ $labels.instance }}" + description: "Memory usage is above 90%" + + # Disk space + - alert: LowDiskSpace + expr: | + ( + node_filesystem_avail_bytes{mountpoint="/"} + / + node_filesystem_size_bytes{mountpoint="/"} + ) < 0.1 + for: 5m + labels: + severity: critical + annotations: + summary: "Low disk space on {{ $labels.instance }}" + description: "Less than 10% disk space remaining" + + - name: cache_alerts + interval: 30s + rules: + # Redis connection errors + - alert: RedisConnectionErrors + expr: | + rate(redis_connection_errors_total[5m]) > 0.1 + for: 5m + labels: + severity: warning + service: redis + annotations: + summary: "Redis connection errors detected" + description: "Redis connection error rate is {{ $value }} per second" + + # Cache hit rate + - alert: LowCacheHitRate + expr: | + ( + redis_keyspace_hits_total + / + (redis_keyspace_hits_total + redis_keyspace_misses_total) + ) < 0.7 + for: 10m + labels: + severity: warning + service: redis + 
annotations: + summary: "Low Redis cache hit rate" + description: "Cache hit rate is below 70%" \ No newline at end of file diff --git a/v2/docker-compose.yaml b/v2/docker-compose.yaml index 80135da..e0fa836 100644 --- a/v2/docker-compose.yaml +++ b/v2/docker-compose.yaml @@ -81,6 +81,30 @@ services: cpus: '2' memory: 4g + auth-service: + build: + context: ../lizenzserver/services/auth + container_name: auth-service + restart: always + # Port 5001 - nur intern erreichbar + env_file: .env + environment: + TZ: Europe/Berlin + DATABASE_URL: postgresql://postgres:${POSTGRES_PASSWORD}@postgres:5432/v2_adminpanel + REDIS_URL: redis://redis:6379/1 + JWT_SECRET: ${JWT_SECRET} + FLASK_ENV: production + depends_on: + - postgres + - redis + networks: + - internal_net + deploy: + resources: + limits: + cpus: '1' + memory: 1g + analytics-service: build: context: ../v2_lizenzserver/services/analytics @@ -166,6 +190,7 @@ services: depends_on: - admin-panel - license-server + - auth-service - analytics-service - admin-api-service networks: diff --git a/v2_adminpanel/app.py b/v2_adminpanel/app.py index 4513b67..2913c87 100644 --- a/v2_adminpanel/app.py +++ b/v2_adminpanel/app.py @@ -10,12 +10,17 @@ from flask import Flask, render_template, session from flask_session import Session from werkzeug.middleware.proxy_fix import ProxyFix from apscheduler.schedulers.background import BackgroundScheduler +from prometheus_flask_exporter import PrometheusMetrics # Import our configuration and utilities import config from utils.backup import create_backup app = Flask(__name__) + +# Initialize Prometheus metrics +metrics = PrometheusMetrics(app) +metrics.info('admin_panel_info', 'Admin Panel Information', version='1.0.0') # Load configuration from config module app.config['SECRET_KEY'] = config.SECRET_KEY app.config['SESSION_TYPE'] = config.SESSION_TYPE diff --git a/v2_adminpanel/requirements.txt b/v2_adminpanel/requirements.txt index 50ebfbb..114a0f5 100644 --- a/v2_adminpanel/requirements.txt 
+++ b/v2_adminpanel/requirements.txt @@ -13,3 +13,4 @@ bcrypt pyotp qrcode[pil] PyJWT +prometheus-flask-exporter diff --git a/v2_lizenzserver/app/core/metrics.py b/v2_lizenzserver/app/core/metrics.py new file mode 100644 index 0000000..21d761a --- /dev/null +++ b/v2_lizenzserver/app/core/metrics.py @@ -0,0 +1,175 @@ +from prometheus_client import Counter, Histogram, Gauge, Info +from functools import wraps +import time + +# License validation metrics +license_validation_total = Counter( + 'license_validation_total', + 'Total number of license validations', + ['result', 'license_type'] +) + +license_validation_errors_total = Counter( + 'license_validation_errors_total', + 'Total number of license validation errors', + ['error_type'] +) + +license_validation_duration_seconds = Histogram( + 'license_validation_duration_seconds', + 'License validation duration in seconds', + buckets=[0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0] +) + +# Active licenses gauge +active_licenses_total = Gauge( + 'active_licenses_total', + 'Total number of active licenses', + ['license_type'] +) + +# Heartbeat metrics +license_heartbeat_total = Counter( + 'license_heartbeat_total', + 'Total number of license heartbeats received' +) + +# Activation metrics +license_activation_total = Counter( + 'license_activation_total', + 'Total number of license activations', + ['result'] +) + +# Anomaly detection metrics +anomaly_detections_total = Counter( + 'anomaly_detections_total', + 'Total number of anomalies detected', + ['anomaly_type', 'severity'] +) + +# Concurrent sessions gauge +concurrent_sessions_total = Gauge( + 'concurrent_sessions_total', + 'Total number of concurrent active sessions' +) + +# Database connection pool metrics +db_connection_pool_size = Gauge( + 'db_connection_pool_size', + 'Database connection pool size' +) + +db_connection_pool_used = Gauge( + 'db_connection_pool_used', + 'Database connections currently in use' +) + +# API client metrics +api_requests_total = 
Counter( + 'api_requests_total', + 'Total number of API requests', + ['method', 'endpoint', 'status'] +) + +api_request_duration_seconds = Histogram( + 'api_request_duration_seconds', + 'API request duration in seconds', + ['method', 'endpoint'], + buckets=[0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0] +) + +# Cache metrics +cache_hits_total = Counter( + 'cache_hits_total', + 'Total number of cache hits', + ['cache_type'] +) + +cache_misses_total = Counter( + 'cache_misses_total', + 'Total number of cache misses', + ['cache_type'] +) + +# System info +system_info = Info( + 'license_server_info', + 'License server information' +) + +def track_request_metrics(method: str, endpoint: str): + """Decorator to track API request metrics""" + def decorator(func): + @wraps(func) + async def wrapper(*args, **kwargs): + start_time = time.time() + status = "success" + + try: + result = await func(*args, **kwargs) + return result + except Exception as e: + status = "error" + raise + finally: + duration = time.time() - start_time + api_requests_total.labels( + method=method, + endpoint=endpoint, + status=status + ).inc() + api_request_duration_seconds.labels( + method=method, + endpoint=endpoint + ).observe(duration) + + return wrapper + return decorator + +def track_validation_metrics(): + """Decorator to track license validation metrics""" + def decorator(func): + @wraps(func) + async def wrapper(*args, **kwargs): + start_time = time.time() + + try: + result = await func(*args, **kwargs) + + # Extract result type from the validation result + if result.get('valid'): + result_type = 'success' + elif result.get('error') == 'expired': + result_type = 'expired' + elif result.get('error') == 'invalid': + result_type = 'invalid' + else: + result_type = 'error' + + license_type = result.get('license_type', 'unknown') + license_validation_total.labels( + result=result_type, + license_type=license_type + ).inc() + + return result + except Exception as e: + 
license_validation_errors_total.labels( + error_type=type(e).__name__ + ).inc() + raise + finally: + duration = time.time() - start_time + license_validation_duration_seconds.observe(duration) + + return wrapper + return decorator + +# Initialize system info +def init_metrics(version: str = "1.0.0"): + """Initialize system metrics""" + system_info.info({ + 'version': version, + 'service': 'license-server' + }) \ No newline at end of file diff --git a/v2_lizenzserver/app/main.py b/v2_lizenzserver/app/main.py index b1a56c2..5f3c446 100644 --- a/v2_lizenzserver/app/main.py +++ b/v2_lizenzserver/app/main.py @@ -1,12 +1,14 @@ from fastapi import FastAPI, Request from fastapi.middleware.cors import CORSMiddleware -from fastapi.responses import JSONResponse +from fastapi.responses import JSONResponse, Response import uvicorn import logging from datetime import datetime +from prometheus_client import generate_latest, CONTENT_TYPE_LATEST from app.api import license, version from app.core.config import settings +from app.core.metrics import init_metrics, track_request_metrics from app.db.database import engine, Base logging.basicConfig(level=logging.INFO) @@ -14,6 +16,9 @@ logger = logging.getLogger(__name__) Base.metadata.create_all(bind=engine) +# Initialize metrics +init_metrics(version="1.0.0") + app = FastAPI( title="License Server API", description="API for software license management", @@ -53,6 +58,11 @@ async def health_check(): "timestamp": datetime.utcnow().isoformat() } +@app.get("/metrics") +async def metrics(): + """Prometheus metrics endpoint""" + return Response(content=generate_latest(), media_type=CONTENT_TYPE_LATEST) + app.include_router(license.router, prefix="/api/license", tags=["license"]) app.include_router(version.router, prefix="/api/version", tags=["version"]) diff --git a/v2_lizenzserver/requirements.txt b/v2_lizenzserver/requirements.txt index f8a3865..bc0a6d0 100644 --- a/v2_lizenzserver/requirements.txt +++ b/v2_lizenzserver/requirements.txt @@ 
-11,4 +11,5 @@ alembic==1.12.1 python-dotenv==1.0.0 httpx==0.25.2 redis==5.0.1 -packaging==23.2 \ No newline at end of file +packaging==23.2 +prometheus-client==0.19.0 \ No newline at end of file diff --git a/v2_lizenzserver/services/admin/app.py b/v2_lizenzserver/services/admin/app.py index 7f6b115..71100ae 100644 --- a/v2_lizenzserver/services/admin/app.py +++ b/v2_lizenzserver/services/admin/app.py @@ -13,6 +13,7 @@ import jwt import uuid from typing import List, Dict, Optional import bcrypt +from prometheus_flask_exporter import PrometheusMetrics logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -20,6 +21,10 @@ logger = logging.getLogger(__name__) app = Flask(__name__) CORS(app) +# Initialize Prometheus metrics +metrics = PrometheusMetrics(app) +metrics.info('admin_api_service_info', 'Admin API Service Information', version='1.0.0') + # Configuration DATABASE_URL = os.environ.get('DATABASE_URL', 'postgresql://postgres:postgres@postgres:5432/v2_adminpanel') REDIS_URL = os.environ.get('REDIS_URL', 'redis://redis:6379/3') diff --git a/v2_lizenzserver/services/admin/requirements.txt b/v2_lizenzserver/services/admin/requirements.txt index 358f86d..951b1c6 100644 --- a/v2_lizenzserver/services/admin/requirements.txt +++ b/v2_lizenzserver/services/admin/requirements.txt @@ -6,4 +6,5 @@ PyJWT==2.8.0 bcrypt==4.1.2 requests==2.31.0 python-dotenv==1.0.0 -gunicorn==21.2.0 \ No newline at end of file +gunicorn==21.2.0 +prometheus-flask-exporter==0.23.0 \ No newline at end of file diff --git a/v2_lizenzserver/services/analytics/app.py b/v2_lizenzserver/services/analytics/app.py index fe72d7d..54a5e06 100644 --- a/v2_lizenzserver/services/analytics/app.py +++ b/v2_lizenzserver/services/analytics/app.py @@ -12,6 +12,7 @@ from functools import wraps import jwt from collections import defaultdict import numpy as np +from prometheus_flask_exporter import PrometheusMetrics logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ 
-19,6 +20,10 @@ logger = logging.getLogger(__name__) app = Flask(__name__) CORS(app) +# Initialize Prometheus metrics +metrics = PrometheusMetrics(app) +metrics.info('analytics_service_info', 'Analytics Service Information', version='1.0.0') + # Configuration DATABASE_URL = os.environ.get('DATABASE_URL', 'postgresql://postgres:postgres@postgres:5432/v2_adminpanel') REDIS_URL = os.environ.get('REDIS_URL', 'redis://redis:6379/2') diff --git a/v2_lizenzserver/services/analytics/requirements.txt b/v2_lizenzserver/services/analytics/requirements.txt index 91228d7..4d5c8b4 100644 --- a/v2_lizenzserver/services/analytics/requirements.txt +++ b/v2_lizenzserver/services/analytics/requirements.txt @@ -6,4 +6,5 @@ PyJWT==2.8.0 numpy==1.26.2 requests==2.31.0 python-dotenv==1.0.0 -gunicorn==21.2.0 \ No newline at end of file +gunicorn==21.2.0 +prometheus-flask-exporter==0.23.0 \ No newline at end of file diff --git a/v2_nginx/nginx.conf b/v2_nginx/nginx.conf index 37dae41..d699e3e 100644 --- a/v2_nginx/nginx.conf +++ b/v2_nginx/nginx.conf @@ -3,6 +3,19 @@ events { } http { + # Enable nginx status page for monitoring + server { + listen 127.0.0.1:8080; + server_name localhost; + + location /nginx_status { + stub_status on; + access_log off; + allow 127.0.0.1; + allow 172.16.0.0/12; # Docker networks + deny all; + } + } # Moderne SSL-Einstellungen für maximale Sicherheit ssl_protocols TLSv1.2 TLSv1.3; ssl_ciphers 'ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384'; @@ -60,6 +73,16 @@ http { proxy_set_header Connection "upgrade"; } + # Auth Service API (internal only) + location /api/v1/auth/ { + proxy_pass http://auth-service:5001/api/v1/auth/; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header 
X-Forwarded-Proto $scheme; + proxy_set_header Authorization $http_authorization; + } + # Analytics Service API (internal only) location /api/v1/analytics/ { proxy_pass http://analytics-service:5003/api/v1/analytics/;