Hetzner-Backup/monitoring/prometheus/rules/license-server-alerts.yml

groups:
  - name: license_server_alerts
    interval: 30s
    rules:
      # Validation error ratio: failed validations divided by all validations,
      # both as 5m rates summed over every series. Fires once the ratio has
      # stayed above 0.05 (5%) for 5 minutes.
      - alert: HighLicenseValidationErrorRate
        expr: |
          sum(rate(license_validation_errors_total[5m]))
            / sum(rate(license_validation_total[5m]))
          > 0.05
        for: 5m
        labels:
          service: license-server
          severity: warning
        annotations:
          summary: 'High license validation error rate ({{ $value | humanizePercentage }})'
          description: 'License validation error rate is above 5% for the last 5 minutes'

      # Abuse heuristic: validations recorded with result="multiple_ips"
      # arriving faster than 0.1/s (5m rate, evaluated per series),
      # sustained for 10 minutes.
      - alert: PossibleLicenseAbuse
        expr: rate(license_validation_total{result="multiple_ips"}[5m]) > 0.1
        for: 10m
        labels:
          service: license-server
          severity: critical
        annotations:
          summary: 'Possible license abuse detected'
          description: 'High rate of validations from multiple IPs for same license'

      # Service down.
      # Fires when the license-server scrape target reports up == 0, OR when no
      # `up` series exists for the job at all. The absent() branch covers the
      # case where the target vanishes from service discovery: `up == 0` alone
      # returns an empty result then, and the alert would stay silent.
      # Note: the absent() branch carries only the `job` label (no `instance`).
      - alert: LicenseServerDown
        expr: |
          up{job="license-server"} == 0
          or
          absent(up{job="license-server"})
        for: 2m
        labels:
          severity: critical
          service: license-server
        annotations:
          summary: "License server is down"
          description: "License server has been down or missing from scrape targets for more than 2 minutes"

      # Latency check: 95th percentile of the validation duration histogram,
      # computed from 5m bucket rates aggregated per `le` bucket.
      # Fires when p95 stays above 0.5s for 5 minutes.
      - alert: HighLicenseValidationLatency
        expr: |
          histogram_quantile(
            0.95,
            sum by (le) (rate(license_validation_duration_seconds_bucket[5m]))
          ) > 0.5
        for: 5m
        labels:
          service: license-server
          severity: warning
        annotations:
          summary: 'High license validation latency'
          description: '95th percentile latency is above 500ms'

      # Anomaly detection.
      # Sums the 5m rate of anomaly detections whose `severity` label is
      # "high" or "critical" and fires when the total exceeds 0.5 per second
      # for 5 minutes. The annotations name both severities so the alert text
      # matches what the selector actually counts.
      - alert: HighAnomalyDetectionRate
        expr: |
          sum(rate(anomaly_detections_total{severity=~"high|critical"}[5m])) > 0.5
        for: 5m
        labels:
          severity: critical
          service: license-server
        annotations:
          summary: "High rate of high/critical severity anomalies detected"
          description: "More than 0.5 high or critical severity anomalies per second detected"

  - name: database_alerts
    interval: 30s
    rules:
      # Database connection pool exhaustion.
      # Ratio of backends connected to the v2_adminpanel database to the
      # server's max_connections setting; fires above 90% for 5 minutes.
      #
      # pg_stat_database_numbackends carries per-database labels (e.g.
      # datname) that pg_settings_max_connections lacks. Without explicit
      # vector matching the division finds no matching pairs and returns
      # nothing. The left side is therefore reduced to one series per
      # instance and matched on `instance` only.
      # NOTE(review): assumes one PostgreSQL server per exporter instance, so
      # pg_settings_max_connections has a single series per instance - confirm.
      - alert: DatabaseConnectionPoolExhausted
        expr: |
          (
            sum by (instance) (pg_stat_database_numbackends{datname="v2_adminpanel"})
            / on (instance)
            pg_settings_max_connections
          ) > 0.9
        for: 5m
        labels:
          severity: critical
          service: postgres
        annotations:
          summary: "Database connection pool nearly exhausted"
          description: "PostgreSQL connection usage is above 90%"

      # Replica lag: fires when pg_replication_lag_seconds stays above 10
      # for 5 minutes. The description reports the measured value.
      - alert: DatabaseReplicationLag
        expr: pg_replication_lag_seconds > 10
        for: 5m
        labels:
          service: postgres
          severity: warning
        annotations:
          summary: 'Database replication lag detected'
          description: 'Replication lag is {{ $value }} seconds'

  - name: infrastructure_alerts
    interval: 30s
    rules:
      # High CPU usage.
      # CPU busy percentage per instance, derived as 100 minus the average
      # idle fraction across CPUs; fires above 80% for 10 minutes.
      #
      # Uses rate() rather than irate(): irate() looks only at the last two
      # samples in the window, so a single quiet scrape can drop the value
      # below the threshold and reset the `for:` timer. rate() averages over
      # the whole 5m window, which is what a sustained-load alert needs.
      - alert: HighCPUUsage
        expr: |
          (
            100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
          ) > 80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is above 80% for 10 minutes"

      # Memory pressure: used fraction is 1 - MemAvailable / MemTotal.
      # Fires when it stays above 0.9 (90%) for 5 minutes.
      - alert: HighMemoryUsage
        expr: |
          (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High memory usage on {{ $labels.instance }}'
          description: 'Memory usage is above 90%'

      # Root filesystem headroom: available bytes / total bytes for the
      # filesystem mounted at "/". Fires when the free fraction stays
      # below 0.1 (10%) for 5 minutes.
      - alert: LowDiskSpace
        expr: |
          node_filesystem_avail_bytes{mountpoint="/"}
            / node_filesystem_size_bytes{mountpoint="/"}
          < 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Low disk space on {{ $labels.instance }}'
          description: 'Less than 10% disk space remaining'

  - name: cache_alerts
    interval: 30s
    rules:
      # Redis connectivity: 5m rate of redis_connection_errors_total above
      # 0.1 errors/s, sustained for 5 minutes. The description reports the
      # measured rate.
      - alert: RedisConnectionErrors
        expr: rate(redis_connection_errors_total[5m]) > 0.1
        for: 5m
        labels:
          service: redis
          severity: warning
        annotations:
          summary: 'Redis connection errors detected'
          description: 'Redis connection error rate is {{ $value }} per second'

      # Cache hit rate.
      # Hit ratio = hits / (hits + misses), computed from 5m rates of the
      # keyspace counters; fires when it stays below 70% for 10 minutes.
      #
      # The counters are cumulative, so dividing their raw values gives the
      # lifetime hit ratio, which barely moves after long uptime and hides
      # recent degradation. rate() restricts the ratio to the last 5 minutes.
      # With no traffic the ratio is 0/0 = NaN, which does not satisfy the
      # comparison, so an idle cache does not alert.
      - alert: LowCacheHitRate
        expr: |
          (
            rate(redis_keyspace_hits_total[5m])
            /
            (rate(redis_keyspace_hits_total[5m]) + rate(redis_keyspace_misses_total[5m]))
          ) < 0.7
        for: 10m
        labels:
          severity: warning
          service: redis
        annotations:
          summary: "Low Redis cache hit rate"
          description: "Cache hit rate is below 70%"