Phase 2 Health-Check tenant-fähig + Historie

- migrations/2026-05-09d_source_health_history.py NEU: source_health_history-Tabelle
  (Append-only Verlauf der Health-Check-Runs mit run_id und archived_at)
- shared/services/source_health.py:
  - tenant_id IS NULL Filter raus -> auch Tenant-Quellen werden gecheckt
  - Mojibake (Triple-Encoded UTF-8) via ftfy gefixt
  - DELETE FROM source_health_checks: vorher Stand mit run_id (uuid4) in
    source_health_history archivieren -> kein Datenverlust mehr
  - User-Agent + Timeout aus config.HEALTH_CHECK_* statt hardcoded
- routers/sources.py /health/run-stream: gleiche Änderungen wie oben
- config.py: HEALTH_CHECK_USER_AGENT + HEALTH_CHECK_TIMEOUT_S ergänzt
Dieser Commit ist enthalten in:
claude-dev
2026-05-09 02:56:49 +00:00
Ursprung 650f8b0342
Commit ca4422ccd1
4 geänderte Dateien mit 376 neuen und 285 gelöschten Zeilen

Datei anzeigen

@@ -0,0 +1,57 @@
"""Migration 2026-05-09d: source_health_history (Verlauf der Health-Checks).
Bislang wurde vor jedem Health-Check-Run die Tabelle source_health_checks geleert
(DELETE FROM source_health_checks). Damit ging die Historie verloren - kein
Trend, keine Vergleichsmöglichkeit über Runs.
Diese Migration legt eine reine Append-Tabelle source_health_history an.
Vor jedem Health-Check-Run wird der aktuelle Stand von source_health_checks
hier archiviert (mit run_id und archived_at).
Ausführung:
DB_PATH=/home/claude-dev/osint-data/osint.db python3 migrations/2026-05-09d_source_health_history.py
DB_PATH=/home/claude-dev/AegisSight-Monitor-staging/data/osint.db python3 migrations/2026-05-09d_source_health_history.py
"""
import os
import sqlite3
import sys
def main(db_path: str) -> int:
    """Create the ``source_health_history`` table and its indexes.

    The migration is idempotent: every statement uses ``IF NOT EXISTS``, so
    running it again on an already migrated database changes nothing.

    Args:
        db_path: Path to an existing SQLite database file.

    Returns:
        Process exit code: ``0`` on success, ``1`` if ``db_path`` does not
        exist.
    """
    # sqlite3.connect() creates a new, empty database when the file is
    # missing, so bail out instead of "migrating" a freshly created wrong file.
    if not os.path.exists(db_path):
        print(f"FEHLER: DB nicht gefunden: {db_path}", file=sys.stderr)
        return 1

    conn = sqlite3.connect(db_path, timeout=60)
    try:
        # Wait up to 60 s for locks held by other connections to the same
        # database file instead of failing immediately.
        conn.execute("PRAGMA busy_timeout = 60000")
        conn.execute("PRAGMA journal_mode = WAL")
        print(f"Migration auf {db_path}")
        conn.executescript("""
            CREATE TABLE IF NOT EXISTS source_health_history (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                run_id TEXT NOT NULL,
                source_id INTEGER NOT NULL,
                check_type TEXT NOT NULL,
                status TEXT NOT NULL,
                message TEXT,
                details TEXT,
                checked_at TIMESTAMP,
                archived_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            );
            CREATE INDEX IF NOT EXISTS idx_source_health_history_run ON source_health_history(run_id);
            CREATE INDEX IF NOT EXISTS idx_source_health_history_source ON source_health_history(source_id, archived_at DESC);
            CREATE INDEX IF NOT EXISTS idx_source_health_history_status ON source_health_history(status, archived_at DESC);
        """)
        print(" + source_health_history + Indizes (idempotent)")
        conn.commit()
    finally:
        # Release the database handle even when a statement above raised;
        # previously the connection was only closed on the success path.
        conn.close()
    print("Migration abgeschlossen.")
    return 0
if __name__ == "__main__":
    # Target database: DB_PATH from the environment, otherwise the dev default.
    sys.exit(main(os.environ.get("DB_PATH", "/home/claude-dev/osint-data/osint.db")))

Datei anzeigen

@@ -48,3 +48,10 @@ MAX_FEEDS_PER_DOMAIN = 3
# Claude model identifiers, one constant per tier name (FAST / MEDIUM / STANDARD).
# NOTE(review): where each tier is used is not visible in this hunk.
CLAUDE_MODEL_FAST = "claude-haiku-4-5-20251001"
CLAUDE_MODEL_MEDIUM = "claude-sonnet-4-6"
CLAUDE_MODEL_STANDARD = "claude-opus-4-7"
# Health check settings (used by shared/services/source_health.py and
# routers/sources.py). Each value can be overridden through the environment
# variable of the same name.
HEALTH_CHECK_USER_AGENT = os.getenv(
    "HEALTH_CHECK_USER_AGENT",
    "Mozilla/5.0 (compatible; AegisSight-HealthCheck/1.0)",
)
# Parsed with float(), so a non-numeric override raises ValueError at import time.
HEALTH_CHECK_TIMEOUT_S = float(os.getenv("HEALTH_CHECK_TIMEOUT_S", "15.0"))

Datei anzeigen

@@ -1,5 +1,6 @@
"""Grundquellen-Verwaltung und Kundenquellen-Übersicht."""
import logging
import uuid
from fastapi import APIRouter, Depends, HTTPException, status, Request
from fastapi.responses import StreamingResponse
@@ -10,6 +11,7 @@ import aiosqlite
from auth import get_current_admin
from database import db_dependency
from audit import log_action, get_client_ip, row_to_dict
from config import HEALTH_CHECK_USER_AGENT, HEALTH_CHECK_TIMEOUT_S
from shared.source_rules import (
discover_source,
discover_all_feeds,
@@ -564,7 +566,7 @@ async def run_health_check_stream(
# Quellen laden
cursor = await db.execute(
"SELECT id, name, url, domain, source_type, article_count, last_seen_at "
"FROM sources WHERE status = 'active' AND tenant_id IS NULL"
"FROM sources WHERE status = 'active'" # tenant + global
)
sources = [dict(row) for row in await cursor.fetchall()]
sources_with_url = [s for s in sources if s["url"]]
@@ -577,6 +579,15 @@ async def run_health_check_stream(
# Phase 1: Erreichbarkeit
yield f"data: {_json.dumps({'phase': 'check', 'checked': 0, 'total': total, 'current': ''})}\n\n"
# Bisherigen Stand archivieren, dann frisch
run_id = uuid.uuid4().hex[:12]
await db.execute(
"INSERT INTO source_health_history "
"(run_id, source_id, check_type, status, message, details, checked_at) "
"SELECT ?, source_id, check_type, status, message, details, checked_at "
"FROM source_health_checks",
(run_id,),
)
await db.execute("DELETE FROM source_health_checks")
await db.commit()
@@ -584,8 +595,8 @@ async def run_health_check_stream(
checked = 0
async with httpx.AsyncClient(
timeout=15.0, follow_redirects=True,
headers={"User-Agent": "Mozilla/5.0 (compatible; OSINT-Monitor/1.0)"},
timeout=HEALTH_CHECK_TIMEOUT_S, follow_redirects=True,
headers={"User-Agent": HEALTH_CHECK_USER_AGENT},
) as client:
for source in sources_with_url:
try:

Datei anzeigen

@@ -1,41 +1,57 @@
"""Quellen-Health-Check Engine - prüft Erreichbarkeit, Feed-Validität, Duplikate."""
"""Quellen-Health-Check Engine - prüft Erreichbarkeit, Feed-Validität, Duplikate."""
import asyncio
import logging
import json
import uuid
from urllib.parse import urlparse
import httpx
import feedparser
import aiosqlite
try:
from config import HEALTH_CHECK_USER_AGENT, HEALTH_CHECK_TIMEOUT_S
except ImportError:
HEALTH_CHECK_USER_AGENT = "Mozilla/5.0 (compatible; AegisSight-HealthCheck/1.0)"
HEALTH_CHECK_TIMEOUT_S = 15.0
logger = logging.getLogger("osint.source_health")
async def run_health_checks(db: aiosqlite.Connection) -> dict:
"""Führt alle Health-Checks für aktive Grundquellen durch."""
"""Führt Health-Checks für alle aktiven Quellen durch (global + Tenant)."""
logger.info("Starte Quellen-Health-Check...")
# Alle aktiven Grundquellen laden
# Alle aktiven Quellen laden (global UND Tenant-spezifisch)
cursor = await db.execute(
"SELECT id, name, url, domain, source_type, article_count, last_seen_at "
"FROM sources WHERE status = 'active' AND tenant_id IS NULL"
"FROM sources WHERE status = 'active' "
)
sources = [dict(row) for row in await cursor.fetchall()]
# Aktuelle Health-Check-Ergebnisse löschen (werden neu geschrieben)
# Bisherigen Stand in History archivieren, dann frisch starten
run_id = uuid.uuid4().hex[:12]
await db.execute(
"INSERT INTO source_health_history "
"(run_id, source_id, check_type, status, message, details, checked_at) "
"SELECT ?, source_id, check_type, status, message, details, checked_at "
"FROM source_health_checks",
(run_id,),
)
await db.execute("DELETE FROM source_health_checks")
await db.commit()
logger.info(f"Health-Check Run {run_id}: vorigen Stand archiviert")
checks_done = 0
issues_found = 0
# 1. Erreichbarkeit + Feed-Validität (nur Quellen mit URL)
# 1. Erreichbarkeit + Feed-Validität (nur Quellen mit URL)
sources_with_url = [s for s in sources if s["url"]]
async with httpx.AsyncClient(
timeout=15.0,
timeout=HEALTH_CHECK_TIMEOUT_S,
follow_redirects=True,
headers={"User-Agent": "Mozilla/5.0 (compatible; OSINT-Monitor/1.0)"},
headers={"User-Agent": HEALTH_CHECK_USER_AGENT},
) as client:
for i in range(0, len(sources_with_url), 5):
batch = sources_with_url[i:i + 5]
@@ -46,7 +62,7 @@ async def run_health_checks(db: aiosqlite.Connection) -> dict:
if isinstance(result, Exception):
await _save_check(
db, source["id"], "reachability", "error",
f"Prüfung fehlgeschlagen: {result}",
f"Prüfung fehlgeschlagen: {result}",
)
issues_found += 1
else:
@@ -83,7 +99,7 @@ async def run_health_checks(db: aiosqlite.Connection) -> dict:
await db.commit()
logger.info(
f"Health-Check abgeschlossen: {checks_done} Quellen geprüft, "
f"Health-Check abgeschlossen: {checks_done} Quellen geprüft, "
f"{issues_found} Probleme gefunden"
)
return {"checked": checks_done, "issues": issues_found}
@@ -92,7 +108,7 @@ async def run_health_checks(db: aiosqlite.Connection) -> dict:
async def _check_source_reachability(
client: httpx.AsyncClient, source: dict,
) -> list[dict]:
"""Prüft Erreichbarkeit und Feed-Validität einer Quelle."""
"""Prüft Erreichbarkeit und Feed-Validität einer Quelle."""
checks = []
url = source["url"]
@@ -125,14 +141,14 @@ async def _check_source_reachability(
"message": "Erreichbar",
})
# Feed-Validität nur für RSS-Feeds
# Feed-Validität nur für RSS-Feeds
if source["source_type"] == "rss_feed":
text = resp.text[:20000]
if "<rss" not in text and "<feed" not in text and "<channel" not in text:
checks.append({
"type": "feed_validity",
"status": "error",
"message": "Kein gültiger RSS/Atom-Feed",
"message": "Kein gültiger RSS/Atom-Feed",
})
else:
feed = await asyncio.to_thread(feedparser.parse, text)
@@ -155,7 +171,7 @@ async def _check_source_reachability(
checks.append({
"type": "feed_validity",
"status": "ok",
"message": f"Feed gültig ({len(feed.entries)} Einträge)",
"message": f"Feed gültig ({len(feed.entries)} Einträge)",
})
except httpx.TimeoutException:
@@ -181,7 +197,7 @@ async def _check_source_reachability(
def _check_stale(source: dict) -> dict | None:
"""Prüft ob eine Quelle veraltet ist (keine Artikel seit >30 Tagen)."""
"""Prüft ob eine Quelle veraltet ist (keine Artikel seit >30 Tagen)."""
if source["source_type"] == "excluded":
return None
@@ -249,7 +265,7 @@ async def _save_check(
async def get_health_summary(db: aiosqlite.Connection) -> dict:
"""Gibt eine Zusammenfassung der letzten Health-Check-Ergebnisse zurück."""
"""Gibt eine Zusammenfassung der letzten Health-Check-Ergebnisse zurück."""
cursor = await db.execute("""
SELECT
h.id, h.source_id, s.name, s.domain, s.url, s.source_type,