Phase 2 Health-Check tenant-fähig + Historie

- migrations/2026-05-09d_source_health_history.py NEU: source_health_history-Tabelle
  (Append-only Verlauf der Health-Check-Runs mit run_id und archived_at)
- shared/services/source_health.py:
  - Filter "tenant_id IS NULL" entfernt -> auch Tenant-Quellen werden geprüft
  - Mojibake (Triple-Encoded UTF-8) via ftfy gefixt
  - Vor dem DELETE FROM source_health_checks wird der bisherige Stand mit einer
    run_id (erste 12 Hex-Zeichen einer uuid4) in source_health_history archiviert
    -> kein Datenverlust mehr
  - User-Agent + Timeout aus config.HEALTH_CHECK_* statt hardcoded
- routers/sources.py /health/run-stream: gleiche Änderungen wie oben
- config.py: HEALTH_CHECK_USER_AGENT + HEALTH_CHECK_TIMEOUT_S ergänzt
Dieser Commit ist enthalten in:
claude-dev
2026-05-09 02:56:49 +00:00
Ursprung 650f8b0342
Commit ca4422ccd1
4 geänderte Dateien mit 376 neuen und 285 gelöschten Zeilen

Datei anzeigen

@@ -0,0 +1,57 @@
"""Migration 2026-05-09d: source_health_history (Verlauf der Health-Checks).
Bislang wurde vor jedem Health-Check-Run die Tabelle source_health_checks geleert
(DELETE FROM source_health_checks). Damit ging die Historie verloren - kein
Trend, keine Vergleichsmöglichkeit über Runs.
Diese Migration legt eine reine Append-Tabelle source_health_history an.
Vor jedem Health-Check-Run wird der aktuelle Stand von source_health_checks
hier archiviert (mit run_id und archived_at).
Ausführung:
DB_PATH=/home/claude-dev/osint-data/osint.db python3 migrations/2026-05-09d_source_health_history.py
DB_PATH=/home/claude-dev/AegisSight-Monitor-staging/data/osint.db python3 migrations/2026-05-09d_source_health_history.py
"""
import os
import sqlite3
import sys
def main(db_path: str) -> int:
    """Create the ``source_health_history`` table and its indexes.

    Every DDL statement uses ``IF NOT EXISTS``, so the migration can be run
    repeatedly against the same database without error.

    Args:
        db_path: Path of the SQLite database file to migrate. The file must
            already exist; a missing file is reported and nothing is created.

    Returns:
        ``0`` on success, ``1`` if ``db_path`` does not exist.

    Raises:
        sqlite3.Error: If SQLite rejects a statement. The connection is
            closed before the exception propagates.
    """
    if not os.path.exists(db_path):
        print(f"FEHLER: DB nicht gefunden: {db_path}", file=sys.stderr)
        return 1

    conn = sqlite3.connect(db_path, timeout=60)
    try:
        # Wait up to 60 s on a locked database instead of failing right away.
        conn.execute("PRAGMA busy_timeout = 60000")
        conn.execute("PRAGMA journal_mode = WAL")
        print(f"Migration auf {db_path}")
        conn.executescript("""
            CREATE TABLE IF NOT EXISTS source_health_history (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                run_id TEXT NOT NULL,
                source_id INTEGER NOT NULL,
                check_type TEXT NOT NULL,
                status TEXT NOT NULL,
                message TEXT,
                details TEXT,
                checked_at TIMESTAMP,
                archived_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            );
            CREATE INDEX IF NOT EXISTS idx_source_health_history_run ON source_health_history(run_id);
            CREATE INDEX IF NOT EXISTS idx_source_health_history_source ON source_health_history(source_id, archived_at DESC);
            CREATE INDEX IF NOT EXISTS idx_source_health_history_status ON source_health_history(status, archived_at DESC);
        """)
        print("  + source_health_history + Indizes (idempotent)")
        conn.commit()
    finally:
        # Release the database handle even when a statement above fails.
        conn.close()

    print("Migration abgeschlossen.")
    return 0
if __name__ == "__main__":
    # The target database is taken from DB_PATH; the path below is the fallback.
    sys.exit(main(os.environ.get("DB_PATH", "/home/claude-dev/osint-data/osint.db")))

Datei anzeigen

@@ -48,3 +48,10 @@ MAX_FEEDS_PER_DOMAIN = 3
CLAUDE_MODEL_FAST = "claude-haiku-4-5-20251001" CLAUDE_MODEL_FAST = "claude-haiku-4-5-20251001"
CLAUDE_MODEL_MEDIUM = "claude-sonnet-4-6" CLAUDE_MODEL_MEDIUM = "claude-sonnet-4-6"
CLAUDE_MODEL_STANDARD = "claude-opus-4-7" CLAUDE_MODEL_STANDARD = "claude-opus-4-7"
# Health check settings (used by shared/services/source_health.py and routers/sources.py).
# Each value is read from the environment variable of the same name, with a built-in fallback.
HEALTH_CHECK_USER_AGENT = os.getenv(
    "HEALTH_CHECK_USER_AGENT",
    default="Mozilla/5.0 (compatible; AegisSight-HealthCheck/1.0)",
)
HEALTH_CHECK_TIMEOUT_S = float(os.getenv("HEALTH_CHECK_TIMEOUT_S", default="15.0"))

Datei anzeigen

@@ -1,5 +1,6 @@
"""Grundquellen-Verwaltung und Kundenquellen-Übersicht.""" """Grundquellen-Verwaltung und Kundenquellen-Übersicht."""
import logging import logging
import uuid
from fastapi import APIRouter, Depends, HTTPException, status, Request from fastapi import APIRouter, Depends, HTTPException, status, Request
from fastapi.responses import StreamingResponse from fastapi.responses import StreamingResponse
@@ -10,6 +11,7 @@ import aiosqlite
from auth import get_current_admin from auth import get_current_admin
from database import db_dependency from database import db_dependency
from audit import log_action, get_client_ip, row_to_dict from audit import log_action, get_client_ip, row_to_dict
from config import HEALTH_CHECK_USER_AGENT, HEALTH_CHECK_TIMEOUT_S
from shared.source_rules import ( from shared.source_rules import (
discover_source, discover_source,
discover_all_feeds, discover_all_feeds,
@@ -564,7 +566,7 @@ async def run_health_check_stream(
# Quellen laden # Quellen laden
cursor = await db.execute( cursor = await db.execute(
"SELECT id, name, url, domain, source_type, article_count, last_seen_at " "SELECT id, name, url, domain, source_type, article_count, last_seen_at "
"FROM sources WHERE status = 'active' AND tenant_id IS NULL" "FROM sources WHERE status = 'active'" # tenant + global
) )
sources = [dict(row) for row in await cursor.fetchall()] sources = [dict(row) for row in await cursor.fetchall()]
sources_with_url = [s for s in sources if s["url"]] sources_with_url = [s for s in sources if s["url"]]
@@ -577,6 +579,15 @@ async def run_health_check_stream(
# Phase 1: Erreichbarkeit # Phase 1: Erreichbarkeit
yield f"data: {_json.dumps({'phase': 'check', 'checked': 0, 'total': total, 'current': ''})}\n\n" yield f"data: {_json.dumps({'phase': 'check', 'checked': 0, 'total': total, 'current': ''})}\n\n"
# Bisherigen Stand archivieren, dann frisch
run_id = uuid.uuid4().hex[:12]
await db.execute(
"INSERT INTO source_health_history "
"(run_id, source_id, check_type, status, message, details, checked_at) "
"SELECT ?, source_id, check_type, status, message, details, checked_at "
"FROM source_health_checks",
(run_id,),
)
await db.execute("DELETE FROM source_health_checks") await db.execute("DELETE FROM source_health_checks")
await db.commit() await db.commit()
@@ -584,8 +595,8 @@ async def run_health_check_stream(
checked = 0 checked = 0
async with httpx.AsyncClient( async with httpx.AsyncClient(
timeout=15.0, follow_redirects=True, timeout=HEALTH_CHECK_TIMEOUT_S, follow_redirects=True,
headers={"User-Agent": "Mozilla/5.0 (compatible; OSINT-Monitor/1.0)"}, headers={"User-Agent": HEALTH_CHECK_USER_AGENT},
) as client: ) as client:
for source in sources_with_url: for source in sources_with_url:
try: try:

Datei anzeigen

@@ -1,41 +1,57 @@
"""Quellen-Health-Check Engine - prüft Erreichbarkeit, Feed-Validität, Duplikate.""" """Quellen-Health-Check Engine - prüft Erreichbarkeit, Feed-Validität, Duplikate."""
import asyncio import asyncio
import logging import logging
import json import json
import uuid
from urllib.parse import urlparse from urllib.parse import urlparse
import httpx import httpx
import feedparser import feedparser
import aiosqlite import aiosqlite
try:
from config import HEALTH_CHECK_USER_AGENT, HEALTH_CHECK_TIMEOUT_S
except ImportError:
HEALTH_CHECK_USER_AGENT = "Mozilla/5.0 (compatible; AegisSight-HealthCheck/1.0)"
HEALTH_CHECK_TIMEOUT_S = 15.0
logger = logging.getLogger("osint.source_health") logger = logging.getLogger("osint.source_health")
async def run_health_checks(db: aiosqlite.Connection) -> dict: async def run_health_checks(db: aiosqlite.Connection) -> dict:
"""Führt alle Health-Checks für aktive Grundquellen durch.""" """Führt Health-Checks für alle aktiven Quellen durch (global + Tenant)."""
logger.info("Starte Quellen-Health-Check...") logger.info("Starte Quellen-Health-Check...")
# Alle aktiven Grundquellen laden # Alle aktiven Quellen laden (global UND Tenant-spezifisch)
cursor = await db.execute( cursor = await db.execute(
"SELECT id, name, url, domain, source_type, article_count, last_seen_at " "SELECT id, name, url, domain, source_type, article_count, last_seen_at "
"FROM sources WHERE status = 'active' AND tenant_id IS NULL" "FROM sources WHERE status = 'active' "
) )
sources = [dict(row) for row in await cursor.fetchall()] sources = [dict(row) for row in await cursor.fetchall()]
# Aktuelle Health-Check-Ergebnisse löschen (werden neu geschrieben) # Bisherigen Stand in History archivieren, dann frisch starten
run_id = uuid.uuid4().hex[:12]
await db.execute(
"INSERT INTO source_health_history "
"(run_id, source_id, check_type, status, message, details, checked_at) "
"SELECT ?, source_id, check_type, status, message, details, checked_at "
"FROM source_health_checks",
(run_id,),
)
await db.execute("DELETE FROM source_health_checks") await db.execute("DELETE FROM source_health_checks")
await db.commit() await db.commit()
logger.info(f"Health-Check Run {run_id}: vorigen Stand archiviert")
checks_done = 0 checks_done = 0
issues_found = 0 issues_found = 0
# 1. Erreichbarkeit + Feed-Validität (nur Quellen mit URL) # 1. Erreichbarkeit + Feed-Validität (nur Quellen mit URL)
sources_with_url = [s for s in sources if s["url"]] sources_with_url = [s for s in sources if s["url"]]
async with httpx.AsyncClient( async with httpx.AsyncClient(
timeout=15.0, timeout=HEALTH_CHECK_TIMEOUT_S,
follow_redirects=True, follow_redirects=True,
headers={"User-Agent": "Mozilla/5.0 (compatible; OSINT-Monitor/1.0)"}, headers={"User-Agent": HEALTH_CHECK_USER_AGENT},
) as client: ) as client:
for i in range(0, len(sources_with_url), 5): for i in range(0, len(sources_with_url), 5):
batch = sources_with_url[i:i + 5] batch = sources_with_url[i:i + 5]
@@ -46,7 +62,7 @@ async def run_health_checks(db: aiosqlite.Connection) -> dict:
if isinstance(result, Exception): if isinstance(result, Exception):
await _save_check( await _save_check(
db, source["id"], "reachability", "error", db, source["id"], "reachability", "error",
f"Prüfung fehlgeschlagen: {result}", f"Prüfung fehlgeschlagen: {result}",
) )
issues_found += 1 issues_found += 1
else: else:
@@ -83,7 +99,7 @@ async def run_health_checks(db: aiosqlite.Connection) -> dict:
await db.commit() await db.commit()
logger.info( logger.info(
f"Health-Check abgeschlossen: {checks_done} Quellen geprüft, " f"Health-Check abgeschlossen: {checks_done} Quellen geprüft, "
f"{issues_found} Probleme gefunden" f"{issues_found} Probleme gefunden"
) )
return {"checked": checks_done, "issues": issues_found} return {"checked": checks_done, "issues": issues_found}
@@ -92,7 +108,7 @@ async def run_health_checks(db: aiosqlite.Connection) -> dict:
async def _check_source_reachability( async def _check_source_reachability(
client: httpx.AsyncClient, source: dict, client: httpx.AsyncClient, source: dict,
) -> list[dict]: ) -> list[dict]:
"""Prüft Erreichbarkeit und Feed-Validität einer Quelle.""" """Prüft Erreichbarkeit und Feed-Validität einer Quelle."""
checks = [] checks = []
url = source["url"] url = source["url"]
@@ -125,14 +141,14 @@ async def _check_source_reachability(
"message": "Erreichbar", "message": "Erreichbar",
}) })
# Feed-Validität nur für RSS-Feeds # Feed-Validität nur für RSS-Feeds
if source["source_type"] == "rss_feed": if source["source_type"] == "rss_feed":
text = resp.text[:20000] text = resp.text[:20000]
if "<rss" not in text and "<feed" not in text and "<channel" not in text: if "<rss" not in text and "<feed" not in text and "<channel" not in text:
checks.append({ checks.append({
"type": "feed_validity", "type": "feed_validity",
"status": "error", "status": "error",
"message": "Kein gültiger RSS/Atom-Feed", "message": "Kein gültiger RSS/Atom-Feed",
}) })
else: else:
feed = await asyncio.to_thread(feedparser.parse, text) feed = await asyncio.to_thread(feedparser.parse, text)
@@ -155,7 +171,7 @@ async def _check_source_reachability(
checks.append({ checks.append({
"type": "feed_validity", "type": "feed_validity",
"status": "ok", "status": "ok",
"message": f"Feed gültig ({len(feed.entries)} Einträge)", "message": f"Feed gültig ({len(feed.entries)} Einträge)",
}) })
except httpx.TimeoutException: except httpx.TimeoutException:
@@ -181,7 +197,7 @@ async def _check_source_reachability(
def _check_stale(source: dict) -> dict | None: def _check_stale(source: dict) -> dict | None:
"""Prüft ob eine Quelle veraltet ist (keine Artikel seit >30 Tagen).""" """Prüft ob eine Quelle veraltet ist (keine Artikel seit >30 Tagen)."""
if source["source_type"] == "excluded": if source["source_type"] == "excluded":
return None return None
@@ -249,7 +265,7 @@ async def _save_check(
async def get_health_summary(db: aiosqlite.Connection) -> dict: async def get_health_summary(db: aiosqlite.Connection) -> dict:
"""Gibt eine Zusammenfassung der letzten Health-Check-Ergebnisse zurück.""" """Gibt eine Zusammenfassung der letzten Health-Check-Ergebnisse zurück."""
cursor = await db.execute(""" cursor = await db.execute("""
SELECT SELECT
h.id, h.source_id, s.name, s.domain, s.url, s.source_type, h.id, h.source_id, s.name, s.domain, s.url, s.source_type,