Mojibake fix: source_suggester.py + source_health.py via ftfy

Beide Files hatten Doppel-Encoded UTF-8 in Docstrings, Kommentaren und
Prompt-Strings (z.B. "prÃ¼ft" statt "prüft", "VorschlÃ¤ge" statt
"Vorschläge"). ftfy hat das automatisch repariert.

Hauptauswirkungen:
- Logs sind jetzt mit echten Umlauten lesbar
- Claude/Haiku-Prompts in source_suggester.py (Quellen-Vorschlaege via KI)
  bekommen jetzt korrekte deutsche Umlaute - sollte bessere Antworten geben

Daneben hat ftfy line-endings normalisiert, daher der grosse Diff in
source_health.py - inhaltlich nur Mojibake-Reparatur.

Verifiziert mit (Suche nach Mojibake-Sequenzen, nicht nach echten Umlauten):
  grep -cE "Ã¤|Ã¶|Ã¼|ÃŸ|Ã„|Ã–|Ãœ" src/services/*.py
  -> 0 Treffer
Dieser Commit ist enthalten in:
Claude Code
2026-05-09 03:35:13 +00:00
Ursprung 1e9cca2555
Commit d71daee581
2 geänderte Dateien mit 299 neuen und 299 gelöschten Zeilen

Datei anzeigen

@@ -1,282 +1,282 @@
"""Quellen-Health-Check Engine - prüft Erreichbarkeit, Feed-Validität, Duplikate.""" """Quellen-Health-Check Engine - prüft Erreichbarkeit, Feed-Validität, Duplikate."""
import asyncio import asyncio
import logging import logging
import json import json
from urllib.parse import urlparse from urllib.parse import urlparse
import httpx import httpx
import feedparser import feedparser
import aiosqlite import aiosqlite
logger = logging.getLogger("osint.source_health") logger = logging.getLogger("osint.source_health")
async def run_health_checks(db: aiosqlite.Connection) -> dict:
    """Run all health checks for active base sources.

    Performs three passes: HTTP reachability + feed validity (sources
    with a URL, probed in batches of 5), staleness (no articles for
    >30 days), and duplicate-URL detection. Results replace the previous
    contents of ``source_health_checks``.

    Args:
        db: Open aiosqlite connection; rows must be dict-convertible
            (i.e. row_factory set to aiosqlite.Row or similar).

    Returns:
        dict with ``checked`` (number of URL-sources probed) and
        ``issues`` (number of non-ok check results).
    """
    logger.info("Starte Quellen-Health-Check...")
    # Load all active base sources (tenant_id IS NULL = global, not tenant-scoped).
    cursor = await db.execute(
        "SELECT id, name, url, domain, source_type, article_count, last_seen_at "
        "FROM sources WHERE status = 'active' AND tenant_id IS NULL"
    )
    sources = [dict(row) for row in await cursor.fetchall()]
    # Previous check results are replaced wholesale on every run.
    await db.execute("DELETE FROM source_health_checks")
    await db.commit()
    checks_done = 0
    issues_found = 0
    # 1. Reachability + feed validity — only sources that have a URL.
    #    Batches of 5 keep the number of concurrent connections bounded.
    sources_with_url = [s for s in sources if s["url"]]
    async with httpx.AsyncClient(
        timeout=15.0,
        follow_redirects=True,
        headers={"User-Agent": "Mozilla/5.0 (compatible; OSINT-Monitor/1.0)"},
    ) as client:
        for i in range(0, len(sources_with_url), 5):
            batch = sources_with_url[i:i + 5]
            tasks = [_check_source_reachability(client, s) for s in batch]
            # return_exceptions=True: one failing probe must not abort the batch.
            results = await asyncio.gather(*tasks, return_exceptions=True)
            for source, result in zip(batch, results):
                if isinstance(result, Exception):
                    await _save_check(
                        db, source["id"], "reachability", "error",
                        f"Prüfung fehlgeschlagen: {result}",
                    )
                    issues_found += 1
                else:
                    for check in result:
                        await _save_check(
                            db, source["id"], check["type"], check["status"],
                            check["message"], check.get("details"),
                        )
                        if check["status"] != "ok":
                            issues_found += 1
                checks_done += 1
    # 2. Stale sources (no article for >30 days); excluded/web sources are skipped.
    for source in sources:
        if source["source_type"] in ("excluded", "web_source"):
            continue
        stale_check = _check_stale(source)
        if stale_check:
            await _save_check(
                db, source["id"], stale_check["type"],
                stale_check["status"], stale_check["message"],
            )
            if stale_check["status"] != "ok":
                issues_found += 1
    # 3. Duplicate URLs across the whole source list.
    duplicates = _find_duplicates(sources)
    for dup in duplicates:
        await _save_check(
            db, dup["source_id"], "duplicate", "warning",
            dup["message"], json.dumps(dup.get("details", {})),
        )
        issues_found += 1
    await db.commit()
    # Lazy %-args: the message is only formatted if INFO logging is enabled.
    logger.info(
        "Health-Check abgeschlossen: %s Quellen geprüft, %s Probleme gefunden",
        checks_done, issues_found,
    )
    return {"checked": checks_done, "issues": issues_found}
async def _check_source_reachability(
    client: httpx.AsyncClient, source: dict,
) -> list[dict]:
    """Probe one source: HTTP reachability, plus feed validity for RSS feeds.

    Returns a list of check-result dicts with keys ``type``, ``status``,
    ``message`` and — where extra context exists — ``details`` (JSON string).
    """
    findings: list[dict] = []

    def note(kind: str, status: str, message: str, details=None) -> None:
        # Append one check result; "details" is an already-serialized JSON blob.
        entry = {"type": kind, "status": status, "message": message}
        if details is not None:
            entry["details"] = details
        findings.append(entry)

    url = source["url"]
    try:
        resp = await client.get(url)
        code = resp.status_code
        if code >= 400:
            note(
                "reachability", "error",
                f"HTTP {code} - nicht erreichbar",
                json.dumps({"status_code": code, "url": url}),
            )
            return findings
        if code >= 300:
            # Redirects are followed by the client, so a surviving 3xx is unusual.
            note(
                "reachability", "warning",
                f"HTTP {code} - Weiterleitung",
                json.dumps({"status_code": code, "final_url": str(resp.url)}),
            )
        else:
            note("reachability", "ok", "Erreichbar")
        # Feed validity only applies to RSS/Atom sources.
        if source["source_type"] == "rss_feed":
            head = resp.text[:20000]
            if not any(tag in head for tag in ("<rss", "<feed", "<channel")):
                note("feed_validity", "error", "Kein gültiger RSS/Atom-Feed")
            else:
                # feedparser is blocking; run it off the event loop.
                parsed = await asyncio.to_thread(feedparser.parse, head)
                if parsed.get("bozo") and not parsed.entries:
                    note(
                        "feed_validity", "error",
                        "Feed fehlerhaft (bozo)",
                        json.dumps({
                            "bozo_exception": str(parsed.get("bozo_exception", "")),
                        }),
                    )
                elif not parsed.entries:
                    note("feed_validity", "warning", "Feed erreichbar aber leer")
                else:
                    note(
                        "feed_validity", "ok",
                        f"Feed gültig ({len(parsed.entries)} Einträge)",
                    )
    except httpx.TimeoutException:
        note("reachability", "error", "Timeout (15s)")
    except httpx.ConnectError as e:
        note("reachability", "error", f"Verbindung fehlgeschlagen: {e}")
    except Exception as e:
        note("reachability", "error", f"{type(e).__name__}: {e}")
    return findings
def _check_stale(source: dict) -> dict | None: def _check_stale(source: dict) -> dict | None:
"""Prüft ob eine Quelle veraltet ist (keine Artikel seit >30 Tagen).""" """Prüft ob eine Quelle veraltet ist (keine Artikel seit >30 Tagen)."""
if source["source_type"] == "excluded": if source["source_type"] == "excluded":
return None return None
article_count = source.get("article_count") or 0 article_count = source.get("article_count") or 0
last_seen = source.get("last_seen_at") last_seen = source.get("last_seen_at")
if article_count == 0: if article_count == 0:
return { return {
"type": "stale", "type": "stale",
"status": "warning", "status": "warning",
"message": "Noch nie Artikel geliefert", "message": "Noch nie Artikel geliefert",
} }
if last_seen: if last_seen:
try: try:
from datetime import datetime from datetime import datetime
last_dt = datetime.fromisoformat(last_seen) last_dt = datetime.fromisoformat(last_seen)
now = datetime.now() now = datetime.now()
age_days = (now - last_dt).days age_days = (now - last_dt).days
if age_days > 30: if age_days > 30:
return { return {
"type": "stale", "type": "stale",
"status": "warning", "status": "warning",
"message": f"Letzter Artikel vor {age_days} Tagen", "message": f"Letzter Artikel vor {age_days} Tagen",
} }
except (ValueError, TypeError): except (ValueError, TypeError):
pass pass
return None return None
def _find_duplicates(sources: list[dict]) -> list[dict]: def _find_duplicates(sources: list[dict]) -> list[dict]:
"""Findet doppelte Quellen (gleiche URL).""" """Findet doppelte Quellen (gleiche URL)."""
duplicates = [] duplicates = []
url_map = {} url_map = {}
for s in sources: for s in sources:
if not s["url"]: if not s["url"]:
continue continue
url_norm = s["url"].lower().rstrip("/") url_norm = s["url"].lower().rstrip("/")
if url_norm in url_map: if url_norm in url_map:
existing = url_map[url_norm] existing = url_map[url_norm]
duplicates.append({ duplicates.append({
"source_id": s["id"], "source_id": s["id"],
"message": f"Doppelte URL wie '{existing['name']}' (ID {existing['id']})", "message": f"Doppelte URL wie '{existing['name']}' (ID {existing['id']})",
"details": {"duplicate_of": existing["id"], "type": "url"}, "details": {"duplicate_of": existing["id"], "type": "url"},
}) })
else: else:
url_map[url_norm] = s url_map[url_norm] = s
return duplicates return duplicates
async def _save_check(
    db: aiosqlite.Connection, source_id: int, check_type: str,
    status: str, message: str, details: str | None = None,
):
    """Insert one health-check result row.

    Args:
        db: Open aiosqlite connection; committing is left to the caller
            so batched runs commit once.
        source_id: ID of the checked source.
        check_type: Category, e.g. "reachability", "feed_validity",
            "stale" or "duplicate".
        status: "ok", "warning" or "error".
        message: Human-readable result text.
        details: Optional JSON string with extra context.
            (Annotation fixed: implicit Optional ``str = None`` is
            disallowed by PEP 484.)
    """
    await db.execute(
        "INSERT INTO source_health_checks "
        "(source_id, check_type, status, message, details) "
        "VALUES (?, ?, ?, ?, ?)",
        (source_id, check_type, status, message, details),
    )
async def get_health_summary(db: aiosqlite.Connection) -> dict:
    """Summarize the latest health-check results.

    Returns per-status counts, the timestamp of the most recent check,
    and the full check rows joined with source metadata — ordered errors
    first, then warnings, then ok, alphabetically by source name.
    """
    cursor = await db.execute("""
        SELECT
            h.id, h.source_id, s.name, s.domain, s.url, s.source_type,
            h.check_type, h.status, h.message, h.details, h.checked_at
        FROM source_health_checks h
        JOIN sources s ON s.id = h.source_id
        ORDER BY
            CASE h.status WHEN 'error' THEN 0 WHEN 'warning' THEN 1 ELSE 2 END,
            s.name
    """)
    checks = [dict(row) for row in await cursor.fetchall()]
    # Tally all three statuses in a single pass over the rows.
    tally = {"error": 0, "warning": 0, "ok": 0}
    for entry in checks:
        if entry["status"] in tally:
            tally[entry["status"]] += 1
    cursor = await db.execute(
        "SELECT MAX(checked_at) as last_check FROM source_health_checks"
    )
    row = await cursor.fetchone()
    last_check = row["last_check"] if row else None
    return {
        "last_check": last_check,
        "total_checks": len(checks),
        "errors": tally["error"],
        "warnings": tally["warning"],
        "ok": tally["ok"],
        "checks": checks,
    }

Datei anzeigen

@@ -1,4 +1,4 @@
"""KI-gestützte Quellen-Vorschläge via Haiku.""" """KI-gestützte Quellen-Vorschläge via Haiku."""
import json import json
import logging import logging
import re import re
@@ -12,8 +12,8 @@ logger = logging.getLogger("osint.source_suggester")
async def generate_suggestions(db: aiosqlite.Connection) -> int: async def generate_suggestions(db: aiosqlite.Connection) -> int:
"""Generiert Quellen-Vorschläge basierend auf Health-Checks und Lückenanalyse.""" """Generiert Quellen-Vorschläge basierend auf Health-Checks und Lückenanalyse."""
logger.info("Starte Quellen-Vorschläge via Haiku...") logger.info("Starte Quellen-Vorschläge via Haiku...")
# 1. Aktuelle Quellen laden # 1. Aktuelle Quellen laden
cursor = await db.execute( cursor = await db.execute(
@@ -33,13 +33,13 @@ async def generate_suggestions(db: aiosqlite.Connection) -> int:
""") """)
issues = [dict(row) for row in await cursor.fetchall()] issues = [dict(row) for row in await cursor.fetchall()]
# 3. Alte pending-Vorschläge entfernen (älter als 30 Tage) # 3. Alte pending-Vorschläge entfernen (älter als 30 Tage)
await db.execute( await db.execute(
"DELETE FROM source_suggestions " "DELETE FROM source_suggestions "
"WHERE status = 'pending' AND created_at < datetime('now', '-30 days')" "WHERE status = 'pending' AND created_at < datetime('now', '-30 days')"
) )
# 4. Quellen-Zusammenfassung für Haiku # 4. Quellen-Zusammenfassung für Haiku
categories = {} categories = {}
for s in sources: for s in sources:
cat = s["category"] cat = s["category"]
@@ -67,7 +67,7 @@ async def generate_suggestions(db: aiosqlite.Connection) -> int:
f"{issue['check_type']} = {issue['status']} - {issue['message']}\n" f"{issue['check_type']} = {issue['status']} - {issue['message']}\n"
) )
prompt = f"""Du bist ein OSINT-Analyst und verwaltest die Quellensammlung eines Lagebildmonitors für Sicherheitsbehörden. prompt = f"""Du bist ein OSINT-Analyst und verwaltest die Quellensammlung eines Lagebildmonitors für Sicherheitsbehörden.
Aktuelle Quellensammlung:{source_summary}{issues_summary} Aktuelle Quellensammlung:{source_summary}{issues_summary}
@@ -78,13 +78,13 @@ Beachte:
2. Fehlende wichtige OSINT-Quellen: Schlage "add_source" mit konkreter RSS-Feed-URL vor 2. Fehlende wichtige OSINT-Quellen: Schlage "add_source" mit konkreter RSS-Feed-URL vor
3. Fokus auf deutschsprachige + wichtige internationale Nachrichtenquellen 3. Fokus auf deutschsprachige + wichtige internationale Nachrichtenquellen
4. Nur Quellen vorschlagen, die NICHT bereits vorhanden sind 4. Nur Quellen vorschlagen, die NICHT bereits vorhanden sind
5. Maximal 5 Vorschläge 5. Maximal 5 Vorschläge
Antworte NUR mit einem JSON-Array. Jedes Element: Antworte NUR mit einem JSON-Array. Jedes Element:
{{ {{
"type": "add_source|deactivate_source|fix_url|remove_source", "type": "add_source|deactivate_source|fix_url|remove_source",
"title": "Kurzer Titel", "title": "Kurzer Titel",
"description": "Begründung", "description": "Begründung",
"priority": "low|medium|high", "priority": "low|medium|high",
"source_id": null, "source_id": null,
"data": {{ "data": {{
@@ -104,7 +104,7 @@ Nur das JSON-Array, kein anderer Text."""
json_match = re.search(r'\[.*\]', response, re.DOTALL) json_match = re.search(r'\[.*\]', response, re.DOTALL)
if not json_match: if not json_match:
logger.warning("Keine Vorschläge von Haiku erhalten (kein JSON)") logger.warning("Keine Vorschläge von Haiku erhalten (kein JSON)")
return 0 return 0
suggestions = json.loads(json_match.group(0)) suggestions = json.loads(json_match.group(0))
@@ -164,14 +164,14 @@ Nur das JSON-Array, kein anderer Text."""
await db.commit() await db.commit()
logger.info( logger.info(
f"Quellen-Vorschläge: {count} neue Vorschläge generiert " f"Quellen-Vorschläge: {count} neue Vorschläge generiert "
f"(Haiku: {usage.input_tokens} in / {usage.output_tokens} out / " f"(Haiku: {usage.input_tokens} in / {usage.output_tokens} out / "
f"${usage.cost_usd:.4f})" f"${usage.cost_usd:.4f})"
) )
return count return count
except Exception as e: except Exception as e:
logger.error(f"Fehler bei Quellen-Vorschlägen: {e}", exc_info=True) logger.error(f"Fehler bei Quellen-Vorschlägen: {e}", exc_info=True)
return 0 return 0
@@ -218,7 +218,7 @@ async def apply_suggestion(
(url,), (url,),
) )
if await cursor.fetchone(): if await cursor.fetchone():
result["action"] = "übersprungen (URL bereits vorhanden)" result["action"] = "übersprungen (URL bereits vorhanden)"
new_status = "rejected" new_status = "rejected"
else: else:
await db.execute( await db.execute(
@@ -230,7 +230,7 @@ async def apply_suggestion(
) )
result["action"] = f"Quelle '{name}' angelegt" result["action"] = f"Quelle '{name}' angelegt"
else: else:
result["action"] = "übersprungen (keine URL)" result["action"] = "übersprungen (keine URL)"
new_status = "rejected" new_status = "rejected"
elif stype == "deactivate_source": elif stype == "deactivate_source":
@@ -242,7 +242,7 @@ async def apply_suggestion(
) )
result["action"] = "Quelle deaktiviert" result["action"] = "Quelle deaktiviert"
else: else:
result["action"] = "übersprungen (keine source_id)" result["action"] = "übersprungen (keine source_id)"
elif stype == "remove_source": elif stype == "remove_source":
source_id = suggestion["source_id"] source_id = suggestion["source_id"]
@@ -250,9 +250,9 @@ async def apply_suggestion(
await db.execute( await db.execute(
"DELETE FROM sources WHERE id = ?", (source_id,), "DELETE FROM sources WHERE id = ?", (source_id,),
) )
result["action"] = "Quelle gelöscht" result["action"] = "Quelle gelöscht"
else: else:
result["action"] = "übersprungen (keine source_id)" result["action"] = "übersprungen (keine source_id)"
elif stype == "fix_url": elif stype == "fix_url":
source_id = suggestion["source_id"] source_id = suggestion["source_id"]
@@ -264,7 +264,7 @@ async def apply_suggestion(
) )
result["action"] = f"URL aktualisiert auf {new_url}" result["action"] = f"URL aktualisiert auf {new_url}"
else: else:
result["action"] = "übersprungen (keine source_id oder URL)" result["action"] = "übersprungen (keine source_id oder URL)"
await db.execute( await db.execute(
"UPDATE source_suggestions SET status = ?, reviewed_at = CURRENT_TIMESTAMP " "UPDATE source_suggestions SET status = ?, reviewed_at = CURRENT_TIMESTAMP "