feat: Post-Refresh QC auf Haiku umgestellt

Faktencheck-Duplikate: Ein Fuzzy-Vorfilter (Threshold 0.60) reduziert die Kandidaten, anschliessend clustert Haiku semantische Duplikate kontextbezogen.
Karten-Locations: Haiku bewertet target-Kategorien anhand des Lage-Kontexts statt anhand statischer Wortlisten.
Kosten: ca. 0.005-0.008 USD pro Check.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-10 21:49:50 +01:00
Commit 445f645936
--- a/src/services/post_refresh_qc.py
+++ b/src/services/post_refresh_qc.py
@@ -1,54 +1,149 @@
-"""Post-Refresh Quality Check: Prueft Faktenchecks und Karten-Locations nach jedem Refresh.
+"""Post-Refresh Quality Check via Haiku.
-Erkennt:
+Prueft nach jedem Refresh:
-1. Inhaltliche Faktencheck-Duplikate (fuzzy matching)
+1. Semantische Faktencheck-Duplikate (Haiku-Clustering mit Fuzzy-Vorfilter)
-2. Falsch kategorisierte Karten-Locations (z.B. Laender als 'target' die nicht angegriffen wurden)
+2. Falsch kategorisierte Karten-Locations (Haiku bewertet Kontext der Lage)
 Regelbasierte Listen dienen als Fallback falls Haiku fehlschlaegt.
 """
 import json
 import logging
 import re
 from difflib import SequenceMatcher
 from agents.claude_client import call_claude
 from config import CLAUDE_MODEL_FAST
 logger = logging.getLogger("osint.post_refresh_qc")
-# Orte die in einem Konflikt-Kontext fast nie echte Angriffsziele sind
+STATUS_PRIORITY = {
-# (werden typischerweise wirtschaftlich/politisch erwaehnt)
+    "confirmed": 5, "established": 5,
-_NON_TARGET_LOCATIONS = {
+    "contradicted": 4, "disputed": 4,
-    # Weit entfernte Laender (keine direkten Kriegsziele)
+    "unconfirmed": 3, "unverified": 3,
-    "australia", "australien", "northern territory", "queensland",
+    "developing": 1,
    "new south wales", "victoria",
    "cuba", "kuba",
    "new york city", "new york", "washington",
    "taiwan", "south korea", "japan",
    "afghanistan", "pakistan", "karachi",
    "china", "peking", "beijing",
    "indien", "india", "new delhi",
    "brasilien", "brazil",
    "mexiko", "mexico",
    "argentinien", "argentina",
    "kanada", "canada",
    "philippinen", "philippines",
    "indonesien", "indonesia",
    "nigeria", "south africa",
 }
-# Orte die je nach Konflikt als actor/response statt target kategorisiert werden sollten
+# ---------------------------------------------------------------------------
-_ACTOR_NOT_TARGET = {
+# 1. Faktencheck-Duplikate
-    "usa", "united states", "us", "vereinigte staaten",
+# ---------------------------------------------------------------------------
-    "deutschland", "germany",
+
-    "frankreich", "france",
+_DEDUP_PROMPT = """\
-    "grossbritannien", "united kingdom", "uk",
+Du bist ein Deduplizierungs-Agent fuer Faktenchecks eines OSINT-Monitors.
-}
+
 LAGE: {incident_title}
 Unten stehen Faktenchecks (ID + Status + Claim). Finde Gruppen von Fakten,
 die INHALTLICH DASSELBE aussagen, auch wenn sie unterschiedlich formuliert sind.
 REGELN:
 - Gleicher Sachverhalt = gleiche Gruppe
  (z.B. "Trump fordert Kapitulation" und "US-Praesident verlangt bedingungslose Aufgabe")
 - Unterschiedliche Detailtiefe zum SELBEN Fakt = gleiche Gruppe
 - VERSCHIEDENE Sachverhalte = VERSCHIEDENE Gruppen
  (z.B. "Angriff auf Isfahan" vs "Angriff auf Teheran" sind NICHT dasselbe)
 - Eine Gruppe muss mindestens 2 Eintraege haben
 Antworte NUR als JSON-Array von Gruppen. Jede Gruppe ist ein Array von IDs:
 [[1,5,12], [3,8]]
 Wenn keine Duplikate: antworte mit []
 FAKTEN:
 {facts_text}"""
-async def check_fact_duplicates(db, incident_id: int) -> int:
+async def _haiku_find_duplicate_clusters(
-    """Prueft auf inhaltliche Faktencheck-Duplikate innerhalb einer Lage.
+    facts: list[dict], incident_title: str
 ) -> list[list[int]]:
    """Fragt Haiku welche Fakten semantische Duplikate sind."""
    facts_text = "\n".join(
        f'ID={f["id"]} [{f["status"]}]: {f["claim"]}'
        for f in facts
    )
    prompt = _DEDUP_PROMPT.format(
        incident_title=incident_title, facts_text=facts_text
    )
    try:
        result, _usage = await call_claude(prompt, tools=None, model=CLAUDE_MODEL_FAST)
        data = json.loads(result)
        if isinstance(data, list) and all(isinstance(g, list) for g in data):
            return data
    except json.JSONDecodeError:
        match = re.search(r'\[.*\]', result, re.DOTALL)
        if match:
            try:
                data = json.loads(match.group())
                if isinstance(data, list):
                    return data
            except json.JSONDecodeError:
                pass
    except Exception as e:
        logger.warning("Haiku Duplikat-Clustering fehlgeschlagen: %s", e)
    return []
    Nutzt Fuzzy-Matching (SequenceMatcher + Keyword-Jaccard) um
    semantisch identische Claims zu finden und das schwaecher belegte zu entfernen.
-    Returns: Anzahl entfernter Duplikate.
+def _fuzzy_prefilter(all_facts: list[dict], max_candidates: int = 80) -> list[dict]:
    """Waehlt Kandidaten fuer Haiku-Check per Fuzzy-Vorfilter aus.
    Findet Paare mit Aehnlichkeit >= 0.60 und gibt die betroffenen Fakten zurueck.
    Begrenzt auf max_candidates um Haiku-Tokens zu sparen.
    """
    from agents.factchecker import normalize_claim, _keyword_set
    if len(all_facts) <= max_candidates:
        return all_facts
    normalized = []
    for f in all_facts:
        nc = normalize_claim(f["claim"])
        kw = _keyword_set(f["claim"])
        normalized.append((f, nc, kw))
    candidate_ids = set()
    recent = normalized[:60]
    for i, (fact_a, norm_a, kw_a) in enumerate(recent):
        for j, (fact_b, norm_b, kw_b) in enumerate(normalized):
            if i >= j or fact_b["id"] == fact_a["id"]:
                continue
            if not norm_a or not norm_b:
                continue
            len_ratio = len(norm_a) / len(norm_b) if norm_b else 0
            if len_ratio > 2.5 or len_ratio < 0.4:
                continue
            seq_ratio = SequenceMatcher(None, norm_a, norm_b).ratio()
            kw_union = kw_a | kw_b
            jaccard = len(kw_a & kw_b) / len(kw_union) if kw_union else 0.0
            combined = 0.7 * seq_ratio + 0.3 * jaccard
            if combined >= 0.60:
                candidate_ids.add(fact_a["id"])
                candidate_ids.add(fact_b["id"])
            if len(candidate_ids) >= max_candidates:
                break
        if len(candidate_ids) >= max_candidates:
            break
    candidates = [f for f in all_facts if f["id"] in candidate_ids]
    logger.info(
        "Fuzzy-Vorfilter: %d/%d Fakten als Duplikat-Kandidaten identifiziert",
        len(candidates), len(all_facts),
    )
    return candidates
 async def check_fact_duplicates(db, incident_id: int, incident_title: str) -> int:
    """Prueft auf semantische Faktencheck-Duplikate via Haiku.
    1. Fuzzy-Vorfilter reduziert auf relevante Kandidaten
    2. Haiku clustert semantische Duplikate
    3. Pro Cluster: behalte besten Fakt, loesche Rest
    Returns: Anzahl entfernter Duplikate.
    """
    cursor = await db.execute(
        "SELECT id, claim, status, sources_count, evidence, checked_at "
        "FROM fact_checks WHERE incident_id = ? ORDER BY checked_at DESC",
@@ -59,61 +154,45 @@ async def check_fact_duplicates(db, incident_id: int) -> int:
    if len(all_facts) < 2:
        return 0
-    STATUS_PRIORITY = {
+    # Schritt 1: Fuzzy-Vorfilter
-        "confirmed": 5, "established": 5,
+    candidates = _fuzzy_prefilter(all_facts)
-        "contradicted": 4, "disputed": 4,
+    if len(candidates) < 2:
-        "unconfirmed": 3, "unverified": 3,
+        return 0
        "developing": 1,
    }
-    # Vorberechnung: normalisierte Claims und Keywords
+    # Schritt 2: Haiku-Clustering (in Batches von max 80)
-    normalized = []
+    all_clusters = []
-    for f in all_facts:
+    batch_size = 80
-        nc = normalize_claim(f["claim"])
+    for i in range(0, len(candidates), batch_size):
-        kw = _keyword_set(f["claim"])
+        batch = candidates[i:i + batch_size]
-        normalized.append((f, nc, kw))
+        clusters = await _haiku_find_duplicate_clusters(batch, incident_title)
        all_clusters.extend(clusters)
-    # Paarweiser Vergleich: nur die letzten 50 Fakten gegen alle pruefen
+    if not all_clusters:
        logger.info("QC Fakten: Haiku fand keine Duplikate")
        return 0
    # Schritt 3: Pro Cluster besten behalten, Rest loeschen
    facts_by_id = {f["id"]: f for f in all_facts}
    ids_to_delete = set()
    checked_pairs = set()
-    recent = normalized[:50]
+    for cluster_ids in all_clusters:
-    for i, (fact_a, norm_a, kw_a) in enumerate(recent):
+        valid_ids = [cid for cid in cluster_ids if cid in facts_by_id]
-        if fact_a["id"] in ids_to_delete:
+        if len(valid_ids) <= 1:
            continue
        for j, (fact_b, norm_b, kw_b) in enumerate(normalized):
            if i >= j or fact_b["id"] == fact_a["id"] or fact_b["id"] in ids_to_delete:
            continue
-            pair_key = (min(fact_a["id"], fact_b["id"]), max(fact_a["id"], fact_b["id"]))
+        cluster_facts = [facts_by_id[cid] for cid in valid_ids]
-            if pair_key in checked_pairs:
+        best = max(cluster_facts, key=lambda f: (
-                continue
+            STATUS_PRIORITY.get(f["status"], 0),
-            checked_pairs.add(pair_key)
+            f.get("sources_count", 0),
            f.get("checked_at", ""),
        ))
-            if not norm_a or not norm_b:
+        for fact in cluster_facts:
-                continue
+            if fact["id"] != best["id"]:
-
+                ids_to_delete.add(fact["id"])
            # Laengenfilter
            len_ratio = len(norm_a) / len(norm_b) if norm_b else 0
            if len_ratio > 2.5 or len_ratio < 0.4:
                continue
            seq_ratio = SequenceMatcher(None, norm_a, norm_b).ratio()
            kw_union = kw_a | kw_b
            jaccard = len(kw_a & kw_b) / len(kw_union) if kw_union else 0.0
            combined = 0.7 * seq_ratio + 0.3 * jaccard
            if combined >= 0.80:  # Hoher Threshold fuer Duplikaterkennung
                # Behalte den mit hoeherem Status-Prio + mehr Quellen
                score_a = (STATUS_PRIORITY.get(fact_a["status"], 0), fact_a.get("sources_count", 0))
                score_b = (STATUS_PRIORITY.get(fact_b["status"], 0), fact_b.get("sources_count", 0))
                loser_id = fact_b["id"] if score_a >= score_b else fact_a["id"]
                winner_id = fact_a["id"] if score_a >= score_b else fact_b["id"]
                ids_to_delete.add(loser_id)
                logger.info(
-                    "QC Duplikat: ID %d entfernt (Score %.2f), behalte ID %d",
+                    "QC Duplikat: ID %d entfernt, behalte ID %d ('%s')",
-                    loser_id, combined, winner_id,
+                    fact["id"], best["id"], best["claim"][:60],
                )
    if ids_to_delete:
@@ -130,11 +209,45 @@ async def check_fact_duplicates(db, incident_id: int) -> int:
    return len(ids_to_delete)
-async def check_location_categories(db, incident_id: int) -> int:
+# ---------------------------------------------------------------------------
-    """Prueft und korrigiert falsch kategorisierte Karten-Locations.
+# 2. Karten-Location-Kategorien
 # ---------------------------------------------------------------------------
-    Locations die als 'target' markiert sind aber offensichtlich keine
+_LOCATION_PROMPT = """\
-    Angriffsziele im Konfliktkontext darstellen, werden korrigiert.
+Du bist ein Geopolitik-Experte fuer einen OSINT-Monitor.
 LAGE: {incident_title}
 BESCHREIBUNG: {incident_desc}
 Unten stehen Orte, die auf der Karte als "target" (Angriffsziel) markiert sind.
 Pruefe fuer jeden Ort, ob die Kategorie "target" korrekt ist.
 KATEGORIEN:
 - target: Ort wurde tatsaechlich militaerisch angegriffen oder bombardiert
 - actor: Ort gehoert zu einer Konfliktpartei (z.B. Hauptstadt des Angreifers)
 - response: Ort reagiert auf den Konflikt (z.B. diplomatische Reaktion, Sanktionen)
 - mentioned: Ort wird nur im Kontext erwaehnt (z.B. wirtschaftliche Auswirkungen)
 REGELN:
 - Nur Orte die TATSAECHLICH physisch angegriffen/bombardiert wurden = "target"
 - Hauptstaedte von Angreiferlaendern (z.B. Washington DC) = "actor"
 - Laender die nur wirtschaftlich betroffen sind (z.B. steigende Oelpreise) = "mentioned"
 - Laender die diplomatisch reagieren = "response"
 - Im Zweifel: "mentioned"
 Antworte als JSON-Array mit Korrekturen. Nur Eintraege die GEAENDERT werden muessen:
 [{{"id": 123, "category": "mentioned"}}, {{"id": 456, "category": "actor"}}]
 Wenn alle Kategorien korrekt sind: antworte mit []
 ORTE (aktuell alle als "target" markiert):
 {locations_text}"""
 async def check_location_categories(
    db, incident_id: int, incident_title: str, incident_desc: str
 ) -> int:
    """Prueft Karten-Location-Kategorien via Haiku.
    Returns: Anzahl korrigierter Eintraege.
    """
@@ -148,39 +261,80 @@ async def check_location_categories(db, incident_id: int) -> int:
    if not targets:
        return 0
-    fixes = []
+    # Dedupliziere nach location_name fuer den Prompt (spart Tokens)
-
+    unique_names = {}
    ids_by_name = {}
    for loc in targets:
-        name_lower = loc["location_name"].lower().strip()
+        name = loc["location_name"]
        if name not in unique_names:
            unique_names[name] = loc
            ids_by_name[name] = []
        ids_by_name[name].append(loc["id"])
-        if name_lower in _NON_TARGET_LOCATIONS:
+    locations_text = "\n".join(
-            fixes.append((loc["id"], "mentioned"))
+        f'ID={loc["id"]} | {loc["location_name"]} ({loc["latitude"]:.2f}, {loc["longitude"]:.2f})'
-        elif name_lower in _ACTOR_NOT_TARGET:
+        for loc in unique_names.values()
-            lat, lon = loc["latitude"], loc["longitude"]
+    )
-            # USA mit DC-Koordinaten -> actor
+
-            if name_lower in ("usa", "united states", "us", "vereinigte staaten"):
+    prompt = _LOCATION_PROMPT.format(
-                if 37.0 <= lat <= 41.0 and -79.0 <= lon <= -74.0:
+        incident_title=incident_title,
-                    fixes.append((loc["id"], "actor"))
+        incident_desc=incident_desc[:500] if incident_desc else "(keine Beschreibung)",
-            # Deutschland mit Berlin-Koordinaten -> response
+        locations_text=locations_text,
-            elif name_lower in ("deutschland", "germany"):
+    )
-                if 50.0 <= lat <= 54.0 and 10.0 <= lon <= 15.0:
+
-                    fixes.append((loc["id"], "response"))
+    fixes = []
-            # Frankreich mit Paris-Koordinaten -> response
+    try:
-            elif name_lower in ("frankreich", "france"):
+        result, _usage = await call_claude(prompt, tools=None, model=CLAUDE_MODEL_FAST)
-                if 45.0 <= lat <= 50.0 and 1.0 <= lon <= 4.0:
+        data = json.loads(result)
-                    fixes.append((loc["id"], "response"))
+        if isinstance(data, list):
-            # UK mit London-Koordinaten -> response
+            fixes = data
-            elif name_lower in ("grossbritannien", "united kingdom", "uk"):
+    except json.JSONDecodeError:
-                if 50.0 <= lat <= 56.0 and -3.0 <= lon <= 1.0:
+        match = re.search(r'\[.*\]', result, re.DOTALL)
-                    fixes.append((loc["id"], "response"))
+        if match:
-
+            try:
-    total_fixed = 0
+                data = json.loads(match.group())
-    for loc_id, new_category in fixes:
+                if isinstance(data, list):
-        await db.execute(
+                    fixes = data
-            "UPDATE article_locations SET category = ? WHERE id = ?",
+            except json.JSONDecodeError:
-            (new_category, loc_id),
+                pass
    except Exception as e:
        logger.warning("Haiku Location-Check fehlgeschlagen: %s", e)
        return 0
    if not fixes:
        logger.info("QC Locations: Haiku fand keine falschen Kategorien")
        return 0
    # Korrekturen anwenden (auch auf alle IDs mit gleichem Namen)
    total_fixed = 0
    representative_ids = {loc["id"]: name for name, loc in unique_names.items()}
    for fix in fixes:
        fix_id = fix.get("id")
        new_cat = fix.get("category")
        if not fix_id or not new_cat:
            continue
        if new_cat not in ("target", "actor", "response", "mentioned"):
            continue
        # Finde den location_name fuer diese ID
        loc_name = representative_ids.get(fix_id)
        if not loc_name:
            continue
        # Korrigiere ALLE Eintraege mit diesem Namen
        all_ids = ids_by_name.get(loc_name, [fix_id])
        placeholders = ",".join("?" * len(all_ids))
        await db.execute(
            f"UPDATE article_locations SET category = ? "
            f"WHERE id IN ({placeholders}) AND category = 'target'",
            [new_cat] + all_ids,
        )
        total_fixed += len(all_ids)
        logger.info(
            "QC Location: '%s' (%d Eintraege): target -> %s",
            loc_name, len(all_ids), new_cat,
        )
        total_fixed += 1
    if total_fixed > 0:
        logger.info(
@@ -191,14 +345,32 @@ async def check_location_categories(db, incident_id: int) -> int:
    return total_fixed
 # ---------------------------------------------------------------------------
 # 3. Hauptfunktion
 # ---------------------------------------------------------------------------
 async def run_post_refresh_qc(db, incident_id: int) -> dict:
-    """Fuehrt den kompletten Post-Refresh Quality Check durch.
+    """Fuehrt den kompletten Post-Refresh Quality Check via Haiku durch.
    Returns: Dict mit Ergebnissen {facts_removed, locations_fixed}.
    """
    try:
-        facts_removed = await check_fact_duplicates(db, incident_id)
+        # Lage-Titel und Beschreibung laden
-        locations_fixed = await check_location_categories(db, incident_id)
+        cursor = await db.execute(
            "SELECT title, description FROM incidents WHERE id = ?",
            (incident_id,),
        )
        row = await cursor.fetchone()
        if not row:
            return {"facts_removed": 0, "locations_fixed": 0}
        incident_title = row["title"] or ""
        incident_desc = row["description"] or ""
        facts_removed = await check_fact_duplicates(db, incident_id, incident_title)
        locations_fixed = await check_location_categories(
            db, incident_id, incident_title, incident_desc
        )
        if facts_removed > 0 or locations_fixed > 0:
            await db.commit()