feat: Post-Refresh Quality Check fuer Faktenchecks und Karten-Locations
Automatischer QC-Schritt nach jedem Refresh: - Erkennt inhaltliche Faktencheck-Duplikate via Fuzzy-Matching (Threshold 0.80) - Korrigiert falsch kategorisierte Karten-Locations (z.B. entfernte Laender als 'target') - Laeuft nach dem Faktencheck-Commit, vor den Notifications - Fehler im QC blockieren nicht den Refresh-Ablauf Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Dieser Commit ist enthalten in:
@@ -985,6 +985,17 @@ class AgentOrchestrator:
|
||||
|
||||
await db.commit()
|
||||
|
||||
# Post-Refresh Quality Check: Duplikate und Karten-Kategorien pruefen
|
||||
try:
|
||||
from services.post_refresh_qc import run_post_refresh_qc
|
||||
qc_result = await run_post_refresh_qc(db, incident_id)
|
||||
if qc_result.get("facts_removed", 0) > 0 or qc_result.get("locations_fixed", 0) > 0:
|
||||
logger.info(
|
||||
f"QC: {qc_result['facts_removed']} Duplikate, "
|
||||
f"{qc_result['locations_fixed']} Location-Fixes"
|
||||
)
|
||||
except Exception as qc_err:
|
||||
logger.warning(f"Post-Refresh QC fehlgeschlagen: {qc_err}")
|
||||
# Gebündelte Notification senden (nicht beim ersten Refresh)
|
||||
if not is_first_refresh:
|
||||
if self._ws_manager:
|
||||
|
||||
217
src/services/post_refresh_qc.py
Normale Datei
217
src/services/post_refresh_qc.py
Normale Datei
@@ -0,0 +1,217 @@
|
||||
"""Post-Refresh Quality Check: Prueft Faktenchecks und Karten-Locations nach jedem Refresh.
|
||||
|
||||
Erkennt:
|
||||
1. Inhaltliche Faktencheck-Duplikate (fuzzy matching)
|
||||
2. Falsch kategorisierte Karten-Locations (z.B. Laender als 'target' die nicht angegriffen wurden)
|
||||
"""
|
||||
import logging
|
||||
from difflib import SequenceMatcher
|
||||
|
||||
logger = logging.getLogger("osint.post_refresh_qc")
|
||||
|
||||
# Orte die in einem Konflikt-Kontext fast nie echte Angriffsziele sind
|
||||
# (werden typischerweise wirtschaftlich/politisch erwaehnt)
|
||||
_NON_TARGET_LOCATIONS = {
|
||||
# Weit entfernte Laender (keine direkten Kriegsziele)
|
||||
"australia", "australien", "northern territory", "queensland",
|
||||
"new south wales", "victoria",
|
||||
"cuba", "kuba",
|
||||
"new york city", "new york", "washington",
|
||||
"taiwan", "south korea", "japan",
|
||||
"afghanistan", "pakistan", "karachi",
|
||||
"china", "peking", "beijing",
|
||||
"indien", "india", "new delhi",
|
||||
"brasilien", "brazil",
|
||||
"mexiko", "mexico",
|
||||
"argentinien", "argentina",
|
||||
"kanada", "canada",
|
||||
"philippinen", "philippines",
|
||||
"indonesien", "indonesia",
|
||||
"nigeria", "south africa",
|
||||
}
|
||||
|
||||
# Orte die je nach Konflikt als actor/response statt target kategorisiert werden sollten
|
||||
_ACTOR_NOT_TARGET = {
|
||||
"usa", "united states", "us", "vereinigte staaten",
|
||||
"deutschland", "germany",
|
||||
"frankreich", "france",
|
||||
"grossbritannien", "united kingdom", "uk",
|
||||
}
|
||||
|
||||
|
||||
async def check_fact_duplicates(db, incident_id: int) -> int:
    """Detect and remove content-level duplicate fact checks within one incident.

    Uses fuzzy matching (SequenceMatcher ratio combined with a keyword
    Jaccard index) to find semantically identical claims; of each duplicate
    pair the weaker-evidenced fact is deleted. Only the 50 most recent facts
    are compared against all others to bound the pairwise cost.

    Args:
        db: Async DB connection (aiosqlite-style interface assumed — TODO confirm).
        incident_id: Incident whose fact checks are examined.

    Returns:
        Number of duplicate fact-check rows deleted.
    """
    from agents.factchecker import normalize_claim, _keyword_set

    cursor = await db.execute(
        "SELECT id, claim, status, sources_count, evidence, checked_at "
        "FROM fact_checks WHERE incident_id = ? ORDER BY checked_at DESC",
        (incident_id,),
    )
    all_facts = [dict(row) for row in await cursor.fetchall()]

    if len(all_facts) < 2:
        return 0

    # Higher value wins when two claims collide; unknown statuses rank lowest.
    STATUS_PRIORITY = {
        "confirmed": 5, "established": 5,
        "contradicted": 4, "disputed": 4,
        "unconfirmed": 3, "unverified": 3,
        "developing": 1,
    }

    # Precompute normalized claims and keyword sets once per fact.
    normalized = [
        (f, normalize_claim(f["claim"]), _keyword_set(f["claim"]))
        for f in all_facts
    ]

    ids_to_delete = set()

    # Pairwise comparison: only the 50 newest facts are compared against all
    # later ones. Because `recent` is a prefix of `normalized`, the `i >= j`
    # skip guarantees each unordered pair is visited at most once, so no
    # extra seen-pair bookkeeping is needed.
    recent = normalized[:50]
    for i, (fact_a, norm_a, kw_a) in enumerate(recent):
        if fact_a["id"] in ids_to_delete:
            continue
        for j, (fact_b, norm_b, kw_b) in enumerate(normalized):
            if i >= j or fact_b["id"] in ids_to_delete:
                continue
            if not norm_a or not norm_b:
                continue

            # Length filter: wildly different lengths cannot be duplicates
            # (norm_b is known non-empty here, so the division is safe).
            len_ratio = len(norm_a) / len(norm_b)
            if len_ratio > 2.5 or len_ratio < 0.4:
                continue

            seq_ratio = SequenceMatcher(None, norm_a, norm_b).ratio()
            kw_union = kw_a | kw_b
            jaccard = len(kw_a & kw_b) / len(kw_union) if kw_union else 0.0
            combined = 0.7 * seq_ratio + 0.3 * jaccard

            if combined >= 0.80:  # high threshold for duplicate detection
                # Keep the fact with the higher status priority, then more
                # sources. BUGFIX: sources_count may be NULL in the DB; coerce
                # None to 0 so the tuple comparison cannot raise TypeError.
                score_a = (STATUS_PRIORITY.get(fact_a["status"], 0),
                           fact_a.get("sources_count") or 0)
                score_b = (STATUS_PRIORITY.get(fact_b["status"], 0),
                           fact_b.get("sources_count") or 0)

                loser_id = fact_b["id"] if score_a >= score_b else fact_a["id"]
                winner_id = fact_a["id"] if score_a >= score_b else fact_b["id"]
                ids_to_delete.add(loser_id)
                logger.info(
                    "QC Duplikat: ID %d entfernt (Score %.2f), behalte ID %d",
                    loser_id, combined, winner_id,
                )
                # BUGFIX: if fact_a itself lost, stop comparing it — further
                # matches against an already-deleted fact could remove rows
                # that do not duplicate any surviving fact.
                if loser_id == fact_a["id"]:
                    break

    if ids_to_delete:
        placeholders = ",".join("?" * len(ids_to_delete))
        await db.execute(
            f"DELETE FROM fact_checks WHERE id IN ({placeholders})",
            list(ids_to_delete),
        )
        logger.info(
            "QC: %d Faktencheck-Duplikate entfernt fuer Incident %d",
            len(ids_to_delete), incident_id,
        )

    return len(ids_to_delete)
|
||||
|
||||
|
||||
async def check_location_categories(db, incident_id: int) -> int:
    """Audit and correct miscategorized map locations for one incident.

    Locations tagged 'target' that are obviously not attack targets in the
    conflict context are re-categorized:
      * far-away countries (``_NON_TARGET_LOCATIONS``) -> 'mentioned'
      * Western actors referenced via capital-city coordinates
        (``_ACTOR_NOT_TARGET``) -> 'actor' or 'response'

    Args:
        db: Async DB connection (aiosqlite-style interface assumed — TODO confirm).
        incident_id: Incident whose map locations are examined.

    Returns:
        Number of corrected rows.
    """
    cursor = await db.execute(
        "SELECT id, location_name, latitude, longitude, category "
        "FROM article_locations WHERE incident_id = ? AND category = 'target'",
        (incident_id,),
    )
    targets = [dict(row) for row in await cursor.fetchall()]

    if not targets:
        return 0

    fixes = []  # list of (row id, corrected category)

    for loc in targets:
        # BUGFIX: location_name may be NULL in the DB; skip such rows instead
        # of raising AttributeError on None.lower().
        name = loc["location_name"]
        if not name:
            continue
        name_lower = name.lower().strip()

        if name_lower in _NON_TARGET_LOCATIONS:
            fixes.append((loc["id"], "mentioned"))
        elif name_lower in _ACTOR_NOT_TARGET:
            lat, lon = loc["latitude"], loc["longitude"]
            # BUGFIX: coordinates may be NULL; the range checks below would
            # raise TypeError when comparing None with a float.
            if lat is None or lon is None:
                continue
            # USA with Washington-DC coordinates -> actor
            if name_lower in ("usa", "united states", "us", "vereinigte staaten"):
                if 37.0 <= lat <= 41.0 and -79.0 <= lon <= -74.0:
                    fixes.append((loc["id"], "actor"))
            # Germany with Berlin coordinates -> response
            elif name_lower in ("deutschland", "germany"):
                if 50.0 <= lat <= 54.0 and 10.0 <= lon <= 15.0:
                    fixes.append((loc["id"], "response"))
            # France with Paris coordinates -> response
            elif name_lower in ("frankreich", "france"):
                if 45.0 <= lat <= 50.0 and 1.0 <= lon <= 4.0:
                    fixes.append((loc["id"], "response"))
            # UK with London coordinates -> response
            elif name_lower in ("grossbritannien", "united kingdom", "uk"):
                if 50.0 <= lat <= 56.0 and -3.0 <= lon <= 1.0:
                    fixes.append((loc["id"], "response"))

    total_fixed = 0
    for loc_id, new_category in fixes:
        await db.execute(
            "UPDATE article_locations SET category = ? WHERE id = ?",
            (new_category, loc_id),
        )
        total_fixed += 1

    if total_fixed > 0:
        logger.info(
            "QC: %d Karten-Location-Kategorien korrigiert fuer Incident %d",
            total_fixed, incident_id,
        )

    return total_fixed
|
||||
|
||||
|
||||
async def run_post_refresh_qc(db, incident_id: int) -> dict:
    """Run the complete post-refresh quality check for one incident.

    Executes the duplicate scan and the location-category audit in sequence
    and commits only when at least one change was made. Any failure is
    logged and reported in the result dict rather than raised, so the
    caller's refresh flow is never blocked.

    Returns:
        Dict with the results ``{facts_removed, locations_fixed}``; an
        additional ``error`` key is present when the check failed.
    """
    try:
        removed = await check_fact_duplicates(db, incident_id)
        fixed = await check_location_categories(db, incident_id)

        # Persist and report only when something actually changed.
        if removed or fixed:
            await db.commit()
            logger.info(
                "Post-Refresh QC fuer Incident %d: %d Duplikate entfernt, %d Locations korrigiert",
                incident_id, removed, fixed,
            )

        return {"facts_removed": removed, "locations_fixed": fixed}

    except Exception as exc:
        logger.error(
            "Post-Refresh QC Fehler fuer Incident %d: %s",
            incident_id, exc, exc_info=True,
        )
        return {"facts_removed": 0, "locations_fixed": 0, "error": str(exc)}
|
||||
In neuem Issue referenzieren
Einen Benutzer sperren