feat: Post-Refresh Quality Check fuer Faktenchecks und Karten-Locations
Automatischer QC-Schritt nach jedem Refresh:
- Erkennt inhaltliche Faktencheck-Duplikate via Fuzzy-Matching (Threshold 0.80)
- Korrigiert falsch kategorisierte Karten-Locations (z.B. entfernte Laender als 'target')
- Laeuft nach dem Faktencheck-Commit, vor den Notifications
- Fehler im QC blockieren nicht den Refresh-Ablauf

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Dieser Commit ist enthalten in:
@@ -985,6 +985,17 @@ class AgentOrchestrator:
|
|||||||
|
|
||||||
await db.commit()
|
await db.commit()
|
||||||
|
|
||||||
|
# Post-Refresh Quality Check: Duplikate und Karten-Kategorien pruefen
|
||||||
|
try:
|
||||||
|
from services.post_refresh_qc import run_post_refresh_qc
|
||||||
|
qc_result = await run_post_refresh_qc(db, incident_id)
|
||||||
|
if qc_result.get("facts_removed", 0) > 0 or qc_result.get("locations_fixed", 0) > 0:
|
||||||
|
logger.info(
|
||||||
|
f"QC: {qc_result['facts_removed']} Duplikate, "
|
||||||
|
f"{qc_result['locations_fixed']} Location-Fixes"
|
||||||
|
)
|
||||||
|
except Exception as qc_err:
|
||||||
|
logger.warning(f"Post-Refresh QC fehlgeschlagen: {qc_err}")
|
||||||
# Gebündelte Notification senden (nicht beim ersten Refresh)
|
# Gebündelte Notification senden (nicht beim ersten Refresh)
|
||||||
if not is_first_refresh:
|
if not is_first_refresh:
|
||||||
if self._ws_manager:
|
if self._ws_manager:
|
||||||
|
|||||||
217
src/services/post_refresh_qc.py
Normale Datei
217
src/services/post_refresh_qc.py
Normale Datei
@@ -0,0 +1,217 @@
|
|||||||
|
"""Post-Refresh Quality Check: Prueft Faktenchecks und Karten-Locations nach jedem Refresh.
|
||||||
|
|
||||||
|
Erkennt:
|
||||||
|
1. Inhaltliche Faktencheck-Duplikate (fuzzy matching)
|
||||||
|
2. Falsch kategorisierte Karten-Locations (z.B. Laender als 'target' die nicht angegriffen wurden)
|
||||||
|
"""
|
||||||
|
import logging
|
||||||
|
from difflib import SequenceMatcher
|
||||||
|
|
||||||
|
logger = logging.getLogger("osint.post_refresh_qc")
|
||||||
|
|
||||||
|
# Orte die in einem Konflikt-Kontext fast nie echte Angriffsziele sind
|
||||||
|
# (werden typischerweise wirtschaftlich/politisch erwaehnt)
|
||||||
|
_NON_TARGET_LOCATIONS = {
|
||||||
|
# Weit entfernte Laender (keine direkten Kriegsziele)
|
||||||
|
"australia", "australien", "northern territory", "queensland",
|
||||||
|
"new south wales", "victoria",
|
||||||
|
"cuba", "kuba",
|
||||||
|
"new york city", "new york", "washington",
|
||||||
|
"taiwan", "south korea", "japan",
|
||||||
|
"afghanistan", "pakistan", "karachi",
|
||||||
|
"china", "peking", "beijing",
|
||||||
|
"indien", "india", "new delhi",
|
||||||
|
"brasilien", "brazil",
|
||||||
|
"mexiko", "mexico",
|
||||||
|
"argentinien", "argentina",
|
||||||
|
"kanada", "canada",
|
||||||
|
"philippinen", "philippines",
|
||||||
|
"indonesien", "indonesia",
|
||||||
|
"nigeria", "south africa",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Orte die je nach Konflikt als actor/response statt target kategorisiert werden sollten
|
||||||
|
_ACTOR_NOT_TARGET = {
|
||||||
|
"usa", "united states", "us", "vereinigte staaten",
|
||||||
|
"deutschland", "germany",
|
||||||
|
"frankreich", "france",
|
||||||
|
"grossbritannien", "united kingdom", "uk",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
async def check_fact_duplicates(db, incident_id: int) -> int:
    """Detect and remove content-level fact-check duplicates within one incident.

    Uses fuzzy matching (SequenceMatcher similarity combined with a keyword
    Jaccard index) to find semantically identical claims; the weaker-evidenced
    entry of each duplicate pair is deleted.

    Args:
        db: Async DB connection (aiosqlite-style execute/fetchall — assumed; verify against caller).
        incident_id: Incident whose fact checks are examined.

    Returns:
        Number of duplicate fact checks removed.
    """
    from agents.factchecker import normalize_claim, _keyword_set

    cursor = await db.execute(
        "SELECT id, claim, status, sources_count, evidence, checked_at "
        "FROM fact_checks WHERE incident_id = ? ORDER BY checked_at DESC",
        (incident_id,),
    )
    all_facts = [dict(row) for row in await cursor.fetchall()]

    if len(all_facts) < 2:
        return 0

    # Higher priority survives a duplicate pair; unknown statuses rank lowest (0).
    STATUS_PRIORITY = {
        "confirmed": 5, "established": 5,
        "contradicted": 4, "disputed": 4,
        "unconfirmed": 3, "unverified": 3,
        "developing": 1,
    }

    # Precompute normalized claim text and keyword set once per fact.
    normalized = []
    for f in all_facts:
        nc = normalize_claim(f["claim"])
        kw = _keyword_set(f["claim"])
        normalized.append((f, nc, kw))

    # Pairwise comparison: only the 50 most recent facts are checked against all.
    ids_to_delete = set()
    checked_pairs = set()

    recent = normalized[:50]
    for i, (fact_a, norm_a, kw_a) in enumerate(recent):
        if fact_a["id"] in ids_to_delete:
            continue
        for j, (fact_b, norm_b, kw_b) in enumerate(normalized):
            if i >= j or fact_b["id"] == fact_a["id"] or fact_b["id"] in ids_to_delete:
                continue

            pair_key = (min(fact_a["id"], fact_b["id"]), max(fact_a["id"], fact_b["id"]))
            if pair_key in checked_pairs:
                continue
            checked_pairs.add(pair_key)

            if not norm_a or not norm_b:
                continue

            # Length filter: claims whose sizes differ too much cannot be duplicates
            # (0.4 == 1/2.5, so the filter is symmetric).
            len_ratio = len(norm_a) / len(norm_b)
            if len_ratio > 2.5 or len_ratio < 0.4:
                continue

            seq_ratio = SequenceMatcher(None, norm_a, norm_b).ratio()
            kw_union = kw_a | kw_b
            jaccard = len(kw_a & kw_b) / len(kw_union) if kw_union else 0.0
            combined = 0.7 * seq_ratio + 0.3 * jaccard

            if combined >= 0.80:  # high threshold for duplicate detection
                # Keep the entry with higher status priority, then more sources;
                # on a full tie the more recent fact_a wins.
                score_a = (STATUS_PRIORITY.get(fact_a["status"], 0), fact_a.get("sources_count", 0))
                score_b = (STATUS_PRIORITY.get(fact_b["status"], 0), fact_b.get("sources_count", 0))

                loser_id = fact_b["id"] if score_a >= score_b else fact_a["id"]
                winner_id = fact_a["id"] if score_a >= score_b else fact_b["id"]
                ids_to_delete.add(loser_id)
                logger.info(
                    "QC Duplikat: ID %d entfernt (Score %.2f), behalte ID %d",
                    loser_id, combined, winner_id,
                )
                # BUGFIX: if fact_a itself lost, stop comparing it against further
                # facts — a doomed entry must not knock out surviving facts.
                if loser_id == fact_a["id"]:
                    break

    if ids_to_delete:
        placeholders = ",".join("?" * len(ids_to_delete))
        await db.execute(
            f"DELETE FROM fact_checks WHERE id IN ({placeholders})",
            list(ids_to_delete),
        )
        logger.info(
            "QC: %d Faktencheck-Duplikate entfernt fuer Incident %d",
            len(ids_to_delete), incident_id,
        )

    return len(ids_to_delete)
|
|
||||||
|
|
||||||
|
async def check_location_categories(db, incident_id: int) -> int:
    """Detect and fix miscategorised map locations for one incident.

    Locations marked as 'target' that clearly are not attack targets in the
    conflict context are re-categorised ('mentioned', 'actor' or 'response').

    Args:
        db: Async DB connection (aiosqlite-style execute/fetchall — assumed; verify against caller).
        incident_id: Incident whose locations are examined.

    Returns:
        Number of corrected entries.
    """
    cursor = await db.execute(
        "SELECT id, location_name, latitude, longitude, category "
        "FROM article_locations WHERE incident_id = ? AND category = 'target'",
        (incident_id,),
    )
    targets = [dict(row) for row in await cursor.fetchall()]

    if not targets:
        return 0

    fixes = []

    for loc in targets:
        # Robustness: a NULL/empty name would crash .lower() — skip such rows.
        if not loc["location_name"]:
            continue
        name_lower = loc["location_name"].lower().strip()

        if name_lower in _NON_TARGET_LOCATIONS:
            fixes.append((loc["id"], "mentioned"))
        elif name_lower in _ACTOR_NOT_TARGET:
            lat, lon = loc["latitude"], loc["longitude"]
            # Robustness: coordinate range checks require numeric values;
            # skip rows with NULL coordinates instead of raising TypeError.
            if lat is None or lon is None:
                continue
            # USA with DC-area coordinates -> actor
            if name_lower in ("usa", "united states", "us", "vereinigte staaten"):
                if 37.0 <= lat <= 41.0 and -79.0 <= lon <= -74.0:
                    fixes.append((loc["id"], "actor"))
            # Germany with Berlin-area coordinates -> response
            elif name_lower in ("deutschland", "germany"):
                if 50.0 <= lat <= 54.0 and 10.0 <= lon <= 15.0:
                    fixes.append((loc["id"], "response"))
            # France with Paris-area coordinates -> response
            elif name_lower in ("frankreich", "france"):
                if 45.0 <= lat <= 50.0 and 1.0 <= lon <= 4.0:
                    fixes.append((loc["id"], "response"))
            # UK with London-area coordinates -> response
            elif name_lower in ("grossbritannien", "united kingdom", "uk"):
                if 50.0 <= lat <= 56.0 and -3.0 <= lon <= 1.0:
                    fixes.append((loc["id"], "response"))

    total_fixed = 0
    for loc_id, new_category in fixes:
        await db.execute(
            "UPDATE article_locations SET category = ? WHERE id = ?",
            (new_category, loc_id),
        )
        total_fixed += 1

    if total_fixed > 0:
        logger.info(
            "QC: %d Karten-Location-Kategorien korrigiert fuer Incident %d",
            total_fixed, incident_id,
        )

    return total_fixed
|
|
||||||
|
|
||||||
|
async def run_post_refresh_qc(db, incident_id: int) -> dict:
    """Run the complete post-refresh quality check for one incident.

    Runs duplicate detection on fact checks and category correction on map
    locations, committing only when something actually changed. Any failure
    is logged and reported in the result instead of propagating.

    Returns:
        Dict {facts_removed, locations_fixed}; on failure both counts are 0
        and an additional "error" key carries the exception text.
    """
    try:
        removed = await check_fact_duplicates(db, incident_id)
        fixed = await check_location_categories(db, incident_id)

        # Persist and report only when the QC actually modified data.
        if removed or fixed:
            await db.commit()
            logger.info(
                "Post-Refresh QC fuer Incident %d: %d Duplikate entfernt, %d Locations korrigiert",
                incident_id, removed, fixed,
            )

        return {"facts_removed": removed, "locations_fixed": fixed}
    except Exception as exc:
        logger.error(
            "Post-Refresh QC Fehler fuer Incident %d: %s",
            incident_id, exc, exc_info=True,
        )
        return {"facts_removed": 0, "locations_fixed": 0, "error": str(exc)}
|
||||||
In neuem Issue referenzieren
Einen Benutzer sperren