diff --git a/src/agents/orchestrator.py b/src/agents/orchestrator.py index 3e0a6bc..d6de3b2 100644 --- a/src/agents/orchestrator.py +++ b/src/agents/orchestrator.py @@ -985,6 +985,17 @@ class AgentOrchestrator: await db.commit() + # Post-Refresh Quality Check: Duplikate und Karten-Kategorien pruefen + try: + from services.post_refresh_qc import run_post_refresh_qc + qc_result = await run_post_refresh_qc(db, incident_id) + if qc_result.get("facts_removed", 0) > 0 or qc_result.get("locations_fixed", 0) > 0: + logger.info( + f"QC: {qc_result['facts_removed']} Duplikate, " + f"{qc_result['locations_fixed']} Location-Fixes" + ) + except Exception as qc_err: + logger.warning(f"Post-Refresh QC fehlgeschlagen: {qc_err}") # Gebündelte Notification senden (nicht beim ersten Refresh) if not is_first_refresh: if self._ws_manager: diff --git a/src/services/post_refresh_qc.py b/src/services/post_refresh_qc.py new file mode 100644 index 0000000..a84417a --- /dev/null +++ b/src/services/post_refresh_qc.py @@ -0,0 +1,217 @@ +"""Post-Refresh Quality Check: Prueft Faktenchecks und Karten-Locations nach jedem Refresh. + +Erkennt: +1. Inhaltliche Faktencheck-Duplikate (fuzzy matching) +2. Falsch kategorisierte Karten-Locations (z.B. 
"""Post-refresh quality check: validates fact checks and map locations after every refresh.

Detects:
1. Content-level duplicate fact checks (fuzzy matching).
2. Miscategorized map locations (e.g. countries tagged as 'target'
   that were never actually attacked).
"""
import logging
from difflib import SequenceMatcher

logger = logging.getLogger("osint.post_refresh_qc")

# Places that are almost never real attack targets in a conflict context
# (they are typically mentioned in an economic/political sense).
_NON_TARGET_LOCATIONS = {
    # Far-away countries (no direct war targets)
    "australia", "australien", "northern territory", "queensland",
    "new south wales", "victoria",
    "cuba", "kuba",
    "new york city", "new york", "washington",
    "taiwan", "south korea", "japan",
    "afghanistan", "pakistan", "karachi",
    "china", "peking", "beijing",
    "indien", "india", "new delhi",
    "brasilien", "brazil",
    "mexiko", "mexico",
    "argentinien", "argentina",
    "kanada", "canada",
    "philippinen", "philippines",
    "indonesien", "indonesia",
    "nigeria", "south africa",
}

# Places that, depending on the conflict, should be categorized as
# actor/response rather than target.
_ACTOR_NOT_TARGET = {
    "usa", "united states", "us", "vereinigte staaten",
    "deutschland", "germany",
    "frankreich", "france",
    "grossbritannien", "united kingdom", "uk",
}


async def check_fact_duplicates(db, incident_id: int) -> int:
    """Detect and remove content-level duplicate fact checks within one incident.

    Uses fuzzy matching (SequenceMatcher ratio blended with a keyword
    Jaccard index) to find semantically identical claims; of each duplicate
    pair the weaker-supported entry is deleted from the database.

    Args:
        db: async DB connection (aiosqlite-style, rows convertible via
            ``dict(row)``) -- assumption, verify against caller.
        incident_id: incident whose fact checks are scanned.

    Returns:
        Number of removed duplicates.
    """
    # Local import: avoids a module-level import cycle with the agents package.
    from agents.factchecker import normalize_claim, _keyword_set

    cursor = await db.execute(
        "SELECT id, claim, status, sources_count, evidence, checked_at "
        "FROM fact_checks WHERE incident_id = ? ORDER BY checked_at DESC",
        (incident_id,),
    )
    all_facts = [dict(row) for row in await cursor.fetchall()]

    if len(all_facts) < 2:
        return 0

    # Higher value = stronger evidence; of a duplicate pair the entry with
    # the lower (priority, sources_count) tuple is the one deleted.
    STATUS_PRIORITY = {
        "confirmed": 5, "established": 5,
        "contradicted": 4, "disputed": 4,
        "unconfirmed": 3, "unverified": 3,
        "developing": 1,
    }

    # Precompute normalized claims and keyword sets once per fact.
    normalized = [
        (f, normalize_claim(f["claim"]), _keyword_set(f["claim"]))
        for f in all_facts
    ]

    ids_to_delete = set()
    checked_pairs = set()

    # Pairwise comparison: only the 50 most recent facts (rows are ordered
    # newest-first by checked_at DESC) are compared against all others.
    recent = normalized[:50]
    for i, (fact_a, norm_a, kw_a) in enumerate(recent):
        if fact_a["id"] in ids_to_delete:
            continue
        for j, (fact_b, norm_b, kw_b) in enumerate(normalized):
            if i >= j or fact_b["id"] == fact_a["id"] or fact_b["id"] in ids_to_delete:
                continue

            pair_key = (min(fact_a["id"], fact_b["id"]), max(fact_a["id"], fact_b["id"]))
            if pair_key in checked_pairs:
                continue
            checked_pairs.add(pair_key)

            if not norm_a or not norm_b:
                continue

            # Length filter: claims of wildly different length cannot be
            # duplicates (0.4 == 1/2.5, so the bound is symmetric).
            len_ratio = len(norm_a) / len(norm_b)
            if len_ratio > 2.5 or len_ratio < 0.4:
                continue

            seq_ratio = SequenceMatcher(None, norm_a, norm_b).ratio()
            kw_union = kw_a | kw_b
            jaccard = len(kw_a & kw_b) / len(kw_union) if kw_union else 0.0
            combined = 0.7 * seq_ratio + 0.3 * jaccard

            if combined >= 0.80:  # high threshold for duplicate detection
                # Keep the fact with higher status priority, then more sources.
                score_a = (STATUS_PRIORITY.get(fact_a["status"], 0), fact_a.get("sources_count", 0))
                score_b = (STATUS_PRIORITY.get(fact_b["status"], 0), fact_b.get("sources_count", 0))

                loser_id = fact_b["id"] if score_a >= score_b else fact_a["id"]
                winner_id = fact_a["id"] if score_a >= score_b else fact_b["id"]
                ids_to_delete.add(loser_id)
                logger.info(
                    "QC Duplikat: ID %d entfernt (Score %.2f), behalte ID %d",
                    loser_id, combined, winner_id,
                )
                # BUGFIX: once fact_a itself has been marked for deletion it
                # must not serve as a comparison anchor any longer, otherwise
                # a later match against fact_a could also delete the surviving
                # copy of a pair (removing BOTH duplicates of a fact).
                if loser_id == fact_a["id"]:
                    break

    if ids_to_delete:
        # Parameterized IN-list delete; only the placeholder count is
        # interpolated, never data, so no SQL injection risk.
        placeholders = ",".join("?" * len(ids_to_delete))
        await db.execute(
            f"DELETE FROM fact_checks WHERE id IN ({placeholders})",
            list(ids_to_delete),
        )
        logger.info(
            "QC: %d Faktencheck-Duplikate entfernt fuer Incident %d",
            len(ids_to_delete), incident_id,
        )

    return len(ids_to_delete)


async def check_location_categories(db, incident_id: int) -> int:
    """Check and correct miscategorized map locations.

    Locations marked as 'target' that are clearly not attack targets in
    the conflict context are reassigned to a more fitting category
    ('mentioned', 'actor' or 'response').

    Args:
        db: async DB connection (aiosqlite-style) -- assumption, verify
            against caller.
        incident_id: incident whose locations are checked.

    Returns:
        Number of corrected entries.
    """
    cursor = await db.execute(
        "SELECT id, location_name, latitude, longitude, category "
        "FROM article_locations WHERE incident_id = ? AND category = 'target'",
        (incident_id,),
    )
    targets = [dict(row) for row in await cursor.fetchall()]

    if not targets:
        return 0

    fixes = []

    for loc in targets:
        name_lower = loc["location_name"].lower().strip()

        if name_lower in _NON_TARGET_LOCATIONS:
            fixes.append((loc["id"], "mentioned"))
        elif name_lower in _ACTOR_NOT_TARGET:
            lat, lon = loc["latitude"], loc["longitude"]
            # The bounding boxes appear to pin the capital region so that a
            # same-named location with other coordinates is left untouched
            # -- NOTE(review): confirm this intent against the geocoder.
            # USA with DC coordinates -> actor
            if name_lower in ("usa", "united states", "us", "vereinigte staaten"):
                if 37.0 <= lat <= 41.0 and -79.0 <= lon <= -74.0:
                    fixes.append((loc["id"], "actor"))
            # Germany with Berlin coordinates -> response
            elif name_lower in ("deutschland", "germany"):
                if 50.0 <= lat <= 54.0 and 10.0 <= lon <= 15.0:
                    fixes.append((loc["id"], "response"))
            # France with Paris coordinates -> response
            elif name_lower in ("frankreich", "france"):
                if 45.0 <= lat <= 50.0 and 1.0 <= lon <= 4.0:
                    fixes.append((loc["id"], "response"))
            # UK with London coordinates -> response
            elif name_lower in ("grossbritannien", "united kingdom", "uk"):
                if 50.0 <= lat <= 56.0 and -3.0 <= lon <= 1.0:
                    fixes.append((loc["id"], "response"))

    total_fixed = 0
    for loc_id, new_category in fixes:
        await db.execute(
            "UPDATE article_locations SET category = ? WHERE id = ?",
            (new_category, loc_id),
        )
        total_fixed += 1

    if total_fixed > 0:
        logger.info(
            "QC: %d Karten-Location-Kategorien korrigiert fuer Incident %d",
            total_fixed, incident_id,
        )

    return total_fixed


async def run_post_refresh_qc(db, incident_id: int) -> dict:
    """Run the complete post-refresh quality check.

    Executes the duplicate scan and the location-category fix, committing
    only when something actually changed. Never raises: any error is
    logged and reported in the result dict instead, so callers stay safe.

    Returns:
        Dict with results ``{facts_removed, locations_fixed}`` plus an
        ``error`` key when the check failed.
    """
    try:
        facts_removed = await check_fact_duplicates(db, incident_id)
        locations_fixed = await check_location_categories(db, incident_id)

        if facts_removed > 0 or locations_fixed > 0:
            await db.commit()
            logger.info(
                "Post-Refresh QC fuer Incident %d: %d Duplikate entfernt, %d Locations korrigiert",
                incident_id, facts_removed, locations_fixed,
            )

        return {"facts_removed": facts_removed, "locations_fixed": locations_fixed}

    except Exception as e:
        logger.error(
            "Post-Refresh QC Fehler fuer Incident %d: %s",
            incident_id, e, exc_info=True,
        )
        return {"facts_removed": 0, "locations_fixed": 0, "error": str(e)}