feat: Post-Refresh Quality Check fuer Faktenchecks und Karten-Locations
Automatischer QC-Schritt nach jedem Refresh: - Erkennt inhaltliche Faktencheck-Duplikate via Fuzzy-Matching (Threshold 0.80) - Korrigiert falsch kategorisierte Karten-Locations (z.B. entfernte Laender als 'target') - Laeuft nach dem Faktencheck-Commit, vor den Notifications - Fehler im QC blockieren nicht den Refresh-Ablauf Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Dieser Commit ist enthalten in:
@@ -985,6 +985,17 @@ class AgentOrchestrator:
|
||||
|
||||
await db.commit()
|
||||
|
||||
# Post-Refresh Quality Check: Duplikate und Karten-Kategorien pruefen
|
||||
try:
|
||||
from services.post_refresh_qc import run_post_refresh_qc
|
||||
qc_result = await run_post_refresh_qc(db, incident_id)
|
||||
if qc_result.get("facts_removed", 0) > 0 or qc_result.get("locations_fixed", 0) > 0:
|
||||
logger.info(
|
||||
f"QC: {qc_result['facts_removed']} Duplikate, "
|
||||
f"{qc_result['locations_fixed']} Location-Fixes"
|
||||
)
|
||||
except Exception as qc_err:
|
||||
logger.warning(f"Post-Refresh QC fehlgeschlagen: {qc_err}")
|
||||
# Gebündelte Notification senden (nicht beim ersten Refresh)
|
||||
if not is_first_refresh:
|
||||
if self._ws_manager:
|
||||
|
||||
217
src/services/post_refresh_qc.py
Normale Datei
217
src/services/post_refresh_qc.py
Normale Datei
@@ -0,0 +1,217 @@
|
||||
"""Post-Refresh Quality Check: Prueft Faktenchecks und Karten-Locations nach jedem Refresh.
|
||||
|
||||
Erkennt:
|
||||
1. Inhaltliche Faktencheck-Duplikate (fuzzy matching)
|
||||
2. Falsch kategorisierte Karten-Locations (z.B. Laender als 'target' die nicht angegriffen wurden)
|
||||
"""
|
||||
import logging
|
||||
from difflib import SequenceMatcher
|
||||
|
||||
logger = logging.getLogger("osint.post_refresh_qc")
|
||||
|
||||
# Orte die in einem Konflikt-Kontext fast nie echte Angriffsziele sind
|
||||
# (werden typischerweise wirtschaftlich/politisch erwaehnt)
|
||||
_NON_TARGET_LOCATIONS = {
|
||||
# Weit entfernte Laender (keine direkten Kriegsziele)
|
||||
"australia", "australien", "northern territory", "queensland",
|
||||
"new south wales", "victoria",
|
||||
"cuba", "kuba",
|
||||
"new york city", "new york", "washington",
|
||||
"taiwan", "south korea", "japan",
|
||||
"afghanistan", "pakistan", "karachi",
|
||||
"china", "peking", "beijing",
|
||||
"indien", "india", "new delhi",
|
||||
"brasilien", "brazil",
|
||||
"mexiko", "mexico",
|
||||
"argentinien", "argentina",
|
||||
"kanada", "canada",
|
||||
"philippinen", "philippines",
|
||||
"indonesien", "indonesia",
|
||||
"nigeria", "south africa",
|
||||
}
|
||||
|
||||
# Orte die je nach Konflikt als actor/response statt target kategorisiert werden sollten
|
||||
_ACTOR_NOT_TARGET = {
|
||||
"usa", "united states", "us", "vereinigte staaten",
|
||||
"deutschland", "germany",
|
||||
"frankreich", "france",
|
||||
"grossbritannien", "united kingdom", "uk",
|
||||
}
|
||||
|
||||
|
||||
async def check_fact_duplicates(db, incident_id: int) -> int:
    """Detect and remove content-level duplicate fact checks within one incident.

    Uses fuzzy matching (SequenceMatcher ratio combined with a keyword
    Jaccard index) to find semantically identical claims; of each duplicate
    pair the weaker-evidenced fact is deleted. Only the 50 most recent facts
    are compared against all others to bound the pairwise cost.

    Args:
        db: Async DB connection (aiosqlite-style interface assumed — TODO confirm).
        incident_id: Incident whose fact checks are examined.

    Returns:
        Number of duplicate fact-check rows deleted.
    """
    from agents.factchecker import normalize_claim, _keyword_set

    cursor = await db.execute(
        "SELECT id, claim, status, sources_count, evidence, checked_at "
        "FROM fact_checks WHERE incident_id = ? ORDER BY checked_at DESC",
        (incident_id,),
    )
    all_facts = [dict(row) for row in await cursor.fetchall()]

    if len(all_facts) < 2:
        return 0

    # Higher value wins when two claims collide; unknown statuses rank lowest.
    STATUS_PRIORITY = {
        "confirmed": 5, "established": 5,
        "contradicted": 4, "disputed": 4,
        "unconfirmed": 3, "unverified": 3,
        "developing": 1,
    }

    # Precompute normalized claims and keyword sets once per fact.
    normalized = [
        (f, normalize_claim(f["claim"]), _keyword_set(f["claim"]))
        for f in all_facts
    ]

    ids_to_delete = set()

    # Pairwise comparison: only the 50 newest facts are compared against all
    # later ones. Because `recent` is a prefix of `normalized`, the `i >= j`
    # skip guarantees each unordered pair is visited at most once, so no
    # extra seen-pair bookkeeping is needed.
    recent = normalized[:50]
    for i, (fact_a, norm_a, kw_a) in enumerate(recent):
        if fact_a["id"] in ids_to_delete:
            continue
        for j, (fact_b, norm_b, kw_b) in enumerate(normalized):
            if i >= j or fact_b["id"] in ids_to_delete:
                continue
            if not norm_a or not norm_b:
                continue

            # Length filter: wildly different lengths cannot be duplicates
            # (norm_b is known non-empty here, so the division is safe).
            len_ratio = len(norm_a) / len(norm_b)
            if len_ratio > 2.5 or len_ratio < 0.4:
                continue

            seq_ratio = SequenceMatcher(None, norm_a, norm_b).ratio()
            kw_union = kw_a | kw_b
            jaccard = len(kw_a & kw_b) / len(kw_union) if kw_union else 0.0
            combined = 0.7 * seq_ratio + 0.3 * jaccard

            if combined >= 0.80:  # high threshold for duplicate detection
                # Keep the fact with the higher status priority, then more
                # sources. BUGFIX: sources_count may be NULL in the DB; coerce
                # None to 0 so the tuple comparison cannot raise TypeError.
                score_a = (STATUS_PRIORITY.get(fact_a["status"], 0),
                           fact_a.get("sources_count") or 0)
                score_b = (STATUS_PRIORITY.get(fact_b["status"], 0),
                           fact_b.get("sources_count") or 0)

                loser_id = fact_b["id"] if score_a >= score_b else fact_a["id"]
                winner_id = fact_a["id"] if score_a >= score_b else fact_b["id"]
                ids_to_delete.add(loser_id)
                logger.info(
                    "QC Duplikat: ID %d entfernt (Score %.2f), behalte ID %d",
                    loser_id, combined, winner_id,
                )
                # BUGFIX: if fact_a itself lost, stop comparing it — further
                # matches against an already-deleted fact could remove rows
                # that do not duplicate any surviving fact.
                if loser_id == fact_a["id"]:
                    break

    if ids_to_delete:
        placeholders = ",".join("?" * len(ids_to_delete))
        await db.execute(
            f"DELETE FROM fact_checks WHERE id IN ({placeholders})",
            list(ids_to_delete),
        )
        logger.info(
            "QC: %d Faktencheck-Duplikate entfernt fuer Incident %d",
            len(ids_to_delete), incident_id,
        )

    return len(ids_to_delete)
|
||||
|
||||
|
||||
async def check_location_categories(db, incident_id: int) -> int:
    """Audit and correct miscategorized map locations for one incident.

    Locations tagged 'target' that are obviously not attack targets in the
    conflict context are re-categorized:
      * far-away countries (``_NON_TARGET_LOCATIONS``) -> 'mentioned'
      * Western actors referenced via capital-city coordinates
        (``_ACTOR_NOT_TARGET``) -> 'actor' or 'response'

    Args:
        db: Async DB connection (aiosqlite-style interface assumed — TODO confirm).
        incident_id: Incident whose map locations are examined.

    Returns:
        Number of corrected rows.
    """
    cursor = await db.execute(
        "SELECT id, location_name, latitude, longitude, category "
        "FROM article_locations WHERE incident_id = ? AND category = 'target'",
        (incident_id,),
    )
    targets = [dict(row) for row in await cursor.fetchall()]

    if not targets:
        return 0

    fixes = []  # list of (row id, corrected category)

    for loc in targets:
        # BUGFIX: location_name may be NULL in the DB; skip such rows instead
        # of raising AttributeError on None.lower().
        name = loc["location_name"]
        if not name:
            continue
        name_lower = name.lower().strip()

        if name_lower in _NON_TARGET_LOCATIONS:
            fixes.append((loc["id"], "mentioned"))
        elif name_lower in _ACTOR_NOT_TARGET:
            lat, lon = loc["latitude"], loc["longitude"]
            # BUGFIX: coordinates may be NULL; the range checks below would
            # raise TypeError when comparing None with a float.
            if lat is None or lon is None:
                continue
            # USA with Washington-DC coordinates -> actor
            if name_lower in ("usa", "united states", "us", "vereinigte staaten"):
                if 37.0 <= lat <= 41.0 and -79.0 <= lon <= -74.0:
                    fixes.append((loc["id"], "actor"))
            # Germany with Berlin coordinates -> response
            elif name_lower in ("deutschland", "germany"):
                if 50.0 <= lat <= 54.0 and 10.0 <= lon <= 15.0:
                    fixes.append((loc["id"], "response"))
            # France with Paris coordinates -> response
            elif name_lower in ("frankreich", "france"):
                if 45.0 <= lat <= 50.0 and 1.0 <= lon <= 4.0:
                    fixes.append((loc["id"], "response"))
            # UK with London coordinates -> response
            elif name_lower in ("grossbritannien", "united kingdom", "uk"):
                if 50.0 <= lat <= 56.0 and -3.0 <= lon <= 1.0:
                    fixes.append((loc["id"], "response"))

    total_fixed = 0
    for loc_id, new_category in fixes:
        await db.execute(
            "UPDATE article_locations SET category = ? WHERE id = ?",
            (new_category, loc_id),
        )
        total_fixed += 1

    if total_fixed > 0:
        logger.info(
            "QC: %d Karten-Location-Kategorien korrigiert fuer Incident %d",
            total_fixed, incident_id,
        )

    return total_fixed
|
||||
|
||||
|
||||
async def run_post_refresh_qc(db, incident_id: int) -> dict:
    """Run the complete post-refresh quality check for one incident.

    Executes the duplicate scan and the location-category audit in sequence
    and commits only when at least one change was made. Any failure is
    logged and reported in the result dict rather than raised, so the
    caller's refresh flow is never blocked.

    Returns:
        Dict with the results ``{facts_removed, locations_fixed}``; an
        additional ``error`` key is present when the check failed.
    """
    try:
        removed = await check_fact_duplicates(db, incident_id)
        fixed = await check_location_categories(db, incident_id)

        # Persist and report only when something actually changed.
        if removed or fixed:
            await db.commit()
            logger.info(
                "Post-Refresh QC fuer Incident %d: %d Duplikate entfernt, %d Locations korrigiert",
                incident_id, removed, fixed,
            )

        return {"facts_removed": removed, "locations_fixed": fixed}

    except Exception as exc:
        logger.error(
            "Post-Refresh QC Fehler fuer Incident %d: %s",
            incident_id, exc, exc_info=True,
        )
        return {"facts_removed": 0, "locations_fixed": 0, "error": str(exc)}
|
||||
In neuem Issue referenzieren
Einen Benutzer sperren