feat: Post-Refresh Quality Check fuer Faktenchecks und Karten-Locations
Automatischer QC-Schritt nach jedem Refresh:
- Erkennt inhaltliche Faktencheck-Duplikate via Fuzzy-Matching (Threshold 0.80)
- Korrigiert falsch kategorisierte Karten-Locations (z.B. entfernte Laender als 'target')
- Laeuft nach dem Faktencheck-Commit, vor den Notifications
- Fehler im QC blockieren nicht den Refresh-Ablauf

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Dieser Commit ist enthalten in:
@@ -985,6 +985,17 @@ class AgentOrchestrator:
|
|||||||
|
|
||||||
await db.commit()
|
await db.commit()
|
||||||
|
|
||||||
|
# Post-Refresh Quality Check: Duplikate und Karten-Kategorien pruefen
|
||||||
|
try:
|
||||||
|
from services.post_refresh_qc import run_post_refresh_qc
|
||||||
|
qc_result = await run_post_refresh_qc(db, incident_id)
|
||||||
|
if qc_result.get("facts_removed", 0) > 0 or qc_result.get("locations_fixed", 0) > 0:
|
||||||
|
logger.info(
|
||||||
|
f"QC: {qc_result['facts_removed']} Duplikate, "
|
||||||
|
f"{qc_result['locations_fixed']} Location-Fixes"
|
||||||
|
)
|
||||||
|
except Exception as qc_err:
|
||||||
|
logger.warning(f"Post-Refresh QC fehlgeschlagen: {qc_err}")
|
||||||
# Gebündelte Notification senden (nicht beim ersten Refresh)
|
# Gebündelte Notification senden (nicht beim ersten Refresh)
|
||||||
if not is_first_refresh:
|
if not is_first_refresh:
|
||||||
if self._ws_manager:
|
if self._ws_manager:
|
||||||
|
|||||||
217
src/services/post_refresh_qc.py
Normale Datei
217
src/services/post_refresh_qc.py
Normale Datei
@@ -0,0 +1,217 @@
|
|||||||
|
"""Post-Refresh Quality Check: Prueft Faktenchecks und Karten-Locations nach jedem Refresh.
|
||||||
|
|
||||||
|
Erkennt:
|
||||||
|
1. Inhaltliche Faktencheck-Duplikate (fuzzy matching)
|
||||||
|
2. Falsch kategorisierte Karten-Locations (z.B. Laender als 'target' die nicht angegriffen wurden)
|
||||||
|
"""
|
||||||
|
import logging
|
||||||
|
from difflib import SequenceMatcher
|
||||||
|
|
||||||
|
logger = logging.getLogger("osint.post_refresh_qc")
|
||||||
|
|
||||||
|
# Orte die in einem Konflikt-Kontext fast nie echte Angriffsziele sind
|
||||||
|
# (werden typischerweise wirtschaftlich/politisch erwaehnt)
|
||||||
|
_NON_TARGET_LOCATIONS = {
|
||||||
|
# Weit entfernte Laender (keine direkten Kriegsziele)
|
||||||
|
"australia", "australien", "northern territory", "queensland",
|
||||||
|
"new south wales", "victoria",
|
||||||
|
"cuba", "kuba",
|
||||||
|
"new york city", "new york", "washington",
|
||||||
|
"taiwan", "south korea", "japan",
|
||||||
|
"afghanistan", "pakistan", "karachi",
|
||||||
|
"china", "peking", "beijing",
|
||||||
|
"indien", "india", "new delhi",
|
||||||
|
"brasilien", "brazil",
|
||||||
|
"mexiko", "mexico",
|
||||||
|
"argentinien", "argentina",
|
||||||
|
"kanada", "canada",
|
||||||
|
"philippinen", "philippines",
|
||||||
|
"indonesien", "indonesia",
|
||||||
|
"nigeria", "south africa",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Orte die je nach Konflikt als actor/response statt target kategorisiert werden sollten
|
||||||
|
_ACTOR_NOT_TARGET = {
|
||||||
|
"usa", "united states", "us", "vereinigte staaten",
|
||||||
|
"deutschland", "germany",
|
||||||
|
"frankreich", "france",
|
||||||
|
"grossbritannien", "united kingdom", "uk",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
async def check_fact_duplicates(db, incident_id: int) -> int:
    """Detect and remove content-level fact-check duplicates within one incident.

    Uses fuzzy matching (SequenceMatcher similarity combined with a keyword
    Jaccard index) to find semantically identical claims; the weaker-evidenced
    entry of each duplicate pair is deleted.

    Args:
        db: Async DB connection (aiosqlite-style execute/fetchall — assumed; verify against caller).
        incident_id: Incident whose fact checks are examined.

    Returns:
        Number of duplicate fact checks removed.
    """
    from agents.factchecker import normalize_claim, _keyword_set

    cursor = await db.execute(
        "SELECT id, claim, status, sources_count, evidence, checked_at "
        "FROM fact_checks WHERE incident_id = ? ORDER BY checked_at DESC",
        (incident_id,),
    )
    all_facts = [dict(row) for row in await cursor.fetchall()]

    if len(all_facts) < 2:
        return 0

    # Higher priority survives a duplicate pair; unknown statuses rank lowest (0).
    STATUS_PRIORITY = {
        "confirmed": 5, "established": 5,
        "contradicted": 4, "disputed": 4,
        "unconfirmed": 3, "unverified": 3,
        "developing": 1,
    }

    # Precompute normalized claim text and keyword set once per fact.
    normalized = []
    for f in all_facts:
        nc = normalize_claim(f["claim"])
        kw = _keyword_set(f["claim"])
        normalized.append((f, nc, kw))

    # Pairwise comparison: only the 50 most recent facts are checked against all.
    ids_to_delete = set()
    checked_pairs = set()

    recent = normalized[:50]
    for i, (fact_a, norm_a, kw_a) in enumerate(recent):
        if fact_a["id"] in ids_to_delete:
            continue
        for j, (fact_b, norm_b, kw_b) in enumerate(normalized):
            if i >= j or fact_b["id"] == fact_a["id"] or fact_b["id"] in ids_to_delete:
                continue

            pair_key = (min(fact_a["id"], fact_b["id"]), max(fact_a["id"], fact_b["id"]))
            if pair_key in checked_pairs:
                continue
            checked_pairs.add(pair_key)

            if not norm_a or not norm_b:
                continue

            # Length filter: claims whose sizes differ too much cannot be duplicates
            # (0.4 == 1/2.5, so the filter is symmetric).
            len_ratio = len(norm_a) / len(norm_b)
            if len_ratio > 2.5 or len_ratio < 0.4:
                continue

            seq_ratio = SequenceMatcher(None, norm_a, norm_b).ratio()
            kw_union = kw_a | kw_b
            jaccard = len(kw_a & kw_b) / len(kw_union) if kw_union else 0.0
            combined = 0.7 * seq_ratio + 0.3 * jaccard

            if combined >= 0.80:  # high threshold for duplicate detection
                # Keep the entry with higher status priority, then more sources;
                # on a full tie the more recent fact_a wins.
                score_a = (STATUS_PRIORITY.get(fact_a["status"], 0), fact_a.get("sources_count", 0))
                score_b = (STATUS_PRIORITY.get(fact_b["status"], 0), fact_b.get("sources_count", 0))

                loser_id = fact_b["id"] if score_a >= score_b else fact_a["id"]
                winner_id = fact_a["id"] if score_a >= score_b else fact_b["id"]
                ids_to_delete.add(loser_id)
                logger.info(
                    "QC Duplikat: ID %d entfernt (Score %.2f), behalte ID %d",
                    loser_id, combined, winner_id,
                )
                # BUGFIX: if fact_a itself lost, stop comparing it against further
                # facts — a doomed entry must not knock out surviving facts.
                if loser_id == fact_a["id"]:
                    break

    if ids_to_delete:
        placeholders = ",".join("?" * len(ids_to_delete))
        await db.execute(
            f"DELETE FROM fact_checks WHERE id IN ({placeholders})",
            list(ids_to_delete),
        )
        logger.info(
            "QC: %d Faktencheck-Duplikate entfernt fuer Incident %d",
            len(ids_to_delete), incident_id,
        )

    return len(ids_to_delete)
|
|
||||||
|
|
||||||
|
async def check_location_categories(db, incident_id: int) -> int:
    """Detect and fix miscategorised map locations for one incident.

    Locations marked as 'target' that clearly are not attack targets in the
    conflict context are re-categorised ('mentioned', 'actor' or 'response').

    Args:
        db: Async DB connection (aiosqlite-style execute/fetchall — assumed; verify against caller).
        incident_id: Incident whose locations are examined.

    Returns:
        Number of corrected entries.
    """
    cursor = await db.execute(
        "SELECT id, location_name, latitude, longitude, category "
        "FROM article_locations WHERE incident_id = ? AND category = 'target'",
        (incident_id,),
    )
    targets = [dict(row) for row in await cursor.fetchall()]

    if not targets:
        return 0

    fixes = []

    for loc in targets:
        # Robustness: a NULL/empty name would crash .lower() — skip such rows.
        if not loc["location_name"]:
            continue
        name_lower = loc["location_name"].lower().strip()

        if name_lower in _NON_TARGET_LOCATIONS:
            fixes.append((loc["id"], "mentioned"))
        elif name_lower in _ACTOR_NOT_TARGET:
            lat, lon = loc["latitude"], loc["longitude"]
            # Robustness: coordinate range checks require numeric values;
            # skip rows with NULL coordinates instead of raising TypeError.
            if lat is None or lon is None:
                continue
            # USA with DC-area coordinates -> actor
            if name_lower in ("usa", "united states", "us", "vereinigte staaten"):
                if 37.0 <= lat <= 41.0 and -79.0 <= lon <= -74.0:
                    fixes.append((loc["id"], "actor"))
            # Germany with Berlin-area coordinates -> response
            elif name_lower in ("deutschland", "germany"):
                if 50.0 <= lat <= 54.0 and 10.0 <= lon <= 15.0:
                    fixes.append((loc["id"], "response"))
            # France with Paris-area coordinates -> response
            elif name_lower in ("frankreich", "france"):
                if 45.0 <= lat <= 50.0 and 1.0 <= lon <= 4.0:
                    fixes.append((loc["id"], "response"))
            # UK with London-area coordinates -> response
            elif name_lower in ("grossbritannien", "united kingdom", "uk"):
                if 50.0 <= lat <= 56.0 and -3.0 <= lon <= 1.0:
                    fixes.append((loc["id"], "response"))

    total_fixed = 0
    for loc_id, new_category in fixes:
        await db.execute(
            "UPDATE article_locations SET category = ? WHERE id = ?",
            (new_category, loc_id),
        )
        total_fixed += 1

    if total_fixed > 0:
        logger.info(
            "QC: %d Karten-Location-Kategorien korrigiert fuer Incident %d",
            total_fixed, incident_id,
        )

    return total_fixed
|
|
||||||
|
|
||||||
|
async def run_post_refresh_qc(db, incident_id: int) -> dict:
    """Run the complete post-refresh quality check for one incident.

    Runs duplicate detection on fact checks and category correction on map
    locations, committing only when something actually changed. Any failure
    is logged and reported in the result instead of propagating.

    Returns:
        Dict {facts_removed, locations_fixed}; on failure both counts are 0
        and an additional "error" key carries the exception text.
    """
    try:
        removed = await check_fact_duplicates(db, incident_id)
        fixed = await check_location_categories(db, incident_id)

        # Persist and report only when the QC actually modified data.
        if removed or fixed:
            await db.commit()
            logger.info(
                "Post-Refresh QC fuer Incident %d: %d Duplikate entfernt, %d Locations korrigiert",
                incident_id, removed, fixed,
            )

        return {"facts_removed": removed, "locations_fixed": fixed}
    except Exception as exc:
        logger.error(
            "Post-Refresh QC Fehler fuer Incident %d: %s",
            incident_id, exc, exc_info=True,
        )
        return {"facts_removed": 0, "locations_fixed": 0, "error": str(exc)}
|
||||||
In neuem Issue referenzieren
Einen Benutzer sperren