Files
AegisSight-Monitor/src/services/fact_consolidation.py
claude-dev e2ea4eaaa0 Implement fact-check deduplication and auto-resolve
Three-tier system against duplicates:
1. Pre-dedup: the LLM response is deduplicated before the DB insert (deduplicate_new_facts)
2. Auto-resolve: confirmed facts automatically resolve stale developing/unconfirmed facts
3. Periodic consolidation: every 6 hours, Haiku clusters semantic duplicates and removes them

Improved claim matching: SequenceMatcher (70%) + Jaccard keyword overlap (30%)
instead of pure SequenceMatcher; threshold raised from 0.7 to 0.75 (a sketch of
the combined score follows below).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-08 21:59:50 +01:00
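
A minimal sketch of the combined matching score described above, assuming simple \w+ tokenization for the keyword sets (the actual find_matching_claim lives in agents/factchecker.py and is not shown on this page):

from difflib import SequenceMatcher
import re

def claim_similarity(a: str, b: str) -> float:
    """70% character-level similarity plus 30% keyword overlap."""
    seq = SequenceMatcher(None, a.lower(), b.lower()).ratio()
    ta = set(re.findall(r"\w+", a.lower()))
    tb = set(re.findall(r"\w+", b.lower()))
    jaccard = len(ta & tb) / len(ta | tb) if (ta or tb) else 0.0
    return 0.7 * seq + 0.3 * jaccard

# Two claims count as the same fact once the score clears 0.75
# (the auto-resolve path below uses a lower 0.65 threshold).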

244 lines
8.5 KiB
Python

"""Periodische Faktencheck-Konsolidierung via Haiku.
Erkennt und merged semantische Duplikate unter Faktenchecks.
Laeuft als Scheduler-Job alle 6 Stunden.
"""
import json
import logging
import re
from datetime import datetime
from config import CLAUDE_MODEL_FAST, TIMEZONE
from database import get_db
from agents.claude_client import call_claude
logger = logging.getLogger("osint.fact_consolidation")
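
# Higher value wins when picking a cluster's surviving fact; "developing"
# ranks lowest, so it is pruned first.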
STATUS_PRIORITY = {
    "confirmed": 5, "established": 5,
    "contradicted": 4, "disputed": 4,
    "unconfirmed": 3, "unverified": 3,
    "developing": 1,
}
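
# Prompt for the fast model: expects a bare JSON array of ID clusters,
# e.g. [[1,5,12], [3,8]]; [] when no duplicates are found.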
CONSOLIDATION_PROMPT = (
    "You are a deduplication agent. You receive a list of fact checks (ID + claim + status).\n"
    "Find groups of facts that state the SAME thing (even when worded differently).\n\n"
    "RULES:\n"
    '- Same fact = same group (e.g. "Khamenei was killed" and "Khamenei died in an attack")\n'
    "- Different levels of detail about the SAME fact = same group\n"
    '- DIFFERENT facts = different groups (e.g. "attack on Isfahan" vs "attack on Tehran")\n'
    "- A group must contain at least 2 entries\n\n"
    "Answer ONLY as a JSON array of groups. Each group is an array of IDs:\n"
    "[[1,5,12], [3,8], [20,25,30]]\n\n"
    "If there are no duplicates, answer with []\n\n"
    "FACTS:\n{facts_text}"
)


async def _ask_haiku_for_clusters(facts: list[dict]) -> list[list[int]]:
    """Asks Haiku which of the given facts are semantic duplicates."""
    facts_text = "\n".join(
        f'ID={f["id"]} [{f["status"]}]: {f["claim"]}'
        for f in facts
    )
    prompt = CONSOLIDATION_PROMPT.format(facts_text=facts_text)
    try:
        result, usage = await call_claude(prompt, tools=None, model=CLAUDE_MODEL_FAST)
        data = json.loads(result)
        if isinstance(data, list) and all(isinstance(g, list) for g in data):
            return data
    except json.JSONDecodeError:
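        # The model sometimes wraps the JSON in prose; fall back to the
        # first bracketed span in the reply.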
        match = re.search(r'\[.*\]', result, re.DOTALL)
        if match:
            try:
                data = json.loads(match.group())
                if isinstance(data, list):
                    return data
            except json.JSONDecodeError:
                pass
    except Exception as e:
        logger.error(f"Haiku cluster request failed: {e}")
    return []


async def consolidate_fact_checks(max_per_incident: int = 25):
    """Consolidates duplicate fact checks via Haiku clustering."""
    db = await get_db()
    try:
        cursor = await db.execute(
            "SELECT incident_id, COUNT(*) as cnt FROM fact_checks "
            "GROUP BY incident_id HAVING cnt > ?",
            (max_per_incident,),
        )
        bloated = [dict(row) for row in await cursor.fetchall()]
        if not bloated:
            logger.info("Fact-check consolidation: no bloated incidents found")
            return 0
        total_removed = 0
        for row in bloated:
            incident_id = row["incident_id"]
            # Skip incidents that currently have a refresh running
            cursor_rl = await db.execute(
                "SELECT COUNT(*) as cnt FROM refresh_log "
                "WHERE incident_id = ? AND status = 'running'",
                (incident_id,),
            )
            rl_row = await cursor_rl.fetchone()
            if rl_row and rl_row["cnt"] > 0:
                logger.info(
                    f"Incident {incident_id} has a refresh in progress, skipping"
                )
                continue
            cursor2 = await db.execute(
                "SELECT id, claim, status, sources_count, evidence, "
                "checked_at, status_history "
                "FROM fact_checks WHERE incident_id = ? "
                "ORDER BY checked_at DESC",
                (incident_id,),
            )
            all_facts = [dict(r) for r in await cursor2.fetchall()]
            if len(all_facts) <= max_per_incident:
                continue
            # Ask Haiku in batches
            all_clusters = []
            batch_size = 80
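            # Clusters are only detected within a batch. Since facts are
            # ordered by checked_at DESC, near-simultaneous duplicates tend
            # to land in the same batch; cross-batch duplicates survive
            # until a later 6-hour run.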
            for i in range(0, len(all_facts), batch_size):
                batch = all_facts[i:i + batch_size]
                clusters = await _ask_haiku_for_clusters(batch)
                all_clusters.extend(clusters)
            # Per cluster: keep the best entry, delete the rest
            ids_to_delete = []
            facts_by_id = {f["id"]: f for f in all_facts}
            for cluster_ids in all_clusters:
                valid_ids = [cid for cid in cluster_ids if cid in facts_by_id]
                if len(valid_ids) <= 1:
                    continue
                cluster_facts = [facts_by_id[cid] for cid in valid_ids]
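                # Tuple key compares lexicographically: status priority
                # first, then source count, then the most recent check.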
                best = max(cluster_facts, key=lambda f: (
                    STATUS_PRIORITY.get(f["status"], 0),
                    f.get("sources_count", 0),
                    f.get("checked_at", ""),
                ))
                for fact in cluster_facts:
                    if fact["id"] != best["id"]:
                        ids_to_delete.append(fact["id"])
            if ids_to_delete:
                unique_ids = list(set(ids_to_delete))
                placeholders = ",".join("?" * len(unique_ids))
                await db.execute(
                    f"DELETE FROM fact_checks WHERE id IN ({placeholders})",
                    unique_ids,
                )
                total_removed += len(unique_ids)
                logger.info(
                    f"Incident {incident_id}: removed {len(unique_ids)} duplicates, "
                    f"{len(all_facts) - len(unique_ids)} remain"
                )
        await db.commit()
        if total_removed > 0:
            logger.info(
                f"Fact-check consolidation: removed {total_removed} duplicates"
            )
        return total_removed
    except Exception as e:
        logger.error(
            f"Fact-check consolidation error: {e}", exc_info=True
        )
        return 0
    finally:
        await db.close()


async def auto_resolve_stale_facts(incident_id: int, confirmed_claims: list[dict], db):
    """Automatically resolves stale developing/unconfirmed facts
    when a confirmed match is found.

    Called by the orchestrator after every fact-check run.
    """
    if not confirmed_claims:
        return 0
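    # Imported locally rather than at module level, presumably to avoid a
    # circular import between this module and agents.factchecker.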
    from agents.factchecker import find_matching_claim
    now = datetime.now(TIMEZONE).strftime('%Y-%m-%d %H:%M:%S')
    cursor = await db.execute(
        "SELECT id, claim, status, status_history FROM fact_checks "
        "WHERE incident_id = ? "
        "AND status IN ('developing', 'unconfirmed', 'unverified')",
        (incident_id,),
    )
    stale_facts = [dict(row) for row in await cursor.fetchall()]
    if not stale_facts:
        return 0
    resolved_count = 0
    resolved_ids = set()
    for confirmed_fc in confirmed_claims:
        confirmed_claim_text = confirmed_fc.get("claim", "")
        for stale in stale_facts:
            if stale["id"] in resolved_ids:
                continue
            # Lower threshold (0.65) for a more aggressive auto-resolve
            if find_matching_claim(
                confirmed_claim_text, [stale], threshold=0.65
            ):
                try:
                    history = json.loads(
                        stale.get("status_history") or "[]"
                    )
                except (ValueError, TypeError):
                    history = []
                new_status = (
                    "confirmed"
                    if confirmed_fc.get("status") == "confirmed"
                    else "established"
                )
                history.append({
                    "status": new_status,
                    "at": now,
                    "reason": "auto-resolved",
                })
                await db.execute(
                    "UPDATE fact_checks SET status = ?, "
                    "evidence = COALESCE(evidence, '') "
                    "|| ' [Auto-resolved: matching fact confirmed]', "
                    "status_history = ?, checked_at = ? WHERE id = ?",
                    (new_status, json.dumps(history), now, stale["id"]),
                )
                resolved_ids.add(stale["id"])
                resolved_count += 1
                logger.info(
                    f"Auto-resolved fact #{stale['id']}: "
                    f"'{stale['claim'][:60]}...' -> {new_status}"
                )
    if resolved_count > 0:
        logger.info(
            f"Auto-resolve: resolved {resolved_count} stale facts "
            f"for incident {incident_id}"
        )
    return resolved_count
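
For context, a minimal sketch of how the 6-hour job might be wired, assuming a plain asyncio loop (the project's actual scheduler module is not shown on this page):

import asyncio

from services.fact_consolidation import consolidate_fact_checks

async def consolidation_loop():
    # Hypothetical wiring: one consolidation pass every 6 hours.
    while True:
        await consolidate_fact_checks(max_per_incident=25)
        await asyncio.sleep(6 * 60 * 60)

# The entry point would run: asyncio.run(consolidation_loop())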