"""Periodische Faktencheck-Konsolidierung via Haiku. Erkennt und merged semantische Duplikate unter Faktenchecks. Laeuft als Scheduler-Job alle 6 Stunden. """ import json import logging import re from datetime import datetime from config import CLAUDE_MODEL_FAST, TIMEZONE from database import get_db from agents.claude_client import call_claude logger = logging.getLogger("osint.fact_consolidation") STATUS_PRIORITY = { "confirmed": 5, "established": 5, "contradicted": 4, "disputed": 4, "unconfirmed": 3, "unverified": 3, "developing": 1, } CONSOLIDATION_PROMPT = ( "Du bist ein Deduplizierungs-Agent. Du bekommst eine Liste von Faktenchecks (ID + Claim + Status).\n" "Finde Gruppen von Fakten, die inhaltlich DASSELBE aussagen (auch bei unterschiedlicher Formulierung).\n\n" "REGELN:\n" '- Gleicher Sachverhalt = gleiche Gruppe (z.B. "Khamenei wurde getoetet" und "Chamenei bei Angriff ums Leben gekommen")\n' "- Unterschiedliche Detailtiefe zum SELBEN Fakt = gleiche Gruppe\n" '- VERSCHIEDENE Sachverhalte = verschiedene Gruppen (z.B. "Angriff auf Isfahan" vs "Angriff auf Teheran")\n' "- Eine Gruppe muss mindestens 2 Eintraege haben\n\n" "Antworte NUR als JSON-Array von Gruppen. Jede Gruppe ist ein Array von IDs:\n" "[[1,5,12], [3,8], [20,25,30]]\n\n" "Wenn keine Duplikate: antworte mit []\n\n" "FAKTEN:\n{facts_text}" ) async def _ask_haiku_for_clusters(facts: list[dict]) -> list[list[int]]: """Fragt Haiku welche Fakten semantische Duplikate sind.""" facts_text = "\n".join( f'ID={f["id"]} [{f["status"]}]: {f["claim"]}' for f in facts ) prompt = CONSOLIDATION_PROMPT.format(facts_text=facts_text) try: result, usage = await call_claude(prompt, tools=None, model=CLAUDE_MODEL_FAST) data = json.loads(result) if isinstance(data, list) and all(isinstance(g, list) for g in data): return data except json.JSONDecodeError: match = re.search(r'\[.*\]', result, re.DOTALL) if match: try: data = json.loads(match.group()) if isinstance(data, list): return data except json.JSONDecodeError: pass except Exception as e: logger.error(f"Haiku-Cluster-Anfrage fehlgeschlagen: {e}") return [] async def consolidate_fact_checks(max_per_incident: int = 25): """Konsolidiert doppelte Faktenchecks via Haiku-Clustering.""" db = await get_db() try: cursor = await db.execute( "SELECT incident_id, COUNT(*) as cnt FROM fact_checks " "GROUP BY incident_id HAVING cnt > ?", (max_per_incident,), ) bloated = [dict(row) for row in await cursor.fetchall()] if not bloated: logger.info("Faktencheck-Konsolidierung: keine aufgeblaehten Incidents gefunden") return 0 total_removed = 0 for row in bloated: incident_id = row["incident_id"] # Pruefe ob gerade ein Refresh laeuft cursor_rl = await db.execute( "SELECT COUNT(*) as cnt FROM refresh_log " "WHERE incident_id = ? AND status = 'running'", (incident_id,), ) rl_row = await cursor_rl.fetchone() if rl_row and rl_row["cnt"] > 0: logger.info( f"Incident {incident_id} hat laufenden Refresh, ueberspringe" ) continue cursor2 = await db.execute( "SELECT id, claim, status, sources_count, evidence, " "checked_at, status_history " "FROM fact_checks WHERE incident_id = ? " "ORDER BY checked_at DESC", (incident_id,), ) all_facts = [dict(r) for r in await cursor2.fetchall()] if len(all_facts) <= max_per_incident: continue # Haiku in Batches fragen all_clusters = [] batch_size = 80 for i in range(0, len(all_facts), batch_size): batch = all_facts[i:i + batch_size] clusters = await _ask_haiku_for_clusters(batch) all_clusters.extend(clusters) # Pro Cluster: besten behalten, Rest loeschen ids_to_delete = [] facts_by_id = {f["id"]: f for f in all_facts} for cluster_ids in all_clusters: valid_ids = [cid for cid in cluster_ids if cid in facts_by_id] if len(valid_ids) <= 1: continue cluster_facts = [facts_by_id[cid] for cid in valid_ids] best = max(cluster_facts, key=lambda f: ( STATUS_PRIORITY.get(f["status"], 0), f.get("sources_count", 0), f.get("checked_at", ""), )) for fact in cluster_facts: if fact["id"] != best["id"]: ids_to_delete.append(fact["id"]) if ids_to_delete: unique_ids = list(set(ids_to_delete)) placeholders = ",".join("?" * len(unique_ids)) await db.execute( f"DELETE FROM fact_checks WHERE id IN ({placeholders})", unique_ids, ) total_removed += len(unique_ids) logger.info( f"Incident {incident_id}: {len(unique_ids)} Duplikate entfernt, " f"{len(all_facts) - len(unique_ids)} verbleiben" ) await db.commit() if total_removed > 0: logger.info( f"Faktencheck-Konsolidierung: {total_removed} Duplikate entfernt" ) return total_removed except Exception as e: logger.error( f"Faktencheck-Konsolidierung Fehler: {e}", exc_info=True ) return 0 finally: await db.close() async def auto_resolve_stale_facts(incident_id: int, confirmed_claims: list[dict], db): """Loest veraltete developing/unconfirmed Fakten automatisch auf, wenn ein bestaetigter Match gefunden wird. Wird vom Orchestrator nach jedem Faktencheck aufgerufen. """ if not confirmed_claims: return 0 from agents.factchecker import find_matching_claim now = datetime.now(TIMEZONE).strftime('%Y-%m-%d %H:%M:%S') cursor = await db.execute( "SELECT id, claim, status, status_history FROM fact_checks " "WHERE incident_id = ? " "AND status IN ('developing', 'unconfirmed', 'unverified')", (incident_id,), ) stale_facts = [dict(row) for row in await cursor.fetchall()] if not stale_facts: return 0 resolved_count = 0 resolved_ids = set() for confirmed_fc in confirmed_claims: confirmed_claim_text = confirmed_fc.get("claim", "") for stale in stale_facts: if stale["id"] in resolved_ids: continue # Niedrigerer Threshold (0.65) fuer aggressiveres Auto-Resolve if find_matching_claim( confirmed_claim_text, [stale], threshold=0.65 ): try: history = json.loads( stale.get("status_history") or "[]" ) except (ValueError, TypeError): history = [] new_status = ( "confirmed" if confirmed_fc.get("status") == "confirmed" else "established" ) history.append({ "status": new_status, "at": now, "reason": "auto-resolved", }) await db.execute( "UPDATE fact_checks SET status = ?, " "evidence = COALESCE(evidence, '') " "|| ' [Auto-aufgeloest: uebereinstimmender Fakt bestaetigt]', " "status_history = ?, checked_at = ? WHERE id = ?", (new_status, json.dumps(history), now, stale["id"]), ) resolved_ids.add(stale["id"]) resolved_count += 1 logger.info( f"Auto-resolved Fakt #{stale['id']}: " f"'{stale['claim'][:60]}...' -> {new_status}" ) if resolved_count > 0: logger.info( f"Auto-Resolve: {resolved_count} veraltete Fakten " f"fuer Incident {incident_id} aufgeloest" ) return resolved_count