From 204422ced9ae7917fe02a51b3fb1db18ec8a8fca Mon Sep 17 00:00:00 2001
From: claude-dev
Date: Sun, 8 Mar 2026 22:03:25 +0100
Subject: [PATCH] Fact consolidation: merge evidence instead of only deleting

When merging duplicates, URLs and sources from all duplicates are now
carried over into the best fact before the duplicates are removed, so
no supporting evidence is lost.

Co-Authored-By: Claude Opus 4.6
---
 src/services/fact_consolidation.py | 53 +++++++++++++++++++++++++++++-
 1 file changed, 52 insertions(+), 1 deletion(-)

diff --git a/src/services/fact_consolidation.py b/src/services/fact_consolidation.py
index 867bcbd..297ec7d 100644
--- a/src/services/fact_consolidation.py
+++ b/src/services/fact_consolidation.py
@@ -64,6 +64,44 @@ async def _ask_haiku_for_clusters(facts: list[dict]) -> list[list[int]]:
         return []
 
+
+def _extract_urls(text: str) -> set[str]:
+    """Extracts URLs from evidence text."""
+    if not text:
+        return set()
+    return set(re.findall(r'https?://[^\s\)>\]]+', text))
+
+
+def _merge_evidence(cluster_facts: list[dict], best_id: int) -> str:
+    """Merges evidence from several duplicate facts.
+
+    Keeps the best fact's evidence as the base and appends unique
+    URLs and sources from the other facts.
+    """
+    best_fact = next(f for f in cluster_facts if f["id"] == best_id)
+    base_evidence = best_fact.get("evidence") or ""
+    seen_urls = _extract_urls(base_evidence)
+
+    additional_sources = []
+    for fact in cluster_facts:
+        if fact["id"] == best_id:
+            continue
+        evidence = fact.get("evidence") or ""
+        if not evidence:
+            continue
+        new_urls = _extract_urls(evidence) - seen_urls
+        if new_urls:
+            seen_urls.update(new_urls)
+            additional_sources.append(evidence.strip())
+
+    if not additional_sources:
+        return base_evidence
+
+    merged = base_evidence.rstrip()
+    merged += " | Additional sources: " + " ; ".join(additional_sources)
+    return merged
+
+
 async def consolidate_fact_checks(max_per_incident: int = 25):
     """Consolidates duplicate fact checks via Haiku clustering."""
     db = await get_db()
 
@@ -117,7 +155,7 @@ async def consolidate_fact_checks(max_per_incident: int = 25):
         clusters = await _ask_haiku_for_clusters(batch)
         all_clusters.extend(clusters)
 
-    # Per cluster: keep the best, delete the rest
+    # Per cluster: merge evidence, keep the best, delete the rest
     ids_to_delete = []
     facts_by_id = {f["id"]: f for f in all_facts}
 
@@ -133,6 +171,18 @@ async def consolidate_fact_checks(max_per_incident: int = 25):
             f.get("checked_at", ""),
         ))
 
+        # Merge evidence from all duplicates into the best fact
+        merged_evidence = _merge_evidence(cluster_facts, best["id"])
+        merged_sources = sum(
+            f.get("sources_count", 0) for f in cluster_facts
+        )
+        if merged_evidence or merged_sources > best.get("sources_count", 0):
+            await db.execute(
+                "UPDATE fact_checks SET evidence = ?, sources_count = ? "
+                "WHERE id = ?",
+                (merged_evidence, merged_sources, best["id"]),
+            )
+
         for fact in cluster_facts:
             if fact["id"] != best["id"]:
                 ids_to_delete.append(fact["id"])
@@ -147,6 +197,7 @@ async def consolidate_fact_checks(max_per_incident: int = 25):
         total_removed += len(unique_ids)
         logger.info(
             f"Incident {incident_id}: {len(unique_ids)} duplicates removed, "
+            f"sources merged, "
             f"{len(all_facts) - len(unique_ids)} remain"
         )
 
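
Reviewer note, not part of the patch: below is a minimal self-contained sketch of what the new merge does, with made-up sample rows; the id/evidence/sources_count dict shape is the one the diff assumes, and the helper bodies are copied from the hunks above. One thing to double-check before merging: _extract_urls calls re.findall, so fact_consolidation.py needs an "import re" somewhere, and the diff does not show the import block.

import re

def _extract_urls(text: str) -> set[str]:
    # Copied from the patch: pull http(s) URLs out of free-form evidence text.
    if not text:
        return set()
    return set(re.findall(r'https?://[^\s\)>\]]+', text))

def _merge_evidence(cluster_facts: list[dict], best_id: int) -> str:
    # Copied from the patch: keep the best fact's evidence as the base and
    # append only duplicate evidence that contributes at least one new URL.
    best_fact = next(f for f in cluster_facts if f["id"] == best_id)
    base_evidence = best_fact.get("evidence") or ""
    seen_urls = _extract_urls(base_evidence)

    additional_sources = []
    for fact in cluster_facts:
        if fact["id"] == best_id:
            continue
        evidence = fact.get("evidence") or ""
        if not evidence:
            continue
        new_urls = _extract_urls(evidence) - seen_urls
        if new_urls:
            seen_urls.update(new_urls)
            additional_sources.append(evidence.strip())

    if not additional_sources:
        return base_evidence

    merged = base_evidence.rstrip()
    merged += " | Additional sources: " + " ; ".join(additional_sources)
    return merged

# Made-up cluster of three duplicates; fact 1 is the one the caller keeps.
cluster = [
    {"id": 1, "evidence": "Confirmed by https://example.org/a", "sources_count": 1},
    {"id": 2, "evidence": "Also at https://example.org/b", "sources_count": 1},
    {"id": 3, "evidence": "See https://example.org/a", "sources_count": 1},
]

print(_merge_evidence(cluster, best_id=1))
# -> Confirmed by https://example.org/a | Additional sources: Also at https://example.org/b
# Fact 3 is skipped: its only URL already backs the best fact, so nothing
# new would be added and its row can be deleted without losing evidence.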