Fact consolidation: merge evidence instead of just deleting it

When merging duplicates, the URLs and sources from all duplicates are now
carried over into the best fact before the duplicates are removed, so no
supporting evidence is lost.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
claude-dev
2026-03-08 22:03:25 +01:00
Parent e2ea4eaaa0
Commit 204422ced9

@@ -64,6 +64,44 @@ async def _ask_haiku_for_clusters(facts: list[dict]) -> list[list[int]]:
    return []

def _extract_urls(text: str) -> set[str]:
    """Extracts URLs from evidence text."""
    if not text:
        return set()
    return set(re.findall(r'https?://[^\s\)>\]]+', text))

def _merge_evidence(cluster_facts: list[dict], best_id: int) -> str:
    """Merges evidence from multiple duplicate facts.

    Keeps the best fact's evidence as the base and appends unique
    URLs and sources from the other facts.
    """
    best_fact = next(f for f in cluster_facts if f["id"] == best_id)
    base_evidence = best_fact.get("evidence") or ""
    seen_urls = _extract_urls(base_evidence)

    additional_sources = []
    for fact in cluster_facts:
        if fact["id"] == best_id:
            continue
        evidence = fact.get("evidence") or ""
        if not evidence:
            continue
        new_urls = _extract_urls(evidence) - seen_urls
        if new_urls:
            seen_urls.update(new_urls)
            additional_sources.append(evidence.strip())

    if not additional_sources:
        return base_evidence

    merged = base_evidence.rstrip()
    merged += " | Weitere Quellen: " + " ; ".join(additional_sources)
    return merged

async def consolidate_fact_checks(max_per_incident: int = 25):
    """Consolidates duplicate fact checks via Haiku clustering."""
    db = await get_db()
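For illustration, a minimal usage sketch of the two helpers above; the fact dicts below are made up and only show the `id`, `evidence`, and `sources_count` fields the diff relies on:

# Two hypothetical duplicate facts: id=1 is kept as the "best" fact,
# id=2 contributes one URL that the best fact does not cite yet.
cluster_facts = [
    {"id": 1, "evidence": "Confirmed by report (https://example.org/a)", "sources_count": 1},
    {"id": 2, "evidence": "Same claim, see https://example.org/b", "sources_count": 1},
]

merged = _merge_evidence(cluster_facts, best_id=1)
# merged == "Confirmed by report (https://example.org/a)"
#           " | Weitere Quellen: Same claim, see https://example.org/b"
#
# A duplicate whose evidence only repeats URLs already present in the best
# fact is skipped, so exact re-citations do not bloat the merged text.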
@@ -117,7 +155,7 @@ async def consolidate_fact_checks(max_per_incident: int = 25):
            clusters = await _ask_haiku_for_clusters(batch)
            all_clusters.extend(clusters)

        # Per cluster: merge evidence, keep the best fact, delete the rest
        ids_to_delete = []
        facts_by_id = {f["id"]: f for f in all_facts}
@@ -133,6 +171,18 @@ async def consolidate_fact_checks(max_per_incident: int = 25):
                f.get("checked_at", ""),
            ))

            # Merge evidence from all duplicates into the best fact
            merged_evidence = _merge_evidence(cluster_facts, best["id"])
            merged_sources = sum(
                f.get("sources_count", 0) for f in cluster_facts
            )
            if merged_evidence or merged_sources > best.get("sources_count", 0):
                await db.execute(
                    "UPDATE fact_checks SET evidence = ?, sources_count = ? "
                    "WHERE id = ?",
                    (merged_evidence, merged_sources, best["id"]),
                )

            for fact in cluster_facts:
                if fact["id"] != best["id"]:
                    ids_to_delete.append(fact["id"])
@@ -147,6 +197,7 @@ async def consolidate_fact_checks(max_per_incident: int = 25):
        total_removed += len(unique_ids)
        logger.info(
            f"Incident {incident_id}: {len(unique_ids)} Duplikate entfernt, "
            f"Quellen zusammengefuehrt, "
            f"{len(all_facts) - len(unique_ids)} verbleiben"
        )