diff --git a/src/services/fact_consolidation.py b/src/services/fact_consolidation.py
index 867bcbd..297ec7d 100644
--- a/src/services/fact_consolidation.py
+++ b/src/services/fact_consolidation.py
@@ -64,6 +64,44 @@ async def _ask_haiku_for_clusters(facts: list[dict]) -> list[list[int]]:
         return []
 
 
+
+def _extract_urls(text: str) -> set[str]:
+    """Extrahiert URLs aus Evidenz-Text."""
+    if not text:
+        return set()
+    return set(re.findall(r'https?://[^\s\)>\]]+', text))
+
+
+def _merge_evidence(cluster_facts: list[dict], best_id: int) -> str:
+    """Fuehrt Evidenz aus mehreren Duplikat-Fakten zusammen.
+
+    Behaelt die Evidenz des besten Fakts als Basis und ergaenzt
+    einzigartige URLs und Quellen aus den anderen Fakten.
+    """
+    best_fact = next(f for f in cluster_facts if f["id"] == best_id)
+    base_evidence = best_fact.get("evidence") or ""
+    seen_urls = _extract_urls(base_evidence)
+
+    additional_sources = []
+    for fact in cluster_facts:
+        if fact["id"] == best_id:
+            continue
+        evidence = fact.get("evidence") or ""
+        if not evidence:
+            continue
+        new_urls = _extract_urls(evidence) - seen_urls
+        if new_urls:
+            seen_urls.update(new_urls)
+            additional_sources.append(evidence.strip())
+
+    if not additional_sources:
+        return base_evidence
+
+    merged = base_evidence.rstrip()
+    merged += " | Weitere Quellen: " + " ; ".join(additional_sources)
+    return merged
+
+
 async def consolidate_fact_checks(max_per_incident: int = 25):
     """Konsolidiert doppelte Faktenchecks via Haiku-Clustering."""
     db = await get_db()
@@ -117,7 +155,7 @@ async def consolidate_fact_checks(max_per_incident: int = 25):
             clusters = await _ask_haiku_for_clusters(batch)
             all_clusters.extend(clusters)
 
-        # Pro Cluster: besten behalten, Rest loeschen
+        # Pro Cluster: Evidenz zusammenfuehren, besten behalten, Rest loeschen
         ids_to_delete = []
         facts_by_id = {f["id"]: f for f in all_facts}
 
@@ -133,6 +171,18 @@ async def consolidate_fact_checks(max_per_incident: int = 25):
                 f.get("checked_at", ""),
             ))
 
+            # Evidenz aus allen Duplikaten zusammenfuehren
+            merged_evidence = _merge_evidence(cluster_facts, best["id"])
+            merged_sources = sum(
+                (f.get("sources_count") or 0) for f in cluster_facts
+            )
+            if merged_evidence or merged_sources > (best.get("sources_count") or 0):
+                await db.execute(
+                    "UPDATE fact_checks SET evidence = ?, sources_count = ? "
+                    "WHERE id = ?",
+                    (merged_evidence, merged_sources, best["id"]),
+                )
+
             for fact in cluster_facts:
                 if fact["id"] != best["id"]:
                     ids_to_delete.append(fact["id"])
@@ -147,6 +197,7 @@ async def consolidate_fact_checks(max_per_incident: int = 25):
 
         total_removed += len(unique_ids)
         logger.info(
             f"Incident {incident_id}: {len(unique_ids)} Duplikate entfernt, "
+            f"Quellen zusammengefuehrt, "
             f"{len(all_facts) - len(unique_ids)} verbleiben"
         )