Faktencheck-Deduplizierung und Auto-Resolve implementiert
3-Ebenen-System gegen Duplikate: 1. Pre-Dedup: LLM-Antwort wird vor DB-Insert dedupliziert (deduplicate_new_facts) 2. Auto-Resolve: Bestaetigte Fakten loesen automatisch stale developing/unconfirmed Fakten auf 3. Periodische Konsolidierung: Haiku clustert alle 6h semantische Duplikate und entfernt sie Verbessertes Claim-Matching: SequenceMatcher (70%) + Jaccard-Keyword-Overlap (30%) statt reinem SequenceMatcher. Threshold von 0.7 auf 0.75 erhoeht. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Dieser Commit ist enthalten in:
@@ -168,49 +168,120 @@ Jedes Element hat:
|
||||
Antworte NUR mit dem JSON-Array."""
|
||||
|
||||
|
||||
# --- Stopwords fuer Keyword-Extraktion ---
|
||||
_STOPWORDS = frozenset({
|
||||
"der", "die", "das", "ein", "eine", "und", "oder", "von", "nach", "bei", "mit",
|
||||
"wurde", "wird", "haben", "sein", "dass", "ist", "sind", "hat", "vor", "fuer",
|
||||
"den", "dem", "des", "sich", "auf", "als", "auch", "noch", "nicht", "aber",
|
||||
"ueber", "durch", "einer", "einem", "eines", "werden", "wurde", "waren",
|
||||
"the", "and", "was", "has", "been", "have", "that", "with", "from", "for",
|
||||
"are", "were", "this", "which", "into", "their", "than", "about",
|
||||
})
|
||||
|
||||
# Ranking used to pick the "best" fact within a duplicate cluster:
# a higher value wins. Synonymous status labels share the same rank.
STATUS_PRIORITY = {
    status: rank
    for rank, group in (
        (5, ("confirmed", "established")),
        (4, ("contradicted", "disputed")),
        (3, ("unconfirmed", "unverified")),
        (1, ("developing",)),
    )
    for status in group
}
|
||||
|
||||
|
||||
def normalize_claim(claim: str) -> str:
    """Normalize a claim string for similarity comparison.

    Lowercases, transliterates German umlauts to their ASCII digraphs,
    strips punctuation, and collapses runs of whitespace.
    """
    text = claim.strip().lower()
    # Transliterate umlauts in one pass so spelling variants compare equal.
    text = text.translate(str.maketrans({"ä": "ae", "ö": "oe", "ü": "ue", "ß": "ss"}))
    # Drop everything that is neither a word character nor whitespace.
    text = re.sub(r'[^\w\s]', '', text)
    # Collapse internal whitespace to single spaces.
    return re.sub(r'\s+', ' ', text).strip()
|
||||
|
||||
|
||||
def _keyword_set(text: str) -> set[str]:
    """Extract significant words for overlap comparison.

    Keeps only normalized tokens of at least 4 characters that are not
    stopwords, so the Jaccard overlap reflects content-bearing terms.
    """
    return {
        token
        for token in normalize_claim(text).split()
        if len(token) >= 4 and token not in _STOPWORDS
    }
|
||||
def find_matching_claim(new_claim: str, existing_claims: list[dict], threshold: float = 0.75) -> dict | None:
    """Find the best matching existing claim via combined fuzzy scoring.

    Uses SequenceMatcher (70%) + Jaccard keyword overlap (30%) for more
    robust matching than sequence similarity alone.

    Args:
        new_claim: The new claim text.
        existing_claims: List of dicts with at least {"id", "claim", "status"}.
        threshold: Minimum combined similarity (0.0-1.0), default 0.75.

    Returns:
        The best-matching dict, or None if no candidate reaches the threshold.
    """
    norm_new = normalize_claim(new_claim)
    if not norm_new:
        return None

    kw_new = _keyword_set(new_claim)
    best_match: dict | None = None
    best_score = 0.0

    for existing in existing_claims:
        norm_existing = normalize_claim(existing.get("claim", ""))
        if not norm_existing:
            continue

        # Early skip when lengths differ so much that a match is implausible;
        # this avoids the quadratic SequenceMatcher work for obvious non-matches.
        len_ratio = len(norm_new) / len(norm_existing)
        if len_ratio > 2.5 or len_ratio < 0.4:
            continue

        seq_ratio = SequenceMatcher(None, norm_new, norm_existing).ratio()

        kw_existing = _keyword_set(existing.get("claim", ""))
        kw_union = kw_new | kw_existing
        jaccard = len(kw_new & kw_existing) / len(kw_union) if kw_union else 0.0

        combined = 0.7 * seq_ratio + 0.3 * jaccard

        if combined > best_score:
            best_score = combined
            best_match = existing

    # Guard on best_match as well: with threshold <= 0.0 the score test
    # alone would pass even when no candidate survived the filters.
    if best_match is not None and best_score >= threshold:
        logger.debug(
            f"Claim-Match ({best_score:.2f}): "
            f"'{new_claim[:50]}...' -> '{best_match['claim'][:50]}...'"
        )
        return best_match
    return None
|
||||
|
||||
|
||||
def deduplicate_new_facts(facts: list[dict], threshold: float = 0.70) -> list[dict]:
    """Deduplicate facts from a single LLM response before the DB insert.

    Clusters similar claims and keeps, per cluster, the fact with the
    highest status priority and the most sources.
    """
    if not facts:
        return []

    # Greedy clustering: attach each fact to the first cluster containing
    # a sufficiently similar claim, otherwise open a new cluster.
    clusters: list[list[dict]] = []
    for fact in facts:
        target = next(
            (
                cluster
                for cluster in clusters
                if find_matching_claim(fact.get("claim", ""), cluster, threshold=threshold)
            ),
            None,
        )
        if target is None:
            clusters.append([fact])
        else:
            target.append(fact)

    def _rank(f: dict) -> tuple[int, int]:
        # Prefer higher status, then more sources.
        return (
            STATUS_PRIORITY.get(f.get("status", "developing"), 0),
            f.get("sources_count", 0),
        )

    result = [max(cluster, key=_rank) for cluster in clusters]

    if len(result) < len(facts):
        logger.info(
            f"Fakten-Dedup: {len(facts)} -> {len(result)} "
            f"(-{len(facts) - len(result)} Duplikate)"
        )
    return result
|
||||
|
||||
|
||||
class FactCheckerAgent:
|
||||
"""Prüft Fakten über Claude CLI gegen unabhängige Quellen."""
|
||||
|
||||
|
||||
@@ -9,7 +9,7 @@ from typing import Optional
|
||||
from urllib.parse import urlparse, urlunparse
|
||||
|
||||
from agents.claude_client import UsageAccumulator
|
||||
from agents.factchecker import find_matching_claim
|
||||
from agents.factchecker import find_matching_claim, deduplicate_new_facts
|
||||
from source_rules import (
|
||||
_detect_category,
|
||||
_extract_domain,
|
||||
@@ -890,6 +890,9 @@ class AgentOrchestrator:
|
||||
all_articles_for_fc = [dict(row) for row in await cursor.fetchall()]
|
||||
fact_checks, fc_usage = await factchecker.check(title, all_articles_for_fc, incident_type)
|
||||
|
||||
# Pre-Dedup: Duplikate aus LLM-Antwort entfernen
|
||||
fact_checks = deduplicate_new_facts(fact_checks)
|
||||
|
||||
if fc_usage:
|
||||
usage_acc.add(fc_usage)
|
||||
|
||||
|
||||
In neuem Issue referenzieren
Einen Benutzer sperren