feat(fimi): Fundament Counter-Disinformation-Match (Embedding + LLM-Verifikation)
Zweistufiger Abgleich von Monitor-Artikeln gegen den EUvsDisinfo-Falschbehauptungsbestand, vollstaendig im Monitor (kein Vigil-Call): - services/embeddings.py: SentenceTransformer-Singleton (paraphrase-multilingual-MiniLM-L12-v2), Modell-Cache mit Vigil geteilt. - fimi_claims-Tabelle + scripts/import_fimi_claims.py: Einmal-/Sync-Import der 19.629 EUvsDisinfo-Claims inkl. Embedding-BLOB und Case-URL. - services/fimi_matcher.py: Stufe 1 Embedding-Vorfilter (numpy-Matrix im RAM, Kosinus), Stufe 2 Haiku-Verifikation (verbreitet vs. berichtet/widerlegt), speichert nur bestaetigte Verbreitungen + woertliches Zitat. - article_fimi_matches-Tabelle + fimi_checked_at-Marker auf articles. - requirements.txt: torch, sentence-transformers, transformers, numpy. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Dieser Commit ist enthalten in:
116
scripts/import_fimi_claims.py
Ausführbare Datei
116
scripts/import_fimi_claims.py
Ausführbare Datei
@@ -0,0 +1,116 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Einmal-/Sync-Import des EUvsDisinfo-Falschbehauptungsbestands in den Monitor.
|
||||
|
||||
Kopiert die Claims (Text, Verdict, Widerlegung, Quell-Referenz, Embedding-BLOB)
|
||||
aus der Vigil-Datenbank in die Monitor-Tabelle fimi_claims. Die Embeddings
|
||||
werden als BLOB 1:1 uebernommen (384-dim float32, L2-normalisiert) und im
|
||||
Monitor mit demselben Modell (paraphrase-multilingual-MiniLM-L12-v2) gematcht.
|
||||
|
||||
Idempotent: UPSERT auf der stabilen Vigil-claim.id. Bestehende Treffer in
|
||||
article_fimi_matches bleiben dadurch gueltig.
|
||||
|
||||
Aufruf (Staging):
|
||||
python scripts/import_fimi_claims.py \
|
||||
--vigil-db /home/claude-dev/vigil-data/vigil.db \
|
||||
--osint-db /home/claude-dev/AegisSight-Monitor-staging/data/osint.db
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sqlite3
|
||||
import sys
|
||||
|
||||
# Base URL of EUvsDisinfo case reports; case_url_from_source_ref() appends
# the claim slug and a trailing slash to it.
EUVSDISINFO_REPORT_BASE = "https://euvsdisinfo.eu/report/"
|
||||
|
||||
|
||||
def case_url_from_source_ref(source_ref: str | None) -> str | None:
    """Derive the EUvsDisinfo case URL from an 'euvsdisinfo:<slug>' reference.

    Args:
        source_ref: Source reference string, or None.

    Returns:
        ``EUVSDISINFO_REPORT_BASE`` followed by the slug and a trailing slash,
        or None when the reference is empty, does not start with the
        'euvsdisinfo:' prefix, or leaves no slug after trimming surrounding
        whitespace and slashes.
    """
    marker = "euvsdisinfo:"
    if not source_ref or not source_ref.startswith(marker):
        return None

    # Trim whitespace first, then slashes, so the result has exactly one
    # trailing slash once it is appended to the base URL.
    slug = source_ref[len(marker):].strip().strip("/")
    return f"{EUVSDISINFO_REPORT_BASE}{slug}/" if slug else None
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """Copy all Vigil claims that have an embedding into the Monitor table fimi_claims.

    Reads the rows from the ``claims`` table of the source database, makes
    sure the target table and its ``source_ref`` index exist, and upserts
    every row keyed on the claim ``id``, so a repeated run updates existing
    rows instead of duplicating them.

    Args:
        argv: Command-line arguments without the program name. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``.

    Returns:
        0 on success.

    Raises:
        sqlite3.Error: If reading the source or writing the target fails. Both
            connections are closed in that case and the pending upserts are
            rolled back.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--vigil-db", required=True, help="Pfad zur Vigil-SQLite-DB (Quelle)")
    ap.add_argument("--osint-db", required=True, help="Pfad zur Monitor-SQLite-DB (Ziel)")
    ap.add_argument("--limit", type=int, default=0, help="Optional: nur N Claims importieren (Test)")
    args = ap.parse_args(argv)

    query = (
        "SELECT id, text, text_normalized, language, verdict, verdict_summary, "
        "source_id, embedding, first_seen_at FROM claims WHERE embedding IS NOT NULL"
    )
    query_params = ()
    if args.limit:
        # Bind the limit instead of formatting it into the SQL text.
        query += " LIMIT ?"
        query_params = (args.limit,)

    src = sqlite3.connect(args.vigil_db)
    try:
        src.row_factory = sqlite3.Row
        rows = src.execute(query, query_params).fetchall()
    finally:
        # Release the source database even when the query fails.
        src.close()
    print(f"Vigil: {len(rows)} Claims mit Embedding gelesen", flush=True)

    # Build all parameter tuples up front so the upsert runs as one
    # executemany() call inside a single transaction.
    with_url = 0
    upsert_params = []
    for r in rows:
        case_url = case_url_from_source_ref(r["source_id"])
        if case_url:
            with_url += 1
        upsert_params.append(
            (
                r["id"], r["text"], r["text_normalized"], r["language"],
                # The target column is NOT NULL; a missing verdict becomes 'false'.
                r["verdict"] or "false", r["verdict_summary"], r["source_id"],
                case_url, r["embedding"], r["first_seen_at"],
            )
        )
    inserted = len(upsert_params)

    dst = sqlite3.connect(args.osint_db)
    try:
        dst.execute("PRAGMA busy_timeout=10000")

        # "with dst" commits on success and rolls back if anything raises.
        with dst:
            # Make sure the target table exists (in case the script runs before init_db).
            dst.execute(
                """CREATE TABLE IF NOT EXISTS fimi_claims (
                    id INTEGER PRIMARY KEY,
                    text TEXT NOT NULL,
                    text_normalized TEXT,
                    language TEXT,
                    verdict TEXT NOT NULL DEFAULT 'false',
                    verdict_summary TEXT,
                    source_ref TEXT,
                    case_url TEXT,
                    embedding BLOB,
                    first_seen_at TIMESTAMP,
                    imported_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )"""
            )
            dst.execute("CREATE INDEX IF NOT EXISTS idx_fimi_claims_source_ref ON fimi_claims(source_ref)")
            dst.executemany(
                """INSERT INTO fimi_claims
                    (id, text, text_normalized, language, verdict, verdict_summary,
                     source_ref, case_url, embedding, first_seen_at, imported_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
                ON CONFLICT(id) DO UPDATE SET
                    text=excluded.text,
                    text_normalized=excluded.text_normalized,
                    language=excluded.language,
                    verdict=excluded.verdict,
                    verdict_summary=excluded.verdict_summary,
                    source_ref=excluded.source_ref,
                    case_url=excluded.case_url,
                    embedding=excluded.embedding,
                    first_seen_at=excluded.first_seen_at,
                    imported_at=CURRENT_TIMESTAMP""",
                upsert_params,
            )

        total = dst.execute("SELECT COUNT(*) FROM fimi_claims").fetchone()[0]
    finally:
        # Release the target database on success and on failure alike.
        dst.close()

    print(f"Monitor: {inserted} Claims upserted ({with_url} mit Case-URL), "
          f"fimi_claims enthaelt jetzt {total} Eintraege", flush=True)
    return 0
|
||||
|
||||
|
||||
# Script entry point: run the import and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
In neuem Issue referenzieren
Einen Benutzer sperren