Zweistufiger Abgleich von Monitor-Artikeln gegen den EUvsDisinfo-Falschbehauptungsbestand, vollstaendig im Monitor (kein Vigil-Call): - services/embeddings.py: SentenceTransformer-Singleton (paraphrase-multilingual-MiniLM-L12-v2), Modell-Cache mit Vigil geteilt. - fimi_claims-Tabelle + scripts/import_fimi_claims.py: Einmal-/Sync-Import der 19.629 EUvsDisinfo-Claims inkl. Embedding-BLOB und Case-URL. - services/fimi_matcher.py: Stufe 1 Embedding-Vorfilter (numpy-Matrix im RAM, Kosinus), Stufe 2 Haiku-Verifikation (verbreitet vs. berichtet/widerlegt), speichert nur bestaetigte Verbreitungen + woertliches Zitat. - article_fimi_matches-Tabelle + fimi_checked_at-Marker auf articles. - requirements.txt: torch, sentence-transformers, transformers, numpy. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
117 Zeilen
4.4 KiB
Python
Ausführbare Datei
117 Zeilen
4.4 KiB
Python
Ausführbare Datei
#!/usr/bin/env python3
|
|
"""Einmal-/Sync-Import des EUvsDisinfo-Falschbehauptungsbestands in den Monitor.
|
|
|
|
Kopiert die Claims (Text, Verdict, Widerlegung, Quell-Referenz, Embedding-BLOB)
|
|
aus der Vigil-Datenbank in die Monitor-Tabelle fimi_claims. Die Embeddings
|
|
werden als BLOB 1:1 uebernommen (384-dim float32, L2-normalisiert) und im
|
|
Monitor mit demselben Modell (paraphrase-multilingual-MiniLM-L12-v2) gematcht.
|
|
|
|
Idempotent: UPSERT auf der stabilen Vigil-claim.id. Bestehende Treffer in
|
|
article_fimi_matches bleiben dadurch gueltig.
|
|
|
|
Aufruf (Staging):
|
|
python scripts/import_fimi_claims.py \
|
|
--vigil-db /home/claude-dev/vigil-data/vigil.db \
|
|
--osint-db /home/claude-dev/AegisSight-Monitor-staging/data/osint.db
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import sqlite3
|
|
import sys
|
|
|
|
# Base URL of EUvsDisinfo report pages; a claim slug is appended to it to form the case URL.
EUVSDISINFO_REPORT_BASE = "https://euvsdisinfo.eu/report/"
|
|
|
|
|
|
def case_url_from_source_ref(source_ref: str | None) -> str | None:
    """Derive the EUvsDisinfo case URL from a reference of the form 'euvsdisinfo:<slug>'.

    Args:
        source_ref: Source reference string, or None.

    Returns:
        The report URL (base URL + slug + trailing slash) when the reference
        starts with 'euvsdisinfo:' and a non-empty slug remains after trimming
        surrounding whitespace and then surrounding slashes; otherwise None.
    """
    if not source_ref:
        return None

    # "euvsdisinfo" contains no colon, so splitting at the first colon is
    # equivalent to testing for the literal "euvsdisinfo:" prefix.
    scheme, colon, remainder = source_ref.partition(":")
    if not colon or scheme != "euvsdisinfo":
        return None

    slug = remainder.strip().strip("/")
    if not slug:
        return None
    return f"{EUVSDISINFO_REPORT_BASE}{slug}/"
|
|
|
|
|
|
def _read_vigil_claims(vigil_db: str, limit: int = 0) -> list[sqlite3.Row]:
    """Read every claim that has an embedding from the Vigil database.

    Args:
        vigil_db: Path to the Vigil SQLite database (source).
        limit: If non-zero, passed to SQLite as a bound LIMIT value.

    Returns:
        The selected rows as ``sqlite3.Row`` objects (addressable by column name).
    """
    query = (
        "SELECT id, text, text_normalized, language, verdict, verdict_summary, "
        "source_id, embedding, first_seen_at FROM claims WHERE embedding IS NOT NULL"
    )
    params: tuple = ()
    if limit:
        # Bind the limit instead of formatting it into the SQL text.
        query += " LIMIT ?"
        params = (int(limit),)

    src = sqlite3.connect(vigil_db)
    try:
        src.row_factory = sqlite3.Row
        return src.execute(query, params).fetchall()
    finally:
        # Close the source connection even if the query fails.
        src.close()


def _ensure_fimi_claims_table(dst: sqlite3.Connection) -> None:
    """Create the fimi_claims table and its source_ref index if they are missing."""
    dst.execute(
        """CREATE TABLE IF NOT EXISTS fimi_claims (
            id INTEGER PRIMARY KEY,
            text TEXT NOT NULL,
            text_normalized TEXT,
            language TEXT,
            verdict TEXT NOT NULL DEFAULT 'false',
            verdict_summary TEXT,
            source_ref TEXT,
            case_url TEXT,
            embedding BLOB,
            first_seen_at TIMESTAMP,
            imported_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )"""
    )
    dst.execute("CREATE INDEX IF NOT EXISTS idx_fimi_claims_source_ref ON fimi_claims(source_ref)")


def _upsert_claims(dst: sqlite3.Connection, rows: list[sqlite3.Row]) -> tuple[int, int]:
    """Upsert the given claim rows into fimi_claims, keyed on the claim id.

    Returns:
        A pair ``(upserted, with_url)``: the number of rows written and how many
        of them got a derived case URL.
    """
    with_url = 0
    params = []
    for r in rows:
        case_url = case_url_from_source_ref(r["source_id"])
        if case_url:
            with_url += 1
        params.append(
            (
                r["id"], r["text"], r["text_normalized"], r["language"],
                # A missing verdict is stored as 'false'.
                r["verdict"] or "false", r["verdict_summary"], r["source_id"],
                case_url, r["embedding"], r["first_seen_at"],
            )
        )

    # One executemany call replaces a Python-level execute per row.
    dst.executemany(
        """INSERT INTO fimi_claims
               (id, text, text_normalized, language, verdict, verdict_summary,
                source_ref, case_url, embedding, first_seen_at, imported_at)
           VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
           ON CONFLICT(id) DO UPDATE SET
               text=excluded.text,
               text_normalized=excluded.text_normalized,
               language=excluded.language,
               verdict=excluded.verdict,
               verdict_summary=excluded.verdict_summary,
               source_ref=excluded.source_ref,
               case_url=excluded.case_url,
               embedding=excluded.embedding,
               first_seen_at=excluded.first_seen_at,
               imported_at=CURRENT_TIMESTAMP""",
        params,
    )
    return len(params), with_url


def main() -> int:
    """Copy claims with embeddings from the Vigil DB into the Monitor's fimi_claims table.

    Command-line arguments: ``--vigil-db`` (source path, required), ``--osint-db``
    (target path, required) and ``--limit`` (optional row limit for test runs).

    Both connections are closed even when an error occurs, and the target
    writes run in one transaction that is rolled back on failure.

    Returns:
        0 on success (used as the process exit code).
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--vigil-db", required=True, help="Pfad zur Vigil-SQLite-DB (Quelle)")
    ap.add_argument("--osint-db", required=True, help="Pfad zur Monitor-SQLite-DB (Ziel)")
    ap.add_argument("--limit", type=int, default=0, help="Optional: nur N Claims importieren (Test)")
    args = ap.parse_args()

    rows = _read_vigil_claims(args.vigil_db, args.limit)
    print(f"Vigil: {len(rows)} Claims mit Embedding gelesen", flush=True)

    dst = sqlite3.connect(args.osint_db)
    try:
        dst.execute("PRAGMA busy_timeout=10000")
        # The connection context manager commits on success and rolls back on error.
        with dst:
            # Make sure the target table exists in case this script runs before init_db.
            _ensure_fimi_claims_table(dst)
            inserted, with_url = _upsert_claims(dst, rows)
        total = dst.execute("SELECT COUNT(*) FROM fimi_claims").fetchone()[0]
    finally:
        dst.close()

    print(f"Monitor: {inserted} Claims upserted ({with_url} mit Case-URL), "
          f"fimi_claims enthaelt jetzt {total} Eintraege", flush=True)
    return 0
|
|
|
|
|
|
# Script entry point: run the import and use main()'s return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|