Dateien
AegisSight-Monitor/scripts/backfill_fimi.py
2026-06-14 10:06:54 +00:00

95 Zeilen
3.2 KiB
Python
Ausführbare Datei

#!/usr/bin/env python3
"""Backfill: alle noch ungeprueften Artikel gegen den Falschbehauptungsbestand
abgleichen (Embedding-Vorfilter + LLM-Verifikation).
Geht alle Lagen mit ungeprueften Artikeln durch, kleine zuerst (schnelle,
frueh testbare Ergebnisse), grosse zuletzt. Pro Lage in Batches, damit die
Score-Matrix (Artikel x Claims) den RAM nicht sprengt. Robust: Fehler
einzelner Batches stoppen den Lauf nicht; bei Artikeln, die wiederholt
scheitern (kein Fortschritt), wird die Lage abgebrochen statt endlos zu
schleifen.
Aufruf (im Staging-Verzeichnis, mit dessen venv):
HF_HUB_OFFLINE=1 TRANSFORMERS_OFFLINE=1 FIMI_VERIFY_CONCURRENCY=5 \
./venv/bin/python scripts/backfill_fimi.py
"""
from __future__ import annotations
import asyncio
import sys
import time
sys.path.insert(0, "src")
import aiosqlite
from services import fimi_matcher
# SQLite database file; the path is relative to the current working directory.
DB_PATH: str = "data/osint.db"

# Maximum number of articles requested per matcher call, so that the
# articles x claims score matrix stays small enough to fit in RAM.
BATCH: int = 120
def _ts() -> str:
    """Return the current local time formatted as ``HH:MM:SS``.

    Used as the timestamp prefix of the progress lines printed by this script.
    """
    now = time.localtime()
    return time.strftime("%H:%M:%S", now)
async def _backfill_incident(db, iid, n: int, grand: dict[str, int]) -> int:
    """Match one incident's unchecked articles batch by batch.

    Calls ``fimi_matcher.match_incident_articles`` with ``limit=BATCH`` until
    the matcher reports no articles, no unchecked article is left, or the
    number of unchecked articles stops shrinking between two batches.

    Args:
        db: Open aiosqlite connection used for matching and counting.
        iid: ``incident_id`` of the incident to process.
        n: Number of unchecked articles counted before the run started; only
            used in the progress output.
        grand: Run-wide counters, updated in place with every batch result.

    Returns:
        The sum of the per-batch ``articles`` counts reported by the matcher.
    """
    done = 0
    prev_remaining = None
    while True:
        res = await fimi_matcher.match_incident_articles(
            db, iid, only_unchecked=True, limit=BATCH
        )
        if res["articles"] == 0:
            break
        done += res["articles"]
        # Keys missing from a batch result simply contribute nothing.
        for k in grand:
            grand[k] += res.get(k, 0)

        cur = await db.execute(
            "SELECT COUNT(*) FROM articles WHERE incident_id = ? AND fimi_checked_at IS NULL",
            (iid,),
        )
        remaining = (await cur.fetchone())[0]
        print(
            f"[{_ts()}] Lage {iid}: +{res['articles']} ({done}/{n}), "
            f"Treffer {res['articles_with_match']}, Fehler {res['errors']}, "
            f"verbleibend {remaining}",
            flush=True,
        )
        if remaining == 0:
            break
        # If the backlog did not shrink since the previous batch, the same
        # articles would be picked up again and again: give up on this incident.
        if prev_remaining is not None and remaining >= prev_remaining:
            print(
                f"[{_ts()}] Lage {iid}: kein Fortschritt (verbleibend {remaining}), "
                f"Abbruch wegen wiederholt fehlschlagender Artikel",
                flush=True,
            )
            break
        prev_remaining = remaining
    return done


async def main() -> None:
    """Run the backfill over every incident that still has unchecked articles.

    Opens the database at ``DB_PATH``, lets ``fimi_matcher.ensure_matrix``
    prepare the claim matrix, then processes the incidents in ascending order
    of their unchecked-article count and prints a summary of the accumulated
    counters at the end.

    The connection is managed by ``async with`` so it is closed even when the
    matcher or a query raises; such exceptions still propagate to the caller.
    """
    grand = {"articles": 0, "candidates": 0, "articles_with_match": 0, "stored": 0, "errors": 0}
    async with aiosqlite.connect(DB_PATH) as db:
        db.row_factory = aiosqlite.Row
        # Monotonic clock: the elapsed time must not be affected by wall-clock
        # adjustments during a long run.
        t0 = time.monotonic()

        n_claims = await fimi_matcher.ensure_matrix(db)
        print(f"[{_ts()}] Matrix: {n_claims} Claims geladen", flush=True)

        # Smallest incidents first (ORDER BY n ascending).
        cursor = await db.execute(
            """SELECT incident_id, COUNT(*) AS n
            FROM articles WHERE fimi_checked_at IS NULL AND incident_id IS NOT NULL
            GROUP BY incident_id ORDER BY n"""
        )
        incidents = [(r["incident_id"], r["n"]) for r in await cursor.fetchall()]
        total = sum(n for _, n in incidents)
        print(f"[{_ts()}] START: {len(incidents)} Lagen, {total} ungepruefte Artikel", flush=True)

        for iid, n in incidents:
            done = await _backfill_incident(db, iid, n, grand)
            print(f"[{_ts()}] == Lage {iid} fertig: {done} Artikel verarbeitet ==", flush=True)

    dt = time.monotonic() - t0
    print(f"[{_ts()}] FERTIG nach {dt/60:.1f} min: {grand}", flush=True)
if __name__ == "__main__":
    # Executed as a script: run the async backfill to completion on a fresh
    # event loop.
    asyncio.run(main())