diff --git a/src/database.py b/src/database.py index 54d6b7e..b8d9366 100644 --- a/src/database.py +++ b/src/database.py @@ -176,7 +176,12 @@ CREATE TABLE IF NOT EXISTS sources ( proposed_alignments_json TEXT, proposed_confidence REAL, proposed_reasoning TEXT, - proposed_at TIMESTAMP + proposed_at TIMESTAMP, + eu_disinfo_listed INTEGER DEFAULT 0, + eu_disinfo_case_count INTEGER DEFAULT 0, + eu_disinfo_last_seen TIMESTAMP, + ifcn_signatory INTEGER DEFAULT 0, + external_data_synced_at TIMESTAMP ); CREATE TABLE IF NOT EXISTS source_alignments ( @@ -668,6 +673,20 @@ async def init_db(): if any(c not in src_columns for c in ("political_orientation", "media_type", "reliability")): logger.info("Migration: Klassifikations-Spalten zu sources hinzugefuegt") + # Migration: externe Reputations-Daten (EUvsDisinfo + IFCN) + for col, ddl in [ + ("eu_disinfo_listed", "ALTER TABLE sources ADD COLUMN eu_disinfo_listed INTEGER DEFAULT 0"), + ("eu_disinfo_case_count", "ALTER TABLE sources ADD COLUMN eu_disinfo_case_count INTEGER DEFAULT 0"), + ("eu_disinfo_last_seen", "ALTER TABLE sources ADD COLUMN eu_disinfo_last_seen TIMESTAMP"), + ("ifcn_signatory", "ALTER TABLE sources ADD COLUMN ifcn_signatory INTEGER DEFAULT 0"), + ("external_data_synced_at", "ALTER TABLE sources ADD COLUMN external_data_synced_at TIMESTAMP"), + ]: + if col not in src_columns: + await db.execute(ddl) + await db.commit() + if any(c not in src_columns for c in ("eu_disinfo_listed", "ifcn_signatory")): + logger.info("Migration: externe Reputations-Spalten zu sources hinzugefuegt") + # Migration: source_alignments-Tabelle (Mehrfach-Tags fuer geopolitische Naehe) cursor = await db.execute( "SELECT name FROM sqlite_master WHERE type='table' AND name='source_alignments'" diff --git a/src/models.py b/src/models.py index 32d3bb7..7682310 100644 --- a/src/models.py +++ b/src/models.py @@ -210,6 +210,11 @@ class SourceResponse(BaseModel): classified_at: Optional[str] = None alignments: list[str] = [] is_global: bool = 
False + ifcn_signatory: bool = False + eu_disinfo_listed: bool = False + eu_disinfo_case_count: int = 0 + eu_disinfo_last_seen: Optional[str] = None + external_data_synced_at: Optional[str] = None # Source Discovery diff --git a/src/routers/sources.py b/src/routers/sources.py index 25a898f..e0f2014 100644 --- a/src/routers/sources.py +++ b/src/routers/sources.py @@ -6,6 +6,7 @@ from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, status from models import SourceCreate, SourceUpdate, SourceResponse, DiscoverRequest, DiscoverResponse, DiscoverMultiResponse, DomainActionRequest from auth import get_current_user from database import db_dependency, get_db, refresh_source_counts +from services.external_reputation import apply_reputation_overrides, sync_all as sync_external_reputation from services.source_classifier import bulk_classify, classify_source from source_rules import discover_source, discover_all_feeds, evaluate_feeds_with_claude, _extract_domain, _detect_category, domain_to_display_name, _DOMAIN_ALIASES import aiosqlite @@ -90,6 +91,8 @@ async def list_sources( reliability: str = None, state_affiliated: bool = None, alignment: str = None, + ifcn_signatory: bool = None, + eu_disinfo_listed: bool = None, current_user: dict = Depends(get_current_user), db: aiosqlite.Connection = Depends(db_dependency), ): @@ -124,6 +127,12 @@ async def list_sources( if alignment: query += " AND EXISTS (SELECT 1 FROM source_alignments sa WHERE sa.source_id = s.id AND sa.alignment = ?)" params.append(alignment.lower()) + if ifcn_signatory is not None: + query += " AND s.ifcn_signatory = ?" + params.append(1 if ifcn_signatory else 0) + if eu_disinfo_listed is not None: + query += " AND s.eu_disinfo_listed = ?" 
@router.post("/external-reputation/sync")
async def trigger_external_reputation_sync(
    background_tasks: BackgroundTasks,
    current_user: dict = Depends(get_current_user),
):
    """Kick off the IFCN / EUvsDisinfo synchronisation as a background task.

    Only users whose role is ``org_admin`` may start the sync; everyone else
    receives HTTP 403. The endpoint returns immediately, the actual work runs
    after the response has been sent.
    """
    is_admin = current_user.get("role") == "org_admin"
    if not is_admin:
        raise HTTPException(
            status_code=403,
            detail="Nur Admins koennen den externen Sync starten",
        )

    async def _run_sync():
        # The background task outlives the request, so it opens (and always
        # closes) its own database connection.
        conn = await get_db()
        try:
            await sync_external_reputation(conn)
        finally:
            await conn.close()

    background_tasks.add_task(_run_sync)
    return {"status": "started"}
Approved + try: + for sid in approved_ids: + await apply_reputation_overrides(db, sid) + except Exception as e: + logger.warning("Bulk Reputation-Override fehlgeschlagen: %s", e) return {"approved_count": len(approved_ids), "min_confidence": min_confidence} diff --git a/src/services/external_reputation.py b/src/services/external_reputation.py new file mode 100644 index 0000000..1e900b0 --- /dev/null +++ b/src/services/external_reputation.py @@ -0,0 +1,268 @@ +"""Externe Reputations-Daten fuer Quellen. + +Synchronisiert Domain-Listen von oeffentlichen Reputations-/Faktencheck-Datenbanken +und schreibt die Treffer in die sources-Spalten: + +- IFCN-Signatories (anerkannte Faktenchecker) -> ifcn_signatory +- EUvsDisinfo (pro-Kreml-Desinformation, Zenodo-CSV) -> eu_disinfo_listed, + eu_disinfo_case_count, eu_disinfo_last_seen + +Anschliessend wendet apply_reputation_overrides() Override-Regeln auf die +reliability-Spalte an: +- ifcn_signatory=1 -> reliability='sehr_hoch' +- eu_disinfo_case_count >= 5 -> reliability='sehr_niedrig' +- eu_disinfo_case_count >= 1 -> reliability eine Stufe runter (max bis 'niedrig') +""" +import csv +import io +import logging +from collections import defaultdict +from urllib.parse import urlparse + +import aiosqlite +import httpx + +logger = logging.getLogger("osint.external_reputation") + +IFCN_LIST_URL = "https://raw.githubusercontent.com/IFCN/verified-signatories/main/list" +EU_DISINFO_CSV_URL = "https://zenodo.org/records/10514307/files/euvsdisinfo_base.csv?download=1" + +HTTP_TIMEOUT = httpx.Timeout(60.0, connect=10.0) + +# Reliability-Skala in Stufenfolge (schlecht -> gut) +RELIABILITY_ORDER = ["sehr_niedrig", "niedrig", "gemischt", "hoch", "sehr_hoch"] + + +def _normalize_domain(raw: str | None) -> str | None: + """Normalisiert eine Domain: lowercase, ohne www., ohne Schema/Pfad.""" + if not raw: + return None + raw = raw.strip().lower() + if not raw: + return None + # Falls eine vollstaendige URL uebergeben wurde + if "://" in raw: + 
try: + raw = urlparse(raw).netloc or raw + except ValueError: + pass + # Pfad/Query strippen + raw = raw.split("/")[0].split("?")[0].split("#")[0] + if raw.startswith("www."): + raw = raw[4:] + return raw or None + + +async def _fetch_text(url: str) -> str: + """Laedt Text von einer URL. Wirft HTTPException bei Fehler.""" + async with httpx.AsyncClient(timeout=HTTP_TIMEOUT, follow_redirects=True) as client: + resp = await client.get(url) + resp.raise_for_status() + return resp.text + + +async def sync_ifcn_signatories(db: aiosqlite.Connection) -> dict: + """Laedt IFCN-Domain-Liste und matcht gegen sources.domain. + + Setzt ifcn_signatory=1 wo die Domain in der Liste vorkommt, sonst 0. + """ + text = await _fetch_text(IFCN_LIST_URL) + domains: set[str] = set() + for line in text.splitlines(): + d = _normalize_domain(line) + if d: + domains.add(d) + logger.info("IFCN-Liste geladen: %d Domains", len(domains)) + + # Aktuelle Quellen mit Domain laden + cursor = await db.execute( + "SELECT id, domain FROM sources WHERE domain IS NOT NULL AND domain != ''" + ) + sources = [dict(r) for r in await cursor.fetchall()] + + matched_ids: list[int] = [] + unmatched_ids: list[int] = [] + for s in sources: + nd = _normalize_domain(s["domain"]) + if nd and nd in domains: + matched_ids.append(s["id"]) + else: + unmatched_ids.append(s["id"]) + + # Bulk-Update in zwei Statements + if matched_ids: + placeholders = ",".join("?" for _ in matched_ids) + await db.execute( + f"UPDATE sources SET ifcn_signatory = 1 WHERE id IN ({placeholders})", + matched_ids, + ) + if unmatched_ids: + placeholders = ",".join("?" 
for _ in unmatched_ids) + await db.execute( + f"UPDATE sources SET ifcn_signatory = 0 WHERE id IN ({placeholders})", + unmatched_ids, + ) + await db.commit() + logger.info("IFCN-Sync: %d Quellen als Faktenchecker markiert (von %d)", + len(matched_ids), len(sources)) + return { + "list_size": len(domains), + "sources_checked": len(sources), + "matched": len(matched_ids), + } + + +async def sync_eu_disinfo(db: aiosqlite.Connection) -> dict: + """Laedt EUvsDisinfo-CSV von Zenodo, aggregiert pro Domain, schreibt sources. + + - eu_disinfo_listed: 1 wenn Domain mindestens 1x als 'disinformation' debunkt + - eu_disinfo_case_count: Anzahl Disinformation-Faelle + - eu_disinfo_last_seen: spaetestes debunk_date + """ + text = await _fetch_text(EU_DISINFO_CSV_URL) + reader = csv.DictReader(io.StringIO(text)) + + # Per-Domain aggregieren (nur class='disinformation') + counts: dict[str, int] = defaultdict(int) + last_seen: dict[str, str] = {} + total_rows = 0 + for row in reader: + total_rows += 1 + if (row.get("class") or "").strip().lower() != "disinformation": + continue + d = _normalize_domain(row.get("article_domain")) + if not d: + continue + counts[d] += 1 + debunk_date = (row.get("debunk_date") or "").strip() + if debunk_date: + prev = last_seen.get(d) + if not prev or debunk_date > prev: + last_seen[d] = debunk_date + logger.info("EUvsDisinfo-CSV: %d Zeilen, %d Domains mit Desinformation", + total_rows, len(counts)) + + # Quellen laden + matchen + cursor = await db.execute( + "SELECT id, domain FROM sources WHERE domain IS NOT NULL AND domain != ''" + ) + sources = [dict(r) for r in await cursor.fetchall()] + + matched = 0 + for s in sources: + nd = _normalize_domain(s["domain"]) + if nd and nd in counts: + await db.execute( + """UPDATE sources SET + eu_disinfo_listed = 1, + eu_disinfo_case_count = ?, + eu_disinfo_last_seen = ? 
+ WHERE id = ?""", + (counts[nd], last_seen.get(nd), s["id"]), + ) + matched += 1 + else: + await db.execute( + """UPDATE sources SET + eu_disinfo_listed = 0, + eu_disinfo_case_count = 0, + eu_disinfo_last_seen = NULL + WHERE id = ?""", + (s["id"],), + ) + await db.commit() + logger.info("EUvsDisinfo-Sync: %d Quellen als Desinformations-Quelle markiert (von %d)", + matched, len(sources)) + return { + "rows_in_csv": total_rows, + "domains_with_disinfo_in_csv": len(counts), + "sources_checked": len(sources), + "matched": matched, + } + + +def _override_reliability(current: str | None, ifcn: bool, eu_count: int) -> str | None: + """Wendet Override-Regeln auf eine reliability-Stufe an. + + Rueckgabe: neue Stufe (oder None, wenn unveraendert). + """ + cur = current or "na" + + # IFCN gewinnt: zertifizierter Faktenchecker -> sehr_hoch (immer) + if ifcn: + return "sehr_hoch" if cur != "sehr_hoch" else None + + # EUvsDisinfo: Downgrade + if eu_count >= 5: + return "sehr_niedrig" if cur != "sehr_niedrig" else None + if eu_count >= 1: + # Eine Stufe runter, mindestens bis 'niedrig' + if cur == "na": + return "niedrig" + if cur in RELIABILITY_ORDER: + idx = RELIABILITY_ORDER.index(cur) + new_idx = max(0, idx - 1) + new = RELIABILITY_ORDER[new_idx] + # Mindeststufe 'niedrig' bei eu_count >= 1 + if RELIABILITY_ORDER.index(new) > RELIABILITY_ORDER.index("niedrig"): + new = "niedrig" + return new if new != cur else None + return None + + +async def apply_reputation_overrides(db: aiosqlite.Connection, source_id: int | None = None) -> dict: + """Wendet Reliability-Override-Regeln an. + + Wenn source_id angegeben ist, nur fuer diese Quelle. Sonst fuer alle Quellen. 
async def sync_all(db: aiosqlite.Connection) -> dict:
    """Run the complete external-reputation pipeline on one connection.

    Steps, in order: IFCN signatory sync, EUvsDisinfo sync, reliability
    overrides for all sources. Afterwards ``external_data_synced_at`` is set
    to the current timestamp on every source that has a non-empty domain.

    Returns:
        A dict with the per-step result dicts under the keys ``"ifcn"``,
        ``"eu_disinfo"`` and ``"override"``.
    """
    summary: dict = {}
    summary["ifcn"] = await sync_ifcn_signatories(db)
    summary["eu_disinfo"] = await sync_eu_disinfo(db)
    summary["override"] = await apply_reputation_overrides(db)

    # Only sources with a domain took part in the matching, so only those
    # get the "synced at" stamp.
    stamp_sql = (
        "UPDATE sources SET external_data_synced_at = CURRENT_TIMESTAMP "
        "WHERE domain IS NOT NULL AND domain != ''"
    )
    await db.execute(stamp_sql)
    await db.commit()

    return summary
`/sources/classification/bulk-approve?${qs}`); }, + triggerExternalReputationSync() { + return this._request('POST', '/sources/external-reputation/sync'); + }, createSource(data) { return this._request('POST', '/sources', data); diff --git a/src/static/js/app.js b/src/static/js/app.js index 1f8d0b4..13cf81a 100644 --- a/src/static/js/app.js +++ b/src/static/js/app.js @@ -2834,6 +2834,16 @@ async handleRefresh() { } }, + async triggerExternalReputationSync() { + if (!confirm('IFCN- und EUvsDisinfo-Datenbanken jetzt syncen? Lauft im Hintergrund (~30 Sek).')) return; + try { + await API.triggerExternalReputationSync(); + UI.showToast('Externer Sync gestartet. Quellenliste in 30 Sek neu laden.', 'info'); + } catch (err) { + UI.showToast('Sync fehlgeschlagen: ' + err.message, 'error'); + } + }, + renderSourceStats(stats) { const bar = document.getElementById('sources-stats-bar'); if (!bar) return; @@ -2866,6 +2876,7 @@ async handleRefresh() { const mediaTypeFilter = document.getElementById('sources-filter-mediatype')?.value || ''; const reliabilityFilter = document.getElementById('sources-filter-reliability')?.value || ''; const alignmentFilter = document.getElementById('sources-filter-alignment')?.value || ''; + const externFilter = document.getElementById('sources-filter-extern')?.value || ''; const search = (document.getElementById('sources-search')?.value || '').toLowerCase(); // Alle Quellen nach Domain gruppieren @@ -2929,6 +2940,11 @@ async handleRefresh() { if (alignmentFilter) { if (!feeds.some(f => Array.isArray(f.alignments) && f.alignments.includes(alignmentFilter))) continue; } + if (externFilter === 'ifcn') { + if (!feeds.some(f => f.ifcn_signatory)) continue; + } else if (externFilter === 'eu_disinfo') { + if (!feeds.some(f => f.eu_disinfo_listed)) continue; + } // Suche if (search) { diff --git a/src/static/js/components.js b/src/static/js/components.js index 338802e..2ea7743 100644 --- a/src/static/js/components.js +++ b/src/static/js/components.js @@ 
-1193,7 +1193,20 @@ const UI = { } const rel = feed.reliability; if (rel && rel !== 'na') { - parts.push(``); + const relLabel = this._reliabilityLabels[rel] || rel; + const relSource = feed.ifcn_signatory ? '(IFCN-Faktenchecker)' + : (feed.eu_disinfo_listed ? `(EU-Desinfo, ${feed.eu_disinfo_case_count || 0} Fälle)` + : '(LLM-Schätzung)'); + const relTitle = `Glaubwürdigkeit: ${relLabel} ${relSource}`; + parts.push(``); + } + if (feed.ifcn_signatory) { + parts.push(`✓ IFCN`); + } + if (feed.eu_disinfo_listed) { + const cnt = feed.eu_disinfo_case_count || 0; + const title = `EUvsDisinfo: ${cnt} dokumentierte Desinformations-Fälle`; + parts.push(`⚠ EU-Desinfo (${cnt})`); } if (feed.state_affiliated) { parts.push(`⚑`); @@ -1285,7 +1298,15 @@ const UI = { lines.push('Politisch: ' + (pl ? pl.full : firstFeed.political_orientation)); } if (firstFeed.reliability && firstFeed.reliability !== 'na') { - lines.push('Glaubwürdigkeit: ' + (this._reliabilityLabels[firstFeed.reliability] || firstFeed.reliability)); + const relLabel = this._reliabilityLabels[firstFeed.reliability] || firstFeed.reliability; + const relSrc = firstFeed.ifcn_signatory ? ' (IFCN-Faktenchecker)' + : (firstFeed.eu_disinfo_listed ? ` (EU-Desinfo, ${firstFeed.eu_disinfo_case_count || 0} Fälle)` + : ' (LLM-Schätzung)'); + lines.push('Glaubwürdigkeit: ' + relLabel + relSrc); + } + if (firstFeed.ifcn_signatory) lines.push('IFCN-Faktenchecker: ja'); + if (firstFeed.eu_disinfo_listed) { + lines.push(`EUvsDisinfo: ${firstFeed.eu_disinfo_case_count || 0} Fälle` + (firstFeed.eu_disinfo_last_seen ? ` (zuletzt ${firstFeed.eu_disinfo_last_seen})` : '')); } if (firstFeed.state_affiliated) lines.push('Staatsnah: ja'); if (Array.isArray(firstFeed.alignments) && firstFeed.alignments.length > 0) {