From 7f220a9b65e164d1cd3d0c59aeee8a87d89be968 Mon Sep 17 00:00:00 2001 From: Claude Code Date: Thu, 7 May 2026 00:13:39 +0000 Subject: [PATCH 01/15] feat(orchestrator): Faktencheck vor Lagebild mit Fallback (sequenziell) Bislang liefen factcheck + analyze parallel via asyncio.gather. Folge: Lagebild konnte Aussagen treffen, die der Faktencheck im selben Refresh als contradicted markiert. Inkonsistenz zwischen Lagebild-Tab und Faktencheck- Tab; im PDF/DOCX-Export schon kritisch. Variante 1 aus der Diskussion: strikt sequenziell, mit Fallback bei Faktencheck-Fail (Refresh bricht NICHT ab, Lagebild laeuft dann ohne Faktenkontext wie bisher, ein Logeintrag dokumentiert den Fallback). Aenderungen: - analyzer.build_fact_context_block(): neuer Helper, baut den GEPRUEFTE-FAKTEN-Block aus existing_facts + neuen/aktualisierten Fakten. Status-Domaenen adhoc/research vereinheitlicht zu Bestaetigt / Umstritten / Unbestaetigt / Entwicklung. Max 20 Fakten, sortiert nach Status-Prioritaet desc und sources_count desc. Bei leerer Eingabe leerer String -> Fallback-Pfad. - analyzer.analyze() / analyze_incremental(): neuer Optional-Parameter fact_context_block (default leer, Backward-Compat). 4 Prompt-Templates bekommen {fact_context_block}-Platzhalter sowie eine AUSSAGE-DISZIPLIN- Sektion: bestaetigte Fakten als Geruest, Umstrittenes explizit machen, Unbestaetigtes klar einordnen, kein Spekulieren ueber ungedecktes. - orchestrator: asyncio.gather durch sequenzielle Logik ersetzt. Faktencheck zuerst, Pipeline-Step 6 done direkt nach dem Aufruf (count_value ist Schaetzung; finale DB-Zahlen stehen spaeter). Lagebild danach (Step 7) mit fact_context_block. _do_analysis-Closure um den Parameter erweitert, kein toter Inline-Block. - spaeteres _pipe_done(factcheck) entfernt -- der Step wird jetzt frueher geschlossen, der spaetere Persistierungsblock laesst ihn unberuehrt. UI-Pipeline zeigt automatisch sequenzielle Aktivitaet statt beide Steps gleichzeitig -- keine Frontend-Aenderung noetig. Latenz pro Refresh steigt um die factcheck-Dauer. Bewusst akzeptiert: Konsistenz vor Geschwindigkeit. --- src/agents/analyzer.py | 119 +++++++++++++++++++++++++++++++++++-- src/agents/orchestrator.py | 76 ++++++++++++++++++----- 2 files changed, 175 insertions(+), 20 deletions(-) diff --git a/src/agents/analyzer.py b/src/agents/analyzer.py index 8a067af..9bb45e6 100644 --- a/src/agents/analyzer.py +++ b/src/agents/analyzer.py @@ -16,7 +16,7 @@ WICHTIG: Verwende IMMER echte UTF-8-Umlaute (ä, ö, ü, ß) — NIEMALS Umschre VORFALL: {title} KONTEXT: {description} -VORHANDENE MELDUNGEN: +{fact_context_block}VORHANDENE MELDUNGEN: {articles_text} AUFTRAG: @@ -59,7 +59,7 @@ WICHTIG: Verwende IMMER echte UTF-8-Umlaute (ä, ö, ü, ß) — NIEMALS Umschre THEMA: {title} KONTEXT: {description} -VORLIEGENDE QUELLEN: +{fact_context_block}VORLIEGENDE QUELLEN: {articles_text} AUFTRAG: @@ -118,7 +118,7 @@ BISHERIGES LAGEBILD: BISHERIGE QUELLEN: {previous_sources_text} -NEUE MELDUNGEN SEIT DEM LETZTEN UPDATE: +{fact_context_block}NEUE MELDUNGEN SEIT DEM LETZTEN UPDATE: {new_articles_text} AUFTRAG: @@ -165,7 +165,7 @@ BISHERIGES BRIEFING: BISHERIGE QUELLEN: {previous_sources_text} -NEUE QUELLEN SEIT DEM LETZTEN UPDATE: +{fact_context_block}NEUE QUELLEN SEIT DEM LETZTEN UPDATE: {new_articles_text} AUFTRAG: @@ -264,6 +264,112 @@ Antworte AUSSCHLIESSLICH als JSON-Objekt — KEINE Erklärung, KEINE Einleitung: {{"relevant_ids": [1, 3, 7]}}""" + + +# Status-Gruppen fuer den Fakten-Kontext im Analyse-Prompt. 
+# adhoc nutzt confirmed/unconfirmed/contradicted/developing, +# research nutzt established/unverified/disputed/developing — beide Domaenen +# werden in dieselben vier Anzeige-Gruppen abgebildet. +_FACT_STATUS_GROUPS = [ + ("Bestätigt (mehrere unabhängige Quellen oder durch Faktencheck als gesichert eingestuft):", + {"confirmed", "established"}), + ("Umstritten (Quellen widersprechen sich oder Faktencheck hat Widersprüche dokumentiert):", + {"contradicted", "disputed"}), + ("Unbestätigt (nur eine einzelne Quelle, eine unabhängige Bestätigung steht aus):", + {"unconfirmed", "unverified"}), + ("In Entwicklung (laufender Sachverhalt, Stand offen):", + {"developing"}), +] + +_FACT_STATUS_PRIORITY = { + "confirmed": 5, "established": 5, + "contradicted": 4, "disputed": 4, + "unconfirmed": 3, "unverified": 3, + "developing": 1, +} + + +def build_fact_context_block( + existing_facts: list[dict] | None, + new_or_updated_facts: list[dict] | None, + incident_type: str, + max_total: int = 20, +) -> str: + """Baut den 'GEPRUEFTE FAKTEN'-Block fuer den Analyse-Prompt. + + Wird vom Orchestrator zwischen Faktencheck und Lagebild aufgerufen, damit + das Lagebild auf gepruefter Faktenbasis schreibt und Unklarheiten explizit + benennt. Bei leerer Faktenliste wird ein leerer String zurueckgegeben — der + Prompt laeuft dann ohne Fakten-Kontext (Fallback bei Faktencheck-Fail oder + bei Lagen ohne bisherige Fakten). + """ + existing_facts = existing_facts or [] + new_or_updated_facts = new_or_updated_facts or [] + if not existing_facts and not new_or_updated_facts: + return "" + + seen_claims: set[str] = set() + merged: list[dict] = [] + # Neue/aktualisierte Fakten zuerst (Status ist aktueller Stand). + for f in new_or_updated_facts: + c = (f.get("claim") or "").strip().lower() + if not c or c in seen_claims: + continue + seen_claims.add(c) + merged.append(f) + # Dann alte unveraenderte Fakten. + for f in existing_facts: + c = (f.get("claim") or "").strip().lower() + if not c or c in seen_claims: + continue + seen_claims.add(c) + merged.append(f) + + if not merged: + return "" + + merged.sort(key=lambda f: ( + -_FACT_STATUS_PRIORITY.get((f.get("status") or "").lower(), 0), + -(f.get("sources_count") or 0), + )) + merged = merged[:max_total] + + grouped: dict[str, list[dict]] = {label: [] for label, _ in _FACT_STATUS_GROUPS} + for f in merged: + s = (f.get("status") or "").lower() + for label, codes in _FACT_STATUS_GROUPS: + if s in codes: + grouped[label].append(f) + break + + if not any(grouped.values()): + return "" + + lines: list[str] = [] + lines.append("GEPRÜFTE FAKTEN (Stand nach dem Faktencheck dieses Refresh, max. 
{n} priorisiert):".format(n=max_total)) + for label, _codes in _FACT_STATUS_GROUPS: + items = grouped[label] + if not items: + continue + lines.append("") + lines.append(label) + for f in items: + claim = (f.get("claim") or "").strip() + sc = f.get("sources_count") or 0 + sc_text = f" ({sc} {'Quellen' if sc != 1 else 'Quelle'})" if sc else "" + lines.append(f"- {claim}{sc_text}") + + lines.append("") + lines.append("AUSSAGE-DISZIPLIN für das Lagebild:") + lines.append("- Bestätigte Fakten als Grundgerüst nehmen, ohne Hedging.") + lines.append("- Umstrittene Punkte explizit als umstritten kennzeichnen, beide Seiten knapp benennen.") + lines.append("- Unbestätigtes klar einordnen ('Eine einzelne Quelle berichtet ...', 'Eine unabhängige Bestätigung steht aus.').") + lines.append("- Bei Aussagen, die durch keinen geprüften Fakt gedeckt sind und auch nicht direkt aus einer der vorliegenden Meldungen hervorgehen: NICHT spekulieren — entweder weglassen oder als unklar kennzeichnen.") + lines.append("- Triff KEINE Aussagen, die mit den oben gelisteten geprüften Fakten in Widerspruch stehen.") + lines.append("") + return "\n".join(lines) + + class AnalyzerAgent: """Analysiert und übersetzt Meldungen über Claude CLI.""" @@ -290,7 +396,7 @@ class AnalyzerAgent: articles_text += f"Inhalt: {content[:800]}\n" return articles_text - async def analyze(self, title: str, description: str, articles: list[dict], incident_type: str = "adhoc") -> tuple[dict | None, ClaudeUsage | None]: + async def analyze(self, title: str, description: str, articles: list[dict], incident_type: str = "adhoc", fact_context_block: str = "") -> tuple[dict | None, ClaudeUsage | None]: """Erstanalyse: Analysiert alle Meldungen zu einem Vorfall (erster Refresh).""" if not articles: return None, None @@ -306,6 +412,7 @@ class AnalyzerAgent: articles_text=articles_text, today=today, output_language=OUTPUT_LANGUAGE, + fact_context_block=fact_context_block, ) try: @@ -327,6 +434,7 @@ class AnalyzerAgent: previous_summary: str, previous_sources_json: str | None, incident_type: str = "adhoc", + fact_context_block: str = "", ) -> tuple[dict | None, ClaudeUsage | None]: """Inkrementelle Analyse: Aktualisiert das Lagebild mit nur den neuen Artikeln. 
@@ -369,6 +477,7 @@ class AnalyzerAgent: new_articles_text=new_articles_text, today=today, output_language=OUTPUT_LANGUAGE, + fact_context_block=fact_context_block, ) try: diff --git a/src/agents/orchestrator.py b/src/agents/orchestrator.py index 225a666..e8bb457 100644 --- a/src/agents/orchestrator.py +++ b/src/agents/orchestrator.py @@ -1299,18 +1299,22 @@ class AgentOrchestrator: except Exception as e: logger.warning("Bias-Anreicherung fehlgeschlagen (Pipeline laeuft weiter): %s", e) - # --- Analyse-Task --- - async def _do_analysis(): + # --- Analyse-Task (wird nach _do_factcheck mit fact_context_block aufgerufen) --- + async def _do_analysis(fact_context_block: str = ""): analyzer = AnalyzerAgent() if previous_summary and new_count > 0: logger.info(f"Inkrementelle Analyse: {new_count} neue Artikel zum bestehenden Lagebild") return await analyzer.analyze_incremental( title, description, new_articles_for_analysis, previous_summary, previous_sources_json, incident_type, + fact_context_block=fact_context_block, ) else: logger.info("Erstanalyse: Alle Artikel werden analysiert") - return await analyzer.analyze(title, description, all_articles_preloaded, incident_type) + return await analyzer.analyze( + title, description, all_articles_preloaded, incident_type, + fact_context_block=fact_context_block, + ) # --- Faktencheck-Task --- async def _do_factcheck(): @@ -1344,20 +1348,61 @@ class AgentOrchestrator: articles_for_check = [dict(row) for row in await cursor.fetchall()] return await factchecker.check(title, articles_for_check, incident_type) - # Pipeline-Schritte 6+7: Lagebild verfassen + Fakten prüfen (Start, parallel) - await _pipe_start("summary") + # Pipeline-Schritt 6: Faktencheck zuerst (sequenziell). Liefert den + # Faktenkontext fuer das Lagebild, damit dieses auf geprueftem Stand + # schreibt und Unklarheiten explizit benennt. Variante 1: bei + # Faktencheck-Fehler faellt das Lagebild auf den alten Pfad ohne + # Faktenkontext zurueck (Refresh bricht NICHT ab). await _pipe_start("factcheck") + factcheck_result: tuple = ([], None) + fact_context_block = "" + factcheck_failed_reason: str | None = None + try: + factcheck_result = await _do_factcheck() + except Exception as fc_err: + factcheck_failed_reason = str(fc_err) + logger.warning( + "Faktencheck fehlgeschlagen, Lagebild laeuft ohne Faktenkontext: %s", + fc_err, exc_info=True, + ) - # Beide Tasks PARALLEL starten - logger.info("Starte Analyse und Faktencheck parallel...") - analysis_result, factcheck_result = await asyncio.gather( - _do_analysis(), - _do_factcheck(), + fact_checks, fc_usage = factcheck_result if factcheck_result else ([], None) + + # Pipeline-Schritt 6 done direkt nach dem Aufruf — die finale + # DB-Persistierung passiert weiter unten, aber fuer die UI ist + # der Faktencheck-Aufruf hier abgeschlossen. Der count_value + # ist eine Schaetzung (echte Zahl steht spaeter in der DB). + _fc_estimated_new = max(0, len(fact_checks or []) - len(existing_facts or [])) + await _pipe_done( + "factcheck", + count_value=_fc_estimated_new, + count_secondary=len(fact_checks) if fact_checks else 0, ) + # Faktenkontext fuer das Lagebild bauen. 
+ try: + from agents.analyzer import build_fact_context_block as _build_fc_ctx + fact_context_block = _build_fc_ctx( + existing_facts or [], fact_checks or [], incident_type, + ) + if fact_context_block: + logger.info( + "Faktenkontext fuer Lagebild: %d Zeichen, basierend auf %d alten + %d neuen Fakten", + len(fact_context_block), len(existing_facts or []), len(fact_checks or []), + ) + except Exception as ctx_err: + logger.warning("build_fact_context_block fehlgeschlagen: %s", ctx_err, exc_info=True) + fact_context_block = "" + + # Pipeline-Schritt 7: Lagebild verfassen (jetzt mit Faktenkontext) + await _pipe_start("summary") + logger.info( + "Starte Lagebild (sequenziell nach Faktencheck%s)", + " — OHNE Faktenkontext (Fallback)" if factcheck_failed_reason else "", + ) + analysis_result = await _do_analysis(fact_context_block) + analysis, analysis_usage = analysis_result - fact_checks, fc_usage = factcheck_result - # Pipeline-Schritt 6: Lagebild verfassen (fertig, keine Zahl, nur Status) await _pipe_done("summary", count_value=None, count_secondary=None) # --- Analyse-Ergebnisse verarbeiten --- @@ -1656,9 +1701,10 @@ class AgentOrchestrator: await db.commit() - # Pipeline-Schritt 7: Fakten prüfen (fertig) - _new_facts_count = max(0, len(fact_checks) - len(existing_facts)) - await _pipe_done("factcheck", count_value=_new_facts_count, count_secondary=len(fact_checks) if fact_checks else 0) + # Pipeline-Schritt 7 (Fakten pruefen) wurde bereits frueher als done + # markiert (siehe weiter oben — direkt nach dem _do_factcheck-Aufruf, + # bevor das Lagebild generiert wurde). Hier nur noch die DB- + # Persistierung der Fakten, ohne den Step erneut zu schliessen. # Pipeline-Schritt 8: Qualitätscheck (Start, ohne Zahlen) await _pipe_start("qc") From f8e2f73bc068ae4ee4212876c4d41938ebaf49a5 Mon Sep 17 00:00:00 2001 From: Claude Code Date: Thu, 7 May 2026 18:21:45 +0000 Subject: [PATCH 02/15] feat(sources): strukturierte Klassifikation (Politik/Medientyp/Reliability/Alignments) - Neue sources-Spalten: political_orientation (7+2 Stufen), media_type (20), reliability (5+1), state_affiliated, country_code, classification_source, classified_at sowie proposed_*-Spalten fuer LLM-Vorschlaege. - Neue source_alignments-Tabelle fuer Mehrfach-Tagging geopolitischer Naehe (prorussisch, proiranisch, prowestlich, ...). - API-Filter: ?political_orientation, ?media_type, ?reliability, ?state_affiliated, ?alignment. - create/update_source nehmen alignments[] entgegen und setzen classification_source automatisch auf 'manual' bei Klassifikations-Edits. Backwards-kompatibel: bestehendes bias/language/category bleibt unveraendert, Default fuer Bestandsquellen ist classification_source = 'legacy'. 
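Beispielhafte Nutzung der neuen Filter (nur eine Skizze: Basis-URL und
Auth-Header sind hier Annahmen, die Query-Parameter entsprechen den oben
genannten Filtern; httpx dient lediglich als Beispiel-Client):

    import httpx

    async def fetch_filtered_sources(token: str) -> list[dict]:
        # Staatsnahe Quellen mit sehr niedriger Reliability und prorussischem
        # Alignment; mehrere Filter werden serverseitig UND-verknuepft.
        params = {
            "reliability": "sehr_niedrig",
            "state_affiliated": "true",
            "alignment": "prorussisch",
        }
        async with httpx.AsyncClient(base_url="http://localhost:8000") as client:
            resp = await client.get(
                "/api/sources",
                params=params,
                headers={"Authorization": f"Bearer {token}"},  # Annahme: Token-Auth
            )
            resp.raise_for_status()
            return resp.json()
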
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/database.py | 77 +++++++++++++++- src/models.py | 47 ++++++++-- src/routers/sources.py | 196 ++++++++++++++++++++++++++++++++++------- 3 files changed, 279 insertions(+), 41 deletions(-) diff --git a/src/database.py b/src/database.py index 19f06bf..54d6b7e 100644 --- a/src/database.py +++ b/src/database.py @@ -158,7 +158,31 @@ CREATE TABLE IF NOT EXISTS sources ( article_count INTEGER DEFAULT 0, last_seen_at TIMESTAMP, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - tenant_id INTEGER REFERENCES organizations(id) + tenant_id INTEGER REFERENCES organizations(id), + language TEXT, + bias TEXT, + political_orientation TEXT DEFAULT 'na', + media_type TEXT DEFAULT 'sonstige', + reliability TEXT DEFAULT 'na', + state_affiliated INTEGER DEFAULT 0, + country_code TEXT, + classification_source TEXT DEFAULT 'legacy', + classified_at TIMESTAMP, + proposed_political_orientation TEXT, + proposed_media_type TEXT, + proposed_reliability TEXT, + proposed_state_affiliated INTEGER, + proposed_country_code TEXT, + proposed_alignments_json TEXT, + proposed_confidence REAL, + proposed_reasoning TEXT, + proposed_at TIMESTAMP +); + +CREATE TABLE IF NOT EXISTS source_alignments ( + source_id INTEGER NOT NULL REFERENCES sources(id) ON DELETE CASCADE, + alignment TEXT NOT NULL, + PRIMARY KEY (source_id, alignment) ); CREATE TABLE IF NOT EXISTS notifications ( @@ -611,6 +635,57 @@ async def init_db(): await db.execute("ALTER TABLE sources ADD COLUMN tenant_id INTEGER REFERENCES organizations(id)") await db.commit() + # Migration: language + bias (Freitext, schon laenger im Einsatz, Schema-Lueck schliessen) + if "language" not in src_columns: + await db.execute("ALTER TABLE sources ADD COLUMN language TEXT") + await db.commit() + if "bias" not in src_columns: + await db.execute("ALTER TABLE sources ADD COLUMN bias TEXT") + await db.commit() + + # Migration: strukturierte Klassifikations-Spalten fuer sources + for col, ddl in [ + ("political_orientation", "ALTER TABLE sources ADD COLUMN political_orientation TEXT DEFAULT 'na'"), + ("media_type", "ALTER TABLE sources ADD COLUMN media_type TEXT DEFAULT 'sonstige'"), + ("reliability", "ALTER TABLE sources ADD COLUMN reliability TEXT DEFAULT 'na'"), + ("state_affiliated", "ALTER TABLE sources ADD COLUMN state_affiliated INTEGER DEFAULT 0"), + ("country_code", "ALTER TABLE sources ADD COLUMN country_code TEXT"), + ("classification_source", "ALTER TABLE sources ADD COLUMN classification_source TEXT DEFAULT 'legacy'"), + ("classified_at", "ALTER TABLE sources ADD COLUMN classified_at TIMESTAMP"), + ("proposed_political_orientation", "ALTER TABLE sources ADD COLUMN proposed_political_orientation TEXT"), + ("proposed_media_type", "ALTER TABLE sources ADD COLUMN proposed_media_type TEXT"), + ("proposed_reliability", "ALTER TABLE sources ADD COLUMN proposed_reliability TEXT"), + ("proposed_state_affiliated", "ALTER TABLE sources ADD COLUMN proposed_state_affiliated INTEGER"), + ("proposed_country_code", "ALTER TABLE sources ADD COLUMN proposed_country_code TEXT"), + ("proposed_alignments_json", "ALTER TABLE sources ADD COLUMN proposed_alignments_json TEXT"), + ("proposed_confidence", "ALTER TABLE sources ADD COLUMN proposed_confidence REAL"), + ("proposed_reasoning", "ALTER TABLE sources ADD COLUMN proposed_reasoning TEXT"), + ("proposed_at", "ALTER TABLE sources ADD COLUMN proposed_at TIMESTAMP"), + ]: + if col not in src_columns: + await db.execute(ddl) + await db.commit() + if any(c not in src_columns for c in 
("political_orientation", "media_type", "reliability")): + logger.info("Migration: Klassifikations-Spalten zu sources hinzugefuegt") + + # Migration: source_alignments-Tabelle (Mehrfach-Tags fuer geopolitische Naehe) + cursor = await db.execute( + "SELECT name FROM sqlite_master WHERE type='table' AND name='source_alignments'" + ) + if not await cursor.fetchone(): + await db.executescript( + """ + CREATE TABLE source_alignments ( + source_id INTEGER NOT NULL REFERENCES sources(id) ON DELETE CASCADE, + alignment TEXT NOT NULL, + PRIMARY KEY (source_id, alignment) + ); + CREATE INDEX IF NOT EXISTS idx_source_alignments_alignment ON source_alignments(alignment); + """ + ) + await db.commit() + logger.info("Migration: source_alignments-Tabelle erstellt") + # Migration: tenant_id fuer notifications cursor = await db.execute("PRAGMA table_info(notifications)") notif_columns = [row[1] for row in await cursor.fetchall()] diff --git a/src/models.py b/src/models.py index 6c1e547..32d3bb7 100644 --- a/src/models.py +++ b/src/models.py @@ -139,24 +139,51 @@ class IncidentListItem(BaseModel): # Sources (Quellenverwaltung) +SOURCE_TYPE_PATTERN = "^(rss_feed|web_source|excluded|telegram_channel|podcast_feed)$" +SOURCE_CATEGORY_PATTERN = "^(nachrichtenagentur|oeffentlich-rechtlich|qualitaetszeitung|behoerde|fachmedien|think-tank|international|regional|boulevard|sonstige)$" +SOURCE_STATUS_PATTERN = "^(active|inactive)$" +POLITICAL_ORIENTATION_PATTERN = "^(links_extrem|links|mitte_links|liberal|mitte|konservativ|mitte_rechts|rechts|rechts_extrem|na)$" +MEDIA_TYPE_PATTERN = "^(tageszeitung|wochenzeitung|magazin|tv_sender|radio|oeffentlich_rechtlich|nachrichtenagentur|online_only|blog|telegram_kanal|telegram_bot|podcast|social_media|imageboard|think_tank|ngo|behoerde|staatsmedium|fachmedium|sonstige)$" +RELIABILITY_PATTERN = "^(sehr_hoch|hoch|gemischt|niedrig|sehr_niedrig|na)$" +ALIGNMENT_PATTERN = "^(prorussisch|proiranisch|prowestlich|proukrainisch|prochinesisch|projapanisch|proisraelisch|propalaestinensisch|protuerkisch|panarabisch|neutral|sonstige)$" +COUNTRY_CODE_PATTERN = "^[A-Z]{2}$" +CLASSIFICATION_SOURCE_PATTERN = "^(manual|llm_approved|llm_pending|legacy)$" + + class SourceCreate(BaseModel): name: str = Field(min_length=1, max_length=200) url: Optional[str] = None domain: Optional[str] = None - source_type: str = Field(default="rss_feed", pattern="^(rss_feed|web_source|excluded|telegram_channel|podcast_feed)$") - category: str = Field(default="sonstige", pattern="^(nachrichtenagentur|oeffentlich-rechtlich|qualitaetszeitung|behoerde|fachmedien|think-tank|international|regional|boulevard|sonstige)$") - status: str = Field(default="active", pattern="^(active|inactive)$") + source_type: str = Field(default="rss_feed", pattern=SOURCE_TYPE_PATTERN) + category: str = Field(default="sonstige", pattern=SOURCE_CATEGORY_PATTERN) + status: str = Field(default="active", pattern=SOURCE_STATUS_PATTERN) notes: Optional[str] = None + language: Optional[str] = None + bias: Optional[str] = None + political_orientation: Optional[str] = Field(default=None, pattern=POLITICAL_ORIENTATION_PATTERN) + media_type: Optional[str] = Field(default=None, pattern=MEDIA_TYPE_PATTERN) + reliability: Optional[str] = Field(default=None, pattern=RELIABILITY_PATTERN) + state_affiliated: Optional[bool] = None + country_code: Optional[str] = Field(default=None, pattern=COUNTRY_CODE_PATTERN) + alignments: Optional[list[str]] = None class SourceUpdate(BaseModel): name: Optional[str] = Field(default=None, max_length=200) url: Optional[str] = 
None domain: Optional[str] = None - source_type: Optional[str] = Field(default=None, pattern="^(rss_feed|web_source|excluded|telegram_channel|podcast_feed)$") - category: Optional[str] = Field(default=None, pattern="^(nachrichtenagentur|oeffentlich-rechtlich|qualitaetszeitung|behoerde|fachmedien|think-tank|international|regional|boulevard|sonstige)$") - status: Optional[str] = Field(default=None, pattern="^(active|inactive)$") + source_type: Optional[str] = Field(default=None, pattern=SOURCE_TYPE_PATTERN) + category: Optional[str] = Field(default=None, pattern=SOURCE_CATEGORY_PATTERN) + status: Optional[str] = Field(default=None, pattern=SOURCE_STATUS_PATTERN) notes: Optional[str] = None + language: Optional[str] = None + bias: Optional[str] = None + political_orientation: Optional[str] = Field(default=None, pattern=POLITICAL_ORIENTATION_PATTERN) + media_type: Optional[str] = Field(default=None, pattern=MEDIA_TYPE_PATTERN) + reliability: Optional[str] = Field(default=None, pattern=RELIABILITY_PATTERN) + state_affiliated: Optional[bool] = None + country_code: Optional[str] = Field(default=None, pattern=COUNTRY_CODE_PATTERN) + alignments: Optional[list[str]] = None class SourceResponse(BaseModel): @@ -174,6 +201,14 @@ class SourceResponse(BaseModel): created_at: str language: Optional[str] = None bias: Optional[str] = None + political_orientation: Optional[str] = None + media_type: Optional[str] = None + reliability: Optional[str] = None + state_affiliated: bool = False + country_code: Optional[str] = None + classification_source: Optional[str] = None + classified_at: Optional[str] = None + alignments: list[str] = [] is_global: bool = False diff --git a/src/routers/sources.py b/src/routers/sources.py index f6318d1..9adade2 100644 --- a/src/routers/sources.py +++ b/src/routers/sources.py @@ -12,7 +12,56 @@ logger = logging.getLogger("osint.sources") router = APIRouter(prefix="/api/sources", tags=["sources"]) -SOURCE_UPDATE_COLUMNS = {"name", "url", "domain", "source_type", "category", "status", "notes"} +SOURCE_UPDATE_COLUMNS = { + "name", "url", "domain", "source_type", "category", "status", "notes", + "language", "bias", + "political_orientation", "media_type", "reliability", + "state_affiliated", "country_code", +} +SOURCE_CLASSIFICATION_FIELDS = { + "political_orientation", "media_type", "reliability", + "state_affiliated", "country_code", +} +ALLOWED_ALIGNMENTS = { + "prorussisch", "proiranisch", "prowestlich", "proukrainisch", + "prochinesisch", "projapanisch", "proisraelisch", "propalaestinensisch", + "protuerkisch", "panarabisch", "neutral", "sonstige", +} + + +async def _load_alignments_for(db: aiosqlite.Connection, source_ids: list[int]) -> dict[int, list[str]]: + """Lädt alignments fuer mehrere Quellen in einer Query und gibt {source_id: [alignment, ...]} zurück.""" + if not source_ids: + return {} + placeholders = ",".join("?" 
for _ in source_ids) + cursor = await db.execute( + f"SELECT source_id, alignment FROM source_alignments WHERE source_id IN ({placeholders}) ORDER BY alignment", + source_ids, + ) + out: dict[int, list[str]] = {sid: [] for sid in source_ids} + for row in await cursor.fetchall(): + out.setdefault(row["source_id"], []).append(row["alignment"]) + return out + + +async def _replace_alignments(db: aiosqlite.Connection, source_id: int, alignments: list[str]): + """Ersetzt die alignments-Liste einer Quelle (DELETE + INSERT) — Aufrufer muss commit() machen.""" + await db.execute("DELETE FROM source_alignments WHERE source_id = ?", (source_id,)) + seen: set[str] = set() + for raw in alignments: + a = (raw or "").strip().lower() + if not a or a in seen: + continue + if a not in ALLOWED_ALIGNMENTS: + raise HTTPException( + status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, + detail=f"Ungueltiger alignment-Wert: '{a}'", + ) + seen.add(a) + await db.execute( + "INSERT INTO source_alignments (source_id, alignment) VALUES (?, ?)", + (source_id, a), + ) def _check_source_ownership(source: dict, username: str): @@ -34,6 +83,11 @@ async def list_sources( source_type: str = None, category: str = None, source_status: str = None, + political_orientation: str = None, + media_type: str = None, + reliability: str = None, + state_affiliated: bool = None, + alignment: str = None, current_user: dict = Depends(get_current_user), db: aiosqlite.Connection = Depends(db_dependency), ): @@ -41,27 +95,43 @@ async def list_sources( tenant_id = current_user.get("tenant_id") # Global (tenant_id=NULL) + eigene Org - query = "SELECT * FROM sources WHERE (tenant_id IS NULL OR tenant_id = ?)" - params = [tenant_id] + query = "SELECT s.* FROM sources s WHERE (s.tenant_id IS NULL OR s.tenant_id = ?)" + params: list = [tenant_id] if source_type: - query += " AND source_type = ?" + query += " AND s.source_type = ?" params.append(source_type) if category: - query += " AND category = ?" + query += " AND s.category = ?" params.append(category) if source_status: - query += " AND status = ?" + query += " AND s.status = ?" params.append(source_status) + if political_orientation: + query += " AND s.political_orientation = ?" + params.append(political_orientation) + if media_type: + query += " AND s.media_type = ?" + params.append(media_type) + if reliability: + query += " AND s.reliability = ?" + params.append(reliability) + if state_affiliated is not None: + query += " AND s.state_affiliated = ?" + params.append(1 if state_affiliated else 0) + if alignment: + query += " AND EXISTS (SELECT 1 FROM source_alignments sa WHERE sa.source_id = s.id AND sa.alignment = ?)" + params.append(alignment.lower()) - query += " ORDER BY source_type, category, name" + query += " ORDER BY s.source_type, s.category, s.name" cursor = await db.execute(query, params) rows = await cursor.fetchall() - results = [] - for row in rows: - d = dict(row) + results = [dict(row) for row in rows] + alignments_map = await _load_alignments_for(db, [r["id"] for r in results]) + for d in results: d["is_global"] = d.get("tenant_id") is None - results.append(d) + d["state_affiliated"] = bool(d.get("state_affiliated")) + d["alignments"] = alignments_map.get(d["id"], []) return results @@ -454,26 +524,60 @@ async def create_source( detail=f"Domain '{domain}' bereits als Quelle vorhanden: {domain_existing['name']}. 
Für einen neuen RSS-Feed bitte die Feed-URL angeben.", ) + payload = data.model_dump(exclude_unset=True) + alignments = payload.pop("alignments", None) + classification_touched = bool(SOURCE_CLASSIFICATION_FIELDS & payload.keys()) or alignments is not None + + cols = ["name", "url", "domain", "source_type", "category", "status", "notes", + "language", "bias", + "political_orientation", "media_type", "reliability", + "state_affiliated", "country_code", + "added_by", "tenant_id"] + vals = [ + data.name, + data.url, + domain, + data.source_type, + data.category, + data.status, + data.notes, + payload.get("language"), + payload.get("bias"), + payload.get("political_orientation"), + payload.get("media_type"), + payload.get("reliability"), + 1 if payload.get("state_affiliated") else 0, + payload.get("country_code"), + current_user["username"], + tenant_id, + ] + if classification_touched: + cols += ["classification_source", "classified_at"] + vals += ["manual"] + ts_marker = True + else: + ts_marker = False + + placeholders = ", ".join(["?"] * len(vals) + (["CURRENT_TIMESTAMP"] if ts_marker else [])) cursor = await db.execute( - """INSERT INTO sources (name, url, domain, source_type, category, status, notes, added_by, tenant_id) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""", - ( - data.name, - data.url, - domain, - data.source_type, - data.category, - data.status, - data.notes, - current_user["username"], - tenant_id, - ), + f"INSERT INTO sources ({', '.join(cols)}) VALUES ({placeholders})", + vals, ) + new_id = cursor.lastrowid + + if alignments: + await _replace_alignments(db, new_id, alignments) + await db.commit() - cursor = await db.execute("SELECT * FROM sources WHERE id = ?", (cursor.lastrowid,)) + cursor = await db.execute("SELECT * FROM sources WHERE id = ?", (new_id,)) row = await cursor.fetchone() - return dict(row) + result = dict(row) + result["is_global"] = result.get("tenant_id") is None + result["state_affiliated"] = bool(result.get("state_affiliated")) + alignments_map = await _load_alignments_for(db, [new_id]) + result["alignments"] = alignments_map.get(new_id, []) + return result @router.put("/{source_id}", response_model=SourceResponse) @@ -494,27 +598,51 @@ async def update_source( _check_source_ownership(dict(row), current_user["username"]) + payload = data.model_dump(exclude_unset=True) + alignments = payload.pop("alignments", None) + updates = {} - for field, value in data.model_dump(exclude_none=True).items(): + for field, value in payload.items(): if field not in SOURCE_UPDATE_COLUMNS: continue # Domain normalisieren if field == "domain" and value: value = _DOMAIN_ALIASES.get(value.lower(), value.lower()) + if field == "state_affiliated": + value = 1 if value else 0 updates[field] = value - if not updates: - return dict(row) + classification_touched = bool(SOURCE_CLASSIFICATION_FIELDS & updates.keys()) or alignments is not None + if classification_touched: + updates["classification_source"] = "manual" + updates["classified_at"] = "CURRENT_TIMESTAMP_MARKER" - set_clause = ", ".join(f"{k} = ?" 
for k in updates) - values = list(updates.values()) + [source_id] + if updates: + set_parts = [] + values = [] + for k, v in updates.items(): + if v == "CURRENT_TIMESTAMP_MARKER": + set_parts.append(f"{k} = CURRENT_TIMESTAMP") + else: + set_parts.append(f"{k} = ?") + values.append(v) + values.append(source_id) + await db.execute(f"UPDATE sources SET {', '.join(set_parts)} WHERE id = ?", values) - await db.execute(f"UPDATE sources SET {set_clause} WHERE id = ?", values) - await db.commit() + if alignments is not None: + await _replace_alignments(db, source_id, alignments) + + if updates or alignments is not None: + await db.commit() cursor = await db.execute("SELECT * FROM sources WHERE id = ?", (source_id,)) row = await cursor.fetchone() - return dict(row) + result = dict(row) + result["is_global"] = result.get("tenant_id") is None + result["state_affiliated"] = bool(result.get("state_affiliated")) + alignments_map = await _load_alignments_for(db, [source_id]) + result["alignments"] = alignments_map.get(source_id, []) + return result @router.delete("/{source_id}", status_code=status.HTTP_204_NO_CONTENT) From 715af17ac3f85f95ae9ffe706a80b49f0147aa6e Mon Sep 17 00:00:00 2001 From: Claude Code Date: Thu, 7 May 2026 18:37:09 +0000 Subject: [PATCH 03/15] feat(sources): UI fuer Quellen-Klassifikation (Filter, Badges, Edit-Form) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Quellen-Modal: 4 neue Filter (Politik, Medientyp, Reliability, Alignment). - Edit-Form: Selects fuer political_orientation/media_type/reliability, Multi-Select-Chips fuer alignments, Toggle state_affiliated, Country-Code-Input. - renderSourceGroup: Politik-Badge mit DACH-Farbskala (rot=L, blau=R), Reliability-Punkt (gruen→rot), Alignment-Tags, state-affiliated-Indikator. Tooltip um alle 4 Achsen erweitert. - CSS-Block fuer alle neuen Badge-/Chip-Styles. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/static/css/style.css | 111 +++++++++++++++++++++++++++ src/static/dashboard.html | 147 ++++++++++++++++++++++++++++++++++++ src/static/js/app.js | 65 ++++++++++++++++ src/static/js/components.js | 112 ++++++++++++++++++++++++++- 4 files changed, 431 insertions(+), 4 deletions(-) diff --git a/src/static/css/style.css b/src/static/css/style.css index f232fac..3bc671c 100644 --- a/src/static/css/style.css +++ b/src/static/css/style.css @@ -3503,6 +3503,117 @@ a.dev-source-pill:hover { color: var(--info); } +/* Klassifikations-Badges (politisch / reliability / alignments / state) */ +.source-classification-badges { + display: inline-flex; + align-items: center; + gap: 4px; + flex-wrap: wrap; +} + +.source-political-badge { + display: inline-flex; + align-items: center; + justify-content: center; + min-width: 22px; + padding: 2px 6px; + border-radius: var(--radius); + font-size: 10px; + font-weight: 700; + letter-spacing: 0.4px; + color: #fff; + background: #9e9e9e; +} +.source-political-badge.pol-links_extrem { background: #b71c1c; } +.source-political-badge.pol-links { background: #e53935; } +.source-political-badge.pol-mitte_links { background: #ef9a9a; color: #4a0d0d; } +.source-political-badge.pol-liberal { background: #fdd835; color: #4a3700; } +.source-political-badge.pol-mitte { background: #9e9e9e; } +.source-political-badge.pol-konservativ { background: #90caf9; color: #0d2740; } +.source-political-badge.pol-mitte_rechts { background: #5c6bc0; } +.source-political-badge.pol-rechts { background: #1976d2; } +.source-political-badge.pol-rechts_extrem { background: #0d47a1; } + +.source-reliability-dot { + display: inline-block; + width: 10px; + height: 10px; + border-radius: 50%; + background: #9e9e9e; + border: 1px solid rgba(0, 0, 0, 0.15); +} +.source-reliability-dot.rel-sehr_hoch { background: #2e7d32; } +.source-reliability-dot.rel-hoch { background: #66bb6a; } +.source-reliability-dot.rel-gemischt { background: #fbc02d; } +.source-reliability-dot.rel-niedrig { background: #ef6c00; } +.source-reliability-dot.rel-sehr_niedrig { background: #c62828; } + +.source-state-badge { + display: inline-flex; + align-items: center; + justify-content: center; + width: 18px; + height: 18px; + border-radius: 50%; + background: #4a148c; + color: #fff; + font-size: 11px; + line-height: 1; +} + +.source-alignment-chip-badge { + display: inline-flex; + align-items: center; + padding: 1px 6px; + border-radius: 999px; + font-size: 10px; + font-weight: 500; + background: var(--cat-sonstige-bg, #eef); + color: var(--text-secondary, #555); + border: 1px solid rgba(0, 0, 0, 0.08); +} + +/* Edit-Form: Klassifikations-Sektion */ +.sources-classification-section { + margin-top: 12px; + padding-top: 12px; + border-top: 1px solid var(--border-color, rgba(0,0,0,0.08)); +} +.sources-classification-header { + font-size: 12px; + font-weight: 600; + color: var(--text-secondary, #555); + margin-bottom: 8px; + letter-spacing: 0.3px; + text-transform: uppercase; +} +.alignment-chips { + display: flex; + flex-wrap: wrap; + gap: 6px; +} +.alignment-chip { + display: inline-flex; + align-items: center; + padding: 4px 10px; + border-radius: 999px; + font-size: 11px; + font-weight: 500; + background: transparent; + color: var(--text-secondary, #555); + border: 1px solid var(--border-color, rgba(0,0,0,0.15)); + cursor: pointer; + transition: all 0.12s ease; +} +.alignment-chip:hover { + background: var(--cat-sonstige-bg, #eef); +} +.alignment-chip.active { + background: 
var(--primary, #2a81cb); + color: #fff; + border-color: var(--primary, #2a81cb); +} + /* Typ-Badges */ .source-type-badge { display: inline-flex; diff --git a/src/static/dashboard.html b/src/static/dashboard.html index 4737350..43a81dd 100644 --- a/src/static/dashboard.html +++ b/src/static/dashboard.html @@ -481,6 +481,70 @@ + + + + + + + + @@ -548,6 +612,89 @@ +
+              <!-- Edit-Form-Abschnitt "Einordnung" (Klassifikation): Selects
+                   #src-political, #src-mediatype, #src-reliability, Country-Code-Input
+                   #src-country, Checkbox #src-state-affiliated sowie Alignment-Chips
+                   #src-alignments-chips (.alignment-chip mit data-alignment).
+                   Das zugehörige HTML-Markup ist in dieser Fassung nicht erhalten. -->
diff --git a/src/static/js/app.js b/src/static/js/app.js index 0e65c4f..1aff794 100644 --- a/src/static/js/app.js +++ b/src/static/js/app.js @@ -2750,6 +2750,10 @@ async handleRefresh() { // Filter anwenden const typeFilter = document.getElementById('sources-filter-type')?.value || ''; const catFilter = document.getElementById('sources-filter-category')?.value || ''; + const politicalFilter = document.getElementById('sources-filter-political')?.value || ''; + const mediaTypeFilter = document.getElementById('sources-filter-mediatype')?.value || ''; + const reliabilityFilter = document.getElementById('sources-filter-reliability')?.value || ''; + const alignmentFilter = document.getElementById('sources-filter-alignment')?.value || ''; const search = (document.getElementById('sources-search')?.value || '').toLowerCase(); // Alle Quellen nach Domain gruppieren @@ -2800,6 +2804,20 @@ async handleRefresh() { if (!hasMatchingCat) continue; } + // Klassifikations-Filter + if (politicalFilter) { + if (!feeds.some(f => (f.political_orientation || 'na') === politicalFilter)) continue; + } + if (mediaTypeFilter) { + if (!feeds.some(f => (f.media_type || 'sonstige') === mediaTypeFilter)) continue; + } + if (reliabilityFilter) { + if (!feeds.some(f => (f.reliability || 'na') === reliabilityFilter)) continue; + } + if (alignmentFilter) { + if (!feeds.some(f => Array.isArray(f.alignments) && f.alignments.includes(alignmentFilter))) continue; + } + // Suche if (search) { const groupText = feeds.map(f => @@ -3054,6 +3072,13 @@ async handleRefresh() { document.getElementById('src-discover-btn').disabled = false; document.getElementById('src-discover-btn').textContent = 'Erkennen'; document.getElementById('src-type-select').value = 'rss_feed'; + // Klassifikations-Felder auf Default zurücksetzen + const polEl = document.getElementById('src-political'); if (polEl) polEl.value = 'na'; + const mtEl = document.getElementById('src-mediatype'); if (mtEl) mtEl.value = 'sonstige'; + const relEl = document.getElementById('src-reliability'); if (relEl) relEl.value = 'na'; + const ccEl = document.getElementById('src-country'); if (ccEl) ccEl.value = ''; + const saEl = document.getElementById('src-state-affiliated'); if (saEl) saEl.checked = false; + this._setAlignmentChips([]); // Save-Button Text zurücksetzen const saveBtn = document.querySelector('#src-discovery-result .sources-discovery-actions .btn-primary'); if (saveBtn) saveBtn.textContent = 'Speichern'; @@ -3235,6 +3260,19 @@ async handleRefresh() { rss_url: source.url, }; + // Klassifikations-Felder setzen + const polEl = document.getElementById('src-political'); + if (polEl) polEl.value = source.political_orientation || 'na'; + const mtEl = document.getElementById('src-mediatype'); + if (mtEl) mtEl.value = source.media_type || 'sonstige'; + const relEl = document.getElementById('src-reliability'); + if (relEl) relEl.value = source.reliability || 'na'; + const ccEl = document.getElementById('src-country'); + if (ccEl) ccEl.value = source.country_code || ''; + const saEl = document.getElementById('src-state-affiliated'); + if (saEl) saEl.checked = !!source.state_affiliated; + this._setAlignmentChips(source.alignments || []); + // Submit-Button-Text ändern const saveBtn = document.querySelector('#src-discovery-result .sources-discovery-actions .btn-primary'); if (saveBtn) saveBtn.textContent = 'Quelle speichern'; @@ -3243,6 +3281,27 @@ async handleRefresh() { if (form) form.scrollIntoView({ behavior: 'smooth', block: 'start' }); }, + _setAlignmentChips(active) { + 
const chips = document.querySelectorAll('#src-alignments-chips .alignment-chip'); + const set = new Set((active || []).map(a => (a || '').toLowerCase())); + chips.forEach(chip => { + if (set.has(chip.dataset.alignment)) chip.classList.add('active'); + else chip.classList.remove('active'); + }); + }, + + _getAlignmentChips() { + return Array.from(document.querySelectorAll('#src-alignments-chips .alignment-chip.active')) + .map(chip => chip.dataset.alignment); + }, + + handleAlignmentChipClick(e) { + const chip = e.target.closest('.alignment-chip'); + if (!chip) return; + e.preventDefault(); + chip.classList.toggle('active'); + }, + async saveSource() { const name = document.getElementById('src-name').value.trim(); if (!name) { @@ -3258,6 +3317,12 @@ async handleRefresh() { url: discovered.rss_url || (discovered.source_type === 'telegram_channel' ? (document.getElementById('src-domain').value || null) : null), domain: document.getElementById('src-domain').value.trim() || discovered.domain || null, notes: document.getElementById('src-notes').value.trim() || null, + political_orientation: document.getElementById('src-political')?.value || 'na', + media_type: document.getElementById('src-mediatype')?.value || 'sonstige', + reliability: document.getElementById('src-reliability')?.value || 'na', + country_code: (document.getElementById('src-country')?.value || '').trim().toUpperCase() || null, + state_affiliated: !!document.getElementById('src-state-affiliated')?.checked, + alignments: this._getAlignmentChips(), }; if (!data.domain && discovered.domain) { diff --git a/src/static/js/components.js b/src/static/js/components.js index b32dce0..d0a2cd8 100644 --- a/src/static/js/components.js +++ b/src/static/js/components.js @@ -1062,6 +1062,85 @@ const UI = { 'sonstige': 'Sonstige', }, + _politicalLabels: { + links_extrem: { short: 'L+', full: 'Links (extrem)' }, + links: { short: 'L', full: 'Links' }, + mitte_links: { short: 'ML', full: 'Mitte-Links' }, + liberal: { short: 'LIB', full: 'Liberal' }, + mitte: { short: 'M', full: 'Mitte' }, + konservativ: { short: 'KON', full: 'Konservativ' }, + mitte_rechts: { short: 'MR', full: 'Mitte-Rechts' }, + rechts: { short: 'R', full: 'Rechts' }, + rechts_extrem: { short: 'R+', full: 'Rechts (extrem)' }, + na: { short: '?', full: 'Nicht eingeordnet' }, + }, + _reliabilityLabels: { + sehr_hoch: 'Sehr hoch', + hoch: 'Hoch', + gemischt: 'Gemischt', + niedrig: 'Niedrig', + sehr_niedrig: 'Sehr niedrig', + na: 'Nicht eingeordnet', + }, + _mediaTypeLabels: { + tageszeitung: 'Tageszeitung', + wochenzeitung: 'Wochenzeitung', + magazin: 'Magazin', + tv_sender: 'TV-Sender', + radio: 'Radio', + oeffentlich_rechtlich: 'Öffentlich-Rechtlich', + nachrichtenagentur: 'Nachrichtenagentur', + online_only: 'Online-only', + blog: 'Blog', + telegram_kanal: 'Telegram-Kanal', + telegram_bot: 'Telegram-Bot', + podcast: 'Podcast', + social_media: 'Social Media', + imageboard: 'Imageboard', + think_tank: 'Think Tank', + ngo: 'NGO', + behoerde: 'Behörde', + staatsmedium: 'Staatsmedium', + fachmedium: 'Fachmedium', + sonstige: 'Sonstige', + }, + _alignmentLabels: { + prorussisch: 'prorussisch', + proiranisch: 'proiranisch', + prowestlich: 'prowestlich', + proukrainisch: 'proukrainisch', + prochinesisch: 'prochinesisch', + projapanisch: 'projapanisch', + proisraelisch: 'proisraelisch', + propalaestinensisch: 'propalästinensisch', + protuerkisch: 'protürkisch', + panarabisch: 'panarabisch', + neutral: 'neutral', + sonstige: 'sonstige', + }, + + _renderClassificationBadges(feed) { + const 
parts = []; + const pol = feed.political_orientation; + if (pol && pol !== 'na') { + const label = this._politicalLabels[pol] || { short: pol, full: pol }; + parts.push(`${this.escape(label.short)}`); + } + const rel = feed.reliability; + if (rel && rel !== 'na') { + parts.push(``); + } + if (feed.state_affiliated) { + parts.push(``); + } + const aligns = Array.isArray(feed.alignments) ? feed.alignments : []; + aligns.forEach(a => { + const label = this._alignmentLabels[a] || a; + parts.push(`${this.escape(label)}`); + }); + return parts.join(''); + }, + /** * Domain-Gruppe rendern (aufklappbar mit Feeds). */ @@ -1117,20 +1196,44 @@ const UI = { ? `${feedCount} Feed${feedCount !== 1 ? 's' : ''}` : ''; - // Info-Button mit Tooltip (Typ, Sprache, Ausrichtung) + // Info-Button mit Tooltip (Typ, Sprache, Ausrichtung, Klassifikation) let infoButtonHtml = ''; const firstFeed = feeds[0] || {}; - const hasInfo = firstFeed.language || firstFeed.bias; + const hasInfo = firstFeed.language || firstFeed.bias + || (firstFeed.political_orientation && firstFeed.political_orientation !== 'na') + || (firstFeed.media_type && firstFeed.media_type !== 'sonstige') + || (firstFeed.reliability && firstFeed.reliability !== 'na') + || firstFeed.state_affiliated + || firstFeed.country_code + || (Array.isArray(firstFeed.alignments) && firstFeed.alignments.length > 0); if (hasInfo) { - const typeMap = { rss_feed: 'RSS-Feed', web_source: 'Web-Quelle', telegram_channel: 'Telegram-Kanal' }; + const typeMap = { rss_feed: 'RSS-Feed', web_source: 'Web-Quelle', telegram_channel: 'Telegram-Kanal', podcast_feed: 'Podcast' }; const lines = []; lines.push('Typ: ' + (typeMap[firstFeed.source_type] || firstFeed.source_type || 'Unbekannt')); if (firstFeed.language) lines.push('Sprache: ' + firstFeed.language); - if (firstFeed.bias) lines.push('Ausrichtung: ' + firstFeed.bias); + if (firstFeed.country_code) lines.push('Land: ' + firstFeed.country_code); + if (firstFeed.media_type && firstFeed.media_type !== 'sonstige') { + lines.push('Medientyp: ' + (this._mediaTypeLabels[firstFeed.media_type] || firstFeed.media_type)); + } + if (firstFeed.political_orientation && firstFeed.political_orientation !== 'na') { + const pl = this._politicalLabels[firstFeed.political_orientation]; + lines.push('Politisch: ' + (pl ? pl.full : firstFeed.political_orientation)); + } + if (firstFeed.reliability && firstFeed.reliability !== 'na') { + lines.push('Glaubwürdigkeit: ' + (this._reliabilityLabels[firstFeed.reliability] || firstFeed.reliability)); + } + if (firstFeed.state_affiliated) lines.push('Staatsnah: ja'); + if (Array.isArray(firstFeed.alignments) && firstFeed.alignments.length > 0) { + const labels = firstFeed.alignments.map(a => this._alignmentLabels[a] || a); + lines.push('Geopolitische Nähe: ' + labels.join(', ')); + } + if (firstFeed.bias) lines.push('Notiz: ' + firstFeed.bias); const tooltipText = this.escape(lines.join('\n')); infoButtonHtml = ` `; } + const classificationBadges = this._renderClassificationBadges(firstFeed); + return `
${toggleIcon} @@ -1138,6 +1241,7 @@ const UI = { ${this.escape(displayName)}${infoButtonHtml}
${catLabel} + ${classificationBadges ? `${classificationBadges}` : ''} ${feedCountBadge}
${!isGlobal && !hasMultiple && feeds[0]?.id ? `` : ''} From 62ba38ae46acc5db8df0eaa3e36d7e70b83e9948 Mon Sep 17 00:00:00 2001 From: Claude Code Date: Thu, 7 May 2026 18:46:54 +0000 Subject: [PATCH 04/15] feat(sources): LLM-Klassifikator + Review-API + Bulk-Migrationsskript - src/services/source_classifier.py: classify_source(db, id) ruft Haiku mit strukturiertem Prompt (4 Achsen + state_affiliated + country + Konfidenz) und schreibt Vorschlaege in proposed_*-Spalten. bulk_classify(db, limit) iteriert sequenziell ueber unklassifizierte Quellen. - API-Endpoints (alle hinter Auth, globale Quellen nur fuer org_admin): - GET /api/sources/classification/stats - GET /api/sources/classification/queue - POST /api/sources/{id}/classification/approve (proposed_* -> echte Felder) - POST /api/sources/{id}/classification/reject (proposed_* loeschen) - POST /api/sources/{id}/classification/reclassify (sofort, ~3-5s) - POST /api/sources/classification/bulk-classify (BackgroundTask) - scripts/migrate_sources_classification.py: CLI-Wrapper fuer Bulk-Migration zur einmaligen Erstbestueckung aller Bestandsquellen. Sample-Test auf Staging steht aus. Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/migrate_sources_classification.py | 64 +++++ src/routers/sources.py | 241 +++++++++++++++++- src/services/source_classifier.py | 295 ++++++++++++++++++++++ 3 files changed, 598 insertions(+), 2 deletions(-) create mode 100644 scripts/migrate_sources_classification.py create mode 100644 src/services/source_classifier.py diff --git a/scripts/migrate_sources_classification.py b/scripts/migrate_sources_classification.py new file mode 100644 index 0000000..3fab3fe --- /dev/null +++ b/scripts/migrate_sources_classification.py @@ -0,0 +1,64 @@ +"""Einmalige LLM-Klassifikation aller noch unklassifizierten Quellen. + +Verwendung: + python3 scripts/migrate_sources_classification.py --limit 50 + python3 scripts/migrate_sources_classification.py --limit 500 # Alle + python3 scripts/migrate_sources_classification.py --recheck-pending # bereits Pending neu + +Schreibt Vorschlaege in proposed_*-Spalten. Approval erfolgt anschliessend +ueber das Verwaltungs-UI / API (POST /api/sources/{id}/classification/approve). +""" +import argparse +import asyncio +import logging +import sys +from pathlib import Path + +# src/ in PYTHONPATH aufnehmen, wenn Skript direkt aufgerufen wird +HERE = Path(__file__).resolve().parent +SRC = HERE.parent / "src" +if str(SRC) not in sys.path: + sys.path.insert(0, str(SRC)) + +from database import get_db # noqa: E402 +from services.source_classifier import bulk_classify # noqa: E402 + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(name)s] %(levelname)s: %(message)s", +) +logger = logging.getLogger("migrate_sources") + + +async def main(): + parser = argparse.ArgumentParser(description="LLM-Klassifikation aller Quellen.") + parser.add_argument("--limit", type=int, default=50, help="Max. 
Quellen pro Lauf") + parser.add_argument( + "--recheck-pending", + action="store_true", + help="Auch Quellen mit classification_source='llm_pending' neu klassifizieren", + ) + args = parser.parse_args() + + db = await get_db() + try: + result = await bulk_classify( + db, + limit=args.limit, + only_unclassified=not args.recheck_pending, + ) + finally: + await db.close() + + print(f"Verarbeitet: {result['processed']}") + print(f"Erfolgreich: {result['success']}") + print(f"Fehler: {len(result['errors'])}") + print(f"Kosten: ${result['total_cost_usd']:.4f}") + if result["errors"]: + print("\nFehler-Details:") + for e in result["errors"][:10]: + print(f" source_id={e['source_id']}: {e['error']}") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/src/routers/sources.py b/src/routers/sources.py index 9adade2..9907e8d 100644 --- a/src/routers/sources.py +++ b/src/routers/sources.py @@ -1,10 +1,12 @@ """Sources-Router: Quellenverwaltung (Multi-Tenant).""" +import json import logging from collections import defaultdict -from fastapi import APIRouter, Depends, HTTPException, status +from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, status from models import SourceCreate, SourceUpdate, SourceResponse, DiscoverRequest, DiscoverResponse, DiscoverMultiResponse, DomainActionRequest from auth import get_current_user -from database import db_dependency, refresh_source_counts +from database import db_dependency, get_db, refresh_source_counts +from services.source_classifier import bulk_classify, classify_source from source_rules import discover_source, discover_all_feeds, evaluate_feeds_with_claude, _extract_domain, _detect_category, domain_to_display_name, _DOMAIN_ALIASES import aiosqlite @@ -700,3 +702,238 @@ async def trigger_refresh_counts( """Artikelzaehler fuer alle Quellen neu berechnen.""" await refresh_source_counts(db) return {"status": "ok"} + + +# === Klassifikations-Review (LLM-Vorschlaege approve/reject/reclassify) === + +def _require_admin_for_global(row: dict, current_user: dict): + """Globale Quellen (tenant_id IS NULL) duerfen nur org_admins approve-en/reclassify-en.""" + if row.get("tenant_id") is None and current_user.get("role") != "org_admin": + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, + detail="Globale Quellen koennen nur von Admins klassifiziert werden", + ) + + +@router.get("/classification/stats") +async def classification_stats( + current_user: dict = Depends(get_current_user), + db: aiosqlite.Connection = Depends(db_dependency), +): + """Counts pro classification_source-Wert (global + eigene Org).""" + tenant_id = current_user.get("tenant_id") + cursor = await db.execute( + """SELECT classification_source, COUNT(*) as cnt + FROM sources + WHERE (tenant_id IS NULL OR tenant_id = ?) AND status = 'active' + GROUP BY classification_source""", + (tenant_id,), + ) + by_source = {row["classification_source"] or "legacy": row["cnt"] for row in await cursor.fetchall()} + cursor = await db.execute( + """SELECT COUNT(*) as cnt FROM sources + WHERE (tenant_id IS NULL OR tenant_id = ?) 
AND status = 'active' + AND proposed_political_orientation IS NOT NULL""", + (tenant_id,), + ) + pending = (await cursor.fetchone())["cnt"] + return { + "by_classification_source": by_source, + "pending_review": pending, + "total": sum(by_source.values()), + } + + +@router.get("/classification/queue") +async def classification_queue( + limit: int = 50, + min_confidence: float = 0.0, + current_user: dict = Depends(get_current_user), + db: aiosqlite.Connection = Depends(db_dependency), +): + """Liefert Quellen mit nicht-leeren proposed_*-Spalten (Review-Queue).""" + tenant_id = current_user.get("tenant_id") + cursor = await db.execute( + """SELECT s.* FROM sources s + WHERE (s.tenant_id IS NULL OR s.tenant_id = ?) + AND s.proposed_political_orientation IS NOT NULL + AND COALESCE(s.proposed_confidence, 0) >= ? + ORDER BY s.proposed_confidence DESC, s.proposed_at DESC + LIMIT ?""", + (tenant_id, min_confidence, limit), + ) + rows = [dict(r) for r in await cursor.fetchall()] + alignments_map = await _load_alignments_for(db, [r["id"] for r in rows]) + out = [] + for d in rows: + try: + proposed_aligns = json.loads(d.get("proposed_alignments_json") or "[]") + except (json.JSONDecodeError, TypeError): + proposed_aligns = [] + out.append({ + "id": d["id"], + "name": d["name"], + "url": d.get("url"), + "domain": d.get("domain"), + "source_type": d.get("source_type"), + "category": d.get("category"), + "is_global": d.get("tenant_id") is None, + "current": { + "political_orientation": d.get("political_orientation"), + "media_type": d.get("media_type"), + "reliability": d.get("reliability"), + "state_affiliated": bool(d.get("state_affiliated")), + "country_code": d.get("country_code"), + "alignments": alignments_map.get(d["id"], []), + "classification_source": d.get("classification_source"), + }, + "proposed": { + "political_orientation": d.get("proposed_political_orientation"), + "media_type": d.get("proposed_media_type"), + "reliability": d.get("proposed_reliability"), + "state_affiliated": bool(d.get("proposed_state_affiliated")), + "country_code": d.get("proposed_country_code"), + "alignments": proposed_aligns, + "confidence": d.get("proposed_confidence"), + "reasoning": d.get("proposed_reasoning"), + "proposed_at": d.get("proposed_at"), + }, + }) + return out + + +async def _clear_proposed(db: aiosqlite.Connection, source_id: int): + """Loescht die proposed_*-Felder einer Quelle (ohne commit).""" + await db.execute( + """UPDATE sources SET + proposed_political_orientation = NULL, + proposed_media_type = NULL, + proposed_reliability = NULL, + proposed_state_affiliated = NULL, + proposed_country_code = NULL, + proposed_alignments_json = NULL, + proposed_confidence = NULL, + proposed_reasoning = NULL, + proposed_at = NULL + WHERE id = ?""", + (source_id,), + ) + + +@router.post("/{source_id}/classification/approve") +async def approve_classification( + source_id: int, + current_user: dict = Depends(get_current_user), + db: aiosqlite.Connection = Depends(db_dependency), +): + """Uebernimmt proposed_* in echte Felder, setzt classification_source='llm_approved'.""" + cursor = await db.execute("SELECT * FROM sources WHERE id = ?", (source_id,)) + row = await cursor.fetchone() + if not row: + raise HTTPException(status_code=404, detail="Quelle nicht gefunden") + src = dict(row) + _require_admin_for_global(src, current_user) + + if src.get("proposed_political_orientation") is None: + raise HTTPException(status_code=400, detail="Keine LLM-Vorschlaege fuer diese Quelle vorhanden") + + try: + proposed_aligns 
= json.loads(src.get("proposed_alignments_json") or "[]") + except (json.JSONDecodeError, TypeError): + proposed_aligns = [] + + await db.execute( + """UPDATE sources SET + political_orientation = ?, + media_type = ?, + reliability = ?, + state_affiliated = ?, + country_code = ?, + classification_source = 'llm_approved', + classified_at = CURRENT_TIMESTAMP + WHERE id = ?""", + ( + src["proposed_political_orientation"], + src["proposed_media_type"], + src["proposed_reliability"], + 1 if src.get("proposed_state_affiliated") else 0, + src.get("proposed_country_code"), + source_id, + ), + ) + await _replace_alignments(db, source_id, [a for a in proposed_aligns if a in ALLOWED_ALIGNMENTS]) + await _clear_proposed(db, source_id) + await db.commit() + return {"source_id": source_id, "status": "approved"} + + +@router.post("/{source_id}/classification/reject") +async def reject_classification( + source_id: int, + current_user: dict = Depends(get_current_user), + db: aiosqlite.Connection = Depends(db_dependency), +): + """Verwirft die LLM-Vorschlaege ohne Uebernahme. classification_source bleibt unveraendert.""" + cursor = await db.execute("SELECT * FROM sources WHERE id = ?", (source_id,)) + row = await cursor.fetchone() + if not row: + raise HTTPException(status_code=404, detail="Quelle nicht gefunden") + src = dict(row) + _require_admin_for_global(src, current_user) + + await _clear_proposed(db, source_id) + # Wenn classification_source noch 'llm_pending' war, zurueck auf 'legacy' + if src.get("classification_source") == "llm_pending": + await db.execute( + "UPDATE sources SET classification_source = 'legacy' WHERE id = ?", + (source_id,), + ) + await db.commit() + return {"source_id": source_id, "status": "rejected"} + + +@router.post("/{source_id}/classification/reclassify") +async def reclassify_source( + source_id: int, + current_user: dict = Depends(get_current_user), + db: aiosqlite.Connection = Depends(db_dependency), +): + """Triggert eine LLM-Klassifikation einer einzelnen Quelle (synchron, ~3-5s).""" + cursor = await db.execute("SELECT * FROM sources WHERE id = ?", (source_id,)) + row = await cursor.fetchone() + if not row: + raise HTTPException(status_code=404, detail="Quelle nicht gefunden") + src = dict(row) + _require_admin_for_global(src, current_user) + + try: + result = await classify_source(db, source_id) + except Exception as e: + logger.error("Reclassify source_id=%s fehlgeschlagen: %s", source_id, e, exc_info=True) + raise HTTPException(status_code=500, detail=f"Klassifikation fehlgeschlagen: {e}") + return result + + +async def _bulk_classify_background(limit: int, only_unclassified: bool): + """Hintergrund-Task: oeffnet eigene DB-Connection.""" + db = await get_db() + try: + await bulk_classify(db, limit=limit, only_unclassified=only_unclassified) + finally: + await db.close() + + +@router.post("/classification/bulk-classify") +async def trigger_bulk_classify( + background_tasks: BackgroundTasks, + limit: int = 50, + only_unclassified: bool = True, + current_user: dict = Depends(get_current_user), +): + """Startet eine Bulk-Klassifikation im Hintergrund (nur Admins).""" + if current_user.get("role") != "org_admin": + raise HTTPException(status_code=403, detail="Nur Admins koennen Bulk-Klassifikation starten") + if limit < 1 or limit > 500: + raise HTTPException(status_code=400, detail="limit muss zwischen 1 und 500 liegen") + background_tasks.add_task(_bulk_classify_background, limit, only_unclassified) + return {"status": "started", "limit": limit, "only_unclassified": 
only_unclassified} diff --git a/src/services/source_classifier.py b/src/services/source_classifier.py new file mode 100644 index 0000000..c965958 --- /dev/null +++ b/src/services/source_classifier.py @@ -0,0 +1,295 @@ +"""Klassifiziert Quellen via Claude (Haiku) nach 4 Achsen + state_affiliated + country. + +Schreibt Vorschlaege in die proposed_*-Spalten von sources und setzt +classification_source='llm_pending'. Approval erfolgt ueber separate Endpoints, +die proposed_* in die echten Spalten kopieren. +""" +import asyncio +import json +import logging +import re + +import aiosqlite + +from agents.claude_client import call_claude +from config import CLAUDE_MODEL_FAST + +logger = logging.getLogger("osint.source_classifier") + +POLITICAL_VALUES = { + "links_extrem", "links", "mitte_links", "liberal", "mitte", + "konservativ", "mitte_rechts", "rechts", "rechts_extrem", "na", +} +MEDIA_TYPE_VALUES = { + "tageszeitung", "wochenzeitung", "magazin", "tv_sender", "radio", + "oeffentlich_rechtlich", "nachrichtenagentur", "online_only", "blog", + "telegram_kanal", "telegram_bot", "podcast", "social_media", "imageboard", + "think_tank", "ngo", "behoerde", "staatsmedium", "fachmedium", "sonstige", +} +RELIABILITY_VALUES = {"sehr_hoch", "hoch", "gemischt", "niedrig", "sehr_niedrig", "na"} +ALIGNMENT_VALUES = { + "prorussisch", "proiranisch", "prowestlich", "proukrainisch", + "prochinesisch", "projapanisch", "proisraelisch", "propalaestinensisch", + "protuerkisch", "panarabisch", "neutral", "sonstige", +} + + +def _build_prompt(src: dict, sample_articles: list[dict]) -> str: + sample_text = "" + if sample_articles: + lines = [] + for i, art in enumerate(sample_articles[:5], 1): + headline = (art.get("headline") or art.get("headline_de") or "").strip() + if headline: + lines.append(f"{i}. {headline[:200]}") + if lines: + sample_text = "\nLetzte Artikel/Headlines:\n" + "\n".join(lines) + + return f"""Du bist ein OSINT-Analyst und klassifizierst Nachrichten- und Medienquellen fuer ein Lagebild-Monitoring-System (DACH-Raum). + +QUELLE: +Name: {src.get('name')} +URL: {src.get('url') or '-'} +Domain: {src.get('domain') or '-'} +Quellentyp: {src.get('source_type')} +Bisherige Kategorie: {src.get('category')} +Sprache: {src.get('language') or 'unbekannt'} +Bisherige Notiz (Freitext): {src.get('bias') or '-'}{sample_text} + +AUFGABE: Klassifiziere die Quelle nach folgenden Achsen. + +1. political_orientation: + - links_extrem (z.B. linksunten.indymedia) + - links (klar links, z.B. junge Welt, taz) + - mitte_links (linksliberal/sozialdemokratisch, z.B. SZ, Spiegel) + - liberal (wirtschafts-/grünliberal, z.B. NZZ, Zeit) + - mitte (politisch neutral, Agentur, z.B. dpa, Reuters, tagesschau) + - konservativ (buergerlich-konservativ, z.B. FAZ, Welt) + - mitte_rechts (rechts-buergerlich, z.B. Tichys Einblick, Achgut) + - rechts (klar rechts, z.B. Junge Freiheit, EpochTimes) + - rechts_extrem (z.B. Compact, PI-News) + - na (nicht klassifizierbar: Behoerde, Fachmedium, Think Tank ohne klare politische Linie) + +2. media_type (genau einer): + tageszeitung, wochenzeitung, magazin, tv_sender, radio, oeffentlich_rechtlich, + nachrichtenagentur, online_only, blog, telegram_kanal, telegram_bot, podcast, + social_media, imageboard, think_tank, ngo, behoerde, staatsmedium, fachmedium, sonstige + +3. 
reliability: + - sehr_hoch (etablierte Qualitaet, Faktencheck: tagesschau, dpa, FAZ, Reuters) + - hoch (serioes mit gelegentlichen Schwaechen: taz, Welt, BILD bei harten News) + - gemischt (Mix Meinung/Einseitigkeit: Tichys Einblick, Achgut, Boulevard) + - niedrig (haeufig irrefuehrend, schwache Quellenarbeit: Junge Freiheit, EpochTimes) + - sehr_niedrig (bekannt fuer Desinformation/Verschwoerung: Compact, RT, Sputnik, PI-News) + - na (nicht bewertbar) + +4. alignments (Mehrfach, leeres Array wenn keine ausgepraegte Naehe): + prorussisch, proiranisch, prowestlich, proukrainisch, prochinesisch, projapanisch, + proisraelisch, propalaestinensisch, protuerkisch, panarabisch, neutral, sonstige + +5. state_affiliated (true/false): true wenn vom Staat finanziert/kontrolliert + (RT, Sputnik, CGTN, PressTV, Xinhua, TRT). Public Service Broadcaster + wie ARD/ZDF/BBC sind NICHT state_affiliated. + +6. country_code (ISO 3166-1 alpha-2): Heimatland (DE, AT, CH, RU, US, ...). null wenn unklar. + +7. confidence (0.0-1.0): 0.85+ fuer bekannte Outlets, 0.5-0.85 fuer mittelbekannt, <0.5 fuer unsicher. + +8. reasoning (1-2 Saetze): Kurze Begruendung der Hauptklassifikationen. + +WICHTIG: +- Antworte AUSSCHLIESSLICH mit einem JSON-Objekt, kein Text drumherum. +- Nutze ausschliesslich die genannten enum-Werte (snake_case). +- Bei Unklarheit lieber `na` und niedrige confidence. + +JSON-Schema: +{{ + "political_orientation": "...", + "media_type": "...", + "reliability": "...", + "alignments": ["..."], + "state_affiliated": false, + "country_code": "DE", + "confidence": 0.9, + "reasoning": "..." +}}""" + + +async def _load_sample_articles(db: aiosqlite.Connection, name: str, domain: str | None, limit: int = 5) -> list[dict]: + """Laedt die letzten Headlines einer Quelle (per name oder Domain-Match).""" + rows: list = [] + if name: + cursor = await db.execute( + "SELECT headline, headline_de FROM articles WHERE source = ? ORDER BY collected_at DESC LIMIT ?", + (name, limit), + ) + rows = await cursor.fetchall() + if not rows and domain: + cursor = await db.execute( + "SELECT headline, headline_de FROM articles WHERE source_url LIKE ? 
ORDER BY collected_at DESC LIMIT ?", + (f"%{domain}%", limit), + ) + rows = await cursor.fetchall() + return [dict(r) for r in rows] + + +def _validate(parsed: dict) -> dict: + """Validiert + normalisiert eine LLM-Antwort gegen die Enums.""" + pol = parsed.get("political_orientation", "na") + if pol not in POLITICAL_VALUES: + pol = "na" + mt = parsed.get("media_type", "sonstige") + if mt not in MEDIA_TYPE_VALUES: + mt = "sonstige" + rel = parsed.get("reliability", "na") + if rel not in RELIABILITY_VALUES: + rel = "na" + aligns_raw = parsed.get("alignments") or [] + if not isinstance(aligns_raw, list): + aligns_raw = [] + aligns = sorted({a for a in aligns_raw if isinstance(a, str) and a in ALIGNMENT_VALUES}) + sa = bool(parsed.get("state_affiliated", False)) + cc = parsed.get("country_code") + if isinstance(cc, str) and len(cc) == 2 and cc.isalpha(): + cc = cc.upper() + else: + cc = None + try: + confidence = float(parsed.get("confidence", 0.5)) + confidence = max(0.0, min(1.0, confidence)) + except (TypeError, ValueError): + confidence = 0.5 + reasoning = str(parsed.get("reasoning", ""))[:1000] + return { + "political_orientation": pol, + "media_type": mt, + "reliability": rel, + "alignments": aligns, + "state_affiliated": sa, + "country_code": cc, + "confidence": confidence, + "reasoning": reasoning, + } + + +async def classify_source( + db: aiosqlite.Connection, + source_id: int, + sample_limit: int = 5, + model: str = CLAUDE_MODEL_FAST, +) -> dict: + """Klassifiziert eine einzelne Quelle und schreibt die Vorschlaege in proposed_*-Spalten.""" + cursor = await db.execute( + "SELECT id, name, url, domain, source_type, category, language, bias, " + "classification_source FROM sources WHERE id = ?", + (source_id,), + ) + row = await cursor.fetchone() + if not row: + raise ValueError(f"Quelle {source_id} nicht gefunden") + src = dict(row) + + sample = await _load_sample_articles(db, src["name"], src.get("domain"), sample_limit) + prompt = _build_prompt(src, sample) + response, usage = await call_claude(prompt, tools=None, model=model) + + json_match = re.search(r"\{.*\}", response, re.DOTALL) + if not json_match: + raise ValueError(f"Keine JSON-Antwort von Claude fuer source_id={source_id}: {response[:200]}") + parsed = json.loads(json_match.group(0)) + result = _validate(parsed) + + # Nur classification_source auf 'llm_pending' setzen, wenn nicht bereits manuell/approved + new_src = "CASE WHEN classification_source IN ('manual','llm_approved') THEN classification_source ELSE 'llm_pending' END" + await db.execute( + f"""UPDATE sources SET + proposed_political_orientation = ?, + proposed_media_type = ?, + proposed_reliability = ?, + proposed_state_affiliated = ?, + proposed_country_code = ?, + proposed_alignments_json = ?, + proposed_confidence = ?, + proposed_reasoning = ?, + proposed_at = CURRENT_TIMESTAMP, + classification_source = {new_src} + WHERE id = ?""", + ( + result["political_orientation"], + result["media_type"], + result["reliability"], + 1 if result["state_affiliated"] else 0, + result["country_code"], + json.dumps(result["alignments"], ensure_ascii=False), + result["confidence"], + result["reasoning"], + source_id, + ), + ) + await db.commit() + + logger.info( + "Klassifiziert source_id=%s '%s' -> %s/%s/%s conf=%.2f ($%.4f)", + source_id, src["name"], result["political_orientation"], + result["media_type"], result["reliability"], result["confidence"], + usage.cost_usd, + ) + + result["source_id"] = source_id + result["usage"] = { + "cost_usd": usage.cost_usd, + "input_tokens": 
usage.input_tokens, + "output_tokens": usage.output_tokens, + } + return result + + +async def bulk_classify( + db: aiosqlite.Connection, + limit: int = 50, + only_unclassified: bool = True, + model: str = CLAUDE_MODEL_FAST, +) -> dict: + """Klassifiziert noch unklassifizierte Quellen (sequenziell). + + Args: + limit: Maximale Anzahl Quellen pro Aufruf + only_unclassified: Wenn True, nur classification_source='legacy'. + Wenn False, auch 'llm_pending' neu klassifizieren. + """ + if only_unclassified: + where = "classification_source = 'legacy'" + else: + where = "classification_source IN ('legacy', 'llm_pending')" + cursor = await db.execute( + f"SELECT id FROM sources WHERE {where} AND status = 'active' " + f"AND source_type != 'excluded' ORDER BY id LIMIT ?", + (limit,), + ) + ids = [row["id"] for row in await cursor.fetchall()] + + total_cost = 0.0 + success = 0 + errors: list[dict] = [] + + for sid in ids: + try: + r = await classify_source(db, sid, model=model) + total_cost += r["usage"]["cost_usd"] + success += 1 + except asyncio.CancelledError: + raise + except Exception as e: + logger.error("Klassifikation source_id=%s fehlgeschlagen: %s", sid, e, exc_info=True) + errors.append({"source_id": sid, "error": str(e)}) + + logger.info( + "Bulk-Klassifikation fertig: %d/%d erfolgreich, $%.4f Kosten, %d Fehler", + success, len(ids), total_cost, len(errors), + ) + return { + "processed": len(ids), + "success": success, + "errors": errors, + "total_cost_usd": total_cost, + } From 48a60d7579a32c9eb74e2d97a34ad69671684603 Mon Sep 17 00:00:00 2001 From: Claude Code Date: Thu, 7 May 2026 19:00:47 +0000 Subject: [PATCH 05/15] feat(sources): Review-Queue-UI fuer LLM-Klassifikations-Vorschlaege (Admin) - Tab-Schalter im Quellen-Modal: "Quellenliste" vs. "Klassifikations-Review" (Review-Tab nur fuer org_admin sichtbar, mit Pending-Counter-Badge). - Review-Karten zeigen Diff aktueller Wert -> LLM-Vorschlag pro Achse, Konfidenz-Indikator (gruen/gelb/rot), LLM-Begruendung, Buttons fuer Uebernehmen / Verwerfen / Neu klassifizieren. - Toolbar: Konfidenz-Filter, "Klassifikation starten" (Bulk im Hintergrund), "Alle >= 0.85 genehmigen" (Bulk-Approve). - API-Wrapper in api.js fuer alle 6 neuen Endpoints + erweiterte listSources-Filter. - Backend-Endpoint POST /api/sources/classification/bulk-approve (Admin-only). Co-Authored-By: Claude Opus 4.7 (1M context) --- src/routers/sources.py | 59 +++++++++++ src/static/css/style.css | 198 ++++++++++++++++++++++++++++++++++++ src/static/dashboard.html | 38 +++++++ src/static/js/api.js | 33 ++++++ src/static/js/app.js | 112 ++++++++++++++++++++ src/static/js/components.js | 65 ++++++++++++ 6 files changed, 505 insertions(+) diff --git a/src/routers/sources.py b/src/routers/sources.py index 9907e8d..25a898f 100644 --- a/src/routers/sources.py +++ b/src/routers/sources.py @@ -937,3 +937,62 @@ async def trigger_bulk_classify( raise HTTPException(status_code=400, detail="limit muss zwischen 1 und 500 liegen") background_tasks.add_task(_bulk_classify_background, limit, only_unclassified) return {"status": "started", "limit": limit, "only_unclassified": only_unclassified} + + +@router.post("/classification/bulk-approve") +async def bulk_approve_classifications( + min_confidence: float = 0.85, + current_user: dict = Depends(get_current_user), + db: aiosqlite.Connection = Depends(db_dependency), +): + """Genehmigt alle Pending-Vorschlaege ueber dem confidence-Schwellwert (nur Admins). 
+ + Globale Quellen werden nur bearbeitet, wenn der Aufrufer org_admin ist; + Tenant-eigene Quellen sowieso. + """ + if current_user.get("role") != "org_admin": + raise HTTPException(status_code=403, detail="Nur Admins koennen Bulk-Approve nutzen") + tenant_id = current_user.get("tenant_id") + cursor = await db.execute( + """SELECT id, proposed_political_orientation, proposed_media_type, + proposed_reliability, proposed_state_affiliated, + proposed_country_code, proposed_alignments_json, tenant_id + FROM sources + WHERE proposed_political_orientation IS NOT NULL + AND COALESCE(proposed_confidence, 0) >= ? + AND (tenant_id IS NULL OR tenant_id = ?)""", + (min_confidence, tenant_id), + ) + rows = [dict(r) for r in await cursor.fetchall()] + approved_ids: list[int] = [] + for src in rows: + try: + proposed_aligns = json.loads(src.get("proposed_alignments_json") or "[]") + except (json.JSONDecodeError, TypeError): + proposed_aligns = [] + await db.execute( + """UPDATE sources SET + political_orientation = ?, + media_type = ?, + reliability = ?, + state_affiliated = ?, + country_code = ?, + classification_source = 'llm_approved', + classified_at = CURRENT_TIMESTAMP + WHERE id = ?""", + ( + src["proposed_political_orientation"], + src["proposed_media_type"], + src["proposed_reliability"], + 1 if src.get("proposed_state_affiliated") else 0, + src.get("proposed_country_code"), + src["id"], + ), + ) + await _replace_alignments( + db, src["id"], [a for a in proposed_aligns if a in ALLOWED_ALIGNMENTS] + ) + await _clear_proposed(db, src["id"]) + approved_ids.append(src["id"]) + await db.commit() + return {"approved_count": len(approved_ids), "min_confidence": min_confidence} diff --git a/src/static/css/style.css b/src/static/css/style.css index 3bc671c..777d490 100644 --- a/src/static/css/style.css +++ b/src/static/css/style.css @@ -3503,6 +3503,204 @@ a.dev-source-pill:hover { color: var(--info); } +/* Sources-Modal: Tabs */ +.sources-tabs { + display: flex; + gap: 2px; + border-bottom: 1px solid var(--border-color, rgba(0,0,0,0.1)); + margin-bottom: 12px; +} +.sources-tab { + background: transparent; + border: none; + padding: 8px 16px; + font-size: 13px; + font-weight: 500; + color: var(--text-secondary, #555); + cursor: pointer; + border-bottom: 2px solid transparent; + margin-bottom: -1px; + display: inline-flex; + align-items: center; + gap: 8px; +} +.sources-tab:hover { + color: var(--text-primary, #222); +} +.sources-tab.active { + color: var(--primary, #2a81cb); + border-bottom-color: var(--primary, #2a81cb); +} +.sources-tab-badge { + display: inline-flex; + align-items: center; + justify-content: center; + min-width: 20px; + padding: 0 6px; + height: 18px; + border-radius: 9px; + background: var(--primary, #2a81cb); + color: #fff; + font-size: 10px; + font-weight: 700; +} + +/* Review-Queue */ +.review-toolbar { + display: flex; + align-items: center; + justify-content: space-between; + padding: 8px 12px; + background: var(--cat-sonstige-bg, #f6f6fa); + border-radius: var(--radius); + margin-bottom: 12px; + flex-wrap: wrap; + gap: 12px; +} +.review-toolbar-info { + display: flex; + align-items: center; + gap: 16px; + font-size: 13px; +} +.review-conf-filter { + display: inline-flex; + align-items: center; + gap: 6px; + font-size: 12px; + color: var(--text-secondary, #555); +} +.review-conf-filter select { + padding: 2px 6px; + font-size: 12px; + border-radius: var(--radius); + border: 1px solid var(--border-color, rgba(0,0,0,0.15)); +} +.review-toolbar-actions { + display: flex; + gap: 
6px; +} + +.review-list { + display: flex; + flex-direction: column; + gap: 8px; +} +.review-card { + background: var(--surface, #fff); + border: 1px solid var(--border-color, rgba(0,0,0,0.08)); + border-radius: var(--radius); + padding: 12px 14px; +} +.review-card-header { + display: flex; + justify-content: space-between; + align-items: flex-start; + gap: 12px; + margin-bottom: 10px; +} +.review-card-title { + display: flex; + flex-wrap: wrap; + align-items: center; + gap: 8px; +} +.review-card-name { + font-weight: 600; + font-size: 14px; +} +.review-card-domain { + font-size: 11px; + color: var(--text-disabled, #888); +} +.review-global-badge { + display: inline-flex; + align-items: center; + padding: 1px 6px; + border-radius: var(--radius); + background: #5e35b1; + color: #fff; + font-size: 9px; + font-weight: 600; + letter-spacing: 0.3px; + text-transform: uppercase; +} +.review-card-confidence { + display: inline-flex; + flex-direction: column; + align-items: center; + padding: 4px 10px; + border-radius: var(--radius); + min-width: 60px; +} +.review-card-confidence .conf-value { + font-size: 14px; + font-weight: 700; +} +.review-card-confidence .conf-label { + font-size: 9px; + text-transform: uppercase; + letter-spacing: 0.3px; + opacity: 0.8; +} +.review-card-confidence.conf-high { background: #e8f5e9; color: #2e7d32; } +.review-card-confidence.conf-medium { background: #fff8e1; color: #ef6c00; } +.review-card-confidence.conf-low { background: #ffebee; color: #c62828; } + +.review-card-diff { + display: grid; + grid-template-columns: 1fr; + gap: 4px; + font-size: 12px; + margin-bottom: 10px; +} +.review-diff-row { + display: grid; + grid-template-columns: 110px 1fr 24px 1fr; + align-items: center; + gap: 8px; + padding: 3px 6px; + border-radius: 3px; +} +.review-diff-row.changed { + background: #fff8e1; +} +.review-diff-label { + color: var(--text-secondary, #555); + font-weight: 500; +} +.review-diff-current { + color: var(--text-disabled, #888); +} +.review-diff-arrow { + text-align: center; + color: var(--text-disabled, #888); + font-weight: 600; +} +.review-diff-proposed { + color: var(--text-primary, #222); + font-weight: 500; +} +.review-diff-row.changed .review-diff-proposed { + color: #ef6c00; + font-weight: 600; +} + +.review-card-reasoning { + font-size: 12px; + color: var(--text-secondary, #555); + background: var(--cat-sonstige-bg, #f6f6fa); + padding: 8px 10px; + border-radius: var(--radius); + margin-bottom: 10px; + line-height: 1.5; +} +.review-card-actions { + display: flex; + gap: 6px; + flex-wrap: wrap; +} + /* Klassifikations-Badges (politisch / reliability / alignments / state) */ .source-classification-badges { display: inline-flex; diff --git a/src/static/dashboard.html b/src/static/dashboard.html index 43a81dd..8e73d59 100644 --- a/src/static/dashboard.html +++ b/src/static/dashboard.html @@ -456,6 +456,15 @@
+ +
+ + +
+ + +
+
@@ -706,6 +715,35 @@
Lade Quellen...
+ +
+ + + +
diff --git a/src/static/js/api.js b/src/static/js/api.js index 310476d..b2b1fd9 100644 --- a/src/static/js/api.js +++ b/src/static/js/api.js @@ -198,10 +198,43 @@ const API = { if (params.source_type) query.set('source_type', params.source_type); if (params.category) query.set('category', params.category); if (params.source_status) query.set('source_status', params.source_status); + if (params.political_orientation) query.set('political_orientation', params.political_orientation); + if (params.media_type) query.set('media_type', params.media_type); + if (params.reliability) query.set('reliability', params.reliability); + if (params.alignment) query.set('alignment', params.alignment); + if (params.state_affiliated !== undefined && params.state_affiliated !== null) { + query.set('state_affiliated', String(params.state_affiliated)); + } const qs = query.toString(); return this._request('GET', `/sources${qs ? '?' + qs : ''}`); }, + // Sources: Klassifikations-Review (LLM) + getClassificationStats() { + return this._request('GET', '/sources/classification/stats'); + }, + getClassificationQueue(limit = 50, minConfidence = 0.0) { + const qs = new URLSearchParams({ limit: String(limit), min_confidence: String(minConfidence) }).toString(); + return this._request('GET', `/sources/classification/queue?${qs}`); + }, + approveClassification(id) { + return this._request('POST', `/sources/${id}/classification/approve`); + }, + rejectClassification(id) { + return this._request('POST', `/sources/${id}/classification/reject`); + }, + reclassifySource(id) { + return this._request('POST', `/sources/${id}/classification/reclassify`); + }, + triggerBulkClassify(limit = 50, onlyUnclassified = true) { + const qs = new URLSearchParams({ limit: String(limit), only_unclassified: String(onlyUnclassified) }).toString(); + return this._request('POST', `/sources/classification/bulk-classify?${qs}`); + }, + bulkApproveClassifications(minConfidence = 0.85) { + const qs = new URLSearchParams({ min_confidence: String(minConfidence) }).toString(); + return this._request('POST', `/sources/classification/bulk-approve?${qs}`); + }, + createSource(data) { return this._request('POST', '/sources', data); }, diff --git a/src/static/js/app.js b/src/static/js/app.js index 1aff794..1f8d0b4 100644 --- a/src/static/js/app.js +++ b/src/static/js/app.js @@ -2702,6 +2702,12 @@ async handleRefresh() { async openSourceManagement() { openModal('modal-sources'); await this.loadSources(); + // Admin sieht den Review-Tab + const reviewTab = document.getElementById('sources-tab-review'); + if (reviewTab && this.user && this.user.role === 'org_admin') { + reviewTab.style.display = ''; + this._refreshReviewBadge().catch(() => {}); + } }, async loadSources() { @@ -2722,6 +2728,112 @@ async handleRefresh() { } }, + async _refreshReviewBadge() { + try { + const stats = await API.getClassificationStats(); + const badge = document.getElementById('sources-review-count'); + if (badge) badge.textContent = String(stats.pending_review || 0); + } catch (_) { /* still ok */ } + }, + + switchSourcesTab(tab) { + const listView = document.getElementById('sources-list-view'); + const reviewView = document.getElementById('sources-review-view'); + const tabList = document.getElementById('sources-tab-list'); + const tabReview = document.getElementById('sources-tab-review'); + if (!listView || !reviewView) return; + if (tab === 'review') { + listView.style.display = 'none'; + reviewView.style.display = ''; + if (tabList) { tabList.classList.remove('active'); 
tabList.setAttribute('aria-selected', 'false'); } + if (tabReview) { tabReview.classList.add('active'); tabReview.setAttribute('aria-selected', 'true'); } + this.loadClassificationQueue(); + } else { + listView.style.display = ''; + reviewView.style.display = 'none'; + if (tabList) { tabList.classList.add('active'); tabList.setAttribute('aria-selected', 'true'); } + if (tabReview) { tabReview.classList.remove('active'); tabReview.setAttribute('aria-selected', 'false'); } + } + }, + + async loadClassificationQueue() { + const list = document.getElementById('sources-review-list'); + if (!list) return; + const minConf = parseFloat(document.getElementById('review-min-confidence')?.value || '0'); + list.innerHTML = '
Lade...
'; + try { + const items = await API.getClassificationQueue(200, minConf); + this._reviewItems = items; + const countEl = document.getElementById('review-pending-count'); + if (countEl) countEl.textContent = String(items.length); + if (items.length === 0) { + list.innerHTML = '
Keine ausstehenden Vorschlaege.
'; + return; + } + list.innerHTML = items.map(item => UI.renderClassificationQueueItem(item)).join(''); + } catch (err) { + list.innerHTML = `
Fehler: ${err.message}
`; + } + }, + + async approveClassification(id) { + try { + await API.approveClassification(id); + UI.showToast('Klassifikation uebernommen.', 'success'); + await this.loadClassificationQueue(); + this._refreshReviewBadge(); + } catch (err) { + UI.showToast('Approve fehlgeschlagen: ' + err.message, 'error'); + } + }, + + async rejectClassification(id) { + try { + await API.rejectClassification(id); + UI.showToast('Vorschlag verworfen.', 'success'); + await this.loadClassificationQueue(); + this._refreshReviewBadge(); + } catch (err) { + UI.showToast('Reject fehlgeschlagen: ' + err.message, 'error'); + } + }, + + async reclassifySource(id) { + const btn = document.querySelector(`[data-reclassify-id="${id}"]`); + if (btn) { btn.disabled = true; btn.textContent = '...'; } + try { + await API.reclassifySource(id); + UI.showToast('Neu klassifiziert.', 'success'); + await this.loadClassificationQueue(); + } catch (err) { + UI.showToast('Reclassify fehlgeschlagen: ' + err.message, 'error'); + } finally { + if (btn) { btn.disabled = false; btn.textContent = 'Neu klassifizieren'; } + } + }, + + async triggerBulkClassify() { + if (!confirm('Bulk-Klassifikation aller noch nicht klassifizierten Quellen starten? Lauft im Hintergrund (~3-5 Sek pro Quelle, ~0.02 USD pro Quelle).')) return; + try { + const r = await API.triggerBulkClassify(500, true); + UI.showToast(`Bulk-Klassifikation gestartet (limit=${r.limit}). Nachschauen mit Reload.`, 'info'); + } catch (err) { + UI.showToast('Start fehlgeschlagen: ' + err.message, 'error'); + } + }, + + async bulkApproveHighConfidence() { + if (!confirm('Alle Vorschlaege mit Konfidenz >= 0.85 genehmigen?')) return; + try { + const r = await API.bulkApproveClassifications(0.85); + UI.showToast(`${r.approved_count} Vorschlaege uebernommen.`, 'success'); + await this.loadClassificationQueue(); + this._refreshReviewBadge(); + } catch (err) { + UI.showToast('Bulk-Approve fehlgeschlagen: ' + err.message, 'error'); + } + }, + renderSourceStats(stats) { const bar = document.getElementById('sources-stats-bar'); if (!bar) return; diff --git a/src/static/js/components.js b/src/static/js/components.js index d0a2cd8..338802e 100644 --- a/src/static/js/components.js +++ b/src/static/js/components.js @@ -1119,6 +1119,71 @@ const UI = { sonstige: 'sonstige', }, + /** + * Eintrag in der Klassifikations-Review-Queue. + * Zeigt Diff zwischen aktuellem Wert und LLM-Vorschlag. + */ + renderClassificationQueueItem(item) { + const cur = item.current || {}; + const prop = item.proposed || {}; + const conf = prop.confidence || 0; + const confPct = Math.round(conf * 100); + const confClass = conf >= 0.85 ? 'high' : (conf >= 0.7 ? 'medium' : 'low'); + + const diffRow = (label, currentVal, proposedVal, formatter) => { + const fmt = formatter || (v => v == null || v === '' ? '–' : String(v)); + const c = fmt(currentVal); + const p = fmt(proposedVal); + const changed = c !== p; + return `
+ ${this.escape(label)} + ${this.escape(c)} + + ${this.escape(p)} +
`; + }; + + const polFmt = v => (v && v !== 'na') ? (this._politicalLabels[v]?.full || v) : '–'; + const mtFmt = v => (v && v !== 'sonstige') ? (this._mediaTypeLabels[v] || v) : (v === 'sonstige' ? 'Sonstige' : '–'); + const relFmt = v => (v && v !== 'na') ? (this._reliabilityLabels[v] || v) : '–'; + const stateFmt = v => v ? 'ja' : 'nein'; + const ccFmt = v => v || '–'; + const alignFmt = v => (Array.isArray(v) && v.length > 0) + ? v.map(a => this._alignmentLabels[a] || a).join(', ') + : '–'; + + const globalBadge = item.is_global ? 'Grundquelle' : ''; + const reasoning = prop.reasoning ? this.escape(prop.reasoning) : ''; + + return `
+
+
+ ${this.escape(item.name)} + ${globalBadge} + ${this.escape(item.domain || '')} +
+
+ ${confPct}% + Konfidenz +
+
+
+ ${diffRow('Politik', cur.political_orientation, prop.political_orientation, polFmt)} + ${diffRow('Medientyp', cur.media_type, prop.media_type, mtFmt)} + ${diffRow('Glaubwürdigkeit', cur.reliability, prop.reliability, relFmt)} + ${diffRow('Staatsnah', cur.state_affiliated, prop.state_affiliated, stateFmt)} + ${diffRow('Land', cur.country_code, prop.country_code, ccFmt)} + ${diffRow('Geopol. Nähe', cur.alignments, prop.alignments, alignFmt)} +
+ ${reasoning ? `
Begründung: ${reasoning}
` : ''} +
+ + + +
+
`; + }, + _renderClassificationBadges(feed) { const parts = []; const pol = feed.political_orientation; From 5fc246755975368c2c0b66d4bd61268d6196bce9 Mon Sep 17 00:00:00 2001 From: Claude Code Date: Thu, 7 May 2026 19:40:30 +0000 Subject: [PATCH 06/15] feat(sources): externer Reputations-Layer (IFCN + EUvsDisinfo) Externe Datenquellen (kostenlos, Open Data) ergaenzen die LLM-geschaetzte Reliability-Achse mit objektiven Signalen: - IFCN-Signatories (raw.githubusercontent.com/IFCN/verified-signatories): Plain-Text-Liste anerkannter Faktencheck-Organisationen. - EUvsDisinfo (Zenodo CSV): Pro-Kreml-Desinformations-Datenbank. Schema-Erweiterung: - ifcn_signatory, eu_disinfo_listed, eu_disinfo_case_count, eu_disinfo_last_seen, external_data_synced_at. Service src/services/external_reputation.py: - sync_ifcn_signatories(), sync_eu_disinfo(), apply_reputation_overrides(), sync_all() mit Domain-Normalisierung (lowercase, ohne www., ohne Schema). Reliability-Override-Regeln (laufen nach Approve und manuellem Sync): - ifcn_signatory=1 -> reliability=sehr_hoch - eu_disinfo_case_count >= 5 -> reliability=sehr_niedrig - eu_disinfo_case_count >= 1 -> Reliability eine Stufe runter (max niedrig) API: POST /api/sources/external-reputation/sync (Admin, BackgroundTask). Filter: ?ifcn_signatory=true, ?eu_disinfo_listed=true. UI: - Filter-Dropdown "Externe Reputation" im Quellen-Modal. - Badges: gruenes "IFCN" und rotes "EU-Desinfo (n)". - Tooltip macht Reliability-Quelle transparent: "(IFCN-Faktenchecker)", "(EU-Desinfo, n Faelle)" oder "(LLM-Schaetzung)". - "Externe Daten syncen"-Button im Review-Toolbar (Admin-only). Co-Authored-By: Claude Opus 4.7 (1M context) --- src/database.py | 21 ++- src/models.py | 5 + src/routers/sources.py | 42 +++++ src/services/external_reputation.py | 268 ++++++++++++++++++++++++++++ src/static/css/style.css | 26 +++ src/static/dashboard.html | 7 + src/static/js/api.js | 3 + src/static/js/app.js | 16 ++ src/static/js/components.js | 25 ++- 9 files changed, 410 insertions(+), 3 deletions(-) create mode 100644 src/services/external_reputation.py diff --git a/src/database.py b/src/database.py index 54d6b7e..b8d9366 100644 --- a/src/database.py +++ b/src/database.py @@ -176,7 +176,12 @@ CREATE TABLE IF NOT EXISTS sources ( proposed_alignments_json TEXT, proposed_confidence REAL, proposed_reasoning TEXT, - proposed_at TIMESTAMP + proposed_at TIMESTAMP, + eu_disinfo_listed INTEGER DEFAULT 0, + eu_disinfo_case_count INTEGER DEFAULT 0, + eu_disinfo_last_seen TIMESTAMP, + ifcn_signatory INTEGER DEFAULT 0, + external_data_synced_at TIMESTAMP ); CREATE TABLE IF NOT EXISTS source_alignments ( @@ -668,6 +673,20 @@ async def init_db(): if any(c not in src_columns for c in ("political_orientation", "media_type", "reliability")): logger.info("Migration: Klassifikations-Spalten zu sources hinzugefuegt") + # Migration: externe Reputations-Daten (EUvsDisinfo + IFCN) + for col, ddl in [ + ("eu_disinfo_listed", "ALTER TABLE sources ADD COLUMN eu_disinfo_listed INTEGER DEFAULT 0"), + ("eu_disinfo_case_count", "ALTER TABLE sources ADD COLUMN eu_disinfo_case_count INTEGER DEFAULT 0"), + ("eu_disinfo_last_seen", "ALTER TABLE sources ADD COLUMN eu_disinfo_last_seen TIMESTAMP"), + ("ifcn_signatory", "ALTER TABLE sources ADD COLUMN ifcn_signatory INTEGER DEFAULT 0"), + ("external_data_synced_at", "ALTER TABLE sources ADD COLUMN external_data_synced_at TIMESTAMP"), + ]: + if col not in src_columns: + await db.execute(ddl) + await db.commit() + if any(c not in src_columns for c in ("eu_disinfo_listed", 
"ifcn_signatory")): + logger.info("Migration: externe Reputations-Spalten zu sources hinzugefuegt") + # Migration: source_alignments-Tabelle (Mehrfach-Tags fuer geopolitische Naehe) cursor = await db.execute( "SELECT name FROM sqlite_master WHERE type='table' AND name='source_alignments'" diff --git a/src/models.py b/src/models.py index 32d3bb7..7682310 100644 --- a/src/models.py +++ b/src/models.py @@ -210,6 +210,11 @@ class SourceResponse(BaseModel): classified_at: Optional[str] = None alignments: list[str] = [] is_global: bool = False + ifcn_signatory: bool = False + eu_disinfo_listed: bool = False + eu_disinfo_case_count: int = 0 + eu_disinfo_last_seen: Optional[str] = None + external_data_synced_at: Optional[str] = None # Source Discovery diff --git a/src/routers/sources.py b/src/routers/sources.py index 25a898f..e0f2014 100644 --- a/src/routers/sources.py +++ b/src/routers/sources.py @@ -6,6 +6,7 @@ from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, status from models import SourceCreate, SourceUpdate, SourceResponse, DiscoverRequest, DiscoverResponse, DiscoverMultiResponse, DomainActionRequest from auth import get_current_user from database import db_dependency, get_db, refresh_source_counts +from services.external_reputation import apply_reputation_overrides, sync_all as sync_external_reputation from services.source_classifier import bulk_classify, classify_source from source_rules import discover_source, discover_all_feeds, evaluate_feeds_with_claude, _extract_domain, _detect_category, domain_to_display_name, _DOMAIN_ALIASES import aiosqlite @@ -90,6 +91,8 @@ async def list_sources( reliability: str = None, state_affiliated: bool = None, alignment: str = None, + ifcn_signatory: bool = None, + eu_disinfo_listed: bool = None, current_user: dict = Depends(get_current_user), db: aiosqlite.Connection = Depends(db_dependency), ): @@ -124,6 +127,12 @@ async def list_sources( if alignment: query += " AND EXISTS (SELECT 1 FROM source_alignments sa WHERE sa.source_id = s.id AND sa.alignment = ?)" params.append(alignment.lower()) + if ifcn_signatory is not None: + query += " AND s.ifcn_signatory = ?" + params.append(1 if ifcn_signatory else 0) + if eu_disinfo_listed is not None: + query += " AND s.eu_disinfo_listed = ?" 
+ params.append(1 if eu_disinfo_listed else 0) query += " ORDER BY s.source_type, s.category, s.name" cursor = await db.execute(query, params) @@ -133,6 +142,8 @@ async def list_sources( for d in results: d["is_global"] = d.get("tenant_id") is None d["state_affiliated"] = bool(d.get("state_affiliated")) + d["ifcn_signatory"] = bool(d.get("ifcn_signatory")) + d["eu_disinfo_listed"] = bool(d.get("eu_disinfo_listed")) d["alignments"] = alignments_map.get(d["id"], []) return results @@ -864,6 +875,11 @@ async def approve_classification( await _replace_alignments(db, source_id, [a for a in proposed_aligns if a in ALLOWED_ALIGNMENTS]) await _clear_proposed(db, source_id) await db.commit() + # Reliability-Override anwenden (IFCN/EUvsDisinfo) + try: + await apply_reputation_overrides(db, source_id) + except Exception as e: + logger.warning("Reputation-Override fuer source_id=%s fehlgeschlagen: %s", source_id, e) return {"source_id": source_id, "status": "approved"} @@ -939,6 +955,26 @@ async def trigger_bulk_classify( return {"status": "started", "limit": limit, "only_unclassified": only_unclassified} +@router.post("/external-reputation/sync") +async def trigger_external_reputation_sync( + background_tasks: BackgroundTasks, + current_user: dict = Depends(get_current_user), +): + """Startet Sync von IFCN- und EUvsDisinfo-Daten (Admin, Hintergrund).""" + if current_user.get("role") != "org_admin": + raise HTTPException(status_code=403, detail="Nur Admins koennen den externen Sync starten") + + async def _bg(): + db = await get_db() + try: + await sync_external_reputation(db) + finally: + await db.close() + + background_tasks.add_task(_bg) + return {"status": "started"} + + @router.post("/classification/bulk-approve") async def bulk_approve_classifications( min_confidence: float = 0.85, @@ -995,4 +1031,10 @@ async def bulk_approve_classifications( await _clear_proposed(db, src["id"]) approved_ids.append(src["id"]) await db.commit() + # Reliability-Override fuer alle gerade Approved + try: + for sid in approved_ids: + await apply_reputation_overrides(db, sid) + except Exception as e: + logger.warning("Bulk Reputation-Override fehlgeschlagen: %s", e) return {"approved_count": len(approved_ids), "min_confidence": min_confidence} diff --git a/src/services/external_reputation.py b/src/services/external_reputation.py new file mode 100644 index 0000000..1e900b0 --- /dev/null +++ b/src/services/external_reputation.py @@ -0,0 +1,268 @@ +"""Externe Reputations-Daten fuer Quellen. 
+ +Synchronisiert Domain-Listen von oeffentlichen Reputations-/Faktencheck-Datenbanken +und schreibt die Treffer in die sources-Spalten: + +- IFCN-Signatories (anerkannte Faktenchecker) -> ifcn_signatory +- EUvsDisinfo (pro-Kreml-Desinformation, Zenodo-CSV) -> eu_disinfo_listed, + eu_disinfo_case_count, eu_disinfo_last_seen + +Anschliessend wendet apply_reputation_overrides() Override-Regeln auf die +reliability-Spalte an: +- ifcn_signatory=1 -> reliability='sehr_hoch' +- eu_disinfo_case_count >= 5 -> reliability='sehr_niedrig' +- eu_disinfo_case_count >= 1 -> reliability eine Stufe runter (max bis 'niedrig') +""" +import csv +import io +import logging +from collections import defaultdict +from urllib.parse import urlparse + +import aiosqlite +import httpx + +logger = logging.getLogger("osint.external_reputation") + +IFCN_LIST_URL = "https://raw.githubusercontent.com/IFCN/verified-signatories/main/list" +EU_DISINFO_CSV_URL = "https://zenodo.org/records/10514307/files/euvsdisinfo_base.csv?download=1" + +HTTP_TIMEOUT = httpx.Timeout(60.0, connect=10.0) + +# Reliability-Skala in Stufenfolge (schlecht -> gut) +RELIABILITY_ORDER = ["sehr_niedrig", "niedrig", "gemischt", "hoch", "sehr_hoch"] + + +def _normalize_domain(raw: str | None) -> str | None: + """Normalisiert eine Domain: lowercase, ohne www., ohne Schema/Pfad.""" + if not raw: + return None + raw = raw.strip().lower() + if not raw: + return None + # Falls eine vollstaendige URL uebergeben wurde + if "://" in raw: + try: + raw = urlparse(raw).netloc or raw + except ValueError: + pass + # Pfad/Query strippen + raw = raw.split("/")[0].split("?")[0].split("#")[0] + if raw.startswith("www."): + raw = raw[4:] + return raw or None + + +async def _fetch_text(url: str) -> str: + """Laedt Text von einer URL. Wirft HTTPException bei Fehler.""" + async with httpx.AsyncClient(timeout=HTTP_TIMEOUT, follow_redirects=True) as client: + resp = await client.get(url) + resp.raise_for_status() + return resp.text + + +async def sync_ifcn_signatories(db: aiosqlite.Connection) -> dict: + """Laedt IFCN-Domain-Liste und matcht gegen sources.domain. + + Setzt ifcn_signatory=1 wo die Domain in der Liste vorkommt, sonst 0. + """ + text = await _fetch_text(IFCN_LIST_URL) + domains: set[str] = set() + for line in text.splitlines(): + d = _normalize_domain(line) + if d: + domains.add(d) + logger.info("IFCN-Liste geladen: %d Domains", len(domains)) + + # Aktuelle Quellen mit Domain laden + cursor = await db.execute( + "SELECT id, domain FROM sources WHERE domain IS NOT NULL AND domain != ''" + ) + sources = [dict(r) for r in await cursor.fetchall()] + + matched_ids: list[int] = [] + unmatched_ids: list[int] = [] + for s in sources: + nd = _normalize_domain(s["domain"]) + if nd and nd in domains: + matched_ids.append(s["id"]) + else: + unmatched_ids.append(s["id"]) + + # Bulk-Update in zwei Statements + if matched_ids: + placeholders = ",".join("?" for _ in matched_ids) + await db.execute( + f"UPDATE sources SET ifcn_signatory = 1 WHERE id IN ({placeholders})", + matched_ids, + ) + if unmatched_ids: + placeholders = ",".join("?" 
for _ in unmatched_ids) + await db.execute( + f"UPDATE sources SET ifcn_signatory = 0 WHERE id IN ({placeholders})", + unmatched_ids, + ) + await db.commit() + logger.info("IFCN-Sync: %d Quellen als Faktenchecker markiert (von %d)", + len(matched_ids), len(sources)) + return { + "list_size": len(domains), + "sources_checked": len(sources), + "matched": len(matched_ids), + } + + +async def sync_eu_disinfo(db: aiosqlite.Connection) -> dict: + """Laedt EUvsDisinfo-CSV von Zenodo, aggregiert pro Domain, schreibt sources. + + - eu_disinfo_listed: 1 wenn Domain mindestens 1x als 'disinformation' debunkt + - eu_disinfo_case_count: Anzahl Disinformation-Faelle + - eu_disinfo_last_seen: spaetestes debunk_date + """ + text = await _fetch_text(EU_DISINFO_CSV_URL) + reader = csv.DictReader(io.StringIO(text)) + + # Per-Domain aggregieren (nur class='disinformation') + counts: dict[str, int] = defaultdict(int) + last_seen: dict[str, str] = {} + total_rows = 0 + for row in reader: + total_rows += 1 + if (row.get("class") or "").strip().lower() != "disinformation": + continue + d = _normalize_domain(row.get("article_domain")) + if not d: + continue + counts[d] += 1 + debunk_date = (row.get("debunk_date") or "").strip() + if debunk_date: + prev = last_seen.get(d) + if not prev or debunk_date > prev: + last_seen[d] = debunk_date + logger.info("EUvsDisinfo-CSV: %d Zeilen, %d Domains mit Desinformation", + total_rows, len(counts)) + + # Quellen laden + matchen + cursor = await db.execute( + "SELECT id, domain FROM sources WHERE domain IS NOT NULL AND domain != ''" + ) + sources = [dict(r) for r in await cursor.fetchall()] + + matched = 0 + for s in sources: + nd = _normalize_domain(s["domain"]) + if nd and nd in counts: + await db.execute( + """UPDATE sources SET + eu_disinfo_listed = 1, + eu_disinfo_case_count = ?, + eu_disinfo_last_seen = ? + WHERE id = ?""", + (counts[nd], last_seen.get(nd), s["id"]), + ) + matched += 1 + else: + await db.execute( + """UPDATE sources SET + eu_disinfo_listed = 0, + eu_disinfo_case_count = 0, + eu_disinfo_last_seen = NULL + WHERE id = ?""", + (s["id"],), + ) + await db.commit() + logger.info("EUvsDisinfo-Sync: %d Quellen als Desinformations-Quelle markiert (von %d)", + matched, len(sources)) + return { + "rows_in_csv": total_rows, + "domains_with_disinfo_in_csv": len(counts), + "sources_checked": len(sources), + "matched": matched, + } + + +def _override_reliability(current: str | None, ifcn: bool, eu_count: int) -> str | None: + """Wendet Override-Regeln auf eine reliability-Stufe an. + + Rueckgabe: neue Stufe (oder None, wenn unveraendert). + """ + cur = current or "na" + + # IFCN gewinnt: zertifizierter Faktenchecker -> sehr_hoch (immer) + if ifcn: + return "sehr_hoch" if cur != "sehr_hoch" else None + + # EUvsDisinfo: Downgrade + if eu_count >= 5: + return "sehr_niedrig" if cur != "sehr_niedrig" else None + if eu_count >= 1: + # Eine Stufe runter, mindestens bis 'niedrig' + if cur == "na": + return "niedrig" + if cur in RELIABILITY_ORDER: + idx = RELIABILITY_ORDER.index(cur) + new_idx = max(0, idx - 1) + new = RELIABILITY_ORDER[new_idx] + # Mindeststufe 'niedrig' bei eu_count >= 1 + if RELIABILITY_ORDER.index(new) > RELIABILITY_ORDER.index("niedrig"): + new = "niedrig" + return new if new != cur else None + return None + + +async def apply_reputation_overrides(db: aiosqlite.Connection, source_id: int | None = None) -> dict: + """Wendet Reliability-Override-Regeln an. + + Wenn source_id angegeben ist, nur fuer diese Quelle. Sonst fuer alle Quellen. 
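+
+    Zur Verdeutlichung der Regeln aus _override_reliability ein paar Beispiele
+    (direkt aus der Logik oben abgeleitet):
+    - ifcn_signatory=1                            -> reliability='sehr_hoch'
+    - eu_disinfo_case_count >= 5                  -> 'sehr_niedrig'
+    - eu_disinfo_case_count=1, bisher 'sehr_hoch' -> 'niedrig' (Downgrade, gedeckelt auf 'niedrig')
+    - eu_disinfo_case_count=1, bisher 'niedrig'   -> 'sehr_niedrig' (eine Stufe runter)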
+ """ + if source_id is not None: + cursor = await db.execute( + "SELECT id, reliability, ifcn_signatory, eu_disinfo_case_count " + "FROM sources WHERE id = ?", + (source_id,), + ) + else: + cursor = await db.execute( + "SELECT id, reliability, ifcn_signatory, eu_disinfo_case_count FROM sources" + ) + sources = [dict(r) for r in await cursor.fetchall()] + + changed = 0 + for s in sources: + new = _override_reliability( + s.get("reliability"), + bool(s.get("ifcn_signatory")), + int(s.get("eu_disinfo_case_count") or 0), + ) + if new is not None: + await db.execute( + "UPDATE sources SET reliability = ? WHERE id = ?", + (new, s["id"]), + ) + changed += 1 + await db.commit() + logger.info("Reliability-Override: %d Quellen angepasst (von %d gepruefte)", + changed, len(sources)) + return {"checked": len(sources), "changed": changed} + + +async def sync_all(db: aiosqlite.Connection) -> dict: + """Vollstaendiger Sync: IFCN + EUvsDisinfo + Reliability-Override. + + Setzt external_data_synced_at fuer alle Quellen. + """ + ifcn_result = await sync_ifcn_signatories(db) + eu_result = await sync_eu_disinfo(db) + override_result = await apply_reputation_overrides(db) + + await db.execute( + "UPDATE sources SET external_data_synced_at = CURRENT_TIMESTAMP " + "WHERE domain IS NOT NULL AND domain != ''" + ) + await db.commit() + + return { + "ifcn": ifcn_result, + "eu_disinfo": eu_result, + "override": override_result, + } diff --git a/src/static/css/style.css b/src/static/css/style.css index 777d490..4b03934 100644 --- a/src/static/css/style.css +++ b/src/static/css/style.css @@ -3759,6 +3759,32 @@ a.dev-source-pill:hover { line-height: 1; } +.source-ifcn-badge { + display: inline-flex; + align-items: center; + padding: 1px 6px; + border-radius: var(--radius); + background: #e8f5e9; + color: #1b5e20; + border: 1px solid #66bb6a; + font-size: 10px; + font-weight: 600; + letter-spacing: 0.3px; +} + +.source-eu-disinfo-badge { + display: inline-flex; + align-items: center; + padding: 1px 6px; + border-radius: var(--radius); + background: #ffebee; + color: #b71c1c; + border: 1px solid #c62828; + font-size: 10px; + font-weight: 600; + letter-spacing: 0.3px; +} + .source-alignment-chip-badge { display: inline-flex; align-items: center; diff --git a/src/static/dashboard.html b/src/static/dashboard.html index 8e73d59..f664cf9 100644 --- a/src/static/dashboard.html +++ b/src/static/dashboard.html @@ -538,6 +538,12 @@ + + -> ohne checked (UI-Default ist jetzt unchecked, User muss bewusst aktivieren fuer internationale Lagen) - Tooltip-Text ergaenzt: "Deaktiviert (Standard): ... empfohlen fuer DACH-Lagen." Bestandslagen sind nicht betroffen - DB-Schema-Default INTEGER DEFAULT 1 bleibt unveraendert, fuer alle existierenden Lagen behaelt international seinen aktuellen Wert. 
Damit ist die Buckelwal-Diagnose komplett geloest: - Bug 1 (rss_parser min_matches adaptiv) seit a08df3d auf main - Bug 2 (Eigennamen-Pflicht-Keywords) seit e83f80d auf main - Bug 3 (international-Default) jetzt auf develop, gleich Cherry-pick auf main --- src/models.py | 2 +- src/static/dashboard.html | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/models.py b/src/models.py index 7682310..87aefa1 100644 --- a/src/models.py +++ b/src/models.py @@ -54,7 +54,7 @@ class IncidentCreate(BaseModel): refresh_interval: int = Field(default=15, ge=10, le=10080) refresh_start_time: Optional[str] = Field(default=None, pattern=r"^([01]\d|2[0-3]):[0-5]\d$") retention_days: int = Field(default=0, ge=0, le=999) - international_sources: bool = True + international_sources: bool = False include_telegram: bool = False visibility: str = Field(default="public", pattern="^(public|private)$") diff --git a/src/static/dashboard.html b/src/static/dashboard.html index f664cf9..09175d1 100644 --- a/src/static/dashboard.html +++ b/src/static/dashboard.html @@ -362,9 +362,9 @@
From 72b306d90c083a1796d3472268ba63331c633159 Mon Sep 17 00:00:00 2001 From: Claude Code Date: Sat, 9 May 2026 04:43:01 +0000 Subject: [PATCH 11/15] fix(source_health): tenant-faehig + History (Phase 2 in den Monitor ziehen) Phase 2 hatte die Verbesserungen nur in der Verwaltung (src/shared/services/source_health.py). Der Daily-Health-Check laeuft aber im Monitor-Backend (Cron 04:00 UTC) und nutzte deshalb weiter den alten Code - Folge: - Tenant-Quellen wurden NIE gecheckt (0 Eintraege in source_health_checks fuer tenant_id IS NOT NULL). - source_health_history blieb leer. Diese Aenderung holt die Phase-2-Logik in den Monitor: - services/source_health.py: Verwaltung-Version 1:1 uebernommen (tenant_id-Filter weg + History-Save vor DELETE + UA/Timeout aus config). - config.py: HEALTH_CHECK_USER_AGENT + HEALTH_CHECK_TIMEOUT_S ergaenzt. Manueller Test auf Staging-Monitor: 283 Quellen geprueft, 253 Issues, 61 davon Tenant-Quellen. History 0 -> 458 Eintraege. Damit ist die shared/-LOCKED-FILES-Markierung in der Verwaltung obsolet - beide Repos haben jetzt den gleichen Code. --- src/config.py | 6 ++++++ src/services/source_health.py | 28 ++++++++++++++++++++++------ 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/src/config.py b/src/config.py index 21a48f2..1b39ea5 100644 --- a/src/config.py +++ b/src/config.py @@ -95,3 +95,9 @@ TELEGRAM_API_ID = int(os.environ.get("TELEGRAM_API_ID", "0")) TELEGRAM_API_HASH = os.environ.get("TELEGRAM_API_HASH", "") TELEGRAM_SESSION_PATH = os.environ.get("TELEGRAM_SESSION_PATH", "/home/claude-dev/.telegram/telegram_session") +# Health-Check (genutzt von services/source_health.py) +HEALTH_CHECK_USER_AGENT = os.environ.get( + "HEALTH_CHECK_USER_AGENT", + "Mozilla/5.0 (compatible; AegisSight-HealthCheck/1.0)", +) +HEALTH_CHECK_TIMEOUT_S = float(os.environ.get("HEALTH_CHECK_TIMEOUT_S", "15.0")) diff --git a/src/services/source_health.py b/src/services/source_health.py index 0f073c9..e6b1cdd 100644 --- a/src/services/source_health.py +++ b/src/services/source_health.py @@ -2,29 +2,45 @@ import asyncio import logging import json +import uuid from urllib.parse import urlparse import httpx import feedparser import aiosqlite +try: + from config import HEALTH_CHECK_USER_AGENT, HEALTH_CHECK_TIMEOUT_S +except ImportError: + HEALTH_CHECK_USER_AGENT = "Mozilla/5.0 (compatible; AegisSight-HealthCheck/1.0)" + HEALTH_CHECK_TIMEOUT_S = 15.0 + logger = logging.getLogger("osint.source_health") async def run_health_checks(db: aiosqlite.Connection) -> dict: - """Führt alle Health-Checks für aktive Grundquellen durch.""" + """Führt Health-Checks für alle aktiven Quellen durch (global + Tenant).""" logger.info("Starte Quellen-Health-Check...") - # Alle aktiven Grundquellen laden + # Alle aktiven Quellen laden (global UND Tenant-spezifisch) cursor = await db.execute( "SELECT id, name, url, domain, source_type, article_count, last_seen_at " - "FROM sources WHERE status = 'active' AND tenant_id IS NULL" + "FROM sources WHERE status = 'active' " ) sources = [dict(row) for row in await cursor.fetchall()] - # Aktuelle Health-Check-Ergebnisse löschen (werden neu geschrieben) + # Bisherigen Stand in History archivieren, dann frisch starten + run_id = uuid.uuid4().hex[:12] + await db.execute( + "INSERT INTO source_health_history " + "(run_id, source_id, check_type, status, message, details, checked_at) " + "SELECT ?, source_id, check_type, status, message, details, checked_at " + "FROM source_health_checks", + (run_id,), + ) await db.execute("DELETE FROM source_health_checks") 
await db.commit() + logger.info(f"Health-Check Run {run_id}: vorigen Stand archiviert") checks_done = 0 issues_found = 0 @@ -33,9 +49,9 @@ async def run_health_checks(db: aiosqlite.Connection) -> dict: sources_with_url = [s for s in sources if s["url"]] async with httpx.AsyncClient( - timeout=15.0, + timeout=HEALTH_CHECK_TIMEOUT_S, follow_redirects=True, - headers={"User-Agent": "Mozilla/5.0 (compatible; OSINT-Monitor/1.0)"}, + headers={"User-Agent": HEALTH_CHECK_USER_AGENT}, ) as client: for i in range(0, len(sources_with_url), 5): batch = sources_with_url[i:i + 5] From 1ee6c4ddf1ebdf0f146c6c9eddb675819527da28 Mon Sep 17 00:00:00 2001 From: Claude Code Date: Sat, 9 May 2026 04:45:18 +0000 Subject: [PATCH 12/15] fix(source_health): URL-Schema vor httpx.get sicherstellen Telegram-Quellen mit url=t.me/kanal (ohne https:// Prefix) liessen httpx mit "ValueError: unknown url type" crashen. Fix: vor dem Request https:// vorne anhaengen wenn kein Schema vorhanden ist. Beobachtet auf Live: 110 Health-Errors, davon einige Telegram-Kanaele mit "ValueError: unknown url type:" als Fehlermeldung. --- src/services/source_health.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/services/source_health.py b/src/services/source_health.py index e6b1cdd..9837cda 100644 --- a/src/services/source_health.py +++ b/src/services/source_health.py @@ -112,6 +112,10 @@ async def _check_source_reachability( checks = [] url = source["url"] + # URL-Schema sicherstellen: t.me-Kanaele und andere Domains koennen ohne https:// vorkommen + if url and not url.startswith(("http://", "https://")): + url = "https://" + url.lstrip("/") + try: resp = await client.get(url) From 8af0fa07c85bdd9546692ac498b615be1eef7595 Mon Sep 17 00:00:00 2001 From: Claude Code Date: Sat, 9 May 2026 04:56:06 +0000 Subject: [PATCH 13/15] feat(source_health): fetch_strategy + Retry mit Googlebot/removepaywalls (Phase 18) Pro Quelle ein Feld sources.fetch_strategy (default | googlebot | paywall | skip): - default: normaler UA, Retry mit Googlebot bei 403/406/429. - googlebot: direkt mit Googlebot-UA (fuer SEO-freundliche Sites). - paywall: Anfrage via removepaywalls.com (fuer Spiegel+/SZ+/FT etc.). - skip: Health-Check ueberspringen (bekannte unerreichbare Quellen wie Login-only). Pre-Flagging in der Migration: FT/WSJ/NZZ/Handelsblatt/WiWo -> paywall, Rheinische Post/Verfassungsschutz -> googlebot. (Test mit den vier prominent fehlerhaften Quellen zeigt: FT/RP/Verfassungsschutz sind besonders streng, gehen auch nicht ueber Googlebot/removepaywalls durch. Fuer milder restriktive Quellen wirkt der Retry-Mechanismus.) 
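Die oben erwaehnte Migration (fetch_strategy-Spalte plus Pre-Flagging) ist nicht Teil dieses Diffs. Eine minimale Skizze, wie der Schritt im Stil der uebrigen sources-Migrationen in database.py aussehen koennte; Spaltenname laut Beschreibung, die konkreten Domain-Listen sind nur angenommene Beispiele:

    # Skizze (Annahme): laeuft im bestehenden src_columns-Migrationsblock von database.py,
    # db, src_columns und logger kommen aus diesem Kontext.
    if "fetch_strategy" not in src_columns:
        await db.execute(
            "ALTER TABLE sources ADD COLUMN fetch_strategy TEXT DEFAULT 'default'"
        )
        # Pre-Flagging bekannter Problemquellen (Domains beispielhaft angenommen)
        await db.execute(
            "UPDATE sources SET fetch_strategy = 'paywall' WHERE domain IN "
            "('ft.com', 'wsj.com', 'nzz.ch', 'handelsblatt.com', 'wiwo.de')"
        )
        await db.execute(
            "UPDATE sources SET fetch_strategy = 'googlebot' WHERE domain IN "
            "('rp-online.de', 'verfassungsschutz.de')"
        )
        await db.commit()
        logger.info("Migration: fetch_strategy zu sources hinzugefuegt")

Einzelne Quellen lassen sich spaeter per UPDATE auf eine andere Strategie setzen, ohne den Health-Check-Code anzufassen.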
--- src/services/source_health.py | 58 ++++++++++++++++++++++++++++++++--- 1 file changed, 54 insertions(+), 4 deletions(-) diff --git a/src/services/source_health.py b/src/services/source_health.py index 9837cda..b07b5a0 100644 --- a/src/services/source_health.py +++ b/src/services/source_health.py @@ -15,6 +15,17 @@ except ImportError: HEALTH_CHECK_USER_AGENT = "Mozilla/5.0 (compatible; AegisSight-HealthCheck/1.0)" HEALTH_CHECK_TIMEOUT_S = 15.0 +# Phase 18: alternative User-Agents fuer Bot-Block-Bypass +USER_AGENT_GOOGLEBOT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" +USER_AGENT_BROWSER = ( + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/120.0 Safari/537.36" +) +REMOVEPAYWALLS_PREFIX = "https://www.removepaywalls.com/search?url=" + +# HTTP-Codes, die einen Retry mit anderem UA rechtfertigen +RETRY_ON_STATUS = {403, 406, 429} + logger = logging.getLogger("osint.source_health") @@ -24,7 +35,8 @@ async def run_health_checks(db: aiosqlite.Connection) -> dict: # Alle aktiven Quellen laden (global UND Tenant-spezifisch) cursor = await db.execute( - "SELECT id, name, url, domain, source_type, article_count, last_seen_at " + "SELECT id, name, url, domain, source_type, article_count, last_seen_at, " + "COALESCE(fetch_strategy, 'default') AS fetch_strategy " "FROM sources WHERE status = 'active' " ) sources = [dict(row) for row in await cursor.fetchall()] @@ -108,16 +120,54 @@ async def run_health_checks(db: aiosqlite.Connection) -> dict: async def _check_source_reachability( client: httpx.AsyncClient, source: dict, ) -> list[dict]: - """Prüft Erreichbarkeit und Feed-Validität einer Quelle.""" + """Prüft Erreichbarkeit und Feed-Validität einer Quelle. + + Phase 18: pro Quelle eine fetch_strategy ('default' | 'googlebot' | 'paywall' | 'skip'). + Bei 'default' wird im Fehlerfall (403/406/429) ein Retry mit Googlebot-UA gemacht. + Bei 'paywall' wird auf removepaywalls.com umgeleitet. + Bei 'skip' wird kein Check ausgeführt. + """ checks = [] url = source["url"] + strategy = source.get("fetch_strategy") or "default" - # URL-Schema sicherstellen: t.me-Kanaele und andere Domains koennen ohne https:// vorkommen + # 'skip' -> kein Check (bekannte unerreichbare Quellen, z.B. 
Login-only) + if strategy == "skip": + checks.append({ + "type": "reachability", "status": "ok", + "message": "Health-Check uebersprungen (fetch_strategy=skip)", + }) + return checks + + # URL-Schema sicherstellen if url and not url.startswith(("http://", "https://")): url = "https://" + url.lstrip("/") + # Initialen UA waehlen: googlebot direkt; paywall ueber removepaywalls; default normal + initial_ua = HEALTH_CHECK_USER_AGENT + initial_url = url + if strategy == "googlebot": + initial_ua = USER_AGENT_GOOGLEBOT + elif strategy == "paywall": + initial_url = REMOVEPAYWALLS_PREFIX + url + initial_ua = USER_AGENT_BROWSER + try: - resp = await client.get(url) + resp = await client.get(initial_url, headers={"User-Agent": initial_ua}) + + # Bot-Block-Retry nur bei strategy='default' + if ( + strategy == "default" + and resp.status_code in RETRY_ON_STATUS + ): + blocked_status = resp.status_code + retry = await client.get(url, headers={"User-Agent": USER_AGENT_GOOGLEBOT}) + if retry.status_code < 400: + resp = retry # Retry hat geholfen + checks.append({ + "type": "reachability", "status": "warning", + "message": f"Erreichbar nur mit Googlebot-UA (Standard-UA bekam HTTP {blocked_status})", + }) + # Hinweis-Eintrag, aber Hauptcheck folgt unten als 'ok' weil resp jetzt die Retry-Antwort ist if resp.status_code >= 400: checks.append({ From f22c8dbc618484ec34b588e79e93132a6097844f Mon Sep 17 00:00:00 2001 From: Claude Code Date: Sat, 9 May 2026 05:00:11 +0000 Subject: [PATCH 14/15] fix: removepaywalls.com -> removepaywall.com (Singular ist die echte Domain) User-Korrektur: die echte Service-Domain heisst removepaywall.com (Singular). removepaywalls.com (Plural) liefert HTTP 403 - vermutlich nicht der gleiche Service oder gar nicht mehr existent. Betrifft: - services/source_health.py: REMOVEPAYWALLS_PREFIX-Konstante (Phase 18) - agents/researcher.py: Claude-Prompts fuer Paywall-Hinweise (zwei Stellen) Verifiziert mit curl: removepaywall.com -> 200, removepaywalls.com -> 403. --- src/agents/researcher.py | 4 ++-- src/services/source_health.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/agents/researcher.py b/src/agents/researcher.py index 76b25dc..734b62d 100644 --- a/src/agents/researcher.py +++ b/src/agents/researcher.py @@ -77,7 +77,7 @@ REGELN: {language_instruction} - Faktenbasiert und neutral - keine Spekulationen - KRITISCH für source_url: Kopiere die EXAKTE URL aus den WebSearch-Ergebnissen. Erfinde oder konstruiere NIEMALS URLs aus Mustern oder Erinnerung. Wenn du die exakte URL eines Artikels nicht aus den Suchergebnissen hast, lass diesen Artikel komplett weg. -- Nutze removepaywalls.com für Paywall-geschützte Artikel (z.B. Spiegel+, Zeit+, SZ+): https://www.removepaywalls.com/search?url=ARTIKEL_URL +- Nutze removepaywall.com für Paywall-geschützte Artikel (z.B. Spiegel+, Zeit+, SZ+): https://www.removepaywall.com/search?url=ARTIKEL_URL - Nutze WebFetch um die 3-5 wichtigsten Artikel vollständig abzurufen und zusammenzufassen Gib die Ergebnisse AUSSCHLIESSLICH als JSON-Array zurück, ohne Erklärungen davor oder danach. @@ -124,7 +124,7 @@ Nutze spezifische Suchbegriffe für institutionelle Quellen. Ziel: 6-10 weitere PHASE 4 — VERIFIKATION UND VERTIEFUNG: Nutze WebFetch um die 6-10 wichtigsten Artikel vollständig abzurufen und ausführlich zusammenzufassen. Priorisiere dabei Primärquellen und investigative Berichte. -Nutze removepaywalls.com für Paywall-geschützte Artikel (z.B.
From a716726e36261ac916322d89ed29156776e568c2 Mon Sep 17 00:00:00 2001
From: Claude Code
Date: Sat, 9 May 2026 05:02:18 +0000
Subject: [PATCH 15/15] fix(source_health): do not route the feed URL through
 removepaywall for the paywall strategy

removepaywall.com returns HTML (an article renderer), not XML - the feed
validity check therefore failed with "Kein gueltiger RSS/Atom-Feed".

Fix:
- paywall: load the feed URL directly with the browser UA (no URL rewrite).
- paywall + 4xx: status=warning (expected), skip the feed validity check.
- removepaywall.com stays in the researcher prompt for article content
  (that is the correct use case).
---
 src/services/source_health.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)
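
[Annotation, not part of the commit message or the diff] A minimal sketch of the behaviour this patch aims for, under the same assumptions as above (importable module path, illustrative mock and source values): a paywall source whose feed answers direct requests with 403 should yield a single warning and skip the feed-validity step.

import asyncio
import httpx

from src.services.source_health import _check_source_reachability  # path assumed


async def main() -> None:
    # Stand-in for a paywalled feed host that blocks direct requests with 403.
    transport = httpx.MockTransport(lambda request: httpx.Response(403))
    async with httpx.AsyncClient(transport=transport) as client:
        source = {
            "id": 2, "name": "Paywalled Feed", "url": "https://paywalled.example/feed.xml",
            "domain": "paywalled.example", "source_type": "rss", "article_count": 0,
            "last_seen_at": None, "fetch_strategy": "paywall",
        }
        checks = await _check_source_reachability(client, source)
        # Expected after this patch: one 'warning' check and an early return,
        # i.e. no "Kein gueltiger RSS/Atom-Feed" error from the feed-validity step.
        print(checks)


asyncio.run(main())
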
diff --git a/src/services/source_health.py b/src/services/source_health.py
index ed1242c..6cc0e10 100644
--- a/src/services/source_health.py
+++ b/src/services/source_health.py
@@ -143,18 +143,28 @@ async def _check_source_reachability(
     if url and not url.startswith(("http://", "https://")):
         url = "https://" + url.lstrip("/")
 
-    # Initialen UA waehlen: googlebot direkt; paywall ueber removepaywalls; default normal
+    # Initialen UA waehlen
     initial_ua = HEALTH_CHECK_USER_AGENT
     initial_url = url
     if strategy == "googlebot":
         initial_ua = USER_AGENT_GOOGLEBOT
     elif strategy == "paywall":
-        initial_url = REMOVEPAYWALLS_PREFIX + url
+        # Paywall-Quellen: Feed-URL direkt laden, aber mit Browser-UA (versucht Bot-Detection zu umgehen).
+        # removepaywall.com ist fuer Article-URLs, NICHT fuer RSS-Feed-Validity-Checks
+        # (gibt HTML statt XML zurueck). Researcher-Pipeline nutzt removepaywall fuer Inhalte.
         initial_ua = USER_AGENT_BROWSER
 
     try:
         resp = await client.get(initial_url, headers={"User-Agent": initial_ua})
 
+        # Paywall-Quellen: 4xx ist erwartbar (Bot-Detection), als warning markieren statt error
+        if strategy == "paywall" and resp.status_code in RETRY_ON_STATUS:
+            checks.append({
+                "type": "reachability", "status": "warning",
+                "message": f"Paywall-Quelle, Direkt-Zugang HTTP {resp.status_code} (Researcher-Pipeline nutzt removepaywall.com fuer Inhalte)",
+            })
+            return checks  # Feed-Validity-Check skippen (Paywall liefert kein RSS)
+
         # Bot-Block-Retry nur bei strategy='default'
         if (
             strategy == "default"
@@ -167,7 +177,6 @@ async def _check_source_reachability(
                     "type": "reachability", "status": "warning",
                     "message": f"Erreichbar nur mit Googlebot-UA (Standard-UA bekam HTTP {blocked_status})",
                 })
-            # Hinweis-Eintrag, aber Hauptcheck folgt unten als 'ok' weil resp jetzt die Retry-Antwort ist
 
         if resp.status_code >= 400:
             checks.append({