feat(sources): LLM-Klassifikator + Review-API + Bulk-Migrationsskript
- src/services/source_classifier.py: classify_source(db, id) ruft Haiku mit
strukturiertem Prompt (4 Achsen + state_affiliated + country + Konfidenz)
und schreibt Vorschlaege in proposed_*-Spalten. bulk_classify(db, limit)
iteriert sequenziell ueber unklassifizierte Quellen.
- API-Endpoints (alle hinter Auth, globale Quellen nur fuer org_admin):
- GET /api/sources/classification/stats
- GET /api/sources/classification/queue
- POST /api/sources/{id}/classification/approve (proposed_* -> echte Felder)
- POST /api/sources/{id}/classification/reject (proposed_* loeschen)
- POST /api/sources/{id}/classification/reclassify (sofort, ~3-5s)
- POST /api/sources/classification/bulk-classify (BackgroundTask)
- scripts/migrate_sources_classification.py: CLI-Wrapper fuer Bulk-Migration
zur einmaligen Erstbestueckung aller Bestandsquellen.
Sample-Test auf Staging steht aus.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Dieser Commit ist enthalten in:
64
scripts/migrate_sources_classification.py
Normale Datei
64
scripts/migrate_sources_classification.py
Normale Datei
@@ -0,0 +1,64 @@
|
||||
"""Einmalige LLM-Klassifikation aller noch unklassifizierten Quellen.
|
||||
|
||||
Verwendung:
|
||||
python3 scripts/migrate_sources_classification.py --limit 50
|
||||
python3 scripts/migrate_sources_classification.py --limit 500 # Alle
|
||||
python3 scripts/migrate_sources_classification.py --recheck-pending # bereits Pending neu
|
||||
|
||||
Schreibt Vorschlaege in proposed_*-Spalten. Approval erfolgt anschliessend
|
||||
ueber das Verwaltungs-UI / API (POST /api/sources/{id}/classification/approve).
|
||||
"""
|
||||
import argparse
|
||||
import asyncio
|
||||
import logging
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# src/ in PYTHONPATH aufnehmen, wenn Skript direkt aufgerufen wird
|
||||
HERE = Path(__file__).resolve().parent
|
||||
SRC = HERE.parent / "src"
|
||||
if str(SRC) not in sys.path:
|
||||
sys.path.insert(0, str(SRC))
|
||||
|
||||
from database import get_db # noqa: E402
|
||||
from services.source_classifier import bulk_classify # noqa: E402
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
|
||||
)
|
||||
logger = logging.getLogger("migrate_sources")
|
||||
|
||||
|
||||
async def main():
    """Parse CLI arguments, run the bulk classification, and print a summary."""
    arg_parser = argparse.ArgumentParser(description="LLM-Klassifikation aller Quellen.")
    arg_parser.add_argument("--limit", type=int, default=50, help="Max. Quellen pro Lauf")
    arg_parser.add_argument(
        "--recheck-pending",
        action="store_true",
        help="Auch Quellen mit classification_source='llm_pending' neu klassifizieren",
    )
    opts = arg_parser.parse_args()

    db = await get_db()
    try:
        # --recheck-pending widens the selection to sources already marked pending.
        summary = await bulk_classify(
            db,
            limit=opts.limit,
            only_unclassified=not opts.recheck_pending,
        )
    finally:
        await db.close()

    print(f"Verarbeitet: {summary['processed']}")
    print(f"Erfolgreich: {summary['success']}")
    print(f"Fehler: {len(summary['errors'])}")
    print(f"Kosten: ${summary['total_cost_usd']:.4f}")
    if summary["errors"]:
        print("\nFehler-Details:")
        # Show at most the first ten failures to keep the output readable.
        for err in summary["errors"][:10]:
            print(f"  source_id={err['source_id']}: {err['error']}")


if __name__ == "__main__":
    asyncio.run(main())
|
||||
@@ -1,10 +1,12 @@
|
||||
"""Sources-Router: Quellenverwaltung (Multi-Tenant)."""
|
||||
import json
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
from fastapi import APIRouter, Depends, HTTPException, status
|
||||
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, status
|
||||
from models import SourceCreate, SourceUpdate, SourceResponse, DiscoverRequest, DiscoverResponse, DiscoverMultiResponse, DomainActionRequest
|
||||
from auth import get_current_user
|
||||
from database import db_dependency, refresh_source_counts
|
||||
from database import db_dependency, get_db, refresh_source_counts
|
||||
from services.source_classifier import bulk_classify, classify_source
|
||||
from source_rules import discover_source, discover_all_feeds, evaluate_feeds_with_claude, _extract_domain, _detect_category, domain_to_display_name, _DOMAIN_ALIASES
|
||||
import aiosqlite
|
||||
|
||||
@@ -700,3 +702,238 @@ async def trigger_refresh_counts(
|
||||
"""Artikelzaehler fuer alle Quellen neu berechnen."""
|
||||
await refresh_source_counts(db)
|
||||
return {"status": "ok"}
|
||||
|
||||
|
||||
# === Klassifikations-Review (LLM-Vorschlaege approve/reject/reclassify) ===
|
||||
|
||||
def _require_admin_for_global(row: dict, current_user: dict):
|
||||
"""Globale Quellen (tenant_id IS NULL) duerfen nur org_admins approve-en/reclassify-en."""
|
||||
if row.get("tenant_id") is None and current_user.get("role") != "org_admin":
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_403_FORBIDDEN,
|
||||
detail="Globale Quellen koennen nur von Admins klassifiziert werden",
|
||||
)
|
||||
|
||||
|
||||
@router.get("/classification/stats")
|
||||
async def classification_stats(
|
||||
current_user: dict = Depends(get_current_user),
|
||||
db: aiosqlite.Connection = Depends(db_dependency),
|
||||
):
|
||||
"""Counts pro classification_source-Wert (global + eigene Org)."""
|
||||
tenant_id = current_user.get("tenant_id")
|
||||
cursor = await db.execute(
|
||||
"""SELECT classification_source, COUNT(*) as cnt
|
||||
FROM sources
|
||||
WHERE (tenant_id IS NULL OR tenant_id = ?) AND status = 'active'
|
||||
GROUP BY classification_source""",
|
||||
(tenant_id,),
|
||||
)
|
||||
by_source = {row["classification_source"] or "legacy": row["cnt"] for row in await cursor.fetchall()}
|
||||
cursor = await db.execute(
|
||||
"""SELECT COUNT(*) as cnt FROM sources
|
||||
WHERE (tenant_id IS NULL OR tenant_id = ?) AND status = 'active'
|
||||
AND proposed_political_orientation IS NOT NULL""",
|
||||
(tenant_id,),
|
||||
)
|
||||
pending = (await cursor.fetchone())["cnt"]
|
||||
return {
|
||||
"by_classification_source": by_source,
|
||||
"pending_review": pending,
|
||||
"total": sum(by_source.values()),
|
||||
}
|
||||
|
||||
|
||||
@router.get("/classification/queue")
|
||||
async def classification_queue(
|
||||
limit: int = 50,
|
||||
min_confidence: float = 0.0,
|
||||
current_user: dict = Depends(get_current_user),
|
||||
db: aiosqlite.Connection = Depends(db_dependency),
|
||||
):
|
||||
"""Liefert Quellen mit nicht-leeren proposed_*-Spalten (Review-Queue)."""
|
||||
tenant_id = current_user.get("tenant_id")
|
||||
cursor = await db.execute(
|
||||
"""SELECT s.* FROM sources s
|
||||
WHERE (s.tenant_id IS NULL OR s.tenant_id = ?)
|
||||
AND s.proposed_political_orientation IS NOT NULL
|
||||
AND COALESCE(s.proposed_confidence, 0) >= ?
|
||||
ORDER BY s.proposed_confidence DESC, s.proposed_at DESC
|
||||
LIMIT ?""",
|
||||
(tenant_id, min_confidence, limit),
|
||||
)
|
||||
rows = [dict(r) for r in await cursor.fetchall()]
|
||||
alignments_map = await _load_alignments_for(db, [r["id"] for r in rows])
|
||||
out = []
|
||||
for d in rows:
|
||||
try:
|
||||
proposed_aligns = json.loads(d.get("proposed_alignments_json") or "[]")
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
proposed_aligns = []
|
||||
out.append({
|
||||
"id": d["id"],
|
||||
"name": d["name"],
|
||||
"url": d.get("url"),
|
||||
"domain": d.get("domain"),
|
||||
"source_type": d.get("source_type"),
|
||||
"category": d.get("category"),
|
||||
"is_global": d.get("tenant_id") is None,
|
||||
"current": {
|
||||
"political_orientation": d.get("political_orientation"),
|
||||
"media_type": d.get("media_type"),
|
||||
"reliability": d.get("reliability"),
|
||||
"state_affiliated": bool(d.get("state_affiliated")),
|
||||
"country_code": d.get("country_code"),
|
||||
"alignments": alignments_map.get(d["id"], []),
|
||||
"classification_source": d.get("classification_source"),
|
||||
},
|
||||
"proposed": {
|
||||
"political_orientation": d.get("proposed_political_orientation"),
|
||||
"media_type": d.get("proposed_media_type"),
|
||||
"reliability": d.get("proposed_reliability"),
|
||||
"state_affiliated": bool(d.get("proposed_state_affiliated")),
|
||||
"country_code": d.get("proposed_country_code"),
|
||||
"alignments": proposed_aligns,
|
||||
"confidence": d.get("proposed_confidence"),
|
||||
"reasoning": d.get("proposed_reasoning"),
|
||||
"proposed_at": d.get("proposed_at"),
|
||||
},
|
||||
})
|
||||
return out
|
||||
|
||||
|
||||
async def _clear_proposed(db: aiosqlite.Connection, source_id: int):
    """Reset all proposed_* fields of a source to NULL (no commit; caller commits)."""
    await db.execute(
        """UPDATE sources SET
           proposed_political_orientation = NULL,
           proposed_media_type = NULL,
           proposed_reliability = NULL,
           proposed_state_affiliated = NULL,
           proposed_country_code = NULL,
           proposed_alignments_json = NULL,
           proposed_confidence = NULL,
           proposed_reasoning = NULL,
           proposed_at = NULL
           WHERE id = ?""",
        (source_id,),
    )
|
||||
|
||||
|
||||
@router.post("/{source_id}/classification/approve")
|
||||
async def approve_classification(
|
||||
source_id: int,
|
||||
current_user: dict = Depends(get_current_user),
|
||||
db: aiosqlite.Connection = Depends(db_dependency),
|
||||
):
|
||||
"""Uebernimmt proposed_* in echte Felder, setzt classification_source='llm_approved'."""
|
||||
cursor = await db.execute("SELECT * FROM sources WHERE id = ?", (source_id,))
|
||||
row = await cursor.fetchone()
|
||||
if not row:
|
||||
raise HTTPException(status_code=404, detail="Quelle nicht gefunden")
|
||||
src = dict(row)
|
||||
_require_admin_for_global(src, current_user)
|
||||
|
||||
if src.get("proposed_political_orientation") is None:
|
||||
raise HTTPException(status_code=400, detail="Keine LLM-Vorschlaege fuer diese Quelle vorhanden")
|
||||
|
||||
try:
|
||||
proposed_aligns = json.loads(src.get("proposed_alignments_json") or "[]")
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
proposed_aligns = []
|
||||
|
||||
await db.execute(
|
||||
"""UPDATE sources SET
|
||||
political_orientation = ?,
|
||||
media_type = ?,
|
||||
reliability = ?,
|
||||
state_affiliated = ?,
|
||||
country_code = ?,
|
||||
classification_source = 'llm_approved',
|
||||
classified_at = CURRENT_TIMESTAMP
|
||||
WHERE id = ?""",
|
||||
(
|
||||
src["proposed_political_orientation"],
|
||||
src["proposed_media_type"],
|
||||
src["proposed_reliability"],
|
||||
1 if src.get("proposed_state_affiliated") else 0,
|
||||
src.get("proposed_country_code"),
|
||||
source_id,
|
||||
),
|
||||
)
|
||||
await _replace_alignments(db, source_id, [a for a in proposed_aligns if a in ALLOWED_ALIGNMENTS])
|
||||
await _clear_proposed(db, source_id)
|
||||
await db.commit()
|
||||
return {"source_id": source_id, "status": "approved"}
|
||||
|
||||
|
||||
@router.post("/{source_id}/classification/reject")
|
||||
async def reject_classification(
|
||||
source_id: int,
|
||||
current_user: dict = Depends(get_current_user),
|
||||
db: aiosqlite.Connection = Depends(db_dependency),
|
||||
):
|
||||
"""Verwirft die LLM-Vorschlaege ohne Uebernahme. classification_source bleibt unveraendert."""
|
||||
cursor = await db.execute("SELECT * FROM sources WHERE id = ?", (source_id,))
|
||||
row = await cursor.fetchone()
|
||||
if not row:
|
||||
raise HTTPException(status_code=404, detail="Quelle nicht gefunden")
|
||||
src = dict(row)
|
||||
_require_admin_for_global(src, current_user)
|
||||
|
||||
await _clear_proposed(db, source_id)
|
||||
# Wenn classification_source noch 'llm_pending' war, zurueck auf 'legacy'
|
||||
if src.get("classification_source") == "llm_pending":
|
||||
await db.execute(
|
||||
"UPDATE sources SET classification_source = 'legacy' WHERE id = ?",
|
||||
(source_id,),
|
||||
)
|
||||
await db.commit()
|
||||
return {"source_id": source_id, "status": "rejected"}
|
||||
|
||||
|
||||
@router.post("/{source_id}/classification/reclassify")
|
||||
async def reclassify_source(
|
||||
source_id: int,
|
||||
current_user: dict = Depends(get_current_user),
|
||||
db: aiosqlite.Connection = Depends(db_dependency),
|
||||
):
|
||||
"""Triggert eine LLM-Klassifikation einer einzelnen Quelle (synchron, ~3-5s)."""
|
||||
cursor = await db.execute("SELECT * FROM sources WHERE id = ?", (source_id,))
|
||||
row = await cursor.fetchone()
|
||||
if not row:
|
||||
raise HTTPException(status_code=404, detail="Quelle nicht gefunden")
|
||||
src = dict(row)
|
||||
_require_admin_for_global(src, current_user)
|
||||
|
||||
try:
|
||||
result = await classify_source(db, source_id)
|
||||
except Exception as e:
|
||||
logger.error("Reclassify source_id=%s fehlgeschlagen: %s", source_id, e, exc_info=True)
|
||||
raise HTTPException(status_code=500, detail=f"Klassifikation fehlgeschlagen: {e}")
|
||||
return result
|
||||
|
||||
|
||||
async def _bulk_classify_background(limit: int, only_unclassified: bool):
    """Background task: run bulk classification on its own DB connection.

    BackgroundTasks executes this after the response has been sent, so an
    unhandled exception would surface only as a generic server-log entry.
    Catch and log it explicitly with context, and close the connection in
    all cases.
    """
    db = await get_db()
    try:
        await bulk_classify(db, limit=limit, only_unclassified=only_unclassified)
    except Exception as e:
        logger.error("Bulk-Klassifikation (Hintergrund) fehlgeschlagen: %s", e, exc_info=True)
    finally:
        await db.close()
|
||||
|
||||
|
||||
@router.post("/classification/bulk-classify")
|
||||
async def trigger_bulk_classify(
|
||||
background_tasks: BackgroundTasks,
|
||||
limit: int = 50,
|
||||
only_unclassified: bool = True,
|
||||
current_user: dict = Depends(get_current_user),
|
||||
):
|
||||
"""Startet eine Bulk-Klassifikation im Hintergrund (nur Admins)."""
|
||||
if current_user.get("role") != "org_admin":
|
||||
raise HTTPException(status_code=403, detail="Nur Admins koennen Bulk-Klassifikation starten")
|
||||
if limit < 1 or limit > 500:
|
||||
raise HTTPException(status_code=400, detail="limit muss zwischen 1 und 500 liegen")
|
||||
background_tasks.add_task(_bulk_classify_background, limit, only_unclassified)
|
||||
return {"status": "started", "limit": limit, "only_unclassified": only_unclassified}
|
||||
|
||||
295
src/services/source_classifier.py
Normale Datei
295
src/services/source_classifier.py
Normale Datei
@@ -0,0 +1,295 @@
|
||||
"""Klassifiziert Quellen via Claude (Haiku) nach 4 Achsen + state_affiliated + country.
|
||||
|
||||
Schreibt Vorschlaege in die proposed_*-Spalten von sources und setzt
|
||||
classification_source='llm_pending'. Approval erfolgt ueber separate Endpoints,
|
||||
die proposed_* in die echten Spalten kopieren.
|
||||
"""
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
|
||||
import aiosqlite
|
||||
|
||||
from agents.claude_client import call_claude
|
||||
from config import CLAUDE_MODEL_FAST
|
||||
|
||||
logger = logging.getLogger("osint.source_classifier")
|
||||
|
||||
POLITICAL_VALUES = {
|
||||
"links_extrem", "links", "mitte_links", "liberal", "mitte",
|
||||
"konservativ", "mitte_rechts", "rechts", "rechts_extrem", "na",
|
||||
}
|
||||
MEDIA_TYPE_VALUES = {
|
||||
"tageszeitung", "wochenzeitung", "magazin", "tv_sender", "radio",
|
||||
"oeffentlich_rechtlich", "nachrichtenagentur", "online_only", "blog",
|
||||
"telegram_kanal", "telegram_bot", "podcast", "social_media", "imageboard",
|
||||
"think_tank", "ngo", "behoerde", "staatsmedium", "fachmedium", "sonstige",
|
||||
}
|
||||
RELIABILITY_VALUES = {"sehr_hoch", "hoch", "gemischt", "niedrig", "sehr_niedrig", "na"}
|
||||
ALIGNMENT_VALUES = {
|
||||
"prorussisch", "proiranisch", "prowestlich", "proukrainisch",
|
||||
"prochinesisch", "projapanisch", "proisraelisch", "propalaestinensisch",
|
||||
"protuerkisch", "panarabisch", "neutral", "sonstige",
|
||||
}
|
||||
|
||||
|
||||
def _build_prompt(src: dict, sample_articles: list[dict]) -> str:
    """Build the classification prompt for one source.

    Embeds the source metadata plus up to five recent headlines as evidence
    and demands a pure-JSON answer whose enum values mirror the sets defined
    at module level. The prompt text itself is German by design.
    """
    # Optional evidence block: up to five recent, non-empty headlines.
    sample_text = ""
    if sample_articles:
        lines = []
        for i, art in enumerate(sample_articles[:5], 1):
            headline = (art.get("headline") or art.get("headline_de") or "").strip()
            if headline:
                lines.append(f"{i}. {headline[:200]}")
        if lines:
            sample_text = "\nLetzte Artikel/Headlines:\n" + "\n".join(lines)

    return f"""Du bist ein OSINT-Analyst und klassifizierst Nachrichten- und Medienquellen fuer ein Lagebild-Monitoring-System (DACH-Raum).

QUELLE:
Name: {src.get('name')}
URL: {src.get('url') or '-'}
Domain: {src.get('domain') or '-'}
Quellentyp: {src.get('source_type')}
Bisherige Kategorie: {src.get('category')}
Sprache: {src.get('language') or 'unbekannt'}
Bisherige Notiz (Freitext): {src.get('bias') or '-'}{sample_text}

AUFGABE: Klassifiziere die Quelle nach folgenden Achsen.

1. political_orientation:
   - links_extrem (z.B. linksunten.indymedia)
   - links (klar links, z.B. junge Welt, taz)
   - mitte_links (linksliberal/sozialdemokratisch, z.B. SZ, Spiegel)
   - liberal (wirtschafts-/grünliberal, z.B. NZZ, Zeit)
   - mitte (politisch neutral, Agentur, z.B. dpa, Reuters, tagesschau)
   - konservativ (buergerlich-konservativ, z.B. FAZ, Welt)
   - mitte_rechts (rechts-buergerlich, z.B. Tichys Einblick, Achgut)
   - rechts (klar rechts, z.B. Junge Freiheit, EpochTimes)
   - rechts_extrem (z.B. Compact, PI-News)
   - na (nicht klassifizierbar: Behoerde, Fachmedium, Think Tank ohne klare politische Linie)

2. media_type (genau einer):
   tageszeitung, wochenzeitung, magazin, tv_sender, radio, oeffentlich_rechtlich,
   nachrichtenagentur, online_only, blog, telegram_kanal, telegram_bot, podcast,
   social_media, imageboard, think_tank, ngo, behoerde, staatsmedium, fachmedium, sonstige

3. reliability:
   - sehr_hoch (etablierte Qualitaet, Faktencheck: tagesschau, dpa, FAZ, Reuters)
   - hoch (serioes mit gelegentlichen Schwaechen: taz, Welt, BILD bei harten News)
   - gemischt (Mix Meinung/Einseitigkeit: Tichys Einblick, Achgut, Boulevard)
   - niedrig (haeufig irrefuehrend, schwache Quellenarbeit: Junge Freiheit, EpochTimes)
   - sehr_niedrig (bekannt fuer Desinformation/Verschwoerung: Compact, RT, Sputnik, PI-News)
   - na (nicht bewertbar)

4. alignments (Mehrfach, leeres Array wenn keine ausgepraegte Naehe):
   prorussisch, proiranisch, prowestlich, proukrainisch, prochinesisch, projapanisch,
   proisraelisch, propalaestinensisch, protuerkisch, panarabisch, neutral, sonstige

5. state_affiliated (true/false): true wenn vom Staat finanziert/kontrolliert
   (RT, Sputnik, CGTN, PressTV, Xinhua, TRT). Public Service Broadcaster
   wie ARD/ZDF/BBC sind NICHT state_affiliated.

6. country_code (ISO 3166-1 alpha-2): Heimatland (DE, AT, CH, RU, US, ...). null wenn unklar.

7. confidence (0.0-1.0): 0.85+ fuer bekannte Outlets, 0.5-0.85 fuer mittelbekannt, <0.5 fuer unsicher.

8. reasoning (1-2 Saetze): Kurze Begruendung der Hauptklassifikationen.

WICHTIG:
- Antworte AUSSCHLIESSLICH mit einem JSON-Objekt, kein Text drumherum.
- Nutze ausschliesslich die genannten enum-Werte (snake_case).
- Bei Unklarheit lieber `na` und niedrige confidence.

JSON-Schema:
{{
  "political_orientation": "...",
  "media_type": "...",
  "reliability": "...",
  "alignments": ["..."],
  "state_affiliated": false,
  "country_code": "DE",
  "confidence": 0.9,
  "reasoning": "..."
}}"""
|
||||
|
||||
|
||||
async def _load_sample_articles(db: aiosqlite.Connection, name: str, domain: str | None, limit: int = 5) -> list[dict]:
    """Fetch the most recent headlines of a source (matched by name, else by domain)."""
    # Candidate queries in priority order: exact source name first, then a
    # domain substring match against the article URL as fallback.
    attempts: list[tuple[str, tuple]] = []
    if name:
        attempts.append((
            "SELECT headline, headline_de FROM articles WHERE source = ? ORDER BY collected_at DESC LIMIT ?",
            (name, limit),
        ))
    if domain:
        attempts.append((
            "SELECT headline, headline_de FROM articles WHERE source_url LIKE ? ORDER BY collected_at DESC LIMIT ?",
            (f"%{domain}%", limit),
        ))
    for sql, params in attempts:
        cursor = await db.execute(sql, params)
        found = await cursor.fetchall()
        if found:
            return [dict(r) for r in found]
    return []
|
||||
|
||||
|
||||
def _validate(parsed: dict) -> dict:
    """Validate and normalize a raw LLM answer against the module enum sets.

    Unknown enum values fall back to safe defaults ('na' / 'sonstige'),
    alignments are deduplicated and sorted for a stable DB representation,
    the country code is upper-cased (ASCII-only), confidence is clamped to
    [0.0, 1.0], and reasoning is truncated to 1000 characters.
    """
    pol = parsed.get("political_orientation", "na")
    if pol not in POLITICAL_VALUES:
        pol = "na"
    mt = parsed.get("media_type", "sonstige")
    if mt not in MEDIA_TYPE_VALUES:
        mt = "sonstige"
    rel = parsed.get("reliability", "na")
    if rel not in RELIABILITY_VALUES:
        rel = "na"

    aligns_raw = parsed.get("alignments") or []
    if not isinstance(aligns_raw, list):
        aligns_raw = []
    # Deduplicate + sort so the JSON stored in the DB is deterministic.
    aligns = sorted({a for a in aligns_raw if isinstance(a, str) and a in ALIGNMENT_VALUES})

    sa = bool(parsed.get("state_affiliated", False))

    cc = parsed.get("country_code")
    # ISO 3166-1 alpha-2 codes are two ASCII letters; str.isalpha() alone
    # would also accept non-ASCII letters (e.g. "dé"), so require isascii().
    if isinstance(cc, str) and len(cc) == 2 and cc.isascii() and cc.isalpha():
        cc = cc.upper()
    else:
        cc = None

    try:
        confidence = float(parsed.get("confidence", 0.5))
    except (TypeError, ValueError):
        confidence = 0.5
    if confidence != confidence:  # NaN would otherwise survive the min/max clamp
        confidence = 0.5
    confidence = max(0.0, min(1.0, confidence))

    # `or ""` so an explicit None does not become the literal string "None".
    reasoning = str(parsed.get("reasoning") or "")[:1000]
    return {
        "political_orientation": pol,
        "media_type": mt,
        "reliability": rel,
        "alignments": aligns,
        "state_affiliated": sa,
        "country_code": cc,
        "confidence": confidence,
        "reasoning": reasoning,
    }
|
||||
|
||||
|
||||
async def classify_source(
    db: aiosqlite.Connection,
    source_id: int,
    sample_limit: int = 5,
    model: str = CLAUDE_MODEL_FAST,
) -> dict:
    """Classify a single source and write the proposals into the proposed_* columns.

    Loads the source row plus up to `sample_limit` recent headlines, asks the
    LLM for a JSON classification, validates it, and persists the proposal
    (committed). Raises ValueError if the source does not exist or the model
    returned no parseable JSON.
    """
    cursor = await db.execute(
        "SELECT id, name, url, domain, source_type, category, language, bias, "
        "classification_source FROM sources WHERE id = ?",
        (source_id,),
    )
    row = await cursor.fetchone()
    if not row:
        raise ValueError(f"Quelle {source_id} nicht gefunden")
    src = dict(row)

    sample = await _load_sample_articles(db, src["name"], src.get("domain"), sample_limit)
    prompt = _build_prompt(src, sample)
    response, usage = await call_claude(prompt, tools=None, model=model)

    # Extract the outermost JSON object from the (possibly chatty) response.
    json_match = re.search(r"\{.*\}", response, re.DOTALL)
    if not json_match:
        raise ValueError(f"Keine JSON-Antwort von Claude fuer source_id={source_id}: {response[:200]}")
    parsed = json.loads(json_match.group(0))
    result = _validate(parsed)

    # Only move classification_source to 'llm_pending' when it is not already
    # manually set or approved — those states must survive re-classification.
    new_src = "CASE WHEN classification_source IN ('manual','llm_approved') THEN classification_source ELSE 'llm_pending' END"
    await db.execute(
        f"""UPDATE sources SET
           proposed_political_orientation = ?,
           proposed_media_type = ?,
           proposed_reliability = ?,
           proposed_state_affiliated = ?,
           proposed_country_code = ?,
           proposed_alignments_json = ?,
           proposed_confidence = ?,
           proposed_reasoning = ?,
           proposed_at = CURRENT_TIMESTAMP,
           classification_source = {new_src}
           WHERE id = ?""",
        (
            result["political_orientation"],
            result["media_type"],
            result["reliability"],
            1 if result["state_affiliated"] else 0,
            result["country_code"],
            json.dumps(result["alignments"], ensure_ascii=False),
            result["confidence"],
            result["reasoning"],
            source_id,
        ),
    )
    await db.commit()

    logger.info(
        "Klassifiziert source_id=%s '%s' -> %s/%s/%s conf=%.2f ($%.4f)",
        source_id, src["name"], result["political_orientation"],
        result["media_type"], result["reliability"], result["confidence"],
        usage.cost_usd,
    )

    # Enrich the validated result with identifiers and token/cost accounting
    # so callers (API, bulk runner) can report on it.
    result["source_id"] = source_id
    result["usage"] = {
        "cost_usd": usage.cost_usd,
        "input_tokens": usage.input_tokens,
        "output_tokens": usage.output_tokens,
    }
    return result
|
||||
|
||||
|
||||
async def bulk_classify(
    db: aiosqlite.Connection,
    limit: int = 50,
    only_unclassified: bool = True,
    model: str = CLAUDE_MODEL_FAST,
) -> dict:
    """Classify not-yet-classified sources, one at a time.

    Args:
        limit: maximum number of sources handled per call.
        only_unclassified: if True, only classification_source='legacy';
            if False, also re-classify sources in 'llm_pending'.

    Returns:
        Summary dict: processed/success counts, per-source errors, total cost.
    """
    where = (
        "classification_source = 'legacy'"
        if only_unclassified
        else "classification_source IN ('legacy', 'llm_pending')"
    )
    cursor = await db.execute(
        f"SELECT id FROM sources WHERE {where} AND status = 'active' "
        f"AND source_type != 'excluded' ORDER BY id LIMIT ?",
        (limit,),
    )
    pending_ids = [row["id"] for row in await cursor.fetchall()]

    total_cost = 0.0
    success = 0
    errors: list[dict] = []

    for source_id in pending_ids:
        try:
            outcome = await classify_source(db, source_id, model=model)
            total_cost += outcome["usage"]["cost_usd"]
            success += 1
        except asyncio.CancelledError:
            # Never swallow task cancellation — let it propagate.
            raise
        except Exception as e:
            logger.error("Klassifikation source_id=%s fehlgeschlagen: %s", source_id, e, exc_info=True)
            errors.append({"source_id": source_id, "error": str(e)})

    logger.info(
        "Bulk-Klassifikation fertig: %d/%d erfolgreich, $%.4f Kosten, %d Fehler",
        success, len(pending_ids), total_cost, len(errors),
    )
    return {
        "processed": len(pending_ids),
        "success": success,
        "errors": errors,
        "total_cost_usd": total_cost,
    }
|
||||
In neuem Issue referenzieren
Einen Benutzer sperren