feat(sources): LLM-Klassifikator + Review-API + Bulk-Migrationsskript

- src/services/source_classifier.py: classify_source(db, id) ruft Haiku mit
  strukturiertem Prompt (4 Achsen + state_affiliated + country + Konfidenz)
  und schreibt Vorschlaege in proposed_*-Spalten. bulk_classify(db, limit)
  iteriert sequenziell ueber unklassifizierte Quellen.

- API-Endpoints (alle hinter Auth, globale Quellen nur fuer org_admin):
  - GET  /api/sources/classification/stats
  - GET  /api/sources/classification/queue
  - POST /api/sources/{id}/classification/approve  (proposed_* -> echte Felder)
  - POST /api/sources/{id}/classification/reject   (proposed_* loeschen)
  - POST /api/sources/{id}/classification/reclassify (sofort, ~3-5s)
  - POST /api/sources/classification/bulk-classify  (BackgroundTask)

- scripts/migrate_sources_classification.py: CLI-Wrapper fuer Bulk-Migration
  zur einmaligen Erstbestueckung aller Bestandsquellen.

Sample-Test auf Staging steht aus.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Dieser Commit ist enthalten in:
Claude Code
2026-05-07 18:46:54 +00:00
Ursprung 715af17ac3
Commit 62ba38ae46
3 geänderte Dateien mit 598 neuen und 2 gelöschten Zeilen

Datei anzeigen

@@ -1,10 +1,12 @@
"""Sources-Router: Quellenverwaltung (Multi-Tenant)."""
import json
import logging
from collections import defaultdict
from fastapi import APIRouter, Depends, HTTPException, status
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, status
from models import SourceCreate, SourceUpdate, SourceResponse, DiscoverRequest, DiscoverResponse, DiscoverMultiResponse, DomainActionRequest
from auth import get_current_user
from database import db_dependency, refresh_source_counts
from database import db_dependency, get_db, refresh_source_counts
from services.source_classifier import bulk_classify, classify_source
from source_rules import discover_source, discover_all_feeds, evaluate_feeds_with_claude, _extract_domain, _detect_category, domain_to_display_name, _DOMAIN_ALIASES
import aiosqlite
@@ -700,3 +702,238 @@ async def trigger_refresh_counts(
"""Artikelzaehler fuer alle Quellen neu berechnen."""
await refresh_source_counts(db)
return {"status": "ok"}
# === Klassifikations-Review (LLM-Vorschlaege approve/reject/reclassify) ===
def _require_admin_for_global(row: dict, current_user: dict):
"""Globale Quellen (tenant_id IS NULL) duerfen nur org_admins approve-en/reclassify-en."""
if row.get("tenant_id") is None and current_user.get("role") != "org_admin":
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="Globale Quellen koennen nur von Admins klassifiziert werden",
)
@router.get("/classification/stats")
async def classification_stats(
    current_user: dict = Depends(get_current_user),
    db: aiosqlite.Connection = Depends(db_dependency),
):
    """Aggregate counts per classification_source value (global + caller's org).

    Returns the distribution of classification_source across active sources
    visible to the caller, the number of sources awaiting review (non-empty
    proposed_* columns), and the overall total.
    """
    tenant_id = current_user.get("tenant_id")

    # Distribution of classification_source over active, visible sources.
    dist_cursor = await db.execute(
        """SELECT classification_source, COUNT(*) as cnt
           FROM sources
           WHERE (tenant_id IS NULL OR tenant_id = ?) AND status = 'active'
           GROUP BY classification_source""",
        (tenant_id,),
    )
    distribution = {}
    for record in await dist_cursor.fetchall():
        # NULL classification_source means the row predates the classifier.
        distribution[record["classification_source"] or "legacy"] = record["cnt"]

    # Sources whose proposed_* columns are filled and awaiting a reviewer.
    pending_cursor = await db.execute(
        """SELECT COUNT(*) as cnt FROM sources
           WHERE (tenant_id IS NULL OR tenant_id = ?) AND status = 'active'
           AND proposed_political_orientation IS NOT NULL""",
        (tenant_id,),
    )
    pending = (await pending_cursor.fetchone())["cnt"]

    return {
        "by_classification_source": distribution,
        "pending_review": pending,
        "total": sum(distribution.values()),
    }
@router.get("/classification/queue")
async def classification_queue(
    limit: int = 50,
    min_confidence: float = 0.0,
    current_user: dict = Depends(get_current_user),
    db: aiosqlite.Connection = Depends(db_dependency),
):
    """Review queue: sources with pending LLM proposals (non-empty proposed_*).

    Returns current vs. proposed classification side by side so a reviewer can
    approve or reject. Highest-confidence, most recent proposals come first.

    Fix: filter on status = 'active' like classification_stats does, so the
    queue length matches the pending_review counter instead of also listing
    inactive sources.
    """
    tenant_id = current_user.get("tenant_id")
    cursor = await db.execute(
        """SELECT s.* FROM sources s
           WHERE (s.tenant_id IS NULL OR s.tenant_id = ?)
           AND s.status = 'active'
           AND s.proposed_political_orientation IS NOT NULL
           AND COALESCE(s.proposed_confidence, 0) >= ?
           ORDER BY s.proposed_confidence DESC, s.proposed_at DESC
           LIMIT ?""",
        (tenant_id, min_confidence, limit),
    )
    rows = [dict(r) for r in await cursor.fetchall()]
    # Current alignments are stored relationally; fetch them in one batch.
    alignments_map = await _load_alignments_for(db, [r["id"] for r in rows])
    out = []
    for d in rows:
        try:
            proposed_aligns = json.loads(d.get("proposed_alignments_json") or "[]")
        except (json.JSONDecodeError, TypeError):
            # Malformed LLM output: show no proposed alignments rather than 500.
            proposed_aligns = []
        out.append({
            "id": d["id"],
            "name": d["name"],
            "url": d.get("url"),
            "domain": d.get("domain"),
            "source_type": d.get("source_type"),
            "category": d.get("category"),
            "is_global": d.get("tenant_id") is None,
            # What the source is classified as right now.
            "current": {
                "political_orientation": d.get("political_orientation"),
                "media_type": d.get("media_type"),
                "reliability": d.get("reliability"),
                "state_affiliated": bool(d.get("state_affiliated")),
                "country_code": d.get("country_code"),
                "alignments": alignments_map.get(d["id"], []),
                "classification_source": d.get("classification_source"),
            },
            # What the LLM suggests, plus its confidence and reasoning.
            "proposed": {
                "political_orientation": d.get("proposed_political_orientation"),
                "media_type": d.get("proposed_media_type"),
                "reliability": d.get("proposed_reliability"),
                "state_affiliated": bool(d.get("proposed_state_affiliated")),
                "country_code": d.get("proposed_country_code"),
                "alignments": proposed_aligns,
                "confidence": d.get("proposed_confidence"),
                "reasoning": d.get("proposed_reasoning"),
                "proposed_at": d.get("proposed_at"),
            },
        })
    return out
async def _clear_proposed(db: aiosqlite.Connection, source_id: int):
    """Reset all proposed_* columns of one source to NULL (no commit).

    Deliberately does not commit so callers can batch this UPDATE with other
    statements (e.g. approve writes the real fields first) in one transaction.
    """
    await db.execute(
        """UPDATE sources SET
        proposed_political_orientation = NULL,
        proposed_media_type = NULL,
        proposed_reliability = NULL,
        proposed_state_affiliated = NULL,
        proposed_country_code = NULL,
        proposed_alignments_json = NULL,
        proposed_confidence = NULL,
        proposed_reasoning = NULL,
        proposed_at = NULL
        WHERE id = ?""",
        (source_id,),
    )
@router.post("/{source_id}/classification/approve")
async def approve_classification(
    source_id: int,
    current_user: dict = Depends(get_current_user),
    db: aiosqlite.Connection = Depends(db_dependency),
):
    """Promote proposed_* values into the real fields, set classification_source='llm_approved'.

    Raises 404 if the source does not exist or belongs to another tenant,
    403 for global sources when the caller is not an org_admin, and 400 when
    no LLM proposal is pending.

    Fix: the row was fetched by id without any tenant check, so any
    authenticated user could approve another tenant's source. Foreign-tenant
    sources now answer 404, matching the visibility filter of stats/queue.
    """
    cursor = await db.execute("SELECT * FROM sources WHERE id = ?", (source_id,))
    row = await cursor.fetchone()
    if not row:
        raise HTTPException(status_code=404, detail="Quelle nicht gefunden")
    src = dict(row)
    # Tenant isolation: sources of other orgs must look like they don't exist.
    if src.get("tenant_id") is not None and src.get("tenant_id") != current_user.get("tenant_id"):
        raise HTTPException(status_code=404, detail="Quelle nicht gefunden")
    _require_admin_for_global(src, current_user)
    if src.get("proposed_political_orientation") is None:
        raise HTTPException(status_code=400, detail="Keine LLM-Vorschlaege fuer diese Quelle vorhanden")
    try:
        proposed_aligns = json.loads(src.get("proposed_alignments_json") or "[]")
    except (json.JSONDecodeError, TypeError):
        # Malformed LLM output: approve the scalar fields, drop the alignments.
        proposed_aligns = []
    await db.execute(
        """UPDATE sources SET
        political_orientation = ?,
        media_type = ?,
        reliability = ?,
        state_affiliated = ?,
        country_code = ?,
        classification_source = 'llm_approved',
        classified_at = CURRENT_TIMESTAMP
        WHERE id = ?""",
        (
            src["proposed_political_orientation"],
            src["proposed_media_type"],
            src["proposed_reliability"],
            1 if src.get("proposed_state_affiliated") else 0,
            src.get("proposed_country_code"),
            source_id,
        ),
    )
    # Only persist alignments from the allow-list; unknown values are dropped.
    await _replace_alignments(db, source_id, [a for a in proposed_aligns if a in ALLOWED_ALIGNMENTS])
    await _clear_proposed(db, source_id)
    await db.commit()
    return {"source_id": source_id, "status": "approved"}
@router.post("/{source_id}/classification/reject")
async def reject_classification(
    source_id: int,
    current_user: dict = Depends(get_current_user),
    db: aiosqlite.Connection = Depends(db_dependency),
):
    """Discard the LLM proposals without applying them; classification_source stays unchanged.

    Fix: like approve, the lookup had no tenant check, letting any user reject
    proposals on another tenant's source. Foreign-tenant sources now answer 404.
    """
    cursor = await db.execute("SELECT * FROM sources WHERE id = ?", (source_id,))
    row = await cursor.fetchone()
    if not row:
        raise HTTPException(status_code=404, detail="Quelle nicht gefunden")
    src = dict(row)
    # Tenant isolation: sources of other orgs must look like they don't exist.
    if src.get("tenant_id") is not None and src.get("tenant_id") != current_user.get("tenant_id"):
        raise HTTPException(status_code=404, detail="Quelle nicht gefunden")
    _require_admin_for_global(src, current_user)
    await _clear_proposed(db, source_id)
    # If the source was still 'llm_pending', fall back to 'legacy'.
    if src.get("classification_source") == "llm_pending":
        await db.execute(
            "UPDATE sources SET classification_source = 'legacy' WHERE id = ?",
            (source_id,),
        )
    await db.commit()
    return {"source_id": source_id, "status": "rejected"}
@router.post("/{source_id}/classification/reclassify")
async def reclassify_source(
    source_id: int,
    current_user: dict = Depends(get_current_user),
    db: aiosqlite.Connection = Depends(db_dependency),
):
    """Trigger an LLM classification of a single source (synchronous, ~3-5s).

    Fixes: (1) tenant check added — the lookup had no tenant filter, so any
    user could burn LLM calls against another tenant's source; foreign-tenant
    sources now answer 404. (2) `raise ... from e` preserves the exception
    chain in logs/tracebacks.
    """
    cursor = await db.execute("SELECT * FROM sources WHERE id = ?", (source_id,))
    row = await cursor.fetchone()
    if not row:
        raise HTTPException(status_code=404, detail="Quelle nicht gefunden")
    src = dict(row)
    # Tenant isolation: sources of other orgs must look like they don't exist.
    if src.get("tenant_id") is not None and src.get("tenant_id") != current_user.get("tenant_id"):
        raise HTTPException(status_code=404, detail="Quelle nicht gefunden")
    _require_admin_for_global(src, current_user)
    try:
        result = await classify_source(db, source_id)
    except Exception as e:
        logger.error("Reclassify source_id=%s fehlgeschlagen: %s", source_id, e, exc_info=True)
        raise HTTPException(status_code=500, detail=f"Klassifikation fehlgeschlagen: {e}") from e
    return result
async def _bulk_classify_background(limit: int, only_unclassified: bool):
    """Background task: run bulk classification on a dedicated DB connection.

    Opens its own connection because the request-scoped one is closed once the
    response is sent. Fix: exceptions are logged instead of escaping — a
    failing background task would otherwise leave no trace of what went wrong.
    The connection is always closed via finally.
    """
    db = await get_db()
    try:
        await bulk_classify(db, limit=limit, only_unclassified=only_unclassified)
    except Exception:
        logger.exception("Bulk-Klassifikation (limit=%s) fehlgeschlagen", limit)
    finally:
        await db.close()
@router.post("/classification/bulk-classify")
async def trigger_bulk_classify(
    background_tasks: BackgroundTasks,
    limit: int = 50,
    only_unclassified: bool = True,
    current_user: dict = Depends(get_current_user),
):
    """Kick off a bulk classification run in the background (admins only).

    Validates the caller's role and the requested batch size, then schedules
    the work as a FastAPI background task and returns immediately.
    """
    # Guard clauses: role first (403), then parameter bounds (400).
    if current_user.get("role") != "org_admin":
        raise HTTPException(status_code=403, detail="Nur Admins koennen Bulk-Klassifikation starten")
    if not (1 <= limit <= 500):
        raise HTTPException(status_code=400, detail="limit muss zwischen 1 und 500 liegen")
    background_tasks.add_task(_bulk_classify_background, limit, only_unclassified)
    return {"status": "started", "limit": limit, "only_unclassified": only_unclassified}