diff --git a/requirements.txt b/requirements.txt index 9903104..10fa183 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,5 @@ python-multipart aiosmtplib httpx>=0.28 feedparser>=6.0 +# PDF-Upload-Validierung +pypdf>=5.0 diff --git a/src/routers/sources.py b/src/routers/sources.py index b238423..458893b 100644 --- a/src/routers/sources.py +++ b/src/routers/sources.py @@ -1,9 +1,12 @@ """Grundquellen-Verwaltung und Kundenquellen-Übersicht.""" import json import logging +import hashlib +import os +import re import uuid -from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Request, status +from fastapi import APIRouter, BackgroundTasks, Depends, File, Form, HTTPException, Request, UploadFile, status from fastapi.responses import StreamingResponse from pydantic import BaseModel, Field from typing import Optional @@ -13,7 +16,7 @@ from auth import get_current_admin from database import db_dependency, get_db from audit import log_action, get_client_ip from source_meta import get_meta -from config import HEALTH_CHECK_USER_AGENT, HEALTH_CHECK_TIMEOUT_S +from config import HEALTH_CHECK_USER_AGENT, HEALTH_CHECK_TIMEOUT_S, DB_PATH from shared.source_rules import ( discover_source, discover_all_feeds, @@ -115,7 +118,7 @@ class GlobalSourceCreate(BaseModel): name: str = Field(min_length=1, max_length=200) url: Optional[str] = None domain: Optional[str] = None - source_type: str = Field(default="rss_feed", pattern="^(rss_feed|web_source|excluded|telegram_channel|podcast_feed)$") + source_type: str = Field(default="rss_feed", pattern="^(rss_feed|web_source|excluded|telegram_channel|podcast_feed|pdf_document)$") category: str = Field(default="sonstige") status: str = Field(default="active", pattern="^(active|inactive)$") notes: Optional[str] = None @@ -128,7 +131,7 @@ class GlobalSourceUpdate(BaseModel): name: Optional[str] = Field(default=None, max_length=200) url: Optional[str] = None domain: Optional[str] = None - source_type: Optional[str] = Field(default=None, pattern="^(rss_feed|web_source|excluded|telegram_channel|podcast_feed)$") + source_type: Optional[str] = Field(default=None, pattern="^(rss_feed|web_source|excluded|telegram_channel|podcast_feed|pdf_document)$") category: Optional[str] = None status: Optional[str] = Field(default=None, pattern="^(active|inactive)$") notes: Optional[str] = None @@ -1502,3 +1505,116 @@ async def bulk_approve_classifications( after={"bulk_approved_ids": approved_ids, "min_confidence": min_confidence}, ) return {"approved": len(approved_ids), "ids": approved_ids} + + +# --- PDF-Upload (Quelle vom Typ pdf_document) --- +# Speicherort relativ zur DB: /pdfs/{sha256}.pdf +# Der Monitor pollt pdf_document-Quellen mit processed_at IS NULL und +# extrahiert Text + Uebersetzungen (DE/EN). Dieser Endpoint legt nur die +# Datei + den Source-Eintrag an (kein LLM-Call hier). + +MAX_PDF_SIZE_BYTES = 50 * 1024 * 1024 # 50 MB +PDF_DIR = os.path.join(os.path.dirname(os.path.abspath(DB_PATH)), "pdfs") + + +def _pdf_dir() -> str: + os.makedirs(PDF_DIR, exist_ok=True) + return PDF_DIR + + +@router.post("/global/upload-pdf", status_code=201) +async def upload_pdf_source( + request: Request, + admin: dict = Depends(get_current_admin), + db: aiosqlite.Connection = Depends(db_dependency), + file: UploadFile = File(...), + name: Optional[str] = Form(None), + category: str = Form("sonstige"), + language: Optional[str] = Form(None), + notes: Optional[str] = Form(None), +): + """PDF hochladen + als Grundquelle (source_type=pdf_document) registrieren. + + Idempotent ueber SHA256: bestehender Eintrag wird zurueckgegeben (409 mit + Detail), die Datei wird nicht erneut gespeichert. + """ + # Magic-Bytes-Check (PDF beginnt mit %PDF-) + head = await file.read(8) + if not head.startswith(b"%PDF-"): + raise HTTPException(status_code=415, detail="Datei ist kein gueltiges PDF (Magic-Bytes fehlen)") + + # Datei streaming in Temp lesen + sha256 berechnen + Groesse pruefen + sha = hashlib.sha256() + sha.update(head) + total = len(head) + tmp_path = os.path.join(_pdf_dir(), f".upload-{uuid.uuid4().hex}.tmp") + try: + with open(tmp_path, "wb") as out: + out.write(head) + while True: + chunk = await file.read(1024 * 1024) + if not chunk: + break + total += len(chunk) + if total > MAX_PDF_SIZE_BYTES: + raise HTTPException(status_code=413, detail=f"PDF ueberschreitet Maximum von {MAX_PDF_SIZE_BYTES // 1024 // 1024} MB") + sha.update(chunk) + out.write(chunk) + sha_hex = sha.hexdigest() + final_path = os.path.join(_pdf_dir(), f"{sha_hex}.pdf") + rel_path = os.path.join("pdfs", f"{sha_hex}.pdf") + + # Duplikat-Check ueber sha256 + cursor = await db.execute( + "SELECT id, name FROM sources WHERE pdf_sha256 = ? AND tenant_id IS NULL", + (sha_hex,), + ) + existing = await cursor.fetchone() + if existing: + # Datei wegwerfen, bestehende Quelle zurueckgeben + os.unlink(tmp_path) + raise HTTPException( + status_code=409, + detail=f"PDF bereits hochgeladen als Quelle '{existing['name']}' (id={existing['id']})", + ) + + # Atomar umbenennen + if not os.path.exists(final_path): + os.replace(tmp_path, final_path) + else: + # Datei mit gleichem sha existiert physisch, aber keine Source -> wiederverwenden + os.unlink(tmp_path) + except HTTPException: + if os.path.exists(tmp_path): + try: os.unlink(tmp_path) + except OSError: pass + raise + except Exception as e: + if os.path.exists(tmp_path): + try: os.unlink(tmp_path) + except OSError: pass + logger.exception("PDF-Upload fehlgeschlagen") + raise HTTPException(status_code=500, detail=f"PDF-Upload fehlgeschlagen: {e}") + + # Name herleiten falls nicht angegeben + display_name = (name or "").strip() or re.sub(r"\.pdf$", "", file.filename or "PDF", flags=re.I) + display_name = display_name[:200] + + cursor = await db.execute( + """INSERT INTO sources + (name, url, domain, source_type, category, status, notes, language, + pdf_path, pdf_sha256, added_by, tenant_id) + VALUES (?, NULL, NULL, 'pdf_document', ?, 'active', ?, ?, ?, ?, ?, NULL)""", + (display_name, category, notes, language, rel_path, sha_hex, admin.get("email") or "system"), + ) + src_id = cursor.lastrowid + await db.commit() + + cursor = await db.execute("SELECT * FROM sources WHERE id = ?", (src_id,)) + new_src = dict(await cursor.fetchone()) + await log_action( + db, admin, get_client_ip(request), + action="upload_pdf", resource_type="source", resource_id=src_id, + after={"name": display_name, "pdf_sha256": sha_hex, "size_bytes": total}, + ) + return new_src diff --git a/src/static/dashboard.html b/src/static/dashboard.html index f98aaa1..40ceb38 100644 --- a/src/static/dashboard.html +++ b/src/static/dashboard.html @@ -328,6 +328,7 @@ +
@@ -641,6 +642,7 @@ +
@@ -799,6 +801,59 @@
+ + +