feat(sources): PDF-Upload als neuer Quellentyp pdf_document
- POST /api/sources/global/upload-pdf: multipart File-Upload,
50 MB Limit, SHA256-Dedup, speichert PDF unter <dirname(DB)>/pdfs/{sha}.pdf,
legt Source mit processed_at=NULL an (Monitor verarbeitet asynchron)
- pattern in GlobalSourceUpdate um pdf_document erweitert (2x)
- dashboard.html: Button + Modal im Grundquellen-Sub-Tab
- sources.js: openPdfUploadModal + setupPdfUploadForm + FormData-Submit
- app.js: API.upload(path, formData) Helper fuer multipart
- requirements.txt: pypdf (Validierung optional)
Dieser Commit ist enthalten in:
@@ -1,9 +1,12 @@
|
||||
"""Grundquellen-Verwaltung und Kundenquellen-Übersicht."""
|
||||
import json
|
||||
import logging
|
||||
import hashlib
|
||||
import os
|
||||
import re
|
||||
import uuid
|
||||
|
||||
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Request, status
|
||||
from fastapi import APIRouter, BackgroundTasks, Depends, File, Form, HTTPException, Request, UploadFile, status
|
||||
from fastapi.responses import StreamingResponse
|
||||
from pydantic import BaseModel, Field
|
||||
from typing import Optional
|
||||
@@ -13,7 +16,7 @@ from auth import get_current_admin
|
||||
from database import db_dependency, get_db
|
||||
from audit import log_action, get_client_ip
|
||||
from source_meta import get_meta
|
||||
from config import HEALTH_CHECK_USER_AGENT, HEALTH_CHECK_TIMEOUT_S
|
||||
from config import HEALTH_CHECK_USER_AGENT, HEALTH_CHECK_TIMEOUT_S, DB_PATH
|
||||
from shared.source_rules import (
|
||||
discover_source,
|
||||
discover_all_feeds,
|
||||
@@ -115,7 +118,7 @@ class GlobalSourceCreate(BaseModel):
|
||||
name: str = Field(min_length=1, max_length=200)
|
||||
url: Optional[str] = None
|
||||
domain: Optional[str] = None
|
||||
source_type: str = Field(default="rss_feed", pattern="^(rss_feed|web_source|excluded|telegram_channel|podcast_feed)$")
|
||||
source_type: str = Field(default="rss_feed", pattern="^(rss_feed|web_source|excluded|telegram_channel|podcast_feed|pdf_document)$")
|
||||
category: str = Field(default="sonstige")
|
||||
status: str = Field(default="active", pattern="^(active|inactive)$")
|
||||
notes: Optional[str] = None
|
||||
@@ -128,7 +131,7 @@ class GlobalSourceUpdate(BaseModel):
|
||||
name: Optional[str] = Field(default=None, max_length=200)
|
||||
url: Optional[str] = None
|
||||
domain: Optional[str] = None
|
||||
source_type: Optional[str] = Field(default=None, pattern="^(rss_feed|web_source|excluded|telegram_channel|podcast_feed)$")
|
||||
source_type: Optional[str] = Field(default=None, pattern="^(rss_feed|web_source|excluded|telegram_channel|podcast_feed|pdf_document)$")
|
||||
category: Optional[str] = None
|
||||
status: Optional[str] = Field(default=None, pattern="^(active|inactive)$")
|
||||
notes: Optional[str] = None
|
||||
@@ -1502,3 +1505,116 @@ async def bulk_approve_classifications(
|
||||
after={"bulk_approved_ids": approved_ids, "min_confidence": min_confidence},
|
||||
)
|
||||
return {"approved": len(approved_ids), "ids": approved_ids}
|
||||
|
||||
|
||||
# --- PDF-Upload (Quelle vom Typ pdf_document) ---
|
||||
# Speicherort relativ zur DB: <dirname(DB_PATH)>/pdfs/{sha256}.pdf
|
||||
# Der Monitor pollt pdf_document-Quellen mit processed_at IS NULL und
|
||||
# extrahiert Text + Uebersetzungen (DE/EN). Dieser Endpoint legt nur die
|
||||
# Datei + den Source-Eintrag an (kein LLM-Call hier).
|
||||
|
||||
MAX_PDF_SIZE_BYTES = 50 * 1024 * 1024 # 50 MB
|
||||
PDF_DIR = os.path.join(os.path.dirname(os.path.abspath(DB_PATH)), "pdfs")
|
||||
|
||||
|
||||
def _pdf_dir() -> str:
|
||||
os.makedirs(PDF_DIR, exist_ok=True)
|
||||
return PDF_DIR
|
||||
|
||||
|
||||
@router.post("/global/upload-pdf", status_code=201)
|
||||
async def upload_pdf_source(
|
||||
request: Request,
|
||||
admin: dict = Depends(get_current_admin),
|
||||
db: aiosqlite.Connection = Depends(db_dependency),
|
||||
file: UploadFile = File(...),
|
||||
name: Optional[str] = Form(None),
|
||||
category: str = Form("sonstige"),
|
||||
language: Optional[str] = Form(None),
|
||||
notes: Optional[str] = Form(None),
|
||||
):
|
||||
"""PDF hochladen + als Grundquelle (source_type=pdf_document) registrieren.
|
||||
|
||||
Idempotent ueber SHA256: bestehender Eintrag wird zurueckgegeben (409 mit
|
||||
Detail), die Datei wird nicht erneut gespeichert.
|
||||
"""
|
||||
# Magic-Bytes-Check (PDF beginnt mit %PDF-)
|
||||
head = await file.read(8)
|
||||
if not head.startswith(b"%PDF-"):
|
||||
raise HTTPException(status_code=415, detail="Datei ist kein gueltiges PDF (Magic-Bytes fehlen)")
|
||||
|
||||
# Datei streaming in Temp lesen + sha256 berechnen + Groesse pruefen
|
||||
sha = hashlib.sha256()
|
||||
sha.update(head)
|
||||
total = len(head)
|
||||
tmp_path = os.path.join(_pdf_dir(), f".upload-{uuid.uuid4().hex}.tmp")
|
||||
try:
|
||||
with open(tmp_path, "wb") as out:
|
||||
out.write(head)
|
||||
while True:
|
||||
chunk = await file.read(1024 * 1024)
|
||||
if not chunk:
|
||||
break
|
||||
total += len(chunk)
|
||||
if total > MAX_PDF_SIZE_BYTES:
|
||||
raise HTTPException(status_code=413, detail=f"PDF ueberschreitet Maximum von {MAX_PDF_SIZE_BYTES // 1024 // 1024} MB")
|
||||
sha.update(chunk)
|
||||
out.write(chunk)
|
||||
sha_hex = sha.hexdigest()
|
||||
final_path = os.path.join(_pdf_dir(), f"{sha_hex}.pdf")
|
||||
rel_path = os.path.join("pdfs", f"{sha_hex}.pdf")
|
||||
|
||||
# Duplikat-Check ueber sha256
|
||||
cursor = await db.execute(
|
||||
"SELECT id, name FROM sources WHERE pdf_sha256 = ? AND tenant_id IS NULL",
|
||||
(sha_hex,),
|
||||
)
|
||||
existing = await cursor.fetchone()
|
||||
if existing:
|
||||
# Datei wegwerfen, bestehende Quelle zurueckgeben
|
||||
os.unlink(tmp_path)
|
||||
raise HTTPException(
|
||||
status_code=409,
|
||||
detail=f"PDF bereits hochgeladen als Quelle '{existing['name']}' (id={existing['id']})",
|
||||
)
|
||||
|
||||
# Atomar umbenennen
|
||||
if not os.path.exists(final_path):
|
||||
os.replace(tmp_path, final_path)
|
||||
else:
|
||||
# Datei mit gleichem sha existiert physisch, aber keine Source -> wiederverwenden
|
||||
os.unlink(tmp_path)
|
||||
except HTTPException:
|
||||
if os.path.exists(tmp_path):
|
||||
try: os.unlink(tmp_path)
|
||||
except OSError: pass
|
||||
raise
|
||||
except Exception as e:
|
||||
if os.path.exists(tmp_path):
|
||||
try: os.unlink(tmp_path)
|
||||
except OSError: pass
|
||||
logger.exception("PDF-Upload fehlgeschlagen")
|
||||
raise HTTPException(status_code=500, detail=f"PDF-Upload fehlgeschlagen: {e}")
|
||||
|
||||
# Name herleiten falls nicht angegeben
|
||||
display_name = (name or "").strip() or re.sub(r"\.pdf$", "", file.filename or "PDF", flags=re.I)
|
||||
display_name = display_name[:200]
|
||||
|
||||
cursor = await db.execute(
|
||||
"""INSERT INTO sources
|
||||
(name, url, domain, source_type, category, status, notes, language,
|
||||
pdf_path, pdf_sha256, added_by, tenant_id)
|
||||
VALUES (?, NULL, NULL, 'pdf_document', ?, 'active', ?, ?, ?, ?, ?, NULL)""",
|
||||
(display_name, category, notes, language, rel_path, sha_hex, admin.get("email") or "system"),
|
||||
)
|
||||
src_id = cursor.lastrowid
|
||||
await db.commit()
|
||||
|
||||
cursor = await db.execute("SELECT * FROM sources WHERE id = ?", (src_id,))
|
||||
new_src = dict(await cursor.fetchone())
|
||||
await log_action(
|
||||
db, admin, get_client_ip(request),
|
||||
action="upload_pdf", resource_type="source", resource_id=src_id,
|
||||
after={"name": display_name, "pdf_sha256": sha_hex, "size_bytes": total},
|
||||
)
|
||||
return new_src
|
||||
|
||||
In neuem Issue referenzieren
Einen Benutzer sperren