feat(sources): PDF-Upload als neuer Quellentyp pdf_document
- POST /api/sources/global/upload-pdf: multipart File-Upload,
50 MB Limit, SHA256-Dedup, speichert PDF unter <dirname(DB)>/pdfs/{sha}.pdf,
legt Source mit processed_at=NULL an (Monitor verarbeitet asynchron)
- pattern in GlobalSourceCreate und GlobalSourceUpdate um pdf_document erweitert (2x)
- dashboard.html: Button + Modal im Grundquellen-Sub-Tab
- sources.js: openPdfUploadModal + setupPdfUploadForm + FormData-Submit
- app.js: API.upload(path, formData) Helper fuer multipart
- requirements.txt: pypdf (Validierung optional)
Dieser Commit ist enthalten in:
@@ -7,3 +7,5 @@ python-multipart
|
||||
aiosmtplib
|
||||
httpx>=0.28
|
||||
feedparser>=6.0
|
||||
# PDF-Upload-Validierung
|
||||
pypdf>=5.0
|
||||
|
||||
@@ -1,9 +1,12 @@
|
||||
"""Grundquellen-Verwaltung und Kundenquellen-Übersicht."""
|
||||
import json
|
||||
import logging
|
||||
import hashlib
|
||||
import os
|
||||
import re
|
||||
import uuid
|
||||
|
||||
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Request, status
|
||||
from fastapi import APIRouter, BackgroundTasks, Depends, File, Form, HTTPException, Request, UploadFile, status
|
||||
from fastapi.responses import StreamingResponse
|
||||
from pydantic import BaseModel, Field
|
||||
from typing import Optional
|
||||
@@ -13,7 +16,7 @@ from auth import get_current_admin
|
||||
from database import db_dependency, get_db
|
||||
from audit import log_action, get_client_ip
|
||||
from source_meta import get_meta
|
||||
from config import HEALTH_CHECK_USER_AGENT, HEALTH_CHECK_TIMEOUT_S
|
||||
from config import HEALTH_CHECK_USER_AGENT, HEALTH_CHECK_TIMEOUT_S, DB_PATH
|
||||
from shared.source_rules import (
|
||||
discover_source,
|
||||
discover_all_feeds,
|
||||
@@ -115,7 +118,7 @@ class GlobalSourceCreate(BaseModel):
|
||||
name: str = Field(min_length=1, max_length=200)
|
||||
url: Optional[str] = None
|
||||
domain: Optional[str] = None
|
||||
source_type: str = Field(default="rss_feed", pattern="^(rss_feed|web_source|excluded|telegram_channel|podcast_feed)$")
|
||||
source_type: str = Field(default="rss_feed", pattern="^(rss_feed|web_source|excluded|telegram_channel|podcast_feed|pdf_document)$")
|
||||
category: str = Field(default="sonstige")
|
||||
status: str = Field(default="active", pattern="^(active|inactive)$")
|
||||
notes: Optional[str] = None
|
||||
@@ -128,7 +131,7 @@ class GlobalSourceUpdate(BaseModel):
|
||||
name: Optional[str] = Field(default=None, max_length=200)
|
||||
url: Optional[str] = None
|
||||
domain: Optional[str] = None
|
||||
source_type: Optional[str] = Field(default=None, pattern="^(rss_feed|web_source|excluded|telegram_channel|podcast_feed)$")
|
||||
source_type: Optional[str] = Field(default=None, pattern="^(rss_feed|web_source|excluded|telegram_channel|podcast_feed|pdf_document)$")
|
||||
category: Optional[str] = None
|
||||
status: Optional[str] = Field(default=None, pattern="^(active|inactive)$")
|
||||
notes: Optional[str] = None
|
||||
@@ -1502,3 +1505,116 @@ async def bulk_approve_classifications(
|
||||
after={"bulk_approved_ids": approved_ids, "min_confidence": min_confidence},
|
||||
)
|
||||
return {"approved": len(approved_ids), "ids": approved_ids}
|
||||
|
||||
|
||||
# --- PDF upload (sources of type pdf_document) ---
# Storage location relative to the database: <dirname(DB_PATH)>/pdfs/{sha256}.pdf
# Per the original author's note, the monitor polls pdf_document sources with
# processed_at IS NULL and extracts text + translations (DE/EN). The endpoint
# here only stores the file and creates the source row (no LLM call).

MAX_PDF_SIZE_BYTES = 50 * 1024 * 1024  # 50 MB
PDF_DIR = os.path.join(os.path.dirname(os.path.abspath(DB_PATH)), "pdfs")


def _pdf_dir() -> str:
    """Return the PDF storage directory, creating it if it does not exist yet."""
    directory = PDF_DIR
    os.makedirs(directory, exist_ok=True)
    return directory
|
||||
|
||||
|
||||
def _discard_pdf_tmp(path: str) -> None:
    """Remove a temporary upload file on a best-effort basis."""
    try:
        os.unlink(path)
    except OSError:
        # Already gone (or not removable); cleanup must never mask the
        # error that is currently being reported to the client.
        pass


def _pdf_display_name(name: Optional[str], filename: Optional[str]) -> str:
    """Pick the source name: explicit name, else file name without ".pdf", else "PDF"."""
    explicit = (name or "").strip()
    if explicit:
        return explicit[:200]
    stem = re.sub(r"\.pdf$", "", filename or "", flags=re.I).strip()
    # A file literally called ".pdf" would otherwise give an empty name,
    # which GlobalSourceCreate (min_length=1) does not allow for sources.
    return (stem or "PDF")[:200]


@router.post("/global/upload-pdf", status_code=201)
async def upload_pdf_source(
    request: Request,
    admin: dict = Depends(get_current_admin),
    db: aiosqlite.Connection = Depends(db_dependency),
    file: UploadFile = File(...),
    name: Optional[str] = Form(None),
    category: str = Form("sonstige"),
    language: Optional[str] = Form(None),
    notes: Optional[str] = Form(None),
):
    """Upload a PDF and register it as a global source (source_type=pdf_document).

    The upload is streamed to a temporary file inside the PDF directory while
    its SHA-256 and size are computed, then moved to ``{sha256}.pdf``. The new
    row is inserted without setting ``processed_at``.

    Args:
        request: Incoming request; used to resolve the client IP for the audit log.
        admin: Authenticated admin; ``admin["email"]`` is stored as ``added_by``.
        db: Database connection.
        file: The uploaded PDF.
        name: Optional display name; defaults to the file name without ".pdf".
        category: Source category.
        language: Optional language of the document.
        notes: Optional free-text notes.

    Returns:
        The newly created ``sources`` row as a dict.

    Raises:
        HTTPException: 415 if the file does not start with ``%PDF-``, 413 if it
            exceeds ``MAX_PDF_SIZE_BYTES``, 409 if a global source with the same
            SHA-256 already exists, 500 on any other failure while storing.
    """
    # Magic-bytes check: a PDF starts with "%PDF-".
    head = await file.read(8)
    if not head.startswith(b"%PDF-"):
        raise HTTPException(status_code=415, detail="Datei ist kein gueltiges PDF (Magic-Bytes fehlen)")

    sha = hashlib.sha256()
    sha.update(head)
    total = len(head)
    pdf_dir = _pdf_dir()
    # Temp file lives in the target directory so os.replace stays on one filesystem.
    tmp_path = os.path.join(pdf_dir, f".upload-{uuid.uuid4().hex}.tmp")
    try:
        with open(tmp_path, "wb") as out:
            out.write(head)
            while True:
                chunk = await file.read(1024 * 1024)
                if not chunk:
                    break
                total += len(chunk)
                # Check before writing so an oversized upload never lands on disk in full.
                if total > MAX_PDF_SIZE_BYTES:
                    raise HTTPException(status_code=413, detail=f"PDF ueberschreitet Maximum von {MAX_PDF_SIZE_BYTES // 1024 // 1024} MB")
                sha.update(chunk)
                out.write(chunk)

        sha_hex = sha.hexdigest()
        final_path = os.path.join(pdf_dir, f"{sha_hex}.pdf")
        rel_path = os.path.join("pdfs", f"{sha_hex}.pdf")

        # Duplicate check via sha256 among global sources (tenant_id IS NULL).
        cursor = await db.execute(
            "SELECT id, name FROM sources WHERE pdf_sha256 = ? AND tenant_id IS NULL",
            (sha_hex,),
        )
        existing = await cursor.fetchone()
        if existing:
            # The temp file is removed by the HTTPException handler below.
            raise HTTPException(
                status_code=409,
                detail=f"PDF bereits hochgeladen als Quelle '{existing['name']}' (id={existing['id']})",
            )

        if os.path.exists(final_path):
            # A file with this hash exists on disk but has no source row: reuse it.
            _discard_pdf_tmp(tmp_path)
        else:
            os.replace(tmp_path, final_path)
    except HTTPException:
        _discard_pdf_tmp(tmp_path)
        raise
    except Exception as e:
        _discard_pdf_tmp(tmp_path)
        logger.exception("PDF-Upload fehlgeschlagen")
        raise HTTPException(status_code=500, detail=f"PDF-Upload fehlgeschlagen: {e}") from e

    display_name = _pdf_display_name(name, file.filename)

    cursor = await db.execute(
        """INSERT INTO sources
           (name, url, domain, source_type, category, status, notes, language,
            pdf_path, pdf_sha256, added_by, tenant_id)
           VALUES (?, NULL, NULL, 'pdf_document', ?, 'active', ?, ?, ?, ?, ?, NULL)""",
        (display_name, category, notes, language, rel_path, sha_hex, admin.get("email") or "system"),
    )
    src_id = cursor.lastrowid
    await db.commit()

    cursor = await db.execute("SELECT * FROM sources WHERE id = ?", (src_id,))
    new_src = dict(await cursor.fetchone())
    await log_action(
        db, admin, get_client_ip(request),
        action="upload_pdf", resource_type="source", resource_id=src_id,
        after={"name": display_name, "pdf_sha256": sha_hex, "size_bytes": total},
    )
    return new_src
|
||||
|
||||
@@ -328,6 +328,7 @@
|
||||
<span class="text-secondary" id="globalSourceCount"></span>
|
||||
</div>
|
||||
<button class="btn btn-secondary" id="discoverSourceBtn">Erkennen</button>
|
||||
<button class="btn btn-secondary" id="newPdfSourceBtn" style="margin-right:8px;">+ PDF hochladen</button>
|
||||
<button class="btn btn-primary" id="newGlobalSourceBtn">+ Neue Grundquelle</button>
|
||||
</div>
|
||||
<div class="card">
|
||||
@@ -641,6 +642,7 @@
|
||||
<option value="telegram_channel">Telegram-Kanal</option>
|
||||
<option value="podcast_feed">Podcast-Feed</option>
|
||||
<option value="excluded">Ausgeschlossen</option>
|
||||
<option value="pdf_document" disabled>PDF-Dokument (nur Upload)</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="form-group">
|
||||
@@ -799,6 +801,59 @@
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Modal: PDF hochladen -->
|
||||
<div class="modal-overlay" id="modalPdfUpload">
    <div class="modal">
        <div class="modal-header">
            <h3>PDF als Quelle hochladen</h3>
            <!-- Pass the modal id as a string, matching closeModal("modalPdfUpload") in sources.js -->
            <button class="modal-close" onclick="closeModal('modalPdfUpload')">×</button>
        </div>
        <form id="pdfUploadForm" enctype="multipart/form-data">
            <div class="modal-body">
                <p class="text-secondary" style="margin-top:0;">
                    Die PDF wird gespeichert und vom Monitor automatisch verarbeitet:
                    Text extrahieren (OCR-Fallback fuer gescannte Dokumente),
                    Übersetzung nach Deutsch und Englisch.
                </p>
                <div class="form-group">
                    <label for="pdfFile">PDF-Datei (max. 50 MB)</label>
                    <input type="file" id="pdfFile" accept="application/pdf,.pdf" required>
                </div>
                <div class="form-group">
                    <label for="pdfName">Anzeige-Name (optional)</label>
                    <input type="text" id="pdfName" maxlength="200" placeholder="leer = Dateiname">
                </div>
                <div style="display:grid;grid-template-columns:1fr 1fr;gap:12px;">
                    <div class="form-group">
                        <label for="pdfCategory">Kategorie</label>
                        <select id="pdfCategory">
                            <option value="sonstige" selected>Sonstige</option>
                            <option value="behoerde">Behörde</option>
                            <option value="think-tank">Think-Tank</option>
                            <option value="fachmedien">Fachmedien</option>
                            <option value="international">International</option>
                        </select>
                    </div>
                    <div class="form-group">
                        <label for="pdfLanguage">Sprache (optional)</label>
                        <input type="text" id="pdfLanguage" list="languageSuggestions" placeholder="z.B. Deutsch, Englisch">
                    </div>
                </div>
                <div class="form-group">
                    <label for="pdfNotes">Notizen</label>
                    <input type="text" id="pdfNotes" placeholder="Optional">
                </div>
                <div id="pdfUploadError" class="error-msg" style="display:none"></div>
                <div id="pdfUploadProgress" class="text-secondary" style="display:none;margin-top:8px;">Lädt hoch …</div>
            </div>
            <div class="modal-footer">
                <!-- Same id-as-string convention as the close button above -->
                <button type="button" class="btn btn-secondary" onclick="closeModal('modalPdfUpload')">Abbrechen</button>
                <button type="submit" class="btn btn-primary" id="pdfUploadSubmitBtn">Hochladen</button>
            </div>
        </form>
    </div>
</div>
|
||||
|
||||
<!-- Modal: Discover Sources -->
|
||||
<div class="modal-overlay" id="modalDiscover">
|
||||
<div class="modal" style="max-width:600px;">
|
||||
|
||||
@@ -26,6 +26,23 @@ const API = {
|
||||
post(path, body) { return this.request(path, { method: "POST", body: JSON.stringify(body) }); },
|
||||
put(path, body) { return this.request(path, { method: "PUT", body: body ? JSON.stringify(body) : undefined }); },
|
||||
del(path) { return this.request(path, { method: "DELETE" }); },
|
||||
|
||||
async upload(path, formData) {
|
||||
const headers = {};
|
||||
if (this.token) headers["Authorization"] = `Bearer ${this.token}`;
|
||||
const res = await fetch(path, { method: "POST", headers, body: formData });
|
||||
if (res.status === 401) {
|
||||
localStorage.removeItem("token");
|
||||
localStorage.removeItem("username");
|
||||
window.location.href = "/";
|
||||
return;
|
||||
}
|
||||
if (!res.ok) {
|
||||
const data = await res.json().catch(() => ({}));
|
||||
throw new Error(data.detail || `Fehler ${res.status}`);
|
||||
}
|
||||
return res.json();
|
||||
},
|
||||
};
|
||||
|
||||
// --- State ---
|
||||
|
||||
@@ -311,6 +311,8 @@ function editGlobalSource(id) {
|
||||
|
||||
function setupSourceForms() {
|
||||
document.getElementById("newGlobalSourceBtn").addEventListener("click", openNewGlobalSource);
|
||||
document.getElementById("newPdfSourceBtn")?.addEventListener("click", openPdfUploadModal);
|
||||
setupPdfUploadForm();
|
||||
document.getElementById("discoverSourceBtn").addEventListener("click", () => {
|
||||
document.getElementById("discoverUrl").value = "";
|
||||
document.getElementById("discoverStatus").style.display = "none";
|
||||
@@ -880,3 +882,68 @@ function toggleSourceInfo(id) {
|
||||
if (btn) btn.classList.toggle("active", !isVisible);
|
||||
}
|
||||
}
|
||||
|
||||
// --- PDF-Quellen-Upload ---
|
||||
/**
 * Reset the PDF upload form, hide any previous error/progress message
 * and open the upload modal.
 */
function openPdfUploadModal() {
    document.getElementById("pdfUploadForm")?.reset();

    const errorBox = document.getElementById("pdfUploadError");
    if (errorBox) {
        errorBox.style.display = "none";
        errorBox.textContent = "";
    }

    const progressBox = document.getElementById("pdfUploadProgress");
    if (progressBox) {
        progressBox.style.display = "none";
    }

    openModal("modalPdfUpload");
}
|
||||
|
||||
/**
 * Attach the submit handler to the PDF upload form.
 *
 * Safe to call more than once: the form is marked via `data-bound` and a
 * second call returns without adding another listener. The handler validates
 * the selected file, sends it with the optional metadata fields via
 * API.upload, and on success closes the modal and reloads the source list.
 */
function setupPdfUploadForm() {
    const form = document.getElementById("pdfUploadForm");
    if (!form || form.dataset.bound === "1") return;
    form.dataset.bound = "1";

    // Client-side pre-check only; the server enforces the same 50 MB limit.
    const MAX_PDF_SIZE_BYTES = 50 * 1024 * 1024;

    form.addEventListener("submit", async (e) => {
        e.preventDefault();
        const errEl = document.getElementById("pdfUploadError");
        const progEl = document.getElementById("pdfUploadProgress");
        const submitBtn = document.getElementById("pdfUploadSubmitBtn");
        errEl.style.display = "none";

        const showError = (message) => {
            errEl.textContent = message;
            errEl.style.display = "block";
        };

        const fileInput = document.getElementById("pdfFile");
        const f = fileInput?.files?.[0];
        if (!f) {
            showError("Bitte eine PDF-Datei auswaehlen.");
            return;
        }
        if (f.size > MAX_PDF_SIZE_BYTES) {
            showError("Datei ueberschreitet 50 MB.");
            return;
        }

        // Optional fields are only sent when non-empty so the server defaults apply.
        const fd = new FormData();
        fd.append("file", f);
        const nm = document.getElementById("pdfName").value.trim();
        if (nm) fd.append("name", nm);
        fd.append("category", document.getElementById("pdfCategory").value || "sonstige");
        const lng = document.getElementById("pdfLanguage").value.trim();
        if (lng) fd.append("language", lng);
        const nt = document.getElementById("pdfNotes").value.trim();
        if (nt) fd.append("notes", nt);

        submitBtn.disabled = true;
        progEl.style.display = "block";
        try {
            const created = await API.upload("/api/sources/global/upload-pdf", fd);
            // API.upload resolves to undefined after a 401 (it has already
            // started the redirect to "/"), so do not report success then.
            if (!created) return;
            closeModal("modalPdfUpload");
            if (typeof showToast === "function") {
                showToast("PDF hochgeladen -- Verarbeitung laeuft im Hintergrund", "success");
            }
            loadGlobalSources();
        } catch (err) {
            showError(err.message || "Upload fehlgeschlagen");
        } finally {
            submitBtn.disabled = false;
            progEl.style.display = "none";
        }
    });
}
|
||||
|
||||
In neuem Issue referenzieren
Einen Benutzer sperren