Promote develop → main (2026-05-17 00:40 UTC)
This commit was merged in pull request #26.
Dieser Commit ist enthalten in:
@@ -16,3 +16,8 @@ Jinja2>=3.1
|
|||||||
weasyprint>=68.0
|
weasyprint>=68.0
|
||||||
python-docx>=1.2
|
python-docx>=1.2
|
||||||
pikepdf>=9.0
|
pikepdf>=9.0
|
||||||
|
# PDF-Quellen (Ingestion)
|
||||||
|
pdfplumber>=0.11
|
||||||
|
pytesseract>=0.3
|
||||||
|
pdf2image>=1.17
|
||||||
|
Pillow>=10.0
|
||||||
|
|||||||
34
scripts/migrate_pdf_source.py
Normale Datei
34
scripts/migrate_pdf_source.py
Normale Datei
@@ -0,0 +1,34 @@
|
|||||||
|
"""Idempotente Migration: Quellen-Typ pdf_document + EN-Spalten in articles.
|
||||||
|
|
||||||
|
Beim Live-Promote anwenden:
|
||||||
|
python3 scripts/migrate_pdf_source.py /home/claude-dev/osint-data/osint.db
|
||||||
|
"""
|
||||||
|
import sqlite3
|
||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
|
def add_col(db, table, col_def):
    """Add a column to ``table`` unless a column of that name already exists.

    Args:
        db: Open ``sqlite3`` connection.
        table: Name of the table to alter. It is interpolated into the SQL
            text, so it must come from trusted code, never from user input.
        col_def: Column definition as accepted by ``ALTER TABLE ... ADD COLUMN``;
            its first whitespace-separated token is taken as the column name.

    Returns:
        True if the column was added, False if it was already present.
    """
    name = col_def.split()[0]
    # SQLite matches column names case-insensitively, so compare lowercased
    # names. A case-sensitive check would miss e.g. "PDF_Path" vs "pdf_path"
    # and the ALTER TABLE below would fail with "duplicate column name".
    existing = {
        row[1].lower()
        for row in db.execute(f"PRAGMA table_info({table})").fetchall()
    }
    if name.lower() in existing:
        return False
    db.execute(f"ALTER TABLE {table} ADD COLUMN {col_def}")
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def main(path):
    """Apply the PDF-source schema migration to the SQLite database at ``path``.

    Adds ``pdf_path``, ``pdf_sha256`` and ``processed_at`` to ``sources``,
    ``headline_en`` and ``content_en`` to ``articles``, and creates the
    ``idx_sources_pdf_sha256`` index. Every step first checks whether its
    target already exists, so the script can be run repeatedly. One status
    line is printed per column, followed by ``DONE``.

    Args:
        path: Filesystem path of the SQLite database file.
    """
    db = sqlite3.connect(path)
    try:
        # As a context manager the connection only commits (or rolls back on
        # error); it does not close itself, hence the explicit close below.
        with db:
            for col in ("pdf_path TEXT", "pdf_sha256 TEXT", "processed_at TIMESTAMP"):
                print(f"sources.{col.split()[0]}:", "added" if add_col(db, "sources", col) else "exists")
            for col in ("headline_en TEXT", "content_en TEXT"):
                print(f"articles.{col.split()[0]}:", "added" if add_col(db, "articles", col) else "exists")
            db.execute("CREATE INDEX IF NOT EXISTS idx_sources_pdf_sha256 ON sources(pdf_sha256)")
    finally:
        db.close()
    print("DONE")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Exactly one CLI argument is expected: the path to the SQLite database.
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print("Usage: migrate_pdf_source.py /path/to/osint.db")
        sys.exit(1)
    main(cli_args[0])
|
||||||
@@ -298,6 +298,8 @@ async def lifespan(app: FastAPI):
|
|||||||
orchestrator.set_ws_manager(ws_manager)
|
orchestrator.set_ws_manager(ws_manager)
|
||||||
await orchestrator.start()
|
await orchestrator.start()
|
||||||
|
|
||||||
|
from services import pdf_ingest as _pdf_ingest
|
||||||
|
scheduler.add_job(_pdf_ingest.run_once, "interval", minutes=1, id="pdf_ingest", max_instances=1, coalesce=True)
|
||||||
scheduler.add_job(check_auto_refresh, "interval", minutes=1, id="auto_refresh")
|
scheduler.add_job(check_auto_refresh, "interval", minutes=1, id="auto_refresh")
|
||||||
scheduler.add_job(cleanup_expired, "interval", hours=1, id="cleanup")
|
scheduler.add_job(cleanup_expired, "interval", hours=1, id="cleanup")
|
||||||
scheduler.add_job(daily_source_health_check, "cron", hour=4, minute=0, id="source_health")
|
scheduler.add_job(daily_source_health_check, "cron", hour=4, minute=0, id="source_health")
|
||||||
|
|||||||
@@ -140,7 +140,7 @@ class IncidentListItem(BaseModel):
|
|||||||
|
|
||||||
|
|
||||||
# Sources (Quellenverwaltung)
|
# Sources (Quellenverwaltung)
|
||||||
SOURCE_TYPE_PATTERN = "^(rss_feed|web_source|excluded|telegram_channel|podcast_feed)$"
|
SOURCE_TYPE_PATTERN = "^(rss_feed|web_source|excluded|telegram_channel|podcast_feed|pdf_document)$"
|
||||||
SOURCE_CATEGORY_PATTERN = "^(nachrichtenagentur|oeffentlich-rechtlich|qualitaetszeitung|behoerde|fachmedien|think-tank|international|regional|boulevard|sonstige)$"
|
SOURCE_CATEGORY_PATTERN = "^(nachrichtenagentur|oeffentlich-rechtlich|qualitaetszeitung|behoerde|fachmedien|think-tank|international|regional|boulevard|sonstige)$"
|
||||||
SOURCE_STATUS_PATTERN = "^(active|inactive)$"
|
SOURCE_STATUS_PATTERN = "^(active|inactive)$"
|
||||||
class SourceCreate(BaseModel):
|
class SourceCreate(BaseModel):
|
||||||
|
|||||||
@@ -1,13 +1,19 @@
|
|||||||
"""Sources-Router: Quellenverwaltung (Multi-Tenant). Klassifikation: Read-Only — Pflege in der Verwaltung."""
|
"""Sources-Router: Quellenverwaltung (Multi-Tenant). Klassifikation: Read-Only — Pflege in der Verwaltung."""
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
import uuid
|
||||||
|
import re
|
||||||
|
import os
|
||||||
|
import hashlib
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from fastapi import APIRouter, Depends, HTTPException, status
|
from fastapi import APIRouter, Depends, File, Form, HTTPException, UploadFile, status
|
||||||
from models import SourceCreate, SourceUpdate, SourceResponse, DiscoverRequest, DiscoverResponse, DiscoverMultiResponse, DomainActionRequest
|
from models import SourceCreate, SourceUpdate, SourceResponse, DiscoverRequest, DiscoverResponse, DiscoverMultiResponse, DomainActionRequest
|
||||||
from auth import get_current_user
|
from auth import get_current_user
|
||||||
from database import db_dependency, refresh_source_counts
|
from database import db_dependency, refresh_source_counts
|
||||||
from source_rules import discover_source, discover_all_feeds, evaluate_feeds_with_claude, _extract_domain, _detect_category, domain_to_display_name, _DOMAIN_ALIASES
|
from source_rules import discover_source, discover_all_feeds, evaluate_feeds_with_claude, _extract_domain, _detect_category, domain_to_display_name, _DOMAIN_ALIASES
|
||||||
import aiosqlite
|
import aiosqlite
|
||||||
|
from config import DB_PATH
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
logger = logging.getLogger("osint.sources")
|
logger = logging.getLogger("osint.sources")
|
||||||
|
|
||||||
@@ -640,3 +646,110 @@ async def trigger_refresh_counts(
|
|||||||
await refresh_source_counts(db)
|
await refresh_source_counts(db)
|
||||||
return {"status": "ok"}
|
return {"status": "ok"}
|
||||||
|
|
||||||
|
|
||||||
|
# --- PDF-Upload (Kundenquelle vom Typ pdf_document) ---
|
||||||
|
# Analog zum Verwaltungs-Upload, aber tenant-spezifisch.
|
||||||
|
# Datei landet unter <dirname(DB_PATH)>/pdfs/{sha256}.pdf.
|
||||||
|
# Der Worker (services.pdf_ingest) verarbeitet sie asynchron im Minutentakt.
|
||||||
|
|
||||||
|
MAX_PDF_SIZE_BYTES = 50 * 1024 * 1024 # 50 MB
|
||||||
|
PDF_DIR = os.path.join(os.path.dirname(os.path.abspath(DB_PATH)), "pdfs")
|
||||||
|
|
||||||
|
|
||||||
|
def _pdf_dir() -> str:
    """Return the PDF storage directory, creating it first if it is missing."""
    target = PDF_DIR
    os.makedirs(target, exist_ok=True)
    return target
|
||||||
|
|
||||||
|
|
||||||
|
def _discard_tmp_upload(path: str) -> None:
    """Remove a temporary upload file; a file that is already gone is fine."""
    try:
        os.unlink(path)
    except OSError:
        # Deliberately best-effort: cleanup must never mask the original error.
        pass


@router.post("/upload-pdf", status_code=status.HTTP_201_CREATED)
async def upload_pdf_source(
    current_user: dict = Depends(get_current_user),
    db: aiosqlite.Connection = Depends(db_dependency),
    file: UploadFile = File(...),
    name: Optional[str] = Form(None),
    category: str = Form("sonstige"),
    language: Optional[str] = Form(None),
    notes: Optional[str] = Form(None),
):
    """Upload a PDF and register it as a source of type ``pdf_document``.

    The upload is streamed to a temporary file while its SHA-256 is computed,
    then stored as ``<PDF_DIR>/<sha256>.pdf``. A source row is inserted with
    the relative path and the hash.

    Duplicates are detected via the SHA-256 among the caller's tenant sources
    and the global sources (``tenant_id IS NULL``).

    Returns:
        The inserted ``sources`` row as a dict, extended with ``is_global``,
        a boolean ``state_affiliated`` and an empty ``alignments`` list.

    Raises:
        HTTPException: 415 if the file does not start with ``%PDF-``,
            422 if ``category`` is not an allowed category,
            413 if the file exceeds ``MAX_PDF_SIZE_BYTES``,
            409 if the same PDF is already registered,
            500 if storing the upload fails for any other reason.
    """
    # Local import: the pattern lives next to SourceCreate in ``models`` and
    # is only needed by this endpoint.
    from models import SOURCE_CATEGORY_PATTERN

    head = await file.read(8)
    if not head.startswith(b"%PDF-"):
        raise HTTPException(status_code=415, detail="Datei ist kein gueltiges PDF")

    # ``category`` arrives as a free-form form field; reject unknown values
    # before anything is written to disk or to the database.
    if not re.fullmatch(SOURCE_CATEGORY_PATTERN, category or ""):
        raise HTTPException(status_code=422, detail="Ungueltige Kategorie")

    tenant_id = current_user.get("tenant_id")
    sha = hashlib.sha256()
    sha.update(head)
    total = len(head)
    pdf_dir = _pdf_dir()
    tmp_path = os.path.join(pdf_dir, f".upload-{uuid.uuid4().hex}.tmp")
    try:
        with open(tmp_path, "wb") as out:
            out.write(head)
            while True:
                chunk = await file.read(1024 * 1024)
                if not chunk:
                    break
                total += len(chunk)
                if total > MAX_PDF_SIZE_BYTES:
                    raise HTTPException(status_code=413, detail=f"PDF ueberschreitet {MAX_PDF_SIZE_BYTES // 1024 // 1024} MB")
                sha.update(chunk)
                out.write(chunk)
        sha_hex = sha.hexdigest()
        final_path = os.path.join(pdf_dir, f"{sha_hex}.pdf")
        rel_path = os.path.join("pdfs", f"{sha_hex}.pdf")

        # Duplicate check within the tenant, plus global sources (tenant_id
        # IS NULL), which are visible to everyone.
        cursor = await db.execute(
            "SELECT id, name, tenant_id FROM sources WHERE pdf_sha256 = ? "
            "AND (tenant_id IS NULL OR tenant_id = ?)",
            (sha_hex, tenant_id),
        )
        existing = await cursor.fetchone()
        if existing:
            scope = "global" if existing["tenant_id"] is None else "Ihrer Organisation"
            raise HTTPException(
                status_code=409,
                detail=f"PDF bereits in {scope} vorhanden als Quelle '{existing['name']}' (id={existing['id']})",
            )

        if os.path.exists(final_path):
            # Same content is already stored (the file name is the hash).
            os.unlink(tmp_path)
        else:
            os.replace(tmp_path, final_path)
    except HTTPException:
        _discard_tmp_upload(tmp_path)
        raise
    except Exception as e:
        _discard_tmp_upload(tmp_path)
        logger.exception("PDF-Upload (tenant) fehlgeschlagen")
        # The full error is in the log; the response stays generic so that
        # internal details such as filesystem paths are not sent to clients.
        raise HTTPException(status_code=500, detail="PDF-Upload fehlgeschlagen") from e

    display_name = (name or "").strip()
    if not display_name:
        display_name = re.sub(r"\.pdf$", "", file.filename or "", flags=re.I).strip()
    # A file literally named ".pdf" (or no filename) must not give an empty name.
    display_name = (display_name or "PDF")[:200]

    cursor = await db.execute(
        """INSERT INTO sources
           (name, url, domain, source_type, category, status, notes, language,
            pdf_path, pdf_sha256, added_by, tenant_id)
           VALUES (?, NULL, NULL, 'pdf_document', ?, 'active', ?, ?, ?, ?, ?, ?)""",
        (display_name, category, notes, language, rel_path, sha_hex,
         current_user["username"], tenant_id),
    )
    src_id = cursor.lastrowid
    await db.commit()

    cursor = await db.execute("SELECT * FROM sources WHERE id = ?", (src_id,))
    row = await cursor.fetchone()
    result = dict(row)
    result["is_global"] = result.get("tenant_id") is None
    result["state_affiliated"] = bool(result.get("state_affiliated"))
    result["alignments"] = []
    return result
|
||||||
|
|||||||
237
src/services/pdf_ingest.py
Normale Datei
237
src/services/pdf_ingest.py
Normale Datei
@@ -0,0 +1,237 @@
|
|||||||
|
"""PDF-Ingest: liest hochgeladene PDFs ein und legt sie als Pool-Artikel ab.
|
||||||
|
|
||||||
|
Quellen vom Typ `pdf_document` werden in der Verwaltung angelegt
|
||||||
|
(`processed_at IS NULL`). Dieser Service pollt sie, extrahiert den Text,
|
||||||
|
uebersetzt nach DE+EN und schreibt EINEN Artikel (incident_id=NULL) in
|
||||||
|
`articles`. Idempotent ueber `processed_at`.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import aiosqlite
|
||||||
|
|
||||||
|
from config import DB_PATH, CLAUDE_MODEL_FAST
|
||||||
|
from agents.claude_client import call_claude
|
||||||
|
|
||||||
|
logger = logging.getLogger("osint.pdf_ingest")
|
||||||
|
|
||||||
|
MAX_CHARS_PER_PDF = 200_000 # harte Obergrenze, schuetzt vor riesigen Dumps
|
||||||
|
TRANSLATE_INPUT_MAX = 12_000 # was wir dem LLM zum Uebersetzen geben (Cost-Control)
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_text_pdfplumber(path: str) -> str:
    """Return the text of all pages that yield text, joined by blank lines."""
    import pdfplumber

    with pdfplumber.open(path) as document:
        page_texts = [page.extract_text() or "" for page in document.pages]
    # Pages without extractable text are dropped rather than kept as gaps.
    return "\n\n".join(chunk for chunk in page_texts if chunk).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_text_ocr(path: str) -> str:
    """OCR fallback: render the PDF with pdf2image and read it with Tesseract.

    Pages are rendered one at a time so that memory use stays bounded by a
    single page image instead of the whole document. OCR stops as soon as
    ``MAX_CHARS_PER_PDF`` characters have been collected, because the caller
    truncates the result to that length anyway.

    Args:
        path: Path of the PDF file.

    Returns:
        The recognised text of all processed pages, joined by blank lines.
    """
    from pdf2image import convert_from_path, pdfinfo_from_path
    import pytesseract

    page_count = int(pdfinfo_from_path(path)["Pages"])
    parts = []
    collected = 0
    for page_no in range(1, page_count + 1):
        images = convert_from_path(path, dpi=200, first_page=page_no, last_page=page_no)
        for img in images:
            # deu+eng together so that multilingual PDFs work
            t = pytesseract.image_to_string(img, lang="deu+eng")
            if t and t.strip():
                parts.append(t.strip())
                collected += len(parts[-1])
        if collected >= MAX_CHARS_PER_PDF:
            break
    return "\n\n".join(parts).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_text(path: str) -> tuple[str, str]:
    """Extract text from a PDF and report how it was obtained.

    The embedded text layer is tried first. If that fails or yields fewer
    than 50 characters, OCR is used instead. The text is capped at
    ``MAX_CHARS_PER_PDF`` characters.

    Returns:
        ``(text, method)`` where ``method`` is ``'pdfplumber'`` or ``'ocr'``.
    """
    embedded = ""
    try:
        embedded = _extract_text_pdfplumber(path)
    except Exception as exc:
        logger.warning("pdfplumber-Extraktion fehlgeschlagen fuer %s: %s", path, exc)

    if len(embedded) < 50:
        logger.info("PDF hat keinen Text-Layer (oder <50 Zeichen), versuche OCR: %s", path)
        recognised = _extract_text_ocr(path)
        return recognised[:MAX_CHARS_PER_PDF], "ocr"
    return embedded[:MAX_CHARS_PER_PDF], "pdfplumber"
|
||||||
|
|
||||||
|
|
||||||
|
def _derive_headline(text: str, fallback: str) -> str:
    """Pick a headline: the first line of 5 to 200 characters after stripping.

    If no line qualifies, the stripped ``fallback`` is used, or
    ``"Untitled PDF"`` when that is empty as well.
    """
    stripped_lines = (candidate.strip() for candidate in text.splitlines())
    first_fit = next((ln for ln in stripped_lines if 5 <= len(ln) <= 200), None)
    if first_fit is not None:
        return first_fit
    return fallback.strip() or "Untitled PDF"
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_translation_reply(reply: object, target_lang: str) -> tuple[str, str]:
    """Extract ``(headline, content)`` from the model's JSON reply.

    Tolerates a surrounding Markdown code fence and text around the JSON
    object. Anything that cannot be parsed, and any field that is not a
    string, yields an empty string instead of raising.
    """
    if not isinstance(reply, str):
        return "", ""
    raw = reply.strip()
    if raw.startswith("```"):
        raw = re.sub(r"^```(?:json)?\s*", "", raw)
        raw = re.sub(r"\s*```\s*$", "", raw).strip()
    try:
        data = json.loads(raw)
    except json.JSONDecodeError:
        # Second attempt: take the outermost {...} span out of the reply.
        m = re.search(r"\{.*\}", raw, re.DOTALL)
        if not m:
            logger.warning("PDF-Translator (%s) JSON nicht parsbar: %r", target_lang, raw[:200])
            return "", ""
        try:
            data = json.loads(m.group(0))
        except json.JSONDecodeError:
            return "", ""
    if not isinstance(data, dict):
        return "", ""

    def _text_field(key: str) -> str:
        # The model may return null, a number or a list; only strings count.
        value = data.get(key)
        return value.strip() if isinstance(value, str) else ""

    return _text_field("headline"), _text_field("content")


async def _translate(text: str, headline: str, target_lang: str) -> tuple[str, str]:
    """Translate headline and content into ``target_lang`` ('de' or 'en').

    A small dedicated function (instead of agents.translator) because there
    is only ONE item per PDF and headline and content are needed separately.
    At most ``TRANSLATE_INPUT_MAX`` characters of ``text`` are sent.

    Returns:
        ``(headline_translated, content_translated)``. On any failure, or
        when both inputs are empty, ``('', '')``.
    """
    if not text and not headline:
        return "", ""
    lang_label = {"de": "Deutsch", "en": "Englisch"}.get(target_lang, target_lang)
    content_in = (text or "")[:TRANSLATE_INPUT_MAX]
    prompt = f"""Du bist ein praeziser Uebersetzer fuer Sachtexte.
Uebersetze Headline und Inhalt nach {lang_label}.

WICHTIG:
- Verwende IMMER echte UTF-8-Umlaute (ae->ä, oe->ö, ue->ü, ss->ß) bei Deutsch.
- Behalte Eigennamen im Original.
- Wenn der Text schon auf {lang_label} ist, gib ihn (nahezu) unveraendert zurueck.
- Behalte die wichtigsten Inhalte; kuerze stark auf MAX 3000 Zeichen Content.

Antworte AUSSCHLIESSLICH mit einem JSON-Objekt im Format:
{{"headline": "...", "content": "..."}}

Keine Markdown-Codefence, keine Einleitung.

HEADLINE: {headline}
INHALT:
{content_in}
"""
    try:
        result_text, _usage = await call_claude(prompt, tools=None, model=CLAUDE_MODEL_FAST)
    except Exception as e:
        logger.warning("PDF-Translator (%s) Claude-Call fehlgeschlagen: %s", target_lang, e)
        return "", ""

    return _parse_translation_reply(result_text, target_lang)
|
||||||
|
|
||||||
|
|
||||||
|
async def _mark_processed_with_note(db: aiosqlite.Connection, sid: int, note: str) -> None:
    """Set ``processed_at`` and append ``' [note]'`` to the source's notes."""
    await db.execute(
        "UPDATE sources SET processed_at = CURRENT_TIMESTAMP, "
        "notes = COALESCE(notes,'') || ? WHERE id = ?",
        (f" [{note}]", sid),
    )
    await db.commit()


async def _process_one(db: aiosqlite.Connection, src: dict) -> None:
    """Ingest one ``pdf_document`` source and store it as a single article.

    Steps: resolve the file path, extract the text (in a worker thread),
    derive a headline, request a German and an English translation in
    parallel, insert one article with ``incident_id = NULL`` and mark the
    source as processed.

    If the file is missing, extraction fails or no text is found, the source
    is still marked as processed and a bracketed note is appended to its
    ``notes``, so that it is not picked up again on every run. A source
    without ``pdf_path`` is only logged and skipped.

    Args:
        db: Open aiosqlite connection used for all writes.
        src: Source row as a dict with the keys ``id``, ``name``,
            ``pdf_path``, ``pdf_sha256``, ``language`` and ``tenant_id``.
    """
    sid = src["id"]
    name = src["name"] or "PDF"
    rel_path = src["pdf_path"]
    if not rel_path:
        logger.warning("PDF-Source #%d ohne pdf_path, ueberspringe", sid)
        return

    if os.path.isabs(rel_path):
        abs_path = rel_path
    else:
        # Same base directory as the upload endpoint: next to the database,
        # resolved to an absolute path.
        abs_path = os.path.join(os.path.dirname(os.path.abspath(DB_PATH)), rel_path)
    if not os.path.exists(abs_path):
        logger.error("PDF-Datei fehlt fuer Source #%d: %s", sid, abs_path)
        # Mark as processed but leave a note, to avoid an endless retry.
        await _mark_processed_with_note(db, sid, "PDF-Datei nicht gefunden")
        return

    logger.info("PDF-Ingest start: source #%d (%s)", sid, abs_path)

    try:
        text, method = await asyncio.to_thread(_extract_text, abs_path)
    except Exception as e:
        logger.exception("PDF-Extraktion fehlgeschlagen fuer #%d: %s", sid, e)
        await _mark_processed_with_note(db, sid, "PDF-Extraktion fehlgeschlagen")
        return

    if not text:
        logger.warning("PDF #%d ergab keinen Text (auch OCR leer)", sid)
        await _mark_processed_with_note(db, sid, "PDF leer/nicht lesbar")
        return

    fallback_name = re.sub(r"\.pdf$", "", os.path.basename(abs_path), flags=re.I)
    headline = _derive_headline(text, fallback_name)
    # No language given on the source means "auto"; it is stored as NULL below.
    src_lang = (src.get("language") or "").lower() or "auto"

    # Both translations are requested in parallel.
    (de_h, de_c), (en_h, en_c) = await asyncio.gather(
        _translate(text, headline, "de"),
        _translate(text, headline, "en"),
    )

    # Cap the original text so the articles table stays manageable.
    content_original = text[:5000]

    await db.execute(
        """INSERT INTO articles (incident_id, headline, headline_de, headline_en,
           source, source_url, content_original, content_de, content_en, language,
           published_at, tenant_id, verification_status)
           VALUES (NULL, ?, ?, ?, ?, ?, ?, ?, ?, ?, NULL, ?, 'unverified')""",
        (
            headline,
            de_h or None,
            en_h or None,
            name,
            f"pdf://{src.get('pdf_sha256') or sid}",
            content_original,
            de_c or None,
            en_c or None,
            src_lang if src_lang != "auto" else None,
            src.get("tenant_id"),
        ),
    )
    await db.execute(
        "UPDATE sources SET processed_at = CURRENT_TIMESTAMP, article_count = article_count + 1, "
        "last_seen_at = CURRENT_TIMESTAMP WHERE id = ?",
        (sid,),
    )
    # Article insert and source update are committed together.
    await db.commit()
    logger.info("PDF-Ingest fertig: source #%d (%s, %d Zeichen)", sid, method, len(text))
|
||||||
|
|
||||||
|
|
||||||
|
async def run_once() -> int:
    """Process pending ``pdf_document`` sources and return how many were picked.

    Selects sources of type ``pdf_document`` whose ``processed_at`` is NULL,
    oldest first, at most 5 per call, so that a large batch of uploads does
    not monopolise a single run. A failure on one source is logged and does
    not stop the remaining ones.

    Returns:
        The number of sources selected in this run (including failed ones).
    """
    async with aiosqlite.connect(DB_PATH) as db:
        db.row_factory = aiosqlite.Row
        cursor = await db.execute(
            "SELECT id, name, pdf_path, pdf_sha256, language, tenant_id "
            "FROM sources WHERE source_type = 'pdf_document' AND processed_at IS NULL "
            "ORDER BY created_at ASC LIMIT 5"
        )
        rows = [dict(r) for r in await cursor.fetchall()]
        for src in rows:
            try:
                await _process_one(db, src)
            except Exception:
                logger.exception("PDF-Ingest unerwarteter Fehler bei source #%d", src["id"])
                # Discard whatever the failed source wrote but did not commit;
                # otherwise the next source's commit would persist it (e.g. an
                # article whose source is still unprocessed and gets ingested
                # again on the next run).
                await db.rollback()
        return len(rows)
|
||||||
@@ -555,6 +555,7 @@
|
|||||||
<input type="text" id="sources-search" class="timeline-filter-input sources-search-input" placeholder="Suche..." oninput="App.filterSources()" data-i18n-attr="placeholder:sources_modal.search_placeholder">
|
<input type="text" id="sources-search" class="timeline-filter-input sources-search-input" placeholder="Suche..." oninput="App.filterSources()" data-i18n-attr="placeholder:sources_modal.search_placeholder">
|
||||||
</div>
|
</div>
|
||||||
<div class="sources-toolbar-actions">
|
<div class="sources-toolbar-actions">
|
||||||
|
<button class="btn btn-secondary btn-small" onclick="App.openPdfUpload()" style="margin-right:8px;">+ PDF hochladen</button>
|
||||||
<button class="btn btn-primary btn-small" onclick="App.toggleSourceForm()" data-i18n="sources_modal.add_source">+ Quelle</button>
|
<button class="btn btn-primary btn-small" onclick="App.toggleSourceForm()" data-i18n="sources_modal.add_source">+ Quelle</button>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@@ -633,6 +634,57 @@
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<!-- Modal: PDF als Quelle hochladen -->
|
||||||
|
<div class="modal-overlay" id="modal-pdf-upload" role="dialog" aria-modal="true" aria-labelledby="modal-pdf-upload-title">
|
||||||
|
<div class="modal">
|
||||||
|
<div class="modal-header">
|
||||||
|
<div class="modal-title" id="modal-pdf-upload-title">PDF als Quelle hochladen</div>
|
||||||
|
<button class="modal-close" onclick="closeModal('modal-pdf-upload')" aria-label="Schliessen">×</button>
|
||||||
|
</div>
|
||||||
|
<form id="pdf-upload-form" enctype="multipart/form-data">
|
||||||
|
<div class="modal-body">
|
||||||
|
<p class="text-secondary" style="margin-top:0;">
|
||||||
|
Die PDF wird gespeichert und im Hintergrund verarbeitet: Text wird extrahiert (OCR-Fallback fuer gescannte Dokumente) und nach Deutsch und Englisch uebersetzt. Sie erscheint danach in Ihrer Quellenliste.
|
||||||
|
</p>
|
||||||
|
<div class="form-group">
|
||||||
|
<label for="pdf-upload-file">PDF-Datei (max. 50 MB)</label>
|
||||||
|
<input type="file" id="pdf-upload-file" accept="application/pdf,.pdf" required>
|
||||||
|
</div>
|
||||||
|
<div class="form-group">
|
||||||
|
<label for="pdf-upload-name">Anzeige-Name (optional)</label>
|
||||||
|
<input type="text" id="pdf-upload-name" maxlength="200" placeholder="leer = Dateiname">
|
||||||
|
</div>
|
||||||
|
<div style="display:grid;grid-template-columns:1fr 1fr;gap:12px;">
|
||||||
|
<div class="form-group">
|
||||||
|
<label for="pdf-upload-category">Kategorie</label>
|
||||||
|
<select id="pdf-upload-category">
|
||||||
|
<option value="sonstige" selected>Sonstige</option>
|
||||||
|
<option value="behoerde">Behoerde</option>
|
||||||
|
<option value="think-tank">Think-Tank</option>
|
||||||
|
<option value="fachmedien">Fachmedien</option>
|
||||||
|
<option value="international">International</option>
|
||||||
|
</select>
|
||||||
|
</div>
|
||||||
|
<div class="form-group">
|
||||||
|
<label for="pdf-upload-language">Sprache (optional)</label>
|
||||||
|
<input type="text" id="pdf-upload-language" placeholder="z.B. Deutsch, Englisch">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="form-group">
|
||||||
|
<label for="pdf-upload-notes">Notizen</label>
|
||||||
|
<input type="text" id="pdf-upload-notes" placeholder="Optional">
|
||||||
|
</div>
|
||||||
|
<div id="pdf-upload-error" class="error-msg" style="display:none"></div>
|
||||||
|
<div id="pdf-upload-progress" class="text-secondary" style="display:none;margin-top:8px;">Laedt hoch …</div>
|
||||||
|
</div>
|
||||||
|
<div class="modal-footer">
|
||||||
|
<button type="button" class="btn btn-secondary" onclick="closeModal('modal-pdf-upload')">Abbrechen</button>
|
||||||
|
<button type="submit" class="btn btn-primary" id="pdf-upload-submit">Hochladen</button>
|
||||||
|
</div>
|
||||||
|
</form>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
<!-- Modal: Content-Viewer (wiederverwendbar für Lagebild, Faktencheck, Quellenübersicht, Timeline) -->
|
<!-- Modal: Content-Viewer (wiederverwendbar für Lagebild, Faktencheck, Quellenübersicht, Timeline) -->
|
||||||
<div class="modal-overlay" id="modal-content-viewer" role="dialog" aria-modal="true" aria-labelledby="content-viewer-title">
|
<div class="modal-overlay" id="modal-content-viewer" role="dialog" aria-modal="true" aria-labelledby="content-viewer-title">
|
||||||
<div class="modal modal-content-viewer">
|
<div class="modal modal-content-viewer">
|
||||||
|
|||||||
@@ -22,6 +22,31 @@ const API = {
|
|||||||
};
|
};
|
||||||
},
|
},
|
||||||
|
|
||||||
|
async upload(path, formData) {
|
||||||
|
const token = localStorage.getItem("osint_token");
|
||||||
|
const headers = {};
|
||||||
|
if (token) headers["Authorization"] = `Bearer ${token}`;
|
||||||
|
const response = await fetch(`${this.baseUrl}${path}`, {
|
||||||
|
method: "POST",
|
||||||
|
headers,
|
||||||
|
body: formData,
|
||||||
|
});
|
||||||
|
if (response.status === 401) {
|
||||||
|
localStorage.removeItem("osint_token");
|
||||||
|
localStorage.removeItem("osint_username");
|
||||||
|
window.location.href = "/";
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (!response.ok) {
|
||||||
|
const data = await response.json().catch(() => ({}));
|
||||||
|
let d = data.detail;
|
||||||
|
if (Array.isArray(d)) d = d.map(e => e.msg || JSON.stringify(e)).join("; ");
|
||||||
|
else if (typeof d === "object" && d !== null) d = JSON.stringify(d);
|
||||||
|
throw new Error(d || `Fehler ${response.status}`);
|
||||||
|
}
|
||||||
|
return response.json();
|
||||||
|
},
|
||||||
|
|
||||||
async _request(method, path, body = null, externalSignal = null) {
|
async _request(method, path, body = null, externalSignal = null) {
|
||||||
const controller = new AbortController();
|
const controller = new AbortController();
|
||||||
const timeout = setTimeout(() => controller.abort(), 30000);
|
const timeout = setTimeout(() => controller.abort(), 30000);
|
||||||
|
|||||||
@@ -3106,6 +3106,70 @@ async handleRefresh() {
|
|||||||
|
|
||||||
_discoveredData: null,
|
_discoveredData: null,
|
||||||
|
|
||||||
|
openPdfUpload() {
|
||||||
|
const form = document.getElementById("pdf-upload-form");
|
||||||
|
if (form) form.reset();
|
||||||
|
const err = document.getElementById("pdf-upload-error");
|
||||||
|
if (err) { err.style.display = "none"; err.textContent = ""; }
|
||||||
|
const prog = document.getElementById("pdf-upload-progress");
|
||||||
|
if (prog) prog.style.display = "none";
|
||||||
|
openModal("modal-pdf-upload");
|
||||||
|
this._bindPdfUploadFormOnce();
|
||||||
|
},
|
||||||
|
|
||||||
|
_bindPdfUploadFormOnce() {
|
||||||
|
const form = document.getElementById("pdf-upload-form");
|
||||||
|
if (!form || form.dataset.bound === "1") return;
|
||||||
|
form.dataset.bound = "1";
|
||||||
|
form.addEventListener("submit", async (e) => {
|
||||||
|
e.preventDefault();
|
||||||
|
const errEl = document.getElementById("pdf-upload-error");
|
||||||
|
const progEl = document.getElementById("pdf-upload-progress");
|
||||||
|
const submitBtn = document.getElementById("pdf-upload-submit");
|
||||||
|
errEl.style.display = "none";
|
||||||
|
|
||||||
|
const fileInput = document.getElementById("pdf-upload-file");
|
||||||
|
const f = fileInput && fileInput.files && fileInput.files[0];
|
||||||
|
if (!f) {
|
||||||
|
errEl.textContent = "Bitte eine PDF-Datei auswaehlen.";
|
||||||
|
errEl.style.display = "block";
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (f.size > 50 * 1024 * 1024) {
|
||||||
|
errEl.textContent = "Datei ueberschreitet 50 MB.";
|
||||||
|
errEl.style.display = "block";
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const fd = new FormData();
|
||||||
|
fd.append("file", f);
|
||||||
|
const nm = (document.getElementById("pdf-upload-name").value || "").trim();
|
||||||
|
if (nm) fd.append("name", nm);
|
||||||
|
fd.append("category", document.getElementById("pdf-upload-category").value || "sonstige");
|
||||||
|
const lng = (document.getElementById("pdf-upload-language").value || "").trim();
|
||||||
|
if (lng) fd.append("language", lng);
|
||||||
|
const nt = (document.getElementById("pdf-upload-notes").value || "").trim();
|
||||||
|
if (nt) fd.append("notes", nt);
|
||||||
|
|
||||||
|
submitBtn.disabled = true;
|
||||||
|
progEl.style.display = "block";
|
||||||
|
try {
|
||||||
|
await API.upload("/sources/upload-pdf", fd);
|
||||||
|
closeModal("modal-pdf-upload");
|
||||||
|
if (typeof UI !== "undefined" && UI.showToast) {
|
||||||
|
UI.showToast("PDF hochgeladen -- Verarbeitung laeuft im Hintergrund", "success");
|
||||||
|
}
|
||||||
|
await App.loadSources();
|
||||||
|
} catch (err) {
|
||||||
|
errEl.textContent = err && err.message ? err.message : "Upload fehlgeschlagen";
|
||||||
|
errEl.style.display = "block";
|
||||||
|
} finally {
|
||||||
|
submitBtn.disabled = false;
|
||||||
|
progEl.style.display = "none";
|
||||||
|
}
|
||||||
|
});
|
||||||
|
},
|
||||||
|
|
||||||
toggleSourceForm(show) {
|
toggleSourceForm(show) {
|
||||||
const form = document.getElementById('sources-add-form');
|
const form = document.getElementById('sources-add-form');
|
||||||
if (!form) return;
|
if (!form) return;
|
||||||
|
|||||||
In neuem Issue referenzieren
Einen Benutzer sperren