Export: XMP-Metadatenblock und CreationDate/ModDate via pikepdf nachziehen

WeasyPrint 68.1 schreibt weder XMP noch Create-/ModDate ins PDF. Das Post-
Processing via pikepdf ergaenzt beide:

- Info-Dict: /CreationDate + /ModDate im PDF-Standardformat
  (D:YYYYMMDDHHmmSS+HH'mm') aus Incident.created_at / updated_at
- XMP-Block mit Dublin Core (dc:title, dc:creator, dc:description,
  dc:subject, dc:language), PDF (pdf:Keywords) und XMP (CreatorTool,
  CreateDate, ModifyDate, MetadataDate) Namespaces

Damit werden die Exporte sowohl von klassischen Tools (Explorer, Finder)
als auch von DMS-Systemen (SharePoint, Bridge, Acrobat) vollstaendig
indexiert. Fallback: Bei Fehler im Post-Processing wird das Original-PDF
zurueckgegeben, Export schlaegt nie fehl.
Dieser Commit ist enthalten in:
claude-dev
2026-04-20 19:15:14 +00:00
Ursprung 9293e66d01
Commit 949df868ff
2 geänderte Dateien mit 60 neuen und 0 gelöschten Zeilen

Datei anzeigen

@@ -11,3 +11,4 @@ python-multipart
aiosmtplib aiosmtplib
geonamescache>=2.0 geonamescache>=2.0
telethon telethon
pikepdf>=9.0

Datei anzeigen

@@ -8,6 +8,7 @@ from collections import defaultdict
from datetime import datetime from datetime import datetime
from pathlib import Path from pathlib import Path
import pikepdf
from jinja2 import Environment, FileSystemLoader from jinja2 import Environment, FileSystemLoader
from weasyprint import HTML from weasyprint import HTML
from docx import Document from docx import Document
@@ -536,6 +537,63 @@ def _build_export_metadata(
} }
def _format_pdf_date(dt: datetime) -> str:
    """Format *dt* as a PDF date string of the form ``D:YYYYMMDDHHmmSS+HH'mm'``.

    A naive datetime is interpreted as wall-clock time in the module-level
    ``TIMEZONE``. The UTC offset is appended as sign, two-digit hours and
    two-digit minutes, each followed by an apostrophe; a zero offset is
    written as ``+00'00'``. Only when the tzinfo reports no offset at all
    (``utcoffset()`` returns ``None``) is a bare ``Z`` suffix used instead.

    Offsets that are not a whole number of minutes are truncated toward
    zero for both signs, so ``-04:56:02`` becomes ``-04'56'``.
    """
    if dt.tzinfo is None:
        # Naive value: attach the configured local timezone without shifting
        # the wall-clock time.
        dt = dt.replace(tzinfo=TIMEZONE)

    base = dt.strftime("D:%Y%m%d%H%M%S")
    offset = dt.utcoffset()
    if offset is None:
        return base + "Z"

    # int() truncates toward zero. Floor division would round negative
    # offsets with a seconds component away from zero (one minute too far).
    total_minutes = int(offset.total_seconds() / 60)
    sign = "+" if total_minutes >= 0 else "-"
    hours, minutes = divmod(abs(total_minutes), 60)
    return f"{base}{sign}{hours:02d}'{minutes:02d}'"
def _enrich_pdf_metadata(pdf_bytes: bytes, meta: dict) -> bytes:
    """Add an XMP packet and CreationDate/ModDate to a rendered PDF via pikepdf.

    Args:
        pdf_bytes: The PDF document to post-process.
        meta: Export metadata. Keys read here: ``created`` and ``modified``
            (datetimes; naive values are interpreted in ``TIMEZONE``),
            ``title``, ``author``, ``subject``, ``keywords`` (iterable),
            ``keywords_comma``, ``language`` and ``creator_app``.

    Returns:
        The enriched PDF as bytes. If anything in the post-processing fails,
        a warning is logged and the unmodified ``pdf_bytes`` is returned, so
        the export itself never fails because of metadata.
    """
    try:
        created: datetime | None = meta.get("created")
        modified: datetime | None = meta.get("modified")
        if created and created.tzinfo is None:
            created = created.replace(tzinfo=TIMEZONE)
        if modified and modified.tzinfo is None:
            modified = modified.replace(tzinfo=TIMEZONE)

        with pikepdf.Pdf.open(io.BytesIO(pdf_bytes)) as pdf:
            # Classic Info dictionary: CreationDate + ModDate in PDF date syntax.
            if created:
                pdf.docinfo["/CreationDate"] = pikepdf.String(_format_pdf_date(created))
            if modified:
                pdf.docinfo["/ModDate"] = pikepdf.String(_format_pdf_date(modified))

            # XMP packet (Dublin Core + XMP + PDF namespaces).
            # `get(key) or default` also covers keys that are present but
            # None, which would otherwise abort the whole enrichment.
            with pdf.open_metadata(set_pikepdf_as_editor=False) as xmp:
                xmp["dc:title"] = meta.get("title") or ""
                xmp["dc:creator"] = [meta.get("author") or ""]
                xmp["dc:description"] = meta.get("subject") or ""
                if meta.get("keywords"):
                    xmp["dc:subject"] = list(meta["keywords"])
                xmp["dc:language"] = [meta.get("language") or "de-DE"]
                xmp["pdf:Keywords"] = meta.get("keywords_comma") or ""
                xmp["xmp:CreatorTool"] = meta.get("creator_app") or "AegisSight Monitor"
                # XMP dates are ISO 8601 with a colon in the UTC offset
                # (+02:00). strftime("%z") would emit +0200.
                if created:
                    xmp["xmp:CreateDate"] = created.isoformat(timespec="seconds")
                if modified:
                    modified_iso = modified.isoformat(timespec="seconds")
                    xmp["xmp:ModifyDate"] = modified_iso
                    xmp["xmp:MetadataDate"] = modified_iso

            buf_out = io.BytesIO()
            pdf.save(buf_out)
            return buf_out.getvalue()
    except Exception as e:
        # Deliberate best-effort: metadata must never break the export.
        logger.warning(f"PDF-Metadaten-Anreicherung (XMP/Dates) fehlgeschlagen: {e}")
        return pdf_bytes
async def generate_pdf( async def generate_pdf(
incident: dict, articles: list, fact_checks: list, snapshots: list, incident: dict, articles: list, fact_checks: list, snapshots: list,
scope: str, creator: str, executive_summary_html: str, scope: str, creator: str, executive_summary_html: str,
@@ -614,6 +672,7 @@ async def generate_pdf(
art["pub_date"] = pub[:10] if pub else "" art["pub_date"] = pub[:10] if pub else ""
pdf_bytes = HTML(string=html_content).write_pdf() pdf_bytes = HTML(string=html_content).write_pdf()
pdf_bytes = _enrich_pdf_metadata(pdf_bytes, meta)
return pdf_bytes return pdf_bytes