From 949df868ffe4f9cbb686ea6d1781888a384522dd Mon Sep 17 00:00:00 2001
From: claude-dev
Date: Mon, 20 Apr 2026 19:15:14 +0000
Subject: [PATCH] Export: XMP-Metadatenblock und CreationDate/ModDate via pikepdf nachziehen

WeasyPrint 68.1 schreibt weder XMP noch Create-/ModDate ins PDF. Das Post-
Processing via pikepdf ergaenzt beide:

- Info-Dict: /CreationDate + /ModDate im PDF-Standardformat
  (D:YYYYMMDDHHmmSS+HHmm) aus Incident.created_at / updated_at
- XMP-Block mit Dublin Core (dc:title, dc:creator, dc:description,
  dc:subject, dc:language), PDF (pdf:Keywords) und XMP (CreatorTool,
  CreateDate, ModifyDate, MetadataDate) Namespaces

Damit werden die Exporte sowohl von klassischen Tools (Explorer, Finder) als
auch von DMS-Systemen (SharePoint, Bridge, Acrobat) vollstaendig indexiert.

Fallback: Bei Fehler im Post-Processing wird das Original-PDF zurueckgegeben,
Export schlaegt nie fehl.
---
 requirements.txt        |  1 +
 src/report_generator.py | 78 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 79 insertions(+)

diff --git a/requirements.txt b/requirements.txt
index f60fe59..c84d01e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,3 +11,4 @@ python-multipart
 aiosmtplib
 geonamescache>=2.0
 telethon
+pikepdf>=9.0
diff --git a/src/report_generator.py b/src/report_generator.py
index a446c43..de93894 100644
--- a/src/report_generator.py
+++ b/src/report_generator.py
@@ -8,6 +8,7 @@ from collections import defaultdict
 from datetime import datetime
 from pathlib import Path
 
+import pikepdf
 from jinja2 import Environment, FileSystemLoader
 from weasyprint import HTML
 from docx import Document
@@ -536,6 +537,82 @@ def _build_export_metadata(
     }
 
 
+def _format_pdf_date(dt: datetime) -> str:
+    """Format *dt* as a PDF date string.
+
+    The result is ``D:YYYYMMDDHHmmSS`` followed by the UTC offset written as
+    ``+HH'mm'`` / ``-HH'mm'``. A naive *dt* is interpreted in ``TIMEZONE``;
+    ``Z`` is appended only when the tzinfo reports no UTC offset.
+    """
+    if dt.tzinfo is None:
+        # Naive dt: interpret it as local TIMEZONE
+        dt = dt.replace(tzinfo=TIMEZONE)
+    base = dt.strftime("D:%Y%m%d%H%M%S")
+    offset = dt.utcoffset()
+    if offset is None:
+        return base + "Z"
+    total_minutes = int(offset.total_seconds() // 60)
+    sign = "+" if total_minutes >= 0 else "-"
+    total_minutes = abs(total_minutes)
+    return f"{base}{sign}{total_minutes // 60:02d}'{total_minutes % 60:02d}'"
+
+
+def _enrich_pdf_metadata(pdf_bytes: bytes, meta: dict) -> bytes:
+    """Add XMP metadata and CreationDate/ModDate to a PDF (pikepdf post-process).
+
+    Args:
+        pdf_bytes: The PDF document to enrich.
+        meta: Export metadata. Keys read here: ``created``, ``modified``,
+            ``title``, ``author``, ``subject``, ``keywords``,
+            ``keywords_comma``, ``language`` and ``creator_app``.
+
+    Returns:
+        The enriched PDF. If anything fails, a warning is logged and
+        ``pdf_bytes`` is returned unchanged.
+    """
+    try:
+        buf_in = io.BytesIO(pdf_bytes)
+        with pikepdf.Pdf.open(buf_in) as pdf:
+            created: datetime = meta.get("created")
+            modified: datetime = meta.get("modified")
+            if created and created.tzinfo is None:
+                created = created.replace(tzinfo=TIMEZONE)
+            if modified and modified.tzinfo is None:
+                modified = modified.replace(tzinfo=TIMEZONE)
+
+            # Classic Info dict: add CreationDate + ModDate
+            if created:
+                pdf.docinfo["/CreationDate"] = pikepdf.String(_format_pdf_date(created))
+            if modified:
+                pdf.docinfo["/ModDate"] = pikepdf.String(_format_pdf_date(modified))
+
+            # Write the XMP metadata block (Dublin Core + XMP + PDF namespaces)
+            with pdf.open_metadata(set_pikepdf_as_editor=False) as xmp:
+                xmp["dc:title"] = meta.get("title", "")
+                xmp["dc:creator"] = [meta.get("author", "")]
+                xmp["dc:description"] = meta.get("subject", "")
+                if meta.get("keywords"):
+                    xmp["dc:subject"] = list(meta["keywords"])
+                xmp["dc:language"] = [meta.get("language", "de-DE")]
+                xmp["pdf:Keywords"] = meta.get("keywords_comma", "")
+                xmp["xmp:CreatorTool"] = meta.get("creator_app", "AegisSight Monitor")
+                # XMP dates are ISO 8601 with a colon in the UTC offset
+                # (+02:00); strftime("%z") would produce +0200 instead.
+                if created:
+                    xmp["xmp:CreateDate"] = created.isoformat(timespec="seconds")
+                if modified:
+                    modified_iso = modified.isoformat(timespec="seconds")
+                    xmp["xmp:ModifyDate"] = modified_iso
+                    xmp["xmp:MetadataDate"] = modified_iso
+
+            buf_out = io.BytesIO()
+            pdf.save(buf_out)
+            return buf_out.getvalue()
+    except Exception as e:
+        logger.warning(f"PDF-Metadaten-Anreicherung (XMP/Dates) fehlgeschlagen: {e}")
+        return pdf_bytes
+
+
 async def generate_pdf(
     incident: dict, articles: list, fact_checks: list, snapshots: list,
     scope: str, creator: str, executive_summary_html: str,
@@ -614,6 +691,7 @@ async def generate_pdf(
         art["pub_date"] = pub[:10] if pub else ""
 
     pdf_bytes = HTML(string=html_content).write_pdf()
+    pdf_bytes = _enrich_pdf_metadata(pdf_bytes, meta)
     return pdf_bytes
 
 