diff --git a/requirements.txt b/requirements.txt index f60fe59..c84d01e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,3 +11,4 @@ python-multipart aiosmtplib geonamescache>=2.0 telethon +pikepdf>=9.0 diff --git a/src/report_generator.py b/src/report_generator.py index a446c43..de93894 100644 --- a/src/report_generator.py +++ b/src/report_generator.py @@ -8,6 +8,7 @@ from collections import defaultdict from datetime import datetime from pathlib import Path +import pikepdf from jinja2 import Environment, FileSystemLoader from weasyprint import HTML from docx import Document @@ -536,6 +537,63 @@ def _build_export_metadata( } +def _format_pdf_date(dt: datetime) -> str: + """PDF-Datumsformat: D:YYYYMMDDHHmmSS+HH'mm' (mit Zeitzone) oder Z (UTC).""" + if dt.tzinfo is None: + # Naive dt — als lokale TIMEZONE interpretieren + dt = dt.replace(tzinfo=TIMEZONE) + base = dt.strftime("D:%Y%m%d%H%M%S") + offset = dt.utcoffset() + if offset is None: + return base + "Z" + total_minutes = int(offset.total_seconds() // 60) + sign = "+" if total_minutes >= 0 else "-" + total_minutes = abs(total_minutes) + return f"{base}{sign}{total_minutes // 60:02d}'{total_minutes % 60:02d}'" + + +def _enrich_pdf_metadata(pdf_bytes: bytes, meta: dict) -> bytes: + """PDF-Ausgabe um XMP-Metadaten und CreationDate/ModDate erweitern (post-process via pikepdf).""" + try: + buf_in = io.BytesIO(pdf_bytes) + with pikepdf.Pdf.open(buf_in) as pdf: + created: datetime = meta.get("created") + modified: datetime = meta.get("modified") + if created and created.tzinfo is None: + created = created.replace(tzinfo=TIMEZONE) + if modified and modified.tzinfo is None: + modified = modified.replace(tzinfo=TIMEZONE) + + # Klassisches Info-Dict: CreationDate + ModDate nachziehen + if created: + pdf.docinfo["/CreationDate"] = pikepdf.String(_format_pdf_date(created)) + if modified: + pdf.docinfo["/ModDate"] = pikepdf.String(_format_pdf_date(modified)) + + # XMP-Metadatenblock schreiben (Dublin Core + XMP + PDF Namespaces) + with pdf.open_metadata(set_pikepdf_as_editor=False) as xmp: + xmp["dc:title"] = meta.get("title", "") + xmp["dc:creator"] = [meta.get("author", "")] + xmp["dc:description"] = meta.get("subject", "") + if meta.get("keywords"): + xmp["dc:subject"] = list(meta["keywords"]) + xmp["dc:language"] = [meta.get("language", "de-DE")] + xmp["pdf:Keywords"] = meta.get("keywords_comma", "") + xmp["xmp:CreatorTool"] = meta.get("creator_app", "AegisSight Monitor") + if created: + xmp["xmp:CreateDate"] = created.strftime("%Y-%m-%dT%H:%M:%S%z") + if modified: + xmp["xmp:ModifyDate"] = modified.strftime("%Y-%m-%dT%H:%M:%S%z") + xmp["xmp:MetadataDate"] = modified.strftime("%Y-%m-%dT%H:%M:%S%z") + + buf_out = io.BytesIO() + pdf.save(buf_out) + return buf_out.getvalue() + except Exception as e: + logger.warning(f"PDF-Metadaten-Anreicherung (XMP/Dates) fehlgeschlagen: {e}") + return pdf_bytes + + async def generate_pdf( incident: dict, articles: list, fact_checks: list, snapshots: list, scope: str, creator: str, executive_summary_html: str, @@ -614,6 +672,7 @@ async def generate_pdf( art["pub_date"] = pub[:10] if pub else "" pdf_bytes = HTML(string=html_content).write_pdf() + pdf_bytes = _enrich_pdf_metadata(pdf_bytes, meta) return pdf_bytes