Export: PDF/DOCX-Dateimetadaten (Title, Author, Subject, Keywords, Category, Comments)

- Neue Helper-Funktion _build_export_metadata baut einheitliches Metadaten-Dict
- PDF via HTML-Meta-Tags (title, author, description, keywords, generator, lang)
- DOCX via doc.core_properties (title, author, subject, keywords, comments, category, last_modified_by, language, content_status, created, modified)
- Keywords aus OSINT + Typ + Organisation + category_labels + Top-5-Orten
- Comments-Feld mit strukturiertem Block (Incident-ID, Typ, Scope, Umfang, Orte)
- Router laedt Organisation + Top-Orte aus article_locations und reicht sie durch
2026-04-20 18:58:34 +00:00
Commit c0f68e40a5
--- a/src/report_generator.py
+++ b/src/report_generator.py
@@ -391,10 +391,132 @@ LAGEBILD:
        return "<ul><li>Zusammenfassung konnte nicht generiert werden.</li></ul>"


+def _parse_db_timestamp(value) -> datetime | None:
+    """Parse a database timestamp into a naive ``datetime``.
+
+    Accepts ``datetime`` instances, ISO-8601 strings (``T`` or space
+    separator, optional fractional seconds, optional ``Z`` or ``+HH:MM`` /
+    ``-HH:MM`` offset) and plain ``YYYY-MM-DD`` dates.
+
+    Any timezone information is dropped without conversion, i.e. the
+    wall-clock value is kept, and fractional seconds of string inputs are
+    discarded. Returns ``None`` for falsy or unparseable input.
+    """
+    if not value:
+        return None
+    if isinstance(value, datetime):
+        # Callers expect a naive datetime; strip tzinfo from aware inputs too.
+        return value.replace(tzinfo=None)
+
+    raw = str(value).strip()
+
+    # Preferred path: the ISO parser understands negative offsets and
+    # timestamps without seconds, which the manual splitting below does not.
+    try:
+        # A trailing "Z" is only accepted by fromisoformat() from 3.11 on.
+        iso = (raw[:-1] + "+00:00") if raw.endswith(("Z", "z")) else raw
+        return datetime.fromisoformat(iso).replace(microsecond=0, tzinfo=None)
+    except (ValueError, TypeError):
+        pass
+
+    # Lenient fallback: cut off fractional seconds and a positive offset,
+    # then try a full timestamp and finally only the leading date part.
+    trimmed = raw.replace("T", " ").replace("Z", "")
+    trimmed = trimmed.split(".")[0].split("+")[0].strip()
+    for candidate, fmt in ((trimmed, "%Y-%m-%d %H:%M:%S"), (raw[:10], "%Y-%m-%d")):
+        try:
+            return datetime.strptime(candidate, fmt)
+        except (ValueError, TypeError):
+            continue
+    return None
+
+
+def _slug_scope_label(scope: str, sections: set[str] | None) -> str:
+    """Return the human-readable scope label used in metadata and file names.
+
+    A non-empty ``sections`` set takes precedence over ``scope``. Without
+    sections the label is looked up by ``scope``; unknown scopes map to
+    "Lagebericht".
+    """
+    if not sections:
+        labels_by_scope = {
+            "summary": "Zusammenfassung",
+            "report": "Lagebericht",
+            "full": "Vollstaendiger Bericht",
+        }
+        return labels_by_scope.get(scope, "Lagebericht")
+
+    # Explicit section selection: summary-only, timeline included, or anything else.
+    if sections == {"zusammenfassung"}:
+        return "Zusammenfassung"
+    return "Vollstaendiger Bericht" if "timeline" in sections else "Lagebericht"
+
+
+def _build_export_metadata(
+    incident: dict,
+    articles: list,
+    fact_checks: list,
+    sources: list,
+    creator: str,
+    scope: str,
+    sections: set[str] | None,
+    organization_name: str | None,
+    top_locations: list[str] | None,
+) -> dict:
+    """Build the metadata dict shared by the PDF and DOCX exports.
+
+    Args:
+        incident: Incident record; the keys ``id``, ``type``, ``title``,
+            ``description``, ``category_labels``, ``created_at`` and
+            ``updated_at`` are read, all of them optional.
+        articles: Articles of the report; only the count is used.
+        fact_checks: Fact checks of the report; only the count is used.
+        sources: Sources of the report; only the count is used.
+        creator: Author name; falls back to "AegisSight Monitor" when empty.
+        scope: Export scope, passed to ``_slug_scope_label``.
+        sections: Explicit section selection, passed to ``_slug_scope_label``.
+        organization_name: Optional organisation, added to keywords and comments.
+        top_locations: Optional location names; ``None`` and empty entries
+            are ignored.
+
+    Returns:
+        A dict with the keys ``title``, ``author``, ``subject``, ``keywords``
+        (list), ``keywords_comma``, ``keywords_semicolon``, ``category``,
+        ``comments``, ``creator_app``, ``language``, ``created``, ``modified``
+        (datetimes), ``created_iso``, ``modified_iso``, ``type_label`` and
+        ``scope_label``.
+    """
+    is_research = incident.get("type") == "research"
+    type_label = "Hintergrundrecherche" if is_research else "Live-Monitoring"
+    category = "OSINT-Hintergrundrecherche" if is_research else "OSINT-Lagebericht"
+    scope_label = _slug_scope_label(scope, sections)
+
+    title_raw = (incident.get("title") or "Unbenannte Lage").strip()
+    title = f"{title_raw} — {type_label}"
+
+    subject = (incident.get("description") or "").strip()
+    if not subject:
+        subject = f"{type_label} zu: {title_raw}"
+
+    # Filter the locations once so that keywords and the comments block agree
+    # and ", ".join() below never receives None or an empty placeholder.
+    locations = [loc for loc in (top_locations or []) if loc]
+
+    # Collect keywords; the order is kept because it determines display order.
+    keywords: list[str] = ["OSINT", type_label]
+    if organization_name:
+        keywords.append(organization_name)
+
+    # category_labels is a comma-separated string.
+    cat_labels = incident.get("category_labels") or ""
+    for lbl in cat_labels.split(","):
+        lbl = lbl.strip()
+        if lbl:
+            keywords.append(lbl)
+
+    keywords.extend(locations)
+
+    # Case-insensitive, order-preserving de-duplication, capped at 15 entries.
+    seen = set()
+    unique_keywords: list[str] = []
+    for kw in keywords:
+        key = kw.lower()
+        if key in seen:
+            continue
+        seen.add(key)
+        unique_keywords.append(kw)
+        if len(unique_keywords) >= 15:
+            break
+
+    now = datetime.now(TIMEZONE)
+    # Fall back to "now" without tzinfo when the incident has no usable timestamp.
+    created = _parse_db_timestamp(incident.get("created_at")) or now.replace(tzinfo=None)
+    modified = _parse_db_timestamp(incident.get("updated_at")) or created
+
+    # Compact, structured comments block (written to the DOCX comments property).
+    stand = now.strftime("%d.%m.%Y")
+    comments_lines = [
+        f"Incident-ID: {incident.get('id', '?')} | Typ: {incident.get('type', 'adhoc')} | Scope: {scope_label}",
+        f"Stand: {stand}",
+    ]
+    if organization_name:
+        comments_lines.append(f"Organisation: {organization_name}")
+    comments_lines.append(
+        f"Umfang: {len(articles)} Artikel, {len(fact_checks)} Faktenchecks, {len(sources)} Quellen"
+    )
+    if locations:
+        comments_lines.append("Orte: " + ", ".join(locations[:5]))
+    comments = "\n".join(comments_lines)
+
+    return {
+        "title": title,
+        "author": creator or "AegisSight Monitor",
+        "subject": subject,
+        "keywords": unique_keywords,
+        "keywords_comma": ", ".join(unique_keywords),
+        "keywords_semicolon": "; ".join(unique_keywords),
+        "category": category,
+        "comments": comments,
+        "creator_app": "AegisSight Monitor",
+        "language": "de-DE",
+        "created": created,
+        "modified": modified,
+        "created_iso": created.strftime("%Y-%m-%dT%H:%M:%S"),
+        "modified_iso": modified.strftime("%Y-%m-%dT%H:%M:%S"),
+        "type_label": type_label,
+        "scope_label": scope_label,
+    }
+
+
 async def generate_pdf(
    incident: dict, articles: list, fact_checks: list, snapshots: list,
    scope: str, creator: str, executive_summary_html: str,
    sections: set[str] | None = None,
+    organization_name: str | None = None,
+    top_locations: list[str] | None = None,
 ) -> bytes:
    """PDF-Report via WeasyPrint generieren."""
    # Sections aus scope ableiten wenn nicht explizit angegeben
@@ -424,6 +546,11 @@ async def generate_pdf(
    if not is_research and zusammenfassung_html:
        zusammenfassung_html = _linkify_citations_html(zusammenfassung_html, all_sources)

+    meta = _build_export_metadata(
+        incident, articles, fact_checks, all_sources, creator, scope, sections,
+        organization_name, top_locations,
+    )
+
    env = Environment(loader=FileSystemLoader(str(TEMPLATE_DIR)))
    template = env.get_template("report.html")

@@ -449,6 +576,7 @@ async def generate_pdf(
        source_stats=_prepare_source_stats(articles)[:20] if scope == "report" else _prepare_source_stats(articles),
        timeline=_prepare_timeline(articles) if scope == "full" else [],
        articles=articles if scope == "full" else [],
+        meta=meta,
    )

    # Artikel pub_date aufbereiten
@@ -468,6 +596,8 @@ async def generate_docx(
    incident: dict, articles: list, fact_checks: list, snapshots: list,
    scope: str, creator: str, executive_summary_text: str,
    sections: set[str] | None = None,
+    organization_name: str | None = None,
+    top_locations: list[str] | None = None,
 ) -> bytes:
    """Word-Report via python-docx generieren."""
    doc = Document()
@@ -496,6 +626,28 @@ async def generate_docx(
            zusammenfassung_title = "Zusammenfassung"
            bericht_summary = remaining

+    meta = _build_export_metadata(
+        incident, articles, fact_checks, all_sources, creator, scope, sections,
+        organization_name, top_locations,
+    )
+
+    # Dateimetadaten setzen (sichtbar in Explorer/Finder, DMS-Systemen)
+    cp = doc.core_properties
+    cp.title = meta["title"]
+    cp.author = meta["author"]
+    cp.subject = meta["subject"]
+    cp.keywords = meta["keywords_semicolon"]
+    cp.comments = meta["comments"]
+    cp.category = meta["category"]
+    cp.last_modified_by = meta["author"]
+    cp.language = meta["language"]
+    cp.content_status = "Final"
+    try:
+        cp.created = meta["created"]
+        cp.modified = meta["modified"]
+    except (ValueError, TypeError) as e:
+        logger.warning(f"DOCX created/modified konnte nicht gesetzt werden: {e}")
+
    # Styles
    style = doc.styles['Normal']
    style.font.size = Pt(10)