report-export: verlinkte Zitate in Zusammenfassung und Bericht

2026-04-14 17:55:01 +00:00
Commit 0ea7f9e305
--- a/src/report_generator.py
+++ b/src/report_generator.py
@@ -171,53 +171,151 @@ def _strip_citation_numbers(text: str) -> str:
    return text


+def _find_source_for_citation(num: str, sources: list) -> dict | None:
+    """Return the source whose ``nr`` matches citation number *num*, else ``None``.
+
+    An exact match is tried first; if *num* ends in a lowercase letter the
+    lookup is repeated without it (``1383a`` -> ``1383``). Entries without a
+    ``get`` method are skipped in both passes instead of raising.
+    """
+    if not sources:
+        return None
+    wanted_numbers = [num]
+    if re.search(r"[a-z]$", num):
+        wanted_numbers.append(re.sub(r"[a-z]$", "", num))
+    for wanted in wanted_numbers:
+        for s in sources:
+            if hasattr(s, "get") and str(s.get("nr")) == wanted:
+                return s
+    return None
+
+
+def _linkify_citations_html(text: str, sources: list) -> str:
+    """Replace ``[1234]`` citation markers in *text* with HTML links to their source.
+
+    Markers whose source is unknown or has no URL are left as they are. URL and
+    name are HTML-escaped (``&``, ``<``, ``>`` and quotes) for attribute use.
+    """
+    from html import escape
+    if not text or not sources:
+        return text
+
+    def repl(match: re.Match) -> str:
+        num = match.group(1)
+        src = _find_source_for_citation(num, sources)
+        if not (src and src.get("url")):
+            return match.group(0)
+        url = escape(str(src["url"]), quote=True)
+        name = escape(str(src.get("name") or ""), quote=True)
+        return f'<a href="{url}" class="citation" title="{name}">[{num}]</a>'
+
+    return re.sub(r"\[(\d{1,5}[a-z]?)\]", repl, text)
+
+
+def _add_docx_hyperlink(paragraph, url: str, text: str):
+    """Append a clickable external hyperlink to a python-docx paragraph.
+
+    *url* is registered as an external relationship on the paragraph's part;
+    a ``w:hyperlink`` element holding one styled run with *text* is appended
+    to the paragraph XML and returned.
+    """
+    from docx.oxml.shared import OxmlElement, qn
+
+    rel_type = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink"
+    rel_id = paragraph.part.relate_to(url, rel_type, is_external=True)
+
+    # Run formatting, appended in this order: colour, underline, size.
+    run_props = OxmlElement("w:rPr")
+    for tag, value in (("w:color", "0066CC"), ("w:u", "single"), ("w:sz", "20")):
+        prop = OxmlElement(tag)
+        prop.set(qn("w:val"), value)
+        run_props.append(prop)
+
+    # xml:space="preserve": presumably so whitespace in the link text survives.
+    text_node = OxmlElement("w:t")
+    text_node.text = text
+    text_node.set(qn("xml:space"), "preserve")
+
+    run = OxmlElement("w:r")
+    run.append(run_props)
+    run.append(text_node)
+
+    link = OxmlElement("w:hyperlink")
+    link.set(qn("r:id"), rel_id)
+    link.append(run)
+    paragraph._p.append(link)
+    return link
+
+
+def _add_docx_paragraph_with_citations(doc_or_para, text: str, sources: list, style: str | None = None):
+    """Write *text* into a paragraph, turning ``[1234]`` citations into hyperlinks.
+
+    A target with ``add_paragraph`` gets a new paragraph (using *style* if set);
+    anything else is treated as the paragraph itself, which is returned.
+    """
+    if hasattr(doc_or_para, "add_paragraph"):
+        style_kwargs = {"style": style} if style else {}
+        para = doc_or_para.add_paragraph(**style_kwargs)
+    else:
+        para = doc_or_para
+
+    cursor = 0
+    for hit in re.finditer(r"\[(\d{1,5}[a-z]?)\]", text):
+        if hit.start() > cursor:
+            para.add_run(text[cursor:hit.start()])
+        source = _find_source_for_citation(hit.group(1), sources)
+        if source and source.get("url"):
+            _add_docx_hyperlink(para, source["url"], f"[{hit.group(1)}]")
+        else:
+            para.add_run(hit.group(0))
+        cursor = hit.end()
+    if cursor < len(text):
+        para.add_run(text[cursor:])
+    return para



-def _extract_zusammenfassung(summary_text: str) -> tuple[str, str]:
-    """Extrahiert die ZUSAMMENFASSUNG-Sektion aus einem Research-Briefing.
+
+
+def _extract_zusammenfassung_lines(summary_text: str) -> tuple[list[str], str]:
+    """Extrahiert die ZUSAMMENFASSUNG-Sektion als Liste von Rohzeilen (ohne Zitatbearbeitung).

    Returns:
-        (zusammenfassung_html, remaining_summary)
-        zusammenfassung_html: HTML-formatierte Bullet Points
-        remaining_summary: Der Rest des Berichts ohne die Zusammenfassung
+        (lines, remaining_summary)
    """
    if not summary_text:
-        return "", summary_text
+        return [], summary_text

-    # Suche nach ## ZUSAMMENFASSUNG ... bis zur naechsten ## Ueberschrift
    pattern = r"(## (?:ZUSAMMENFASSUNG|ÜBERBLICK)\s*\n)(.*?)(?=\n## |\Z)"
    match = re.search(pattern, summary_text, re.DOTALL)
    if not match:
-        return "", summary_text
+        return [], summary_text

    zusammenfassung_raw = match.group(2).strip()
-    # Rest des Berichts ohne die Zusammenfassung-Sektion
    remaining = summary_text[:match.start()] + summary_text[match.end():]
    remaining = remaining.strip()

-    # Bullet Points als HTML formatieren
-    lines = []
+    lines: list[str] = []
    for line in zusammenfassung_raw.split("\n"):
        stripped = line.strip()
-        if stripped.startswith("- "):
-            clean = _strip_citation_numbers(stripped[2:].strip())
-            if clean:
-                lines.append(clean)
-        elif stripped.startswith("* "):
-            clean = _strip_citation_numbers(stripped[2:].strip())
-            if clean:
-                lines.append(clean)
+        if stripped.startswith("- ") or stripped.startswith("* "):
+            content = stripped[2:].strip()
+            if content:
+                lines.append(content)
        elif stripped and not stripped.startswith("#"):
-            clean = _strip_citation_numbers(stripped)
-            if clean:
-                lines.append(clean)
+            lines.append(stripped)
+    return lines, remaining

-    if lines:
-        html = "<ul>\n" + "\n".join(f"<li>{line}</li>" for line in lines) + "\n</ul>"
-    else:
-        html = f"<p>{_strip_citation_numbers(zusammenfassung_raw)}</p>"

+def _extract_zusammenfassung(summary_text: str, sources: list | None = None) -> tuple[str, str]:
+    """Extract the ZUSAMMENFASSUNG section; return (HTML list with linked citations, remaining text)."""
+    lines, remaining = _extract_zusammenfassung_lines(summary_text)
+    if not lines:
+        return "", summary_text  # nothing extracted: hand the input back unchanged
+
+    src_list = sources or []  # no sources given: citations stay as plain text
+    html_lines = [f"<li>{_linkify_citations_html(line, src_list)}</li>" for line in lines]
+    html = "<ul>\n" + "\n".join(html_lines) + "\n</ul>"
    return html, remaining


@@ -310,17 +408,22 @@ async def generate_pdf(

    # Fuer Research-Lagen: Zusammenfassung aus dem Bericht extrahieren
    is_research = incident.get("type") == "research"
+    all_sources = _prepare_sources(incident)
    zusammenfassung_html = executive_summary_html
    bericht_summary = incident.get("summary", "")
    zusammenfassung_title = "Zusammenfassung"

    if is_research and bericht_summary:
-        extracted_html, remaining = _extract_zusammenfassung(bericht_summary)
+        extracted_html, remaining = _extract_zusammenfassung(bericht_summary, all_sources)
        if extracted_html:
            zusammenfassung_html = extracted_html
            zusammenfassung_title = "Zusammenfassung"
            bericht_summary = remaining

+    # Auch das (nicht-research) Executive Summary linkifizieren — ggf. enthaelt es Zitate
+    if not is_research and zusammenfassung_html:
+        zusammenfassung_html = _linkify_citations_html(zusammenfassung_html, all_sources)
+
    env = Environment(loader=FileSystemLoader(str(TEMPLATE_DIR)))
    template = env.get_template("report.html")

@@ -337,8 +440,8 @@ async def generate_pdf(
        zusammenfassung_title=zusammenfassung_title,
        sections=sections,
        scope=scope,
-        lagebild_html=_markdown_to_html(
-            _strip_citation_numbers(bericht_summary)
+        lagebild_html=_linkify_citations_html(
+            _markdown_to_html(bericht_summary), all_sources
        ),
        lagebild_timestamp=(incident.get("updated_at") or "")[:16].replace("T", " "),
        sources=_prepare_sources(incident)[:30] if scope == "report" else _prepare_sources(incident),
@@ -380,14 +483,16 @@ async def generate_docx(

    # Fuer Research-Lagen: Zusammenfassung aus dem Bericht extrahieren
    is_research = incident.get("type") == "research"
+    all_sources = _prepare_sources(incident)
    zusammenfassung_text = executive_summary_text
    bericht_summary = incident.get("summary") or "Keine Zusammenfassung verfuegbar."
    zusammenfassung_title = "Zusammenfassung"
+    zusammenfassung_lines: list[str] = []

    if is_research and bericht_summary:
-        extracted_html, remaining = _extract_zusammenfassung(bericht_summary)
-        if extracted_html:
-            zusammenfassung_text = extracted_html
+        extracted_lines, remaining = _extract_zusammenfassung_lines(bericht_summary)
+        if extracted_lines:
+            zusammenfassung_lines = extracted_lines
            zusammenfassung_title = "Zusammenfassung"
            bericht_summary = remaining

@@ -446,31 +551,35 @@ async def generate_docx(
    if "zusammenfassung" in sections:
        doc.add_heading(zusammenfassung_title, level=1)

-        # HTML-Tags entfernen und als Bullet Points
-        clean_text = re.sub(r'<[^>]+>', '', zusammenfassung_text)
-        lines = [line.strip().lstrip("- ").lstrip("* ") for line in clean_text.strip().split("\n") if line.strip()]
-        for line in lines:
-            if line:
-                doc.add_paragraph(line, style='List Bullet')
+        if zusammenfassung_lines:
+            for line in zusammenfassung_lines:
+                _add_docx_paragraph_with_citations(doc, line, all_sources, style='List Bullet')
+        else:
+            # Fallback: HTML-Tags aus executive_summary_text strippen, dann Bullets bilden
+            clean_text = re.sub(r'<[^>]+>', '', zusammenfassung_text or '')
+            lines = [line.strip().lstrip("- ").lstrip("* ") for line in clean_text.strip().split("\n") if line.strip()]
+            for line in lines:
+                if line:
+                    _add_docx_paragraph_with_citations(doc, line, all_sources, style='List Bullet')

    if "bericht" in sections:
        # --- Lagebild / Recherchebericht ---
        doc.add_heading("Recherchebericht" if is_research else "Lagebild", level=1)
-        summary = _strip_citation_numbers(bericht_summary)
-        # Markdown-Formatierung entfernen
-        clean_summary = re.sub(r'\*\*(.+?)\*\*', r'\1', summary)
-        clean_summary = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', clean_summary)
+        # Markdown-Formatierung entfernen, Zitate aber als [NNN] beibehalten und als Hyperlinks rendern
+        clean_summary = re.sub(r'\*\*(.+?)\*\*', r'\1', bericht_summary)
+        clean_summary = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'\1', clean_summary)
        clean_summary = re.sub(r'^#{1,3}\s+', '', clean_summary, flags=re.MULTILINE)
        for para_text in clean_summary.split("\n\n"):
            para_text = para_text.strip()
-            if para_text:
-                if para_text.startswith("- "):
-                    for bullet in para_text.split("\n"):
-                        bullet = bullet.lstrip("- ").strip()
-                        if bullet:
-                            doc.add_paragraph(bullet, style='List Bullet')
-                else:
-                    doc.add_paragraph(para_text)
+            if not para_text:
+                continue
+            if para_text.startswith("- "):
+                for bullet in para_text.split("\n"):
+                    bullet = bullet.lstrip("- ").strip()
+                    if bullet:
+                        _add_docx_paragraph_with_citations(doc, bullet, all_sources, style='List Bullet')
+            else:
+                _add_docx_paragraph_with_citations(doc, para_text, all_sources)

    if "faktencheck" in sections:
        # --- Faktencheck ---