report-export: verlinkte Zitate in Zusammenfassung und Bericht

2026-04-14 17:55:01 +00:00
Commit 0ea7f9e305
--- a/src/report_generator.py
+++ b/src/report_generator.py
@@ -171,53 +171,151 @@ def _strip_citation_numbers(text: str) -> str:
    return text
 def _find_source_for_citation(num: str, sources: list) -> dict | None:
    """Sucht eine Quelle anhand der Zitat-Nummer (inkl. Suffix-Fallback wie 1383a -> 1383)."""
    if not sources:
        return None
    for s in sources:
        try:
            if str(s.get("nr")) == num:
                return s
        except Exception:
            continue
    # Suffix-Fallback: 1383a -> 1383
    if re.search(r"[a-z]$", num):
        base = re.sub(r"[a-z]$", "", num)
        for s in sources:
            if str(s.get("nr")) == base:
                return s
    return None
 def _linkify_citations_html(text: str, sources: list) -> str:
    """Ersetzt [1234]-Zitate durch HTML-Links zur jeweiligen Quelle.
    Nummern ohne zugeordnete Quelle bleiben als sichtbare Zahl erhalten.
    """
    if not text:
        return text
    if not sources:
        return text
    def repl(match: re.Match) -> str:
        num = match.group(1)
        src = _find_source_for_citation(num, sources)
        if src and src.get("url"):
            url = src["url"].replace('"', "&quot;")
            name = (src.get("name") or "").replace('"', "&quot;")
            return f'<a href="{url}" class="citation" title="{name}">[{num}]</a>'
        return match.group(0)
    return re.sub(r"\[(\d{1,5}[a-z]?)\]", repl, text)
 def _add_docx_hyperlink(paragraph, url: str, text: str):
    """Append a clickable external hyperlink to a python-docx paragraph.

    Registers ``url`` as an external hyperlink relationship on the
    paragraph's part, builds a ``w:hyperlink`` element holding one run with
    ``text``, and appends it to the end of the paragraph's XML element.

    Args:
        paragraph: python-docx paragraph object to extend.
        url: Link target.
        text: Visible link text.

    Returns:
        The newly created ``w:hyperlink`` element.
    """
    from docx.oxml.shared import OxmlElement, qn

    rel_id = paragraph.part.relate_to(
        url,
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink",
        is_external=True,
    )

    # Run properties, appended in this order: colour, underline, size.
    run_props = OxmlElement("w:rPr")
    for tag, value in (("w:color", "0066CC"), ("w:u", "single"), ("w:sz", "20")):
        prop = OxmlElement(tag)
        prop.set(qn("w:val"), value)
        run_props.append(prop)

    # Text node; xml:space="preserve" is set on it alongside the text.
    text_el = OxmlElement("w:t")
    text_el.text = text
    text_el.set(qn("xml:space"), "preserve")

    run = OxmlElement("w:r")
    run.append(run_props)
    run.append(text_el)

    link = OxmlElement("w:hyperlink")
    link.set(qn("r:id"), rel_id)
    link.append(run)

    paragraph._p.append(link)
    return link
 def _add_docx_paragraph_with_citations(doc_or_para, text: str, sources: list, style: str | None = None):
    """Fügt ein Paragraph hinzu, bei dem [1234]-Zitate als Hyperlink-Runs eingefügt werden.
    doc_or_para darf ein Document sein (neues Paragraph wird angelegt) oder bereits ein Paragraph.
    """
    if hasattr(doc_or_para, "add_paragraph"):
        para = doc_or_para.add_paragraph(style=style) if style else doc_or_para.add_paragraph()
    else:
        para = doc_or_para
    pattern = re.compile(r"\[(\d{1,5}[a-z]?)\]")
    pos = 0
    for m in pattern.finditer(text):
        if m.start() > pos:
            para.add_run(text[pos:m.start()])
        num = m.group(1)
        src = _find_source_for_citation(num, sources)
        if src and src.get("url"):
            _add_docx_hyperlink(para, src["url"], f"[{num}]")
        else:
            para.add_run(m.group(0))
        pos = m.end()
    if pos < len(text):
        para.add_run(text[pos:])
    return para
-def _extract_zusammenfassung(summary_text: str) -> tuple[str, str]:
+
-    """Extrahiert die ZUSAMMENFASSUNG-Sektion aus einem Research-Briefing.
+
 def _extract_zusammenfassung_lines(summary_text: str) -> tuple[list[str], str]:
    """Extrahiert die ZUSAMMENFASSUNG-Sektion als Liste von Rohzeilen (ohne Zitatbearbeitung).
    Returns:
-        (zusammenfassung_html, remaining_summary)
+        (lines, remaining_summary)
        zusammenfassung_html: HTML-formatierte Bullet Points
        remaining_summary: Der Rest des Berichts ohne die Zusammenfassung
    """
    if not summary_text:
-        return "", summary_text
+        return [], summary_text
    # Suche nach ## ZUSAMMENFASSUNG ... bis zur naechsten ## Ueberschrift
    pattern = r"(## (?:ZUSAMMENFASSUNG|ÜBERBLICK)\s*\n)(.*?)(?=\n## |\Z)"
    match = re.search(pattern, summary_text, re.DOTALL)
    if not match:
-        return "", summary_text
+        return [], summary_text
    zusammenfassung_raw = match.group(2).strip()
    # Rest des Berichts ohne die Zusammenfassung-Sektion
    remaining = summary_text[:match.start()] + summary_text[match.end():]
    remaining = remaining.strip()
-    # Bullet Points als HTML formatieren
+    lines: list[str] = []
    lines = []
    for line in zusammenfassung_raw.split("\n"):
        stripped = line.strip()
-        if stripped.startswith("- "):
+        if stripped.startswith("- ") or stripped.startswith("* "):
-            clean = _strip_citation_numbers(stripped[2:].strip())
+            content = stripped[2:].strip()
-            if clean:
+            if content:
-                lines.append(clean)
+                lines.append(content)
        elif stripped.startswith("* "):
            clean = _strip_citation_numbers(stripped[2:].strip())
            if clean:
                lines.append(clean)
        elif stripped and not stripped.startswith("#"):
-            clean = _strip_citation_numbers(stripped)
+            lines.append(stripped)
-            if clean:
+    return lines, remaining
                lines.append(clean)
    if lines:
        html = "<ul>\n" + "\n".join(f"<li>{line}</li>" for line in lines) + "\n</ul>"
    else:
        html = f"<p>{_strip_citation_numbers(zusammenfassung_raw)}</p>"
 def _extract_zusammenfassung(summary_text: str, sources: list | None = None) -> tuple[str, str]:
    """Return the ZUSAMMENFASSUNG section as an HTML list with linked citations.

    Args:
        summary_text: Full report text to extract the section from.
        sources: Source entries used to link citations; ``None`` is treated
            as an empty list.

    Returns:
        ``(html, remaining)`` where ``html`` is a ``<ul>`` with one ``<li>``
        per extracted line and ``remaining`` is the rest of the report as
        returned by ``_extract_zusammenfassung_lines``. When no lines are
        extracted the result is ``("", summary_text)``.
    """
    bullet_lines, rest = _extract_zusammenfassung_lines(summary_text)
    if not bullet_lines:
        return "", summary_text
    known_sources = sources if sources else []
    items = "\n".join(
        f"<li>{_linkify_citations_html(entry, known_sources)}</li>" for entry in bullet_lines
    )
    return "<ul>\n" + items + "\n</ul>", rest
@@ -310,17 +408,22 @@ async def generate_pdf(
    # Fuer Research-Lagen: Zusammenfassung aus dem Bericht extrahieren
    is_research = incident.get("type") == "research"
    all_sources = _prepare_sources(incident)
    zusammenfassung_html = executive_summary_html
    bericht_summary = incident.get("summary", "")
    zusammenfassung_title = "Zusammenfassung"
    if is_research and bericht_summary:
-        extracted_html, remaining = _extract_zusammenfassung(bericht_summary)
+        extracted_html, remaining = _extract_zusammenfassung(bericht_summary, all_sources)
        if extracted_html:
            zusammenfassung_html = extracted_html
            zusammenfassung_title = "Zusammenfassung"
            bericht_summary = remaining
    # Auch das (nicht-research) Executive Summary linkifizieren — ggf. enthaelt es Zitate
    if not is_research and zusammenfassung_html:
        zusammenfassung_html = _linkify_citations_html(zusammenfassung_html, all_sources)
    env = Environment(loader=FileSystemLoader(str(TEMPLATE_DIR)))
    template = env.get_template("report.html")
@@ -337,8 +440,8 @@ async def generate_pdf(
        zusammenfassung_title=zusammenfassung_title,
        sections=sections,
        scope=scope,
-        lagebild_html=_markdown_to_html(
+        lagebild_html=_linkify_citations_html(
-            _strip_citation_numbers(bericht_summary)
+            _markdown_to_html(bericht_summary), all_sources
        ),
        lagebild_timestamp=(incident.get("updated_at") or "")[:16].replace("T", " "),
        sources=_prepare_sources(incident)[:30] if scope == "report" else _prepare_sources(incident),
@@ -380,14 +483,16 @@ async def generate_docx(
    # Fuer Research-Lagen: Zusammenfassung aus dem Bericht extrahieren
    is_research = incident.get("type") == "research"
    all_sources = _prepare_sources(incident)
    zusammenfassung_text = executive_summary_text
    bericht_summary = incident.get("summary") or "Keine Zusammenfassung verfuegbar."
    zusammenfassung_title = "Zusammenfassung"
    zusammenfassung_lines: list[str] = []
    if is_research and bericht_summary:
-        extracted_html, remaining = _extract_zusammenfassung(bericht_summary)
+        extracted_lines, remaining = _extract_zusammenfassung_lines(bericht_summary)
-        if extracted_html:
+        if extracted_lines:
-            zusammenfassung_text = extracted_html
+            zusammenfassung_lines = extracted_lines
            zusammenfassung_title = "Zusammenfassung"
            bericht_summary = remaining
@@ -446,31 +551,35 @@ async def generate_docx(
    if "zusammenfassung" in sections:
        doc.add_heading(zusammenfassung_title, level=1)
-        # HTML-Tags entfernen und als Bullet Points
+        if zusammenfassung_lines:
-        clean_text = re.sub(r'<[^>]+>', '', zusammenfassung_text)
+            for line in zusammenfassung_lines:
                _add_docx_paragraph_with_citations(doc, line, all_sources, style='List Bullet')
        else:
            # Fallback: HTML-Tags aus executive_summary_text strippen, dann Bullets bilden
            clean_text = re.sub(r'<[^>]+>', '', zusammenfassung_text or '')
            lines = [line.strip().lstrip("- ").lstrip("* ") for line in clean_text.strip().split("\n") if line.strip()]
            for line in lines:
                if line:
-                doc.add_paragraph(line, style='List Bullet')
+                    _add_docx_paragraph_with_citations(doc, line, all_sources, style='List Bullet')
    if "bericht" in sections:
        # --- Lagebild / Recherchebericht ---
        doc.add_heading("Recherchebericht" if is_research else "Lagebild", level=1)
-        summary = _strip_citation_numbers(bericht_summary)
+        # Markdown-Formatierung entfernen, Zitate aber als [NNN] beibehalten und als Hyperlinks rendern
-        # Markdown-Formatierung entfernen
+        clean_summary = re.sub(r'\*\*(.+?)\*\*', r'\1', bericht_summary)
-        clean_summary = re.sub(r'\*\*(.+?)\*\*', r'\1', summary)
+        clean_summary = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'\1', clean_summary)
        clean_summary = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', clean_summary)
        clean_summary = re.sub(r'^#{1,3}\s+', '', clean_summary, flags=re.MULTILINE)
        for para_text in clean_summary.split("\n\n"):
            para_text = para_text.strip()
-            if para_text:
+            if not para_text:
                continue
            if para_text.startswith("- "):
                for bullet in para_text.split("\n"):
                    bullet = bullet.lstrip("- ").strip()
                    if bullet:
-                            doc.add_paragraph(bullet, style='List Bullet')
+                        _add_docx_paragraph_with_citations(doc, bullet, all_sources, style='List Bullet')
            else:
-                    doc.add_paragraph(para_text)
+                _add_docx_paragraph_with_citations(doc, para_text, all_sources)
    if "faktencheck" in sections:
        # --- Faktencheck ---