Promote develop → main (2026-05-03 00:02 UTC)

Umlaut-Normalisierung an drei Stellen + auch articles im QC
Fix fuer ASCII-Umlaute in Headlines/Inhalten (Gespraeche statt Gespräche). Zwei Quellen des Problems: 1. Quellen wie dpa-AFX, Telegram TASS/RIA liefern Headlines schon ASCII-fiziert 2. LLM-Uebersetzungen driften gelegentlich zu ae/oe/ue trotz Prompt Aenderungen: - rss_parser.py: nach html_to_text auch normalize_german_umlauts auf title und summary anwenden (sicher, hunspell-Dict ignoriert englische Woerter wie Boeing/Business) - orchestrator.py:1418 Translation-INSERT: headline_de und content_de durch normalize_german_umlauts schicken (LLM-Drift abfangen) - post_refresh_qc.py: neue Funktion normalize_umlaut_articles als Sicherheitsnetz analog zu normalize_umlaut_fields. Behandelt headline_de und content_de aller Artikel des Incidents; bei language=de zusaetzlich headline und content_original. Wird in run_post_refresh_qc nach normalize_umlaut_fields aufgerufen. Backfill: migrations/migrate_umlauts_2026-05-03.py (im Verwaltungs-Repo)
2026-05-03 02:02:20 +02:00 · 2026-05-02 23:26:19 +00:00 · 2026-05-02 23:13:32 +00:00
--- a/src/agents/orchestrator.py
+++ b/src/agents/orchestrator.py
@@ -1411,12 +1411,22 @@ class AgentOrchestrator:
                        )
                    # Übersetzungen aktualisieren (nur für gültige DB-IDs)
                    # LLM-Drift abfangen: trotz Prompt-Anweisung kommen manchmal
                    # ASCII-Umlaute ("Gespraeche" statt "Gespräche") in der Übersetzung.
                    # Dictionary-basierte Korrektur schreibt nur deutsche Woerter um.
                    from services.post_refresh_qc import normalize_german_umlauts as _norm_de
                    for translation in analysis.get("translations", []):
                        article_id = translation.get("article_id")
                        if isinstance(article_id, int):
                            hd = translation.get("headline_de")
                            cd = translation.get("content_de")
                            if hd:
                                hd, _ = _norm_de(hd)
                            if cd:
                                cd, _ = _norm_de(cd)
                            await db.execute(
                                "UPDATE articles SET headline_de = ?, content_de = ? WHERE id = ? AND incident_id = ?",
-                                (translation.get("headline_de"), translation.get("content_de"), article_id, incident_id),
+                                (hd, cd, article_id, incident_id),
                            )
                    await db.commit()
--- a/src/feeds/rss_parser.py
+++ b/src/feeds/rss_parser.py
@@ -6,6 +6,8 @@ import httpx
 from datetime import datetime, timezone
 from config import TIMEZONE, MAX_ARTICLES_PER_DOMAIN_RSS
 from source_rules import _extract_domain
 from feeds.transcript_extractors._common import html_to_text
 from services.post_refresh_qc import normalize_german_umlauts
 logger = logging.getLogger("osint.rss")
@@ -152,7 +154,15 @@ class RSSParser:
            for entry in feed.entries[:50]:
                title = entry.get("title", "")
-                summary = entry.get("summary", "")
+                # RSS-summary ist bei vielen Quellen HTML (Guardian, AP, SZ, ...).
                # Vor weiterer Verwendung strippen, sonst landet HTML in DB
                # und KI-Agenten und Sprach-Heuristik werden gestoert.
                summary_raw = entry.get("summary", "")
                summary = html_to_text(summary_raw) if summary_raw else ""
                # ASCII-Umlaut-Normalisierung (z.B. dpa-AFX schreibt "Gespraeche").
                # Dictionary-basiert, sicher gegen englische Woerter wie "Boeing".
                title, _ = normalize_german_umlauts(title)
                summary, _ = normalize_german_umlauts(summary)
                text = f"{title} {summary}".lower()
                # Adaptive Match-Schwelle:
--- a/src/services/post_refresh_qc.py
+++ b/src/services/post_refresh_qc.py
@@ -400,18 +400,20 @@ async def run_post_refresh_qc(db, incident_id: int) -> dict:
            db, incident_id, incident_title, incident_desc
        )
        umlauts_fixed = await normalize_umlaut_fields(db, incident_id)
        article_umlauts_fixed = await normalize_umlaut_articles(db, incident_id)
-        if facts_removed > 0 or locations_fixed > 0 or umlauts_fixed > 0:
+        total_umlaut_changes = umlauts_fixed + article_umlauts_fixed
        if facts_removed > 0 or locations_fixed > 0 or total_umlaut_changes > 0:
            await db.commit()
            logger.info(
-                "Post-Refresh QC fuer Incident %d: %d Duplikate entfernt, %d Locations korrigiert, %d Umlaute normalisiert",
+                "Post-Refresh QC fuer Incident %d: %d Duplikate entfernt, %d Locations korrigiert, %d Umlaute normalisiert (davon %d in Articles)",
-                incident_id, facts_removed, locations_fixed, umlauts_fixed,
+                incident_id, facts_removed, locations_fixed, total_umlaut_changes, article_umlauts_fixed,
            )
        return {
            "facts_removed": facts_removed,
            "locations_fixed": locations_fixed,
-            "umlauts_fixed": umlauts_fixed,
+            "umlauts_fixed": total_umlaut_changes,
        }
    except Exception as e:
@@ -568,3 +570,64 @@ async def normalize_umlaut_fields(db, incident_id: int) -> int:
        incident_id, count_summary, count_dev,
    )
    return total
 async def normalize_umlaut_articles(db, incident_id: int) -> int:
    """Normalize ASCII-transliterated umlauts in an incident's article texts.

    Fields handled:
    - ``headline_de`` and ``content_de`` for every article (the LLM
      translation may emit ASCII umlauts despite the prompt instruction)
    - ``headline`` and ``content_original`` only when the article's
      ``language`` is ``de`` (some sources, e.g. dpa-AFX or Telegram
      channels, already deliver ASCII umlauts themselves)

    A column is written back only when ``normalize_german_umlauts`` reports
    at least one replacement for it, so already-correct text is left
    untouched. No commit is issued here.

    Returns:
        The sum of the replacement counts reported by
        ``normalize_german_umlauts`` across all processed fields.
    """
    # Translation columns are checked for every article; the source-language
    # columns are only touched when the article itself is German.
    translated_fields = ("headline_de", "content_de")
    original_fields = ("headline", "content_original")

    cursor = await db.execute(
        """SELECT id, language, headline, headline_de, content_original, content_de
           FROM articles WHERE incident_id = ?""",
        (incident_id,),
    )
    articles = await cursor.fetchall()
    if not articles:
        return 0

    replaced_total = 0
    for article in articles:
        candidates = translated_fields
        if (article["language"] or "").lower() == "de":
            candidates = translated_fields + original_fields

        # Collect only the columns whose text actually changed.
        changed = {}
        for column in candidates:
            text = article[column]
            if not text:
                continue
            fixed, hits = normalize_german_umlauts(text)
            if hits > 0:
                changed[column] = fixed
                replaced_total += hits

        if not changed:
            continue

        # Column names come from the fixed tuples above, never from data,
        # so interpolating them into the statement is safe.
        assignments = ", ".join(f"{column} = ?" for column in changed)
        params = [*changed.values(), article["id"]]
        await db.execute(f"UPDATE articles SET {assignments} WHERE id = ?", params)
    return replaced_total