diff --git a/src/agents/orchestrator.py b/src/agents/orchestrator.py index 72bb3a1..a1fa096 100644 --- a/src/agents/orchestrator.py +++ b/src/agents/orchestrator.py @@ -1411,12 +1411,22 @@ class AgentOrchestrator: ) # Übersetzungen aktualisieren (nur für gültige DB-IDs) + # LLM-Drift abfangen: trotz Prompt-Anweisung kommen manchmal + # ASCII-Umlaute ("Gespraeche" statt "Gespräche") in der Übersetzung. + # Dictionary-basierte Korrektur schreibt nur deutsche Woerter um. + from services.post_refresh_qc import normalize_german_umlauts as _norm_de for translation in analysis.get("translations", []): article_id = translation.get("article_id") if isinstance(article_id, int): + hd = translation.get("headline_de") + cd = translation.get("content_de") + if hd: + hd, _ = _norm_de(hd) + if cd: + cd, _ = _norm_de(cd) await db.execute( "UPDATE articles SET headline_de = ?, content_de = ? WHERE id = ? AND incident_id = ?", - (translation.get("headline_de"), translation.get("content_de"), article_id, incident_id), + (hd, cd, article_id, incident_id), ) await db.commit() diff --git a/src/feeds/rss_parser.py b/src/feeds/rss_parser.py index 5e756f4..2e65d13 100644 --- a/src/feeds/rss_parser.py +++ b/src/feeds/rss_parser.py @@ -6,6 +6,8 @@ import httpx from datetime import datetime, timezone from config import TIMEZONE, MAX_ARTICLES_PER_DOMAIN_RSS from source_rules import _extract_domain +from feeds.transcript_extractors._common import html_to_text +from services.post_refresh_qc import normalize_german_umlauts logger = logging.getLogger("osint.rss") @@ -152,7 +154,15 @@ class RSSParser: for entry in feed.entries[:50]: title = entry.get("title", "") - summary = entry.get("summary", "") + # RSS-summary ist bei vielen Quellen HTML (Guardian, AP, SZ, ...). + # Vor weiterer Verwendung strippen, sonst landet HTML in DB + # und KI-Agenten und Sprach-Heuristik werden gestoert. 
async def normalize_umlaut_articles(db, incident_id: int) -> int:
    """Normalize ASCII-transliterated umlauts in all article texts of an incident.

    Columns handled:
    - ``headline_de`` and ``content_de`` for every article (the LLM
      translation can deliver ASCII umlauts despite the prompt instruction).
    - ``headline`` and ``content_original`` only when ``language`` is 'de'
      (some sources, e.g. dpa-AFX or Telegram channels, already deliver
      ASCII umlauts themselves).

    A column is written back only when the normalizer reports at least one
    replacement, so text that is already correct triggers no UPDATE.
    No commit is issued here; that is left to the caller.

    Args:
        db: Async database handle; rows are read by column name.
        incident_id: ID of the incident whose articles are processed.

    Returns:
        Total number of word replacements across all articles.
    """
    translated_columns = ("headline_de", "content_de")
    original_columns = ("headline", "content_original")

    cursor = await db.execute(
        """SELECT id, language, headline, headline_de, content_original, content_de
           FROM articles WHERE incident_id = ?""",
        (incident_id,),
    )
    articles = await cursor.fetchall()
    if not articles:
        return 0

    replaced_words = 0
    for article in articles:
        # Translated columns are always checked; the source-language columns
        # only when the article itself is German.
        columns = translated_columns
        if (article["language"] or "").lower() == "de":
            columns = translated_columns + original_columns

        changed = {}
        for column in columns:
            text = article[column]
            if not text:
                continue
            fixed, hits = normalize_german_umlauts(text)
            if hits > 0:
                changed[column] = fixed
                replaced_words += hits

        if not changed:
            continue

        # Column names come from the fixed tuples above, never from data, so
        # interpolating them into the statement is safe; values stay bound.
        assignments = ", ".join(f"{column} = ?" for column in changed)
        params = [*changed.values(), article["id"]]
        await db.execute(f"UPDATE articles SET {assignments} WHERE id = ?", params)

    return replaced_words