Commits vergleichen
3 Commits
d7711711aa
...
1f4d7b1837
| Autor | SHA1 | Datum | |
|---|---|---|---|
| 1f4d7b1837 | |||
|
|
98c9da64b0 | ||
|
|
307f0a1868 |
@@ -1411,12 +1411,22 @@ class AgentOrchestrator:
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Übersetzungen aktualisieren (nur für gültige DB-IDs)
|
# Übersetzungen aktualisieren (nur für gültige DB-IDs)
|
||||||
|
# LLM-Drift abfangen: trotz Prompt-Anweisung kommen manchmal
|
||||||
|
# ASCII-Umlaute ("Gespraeche" statt "Gespräche") in der Übersetzung.
|
||||||
|
# Dictionary-basierte Korrektur schreibt nur deutsche Woerter um.
|
||||||
|
from services.post_refresh_qc import normalize_german_umlauts as _norm_de
|
||||||
for translation in analysis.get("translations", []):
|
for translation in analysis.get("translations", []):
|
||||||
article_id = translation.get("article_id")
|
article_id = translation.get("article_id")
|
||||||
if isinstance(article_id, int):
|
if isinstance(article_id, int):
|
||||||
|
hd = translation.get("headline_de")
|
||||||
|
cd = translation.get("content_de")
|
||||||
|
if hd:
|
||||||
|
hd, _ = _norm_de(hd)
|
||||||
|
if cd:
|
||||||
|
cd, _ = _norm_de(cd)
|
||||||
await db.execute(
|
await db.execute(
|
||||||
"UPDATE articles SET headline_de = ?, content_de = ? WHERE id = ? AND incident_id = ?",
|
"UPDATE articles SET headline_de = ?, content_de = ? WHERE id = ? AND incident_id = ?",
|
||||||
(translation.get("headline_de"), translation.get("content_de"), article_id, incident_id),
|
(hd, cd, article_id, incident_id),
|
||||||
)
|
)
|
||||||
|
|
||||||
await db.commit()
|
await db.commit()
|
||||||
|
|||||||
@@ -6,6 +6,8 @@ import httpx
|
|||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
from config import TIMEZONE, MAX_ARTICLES_PER_DOMAIN_RSS
|
from config import TIMEZONE, MAX_ARTICLES_PER_DOMAIN_RSS
|
||||||
from source_rules import _extract_domain
|
from source_rules import _extract_domain
|
||||||
|
from feeds.transcript_extractors._common import html_to_text
|
||||||
|
from services.post_refresh_qc import normalize_german_umlauts
|
||||||
|
|
||||||
logger = logging.getLogger("osint.rss")
|
logger = logging.getLogger("osint.rss")
|
||||||
|
|
||||||
@@ -152,7 +154,15 @@ class RSSParser:
|
|||||||
|
|
||||||
for entry in feed.entries[:50]:
|
for entry in feed.entries[:50]:
|
||||||
title = entry.get("title", "")
|
title = entry.get("title", "")
|
||||||
summary = entry.get("summary", "")
|
# RSS-summary ist bei vielen Quellen HTML (Guardian, AP, SZ, ...).
|
||||||
|
# Vor weiterer Verwendung strippen, sonst landet HTML in DB
|
||||||
|
# und KI-Agenten und Sprach-Heuristik werden gestoert.
|
||||||
|
summary_raw = entry.get("summary", "")
|
||||||
|
summary = html_to_text(summary_raw) if summary_raw else ""
|
||||||
|
# ASCII-Umlaut-Normalisierung (z.B. dpa-AFX schreibt "Gespraeche").
|
||||||
|
# Dictionary-basiert, sicher gegen englische Woerter wie "Boeing".
|
||||||
|
title, _ = normalize_german_umlauts(title)
|
||||||
|
summary, _ = normalize_german_umlauts(summary)
|
||||||
text = f"{title} {summary}".lower()
|
text = f"{title} {summary}".lower()
|
||||||
|
|
||||||
# Adaptive Match-Schwelle:
|
# Adaptive Match-Schwelle:
|
||||||
|
|||||||
@@ -400,18 +400,20 @@ async def run_post_refresh_qc(db, incident_id: int) -> dict:
|
|||||||
db, incident_id, incident_title, incident_desc
|
db, incident_id, incident_title, incident_desc
|
||||||
)
|
)
|
||||||
umlauts_fixed = await normalize_umlaut_fields(db, incident_id)
|
umlauts_fixed = await normalize_umlaut_fields(db, incident_id)
|
||||||
|
article_umlauts_fixed = await normalize_umlaut_articles(db, incident_id)
|
||||||
|
|
||||||
if facts_removed > 0 or locations_fixed > 0 or umlauts_fixed > 0:
|
total_umlaut_changes = umlauts_fixed + article_umlauts_fixed
|
||||||
|
if facts_removed > 0 or locations_fixed > 0 or total_umlaut_changes > 0:
|
||||||
await db.commit()
|
await db.commit()
|
||||||
logger.info(
|
logger.info(
|
||||||
"Post-Refresh QC fuer Incident %d: %d Duplikate entfernt, %d Locations korrigiert, %d Umlaute normalisiert",
|
"Post-Refresh QC fuer Incident %d: %d Duplikate entfernt, %d Locations korrigiert, %d Umlaute normalisiert (davon %d in Articles)",
|
||||||
incident_id, facts_removed, locations_fixed, umlauts_fixed,
|
incident_id, facts_removed, locations_fixed, total_umlaut_changes, article_umlauts_fixed,
|
||||||
)
|
)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"facts_removed": facts_removed,
|
"facts_removed": facts_removed,
|
||||||
"locations_fixed": locations_fixed,
|
"locations_fixed": locations_fixed,
|
||||||
"umlauts_fixed": umlauts_fixed,
|
"umlauts_fixed": total_umlaut_changes,
|
||||||
}
|
}
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -568,3 +570,64 @@ async def normalize_umlaut_fields(db, incident_id: int) -> int:
|
|||||||
incident_id, count_summary, count_dev,
|
incident_id, count_summary, count_dev,
|
||||||
)
|
)
|
||||||
return total
|
return total
|
||||||
|
|
||||||
|
|
||||||
|
async def normalize_umlaut_articles(db, incident_id: int) -> int:
    """Normalize ASCII-transliterated umlauts in the article texts of an incident.

    Fields handled per article row:
      - ``headline_de`` and ``content_de`` for every article (LLM translations
        may contain ASCII umlauts such as "Gespraeche" despite the prompt).
      - ``headline`` and ``content_original`` only when ``language`` is "de"
        (case-insensitive); some German sources deliver ASCII umlauts
        themselves.

    A field is written back only when ``normalize_german_umlauts`` reports at
    least one replacement for it, so rows that need no change cause no UPDATE.
    This function does not commit; the caller is responsible for that.

    Args:
        db: Async database handle whose ``execute`` returns a cursor with an
            awaitable ``fetchall``; rows must support access by column name.
        incident_id: ID of the incident whose articles are processed.

    Returns:
        Total number of word replacements across all articles and fields.
    """
    cursor = await db.execute(
        """SELECT id, language, headline, headline_de, content_original, content_de
           FROM articles WHERE incident_id = ?""",
        (incident_id,),
    )
    rows = await cursor.fetchall()
    if not rows:
        return 0

    # Translation fields are always checked; original-language fields only
    # for German articles, so foreign-language originals are never rewritten.
    translated_fields = ("headline_de", "content_de")
    original_fields = ("headline", "content_original")

    total = 0
    for row in rows:
        is_de = (row["language"] or "").lower() == "de"
        fields = translated_fields + original_fields if is_de else translated_fields

        updates = {}
        for field in fields:
            text = row[field]
            if not text:
                # NULL or empty column: nothing to normalize.
                continue
            new_text, replaced = normalize_german_umlauts(text)
            if replaced > 0:
                updates[field] = new_text
                total += replaced

        if updates:
            # Column names come from the fixed tuples above, never from
            # external input, so building the SET clause is injection-safe.
            set_clause = ", ".join(f"{column} = ?" for column in updates)
            # Scope by incident_id as well, matching the other article
            # UPDATE statements in the project.
            values = [*updates.values(), row["id"], incident_id]
            await db.execute(
                f"UPDATE articles SET {set_clause} WHERE id = ? AND incident_id = ?",
                values,
            )

    return total
|
||||||
|
|||||||
In neuem Issue referenzieren
Einen Benutzer sperren