diff --git a/src/feeds/rss_parser.py b/src/feeds/rss_parser.py index 5e756f4..481d28d 100644 --- a/src/feeds/rss_parser.py +++ b/src/feeds/rss_parser.py @@ -6,6 +6,7 @@ import httpx from datetime import datetime, timezone from config import TIMEZONE, MAX_ARTICLES_PER_DOMAIN_RSS from source_rules import _extract_domain +from feeds.transcript_extractors._common import html_to_text logger = logging.getLogger("osint.rss") @@ -152,7 +153,11 @@ class RSSParser: for entry in feed.entries[:50]: title = entry.get("title", "") - summary = entry.get("summary", "") + # The RSS summary is HTML for many sources (Guardian, AP, SZ, ...). + # Strip it before any further use; otherwise HTML ends up in the DB, + # and the AI agents and the language heuristic are disrupted. + summary_raw = entry.get("summary", "") + summary = html_to_text(summary_raw) if summary_raw else "" text = f"{title} {summary}".lower() # Adaptive match threshold: