feat(rss/telegram): sprach-aware Keyword-Matching für nicht-lateinische Quellen
Bisher generierte Haiku Keywords nur in DE/EN/Romaji. Japanische RSS-Feeds
(z.B. MOD-GNews mit "防衛省・自衛隊の宇宙政策") matchten daher nie, weil
"jieitai" ≠ "自衛隊". Arabische/persische Telegram-Channels matchten nur
durch Zufall (lateinische Eigennamen in Hashtags/URLs).
Drei zusammenhängende Änderungen:
1. get_feeds_with_metadata liefert primary_language pro Feed mit.
2. FEED_SELECTION_PROMPT_TEMPLATE und KEYWORD_EXTRACTION_PROMPT verlangen
sprach-gruppierte Keywords ({de:[...], en:[...], ja:[...], ru:[...], ...}).
"en" enthält lateinische Eigennamen (universell). Andere Sprachen werden
nur gegen Feeds derselben Sprache gematcht.
3. RSS- und Telegram-Parser kombinieren pro Feed/Channel die "en"-Universalbegriffe
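mit den Keywords der Quellsprache. Die Spezifitätsschwelle (ein einzelner Treffer
genügt für einen Match) greift bei Keywords in nicht-lateinischer Schrift (CJK,
Arabisch, Hebräisch, Kyrillisch) jetzt schon ab 3 Zeichen; für lateinische
Keywords bleibt es bei 7 Zeichen.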
Backward-kompatibel: flache Keyword-Listen werden weiter akzeptiert.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Dieser Commit ist enthalten in:
@@ -8,10 +8,25 @@ from config import TIMEZONE, MAX_ARTICLES_PER_DOMAIN_RSS
|
||||
from source_rules import _extract_domain
|
||||
from feeds.transcript_extractors._common import html_to_text
|
||||
from services.post_refresh_qc import normalize_german_umlauts
|
||||
from agents.researcher import keywords_for_language, flatten_keywords
|
||||
|
||||
logger = logging.getLogger("osint.rss")
|
||||
|
||||
|
||||
def _is_specific_word(w: str) -> bool:
    """Return True if *w* is specific enough that a single hit counts as a match.

    - Latin script (including letters with diacritics such as German
      umlauts): 7 characters or more.
    - Words containing at least one non-Latin letter (CJK, Arabic, Hebrew,
      Cyrillic, ...): 3 characters or more.

    Examples: '自衛隊' (3 kanji) and 'путин' (5 Cyrillic letters) are
    specific; 'für' is not, although it contains a non-ASCII character.

    Args:
        w: The keyword to classify.

    Returns:
        True if one occurrence of *w* is sufficient for a match, else False.
    """
    # Function-scope import keeps this block self-contained; the module is
    # cached in sys.modules after the first call.
    import unicodedata

    if not w:
        return False

    # A plain "ord(c) > 127" check would also fire for Latin letters with
    # diacritics (ü, é, ß) and for typographic punctuation (’, –), which
    # would wrongly lower the threshold for short Latin words. Only real
    # letters whose Unicode name is not Latin mark the word as non-Latin.
    has_non_latin_letter = any(
        ord(c) > 127
        and c.isalpha()
        and "LATIN" not in unicodedata.name(c, "")
        for c in w
    )
    min_length = 3 if has_non_latin_letter else 7
    return len(w) >= min_length
|
||||
|
||||
|
||||
class RSSParser:
|
||||
"""Durchsucht RSS-Feeds nach relevanten Artikeln."""
|
||||
|
||||
@@ -28,27 +43,31 @@ class RSSParser:
|
||||
cleaned = [w for w in words if not w.isdigit()]
|
||||
return cleaned if cleaned else words
|
||||
|
||||
async def search_feeds(self, search_term: str, international: bool = True, tenant_id: int = None, keywords: list[str] | None = None, user_id: int = None) -> list[dict]:
|
||||
def _fallback_search_words(self, search_term: str) -> list[str]:
    """Derive search words from the raw search term.

    The term is lower-cased and split on whitespace. Tokens that are stop
    words (``self.STOP_WORDS``) or shorter than 3 characters are dropped.
    If nothing survives, the first two raw tokens are used instead. The
    result is passed through ``self._clean_search_words``.

    Args:
        search_term: The user-supplied search term.

    Returns:
        The list returned by ``self._clean_search_words`` for the
        selected tokens.
    """
    tokens = search_term.lower().split()
    candidates = [
        token
        for token in tokens
        if len(token) >= 3 and token not in self.STOP_WORDS
    ]
    if not candidates:
        # Nothing passed the filter: fall back to the first two raw tokens.
        candidates = tokens[:2]
    return self._clean_search_words(candidates)
|
||||
|
||||
async def search_feeds(self, search_term: str, international: bool = True, tenant_id: int = None, keywords: dict | list | None = None, user_id: int = None) -> list[dict]:
|
||||
"""Durchsucht RSS-Feeds nach einem Suchbegriff.
|
||||
|
||||
Args:
|
||||
search_term: Suchbegriff
|
||||
international: Wenn False, nur Feeds in der Org-Sprache + Behoerden (keine internationalen)
|
||||
tenant_id: Optionale Org-ID fuer tenant-spezifische Quellen
|
||||
keywords: Optionale Claude-generierte Keywords (bevorzugt gegenüber Title-Split)
|
||||
keywords: Sprach-Dict {iso_lang: [keyword, ...]} oder flache Liste (Backward).
|
||||
"""
|
||||
all_articles = []
|
||||
if keywords:
|
||||
search_words = [w.lower().strip() for w in keywords if w.strip()]
|
||||
logger.info(f"RSS-Suche mit Claude-Keywords: {search_words}")
|
||||
logger.info(f"RSS-Suche mit Claude-Keywords (Sprachen): "
|
||||
f"{ {k: len(v) for k, v in keywords.items()} if isinstance(keywords, dict) else len(keywords) }")
|
||||
fallback_words = None
|
||||
else:
|
||||
search_words = [
|
||||
w for w in search_term.lower().split()
|
||||
if w not in self.STOP_WORDS and len(w) >= 3
|
||||
]
|
||||
if not search_words:
|
||||
search_words = search_term.lower().split()[:2]
|
||||
search_words = self._clean_search_words(search_words)
|
||||
fallback_words = self._fallback_search_words(search_term)
|
||||
|
||||
rss_feeds = await self._get_rss_feeds(tenant_id=tenant_id)
|
||||
|
||||
@@ -74,7 +93,13 @@ class RSSParser:
|
||||
tasks = []
|
||||
for category in categories:
|
||||
for feed_config in rss_feeds.get(category, []):
|
||||
tasks.append(self._fetch_feed(feed_config, search_words))
|
||||
feed_lang = feed_config.get("primary_language")
|
||||
if keywords:
|
||||
words = keywords_for_language(keywords, feed_lang)
|
||||
words = [w.lower() for w in words]
|
||||
else:
|
||||
words = fallback_words
|
||||
tasks.append(self._fetch_feed(feed_config, words))
|
||||
|
||||
results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||
|
||||
@@ -89,30 +114,34 @@ class RSSParser:
|
||||
all_articles = self._apply_domain_cap(all_articles)
|
||||
return all_articles
|
||||
|
||||
async def search_feeds_selective(self, search_term: str, selected_feeds: list[dict], keywords: list[str] | None = None) -> list[dict]:
|
||||
async def search_feeds_selective(self, search_term: str, selected_feeds: list[dict], keywords: dict | list | None = None) -> list[dict]:
|
||||
"""Durchsucht nur die übergebenen Feeds (vorselektiert durch Claude).
|
||||
|
||||
Args:
|
||||
search_term: Suchbegriff
|
||||
selected_feeds: Liste von Feed-Dicts mit mindestens {"name", "url"}
|
||||
keywords: Optionale Claude-generierte Keywords (bevorzugt gegenüber Title-Split)
|
||||
selected_feeds: Liste von Feed-Dicts mit mindestens {"name", "url"} und idealerweise "primary_language"
|
||||
keywords: Sprach-Dict {iso_lang: [keyword, ...]} oder flache Liste (Backward).
|
||||
"""
|
||||
all_articles = []
|
||||
if keywords:
|
||||
search_words = [w.lower().strip() for w in keywords if w.strip()]
|
||||
logger.info(f"RSS-Selektiv mit Claude-Keywords: {search_words}")
|
||||
if isinstance(keywords, dict):
|
||||
logger.info(f"RSS-Selektiv mit Claude-Keywords (Sprachen): "
|
||||
f"{ {k: len(v) for k, v in keywords.items()} }")
|
||||
else:
|
||||
logger.info(f"RSS-Selektiv mit Claude-Keywords (flach): {keywords}")
|
||||
fallback_words = None
|
||||
else:
|
||||
search_words = [
|
||||
w for w in search_term.lower().split()
|
||||
if w not in self.STOP_WORDS and len(w) >= 3
|
||||
]
|
||||
if not search_words:
|
||||
search_words = search_term.lower().split()[:2]
|
||||
search_words = self._clean_search_words(search_words)
|
||||
fallback_words = self._fallback_search_words(search_term)
|
||||
|
||||
tasks = []
|
||||
for feed_config in selected_feeds:
|
||||
tasks.append(self._fetch_feed(feed_config, search_words))
|
||||
feed_lang = feed_config.get("primary_language")
|
||||
if keywords:
|
||||
words = keywords_for_language(keywords, feed_lang)
|
||||
words = [w.lower() for w in words]
|
||||
else:
|
||||
words = fallback_words
|
||||
tasks.append(self._fetch_feed(feed_config, words))
|
||||
|
||||
results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||
|
||||
@@ -166,11 +195,11 @@ class RSSParser:
|
||||
text = f"{title} {summary}".lower()
|
||||
|
||||
# Adaptive Match-Schwelle:
|
||||
# - Bei mindestens einem spezifischen Keyword (>=7 Zeichen) im Text reicht 1 Treffer.
|
||||
# Verhindert, dass Headlines mit nur einem starken Keyword wie "buckelwal"
|
||||
# rausfallen, wenn die Lage thematisch eng ist (Bug 1, vom User dokumentiert).
|
||||
# - Bei mindestens einem spezifischen Keyword (Latin ≥7 Zeichen oder
|
||||
# CJK/Arabisch/Hebräisch/Kyrillisch ≥3 Zeichen) im Text reicht 1 Treffer.
|
||||
# Damit matched z.B. "自衛隊" (3 Kanji) wie "buckelwal" (9 Zeichen).
|
||||
# - Sonst: alte Heuristik (mindestens halb der Wörter, max. 2).
|
||||
specific_in_text = any(w in text for w in search_words if len(w) >= 7)
|
||||
specific_in_text = any(w in text for w in search_words if _is_specific_word(w))
|
||||
if specific_in_text:
|
||||
min_matches = 1
|
||||
else:
|
||||
|
||||
In neuem Issue referenzieren
Einen Benutzer sperren