diff --git a/src/agents/orchestrator.py b/src/agents/orchestrator.py index 501b1d5..e2e9fd4 100644 --- a/src/agents/orchestrator.py +++ b/src/agents/orchestrator.py @@ -579,14 +579,15 @@ class AgentOrchestrator: all_feeds = await get_feeds_with_metadata(tenant_id=tenant_id) feed_usage = None + keywords = None if len(all_feeds) > 20: - selected_feeds, feed_usage = await rss_researcher.select_relevant_feeds( + selected_feeds, keywords, feed_usage = await rss_researcher.select_relevant_feeds( title, description, international, all_feeds ) logger.info(f"Feed-Selektion: {len(selected_feeds)} von {len(all_feeds)} Feeds ausgewählt") - articles = await rss_parser.search_feeds_selective(title, selected_feeds) + articles = await rss_parser.search_feeds_selective(title, selected_feeds, keywords=keywords) else: - articles = await rss_parser.search_feeds(title, international=international, tenant_id=tenant_id) + articles = await rss_parser.search_feeds(title, international=international, tenant_id=tenant_id, keywords=keywords) logger.info(f"RSS: {len(articles)} relevante Artikel gefunden (international={international})") return articles, feed_usage diff --git a/src/feeds/rss_parser.py b/src/feeds/rss_parser.py index 4282b43..11f45ed 100644 --- a/src/feeds/rss_parser.py +++ b/src/feeds/rss_parser.py @@ -20,21 +20,33 @@ class RSSParser: "über", "unter", "ist", "sind", "hat", "the", "and", "for", "with", "from", } - async def search_feeds(self, search_term: str, international: bool = True, tenant_id: int = None) -> list[dict]: + @staticmethod + def _clean_search_words(words: list[str]) -> list[str]: + """Entfernt rein-numerische Wörter (Jahreszahlen etc.) aus Suchbegriffen.""" + cleaned = [w for w in words if not w.isdigit()] + return cleaned if cleaned else words + + async def search_feeds(self, search_term: str, international: bool = True, tenant_id: int = None, keywords: list[str] | None = None) -> list[dict]: """Durchsucht RSS-Feeds nach einem Suchbegriff. Args: search_term: Suchbegriff international: Wenn False, nur deutsche Feeds + Behoerden (keine internationalen) tenant_id: Optionale Org-ID fuer tenant-spezifische Quellen + keywords: Optionale Claude-generierte Keywords (bevorzugt gegenüber Title-Split) """ all_articles = [] - search_words = [ - w for w in search_term.lower().split() - if w not in self.STOP_WORDS and len(w) >= 3 - ] - if not search_words: - search_words = search_term.lower().split()[:2] + if keywords: + search_words = [w.lower().strip() for w in keywords if w.strip()] + logger.info(f"RSS-Suche mit Claude-Keywords: {search_words}") + else: + search_words = [ + w for w in search_term.lower().split() + if w not in self.STOP_WORDS and len(w) >= 3 + ] + if not search_words: + search_words = search_term.lower().split()[:2] + search_words = self._clean_search_words(search_words) rss_feeds = await self._get_rss_feeds(tenant_id=tenant_id) @@ -62,20 +74,26 @@ class RSSParser: all_articles = self._apply_domain_cap(all_articles) return all_articles - async def search_feeds_selective(self, search_term: str, selected_feeds: list[dict]) -> list[dict]: + async def search_feeds_selective(self, search_term: str, selected_feeds: list[dict], keywords: list[str] | None = None) -> list[dict]: """Durchsucht nur die übergebenen Feeds (vorselektiert durch Claude). Args: search_term: Suchbegriff selected_feeds: Liste von Feed-Dicts mit mindestens {"name", "url"} + keywords: Optionale Claude-generierte Keywords (bevorzugt gegenüber Title-Split) """ all_articles = [] - search_words = [ - w for w in search_term.lower().split() - if w not in self.STOP_WORDS and len(w) >= 3 - ] - if not search_words: - search_words = search_term.lower().split()[:2] + if keywords: + search_words = [w.lower().strip() for w in keywords if w.strip()] + logger.info(f"RSS-Selektiv mit Claude-Keywords: {search_words}") + else: + search_words = [ + w for w in search_term.lower().split() + if w not in self.STOP_WORDS and len(w) >= 3 + ] + if not search_words: + search_words = search_term.lower().split()[:2] + search_words = self._clean_search_words(search_words) tasks = [] for feed_config in selected_feeds: @@ -124,8 +142,8 @@ class RSSParser: summary = entry.get("summary", "") text = f"{title} {summary}".lower() - # Flexibles Keyword-Matching: mindestens die Hälfte der Suchworte muss vorkommen - min_matches = max(1, len(search_words) // 2) + # Flexibles Keyword-Matching: mindestens die Hälfte der Suchworte muss vorkommen (aufgerundet) + min_matches = max(1, (len(search_words) + 1) // 2) match_count = sum(1 for word in search_words if word in text) if match_count >= min_matches: