"""RSS-Feed Parser: Durchsucht vorkonfigurierte Feeds nach relevanten Meldungen.""" import asyncio import logging import feedparser import httpx from datetime import datetime, timezone from config import TIMEZONE, MAX_ARTICLES_PER_DOMAIN_RSS from source_rules import _extract_domain logger = logging.getLogger("osint.rss") class RSSParser: """Durchsucht RSS-Feeds nach relevanten Artikeln.""" # Stoppwörter die bei der RSS-Suche ignoriert werden STOP_WORDS = { "und", "oder", "der", "die", "das", "ein", "eine", "in", "im", "am", "an", "auf", "für", "mit", "von", "zu", "zum", "zur", "bei", "nach", "vor", "über", "unter", "ist", "sind", "hat", "the", "and", "for", "with", "from", } async def search_feeds(self, search_term: str, international: bool = True, tenant_id: int = None) -> list[dict]: """Durchsucht RSS-Feeds nach einem Suchbegriff. Args: search_term: Suchbegriff international: Wenn False, nur deutsche Feeds + Behoerden (keine internationalen) tenant_id: Optionale Org-ID fuer tenant-spezifische Quellen """ all_articles = [] search_words = [ w for w in search_term.lower().split() if w not in self.STOP_WORDS and len(w) >= 3 ] if not search_words: search_words = search_term.lower().split()[:2] rss_feeds = await self._get_rss_feeds(tenant_id=tenant_id) # Feed-Kategorien filtern if international: categories = rss_feeds.keys() else: categories = [c for c in rss_feeds.keys() if c != "international"] tasks = [] for category in categories: for feed_config in rss_feeds.get(category, []): tasks.append(self._fetch_feed(feed_config, search_words)) results = await asyncio.gather(*tasks, return_exceptions=True) for result in results: if isinstance(result, Exception): logger.warning(f"Feed-Fehler: {result}") continue all_articles.extend(result) cat_info = "alle" if international else "nur deutsch + behörden" logger.info(f"RSS-Suche nach '{search_term}' ({cat_info}): {len(all_articles)} Treffer") all_articles = self._apply_domain_cap(all_articles) return all_articles async def search_feeds_selective(self, search_term: str, selected_feeds: list[dict]) -> list[dict]: """Durchsucht nur die übergebenen Feeds (vorselektiert durch Claude). Args: search_term: Suchbegriff selected_feeds: Liste von Feed-Dicts mit mindestens {"name", "url"} """ all_articles = [] search_words = [ w for w in search_term.lower().split() if w not in self.STOP_WORDS and len(w) >= 3 ] if not search_words: search_words = search_term.lower().split()[:2] tasks = [] for feed_config in selected_feeds: tasks.append(self._fetch_feed(feed_config, search_words)) results = await asyncio.gather(*tasks, return_exceptions=True) for result in results: if isinstance(result, Exception): logger.warning(f"Feed-Fehler: {result}") continue all_articles.extend(result) logger.info(f"RSS-Selektiv nach '{search_term}': {len(all_articles)} Treffer aus {len(selected_feeds)} Feeds") all_articles = self._apply_domain_cap(all_articles) return all_articles async def _get_rss_feeds(self, tenant_id: int = None) -> dict: """Laedt RSS-Feeds aus der Datenbank (global + org-spezifisch).""" try: from source_rules import get_source_rules rules = await get_source_rules(tenant_id=tenant_id) return rules.get("rss_feeds", {}) except Exception as e: logger.warning(f"Fallback auf config.py fuer RSS-Feeds: {e}") from config import RSS_FEEDS return dict(RSS_FEEDS) async def _fetch_feed(self, feed_config: dict, search_words: list[str]) -> list[dict]: """Einzelnen RSS-Feed abrufen und durchsuchen.""" name = feed_config["name"] url = feed_config["url"] articles = [] try: async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client: response = await client.get(url, headers={ "User-Agent": "OSINT-Monitor/1.0 (News Aggregator)" }) response.raise_for_status() feed = await asyncio.to_thread(feedparser.parse, response.text) for entry in feed.entries[:50]: title = entry.get("title", "") summary = entry.get("summary", "") text = f"{title} {summary}".lower() # Flexibles Keyword-Matching: mindestens die Hälfte der Suchworte muss vorkommen min_matches = max(1, len(search_words) // 2) match_count = sum(1 for word in search_words if word in text) if match_count >= min_matches: published = None if hasattr(entry, "published_parsed") and entry.published_parsed: try: published = datetime(*entry.published_parsed[:6], tzinfo=timezone.utc).astimezone(TIMEZONE).isoformat() except (TypeError, ValueError): pass # Relevanz-Score: Anteil der gematchten Suchworte (0.0-1.0) relevance_score = match_count / len(search_words) if search_words else 0.0 articles.append({ "headline": title, "headline_de": title if self._is_german(title) else None, "source": name, "source_url": entry.get("link", ""), "content_original": summary[:1000] if summary else None, "content_de": summary[:1000] if summary and self._is_german(summary) else None, "language": "de" if self._is_german(title) else "en", "published_at": published, "relevance_score": relevance_score, }) except Exception as e: logger.debug(f"Feed {name} ({url}): {e}") return articles def _apply_domain_cap(self, articles: list[dict]) -> list[dict]: """Begrenzt die Anzahl der Artikel pro Domain auf MAX_ARTICLES_PER_DOMAIN_RSS. Gruppiert nach Domain, sortiert pro Domain nach relevance_score (beste zuerst), behält nur die Top-N pro Domain. """ if not articles: return articles # Nach Domain gruppieren by_domain: dict[str, list[dict]] = {} for article in articles: domain = _extract_domain(article.get("source_url", "")) if not domain: domain = "__unknown__" by_domain.setdefault(domain, []).append(article) capped = [] for domain, domain_articles in by_domain.items(): # Nach Relevanz sortieren (beste zuerst) domain_articles.sort(key=lambda a: a.get("relevance_score", 0), reverse=True) kept = domain_articles[:MAX_ARTICLES_PER_DOMAIN_RSS] if len(domain_articles) > MAX_ARTICLES_PER_DOMAIN_RSS: logger.info( f"Domain-Cap: {domain} von {len(domain_articles)} auf {MAX_ARTICLES_PER_DOMAIN_RSS} Artikel begrenzt" ) capped.extend(kept) if len(capped) < len(articles): logger.info(f"Domain-Cap gesamt: {len(articles)} → {len(capped)} Artikel") return capped def _is_german(self, text: str) -> bool: """Einfache Heuristik ob ein Text deutsch ist.""" german_words = {"der", "die", "das", "und", "ist", "von", "mit", "für", "auf", "ein", "eine", "den", "dem", "des", "sich", "wird", "nach", "bei", "auch", "über", "wie", "aus", "hat", "zum", "zur", "als", "noch", "mehr", "nicht", "aber", "oder", "sind", "vor", "einem", "einer", "wurde"} words = set(text.lower().split()) matches = words & german_words return len(matches) >= 2