AegisSight-Monitor/src/feeds/rss_parser.py
claude-dev ff4c54d9a8 Ensure source diversity: domain cap + balance + discovery improvements
- config.py: MAX_FEEDS_PER_DOMAIN=3, MAX_ARTICLES_PER_DOMAIN_RSS=10
- rss_parser.py: _apply_domain_cap() limits articles per domain after the RSS fetch
- orchestrator.py: domain balance before feed selection (max 3 feeds/domain),
  domain cap in background discovery
- source_rules.py: article_count in get_feeds_with_metadata(), content hash
  in _validate_feed() for duplicate detection during discovery
- researcher.py: QUELLENVIELFALT (source diversity) rule in the Haiku feed-selection prompt
- DB: 52 WordPress redirect duplicates deactivated (netzpolitik.org, bashinho.de)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-04 23:25:04 +01:00
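
The content-hash duplicate detection mentioned for _validate_feed() lives in source_rules.py and is not shown in this file. A minimal sketch of the idea, with a hypothetical helper name, assuming the raw feed body is available as a string (whether the repository hashes the raw body or a normalized form is not shown here):

# Illustrative sketch only, not the repository's code.
import hashlib

def feed_content_hash(feed_body: str) -> str:
    # Two feed URLs serving byte-identical content (e.g. WordPress
    # redirect duplicates) produce the same hash and can be merged.
    return hashlib.sha256(feed_body.encode("utf-8")).hexdigest()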

201 lines
8.3 KiB
Python

"""RSS-Feed Parser: Durchsucht vorkonfigurierte Feeds nach relevanten Meldungen."""
import asyncio
import logging
import feedparser
import httpx
from datetime import datetime, timezone
from config import TIMEZONE, MAX_ARTICLES_PER_DOMAIN_RSS
from source_rules import _extract_domain
logger = logging.getLogger("osint.rss")

class RSSParser:
    """Searches RSS feeds for relevant articles."""

    # Stop words that are ignored during the RSS search
    STOP_WORDS = {
        "und", "oder", "der", "die", "das", "ein", "eine", "in", "im", "am", "an",
        "auf", "für", "mit", "von", "zu", "zum", "zur", "bei", "nach", "vor",
        "über", "unter", "ist", "sind", "hat", "the", "and", "for", "with", "from",
    }

    async def search_feeds(self, search_term: str, international: bool = True, tenant_id: int | None = None) -> list[dict]:
        """Searches RSS feeds for a search term.

        Args:
            search_term: the search term
            international: if False, only German feeds + authorities (no international feeds)
            tenant_id: optional org ID for tenant-specific sources
        """
        all_articles = []
        search_words = [
            w for w in search_term.lower().split()
            if w not in self.STOP_WORDS and len(w) >= 3
        ]
        if not search_words:
            search_words = search_term.lower().split()[:2]
        rss_feeds = await self._get_rss_feeds(tenant_id=tenant_id)
        # Filter feed categories
        if international:
            categories = rss_feeds.keys()
        else:
            categories = [c for c in rss_feeds.keys() if c != "international"]
        tasks = []
        for category in categories:
            for feed_config in rss_feeds.get(category, []):
                tasks.append(self._fetch_feed(feed_config, search_words))
        results = await asyncio.gather(*tasks, return_exceptions=True)
        for result in results:
            if isinstance(result, Exception):
                logger.warning(f"Feed error: {result}")
                continue
            all_articles.extend(result)
        cat_info = "all" if international else "German + authorities only"
        logger.info(f"RSS search for '{search_term}' ({cat_info}): {len(all_articles)} hits")
        all_articles = self._apply_domain_cap(all_articles)
        return all_articles

    async def search_feeds_selective(self, search_term: str, selected_feeds: list[dict]) -> list[dict]:
        """Searches only the given feeds (preselected by Claude).

        Args:
            search_term: the search term
            selected_feeds: list of feed dicts with at least {"name", "url"}
        """
        all_articles = []
        search_words = [
            w for w in search_term.lower().split()
            if w not in self.STOP_WORDS and len(w) >= 3
        ]
        if not search_words:
            search_words = search_term.lower().split()[:2]
        tasks = []
        for feed_config in selected_feeds:
            tasks.append(self._fetch_feed(feed_config, search_words))
        results = await asyncio.gather(*tasks, return_exceptions=True)
        for result in results:
            if isinstance(result, Exception):
                logger.warning(f"Feed error: {result}")
                continue
            all_articles.extend(result)
        logger.info(f"Selective RSS search for '{search_term}': {len(all_articles)} hits from {len(selected_feeds)} feeds")
        all_articles = self._apply_domain_cap(all_articles)
        return all_articles

    async def _get_rss_feeds(self, tenant_id: int | None = None) -> dict:
        """Loads RSS feeds from the database (global + org-specific)."""
        try:
            from source_rules import get_source_rules
            rules = await get_source_rules(tenant_id=tenant_id)
            return rules.get("rss_feeds", {})
        except Exception as e:
            logger.warning(f"Falling back to config.py for RSS feeds: {e}")
            from config import RSS_FEEDS
            return dict(RSS_FEEDS)

    async def _fetch_feed(self, feed_config: dict, search_words: list[str]) -> list[dict]:
        """Fetches and searches a single RSS feed."""
        name = feed_config["name"]
        url = feed_config["url"]
        articles = []
        try:
            async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client:
                response = await client.get(url, headers={
                    "User-Agent": "OSINT-Monitor/1.0 (News Aggregator)"
                })
                response.raise_for_status()
            feed = await asyncio.to_thread(feedparser.parse, response.text)
            for entry in feed.entries[:50]:
                title = entry.get("title", "")
                summary = entry.get("summary", "")
                text = f"{title} {summary}".lower()
                # Flexible keyword matching: at least half of the search words
                # must appear (e.g. 4 search words -> at least 2 matches)
                min_matches = max(1, len(search_words) // 2)
                match_count = sum(1 for word in search_words if word in text)
                if match_count >= min_matches:
                    published = None
                    if hasattr(entry, "published_parsed") and entry.published_parsed:
                        try:
                            published = datetime(
                                *entry.published_parsed[:6], tzinfo=timezone.utc
                            ).astimezone(TIMEZONE).isoformat()
                        except (TypeError, ValueError):
                            pass
                    # Relevance score: share of matched search words (0.0-1.0)
                    relevance_score = match_count / len(search_words) if search_words else 0.0
                    articles.append({
                        "headline": title,
                        "headline_de": title if self._is_german(title) else None,
                        "source": name,
                        "source_url": entry.get("link", ""),
                        "content_original": summary[:1000] if summary else None,
                        "content_de": summary[:1000] if summary and self._is_german(summary) else None,
                        "language": "de" if self._is_german(title) else "en",
                        "published_at": published,
                        "relevance_score": relevance_score,
                    })
        except Exception as e:
            logger.debug(f"Feed {name} ({url}): {e}")
        return articles

    def _apply_domain_cap(self, articles: list[dict]) -> list[dict]:
        """Limits the number of articles per domain to MAX_ARTICLES_PER_DOMAIN_RSS.

        Groups articles by domain, sorts each group by relevance_score
        (best first), and keeps only the top N per domain.
        """
        if not articles:
            return articles
        # Group by domain
        by_domain: dict[str, list[dict]] = {}
        for article in articles:
            domain = _extract_domain(article.get("source_url", ""))
            if not domain:
                domain = "__unknown__"
            by_domain.setdefault(domain, []).append(article)
        capped = []
        for domain, domain_articles in by_domain.items():
            # Sort by relevance (best first)
            domain_articles.sort(key=lambda a: a.get("relevance_score", 0), reverse=True)
            kept = domain_articles[:MAX_ARTICLES_PER_DOMAIN_RSS]
            if len(domain_articles) > MAX_ARTICLES_PER_DOMAIN_RSS:
                logger.info(
                    f"Domain cap: {domain} limited from {len(domain_articles)} to {MAX_ARTICLES_PER_DOMAIN_RSS} articles"
                )
            capped.extend(kept)
        if len(capped) < len(articles):
            logger.info(f"Domain cap overall: {len(articles)} → {len(capped)} articles")
        return capped

    def _is_german(self, text: str) -> bool:
        """Simple heuristic for whether a text is German."""
        german_words = {"der", "die", "das", "und", "ist", "von", "mit", "für", "auf", "ein",
                        "eine", "den", "dem", "des", "sich", "wird", "nach", "bei", "auch",
                        "über", "wie", "aus", "hat", "zum", "zur", "als", "noch", "mehr",
                        "nicht", "aber", "oder", "sind", "vor", "einem", "einer", "wurde"}
        words = set(text.lower().split())
        matches = words & german_words
        return len(matches) >= 2
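
A minimal usage sketch, assuming this module is importable as rss_parser and that the database or config backing _get_rss_feeds() is reachable; the search term is illustrative:

# Illustrative only, not part of the repository.
import asyncio
from rss_parser import RSSParser

async def main() -> None:
    parser = RSSParser()
    # Search German feeds + authorities only, no tenant-specific sources
    articles = await parser.search_feeds("Ransomware Krankenhaus", international=False)
    for article in articles[:5]:
        print(f"{article['relevance_score']:.2f}  {article['headline']}  ({article['source']})")

asyncio.run(main())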