Initial commit: AegisSight-Monitor (OSINT-Monitoringsystem)
Dieser Commit ist enthalten in:
157
src/feeds/rss_parser.py
Normale Datei
157
src/feeds/rss_parser.py
Normale Datei
@@ -0,0 +1,157 @@
|
||||
"""RSS-Feed Parser: Durchsucht vorkonfigurierte Feeds nach relevanten Meldungen."""
|
||||
import asyncio
|
||||
import logging
|
||||
import feedparser
|
||||
import httpx
|
||||
from datetime import datetime, timezone
|
||||
from config import TIMEZONE
|
||||
|
||||
# Module-level logger; all messages from this module are emitted under the "osint.rss" name.
logger = logging.getLogger("osint.rss")
|
||||
|
||||
|
||||
class RSSParser:
    """Search RSS feeds for articles that match a search term."""

    # Stop words that are ignored when deriving search words from a search term.
    STOP_WORDS = {
        "und", "oder", "der", "die", "das", "ein", "eine", "in", "im", "am", "an",
        "auf", "für", "mit", "von", "zu", "zum", "zur", "bei", "nach", "vor",
        "über", "unter", "ist", "sind", "hat", "the", "and", "for", "with", "from",
    }

    # Frequent German function words used by the `_is_german` heuristic.
    # Kept as a class constant so the set is built once, not on every call.
    _GERMAN_WORDS = frozenset({
        "der", "die", "das", "und", "ist", "von", "mit", "für", "auf", "ein",
        "eine", "den", "dem", "des", "sich", "wird", "nach", "bei", "auch",
        "über", "wie", "aus", "hat", "zum", "zur", "als", "noch", "mehr",
        "nicht", "aber", "oder", "sind", "vor", "einem", "einer", "wurde",
    })

    # Only the first N entries of each feed are inspected.
    _MAX_ENTRIES_PER_FEED = 50
    # Summaries are truncated to this many characters in the result dicts.
    _MAX_CONTENT_CHARS = 1000

    async def search_feeds(
        self,
        search_term: str,
        international: bool = True,
        tenant_id: int = None,
    ) -> list[dict]:
        """Search the configured RSS feeds for a search term.

        Args:
            search_term: Free-text search term; it is split into search words.
            international: If False, the feed category "international" is skipped.
            tenant_id: Optional tenant id forwarded to the feed lookup.

        Returns:
            List of article dicts (see `_build_article`). Empty if the search
            term contains no words at all.
        """
        search_words = self._extract_search_words(search_term)
        if not search_words:
            # all() over an empty word list is True, so a blank term would
            # match every entry of every feed; treat it as "no results".
            return []

        rss_feeds = await self._get_rss_feeds(tenant_id=tenant_id)

        # Flatten the category -> feeds mapping, honouring the category filter.
        feed_configs = [
            feed_config
            for category, feeds in rss_feeds.items()
            if international or category != "international"
            for feed_config in feeds
        ]

        all_articles = await self._collect_articles(feed_configs, search_words)

        cat_info = "alle" if international else "nur deutsch + behörden"
        logger.info(f"RSS-Suche nach '{search_term}' ({cat_info}): {len(all_articles)} Treffer")
        return all_articles

    async def search_feeds_selective(self, search_term: str, selected_feeds: list[dict]) -> list[dict]:
        """Search only the feeds passed in by the caller.

        Args:
            search_term: Free-text search term; it is split into search words.
            selected_feeds: Feed dicts, each with at least the keys "name" and "url".

        Returns:
            List of article dicts (see `_build_article`). Empty if the search
            term contains no words at all.
        """
        search_words = self._extract_search_words(search_term)
        if not search_words:
            # Same guard as in `search_feeds`: a blank term must not match everything.
            return []

        all_articles = await self._collect_articles(selected_feeds, search_words)

        logger.info(f"RSS-Selektiv nach '{search_term}': {len(all_articles)} Treffer aus {len(selected_feeds)} Feeds")
        return all_articles

    def _extract_search_words(self, search_term: str) -> list[str]:
        """Split a search term into lower-cased search words.

        Stop words and words shorter than three characters are dropped. If
        nothing is left, the first two raw words are used instead.
        """
        tokens = search_term.lower().split()
        significant = [
            word for word in tokens
            if word not in self.STOP_WORDS and len(word) >= 3
        ]
        return significant or tokens[:2]

    async def _collect_articles(self, feed_configs: list[dict], search_words: list[str]) -> list[dict]:
        """Fetch the given feeds concurrently and merge their matching articles."""
        results = await asyncio.gather(
            *(self._fetch_feed(feed_config, search_words) for feed_config in feed_configs),
            return_exceptions=True,
        )

        articles = []
        for result in results:
            # With return_exceptions=True, gather may also hand back
            # BaseException instances (e.g. CancelledError), so check the base class.
            if isinstance(result, BaseException):
                logger.warning(f"Feed-Fehler: {result}")
                continue
            articles.extend(result)
        return articles

    async def _get_rss_feeds(self, tenant_id: int = None) -> dict:
        """Load the RSS feed mapping via `source_rules`, falling back to `config.RSS_FEEDS`.

        Returns:
            The "rss_feeds" entry of the source rules (empty dict if absent),
            or a copy of `config.RSS_FEEDS` if loading the rules fails.
        """
        try:
            from source_rules import get_source_rules

            rules = await get_source_rules(tenant_id=tenant_id)
            return rules.get("rss_feeds", {})
        except Exception as e:
            logger.warning(f"Fallback auf config.py fuer RSS-Feeds: {e}")
            from config import RSS_FEEDS

            return dict(RSS_FEEDS)

    async def _fetch_feed(self, feed_config: dict, search_words: list[str]) -> list[dict]:
        """Download a single RSS feed and return the entries matching all search words.

        Errors while fetching or parsing are logged at debug level; articles
        collected before the error are still returned.
        """
        name = feed_config["name"]
        url = feed_config["url"]
        articles = []

        try:
            async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client:
                response = await client.get(url, headers={
                    "User-Agent": "OSINT-Monitor/1.0 (News Aggregator)"
                })
                response.raise_for_status()

            # Pass the raw bytes rather than pre-decoded text so feedparser can
            # apply its own encoding detection (e.g. the XML declaration).
            # Parsing is CPU-bound, so it runs in a worker thread.
            feed = await asyncio.to_thread(feedparser.parse, response.content)

            for entry in feed.entries[:self._MAX_ENTRIES_PER_FEED]:
                if self._entry_matches(entry, search_words):
                    articles.append(self._build_article(entry, name))

        except Exception as e:
            logger.debug(f"Feed {name} ({url}): {e}")

        return articles

    @staticmethod
    def _entry_matches(entry, search_words: list[str]) -> bool:
        """Return True if every search word occurs in the entry's title or summary."""
        text = f"{entry.get('title', '')} {entry.get('summary', '')}".lower()
        return all(word in text for word in search_words)

    @staticmethod
    def _published_iso(entry):
        """Return the entry's publication time as an ISO string in TIMEZONE, or None.

        `published_parsed` is interpreted as UTC before conversion.
        """
        parsed = getattr(entry, "published_parsed", None)
        if not parsed:
            return None
        try:
            return datetime(*parsed[:6], tzinfo=timezone.utc).astimezone(TIMEZONE).isoformat()
        except (TypeError, ValueError):
            # Malformed time tuple: treat the publication date as unknown.
            return None

    def _build_article(self, entry, source_name: str) -> dict:
        """Convert a feed entry into the article dict returned by the search methods."""
        title = entry.get("title", "")
        summary = entry.get("summary", "")
        content = summary[:self._MAX_CONTENT_CHARS] if summary else None
        # The title's language decides both "headline_de" and "language".
        title_is_german = self._is_german(title)

        return {
            "headline": title,
            "headline_de": title if title_is_german else None,
            "source": source_name,
            "source_url": entry.get("link", ""),
            "content_original": content,
            "content_de": content if summary and self._is_german(summary) else None,
            "language": "de" if title_is_german else "en",
            "published_at": self._published_iso(entry),
        }

    def _is_german(self, text: str) -> bool:
        """Simple heuristic: text counts as German if it contains at least two
        distinct words from `_GERMAN_WORDS` (whitespace-separated, case-insensitive)."""
        words = set(text.lower().split())
        return len(words & self._GERMAN_WORDS) >= 2
|
||||
In neuem Issue referenzieren
Einen Benutzer sperren