Quellenvielfalt sicherstellen: Domain-Cap + Balance + Discovery-Verbesserungen
- config.py: MAX_FEEDS_PER_DOMAIN=3, MAX_ARTICLES_PER_DOMAIN_RSS=10 - rss_parser.py: _apply_domain_cap() begrenzt Artikel pro Domain nach RSS-Fetch - orchestrator.py: Domain-Balance vor Feed-Selektion (max 3 Feeds/Domain), Domain-Cap in Background-Discovery - source_rules.py: article_count in get_feeds_with_metadata(), Content-Hash in _validate_feed() für Duplikat-Erkennung bei Discovery - researcher.py: QUELLENVIELFALT-Regel im Haiku Feed-Selektions-Prompt - DB: 52 WordPress-Redirect-Duplikate deaktiviert (netzpolitik.org, bashinho.de) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Dieser Commit ist enthalten in:
@@ -4,7 +4,8 @@ import logging
|
||||
import feedparser
|
||||
import httpx
|
||||
from datetime import datetime, timezone
|
||||
from config import TIMEZONE
|
||||
from config import TIMEZONE, MAX_ARTICLES_PER_DOMAIN_RSS
|
||||
from source_rules import _extract_domain
|
||||
|
||||
logger = logging.getLogger("osint.rss")
|
||||
|
||||
@@ -58,6 +59,7 @@ class RSSParser:
|
||||
|
||||
cat_info = "alle" if international else "nur deutsch + behörden"
|
||||
logger.info(f"RSS-Suche nach '{search_term}' ({cat_info}): {len(all_articles)} Treffer")
|
||||
all_articles = self._apply_domain_cap(all_articles)
|
||||
return all_articles
|
||||
|
||||
async def search_feeds_selective(self, search_term: str, selected_feeds: list[dict]) -> list[dict]:
|
||||
@@ -88,6 +90,7 @@ class RSSParser:
|
||||
all_articles.extend(result)
|
||||
|
||||
logger.info(f"RSS-Selektiv nach '{search_term}': {len(all_articles)} Treffer aus {len(selected_feeds)} Feeds")
|
||||
all_articles = self._apply_domain_cap(all_articles)
|
||||
return all_articles
|
||||
|
||||
async def _get_rss_feeds(self, tenant_id: int = None) -> dict:
|
||||
@@ -153,6 +156,39 @@ class RSSParser:
|
||||
|
||||
return articles
|
||||
|
||||
def _apply_domain_cap(self, articles: list[dict]) -> list[dict]:
    """Cap how many articles a single domain may contribute.

    Articles are bucketed by the domain of their ``source_url``; each
    bucket is ranked by ``relevance_score`` (highest first, stable for
    ties) and truncated to MAX_ARTICLES_PER_DOMAIN_RSS entries.  Note
    that the returned list is regrouped by domain, so its order may
    differ from the input order.
    """
    if not articles:
        return articles

    # Bucket by source domain; articles whose URL yields no domain
    # share a single synthetic bucket.
    grouped: dict[str, list[dict]] = {}
    for item in articles:
        key = _extract_domain(item.get("source_url", "")) or "__unknown__"
        grouped.setdefault(key, []).append(item)

    survivors: list[dict] = []
    for key, bucket in grouped.items():
        # Most relevant first; missing scores rank lowest.
        ranked = sorted(bucket, key=lambda a: a.get("relevance_score", 0), reverse=True)
        if len(ranked) > MAX_ARTICLES_PER_DOMAIN_RSS:
            logger.info(
                f"Domain-Cap: {key} von {len(ranked)} auf {MAX_ARTICLES_PER_DOMAIN_RSS} Artikel begrenzt"
            )
        survivors.extend(ranked[:MAX_ARTICLES_PER_DOMAIN_RSS])

    if len(survivors) < len(articles):
        logger.info(f"Domain-Cap gesamt: {len(articles)} → {len(survivors)} Artikel")

    return survivors
|
||||
|
||||
def _is_german(self, text: str) -> bool:
|
||||
"""Einfache Heuristik ob ein Text deutsch ist."""
|
||||
german_words = {"der", "die", "das", "und", "ist", "von", "mit", "für", "auf", "ein",
|
||||
|
||||
In neuem Issue referenzieren
Einen Benutzer sperren