From ff4c54d9a88e2ff46b3d0db10f6d1e1e9e6305e3 Mon Sep 17 00:00:00 2001
From: claude-dev
Date: Wed, 4 Mar 2026 23:25:04 +0100
Subject: [PATCH] Ensure source diversity: domain cap + balancing + discovery
 improvements
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- config.py: MAX_FEEDS_PER_DOMAIN=3, MAX_ARTICLES_PER_DOMAIN_RSS=10
- rss_parser.py: _apply_domain_cap() caps articles per domain after the RSS fetch
- orchestrator.py: domain balancing before feed selection (max 3 feeds/domain),
  domain cap in background discovery
- source_rules.py: article_count in get_feeds_with_metadata(), content hash in
  _validate_feed() for duplicate detection during discovery
- researcher.py: source-diversity rule in the Haiku feed-selection prompt
- DB: 52 WordPress redirect duplicates deactivated (netzpolitik.org, bashinho.de)

Co-Authored-By: Claude Opus 4.6
---
 src/agents/orchestrator.py | 33 ++++++++++++++++++++++++++++++++-
 src/agents/researcher.py   |  1 +
 src/config.py              |  4 ++++
 src/feeds/rss_parser.py    | 38 +++++++++++++++++++++++++++++++++++++-
 src/source_rules.py        | 35 +++++++++++++++++++++++++++++------
 5 files changed, 103 insertions(+), 8 deletions(-)

diff --git a/src/agents/orchestrator.py b/src/agents/orchestrator.py
index 501b1d5..3cc96bf 100644
--- a/src/agents/orchestrator.py
+++ b/src/agents/orchestrator.py
@@ -4,8 +4,9 @@ import json
 import logging
 import re
 from datetime import datetime, timezone
-from config import TIMEZONE
+from config import TIMEZONE, MAX_FEEDS_PER_DOMAIN
 from typing import Optional
+from collections import defaultdict
 from urllib.parse import urlparse, urlunparse
 
 from agents.claude_client import UsageAccumulator
@@ -162,6 +163,14 @@ async def _background_discover_sources(articles: list[dict]):
     # 3. Check against the DB — which domains already exist?
     new_count = 0
     for domain, url, category in domains_to_check:
+        cursor = await db.execute(
+            "SELECT id FROM sources WHERE LOWER(domain) = ? AND source_type = 'rss_feed' AND status = 'active'",
+            (domain.lower(),),
+        )
+        existing_feeds = await cursor.fetchall()
+        if len(existing_feeds) >= MAX_FEEDS_PER_DOMAIN:
+            continue  # domain already has enough active feeds
+
         cursor = await db.execute(
             "SELECT id FROM sources WHERE LOWER(domain) = ?",
             (domain.lower(),),
@@ -578,6 +587,28 @@ class AgentOrchestrator:
             from source_rules import get_feeds_with_metadata
             all_feeds = await get_feeds_with_metadata(tenant_id=tenant_id)
 
+            # Domain balancing: at most MAX_FEEDS_PER_DOMAIN feeds per domain
+            feeds_by_domain: dict[str, list[dict]] = defaultdict(list)
+            for feed in all_feeds:
+                feeds_by_domain[feed.get("domain", "")].append(feed)
+
+            balanced_feeds = []
+            for domain, domain_feeds in feeds_by_domain.items():
+                if len(domain_feeds) > MAX_FEEDS_PER_DOMAIN:
+                    # Sort by article_count, keep the most-used feeds
+                    domain_feeds.sort(key=lambda f: f.get("article_count", 0), reverse=True)
+                    kept = domain_feeds[:MAX_FEEDS_PER_DOMAIN]
+                    logger.info(
+                        f"Domain balancing: {domain} limited from {len(domain_feeds)} to {MAX_FEEDS_PER_DOMAIN} feeds"
+                    )
+                    balanced_feeds.extend(kept)
+                else:
+                    balanced_feeds.extend(domain_feeds)
+
+            if len(balanced_feeds) < len(all_feeds):
+                logger.info(f"Domain balancing total: {len(all_feeds)} → {len(balanced_feeds)} feeds")
+            all_feeds = balanced_feeds
+
             feed_usage = None
             if len(all_feeds) > 20:
                 selected_feeds, feed_usage = await rss_researcher.select_relevant_feeds(
diff --git a/src/agents/researcher.py b/src/agents/researcher.py
index 9f990b4..3354138 100644
--- a/src/agents/researcher.py
+++ b/src/agents/researcher.py
@@ -97,6 +97,7 @@ RULES:
 - Better to select one feed too many than too few
 - If "International sources: No": do not select any international feeds
 - General news feeds (tagesschau, Spiegel etc.) are almost always relevant
+- SOURCE DIVERSITY: Select at most 2-3 feeds per domain. Prefer a broad mix of different sources over many feeds from the same domain.
 - Answer ONLY with a JSON array of the numbers, e.g. [1, 2, 5, 12]"""
diff --git a/src/config.py b/src/config.py
index 7e4603e..0bd939e 100644
--- a/src/config.py
+++ b/src/config.py
@@ -65,6 +65,10 @@ SMTP_FROM_EMAIL = os.environ.get("SMTP_FROM_EMAIL", "noreply@intelsight.de")
 SMTP_FROM_NAME = os.environ.get("SMTP_FROM_NAME", "AegisSight Monitor")
 SMTP_USE_TLS = os.environ.get("SMTP_USE_TLS", "true").lower() == "true"
 
+# Source diversity: per-domain limits
+MAX_FEEDS_PER_DOMAIN = 3  # max. feeds per domain during feed selection
+MAX_ARTICLES_PER_DOMAIN_RSS = 10  # max. articles per domain after the RSS fetch
+
 # Magic Link
 MAGIC_LINK_EXPIRE_MINUTES = 10
 MAGIC_LINK_BASE_URL = os.environ.get("MAGIC_LINK_BASE_URL", "https://osint.intelsight.de")
diff --git a/src/feeds/rss_parser.py b/src/feeds/rss_parser.py
index b047b85..4282b43 100644
--- a/src/feeds/rss_parser.py
+++ b/src/feeds/rss_parser.py
@@ -4,7 +4,8 @@ import logging
 import feedparser
 import httpx
 from datetime import datetime, timezone
-from config import TIMEZONE
+from config import TIMEZONE, MAX_ARTICLES_PER_DOMAIN_RSS
+from source_rules import _extract_domain
 
 logger = logging.getLogger("osint.rss")
 
@@ -58,6 +59,7 @@ class RSSParser:
         cat_info = "all" if international else "German + authorities only"
         logger.info(f"RSS search for '{search_term}' ({cat_info}): {len(all_articles)} hits")
 
+        all_articles = self._apply_domain_cap(all_articles)
         return all_articles
 
     async def search_feeds_selective(self, search_term: str, selected_feeds: list[dict]) -> list[dict]:
@@ -88,6 +90,7 @@ class RSSParser:
                 all_articles.extend(result)
 
         logger.info(f"Selective RSS search for '{search_term}': {len(all_articles)} hits from {len(selected_feeds)} feeds")
+        all_articles = self._apply_domain_cap(all_articles)
         return all_articles
 
     async def _get_rss_feeds(self, tenant_id: int = None) -> dict:
@@ -153,6 +156,39 @@
 
         return articles
 
+    def _apply_domain_cap(self, articles: list[dict]) -> list[dict]:
+        """Caps the number of articles per domain at MAX_ARTICLES_PER_DOMAIN_RSS.
+
+        Groups articles by domain, sorts each domain's articles by
+        relevance_score (best first), and keeps only the top N per domain.
+        """
+        if not articles:
+            return articles
+
+        # Group by domain
+        by_domain: dict[str, list[dict]] = {}
+        for article in articles:
+            domain = _extract_domain(article.get("source_url", ""))
+            if not domain:
+                domain = "__unknown__"
+            by_domain.setdefault(domain, []).append(article)
+
+        capped = []
+        for domain, domain_articles in by_domain.items():
+            # Sort by relevance (best first)
+            domain_articles.sort(key=lambda a: a.get("relevance_score", 0), reverse=True)
+            kept = domain_articles[:MAX_ARTICLES_PER_DOMAIN_RSS]
+            if len(domain_articles) > MAX_ARTICLES_PER_DOMAIN_RSS:
+                logger.info(
+                    f"Domain cap: {domain} limited from {len(domain_articles)} to {MAX_ARTICLES_PER_DOMAIN_RSS} articles"
+                )
+            capped.extend(kept)
+
+        if len(capped) < len(articles):
+            logger.info(f"Domain cap total: {len(articles)} → {len(capped)} articles")
+
+        return capped
+
     def _is_german(self, text: str) -> bool:
         """Simple heuristic for whether a text is German."""
         german_words = {"der", "die", "das", "und", "ist", "von", "mit", "für", "auf", "ein",
diff --git a/src/source_rules.py b/src/source_rules.py
index 758fbb0..a227d88 100644
--- a/src/source_rules.py
+++ b/src/source_rules.py
@@ -8,7 +8,8 @@ from urllib.parse import urlparse
 import httpx
 import feedparser
 
-from config import CLAUDE_PATH, CLAUDE_TIMEOUT
+import hashlib
+from config import CLAUDE_PATH, CLAUDE_TIMEOUT, MAX_FEEDS_PER_DOMAIN
 
 logger = logging.getLogger("osint.source_rules")
 
@@ -289,6 +290,15 @@ def domain_to_display_name(domain: str) -> str:
     return core.replace("-", " ").title()
 
 
+def _compute_content_hash(entries: list) -> str:
+    """Computes a fingerprint from the first 5 entry titles of a feed."""
+    titles = [e.get("title", "") for e in entries[:5]]
+    combined = "|".join(titles).strip()
+    if not combined:
+        return ""
+    return hashlib.sha256(combined.encode("utf-8")).hexdigest()[:16]
+
+
 async def _validate_feed(client: httpx.AsyncClient, url: str) -> dict | None:
     """Checks whether a URL is a valid RSS/Atom feed. Returns feed info or None."""
     try:
@@ -304,9 +314,11 @@ async def _validate_feed(client: httpx.AsyncClient, url: str) -> dict | None:
         if feed.get("bozo") and not feed.entries:
             return None
         if feed.feed.get("title") or feed.entries:
+            content_hash = _compute_content_hash(feed.entries)
             return {
                 "url": str(resp.url),  # final URL after redirects
                 "title": feed.feed.get("title", ""),
+                "content_hash": content_hash,
             }
     except Exception:
         pass
@@ -431,6 +443,7 @@ async def discover_all_feeds(url: str) -> dict:
     }
 
     seen_urls = set()
+    seen_content_hashes = set()
 
     async with httpx.AsyncClient(
         timeout=15.0,
@@ -487,9 +500,19 @@
         batch = candidate_urls[i:i + 10]
         results = await asyncio.gather(*[_validate_and_collect(u) for u in batch])
         for feed_info in results:
-            if feed_info and feed_info["url"] not in seen_urls:
-                seen_urls.add(feed_info["url"])
-                result["feeds"].append(feed_info)
+            if not feed_info:
+                continue
+            if feed_info["url"] in seen_urls:
+                continue
+            # Content-hash duplicate detection (same content = WordPress redirect etc.)
+            content_hash = feed_info.get("content_hash", "")
+            if content_hash and content_hash in seen_content_hashes:
+                logger.debug(f"Content-hash duplicate skipped: {feed_info['url']}")
+                continue
+            seen_urls.add(feed_info["url"])
+            if content_hash:
+                seen_content_hashes.add(content_hash)
+            result["feeds"].append(feed_info)
 
     logger.info(f"discover_all_feeds({domain}): {len(result['feeds'])} feeds found")
     return result
@@ -606,14 +629,14 @@ async def get_feeds_with_metadata(tenant_id: int = None) -> list[dict]:
     try:
         if tenant_id:
             cursor = await db.execute(
-                "SELECT name, url, domain, category FROM sources "
+                "SELECT name, url, domain, category, COALESCE(article_count, 0) AS article_count FROM sources "
                 "WHERE source_type = 'rss_feed' AND status = 'active' "
                 "AND (tenant_id IS NULL OR tenant_id = ?)",
                 (tenant_id,),
             )
         else:
             cursor = await db.execute(
-                "SELECT name, url, domain, category FROM sources "
+                "SELECT name, url, domain, category, COALESCE(article_count, 0) AS article_count FROM sources "
                 "WHERE source_type = 'rss_feed' AND status = 'active'"
             )
         return [dict(row) for row in await cursor.fetchall()]
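
Illustration (not part of the patch): a minimal runnable sketch of the per-domain
article cap that _apply_domain_cap introduces. Domain extraction is stubbed with
urllib.parse here instead of source_rules._extract_domain, the cap is lowered
from the patch default of 10 to 2 so the effect shows with four articles, and
the sample articles are hypothetical.

# Sketch only — mirrors the patch's _apply_domain_cap under the assumptions above.
from urllib.parse import urlparse

MAX_ARTICLES_PER_DOMAIN = 2  # patch default is 10

def apply_domain_cap(articles: list[dict]) -> list[dict]:
    # Group by domain, falling back to a sentinel for unparseable URLs
    by_domain: dict[str, list[dict]] = {}
    for article in articles:
        domain = urlparse(article.get("source_url", "")).netloc or "__unknown__"
        by_domain.setdefault(domain, []).append(article)
    capped = []
    for domain_articles in by_domain.values():
        # Best first, then keep only the top N per domain
        domain_articles.sort(key=lambda a: a.get("relevance_score", 0), reverse=True)
        capped.extend(domain_articles[:MAX_ARTICLES_PER_DOMAIN])
    return capped

articles = [
    {"source_url": "https://example.org/a", "relevance_score": 0.9},
    {"source_url": "https://example.org/b", "relevance_score": 0.5},
    {"source_url": "https://example.org/c", "relevance_score": 0.7},
    {"source_url": "https://other.net/x", "relevance_score": 0.4},
]
print([a["source_url"] for a in apply_domain_cap(articles)])
# -> ['https://example.org/a', 'https://example.org/c', 'https://other.net/x']

Sorting before slicing means the cap always keeps a domain's highest-scoring
articles rather than whichever happened to arrive first.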
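
Likewise a sketch of the duplicate detection behind _compute_content_hash: the
fingerprint is the first 16 hex characters of a SHA-256 over a feed's first five
entry titles, so two feed URLs serving identical items (e.g. a WordPress redirect
alias) collapse to one entry during discovery. The sample feeds are made up.

# Sketch only — same fingerprint scheme as the patch's _compute_content_hash.
import hashlib

def compute_content_hash(entries: list[dict]) -> str:
    titles = [e.get("title", "") for e in entries[:5]]
    combined = "|".join(titles).strip()
    if not combined:
        return ""
    return hashlib.sha256(combined.encode("utf-8")).hexdigest()[:16]

# Two hypothetical feed URLs serving identical items hash the same,
# so the second is skipped by the seen_content_hashes check in discovery.
feed_a = [{"title": "Story one"}, {"title": "Story two"}]
feed_b = [{"title": "Story one"}, {"title": "Story two"}]
assert compute_content_hash(feed_a) == compute_content_hash(feed_b)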