Ensure source diversity: domain cap + balance + discovery improvements

- config.py: MAX_FEEDS_PER_DOMAIN=3, MAX_ARTICLES_PER_DOMAIN_RSS=10
- rss_parser.py: _apply_domain_cap() limits articles per domain after the RSS fetch
- orchestrator.py: domain balance before feed selection (max. 3 feeds/domain), domain cap in background discovery
- source_rules.py: article_count in get_feeds_with_metadata(), content hash in _validate_feed() for duplicate detection during discovery
- researcher.py: QUELLENVIELFALT rule in the Haiku feed-selection prompt
- DB: deactivated 52 WordPress redirect duplicates (netzpolitik.org, bashinho.de)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
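Both caps follow the same pattern: group items by domain, rank within each group, keep the top N. A minimal standalone sketch of that pattern (toy data; the helper name `top_n_per_domain` is illustrative only, the actual implementations are `rss_parser._apply_domain_cap()` and the orchestrator's balance step shown in the diff below):

```python
# Toy sketch of the shared "top N per domain" pattern; not the project's code.
from collections import defaultdict


def top_n_per_domain(items: list[dict], n: int, score_key: str) -> list[dict]:
    by_domain: dict[str, list[dict]] = defaultdict(list)
    for item in items:
        by_domain[item.get("domain", "")].append(item)
    kept: list[dict] = []
    for group in by_domain.values():
        # Rank within the domain, keep only the best n entries
        group.sort(key=lambda i: i.get(score_key, 0), reverse=True)
        kept.extend(group[:n])
    return kept


articles = [{"domain": "example.org", "relevance_score": s} for s in range(12)]
print(len(top_n_per_domain(articles, 10, "relevance_score")))  # -> 10
```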
orchestrator.py
@@ -4,8 +4,9 @@ import json
 import logging
 import re
 from datetime import datetime, timezone
-from config import TIMEZONE
+from config import TIMEZONE, MAX_FEEDS_PER_DOMAIN
 from typing import Optional
+from collections import defaultdict
 from urllib.parse import urlparse, urlunparse

 from agents.claude_client import UsageAccumulator
@@ -162,6 +163,14 @@ async def _background_discover_sources(articles: list[dict]):
     # 3. Check against the DB: which domains already exist?
     new_count = 0
     for domain, url, category in domains_to_check:
+        cursor = await db.execute(
+            "SELECT id FROM sources WHERE LOWER(domain) = ? AND source_type = 'rss_feed' AND status = 'active'",
+            (domain.lower(),),
+        )
+        existing_feeds = await cursor.fetchall()
+        if len(existing_feeds) >= MAX_FEEDS_PER_DOMAIN:
+            continue  # domain already has enough active feeds
+
         cursor = await db.execute(
             "SELECT id FROM sources WHERE LOWER(domain) = ?",
             (domain.lower(),),
@@ -578,6 +587,28 @@ class AgentOrchestrator:
         from source_rules import get_feeds_with_metadata
         all_feeds = await get_feeds_with_metadata(tenant_id=tenant_id)

+        # Domain balance: at most MAX_FEEDS_PER_DOMAIN feeds per domain
+        feeds_by_domain: dict[str, list[dict]] = defaultdict(list)
+        for feed in all_feeds:
+            feeds_by_domain[feed.get("domain", "")].append(feed)
+
+        balanced_feeds = []
+        for domain, domain_feeds in feeds_by_domain.items():
+            if len(domain_feeds) > MAX_FEEDS_PER_DOMAIN:
+                # Sort by article_count, keep the most-used feeds
+                domain_feeds.sort(key=lambda f: f.get("article_count", 0), reverse=True)
+                kept = domain_feeds[:MAX_FEEDS_PER_DOMAIN]
+                logger.info(
+                    f"Domain-Balance: {domain} von {len(domain_feeds)} auf {MAX_FEEDS_PER_DOMAIN} Feeds begrenzt"
+                )
+                balanced_feeds.extend(kept)
+            else:
+                balanced_feeds.extend(domain_feeds)
+
+        if len(balanced_feeds) < len(all_feeds):
+            logger.info(f"Domain-Balance gesamt: {len(all_feeds)} → {len(balanced_feeds)} Feeds")
+        all_feeds = balanced_feeds
+
         feed_usage = None
         if len(all_feeds) > 20:
             selected_feeds, feed_usage = await rss_researcher.select_relevant_feeds(
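The new guard in `_background_discover_sources()` skips any domain that already has MAX_FEEDS_PER_DOMAIN active RSS feeds. The query logic can be sanity-checked against a throwaway SQLite database; the `sources` columns below are only the ones the query touches, the real schema has more:

```python
# Throwaway check of the discovery guard's query; not the project's test suite.
import sqlite3

MAX_FEEDS_PER_DOMAIN = 3

db = sqlite3.connect(":memory:")
db.execute("CREATE TABLE sources (id INTEGER PRIMARY KEY, domain TEXT, source_type TEXT, status TEXT)")
for _ in range(3):
    db.execute(
        "INSERT INTO sources (domain, source_type, status) VALUES (?, 'rss_feed', 'active')",
        ("Example.org",),  # mixed case on purpose: the query lowercases the column
    )

existing = db.execute(
    "SELECT id FROM sources WHERE LOWER(domain) = ? AND source_type = 'rss_feed' AND status = 'active'",
    ("example.org",),
).fetchall()
print(len(existing) >= MAX_FEEDS_PER_DOMAIN)  # -> True: this domain would be skipped
```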
researcher.py
@@ -97,6 +97,7 @@ REGELN:
 - Lieber einen Feed zu viel als zu wenig auswählen
 - Bei "Internationale Quellen: Nein": Keine internationalen Feeds auswählen
 - Allgemeine Nachrichtenfeeds (tagesschau, Spiegel etc.) sind fast immer relevant
+- QUELLENVIELFALT: Wähle pro Domain maximal 2-3 Feeds. Bevorzuge eine breite Mischung aus verschiedenen Quellen statt vieler Feeds derselben Domain.
 - Antworte NUR mit einem JSON-Array der Nummern, z.B. [1, 2, 5, 12]"""


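The prompt's last rule forces a bare JSON array of feed numbers, so the model's reply can be mapped straight back onto the numbered feed list. A sketch of that mapping, assuming 1-based numbering as the example `[1, 2, 5, 12]` suggests (the diff does not show the project's actual reply-parsing code):

```python
# Hypothetical mapping of the model's JSON-array reply back to feeds.
import json

feeds = [{"name": "tagesschau"}, {"name": "Spiegel"}, {"name": "heise"}]
reply = "[1, 3]"  # model answer in the format the prompt demands

indices = json.loads(reply)
# Drop out-of-range numbers defensively; convert 1-based to 0-based.
selected = [feeds[i - 1] for i in indices if 1 <= i <= len(feeds)]
print([f["name"] for f in selected])  # -> ['tagesschau', 'heise']
```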
config.py
@@ -65,6 +65,10 @@ SMTP_FROM_EMAIL = os.environ.get("SMTP_FROM_EMAIL", "noreply@intelsight.de")
 SMTP_FROM_NAME = os.environ.get("SMTP_FROM_NAME", "AegisSight Monitor")
 SMTP_USE_TLS = os.environ.get("SMTP_USE_TLS", "true").lower() == "true"

+# Source diversity: per-domain limits
+MAX_FEEDS_PER_DOMAIN = 3  # max. feeds per domain during feed selection
+MAX_ARTICLES_PER_DOMAIN_RSS = 10  # max. articles per domain after the RSS fetch
+
 # Magic Link
 MAGIC_LINK_EXPIRE_MINUTES = 10
 MAGIC_LINK_BASE_URL = os.environ.get("MAGIC_LINK_BASE_URL", "https://osint.intelsight.de")
rss_parser.py
@@ -4,7 +4,8 @@ import logging
 import feedparser
 import httpx
 from datetime import datetime, timezone
-from config import TIMEZONE
+from config import TIMEZONE, MAX_ARTICLES_PER_DOMAIN_RSS
+from source_rules import _extract_domain

 logger = logging.getLogger("osint.rss")

@@ -58,6 +59,7 @@ class RSSParser:

         cat_info = "alle" if international else "nur deutsch + behörden"
         logger.info(f"RSS-Suche nach '{search_term}' ({cat_info}): {len(all_articles)} Treffer")
+        all_articles = self._apply_domain_cap(all_articles)
         return all_articles

     async def search_feeds_selective(self, search_term: str, selected_feeds: list[dict]) -> list[dict]:
@@ -88,6 +90,7 @@ class RSSParser:
             all_articles.extend(result)

         logger.info(f"RSS-Selektiv nach '{search_term}': {len(all_articles)} Treffer aus {len(selected_feeds)} Feeds")
+        all_articles = self._apply_domain_cap(all_articles)
         return all_articles

     async def _get_rss_feeds(self, tenant_id: int = None) -> dict:
@@ -153,6 +156,39 @@ class RSSParser:

         return articles

+    def _apply_domain_cap(self, articles: list[dict]) -> list[dict]:
+        """Limits the number of articles per domain to MAX_ARTICLES_PER_DOMAIN_RSS.
+
+        Groups by domain, sorts each domain's articles by relevance_score (best first),
+        and keeps only the top N per domain.
+        """
+        if not articles:
+            return articles
+
+        # Group by domain
+        by_domain: dict[str, list[dict]] = {}
+        for article in articles:
+            domain = _extract_domain(article.get("source_url", ""))
+            if not domain:
+                domain = "__unknown__"
+            by_domain.setdefault(domain, []).append(article)
+
+        capped = []
+        for domain, domain_articles in by_domain.items():
+            # Sort by relevance (best first)
+            domain_articles.sort(key=lambda a: a.get("relevance_score", 0), reverse=True)
+            kept = domain_articles[:MAX_ARTICLES_PER_DOMAIN_RSS]
+            if len(domain_articles) > MAX_ARTICLES_PER_DOMAIN_RSS:
+                logger.info(
+                    f"Domain-Cap: {domain} von {len(domain_articles)} auf {MAX_ARTICLES_PER_DOMAIN_RSS} Artikel begrenzt"
+                )
+            capped.extend(kept)
+
+        if len(capped) < len(articles):
+            logger.info(f"Domain-Cap gesamt: {len(articles)} → {len(capped)} Artikel")
+
+        return capped
+
     def _is_german(self, text: str) -> bool:
         """Simple heuristic for whether a text is German."""
         german_words = {"der", "die", "das", "und", "ist", "von", "mit", "für", "auf", "ein",
source_rules.py
@@ -8,7 +8,8 @@ from urllib.parse import urlparse
 import httpx
 import feedparser

-from config import CLAUDE_PATH, CLAUDE_TIMEOUT
+import hashlib
+from config import CLAUDE_PATH, CLAUDE_TIMEOUT, MAX_FEEDS_PER_DOMAIN

 logger = logging.getLogger("osint.source_rules")

@@ -289,6 +290,15 @@ def domain_to_display_name(domain: str) -> str:
     return core.replace("-", " ").title()


+def _compute_content_hash(entries: list) -> str:
+    """Computes a fingerprint from a feed's first five entry titles."""
+    titles = [e.get("title", "") for e in entries[:5]]
+    combined = "|".join(titles).strip()
+    if not combined:
+        return ""
+    return hashlib.sha256(combined.encode("utf-8")).hexdigest()[:16]
+
+
 async def _validate_feed(client: httpx.AsyncClient, url: str) -> dict | None:
     """Checks whether a URL is a valid RSS/Atom feed. Returns feed info or None."""
     try:
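Two feed URLs that serve identical content (e.g., the WordPress case from the commit message, where one feed URL just redirects to another) produce the same first-five title list and therefore the same fingerprint. A quick check of the fingerprint scheme with plain hashlib, mirroring the logic of `_compute_content_hash()` above but taking titles directly instead of entry dicts:

```python
# Mirrors _compute_content_hash() on bare title lists; illustration only.
import hashlib


def fingerprint(titles: list[str]) -> str:
    combined = "|".join(titles[:5]).strip()
    return hashlib.sha256(combined.encode("utf-8")).hexdigest()[:16] if combined else ""


a = fingerprint(["Post 1", "Post 2", "Post 3", "Post 4", "Post 5"])
b = fingerprint(["Post 1", "Post 2", "Post 3", "Post 4", "Post 5"])
print(a == b, len(a))   # -> True 16: same content, same 16-hex-char fingerprint
print(fingerprint([]))  # -> "": empty feeds never count as duplicates
```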
@@ -304,9 +314,11 @@ async def _validate_feed(client: httpx.AsyncClient, url: str) -> dict | None:
         if feed.get("bozo") and not feed.entries:
             return None
         if feed.feed.get("title") or feed.entries:
+            content_hash = _compute_content_hash(feed.entries)
             return {
                 "url": str(resp.url),  # final URL after redirects
                 "title": feed.feed.get("title", ""),
+                "content_hash": content_hash,
             }
     except Exception:
         pass
@@ -431,6 +443,7 @@ async def discover_all_feeds(url: str) -> dict:
     }

     seen_urls = set()
+    seen_content_hashes = set()

     async with httpx.AsyncClient(
         timeout=15.0,
@@ -487,9 +500,19 @@ async def discover_all_feeds(url: str) -> dict:
         batch = candidate_urls[i:i + 10]
         results = await asyncio.gather(*[_validate_and_collect(u) for u in batch])
         for feed_info in results:
-            if feed_info and feed_info["url"] not in seen_urls:
-                seen_urls.add(feed_info["url"])
-                result["feeds"].append(feed_info)
+            if not feed_info:
+                continue
+            if feed_info["url"] in seen_urls:
+                continue
+            # Content-hash duplicate detection (identical content = WordPress redirect etc.)
+            content_hash = feed_info.get("content_hash", "")
+            if content_hash and content_hash in seen_content_hashes:
+                logger.debug(f"Content-Hash Duplikat übersprungen: {feed_info['url']}")
+                continue
+            seen_urls.add(feed_info["url"])
+            if content_hash:
+                seen_content_hashes.add(content_hash)
+            result["feeds"].append(feed_info)

     logger.info(f"discover_all_feeds({domain}): {len(result['feeds'])} Feeds gefunden")
     return result
@@ -606,14 +629,14 @@ async def get_feeds_with_metadata(tenant_id: int = None) -> list[dict]:
     try:
         if tenant_id:
             cursor = await db.execute(
-                "SELECT name, url, domain, category FROM sources "
+                "SELECT name, url, domain, category, COALESCE(article_count, 0) AS article_count FROM sources "
                 "WHERE source_type = 'rss_feed' AND status = 'active' "
                 "AND (tenant_id IS NULL OR tenant_id = ?)",
                 (tenant_id,),
             )
         else:
             cursor = await db.execute(
-                "SELECT name, url, domain, category FROM sources "
+                "SELECT name, url, domain, category, COALESCE(article_count, 0) AS article_count FROM sources "
                 "WHERE source_type = 'rss_feed' AND status = 'active'"
             )
         return [dict(row) for row in await cursor.fetchall()]
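`COALESCE(article_count, 0)` keeps the balance step's sort key numeric even for rows where `article_count` was never populated. A quick illustration in plain SQLite (minimal table with only the relevant columns):

```python
# Shows why the query wraps article_count in COALESCE; illustration only.
import sqlite3

db = sqlite3.connect(":memory:")
db.execute("CREATE TABLE sources (name TEXT, article_count INTEGER)")
db.execute("INSERT INTO sources VALUES ('old feed', NULL)")  # never counted
db.execute("INSERT INTO sources VALUES ('busy feed', 42)")

rows = db.execute(
    "SELECT name, COALESCE(article_count, 0) AS article_count FROM sources"
).fetchall()
print(rows)  # -> [('old feed', 0), ('busy feed', 42)]: NULL becomes a sortable 0
```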