Quellenvielfalt sicherstellen: Domain-Cap + Balance + Discovery-Verbesserungen
- config.py: MAX_FEEDS_PER_DOMAIN=3, MAX_ARTICLES_PER_DOMAIN_RSS=10
- rss_parser.py: _apply_domain_cap() begrenzt Artikel pro Domain nach RSS-Fetch
- orchestrator.py: Domain-Balance vor Feed-Selektion (max 3 Feeds/Domain), Domain-Cap in Background-Discovery
- source_rules.py: article_count in get_feeds_with_metadata(), Content-Hash in _validate_feed() für Duplikat-Erkennung bei Discovery
- researcher.py: QUELLENVIELFALT-Regel im Haiku Feed-Selektions-Prompt
- DB: 52 WordPress-Redirect-Duplikate deaktiviert (netzpolitik.org, bashinho.de)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Dieser Commit ist enthalten in:
@@ -8,7 +8,8 @@ from urllib.parse import urlparse
|
||||
import httpx
|
||||
import feedparser
|
||||
|
||||
from config import CLAUDE_PATH, CLAUDE_TIMEOUT
|
||||
import hashlib
|
||||
from config import CLAUDE_PATH, CLAUDE_TIMEOUT, MAX_FEEDS_PER_DOMAIN
|
||||
|
||||
logger = logging.getLogger("osint.source_rules")
|
||||
|
||||
@@ -289,6 +290,15 @@ def domain_to_display_name(domain: str) -> str:
|
||||
return core.replace("-", " ").title()
|
||||
|
||||
|
||||
def _compute_content_hash(entries: list) -> str:
|
||||
"""Berechnet einen Fingerprint aus den ersten 5 Entry-Titeln eines Feeds."""
|
||||
titles = [e.get("title", "") for e in entries[:5]]
|
||||
combined = "|".join(titles).strip()
|
||||
if not combined:
|
||||
return ""
|
||||
return hashlib.sha256(combined.encode("utf-8")).hexdigest()[:16]
|
||||
|
||||
|
||||
async def _validate_feed(client: httpx.AsyncClient, url: str) -> dict | None:
|
||||
"""Prüft ob eine URL ein gültiger RSS/Atom-Feed ist. Gibt Feed-Info zurück oder None."""
|
||||
try:
|
||||
@@ -304,9 +314,11 @@ async def _validate_feed(client: httpx.AsyncClient, url: str) -> dict | None:
|
||||
if feed.get("bozo") and not feed.entries:
|
||||
return None
|
||||
if feed.feed.get("title") or feed.entries:
|
||||
content_hash = _compute_content_hash(feed.entries)
|
||||
return {
|
||||
"url": str(resp.url), # Finale URL nach Redirects
|
||||
"title": feed.feed.get("title", ""),
|
||||
"content_hash": content_hash,
|
||||
}
|
||||
except Exception:
|
||||
pass
|
||||
@@ -431,6 +443,7 @@ async def discover_all_feeds(url: str) -> dict:
|
||||
}
|
||||
|
||||
seen_urls = set()
|
||||
seen_content_hashes = set()
|
||||
|
||||
async with httpx.AsyncClient(
|
||||
timeout=15.0,
|
||||
@@ -487,9 +500,19 @@ async def discover_all_feeds(url: str) -> dict:
|
||||
batch = candidate_urls[i:i + 10]
|
||||
results = await asyncio.gather(*[_validate_and_collect(u) for u in batch])
|
||||
for feed_info in results:
|
||||
if feed_info and feed_info["url"] not in seen_urls:
|
||||
seen_urls.add(feed_info["url"])
|
||||
result["feeds"].append(feed_info)
|
||||
if not feed_info:
|
||||
continue
|
||||
if feed_info["url"] in seen_urls:
|
||||
continue
|
||||
# Content-Hash Duplikat-Erkennung (gleicher Inhalt = WordPress-Redirect etc.)
|
||||
content_hash = feed_info.get("content_hash", "")
|
||||
if content_hash and content_hash in seen_content_hashes:
|
||||
logger.debug(f"Content-Hash Duplikat übersprungen: {feed_info['url']}")
|
||||
continue
|
||||
seen_urls.add(feed_info["url"])
|
||||
if content_hash:
|
||||
seen_content_hashes.add(content_hash)
|
||||
result["feeds"].append(feed_info)
|
||||
|
||||
logger.info(f"discover_all_feeds({domain}): {len(result['feeds'])} Feeds gefunden")
|
||||
return result
|
||||
@@ -606,14 +629,14 @@ async def get_feeds_with_metadata(tenant_id: int = None) -> list[dict]:
|
||||
try:
|
||||
if tenant_id:
|
||||
cursor = await db.execute(
|
||||
"SELECT name, url, domain, category FROM sources "
|
||||
"SELECT name, url, domain, category, COALESCE(article_count, 0) AS article_count FROM sources "
|
||||
"WHERE source_type = 'rss_feed' AND status = 'active' "
|
||||
"AND (tenant_id IS NULL OR tenant_id = ?)",
|
||||
(tenant_id,),
|
||||
)
|
||||
else:
|
||||
cursor = await db.execute(
|
||||
"SELECT name, url, domain, category FROM sources "
|
||||
"SELECT name, url, domain, category, COALESCE(article_count, 0) AS article_count FROM sources "
|
||||
"WHERE source_type = 'rss_feed' AND status = 'active'"
|
||||
)
|
||||
return [dict(row) for row in await cursor.fetchall()]
|
||||
|
||||
In neuem Issue referenzieren
Einen Benutzer sperren