Commits vergleichen
2 Commits
f7fc09c864
...
14b98b59e0
| Autor | SHA1 | Datum | |
|---|---|---|---|
| 14b98b59e0 | |||
| 0e4c78d50a |
@@ -922,7 +922,22 @@ class AgentOrchestrator:
|
||||
# Feed-Selektion-Keywords nur als Fallback wenn dynamische fehlen
|
||||
if not keywords:
|
||||
keywords = feed_sel_keywords
|
||||
articles = await rss_parser.search_feeds_selective(title, selected_feeds, keywords=keywords)
|
||||
# --- Recall-Boost: dynamische Google-News-Volltext-Suchfeeds ---
|
||||
# Statt nur feste site:-Feeds zu durchsuchen, baut die Pipeline
|
||||
# pro Sprache einen Google-News-Suchfeed aus den Keywords. Damit
|
||||
# erreichen wir Quellen, die in keinem festen Feed stehen
|
||||
# (Vendor-Blogs, Fachportale, Regionalmedien).
|
||||
from agents.researcher import build_news_search_feeds
|
||||
if source_lang_whitelist:
|
||||
_gnews_langs = list(source_lang_whitelist)
|
||||
else:
|
||||
_gnews_langs = list({output_language_iso, research_language_iso})
|
||||
_gnews_feeds = build_news_search_feeds(keywords, _gnews_langs)
|
||||
if _gnews_feeds:
|
||||
logger.info(f"Google-News-Suchfeeds ergaenzt: {len(_gnews_feeds)}")
|
||||
articles = await rss_parser.search_feeds_selective(
|
||||
title, selected_feeds + _gnews_feeds, keywords=keywords,
|
||||
)
|
||||
else:
|
||||
articles = await rss_parser.search_feeds(title, international=international, tenant_id=tenant_id, keywords=keywords, user_id=user_id)
|
||||
|
||||
|
||||
@@ -2,12 +2,96 @@
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import urllib.parse
|
||||
from agents.claude_client import call_claude, ClaudeUsage
|
||||
from config import CLAUDE_MODEL_FAST
|
||||
|
||||
logger = logging.getLogger("osint.researcher")
|
||||
|
||||
|
||||
# Google-News-Locale pro ISO-Sprachcode: (hl, gl). ceid wird daraus gebaut.
|
||||
_GNEWS_LOCALE = {
|
||||
"ja": ("ja", "JP"),
|
||||
"de": ("de", "DE"),
|
||||
"en": ("en-US", "US"),
|
||||
"ru": ("ru", "RU"),
|
||||
"ko": ("ko", "KR"),
|
||||
"zh": ("zh-CN", "CN"),
|
||||
"fr": ("fr", "FR"),
|
||||
"es": ("es", "ES"),
|
||||
"it": ("it", "IT"),
|
||||
"ar": ("ar", "EG"),
|
||||
"he": ("iw", "IL"),
|
||||
"fa": ("fa", "IR"),
|
||||
}
|
||||
|
||||
|
||||
def build_news_search_feeds(
|
||||
keywords_by_lang: dict | list | None,
|
||||
languages: list[str],
|
||||
max_keywords: int = 4,
|
||||
) -> list[dict]:
|
||||
"""Baut dynamische Google-News-Volltext-Such-Feeds pro Sprache.
|
||||
|
||||
Statt nur feste site:-RSS-Feeds zu durchsuchen, erzeugt diese Funktion pro
|
||||
Sprache einen Google-News-Suchfeed (news.google.com/rss/search?q=...). Damit
|
||||
erreicht die Pipeline auch Quellen, die in keinem festen Feed stehen
|
||||
(Security-Vendor-Blogs, Fachportale, Regionalmedien). Der Recall steigt
|
||||
massiv; die Precision bleibt, weil der nachgelagerte Topic-Filter unveraendert
|
||||
greift.
|
||||
|
||||
Args:
|
||||
keywords_by_lang: Sprach-Dict {iso: [keyword,...]} aus der Keyword-Extraktion.
|
||||
languages: ISO-Codes, fuer die ein Suchfeed gebaut werden soll.
|
||||
max_keywords: wie viele (spezifischste) Keywords in die Such-Query gehen.
|
||||
|
||||
Returns:
|
||||
Liste von Feed-Config-Dicts (kompatibel mit RSSParser._fetch_feed).
|
||||
"""
|
||||
if not keywords_by_lang or not isinstance(keywords_by_lang, dict):
|
||||
return []
|
||||
|
||||
feeds: list[dict] = []
|
||||
seen_queries: set[str] = set()
|
||||
for lang in languages:
|
||||
lang_key = (lang or "").lower().strip()
|
||||
locale = _GNEWS_LOCALE.get(lang_key)
|
||||
if not locale:
|
||||
continue
|
||||
kws = keywords_by_lang.get(lang_key) or []
|
||||
# Fallback: wenn fuer die Sprache keine Keywords da sind, "en" nehmen
|
||||
# (lateinische Eigennamen matchen auch in fremdsprachigen News-Indizes).
|
||||
if not kws and lang_key != "en":
|
||||
kws = keywords_by_lang.get("en") or []
|
||||
kws = [str(k).strip() for k in kws if str(k).strip()]
|
||||
if not kws:
|
||||
continue
|
||||
query = " ".join(kws[:max_keywords])
|
||||
if not query or query in seen_queries:
|
||||
continue
|
||||
seen_queries.add(query)
|
||||
|
||||
hl, gl = locale
|
||||
ceid_lang = hl.split("-")[0]
|
||||
url = (
|
||||
"https://news.google.com/rss/search?q="
|
||||
+ urllib.parse.quote(query)
|
||||
+ f"&hl={hl}&gl={gl}&ceid={gl}:{ceid_lang}"
|
||||
)
|
||||
feeds.append({
|
||||
"name": f"Google News Suche ({lang_key}): {query}",
|
||||
"url": url,
|
||||
# Eigene Domain-Gruppe, damit der Domain-Cap die Such-Feeds NICHT mit
|
||||
# den site:-Google-News-Feeds in einen Topf wirft.
|
||||
"domain": f"google-news-search-{lang_key}",
|
||||
"primary_language": lang_key,
|
||||
"category": "international",
|
||||
"media_type": "",
|
||||
})
|
||||
logger.info("Google-News-Suchfeed (%s): q=%r", lang_key, query)
|
||||
return feeds
|
||||
|
||||
|
||||
class ResearcherParseError(Exception):
    """Claude returned a non-empty response from which no JSON could be extracted."""
|
||||
|
||||
|
||||
@@ -6,6 +6,11 @@ import httpx
|
||||
from datetime import datetime, timezone
|
||||
from config import TIMEZONE, MAX_ARTICLES_PER_DOMAIN_RSS
|
||||
from source_rules import _extract_domain
|
||||
|
||||
# Per-domain article cap for the dynamic Google News search feeds. Per the
# original author's note it is meant to be higher than the regular domain cap
# (MAX_ARTICLES_PER_DOMAIN_RSS, imported from config) because a search feed is
# built for broad recall; the topic filter decides on precision afterwards.
MAX_ARTICLES_PER_DOMAIN_RSS_SEARCH = 25
|
||||
from feeds.transcript_extractors._common import html_to_text
|
||||
from services.post_refresh_qc import normalize_german_umlauts
|
||||
from agents.researcher import keywords_for_language, flatten_keywords
|
||||
@@ -276,10 +281,15 @@ class RSSParser:
|
||||
for domain, domain_articles in by_domain.items():
|
||||
# Nach Relevanz sortieren (beste zuerst)
|
||||
domain_articles.sort(key=lambda a: a.get("relevance_score", 0), reverse=True)
|
||||
kept = domain_articles[:MAX_ARTICLES_PER_DOMAIN_RSS]
|
||||
if len(domain_articles) > MAX_ARTICLES_PER_DOMAIN_RSS:
|
||||
# Dynamische Google-News-Suchfeeds ("google-news-search-<lang>") sind
|
||||
# der Recall-Treiber und bekommen einen hoeheren Cap als feste Feeds.
|
||||
cap = (MAX_ARTICLES_PER_DOMAIN_RSS_SEARCH
|
||||
if domain.startswith("google-news-search-")
|
||||
else MAX_ARTICLES_PER_DOMAIN_RSS)
|
||||
kept = domain_articles[:cap]
|
||||
if len(domain_articles) > cap:
|
||||
logger.info(
|
||||
f"Domain-Cap: {domain} von {len(domain_articles)} auf {MAX_ARTICLES_PER_DOMAIN_RSS} Artikel begrenzt"
|
||||
f"Domain-Cap: {domain} von {len(domain_articles)} auf {cap} Artikel begrenzt"
|
||||
)
|
||||
capped.extend(kept)
|
||||
|
||||
|
||||
In neuem Issue referenzieren
Einen Benutzer sperren