Recall: dynamische Google-News-Volltext-Suchfeeds #33
@@ -922,7 +922,22 @@ class AgentOrchestrator:
|
|||||||
# Feed-Selektion-Keywords nur als Fallback wenn dynamische fehlen
|
# Feed-Selektion-Keywords nur als Fallback wenn dynamische fehlen
|
||||||
if not keywords:
|
if not keywords:
|
||||||
keywords = feed_sel_keywords
|
keywords = feed_sel_keywords
|
||||||
articles = await rss_parser.search_feeds_selective(title, selected_feeds, keywords=keywords)
|
# --- Recall-Boost: dynamische Google-News-Volltext-Suchfeeds ---
|
||||||
|
# Statt nur feste site:-Feeds zu durchsuchen, baut die Pipeline
|
||||||
|
# pro Sprache einen Google-News-Suchfeed aus den Keywords. Damit
|
||||||
|
# erreichen wir Quellen, die in keinem festen Feed stehen
|
||||||
|
# (Vendor-Blogs, Fachportale, Regionalmedien).
|
||||||
|
from agents.researcher import build_news_search_feeds
|
||||||
|
if source_lang_whitelist:
|
||||||
|
_gnews_langs = list(source_lang_whitelist)
|
||||||
|
else:
|
||||||
|
_gnews_langs = list({output_language_iso, research_language_iso})
|
||||||
|
_gnews_feeds = build_news_search_feeds(keywords, _gnews_langs)
|
||||||
|
if _gnews_feeds:
|
||||||
|
logger.info(f"Google-News-Suchfeeds ergaenzt: {len(_gnews_feeds)}")
|
||||||
|
articles = await rss_parser.search_feeds_selective(
|
||||||
|
title, selected_feeds + _gnews_feeds, keywords=keywords,
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
articles = await rss_parser.search_feeds(title, international=international, tenant_id=tenant_id, keywords=keywords, user_id=user_id)
|
articles = await rss_parser.search_feeds(title, international=international, tenant_id=tenant_id, keywords=keywords, user_id=user_id)
|
||||||
|
|
||||||
|
|||||||
@@ -2,12 +2,96 @@
|
|||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
|
import urllib.parse
|
||||||
from agents.claude_client import call_claude, ClaudeUsage
|
from agents.claude_client import call_claude, ClaudeUsage
|
||||||
from config import CLAUDE_MODEL_FAST
|
from config import CLAUDE_MODEL_FAST
|
||||||
|
|
||||||
logger = logging.getLogger("osint.researcher")
|
logger = logging.getLogger("osint.researcher")
|
||||||
|
|
||||||
|
|
||||||
|
# Google-News-Locale pro ISO-Sprachcode: (hl, gl). ceid wird daraus gebaut.
|
||||||
|
_GNEWS_LOCALE = {
|
||||||
|
"ja": ("ja", "JP"),
|
||||||
|
"de": ("de", "DE"),
|
||||||
|
"en": ("en-US", "US"),
|
||||||
|
"ru": ("ru", "RU"),
|
||||||
|
"ko": ("ko", "KR"),
|
||||||
|
"zh": ("zh-CN", "CN"),
|
||||||
|
"fr": ("fr", "FR"),
|
||||||
|
"es": ("es", "ES"),
|
||||||
|
"it": ("it", "IT"),
|
||||||
|
"ar": ("ar", "EG"),
|
||||||
|
"he": ("iw", "IL"),
|
||||||
|
"fa": ("fa", "IR"),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def build_news_search_feeds(
|
||||||
|
keywords_by_lang: dict | list | None,
|
||||||
|
languages: list[str],
|
||||||
|
max_keywords: int = 4,
|
||||||
|
) -> list[dict]:
|
||||||
|
"""Baut dynamische Google-News-Volltext-Such-Feeds pro Sprache.
|
||||||
|
|
||||||
|
Statt nur feste site:-RSS-Feeds zu durchsuchen, erzeugt diese Funktion pro
|
||||||
|
Sprache einen Google-News-Suchfeed (news.google.com/rss/search?q=...). Damit
|
||||||
|
erreicht die Pipeline auch Quellen, die in keinem festen Feed stehen
|
||||||
|
(Security-Vendor-Blogs, Fachportale, Regionalmedien). Der Recall steigt
|
||||||
|
massiv; die Precision bleibt, weil der nachgelagerte Topic-Filter unveraendert
|
||||||
|
greift.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
keywords_by_lang: Sprach-Dict {iso: [keyword,...]} aus der Keyword-Extraktion.
|
||||||
|
languages: ISO-Codes, fuer die ein Suchfeed gebaut werden soll.
|
||||||
|
max_keywords: wie viele (spezifischste) Keywords in die Such-Query gehen.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Liste von Feed-Config-Dicts (kompatibel mit RSSParser._fetch_feed).
|
||||||
|
"""
|
||||||
|
if not keywords_by_lang or not isinstance(keywords_by_lang, dict):
|
||||||
|
return []
|
||||||
|
|
||||||
|
feeds: list[dict] = []
|
||||||
|
seen_queries: set[str] = set()
|
||||||
|
for lang in languages:
|
||||||
|
lang_key = (lang or "").lower().strip()
|
||||||
|
locale = _GNEWS_LOCALE.get(lang_key)
|
||||||
|
if not locale:
|
||||||
|
continue
|
||||||
|
kws = keywords_by_lang.get(lang_key) or []
|
||||||
|
# Fallback: wenn fuer die Sprache keine Keywords da sind, "en" nehmen
|
||||||
|
# (lateinische Eigennamen matchen auch in fremdsprachigen News-Indizes).
|
||||||
|
if not kws and lang_key != "en":
|
||||||
|
kws = keywords_by_lang.get("en") or []
|
||||||
|
kws = [str(k).strip() for k in kws if str(k).strip()]
|
||||||
|
if not kws:
|
||||||
|
continue
|
||||||
|
query = " ".join(kws[:max_keywords])
|
||||||
|
if not query or query in seen_queries:
|
||||||
|
continue
|
||||||
|
seen_queries.add(query)
|
||||||
|
|
||||||
|
hl, gl = locale
|
||||||
|
ceid_lang = hl.split("-")[0]
|
||||||
|
url = (
|
||||||
|
"https://news.google.com/rss/search?q="
|
||||||
|
+ urllib.parse.quote(query)
|
||||||
|
+ f"&hl={hl}&gl={gl}&ceid={gl}:{ceid_lang}"
|
||||||
|
)
|
||||||
|
feeds.append({
|
||||||
|
"name": f"Google News Suche ({lang_key}): {query}",
|
||||||
|
"url": url,
|
||||||
|
# Eigene Domain-Gruppe, damit der Domain-Cap die Such-Feeds NICHT mit
|
||||||
|
# den site:-Google-News-Feeds in einen Topf wirft.
|
||||||
|
"domain": f"google-news-search-{lang_key}",
|
||||||
|
"primary_language": lang_key,
|
||||||
|
"category": "international",
|
||||||
|
"media_type": "",
|
||||||
|
})
|
||||||
|
logger.info("Google-News-Suchfeed (%s): q=%r", lang_key, query)
|
||||||
|
return feeds
|
||||||
|
|
||||||
|
|
||||||
class ResearcherParseError(Exception):
|
class ResearcherParseError(Exception):
|
||||||
"""Claude hat eine nicht-leere Antwort geliefert, aus der kein JSON extrahiert werden konnte."""
|
"""Claude hat eine nicht-leere Antwort geliefert, aus der kein JSON extrahiert werden konnte."""
|
||||||
|
|
||||||
|
|||||||
@@ -6,6 +6,11 @@ import httpx
|
|||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
from config import TIMEZONE, MAX_ARTICLES_PER_DOMAIN_RSS
|
from config import TIMEZONE, MAX_ARTICLES_PER_DOMAIN_RSS
|
||||||
from source_rules import _extract_domain
|
from source_rules import _extract_domain
|
||||||
|
|
||||||
|
# Cap for dynamic Google News search feeds - higher than the normal per-domain
|
||||||
|
# cap, because a search feed is built specifically for broad recall. The
|
||||||
|
# topic filter decides about precision afterwards.
|
||||||
|
MAX_ARTICLES_PER_DOMAIN_RSS_SEARCH = 25
|
||||||
from feeds.transcript_extractors._common import html_to_text
|
from feeds.transcript_extractors._common import html_to_text
|
||||||
from services.post_refresh_qc import normalize_german_umlauts
|
from services.post_refresh_qc import normalize_german_umlauts
|
||||||
from agents.researcher import keywords_for_language, flatten_keywords
|
from agents.researcher import keywords_for_language, flatten_keywords
|
||||||
@@ -276,10 +281,15 @@ class RSSParser:
|
|||||||
for domain, domain_articles in by_domain.items():
|
for domain, domain_articles in by_domain.items():
|
||||||
# Nach Relevanz sortieren (beste zuerst)
|
# Nach Relevanz sortieren (beste zuerst)
|
||||||
domain_articles.sort(key=lambda a: a.get("relevance_score", 0), reverse=True)
|
domain_articles.sort(key=lambda a: a.get("relevance_score", 0), reverse=True)
|
||||||
kept = domain_articles[:MAX_ARTICLES_PER_DOMAIN_RSS]
|
# Dynamische Google-News-Suchfeeds ("google-news-search-<lang>") sind
|
||||||
if len(domain_articles) > MAX_ARTICLES_PER_DOMAIN_RSS:
|
# der Recall-Treiber und bekommen einen hoeheren Cap als feste Feeds.
|
||||||
|
cap = (MAX_ARTICLES_PER_DOMAIN_RSS_SEARCH
|
||||||
|
if domain.startswith("google-news-search-")
|
||||||
|
else MAX_ARTICLES_PER_DOMAIN_RSS)
|
||||||
|
kept = domain_articles[:cap]
|
||||||
|
if len(domain_articles) > cap:
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Domain-Cap: {domain} von {len(domain_articles)} auf {MAX_ARTICLES_PER_DOMAIN_RSS} Artikel begrenzt"
|
f"Domain-Cap: {domain} von {len(domain_articles)} auf {cap} Artikel begrenzt"
|
||||||
)
|
)
|
||||||
capped.extend(kept)
|
capped.extend(kept)
|
||||||
|
|
||||||
|
|||||||
In neuem Issue referenzieren
Einen Benutzer sperren