Recency: Frische-Suchfeed (when:14d) + Aktualitaets-Score #36
@@ -932,11 +932,21 @@ class AgentOrchestrator:
|
||||
_gnews_langs = list(source_lang_whitelist)
|
||||
else:
|
||||
_gnews_langs = list({output_language_iso, research_language_iso})
|
||||
# Zwei Sets: ein Kontext-Feed (alle Zeiten) + ein Frische-Feed
|
||||
# (when:14d). Der Frische-Feed garantiert, dass das aktuelle
|
||||
# Bild eingefangen wird, auch wenn aeltere Artikel relevanter
|
||||
# ranken. Beide laufen durch dieselbe Pipeline; Dedup entfernt
|
||||
# Ueberschneidungen.
|
||||
_gnews_feeds = build_news_search_feeds(keywords, _gnews_langs)
|
||||
if _gnews_feeds:
|
||||
logger.info(f"Google-News-Suchfeeds ergaenzt: {len(_gnews_feeds)}")
|
||||
_gnews_recent = build_news_search_feeds(keywords, _gnews_langs, recency_days=14)
|
||||
_all_gnews = _gnews_feeds + _gnews_recent
|
||||
if _all_gnews:
|
||||
logger.info(
|
||||
f"Google-News-Suchfeeds ergaenzt: {len(_gnews_feeds)} Kontext "
|
||||
f"+ {len(_gnews_recent)} Frische (when:14d)"
|
||||
)
|
||||
articles = await rss_parser.search_feeds_selective(
|
||||
title, selected_feeds + _gnews_feeds, keywords=keywords,
|
||||
title, selected_feeds + _all_gnews, keywords=keywords,
|
||||
)
|
||||
else:
|
||||
articles = await rss_parser.search_feeds(title, international=international, tenant_id=tenant_id, keywords=keywords, user_id=user_id)
|
||||
|
||||
@@ -30,6 +30,7 @@ def build_news_search_feeds(
|
||||
keywords_by_lang: dict | list | None,
|
||||
languages: list[str],
|
||||
max_keywords: int = 4,
|
||||
recency_days: int | None = None,
|
||||
) -> list[dict]:
|
||||
"""Baut dynamische Google-News-Volltext-Such-Feeds pro Sprache.
|
||||
|
||||
@@ -44,6 +45,9 @@ def build_news_search_feeds(
|
||||
keywords_by_lang: Sprach-Dict {iso: [keyword,...]} aus der Keyword-Extraktion.
|
||||
languages: ISO-Codes, fuer die ein Suchfeed gebaut werden soll.
|
||||
max_keywords: wie viele (spezifischste) Keywords in die Such-Query gehen.
|
||||
recency_days: wenn gesetzt, wird der Google-News-Operator "when:Nd" an die
|
||||
Query gehaengt — der Feed liefert dann nur Artikel der letzten N Tage.
|
||||
Fuer "Frische-Suchfeeds", die das aktuelle Bild garantiert einfangen.
|
||||
|
||||
Returns:
|
||||
Liste von Feed-Config-Dicts (kompatibel mit RSSParser._fetch_feed).
|
||||
@@ -88,28 +92,38 @@ def build_news_search_feeds(
|
||||
if not deduped:
|
||||
continue
|
||||
query = " ".join(deduped)
|
||||
if not query or query in seen_queries:
|
||||
# when:Nd-Operator anhaengen (Google-News-Zeitfilter)
|
||||
effective_query = query
|
||||
if recency_days and recency_days > 0:
|
||||
effective_query = f"{query} when:{recency_days}d"
|
||||
if not effective_query or effective_query in seen_queries:
|
||||
continue
|
||||
seen_queries.add(query)
|
||||
seen_queries.add(effective_query)
|
||||
|
||||
hl, gl = locale
|
||||
ceid_lang = hl.split("-")[0]
|
||||
url = (
|
||||
"https://news.google.com/rss/search?q="
|
||||
+ urllib.parse.quote(query)
|
||||
+ urllib.parse.quote(effective_query)
|
||||
+ f"&hl={hl}&gl={gl}&ceid={gl}:{ceid_lang}"
|
||||
)
|
||||
if recency_days and recency_days > 0:
|
||||
name = f"Google News Suche ({lang_key}, letzte {recency_days}d): {query}"
|
||||
domain = f"google-news-search-{lang_key}-recent"
|
||||
else:
|
||||
name = f"Google News Suche ({lang_key}): {query}"
|
||||
domain = f"google-news-search-{lang_key}"
|
||||
feeds.append({
|
||||
"name": f"Google News Suche ({lang_key}): {query}",
|
||||
"name": name,
|
||||
"url": url,
|
||||
# Eigene Domain-Gruppe, damit der Domain-Cap die Such-Feeds NICHT mit
|
||||
# den site:-Google-News-Feeds in einen Topf wirft.
|
||||
"domain": f"google-news-search-{lang_key}",
|
||||
"domain": domain,
|
||||
"primary_language": lang_key,
|
||||
"category": "international",
|
||||
"media_type": "",
|
||||
})
|
||||
logger.info("Google-News-Suchfeed (%s): q=%r", lang_key, query)
|
||||
logger.info("Google-News-Suchfeed (%s): q=%r", lang_key, effective_query)
|
||||
return feeds
|
||||
|
||||
|
||||
|
||||
@@ -218,14 +218,33 @@ class RSSParser:
|
||||
|
||||
if match_count >= min_matches:
|
||||
published = None
|
||||
published_dt = None
|
||||
if hasattr(entry, "published_parsed") and entry.published_parsed:
|
||||
try:
|
||||
published = datetime(*entry.published_parsed[:6], tzinfo=timezone.utc).astimezone(TIMEZONE).isoformat()
|
||||
published_dt = datetime(*entry.published_parsed[:6], tzinfo=timezone.utc)
|
||||
published = published_dt.astimezone(TIMEZONE).isoformat()
|
||||
except (TypeError, ValueError):
|
||||
pass
|
||||
|
||||
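# Relevance score: base value is the share of matched search words (nominally 0.0-1.0); the recency bonus/penalty applied below (+0.35 down to -0.30) can push the final value outside that range.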
|
||||
relevance_score = match_count / len(search_words) if search_words else 0.0
|
||||
# Aktualitaets-Bonus/Malus: frische Artikel sollen den
|
||||
# Domain-Cap (sortiert nach relevance_score) ueberleben und
|
||||
# nicht von Monate alten verdraengt werden. Damit faengt die
|
||||
# Pipeline das aktuelle Bild ein. Nur adhoc-Pfad — research
|
||||
# nutzt diesen Code nicht.
|
||||
if published_dt is not None:
|
||||
age_days = (datetime.now(timezone.utc) - published_dt).days
|
||||
if age_days <= 3:
|
||||
relevance_score += 0.35
|
||||
elif age_days <= 14:
|
||||
relevance_score += 0.20
|
||||
elif age_days <= 60:
|
||||
relevance_score += 0.05
|
||||
elif age_days > 365:
|
||||
relevance_score -= 0.30
|
||||
elif age_days > 180:
|
||||
relevance_score -= 0.15
|
||||
|
||||
# Bei Google-News-Feeds: echten Publisher aus <source>-Tag holen
|
||||
article_source = name
|
||||
|
||||
In neuem Issue referenzieren
Einen Benutzer sperren