Recency Frische-Suchfeed (#36)
This commit was merged in pull request #36.
Dieser Commit ist enthalten in:
@@ -932,11 +932,21 @@ class AgentOrchestrator:
|
|||||||
_gnews_langs = list(source_lang_whitelist)
|
_gnews_langs = list(source_lang_whitelist)
|
||||||
else:
|
else:
|
||||||
_gnews_langs = list({output_language_iso, research_language_iso})
|
_gnews_langs = list({output_language_iso, research_language_iso})
|
||||||
|
# Zwei Sets: ein Kontext-Feed (alle Zeiten) + ein Frische-Feed
|
||||||
|
# (when:14d). Der Frische-Feed garantiert, dass das aktuelle
|
||||||
|
# Bild eingefangen wird, auch wenn aeltere Artikel relevanter
|
||||||
|
# ranken. Beide laufen durch dieselbe Pipeline; Dedup entfernt
|
||||||
|
# Ueberschneidungen.
|
||||||
_gnews_feeds = build_news_search_feeds(keywords, _gnews_langs)
|
_gnews_feeds = build_news_search_feeds(keywords, _gnews_langs)
|
||||||
if _gnews_feeds:
|
_gnews_recent = build_news_search_feeds(keywords, _gnews_langs, recency_days=14)
|
||||||
logger.info(f"Google-News-Suchfeeds ergaenzt: {len(_gnews_feeds)}")
|
_all_gnews = _gnews_feeds + _gnews_recent
|
||||||
|
if _all_gnews:
|
||||||
|
logger.info(
|
||||||
|
f"Google-News-Suchfeeds ergaenzt: {len(_gnews_feeds)} Kontext "
|
||||||
|
f"+ {len(_gnews_recent)} Frische (when:14d)"
|
||||||
|
)
|
||||||
articles = await rss_parser.search_feeds_selective(
|
articles = await rss_parser.search_feeds_selective(
|
||||||
title, selected_feeds + _gnews_feeds, keywords=keywords,
|
title, selected_feeds + _all_gnews, keywords=keywords,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
articles = await rss_parser.search_feeds(title, international=international, tenant_id=tenant_id, keywords=keywords, user_id=user_id)
|
articles = await rss_parser.search_feeds(title, international=international, tenant_id=tenant_id, keywords=keywords, user_id=user_id)
|
||||||
|
|||||||
@@ -30,6 +30,7 @@ def build_news_search_feeds(
|
|||||||
keywords_by_lang: dict | list | None,
|
keywords_by_lang: dict | list | None,
|
||||||
languages: list[str],
|
languages: list[str],
|
||||||
max_keywords: int = 4,
|
max_keywords: int = 4,
|
||||||
|
recency_days: int | None = None,
|
||||||
) -> list[dict]:
|
) -> list[dict]:
|
||||||
"""Baut dynamische Google-News-Volltext-Such-Feeds pro Sprache.
|
"""Baut dynamische Google-News-Volltext-Such-Feeds pro Sprache.
|
||||||
|
|
||||||
@@ -44,6 +45,9 @@ def build_news_search_feeds(
|
|||||||
keywords_by_lang: Sprach-Dict {iso: [keyword,...]} aus der Keyword-Extraktion.
|
keywords_by_lang: Sprach-Dict {iso: [keyword,...]} aus der Keyword-Extraktion.
|
||||||
languages: ISO-Codes, fuer die ein Suchfeed gebaut werden soll.
|
languages: ISO-Codes, fuer die ein Suchfeed gebaut werden soll.
|
||||||
max_keywords: wie viele (spezifischste) Keywords in die Such-Query gehen.
|
max_keywords: wie viele (spezifischste) Keywords in die Such-Query gehen.
|
||||||
|
recency_days: wenn gesetzt, wird der Google-News-Operator "when:Nd" an die
|
||||||
|
Query gehaengt — der Feed liefert dann nur Artikel der letzten N Tage.
|
||||||
|
Fuer "Frische-Suchfeeds", die das aktuelle Bild garantiert einfangen.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Liste von Feed-Config-Dicts (kompatibel mit RSSParser._fetch_feed).
|
Liste von Feed-Config-Dicts (kompatibel mit RSSParser._fetch_feed).
|
||||||
@@ -88,28 +92,38 @@ def build_news_search_feeds(
|
|||||||
if not deduped:
|
if not deduped:
|
||||||
continue
|
continue
|
||||||
query = " ".join(deduped)
|
query = " ".join(deduped)
|
||||||
if not query or query in seen_queries:
|
# when:Nd-Operator anhaengen (Google-News-Zeitfilter)
|
||||||
|
effective_query = query
|
||||||
|
if recency_days and recency_days > 0:
|
||||||
|
effective_query = f"{query} when:{recency_days}d"
|
||||||
|
if not effective_query or effective_query in seen_queries:
|
||||||
continue
|
continue
|
||||||
seen_queries.add(query)
|
seen_queries.add(effective_query)
|
||||||
|
|
||||||
hl, gl = locale
|
hl, gl = locale
|
||||||
ceid_lang = hl.split("-")[0]
|
ceid_lang = hl.split("-")[0]
|
||||||
url = (
|
url = (
|
||||||
"https://news.google.com/rss/search?q="
|
"https://news.google.com/rss/search?q="
|
||||||
+ urllib.parse.quote(query)
|
+ urllib.parse.quote(effective_query)
|
||||||
+ f"&hl={hl}&gl={gl}&ceid={gl}:{ceid_lang}"
|
+ f"&hl={hl}&gl={gl}&ceid={gl}:{ceid_lang}"
|
||||||
)
|
)
|
||||||
|
if recency_days and recency_days > 0:
|
||||||
|
name = f"Google News Suche ({lang_key}, letzte {recency_days}d): {query}"
|
||||||
|
domain = f"google-news-search-{lang_key}-recent"
|
||||||
|
else:
|
||||||
|
name = f"Google News Suche ({lang_key}): {query}"
|
||||||
|
domain = f"google-news-search-{lang_key}"
|
||||||
feeds.append({
|
feeds.append({
|
||||||
"name": f"Google News Suche ({lang_key}): {query}",
|
"name": name,
|
||||||
"url": url,
|
"url": url,
|
||||||
# Eigene Domain-Gruppe, damit der Domain-Cap die Such-Feeds NICHT mit
|
# Eigene Domain-Gruppe, damit der Domain-Cap die Such-Feeds NICHT mit
|
||||||
# den site:-Google-News-Feeds in einen Topf wirft.
|
# den site:-Google-News-Feeds in einen Topf wirft.
|
||||||
"domain": f"google-news-search-{lang_key}",
|
"domain": domain,
|
||||||
"primary_language": lang_key,
|
"primary_language": lang_key,
|
||||||
"category": "international",
|
"category": "international",
|
||||||
"media_type": "",
|
"media_type": "",
|
||||||
})
|
})
|
||||||
logger.info("Google-News-Suchfeed (%s): q=%r", lang_key, query)
|
logger.info("Google-News-Suchfeed (%s): q=%r", lang_key, effective_query)
|
||||||
return feeds
|
return feeds
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -218,14 +218,33 @@ class RSSParser:
|
|||||||
|
|
||||||
if match_count >= min_matches:
|
if match_count >= min_matches:
|
||||||
published = None
|
published = None
|
||||||
|
published_dt = None
|
||||||
if hasattr(entry, "published_parsed") and entry.published_parsed:
|
if hasattr(entry, "published_parsed") and entry.published_parsed:
|
||||||
try:
|
try:
|
||||||
published = datetime(*entry.published_parsed[:6], tzinfo=timezone.utc).astimezone(TIMEZONE).isoformat()
|
published_dt = datetime(*entry.published_parsed[:6], tzinfo=timezone.utc)
|
||||||
|
published = published_dt.astimezone(TIMEZONE).isoformat()
|
||||||
except (TypeError, ValueError):
|
except (TypeError, ValueError):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# Relevanz-Score: Anteil der gematchten Suchworte (0.0-1.0)
|
# Relevanz-Score: Anteil der gematchten Suchworte (0.0-1.0)
|
||||||
relevance_score = match_count / len(search_words) if search_words else 0.0
|
relevance_score = match_count / len(search_words) if search_words else 0.0
|
||||||
|
# Aktualitaets-Bonus/Malus: frische Artikel sollen den
|
||||||
|
# Domain-Cap (sortiert nach relevance_score) ueberleben und
|
||||||
|
# nicht von Monate alten verdraengt werden. Damit faengt die
|
||||||
|
# Pipeline das aktuelle Bild ein. Nur adhoc-Pfad — research
|
||||||
|
# nutzt diesen Code nicht.
|
||||||
|
if published_dt is not None:
|
||||||
|
age_days = (datetime.now(timezone.utc) - published_dt).days
|
||||||
|
if age_days <= 3:
|
||||||
|
relevance_score += 0.35
|
||||||
|
elif age_days <= 14:
|
||||||
|
relevance_score += 0.20
|
||||||
|
elif age_days <= 60:
|
||||||
|
relevance_score += 0.05
|
||||||
|
elif age_days > 365:
|
||||||
|
relevance_score -= 0.30
|
||||||
|
elif age_days > 180:
|
||||||
|
relevance_score -= 0.15
|
||||||
|
|
||||||
# Bei Google-News-Feeds: echten Publisher aus <source>-Tag holen
|
# Bei Google-News-Feeds: echten Publisher aus <source>-Tag holen
|
||||||
article_source = name
|
article_source = name
|
||||||
|
|||||||
In neuem Issue referenzieren
Einen Benutzer sperren