feat(sources): primary_language Spalte + ISO-Backfill + org-relativer Feed-Bucket
- Neue Spalte sources.primary_language (zweistelliger ISO-Sprachcode) mit Backfill aus dem Freitext-Feld language (erste Sprache vor dem "/"-Trenner; NULL oder unbekannte Werte werden zu de). Edge-Cases wie Iran Military Magazine (English) [Farsi/Arabisch] landen als fa und koennen ueber das Verwaltungsportal manuell korrigiert werden.
- get_source_rules(tenant_id) bestimmt die Org-Sprache und sortiert Feeds in die Buckets primary (= Org-Sprache), international (= alle anderen Sprachen) und behoerden (Kategorie behoerde, unabhaengig von der Sprache). Bei tenant_id=None oder Helper-Fehler gilt der Default de.
- rss_parser.search_feeds bleibt in der Logik unveraendert (international=False laesst weiterhin alle Buckets ausser international durch); nur die Kommentare sind generischer formuliert.
Phase 3 von 8 (eng_demo / Org-Sprache).
Dieser Commit ist enthalten in:
@@ -181,7 +181,8 @@ CREATE TABLE IF NOT EXISTS sources (
|
|||||||
eu_disinfo_case_count INTEGER DEFAULT 0,
|
eu_disinfo_case_count INTEGER DEFAULT 0,
|
||||||
eu_disinfo_last_seen TIMESTAMP,
|
eu_disinfo_last_seen TIMESTAMP,
|
||||||
ifcn_signatory INTEGER DEFAULT 0,
|
ifcn_signatory INTEGER DEFAULT 0,
|
||||||
external_data_synced_at TIMESTAMP
|
external_data_synced_at TIMESTAMP,
|
||||||
|
primary_language TEXT
|
||||||
);
|
);
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS source_alignments (
|
CREATE TABLE IF NOT EXISTS source_alignments (
|
||||||
@@ -817,6 +818,42 @@ async def init_db():
|
|||||||
""")
|
""")
|
||||||
await db.commit()
|
await db.commit()
|
||||||
|
|
||||||
|
# Migration: sources.primary_language (ISO-2-Sprachcode aus Freitext-Feld 'language')
|
||||||
|
cursor = await db.execute("PRAGMA table_info(sources)")
|
||||||
|
sources_columns = [row[1] for row in await cursor.fetchall()]
|
||||||
|
if "primary_language" not in sources_columns:
|
||||||
|
await db.execute("ALTER TABLE sources ADD COLUMN primary_language TEXT")
|
||||||
|
await db.commit()
|
||||||
|
logger.info("Migration: primary_language zu sources hinzugefuegt")
|
||||||
|
|
||||||
|
# Backfill: aus Freitext-Feld 'language' (z.B. 'Deutsch', 'Hebraeisch/Englisch')
|
||||||
|
# die erste Sprache als ISO-Code uebernehmen. Nur fuer Quellen mit NULL primary_language.
|
||||||
|
_LANGUAGE_LOOKUP = {
|
||||||
|
"Deutsch": "de", "Englisch": "en", "Russisch": "ru", "Ukrainisch": "uk",
|
||||||
|
"Arabisch": "ar", "Hebraeisch": "he", "Hebräisch": "he",
|
||||||
|
"Farsi": "fa", "Japanisch": "ja", "Kurdisch": "ku", "Malaiisch": "ms",
|
||||||
|
}
|
||||||
|
cursor = await db.execute(
|
||||||
|
"SELECT id, language FROM sources WHERE primary_language IS NULL"
|
||||||
|
)
|
||||||
|
rows = await cursor.fetchall()
|
||||||
|
backfilled = 0
|
||||||
|
for row in rows:
|
||||||
|
sid = row[0]
|
||||||
|
lang = row[1]
|
||||||
|
iso = "de" # Default fuer NULL oder unbekannt
|
||||||
|
if lang:
|
||||||
|
first = lang.split("/")[0].strip()
|
||||||
|
iso = _LANGUAGE_LOOKUP.get(first, "de")
|
||||||
|
await db.execute(
|
||||||
|
"UPDATE sources SET primary_language = ? WHERE id = ?",
|
||||||
|
(iso, sid),
|
||||||
|
)
|
||||||
|
backfilled += 1
|
||||||
|
if backfilled:
|
||||||
|
await db.commit()
|
||||||
|
logger.info("Migration: primary_language Backfill fuer %d Quellen", backfilled)
|
||||||
|
|
||||||
# Verwaiste running-Eintraege beim Start als error markieren (aelter als 15 Min)
|
# Verwaiste running-Eintraege beim Start als error markieren (aelter als 15 Min)
|
||||||
await db.execute(
|
await db.execute(
|
||||||
"""UPDATE refresh_log SET status = 'error', error_message = 'Verwaist beim Neustart',
|
"""UPDATE refresh_log SET status = 'error', error_message = 'Verwaist beim Neustart',
|
||||||
|
|||||||
@@ -33,7 +33,7 @@ class RSSParser:
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
search_term: Suchbegriff
|
search_term: Suchbegriff
|
||||||
international: Wenn False, nur deutsche Feeds + Behoerden (keine internationalen)
|
international: Wenn False, nur Feeds in der Org-Sprache + Behoerden (keine internationalen)
|
||||||
tenant_id: Optionale Org-ID fuer tenant-spezifische Quellen
|
tenant_id: Optionale Org-ID fuer tenant-spezifische Quellen
|
||||||
keywords: Optionale Claude-generierte Keywords (bevorzugt gegenüber Title-Split)
|
keywords: Optionale Claude-generierte Keywords (bevorzugt gegenüber Title-Split)
|
||||||
"""
|
"""
|
||||||
@@ -84,7 +84,7 @@ class RSSParser:
|
|||||||
continue
|
continue
|
||||||
all_articles.extend(result)
|
all_articles.extend(result)
|
||||||
|
|
||||||
cat_info = "alle" if international else "nur deutsch + behörden"
|
cat_info = "alle" if international else "nur primary + behörden"
|
||||||
logger.info(f"RSS-Suche nach '{search_term}' ({cat_info}): {len(all_articles)} Treffer")
|
logger.info(f"RSS-Suche nach '{search_term}' ({cat_info}): {len(all_articles)} Treffer")
|
||||||
all_articles = self._apply_domain_cap(all_articles)
|
all_articles = self._apply_domain_cap(all_articles)
|
||||||
return all_articles
|
return all_articles
|
||||||
|
|||||||
@@ -692,12 +692,24 @@ async def get_source_rules(tenant_id: int = None) -> dict:
|
|||||||
Returns:
|
Returns:
|
||||||
dict mit:
|
dict mit:
|
||||||
- excluded_domains: Liste ausgeschlossener Domains
|
- excluded_domains: Liste ausgeschlossener Domains
|
||||||
- rss_feeds: Dict mit Kategorien deutsch/international/behoerden
|
- rss_feeds: Dict mit Kategorien primary/international/behoerden, wobei
|
||||||
|
'primary' diejenigen Feeds enthaelt, deren primary_language der
|
||||||
|
Ausgabesprache der Org entspricht. Andere Sprachen wandern in
|
||||||
|
'international'. Bei tenant_id=None wird die Org-Sprache 'de' angenommen.
|
||||||
"""
|
"""
|
||||||
from database import get_db
|
from database import get_db
|
||||||
|
from services.org_settings import get_org_language
|
||||||
|
|
||||||
db = await get_db()
|
db = await get_db()
|
||||||
try:
|
try:
|
||||||
|
# Ausgabesprache der Org bestimmen (Default 'de')
|
||||||
|
org_lang_iso = "de"
|
||||||
|
if tenant_id:
|
||||||
|
try:
|
||||||
|
org_lang_iso = await get_org_language(db, tenant_id)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("Konnte Org-Sprache nicht laden, default 'de': %s", e)
|
||||||
|
|
||||||
if tenant_id:
|
if tenant_id:
|
||||||
cursor = await db.execute(
|
cursor = await db.execute(
|
||||||
"SELECT * FROM sources WHERE status = 'active' AND (tenant_id IS NULL OR tenant_id = ?)",
|
"SELECT * FROM sources WHERE status = 'active' AND (tenant_id IS NULL OR tenant_id = ?)",
|
||||||
@@ -710,7 +722,7 @@ async def get_source_rules(tenant_id: int = None) -> dict:
|
|||||||
sources = [dict(row) for row in await cursor.fetchall()]
|
sources = [dict(row) for row in await cursor.fetchall()]
|
||||||
|
|
||||||
excluded_domains = []
|
excluded_domains = []
|
||||||
rss_feeds = {"deutsch": [], "international": [], "behoerden": []}
|
rss_feeds = {"primary": [], "international": [], "behoerden": []}
|
||||||
|
|
||||||
for source in sources:
|
for source in sources:
|
||||||
if source["source_type"] == "excluded":
|
if source["source_type"] == "excluded":
|
||||||
@@ -718,13 +730,16 @@ async def get_source_rules(tenant_id: int = None) -> dict:
|
|||||||
elif source["source_type"] == "rss_feed" and source["url"]:
|
elif source["source_type"] == "rss_feed" and source["url"]:
|
||||||
feed_entry = {"name": source["name"], "url": source["url"]}
|
feed_entry = {"name": source["name"], "url": source["url"]}
|
||||||
cat = source["category"]
|
cat = source["category"]
|
||||||
|
src_lang = source.get("primary_language") or "de"
|
||||||
if cat == "behoerde":
|
if cat == "behoerde":
|
||||||
rss_feeds["behoerden"].append(feed_entry)
|
rss_feeds["behoerden"].append(feed_entry)
|
||||||
elif cat == "international":
|
elif src_lang == org_lang_iso:
|
||||||
rss_feeds["international"].append(feed_entry)
|
# Feed-Sprache entspricht Org-Sprache -> primary
|
||||||
|
rss_feeds["primary"].append(feed_entry)
|
||||||
else:
|
else:
|
||||||
# Alle anderen Kategorien → deutsch
|
# Andere Sprache -> international (wird nur bei
|
||||||
rss_feeds["deutsch"].append(feed_entry)
|
# 'international'-Lagen verwendet)
|
||||||
|
rss_feeds["international"].append(feed_entry)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"excluded_domains": excluded_domains,
|
"excluded_domains": excluded_domains,
|
||||||
|
|||||||
In neuem Issue referenzieren
Einen Benutzer sperren