diff --git a/src/database.py b/src/database.py index ecbb33d..8f6c4c1 100644 --- a/src/database.py +++ b/src/database.py @@ -181,7 +181,8 @@ CREATE TABLE IF NOT EXISTS sources ( eu_disinfo_case_count INTEGER DEFAULT 0, eu_disinfo_last_seen TIMESTAMP, ifcn_signatory INTEGER DEFAULT 0, - external_data_synced_at TIMESTAMP + external_data_synced_at TIMESTAMP, + primary_language TEXT ); CREATE TABLE IF NOT EXISTS source_alignments ( @@ -817,6 +818,42 @@ async def init_db(): """) await db.commit() + # Migration: sources.primary_language (ISO-2-Sprachcode aus Freitext-Feld 'language') + cursor = await db.execute("PRAGMA table_info(sources)") + sources_columns = [row[1] for row in await cursor.fetchall()] + if "primary_language" not in sources_columns: + await db.execute("ALTER TABLE sources ADD COLUMN primary_language TEXT") + await db.commit() + logger.info("Migration: primary_language zu sources hinzugefuegt") + + # Backfill: aus Freitext-Feld 'language' (z.B. 'Deutsch', 'Hebraeisch/Englisch') + # die erste Sprache als ISO-Code uebernehmen. Nur fuer Quellen mit NULL primary_language. + _LANGUAGE_LOOKUP = { + "Deutsch": "de", "Englisch": "en", "Russisch": "ru", "Ukrainisch": "uk", + "Arabisch": "ar", "Hebraeisch": "he", "Hebräisch": "he", + "Farsi": "fa", "Japanisch": "ja", "Kurdisch": "ku", "Malaiisch": "ms", + } + cursor = await db.execute( + "SELECT id, language FROM sources WHERE primary_language IS NULL" + ) + rows = await cursor.fetchall() + backfilled = 0 + for row in rows: + sid = row[0] + lang = row[1] + iso = "de" # Default fuer NULL oder unbekannt + if lang: + first = lang.split("/")[0].strip() + iso = _LANGUAGE_LOOKUP.get(first, "de") + await db.execute( + "UPDATE sources SET primary_language = ? WHERE id = ?", + (iso, sid), + ) + backfilled += 1 + if backfilled: + await db.commit() + logger.info("Migration: primary_language Backfill fuer %d Quellen", backfilled) + # Verwaiste running-Eintraege beim Start als error markieren (aelter als 15 Min) await db.execute( """UPDATE refresh_log SET status = 'error', error_message = 'Verwaist beim Neustart', diff --git a/src/feeds/rss_parser.py b/src/feeds/rss_parser.py index 2e65d13..071ae0d 100644 --- a/src/feeds/rss_parser.py +++ b/src/feeds/rss_parser.py @@ -33,7 +33,7 @@ class RSSParser: Args: search_term: Suchbegriff - international: Wenn False, nur deutsche Feeds + Behoerden (keine internationalen) + international: Wenn False, nur Feeds in der Org-Sprache + Behoerden (keine internationalen) tenant_id: Optionale Org-ID fuer tenant-spezifische Quellen keywords: Optionale Claude-generierte Keywords (bevorzugt gegenüber Title-Split) """ @@ -84,7 +84,7 @@ class RSSParser: continue all_articles.extend(result) - cat_info = "alle" if international else "nur deutsch + behörden" + cat_info = "alle" if international else "nur primary + behörden" logger.info(f"RSS-Suche nach '{search_term}' ({cat_info}): {len(all_articles)} Treffer") all_articles = self._apply_domain_cap(all_articles) return all_articles diff --git a/src/source_rules.py b/src/source_rules.py index 24826b0..e66f638 100644 --- a/src/source_rules.py +++ b/src/source_rules.py @@ -692,12 +692,24 @@ async def get_source_rules(tenant_id: int = None) -> dict: Returns: dict mit: - excluded_domains: Liste ausgeschlossener Domains - - rss_feeds: Dict mit Kategorien deutsch/international/behoerden + - rss_feeds: Dict mit Kategorien primary/international/behoerden, wobei + 'primary' diejenigen Feeds enthaelt, deren primary_language der + Ausgabesprache der Org entspricht. Andere Sprachen wandern in + 'international'. Bei tenant_id=None wird die Org-Sprache 'de' angenommen. """ from database import get_db + from services.org_settings import get_org_language db = await get_db() try: + # Ausgabesprache der Org bestimmen (Default 'de') + org_lang_iso = "de" + if tenant_id: + try: + org_lang_iso = await get_org_language(db, tenant_id) + except Exception as e: + logger.warning("Konnte Org-Sprache nicht laden, default 'de': %s", e) + if tenant_id: cursor = await db.execute( "SELECT * FROM sources WHERE status = 'active' AND (tenant_id IS NULL OR tenant_id = ?)", @@ -710,7 +722,7 @@ async def get_source_rules(tenant_id: int = None) -> dict: sources = [dict(row) for row in await cursor.fetchall()] excluded_domains = [] - rss_feeds = {"deutsch": [], "international": [], "behoerden": []} + rss_feeds = {"primary": [], "international": [], "behoerden": []} for source in sources: if source["source_type"] == "excluded": @@ -718,13 +730,16 @@ async def get_source_rules(tenant_id: int = None) -> dict: elif source["source_type"] == "rss_feed" and source["url"]: feed_entry = {"name": source["name"], "url": source["url"]} cat = source["category"] + src_lang = source.get("primary_language") or "de" if cat == "behoerde": rss_feeds["behoerden"].append(feed_entry) - elif cat == "international": - rss_feeds["international"].append(feed_entry) + elif src_lang == org_lang_iso: + # Feed-Sprache entspricht Org-Sprache -> primary + rss_feeds["primary"].append(feed_entry) else: - # Alle anderen Kategorien → deutsch - rss_feeds["deutsch"].append(feed_entry) + # Andere Sprache -> international (wird nur bei + # 'international'-Lagen verwendet) + rss_feeds["international"].append(feed_entry) return { "excluded_domains": excluded_domains,