feat(x): X (Twitter) als Bezugsquelle pro Lage
X-Accounts werden analog zu Telegram als Quelle (source_type=x_account) konfiguriert und pro Lage ueber include_x zugeschaltet. Der Scraper (feeds/x_parser.py, twscrape) liest Account-Timelines, optional ueber einen HTTP-Proxy mit Fallback auf direkten Abruf ueber die Server-IP. - DB-Migration include_x, Pydantic-Modelle, incidents-Router - Orchestrator-X-Pipeline plus Haiku-Account-Vorselektion - sources-Router /x/validate, x_account-Typ in Stats und Frontend - Lage-Einstellungen: X-Toggle neben international und Telegram - twscrape als Abhaengigkeit Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Dieser Commit ist enthalten in:
320
src/feeds/x_parser.py
Normale Datei
320
src/feeds/x_parser.py
Normale Datei
@@ -0,0 +1,320 @@
|
||||
"""X (Twitter) Parser: Liest Posts aus konfigurierten X-Accounts via twscrape.
|
||||
|
||||
Egress laeuft -- wenn X_PROXY_URL gesetzt -- ueber den HTTP-Proxy am RUTX11
|
||||
(Mobilfunk-IP). Faellt der Proxy aus, wird direkt ueber die Server-IP
|
||||
abgerufen (Fallback). Gibt Artikel-Dicts im RSS-/Telegram-kompatiblen Format
|
||||
zurueck.
|
||||
"""
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
from datetime import datetime, timezone, timedelta
|
||||
|
||||
import httpx
|
||||
|
||||
from config import (
|
||||
TIMEZONE, X_ACCOUNTS_DB_PATH, X_PROXY_URL,
|
||||
X_POST_CAP_PER_ACCOUNT, X_RECENCY_DAYS, X_SCRAPER_ENABLED,
|
||||
)
|
||||
|
||||
logger = logging.getLogger("osint.x")
|
||||
|
||||
# Stoppwoerter (gleich wie RSS-/Telegram-Parser)
|
||||
STOP_WORDS = {
|
||||
"und", "oder", "der", "die", "das", "ein", "eine", "in", "im", "am", "an",
|
||||
"auf", "fuer", "mit", "von", "zu", "zum", "zur", "bei", "nach", "vor",
|
||||
"ueber", "unter", "ist", "sind", "hat", "the", "and", "for", "with", "from",
|
||||
}
|
||||
|
||||
|
||||
def _normalize_handle(raw: str) -> str:
|
||||
"""X-Handle aus URL-/@-Form auf den nackten Benutzernamen normalisieren."""
|
||||
h = (raw or "").strip()
|
||||
for prefix in ("https://", "http://"):
|
||||
if h.startswith(prefix):
|
||||
h = h[len(prefix):]
|
||||
for prefix in ("www.", "x.com/", "twitter.com/", "nitter.net/"):
|
||||
if h.startswith(prefix):
|
||||
h = h[len(prefix):]
|
||||
h = h.lstrip("@").strip("/")
|
||||
# Pfad-/Query-Reste abschneiden (z.B. handle/status/123 oder handle?lang=de)
|
||||
for sep in ("/", "?"):
|
||||
if sep in h:
|
||||
h = h.split(sep)[0]
|
||||
return h
|
||||
|
||||
|
||||
class XParser:
|
||||
"""Durchsucht konfigurierte X-Accounts nach relevanten Posts."""
|
||||
|
||||
async def _resolve_proxy(self) -> tuple[str | None, str | None]:
|
||||
"""Proxy-Strategie aufloesen.
|
||||
|
||||
Returns (proxy_url, egress_ip):
|
||||
- X_PROXY_URL leer -> (None, None): direkter Abruf ueber Server-IP.
|
||||
- X_PROXY_URL gesetzt und erreichbar -> (proxy, egress_ip).
|
||||
- X_PROXY_URL gesetzt aber tot -> (None, None): Fallback direkt + Warnung.
|
||||
"""
|
||||
if not X_PROXY_URL:
|
||||
return None, None
|
||||
try:
|
||||
async with httpx.AsyncClient(proxy=X_PROXY_URL, timeout=8.0) as client:
|
||||
resp = await client.get("https://api.ipify.org")
|
||||
resp.raise_for_status()
|
||||
egress_ip = resp.text.strip()
|
||||
logger.info("X-Egress ueber Proxy %s aktiv (IP: %s)", X_PROXY_URL, egress_ip)
|
||||
return X_PROXY_URL, egress_ip
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
"X-Proxy %s nicht erreichbar (%s) -- Fallback auf direkte Server-IP",
|
||||
X_PROXY_URL, e,
|
||||
)
|
||||
return None, None
|
||||
|
||||
async def _get_api(self, proxy: str | None):
|
||||
"""twscrape-API-Objekt erstellen.
|
||||
|
||||
Gibt None zurueck wenn der Account-Store fehlt oder keine
|
||||
nutzbaren Accounts vorhanden sind.
|
||||
"""
|
||||
if not os.path.exists(X_ACCOUNTS_DB_PATH):
|
||||
logger.error("X-Account-Store nicht gefunden: %s", X_ACCOUNTS_DB_PATH)
|
||||
return None
|
||||
try:
|
||||
from twscrape import API
|
||||
except ImportError:
|
||||
logger.error("twscrape nicht installiert: pip install twscrape")
|
||||
return None
|
||||
try:
|
||||
api = API(X_ACCOUNTS_DB_PATH, proxy=proxy)
|
||||
# Account-Pool pruefen -- ohne aktive Accounts liefert twscrape nichts
|
||||
try:
|
||||
accounts = await api.pool.get_all()
|
||||
active = [a for a in accounts if getattr(a, "active", True)]
|
||||
if not accounts:
|
||||
logger.error("X-Account-Pool leer -- keine Accounts konfiguriert")
|
||||
return None
|
||||
if not active:
|
||||
logger.error(
|
||||
"X-Account-Pool: alle %d Accounts inaktiv/gesperrt", len(accounts)
|
||||
)
|
||||
return None
|
||||
logger.info("X-Account-Pool: %d/%d Accounts aktiv", len(active), len(accounts))
|
||||
except Exception as e:
|
||||
# Pool-Status nicht ermittelbar -- trotzdem weiterversuchen
|
||||
logger.debug("X-Account-Pool-Status nicht ermittelbar: %s", e)
|
||||
return api
|
||||
except Exception as e:
|
||||
logger.error("X-API-Initialisierung fehlgeschlagen: %s", e)
|
||||
return None
|
||||
|
||||
async def search_accounts(self, search_term: str, tenant_id: int = None,
|
||||
keywords: dict | list = None,
|
||||
account_ids: list[int] = None) -> list[dict]:
|
||||
"""Liest Posts aus konfigurierten X-Accounts.
|
||||
|
||||
Args:
|
||||
keywords: Sprach-Dict {iso_lang: [keyword,...]} oder flache Liste.
|
||||
Match nutzt pro Account die "en"-Universalbegriffe + die
|
||||
Keywords der Account-Sprache (primary_language aus sources).
|
||||
|
||||
Gibt Artikel-Dicts zurueck (kompatibel mit RSS-/Telegram-Format).
|
||||
"""
|
||||
if not X_SCRAPER_ENABLED:
|
||||
logger.info("X-Scraper deaktiviert (X_SCRAPER_ENABLED=false)")
|
||||
return []
|
||||
|
||||
from agents.researcher import keywords_for_language
|
||||
|
||||
accounts = await self._get_x_accounts(tenant_id, account_ids=account_ids)
|
||||
if not accounts:
|
||||
logger.info("Keine X-Accounts konfiguriert")
|
||||
return []
|
||||
|
||||
proxy, _egress_ip = await self._resolve_proxy()
|
||||
api = await self._get_api(proxy)
|
||||
if not api:
|
||||
logger.warning("X-API nicht verfuegbar, ueberspringe X-Pipeline")
|
||||
return []
|
||||
|
||||
# Fallback-Suchwoerter wenn keine Keywords da sind
|
||||
fallback_words: list[str] | None = None
|
||||
if not keywords:
|
||||
fallback_words = [
|
||||
w for w in search_term.lower().split()
|
||||
if w not in STOP_WORDS and len(w) >= 3
|
||||
]
|
||||
if not fallback_words:
|
||||
fallback_words = search_term.lower().split()[:2]
|
||||
|
||||
cutoff = datetime.now(timezone.utc) - timedelta(days=X_RECENCY_DAYS)
|
||||
|
||||
# Accounts parallel abrufen
|
||||
tasks = []
|
||||
for acc in accounts:
|
||||
handle = _normalize_handle(acc["url"] or acc["name"])
|
||||
acc_lang = acc.get("primary_language")
|
||||
if keywords:
|
||||
search_words = [w.lower() for w in keywords_for_language(keywords, acc_lang)]
|
||||
else:
|
||||
search_words = fallback_words or []
|
||||
tasks.append(self._fetch_account(api, handle, search_words, cutoff, acc_lang))
|
||||
|
||||
results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||
|
||||
all_articles = []
|
||||
for i, result in enumerate(results):
|
||||
if isinstance(result, Exception):
|
||||
logger.warning("X-Account %s: %s", accounts[i]["name"], result)
|
||||
continue
|
||||
all_articles.extend(result)
|
||||
|
||||
logger.info("X: %d relevante Posts aus %d Accounts", len(all_articles), len(accounts))
|
||||
return all_articles
|
||||
|
||||
async def _get_x_accounts(self, tenant_id: int = None,
|
||||
account_ids: list[int] = None) -> list[dict]:
|
||||
"""Laedt X-Accounts aus der sources-Tabelle."""
|
||||
try:
|
||||
from database import get_db
|
||||
db = await get_db()
|
||||
try:
|
||||
if account_ids and len(account_ids) > 0:
|
||||
placeholders = ",".join("?" for _ in account_ids)
|
||||
cursor = await db.execute(
|
||||
f"""SELECT id, name, url, category, notes, primary_language FROM sources
|
||||
WHERE source_type = 'x_account'
|
||||
AND status = 'active'
|
||||
AND id IN ({placeholders})""",
|
||||
tuple(account_ids),
|
||||
)
|
||||
else:
|
||||
cursor = await db.execute(
|
||||
"""SELECT id, name, url, category, notes, primary_language FROM sources
|
||||
WHERE source_type = 'x_account'
|
||||
AND status = 'active'
|
||||
AND (tenant_id IS NULL OR tenant_id = ?)""",
|
||||
(tenant_id,),
|
||||
)
|
||||
rows = await cursor.fetchall()
|
||||
return [dict(row) for row in rows]
|
||||
finally:
|
||||
await db.close()
|
||||
except Exception as e:
|
||||
logger.error("Fehler beim Laden der X-Accounts: %s", e)
|
||||
return []
|
||||
|
||||
async def _fetch_account(self, api, handle: str, search_words: list[str],
|
||||
cutoff: datetime, account_lang: str | None = None) -> list[dict]:
|
||||
"""Letzte Posts eines X-Accounts abrufen und nach Keywords filtern."""
|
||||
from twscrape import gather
|
||||
|
||||
articles: list[dict] = []
|
||||
if not handle:
|
||||
return articles
|
||||
try:
|
||||
user = await api.user_by_login(handle)
|
||||
if not user:
|
||||
logger.warning("X-Account @%s nicht gefunden", handle)
|
||||
return articles
|
||||
|
||||
tweets = await gather(api.user_tweets(user.id, limit=X_POST_CAP_PER_ACCOUNT))
|
||||
|
||||
for tw in tweets:
|
||||
# Reine Retweets ueberspringen (Original wird ohnehin erfasst)
|
||||
if getattr(tw, "retweetedTweet", None) is not None:
|
||||
continue
|
||||
|
||||
text = getattr(tw, "rawContent", None) or ""
|
||||
# Quote-Tweet: zitierten Text anhaengen, damit Kontext erhalten bleibt
|
||||
quoted = getattr(tw, "quotedTweet", None)
|
||||
if quoted is not None:
|
||||
q_text = getattr(quoted, "rawContent", "") or ""
|
||||
if q_text:
|
||||
text = "%s\n\n[Zitiert] %s" % (text, q_text)
|
||||
if not text.strip():
|
||||
continue
|
||||
|
||||
# Recency-Fenster
|
||||
tw_date = getattr(tw, "date", None)
|
||||
if tw_date is not None:
|
||||
try:
|
||||
if tw_date < cutoff:
|
||||
continue
|
||||
except TypeError:
|
||||
pass
|
||||
|
||||
# Keyword-Matching (lockerer als RSS: 1 Match reicht,
|
||||
# da Accounts bereits thematisch vorselektiert sind)
|
||||
text_lower = text.lower()
|
||||
match_count = sum(1 for w in search_words if w in text_lower)
|
||||
if search_words and match_count < 1:
|
||||
continue
|
||||
|
||||
lines = text.strip().split("\n")
|
||||
headline = (lines[0][:200] if lines else text[:200]).strip()
|
||||
|
||||
published = None
|
||||
if tw_date is not None:
|
||||
try:
|
||||
published = tw_date.astimezone(TIMEZONE).isoformat()
|
||||
except Exception:
|
||||
published = tw_date.isoformat()
|
||||
|
||||
source_url = getattr(tw, "url", None) or \
|
||||
"https://x.com/%s/status/%s" % (handle, getattr(tw, "id", ""))
|
||||
tw_lang = getattr(tw, "lang", None)
|
||||
language = account_lang \
|
||||
or (tw_lang if tw_lang and tw_lang != "und" else None) \
|
||||
or ("de" if self._is_german(text) else "en")
|
||||
relevance_score = (match_count / len(search_words)) if search_words else 0.0
|
||||
|
||||
articles.append({
|
||||
"headline": headline,
|
||||
"headline_de": headline if self._is_german(headline) else None,
|
||||
"source": "X: @%s" % handle,
|
||||
"source_url": source_url,
|
||||
"content_original": text[:2000],
|
||||
"content_de": text[:2000] if self._is_german(text) else None,
|
||||
"language": language,
|
||||
"published_at": published,
|
||||
"relevance_score": relevance_score,
|
||||
})
|
||||
|
||||
except Exception as e:
|
||||
logger.warning("X-Account @%s: %s", handle, e)
|
||||
|
||||
return articles
|
||||
|
||||
async def validate_account(self, handle: str) -> dict | None:
|
||||
"""Prueft ob ein X-Account erreichbar ist und gibt Account-Info zurueck."""
|
||||
handle = _normalize_handle(handle)
|
||||
if not handle:
|
||||
return None
|
||||
proxy, _ = await self._resolve_proxy()
|
||||
api = await self._get_api(proxy)
|
||||
if not api:
|
||||
return None
|
||||
try:
|
||||
user = await api.user_by_login(handle)
|
||||
if not user:
|
||||
return None
|
||||
return {
|
||||
"valid": True,
|
||||
"name": getattr(user, "displayname", None) or handle,
|
||||
"username": getattr(user, "username", handle),
|
||||
"description": getattr(user, "rawDescription", "") or "",
|
||||
"subscribers": getattr(user, "followersCount", None),
|
||||
}
|
||||
except Exception as e:
|
||||
logger.warning("X-Account-Validierung fehlgeschlagen fuer @%s: %s", handle, e)
|
||||
return None
|
||||
|
||||
def _is_german(self, text: str) -> bool:
|
||||
"""Einfache Heuristik ob ein Text deutsch ist."""
|
||||
german_words = {"der", "die", "das", "und", "ist", "von", "mit", "fuer", "auf", "ein",
|
||||
"eine", "den", "dem", "des", "sich", "wird", "nach", "bei", "auch",
|
||||
"ueber", "wie", "aus", "hat", "zum", "zur", "als", "noch", "mehr",
|
||||
"nicht", "aber", "oder", "sind", "vor", "einem", "einer", "wurde"}
|
||||
words = set(text.lower().split())
|
||||
return len(words & german_words) >= 2
|
||||
In neuem Issue referenzieren
Einen Benutzer sperren