Geoparsing von spaCy auf Haiku umgestellt
- geoparsing.py: Komplett-Rewrite (spaCy NER + Nominatim -> Haiku + geonamescache)
- orchestrator.py: incident_context an geoparse_articles, category in INSERT
- incidents.py: incident_context aus DB laden und an Geoparsing uebergeben
- public_api.py: Locations aggregiert im Lagebild-Endpoint
- components.js: response-Kategorie neben retaliation (beide akzeptiert)
- requirements.txt: spaCy und geopy entfernt

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Dieser Commit ist enthalten in:
@@ -9,6 +9,4 @@ apscheduler==3.10.4
|
|||||||
websockets
|
websockets
|
||||||
python-multipart
|
python-multipart
|
||||||
aiosmtplib
|
aiosmtplib
|
||||||
spacy>=3.7,<4.0
|
|
||||||
geonamescache>=2.0
|
geonamescache>=2.0
|
||||||
geopy>=2.4
|
|
||||||
|
|||||||
@@ -1,361 +1,318 @@
|
|||||||
"""Geoparsing-Modul: NER-basierte Ortsextraktion und Geocoding fuer Artikel."""
|
"""Geoparsing-Modul: Haiku-basierte Ortsextraktion und Geocoding fuer Artikel."""
|
||||||
import asyncio
|
import asyncio
|
||||||
import logging
|
import json
|
||||||
import re
|
import logging
|
||||||
from difflib import SequenceMatcher
|
import re
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
logger = logging.getLogger("osint.geoparsing")
|
from agents.claude_client import call_claude, ClaudeUsage, UsageAccumulator
|
||||||
|
from config import CLAUDE_MODEL_FAST
|
||||||
# Lazy-loaded spaCy-Modelle (erst beim ersten Aufruf geladen)
|
|
||||||
_nlp_de = None
|
logger = logging.getLogger("osint.geoparsing")
|
||||||
_nlp_en = None
|
|
||||||
|
# Geocoding-Cache (in-memory, lebt solange der Prozess laeuft)
|
||||||
# Stopwords: Entitaeten die von spaCy faelschlicherweise als Orte erkannt werden
|
_geocode_cache: dict[str, Optional[dict]] = {}
|
||||||
LOCATION_STOPWORDS = {
|
|
||||||
"EU", "UN", "NATO", "WHO", "OSZE", "OPEC", "G7", "G20", "BRICS",
|
# geonamescache-Instanz (lazy)
|
||||||
"Nato", "Eu", "Un", "Onu",
|
_gc = None
|
||||||
"Bundesregierung", "Bundestag", "Bundesrat", "Bundeskanzler",
|
|
||||||
"Kreml", "Weisses Haus", "White House", "Pentagon", "Elysee",
|
|
||||||
"Twitter", "Facebook", "Telegram", "Signal", "WhatsApp",
|
def _get_geonamescache():
|
||||||
"Reuters", "AP", "AFP", "DPA", "dpa",
|
"""Laedt geonamescache lazy."""
|
||||||
"Internet", "Online", "Web",
|
global _gc
|
||||||
# Regionale/vage Begriffe (kein einzelner Punkt auf der Karte)
|
if _gc is None:
|
||||||
"Naher Osten", "Mittlerer Osten", "Middle East", "Near East",
|
try:
|
||||||
"Golf-Staaten", "Golfstaaten", "Golfregion", "Gulf States", "Persian Gulf",
|
import geonamescache
|
||||||
"Nordafrika", "Subsahara", "Zentralasien", "Suedostasien",
|
_gc = geonamescache.GeonamesCache()
|
||||||
"Westeuropa", "Osteuropa", "Suedeuropa", "Nordeuropa",
|
logger.info("geonamescache geladen")
|
||||||
"Balkan", "Kaukasus", "Levante", "Maghreb", "Sahel",
|
except ImportError:
|
||||||
"Arabische Welt", "Arab World",
|
logger.error("geonamescache nicht installiert - pip install geonamescache")
|
||||||
}
|
return None
|
||||||
|
return _gc
|
||||||
# Maximale Textlaenge fuer NER-Verarbeitung
|
|
||||||
MAX_TEXT_LENGTH = 10000
|
|
||||||
|
def _geocode_offline(name: str, country_code: str = "") -> Optional[dict]:
|
||||||
|
"""Geocoding ueber geonamescache (offline).
|
||||||
# Marker-Kategorien fuer Karten-Klassifizierung
|
|
||||||
CATEGORY_KEYWORDS = {
|
Args:
|
||||||
"target": [
|
name: Ortsname (normalisiert von Haiku)
|
||||||
"angriff", "angegriff", "bombardier", "luftschlag", "luftangriff",
|
country_code: ISO-2 Laendercode (von Haiku) fuer bessere Disambiguierung
|
||||||
"beschuss", "beschossen", "getroffen", "zerstoer", "einschlag",
|
"""
|
||||||
"detonation", "explosion", "strike", "attack", "bombed", "hit",
|
gc = _get_geonamescache()
|
||||||
"shelled", "destroyed", "targeted", "missile hit", "air strike",
|
if gc is None:
|
||||||
"airstrike", "bombardment", "killed", "casualties", "dead",
|
return None
|
||||||
"tote", "opfer", "getoetet",
|
|
||||||
],
|
name_lower = name.lower().strip()
|
||||||
"retaliation": [
|
|
||||||
"gegenschlag", "vergeltung", "reaktion", "gegenangriff",
|
# 1. Stadtsuche
|
||||||
"abgefeuert", "retaliat", "counter-attack", "counterattack",
|
cities = gc.get_cities()
|
||||||
"counter-strike", "response", "fired back", "launched",
|
matches = []
|
||||||
"rakete abgefeuert", "vergeltungsschlag", "abfangen",
|
for gid, city in cities.items():
|
||||||
"abgefangen", "intercepted", "eskalation", "escalat",
|
city_name = city.get("name", "")
|
||||||
],
|
alt_names = city.get("alternatenames", "")
|
||||||
"actor": [
|
if isinstance(alt_names, list):
|
||||||
"regierung", "praesident", "ministerium", "hauptquartier",
|
alt_list = [n.strip().lower() for n in alt_names if n.strip()]
|
||||||
"kommando", "nato", "pentagon", "kongress", "senat", "parlament",
|
else:
|
||||||
"government", "president", "ministry", "headquarters", "command",
|
alt_list = [n.strip().lower() for n in str(alt_names).split(",") if n.strip()]
|
||||||
"congress", "senate", "parliament", "white house", "weisses haus",
|
if city_name.lower() == name_lower or name_lower in alt_list:
|
||||||
"verteidigungsminister", "aussenminister", "generalstab",
|
matches.append(city)
|
||||||
"defense secretary", "secretary of state", "general staff",
|
|
||||||
"un-sicherheitsrat", "security council", "summit", "gipfel",
|
if matches:
|
||||||
"diplomati", "botschaft", "embassy",
|
# Disambiguierung: country_code bevorzugen, dann Population
|
||||||
],
|
if country_code:
|
||||||
}
|
cc_matches = [c for c in matches if c.get("countrycode", "").upper() == country_code.upper()]
|
||||||
|
if cc_matches:
|
||||||
|
matches = cc_matches
|
||||||
def _classify_location(source_text: str, article_text: str = "") -> str:
|
best = max(matches, key=lambda c: c.get("population", 0))
|
||||||
"""Klassifiziert eine Location basierend auf dem Kontext.
|
return {
|
||||||
|
"lat": float(best["latitude"]),
|
||||||
Returns:
|
"lon": float(best["longitude"]),
|
||||||
Kategorie: 'target', 'retaliation', 'actor', oder 'mentioned'
|
"country_code": best.get("countrycode", ""),
|
||||||
"""
|
"normalized_name": best["name"],
|
||||||
text = (source_text + " " + article_text[:500]).lower()
|
"confidence": min(1.0, 0.6 + (best.get("population", 0) / 10_000_000)),
|
||||||
|
}
|
||||||
scores = {"target": 0, "retaliation": 0, "actor": 0}
|
|
||||||
for category, keywords in CATEGORY_KEYWORDS.items():
|
# 2. Laendersuche
|
||||||
for kw in keywords:
|
countries = gc.get_countries()
|
||||||
if kw in text:
|
for code, country in countries.items():
|
||||||
scores[category] += 1
|
if country.get("name", "").lower() == name_lower:
|
||||||
|
capital = country.get("capital", "")
|
||||||
best = max(scores, key=scores.get)
|
if capital:
|
||||||
if scores[best] >= 1:
|
cap_result = _geocode_offline(capital)
|
||||||
return best
|
if cap_result:
|
||||||
return "mentioned"
|
cap_result["normalized_name"] = country["name"]
|
||||||
|
cap_result["confidence"] = 0.5
|
||||||
|
return cap_result
|
||||||
|
|
||||||
def _load_spacy_model(lang: str):
|
return None
|
||||||
"""Laedt ein spaCy-Modell lazy (nur beim ersten Aufruf)."""
|
|
||||||
global _nlp_de, _nlp_en
|
|
||||||
try:
|
def _geocode_location(name: str, country_code: str = "", haiku_coords: Optional[dict] = None) -> Optional[dict]:
|
||||||
import spacy
|
"""Geocoded einen Ortsnamen. Prioritaet: geonamescache > Haiku-Koordinaten.
|
||||||
except ImportError:
|
|
||||||
logger.error("spaCy nicht installiert - pip install spacy")
|
Args:
|
||||||
return None
|
name: Ortsname
|
||||||
|
country_code: ISO-2 Code (von Haiku)
|
||||||
if lang == "de" and _nlp_de is None:
|
haiku_coords: {"lat": float, "lon": float} (Fallback von Haiku)
|
||||||
try:
|
"""
|
||||||
_nlp_de = spacy.load("de_core_news_sm", disable=["parser", "lemmatizer", "textcat"])
|
cache_key = f"{name.lower().strip()}|{country_code.upper()}"
|
||||||
logger.info("spaCy-Modell de_core_news_sm geladen")
|
if cache_key in _geocode_cache:
|
||||||
except OSError:
|
return _geocode_cache[cache_key]
|
||||||
logger.warning("spaCy-Modell de_core_news_sm nicht gefunden - python -m spacy download de_core_news_sm")
|
|
||||||
return None
|
result = _geocode_offline(name, country_code)
|
||||||
elif lang == "en" and _nlp_en is None:
|
|
||||||
try:
|
# Fallback: Haiku-Koordinaten nutzen
|
||||||
_nlp_en = spacy.load("en_core_web_sm", disable=["parser", "lemmatizer", "textcat"])
|
if result is None and haiku_coords:
|
||||||
logger.info("spaCy-Modell en_core_web_sm geladen")
|
lat = haiku_coords.get("lat")
|
||||||
except OSError:
|
lon = haiku_coords.get("lon")
|
||||||
logger.warning("spaCy-Modell en_core_web_sm nicht gefunden - python -m spacy download en_core_web_sm")
|
if lat is not None and lon is not None:
|
||||||
return None
|
result = {
|
||||||
|
"lat": float(lat),
|
||||||
return _nlp_de if lang == "de" else _nlp_en
|
"lon": float(lon),
|
||||||
|
"country_code": country_code.upper() if country_code else "",
|
||||||
|
"normalized_name": name,
|
||||||
def _extract_locations_from_text(text: str, language: str = "de") -> list[dict]:
|
"confidence": 0.45,
|
||||||
"""Extrahiert Ortsnamen aus Text via spaCy NER.
|
}
|
||||||
|
|
||||||
Returns:
|
_geocode_cache[cache_key] = result
|
||||||
Liste von dicts: [{name: str, source_text: str}]
|
return result
|
||||||
"""
|
|
||||||
if not text:
|
|
||||||
return []
|
HAIKU_GEOPARSE_PROMPT = """Extrahiere alle geographischen Orte aus diesen Nachrichten-Headlines.
|
||||||
|
|
||||||
text = text[:MAX_TEXT_LENGTH]
|
Kontext der Lage: "{incident_context}"
|
||||||
|
|
||||||
nlp = _load_spacy_model(language)
|
Regeln:
|
||||||
if nlp is None:
|
- Nur echte Orte (Staedte, Laender, Regionen)
|
||||||
# Fallback: anderes Modell versuchen
|
- Keine Personen, Organisationen, Gebaeude, Alltagswoerter
|
||||||
fallback = "en" if language == "de" else "de"
|
- Bei "US-Militaer" etc: Land (USA) extrahieren, nicht das Kompositum
|
||||||
nlp = _load_spacy_model(fallback)
|
- HTML-Tags ignorieren
|
||||||
if nlp is None:
|
- Jeder Ort nur einmal pro Headline
|
||||||
return []
|
- Regionen wie "Middle East", "Gulf", "Naher Osten" NICHT extrahieren (kein einzelner Punkt auf der Karte)
|
||||||
|
|
||||||
doc = nlp(text)
|
Klassifiziere basierend auf dem Lage-Kontext:
|
||||||
|
- "target": Wo das Ereignis passiert / Schaden entsteht
|
||||||
locations = []
|
- "response": Wo Reaktionen / Gegenmassnahmen stattfinden
|
||||||
seen = set()
|
- "actor": Wo Entscheidungen getroffen werden / Entscheider sitzen
|
||||||
for ent in doc.ents:
|
- "mentioned": Nur erwaehnt, kein direkter Bezug
|
||||||
if ent.label_ in ("LOC", "GPE"):
|
|
||||||
name = ent.text.strip()
|
Headlines:
|
||||||
# Filter: zu kurz, Stopword, oder nur Zahlen/Sonderzeichen
|
{headlines}
|
||||||
if len(name) < 2:
|
|
||||||
continue
|
Antwort NUR als JSON-Array, kein anderer Text:
|
||||||
if name in LOCATION_STOPWORDS:
|
[{{"headline_idx": 0, "locations": [
|
||||||
continue
|
{{"name": "Teheran", "normalized": "Tehran", "country_code": "IR",
|
||||||
if re.match(r'^[\d\W]+$', name):
|
"type": "city", "category": "target",
|
||||||
continue
|
"lat": 35.69, "lon": 51.42}}
|
||||||
|
]}}]"""
|
||||||
name_lower = name.lower()
|
|
||||||
if name_lower not in seen:
|
|
||||||
seen.add(name_lower)
|
async def _extract_locations_haiku(
|
||||||
# Kontext: 50 Zeichen um die Entitaet herum
|
headlines: list[dict], incident_context: str
|
||||||
start = max(0, ent.start_char - 25)
|
) -> dict[int, list[dict]]:
|
||||||
end = min(len(text), ent.end_char + 25)
|
"""Extrahiert Orte aus Headlines via Haiku.
|
||||||
source_text = text[start:end].strip()
|
|
||||||
locations.append({"name": name, "source_text": source_text})
|
Args:
|
||||||
|
headlines: [{"idx": article_id, "text": headline_text}, ...]
|
||||||
return locations
|
incident_context: Lage-Kontext fuer Klassifizierung
|
||||||
|
|
||||||
|
Returns:
|
||||||
# Geocoding-Cache (in-memory, lebt solange der Prozess laeuft)
|
dict[article_id -> list[{name, normalized, country_code, type, category, lat, lon}]]
|
||||||
_geocode_cache: dict[str, Optional[dict]] = {}
|
"""
|
||||||
|
if not headlines:
|
||||||
# geonamescache-Instanz (lazy)
|
return {}
|
||||||
_gc = None
|
|
||||||
|
# Headlines formatieren
|
||||||
|
headline_lines = []
|
||||||
def _get_geonamescache():
|
for i, h in enumerate(headlines):
|
||||||
"""Laedt geonamescache lazy."""
|
headline_lines.append(f"[{i}] {h['text']}")
|
||||||
global _gc
|
|
||||||
if _gc is None:
|
prompt = HAIKU_GEOPARSE_PROMPT.format(
|
||||||
try:
|
incident_context=incident_context or "Allgemeine Nachrichtenlage",
|
||||||
import geonamescache
|
headlines="\n".join(headline_lines),
|
||||||
_gc = geonamescache.GeonamesCache()
|
)
|
||||||
logger.info("geonamescache geladen")
|
|
||||||
except ImportError:
|
try:
|
||||||
logger.error("geonamescache nicht installiert - pip install geonamescache")
|
result_text, usage = await call_claude(prompt, tools=None, model=CLAUDE_MODEL_FAST)
|
||||||
return None
|
except Exception as e:
|
||||||
return _gc
|
logger.error(f"Haiku-Geoparsing fehlgeschlagen: {e}")
|
||||||
|
return {}
|
||||||
|
|
||||||
def _geocode_location(name: str) -> Optional[dict]:
|
# JSON parsen (mit Regex-Fallback)
|
||||||
"""Geocoded einen Ortsnamen. Offline via geonamescache, Fallback Nominatim.
|
parsed = None
|
||||||
|
try:
|
||||||
Returns:
|
parsed = json.loads(result_text)
|
||||||
dict mit {lat, lon, country_code, normalized_name, confidence} oder None
|
except json.JSONDecodeError:
|
||||||
"""
|
match = re.search(r'\[.*\]', result_text, re.DOTALL)
|
||||||
name_lower = name.lower().strip()
|
if match:
|
||||||
if name_lower in _geocode_cache:
|
try:
|
||||||
return _geocode_cache[name_lower]
|
parsed = json.loads(match.group())
|
||||||
|
except json.JSONDecodeError:
|
||||||
result = _geocode_offline(name)
|
logger.warning("Haiku-Geoparsing: JSON-Parse fehlgeschlagen auch mit Regex-Fallback")
|
||||||
if result is None:
|
return {}
|
||||||
result = _geocode_nominatim(name)
|
|
||||||
|
if not parsed or not isinstance(parsed, list):
|
||||||
_geocode_cache[name_lower] = result
|
logger.warning("Haiku-Geoparsing: Kein gueltiges JSON-Array erhalten")
|
||||||
return result
|
return {}
|
||||||
|
|
||||||
|
# Ergebnisse den Artikeln zuordnen
|
||||||
def _geocode_offline(name: str) -> Optional[dict]:
|
results = {}
|
||||||
"""Versucht Geocoding ueber geonamescache (offline)."""
|
for entry in parsed:
|
||||||
gc = _get_geonamescache()
|
if not isinstance(entry, dict):
|
||||||
if gc is None:
|
continue
|
||||||
return None
|
headline_idx = entry.get("headline_idx")
|
||||||
|
if headline_idx is None or headline_idx >= len(headlines):
|
||||||
name_lower = name.lower().strip()
|
continue
|
||||||
|
|
||||||
# 1. Direkte Suche in Staedten
|
article_id = headlines[headline_idx]["idx"]
|
||||||
cities = gc.get_cities()
|
locations = entry.get("locations", [])
|
||||||
matches = []
|
|
||||||
for gid, city in cities.items():
|
if not locations:
|
||||||
city_name = city.get("name", "")
|
continue
|
||||||
alt_names = city.get("alternatenames", "")
|
|
||||||
# alternatenames kann String (komma-getrennt) oder Liste sein
|
article_locs = []
|
||||||
if isinstance(alt_names, list):
|
for loc in locations:
|
||||||
alt_list = [n.strip().lower() for n in alt_names if n.strip()]
|
if not isinstance(loc, dict):
|
||||||
else:
|
continue
|
||||||
alt_list = [n.strip().lower() for n in str(alt_names).split(",") if n.strip()]
|
loc_type = loc.get("type", "city")
|
||||||
if city_name.lower() == name_lower:
|
# Regionen nicht speichern (kein sinnvoller Punkt auf der Karte)
|
||||||
matches.append(city)
|
if loc_type == "region":
|
||||||
elif name_lower in alt_list:
|
continue
|
||||||
matches.append(city)
|
|
||||||
|
name = loc.get("name", "")
|
||||||
if matches:
|
if not name:
|
||||||
# Disambiguierung: groesste Stadt gewinnt
|
continue
|
||||||
best = max(matches, key=lambda c: c.get("population", 0))
|
|
||||||
return {
|
article_locs.append({
|
||||||
"lat": float(best["latitude"]),
|
"name": name,
|
||||||
"lon": float(best["longitude"]),
|
"normalized": loc.get("normalized", name),
|
||||||
"country_code": best.get("countrycode", ""),
|
"country_code": loc.get("country_code", ""),
|
||||||
"normalized_name": best["name"],
|
"type": loc_type,
|
||||||
"confidence": min(1.0, 0.6 + (best.get("population", 0) / 10_000_000)),
|
"category": loc.get("category", "mentioned"),
|
||||||
}
|
"lat": loc.get("lat"),
|
||||||
|
"lon": loc.get("lon"),
|
||||||
# 2. Laendersuche
|
})
|
||||||
countries = gc.get_countries()
|
|
||||||
for code, country in countries.items():
|
if article_locs:
|
||||||
if country.get("name", "").lower() == name_lower:
|
results[article_id] = article_locs
|
||||||
# Hauptstadt-Koordinaten als Fallback
|
|
||||||
capital = country.get("capital", "")
|
return results
|
||||||
if capital:
|
|
||||||
cap_result = _geocode_offline(capital)
|
|
||||||
if cap_result:
|
async def geoparse_articles(
|
||||||
cap_result["normalized_name"] = country["name"]
|
articles: list[dict],
|
||||||
cap_result["confidence"] = 0.5 # Land, nicht Stadt
|
incident_context: str = "",
|
||||||
return cap_result
|
) -> dict[int, list[dict]]:
|
||||||
|
"""Geoparsing fuer eine Liste von Artikeln via Haiku + geonamescache.
|
||||||
return None
|
|
||||||
|
Args:
|
||||||
|
articles: Liste von Artikel-Dicts (mit id, headline, headline_de, language)
|
||||||
def _geocode_nominatim(name: str) -> Optional[dict]:
|
incident_context: Lage-Kontext (Titel + Beschreibung) fuer kontextbewusste Klassifizierung
|
||||||
"""Fallback-Geocoding ueber Nominatim (1 Request/Sekunde)."""
|
|
||||||
try:
|
Returns:
|
||||||
from geopy.geocoders import Nominatim
|
dict[article_id -> list[{location_name, location_name_normalized, country_code,
|
||||||
from geopy.exc import GeocoderTimedOut, GeocoderServiceError
|
lat, lon, confidence, source_text, category}]]
|
||||||
except ImportError:
|
"""
|
||||||
return None
|
if not articles:
|
||||||
|
return {}
|
||||||
try:
|
|
||||||
geocoder = Nominatim(user_agent="aegissight-monitor/1.0", timeout=5)
|
# Headlines sammeln
|
||||||
location = geocoder.geocode(name, language="de", exactly_one=True)
|
headlines = []
|
||||||
if location:
|
for article in articles:
|
||||||
# Country-Code aus Address extrahieren falls verfuegbar
|
article_id = article.get("id")
|
||||||
raw = location.raw or {}
|
if not article_id:
|
||||||
country_code = ""
|
continue
|
||||||
if "address" in raw:
|
|
||||||
country_code = raw["address"].get("country_code", "").upper()
|
# Deutsche Headline bevorzugen
|
||||||
|
headline = article.get("headline_de") or article.get("headline") or ""
|
||||||
normalized_name = location.address.split(",")[0] if location.address else name
|
headline = headline.strip()
|
||||||
|
if not headline:
|
||||||
# Plausibilitaetspruefung: Nominatim-Ergebnis muss zum Suchbegriff passen
|
continue
|
||||||
similarity = SequenceMatcher(None, name.lower(), normalized_name.lower()).ratio()
|
|
||||||
if similarity < 0.3:
|
headlines.append({"idx": article_id, "text": headline})
|
||||||
logger.debug(f"Nominatim-Ergebnis verworfen: '{name}' -> '{normalized_name}' (Aehnlichkeit {similarity:.2f})")
|
|
||||||
return None
|
if not headlines:
|
||||||
|
return {}
|
||||||
return {
|
|
||||||
"lat": float(location.latitude),
|
# Batches bilden (max 50 Headlines pro Haiku-Call)
|
||||||
"lon": float(location.longitude),
|
batch_size = 50
|
||||||
"country_code": country_code,
|
all_haiku_results = {}
|
||||||
"normalized_name": normalized_name,
|
for i in range(0, len(headlines), batch_size):
|
||||||
"confidence": 0.4, # Nominatim-Ergebnis = niedrigere Konfidenz
|
batch = headlines[i:i + batch_size]
|
||||||
}
|
batch_results = await _extract_locations_haiku(batch, incident_context)
|
||||||
except (GeocoderTimedOut, GeocoderServiceError) as e:
|
all_haiku_results.update(batch_results)
|
||||||
logger.debug(f"Nominatim-Fehler fuer '{name}': {e}")
|
|
||||||
except Exception as e:
|
if not all_haiku_results:
|
||||||
logger.debug(f"Geocoding-Fehler fuer '{name}': {e}")
|
return {}
|
||||||
|
|
||||||
return None
|
# Geocoding via geonamescache (mit Haiku-Koordinaten als Fallback)
|
||||||
|
result = {}
|
||||||
|
for article_id, haiku_locs in all_haiku_results.items():
|
||||||
async def geoparse_articles(articles: list[dict]) -> dict[int, list[dict]]:
|
locations = []
|
||||||
"""Geoparsing fuer eine Liste von Artikeln.
|
for loc in haiku_locs:
|
||||||
|
haiku_coords = None
|
||||||
Args:
|
if loc.get("lat") is not None and loc.get("lon") is not None:
|
||||||
articles: Liste von Artikel-Dicts (mit id, content_de, content_original, language, headline, headline_de)
|
haiku_coords = {"lat": loc["lat"], "lon": loc["lon"]}
|
||||||
|
|
||||||
Returns:
|
geo = _geocode_location(
|
||||||
dict[article_id -> list[{location_name, location_name_normalized, country_code, lat, lon, confidence, source_text}]]
|
loc["normalized"],
|
||||||
"""
|
loc.get("country_code", ""),
|
||||||
if not articles:
|
haiku_coords,
|
||||||
return {}
|
)
|
||||||
|
|
||||||
result = {}
|
if geo:
|
||||||
|
locations.append({
|
||||||
for article in articles:
|
"location_name": loc["name"],
|
||||||
article_id = article.get("id")
|
"location_name_normalized": geo["normalized_name"],
|
||||||
if not article_id:
|
"country_code": geo["country_code"],
|
||||||
continue
|
"lat": geo["lat"],
|
||||||
|
"lon": geo["lon"],
|
||||||
language = article.get("language", "de")
|
"confidence": geo["confidence"],
|
||||||
|
"source_text": loc["name"],
|
||||||
# Text zusammenbauen: Headline + Content
|
"category": loc.get("category", "mentioned"),
|
||||||
text_parts = []
|
})
|
||||||
if language == "de":
|
|
||||||
if article.get("headline_de"):
|
if locations:
|
||||||
text_parts.append(article["headline_de"])
|
result[article_id] = locations
|
||||||
elif article.get("headline"):
|
|
||||||
text_parts.append(article["headline"])
|
return result
|
||||||
if article.get("content_de"):
|
|
||||||
text_parts.append(article["content_de"])
|
|
||||||
elif article.get("content_original"):
|
|
||||||
text_parts.append(article["content_original"])
|
|
||||||
else:
|
|
||||||
if article.get("headline"):
|
|
||||||
text_parts.append(article["headline"])
|
|
||||||
if article.get("content_original"):
|
|
||||||
text_parts.append(article["content_original"])
|
|
||||||
|
|
||||||
text = "\n".join(text_parts)
|
|
||||||
if not text.strip():
|
|
||||||
continue
|
|
||||||
|
|
||||||
# NER-Extraktion (CPU-bound, in Thread ausfuehren)
|
|
||||||
locations_raw = await asyncio.to_thread(
|
|
||||||
_extract_locations_from_text, text, language
|
|
||||||
)
|
|
||||||
|
|
||||||
if not locations_raw:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Geocoding (enthaelt potentiell Netzwerk-Calls)
|
|
||||||
locations = []
|
|
||||||
for loc in locations_raw:
|
|
||||||
geo = await asyncio.to_thread(_geocode_location, loc["name"])
|
|
||||||
if geo:
|
|
||||||
category = _classify_location(loc.get("source_text", ""), text)
|
|
||||||
locations.append({
|
|
||||||
"location_name": loc["name"],
|
|
||||||
"location_name_normalized": geo["normalized_name"],
|
|
||||||
"country_code": geo["country_code"],
|
|
||||||
"lat": geo["lat"],
|
|
||||||
"lon": geo["lon"],
|
|
||||||
"confidence": geo["confidence"],
|
|
||||||
"source_text": loc.get("source_text", ""),
|
|
||||||
"category": category,
|
|
||||||
})
|
|
||||||
|
|
||||||
if locations:
|
|
||||||
result[article_id] = locations
|
|
||||||
|
|
||||||
return result
|
|
||||||
|
|||||||
@@ -714,19 +714,20 @@ class AgentOrchestrator:
|
|||||||
if new_articles_for_analysis:
|
if new_articles_for_analysis:
|
||||||
try:
|
try:
|
||||||
from agents.geoparsing import geoparse_articles
|
from agents.geoparsing import geoparse_articles
|
||||||
|
incident_context = f"{title} - {description}"
|
||||||
logger.info(f"Geoparsing fuer {len(new_articles_for_analysis)} neue Artikel...")
|
logger.info(f"Geoparsing fuer {len(new_articles_for_analysis)} neue Artikel...")
|
||||||
geo_results = await geoparse_articles(new_articles_for_analysis)
|
geo_results = await geoparse_articles(new_articles_for_analysis, incident_context)
|
||||||
geo_count = 0
|
geo_count = 0
|
||||||
for art_id, locations in geo_results.items():
|
for art_id, locations in geo_results.items():
|
||||||
for loc in locations:
|
for loc in locations:
|
||||||
await db.execute(
|
await db.execute(
|
||||||
"""INSERT INTO article_locations
|
"""INSERT INTO article_locations
|
||||||
(article_id, incident_id, location_name, location_name_normalized,
|
(article_id, incident_id, location_name, location_name_normalized,
|
||||||
country_code, latitude, longitude, confidence, source_text, tenant_id)
|
country_code, latitude, longitude, confidence, source_text, tenant_id, category)
|
||||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
|
||||||
(art_id, incident_id, loc["location_name"], loc["location_name_normalized"],
|
(art_id, incident_id, loc["location_name"], loc["location_name_normalized"],
|
||||||
loc["country_code"], loc["lat"], loc["lon"], loc["confidence"],
|
loc["country_code"], loc["lat"], loc["lon"], loc["confidence"],
|
||||||
loc.get("source_text", ""), tenant_id),
|
loc.get("source_text", ""), tenant_id, loc.get("category", "mentioned")),
|
||||||
)
|
)
|
||||||
geo_count += 1
|
geo_count += 1
|
||||||
if geo_count > 0:
|
if geo_count > 0:
|
||||||
|
|||||||
@@ -351,6 +351,15 @@ async def _run_geoparse_background(incident_id: int, tenant_id: int | None):
|
|||||||
from agents.geoparsing import geoparse_articles
|
from agents.geoparsing import geoparse_articles
|
||||||
db = await get_db()
|
db = await get_db()
|
||||||
|
|
||||||
|
# Incident-Kontext fuer Haiku laden
|
||||||
|
cursor = await db.execute(
|
||||||
|
"SELECT title, description FROM incidents WHERE id = ?", (incident_id,)
|
||||||
|
)
|
||||||
|
inc_row = await cursor.fetchone()
|
||||||
|
incident_context = ""
|
||||||
|
if inc_row:
|
||||||
|
incident_context = f"{inc_row['title']} - {inc_row['description'] or ''}"
|
||||||
|
|
||||||
cursor = await db.execute(
|
cursor = await db.execute(
|
||||||
"""SELECT a.* FROM articles a
|
"""SELECT a.* FROM articles a
|
||||||
WHERE a.incident_id = ?
|
WHERE a.incident_id = ?
|
||||||
@@ -373,7 +382,7 @@ async def _run_geoparse_background(incident_id: int, tenant_id: int | None):
|
|||||||
processed = 0
|
processed = 0
|
||||||
for i in range(0, total, batch_size):
|
for i in range(0, total, batch_size):
|
||||||
batch = articles[i:i + batch_size]
|
batch = articles[i:i + batch_size]
|
||||||
geo_results = await geoparse_articles(batch)
|
geo_results = await geoparse_articles(batch, incident_context)
|
||||||
for art_id, locations in geo_results.items():
|
for art_id, locations in geo_results.items():
|
||||||
for loc in locations:
|
for loc in locations:
|
||||||
await db.execute(
|
await db.execute(
|
||||||
|
|||||||
@@ -108,6 +108,23 @@ async def get_lagebild(db=Depends(db_dependency)):
|
|||||||
except (json.JSONDecodeError, TypeError):
|
except (json.JSONDecodeError, TypeError):
|
||||||
sources_json = []
|
sources_json = []
|
||||||
|
|
||||||
|
# Locations aggregiert nach normalisierten Ortsnamen
|
||||||
|
cursor = await db.execute(
|
||||||
|
f"""SELECT
|
||||||
|
al.location_name_normalized as name,
|
||||||
|
al.latitude as lat,
|
||||||
|
al.longitude as lon,
|
||||||
|
al.country_code,
|
||||||
|
al.category,
|
||||||
|
COUNT(*) as article_count,
|
||||||
|
MAX(al.confidence) as confidence
|
||||||
|
FROM article_locations al
|
||||||
|
WHERE al.incident_id IN ({ids})
|
||||||
|
GROUP BY al.location_name_normalized
|
||||||
|
ORDER BY article_count DESC"""
|
||||||
|
)
|
||||||
|
locations = [dict(r) for r in await cursor.fetchall()]
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"generated_at": datetime.now(TIMEZONE).isoformat(),
|
"generated_at": datetime.now(TIMEZONE).isoformat(),
|
||||||
"incident": {
|
"incident": {
|
||||||
@@ -130,6 +147,7 @@ async def get_lagebild(db=Depends(db_dependency)):
|
|||||||
"articles": articles,
|
"articles": articles,
|
||||||
"fact_checks": fact_checks,
|
"fact_checks": fact_checks,
|
||||||
"available_snapshots": available_snapshots,
|
"available_snapshots": available_snapshots,
|
||||||
|
"locations": locations,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -642,6 +642,7 @@ const UI = {
|
|||||||
this._markerIcons = {
|
this._markerIcons = {
|
||||||
target: this._createSvgIcon('#dc3545', '#a71d2a'),
|
target: this._createSvgIcon('#dc3545', '#a71d2a'),
|
||||||
retaliation: this._createSvgIcon('#f39c12', '#c47d0a'),
|
retaliation: this._createSvgIcon('#f39c12', '#c47d0a'),
|
||||||
|
response: this._createSvgIcon('#f39c12', '#c47d0a'),
|
||||||
actor: this._createSvgIcon('#2a81cb', '#1a5c8f'),
|
actor: this._createSvgIcon('#2a81cb', '#1a5c8f'),
|
||||||
mentioned: this._createSvgIcon('#7b7b7b', '#555555'),
|
mentioned: this._createSvgIcon('#7b7b7b', '#555555'),
|
||||||
};
|
};
|
||||||
@@ -650,12 +651,14 @@ const UI = {
|
|||||||
_categoryLabels: {
|
_categoryLabels: {
|
||||||
target: 'Angegriffene Ziele',
|
target: 'Angegriffene Ziele',
|
||||||
retaliation: 'Vergeltung / Eskalation',
|
retaliation: 'Vergeltung / Eskalation',
|
||||||
|
response: 'Reaktion / Gegenmassnahmen',
|
||||||
actor: 'Strategische Akteure',
|
actor: 'Strategische Akteure',
|
||||||
mentioned: 'Erwaehnt',
|
mentioned: 'Erwaehnt',
|
||||||
},
|
},
|
||||||
_categoryColors: {
|
_categoryColors: {
|
||||||
target: '#cb2b3e',
|
target: '#cb2b3e',
|
||||||
retaliation: '#f39c12',
|
retaliation: '#f39c12',
|
||||||
|
response: '#f39c12',
|
||||||
actor: '#2a81cb',
|
actor: '#2a81cb',
|
||||||
mentioned: '#7b7b7b',
|
mentioned: '#7b7b7b',
|
||||||
},
|
},
|
||||||
@@ -799,7 +802,7 @@ const UI = {
|
|||||||
legend.onAdd = function() {
|
legend.onAdd = function() {
|
||||||
const div = L.DomUtil.create('div', 'map-legend-ctrl');
|
const div = L.DomUtil.create('div', 'map-legend-ctrl');
|
||||||
let html = '<strong style="display:block;margin-bottom:6px;">Legende</strong>';
|
let html = '<strong style="display:block;margin-bottom:6px;">Legende</strong>';
|
||||||
['target', 'retaliation', 'actor', 'mentioned'].forEach(cat => {
|
['target', 'retaliation', 'response', 'actor', 'mentioned'].forEach(cat => {
|
||||||
if (usedCategories.has(cat)) {
|
if (usedCategories.has(cat)) {
|
||||||
html += `<div style="display:flex;align-items:center;gap:6px;margin:3px 0;"><span style="width:10px;height:10px;border-radius:50%;background:${self2._categoryColors[cat]};flex-shrink:0;"></span><span>${self2._categoryLabels[cat]}</span></div>`;
|
html += `<div style="display:flex;align-items:center;gap:6px;margin:3px 0;"><span style="width:10px;height:10px;border-radius:50%;background:${self2._categoryColors[cat]};flex-shrink:0;"></span><span>${self2._categoryLabels[cat]}</span></div>`;
|
||||||
}
|
}
|
||||||
|
|||||||
In neuem Issue referenzieren
Einen Benutzer sperren