Geoparsing: Laender vor Staedten pruefen, Alias-Tabelle
Behebt falsche Geocodierung bei Laendernamen die auch als Staedte existieren (Lebanon->US statt Libanon, Jordan->HK statt Jordanien). - Laender-Aliase (50+ deutsch/englisch) werden zuerst geprueft - geonamescache Laendersuche vor Staedtesuche - Stadtsuche in eigene _geocode_city() Funktion extrahiert - Bestehende falsche Marker in DB korrigiert Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Dieser Commit ist enthalten in:
@@ -1,318 +1,409 @@
|
|||||||
"""Geoparsing-Modul: Haiku-basierte Ortsextraktion und Geocoding fuer Artikel."""
|
"""Geoparsing-Modul: Haiku-basierte Ortsextraktion und Geocoding fuer Artikel."""
|
||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from agents.claude_client import call_claude, ClaudeUsage, UsageAccumulator
|
from agents.claude_client import call_claude, ClaudeUsage, UsageAccumulator
|
||||||
from config import CLAUDE_MODEL_FAST
|
from config import CLAUDE_MODEL_FAST
|
||||||
|
|
||||||
logger = logging.getLogger("osint.geoparsing")
|
logger = logging.getLogger("osint.geoparsing")
|
||||||
|
|
||||||
# Geocoding-Cache (in-memory, lebt solange der Prozess laeuft)
|
# Geocoding-Cache (in-memory, lebt solange der Prozess laeuft)
|
||||||
_geocode_cache: dict[str, Optional[dict]] = {}
|
_geocode_cache: dict[str, Optional[dict]] = {}
|
||||||
|
|
||||||
# geonamescache-Instanz (lazy)
|
# geonamescache-Instanz (lazy)
|
||||||
_gc = None
|
_gc = None
|
||||||
|
|
||||||
|
|
||||||
def _get_geonamescache():
    """Return the shared GeonamesCache instance, creating it on first use.

    Returns None (after logging an error) when the geonamescache
    package is not installed.
    """
    global _gc
    if _gc is not None:
        return _gc
    try:
        import geonamescache
    except ImportError:
        logger.error("geonamescache nicht installiert - pip install geonamescache")
        return None
    _gc = geonamescache.GeonamesCache()
    logger.info("geonamescache geladen")
    return _gc
|
||||||
|
|
||||||
|
|
||||||
def _geocode_offline(name: str, country_code: str = "") -> Optional[dict]:
|
# Bekannte Laendernamen (deutsch/englisch/alternativ -> ISO-2 Code + Hauptstadt-Koordinaten)
|
||||||
"""Geocoding ueber geonamescache (offline).
|
_COUNTRY_ALIASES = {
|
||||||
|
"libanon": {"code": "LB", "name": "Lebanon", "lat": 33.8938, "lon": 35.5018},
|
||||||
Args:
|
"lebanon": {"code": "LB", "name": "Lebanon", "lat": 33.8938, "lon": 35.5018},
|
||||||
name: Ortsname (normalisiert von Haiku)
|
"jordan": {"code": "JO", "name": "Jordan", "lat": 31.9454, "lon": 35.9284},
|
||||||
country_code: ISO-2 Laendercode (von Haiku) fuer bessere Disambiguierung
|
"jordanien": {"code": "JO", "name": "Jordan", "lat": 31.9454, "lon": 35.9284},
|
||||||
"""
|
"iran": {"code": "IR", "name": "Iran", "lat": 35.6892, "lon": 51.3890},
|
||||||
gc = _get_geonamescache()
|
"irak": {"code": "IQ", "name": "Iraq", "lat": 33.3152, "lon": 44.3661},
|
||||||
if gc is None:
|
"iraq": {"code": "IQ", "name": "Iraq", "lat": 33.3152, "lon": 44.3661},
|
||||||
return None
|
"israel": {"code": "IL", "name": "Israel", "lat": 31.7683, "lon": 35.2137},
|
||||||
|
"syrien": {"code": "SY", "name": "Syria", "lat": 33.5138, "lon": 36.2765},
|
||||||
name_lower = name.lower().strip()
|
"syria": {"code": "SY", "name": "Syria", "lat": 33.5138, "lon": 36.2765},
|
||||||
|
"tuerkei": {"code": "TR", "name": "Turkey", "lat": 39.9334, "lon": 32.8597},
|
||||||
# 1. Stadtsuche
|
"turkey": {"code": "TR", "name": "Turkey", "lat": 39.9334, "lon": 32.8597},
|
||||||
cities = gc.get_cities()
|
"kuwait": {"code": "KW", "name": "Kuwait", "lat": 29.3759, "lon": 47.9774},
|
||||||
matches = []
|
"bahrain": {"code": "BH", "name": "Bahrain", "lat": 26.0667, "lon": 50.5577},
|
||||||
for gid, city in cities.items():
|
"katar": {"code": "QA", "name": "Qatar", "lat": 25.2854, "lon": 51.5310},
|
||||||
city_name = city.get("name", "")
|
"qatar": {"code": "QA", "name": "Qatar", "lat": 25.2854, "lon": 51.5310},
|
||||||
alt_names = city.get("alternatenames", "")
|
"jemen": {"code": "YE", "name": "Yemen", "lat": 15.3694, "lon": 44.1910},
|
||||||
if isinstance(alt_names, list):
|
"yemen": {"code": "YE", "name": "Yemen", "lat": 15.3694, "lon": 44.1910},
|
||||||
alt_list = [n.strip().lower() for n in alt_names if n.strip()]
|
"oman": {"code": "OM", "name": "Oman", "lat": 23.5880, "lon": 58.3829},
|
||||||
else:
|
"pakistan": {"code": "PK", "name": "Pakistan", "lat": 33.6844, "lon": 73.0479},
|
||||||
alt_list = [n.strip().lower() for n in str(alt_names).split(",") if n.strip()]
|
"afghanistan": {"code": "AF", "name": "Afghanistan", "lat": 34.5553, "lon": 69.2075},
|
||||||
if city_name.lower() == name_lower or name_lower in alt_list:
|
"aegypten": {"code": "EG", "name": "Egypt", "lat": 30.0444, "lon": 31.2357},
|
||||||
matches.append(city)
|
"egypt": {"code": "EG", "name": "Egypt", "lat": 30.0444, "lon": 31.2357},
|
||||||
|
"saudi-arabien": {"code": "SA", "name": "Saudi Arabia", "lat": 24.7136, "lon": 46.6753},
|
||||||
if matches:
|
"saudi arabia": {"code": "SA", "name": "Saudi Arabia", "lat": 24.7136, "lon": 46.6753},
|
||||||
# Disambiguierung: country_code bevorzugen, dann Population
|
"deutschland": {"code": "DE", "name": "Germany", "lat": 52.5200, "lon": 13.4050},
|
||||||
if country_code:
|
"germany": {"code": "DE", "name": "Germany", "lat": 52.5200, "lon": 13.4050},
|
||||||
cc_matches = [c for c in matches if c.get("countrycode", "").upper() == (country_code or "").upper()]
|
"frankreich": {"code": "FR", "name": "France", "lat": 48.8566, "lon": 2.3522},
|
||||||
if cc_matches:
|
"france": {"code": "FR", "name": "France", "lat": 48.8566, "lon": 2.3522},
|
||||||
matches = cc_matches
|
"russland": {"code": "RU", "name": "Russia", "lat": 55.7558, "lon": 37.6173},
|
||||||
best = max(matches, key=lambda c: c.get("population", 0))
|
"russia": {"code": "RU", "name": "Russia", "lat": 55.7558, "lon": 37.6173},
|
||||||
return {
|
"china": {"code": "CN", "name": "China", "lat": 39.9042, "lon": 116.4074},
|
||||||
"lat": float(best["latitude"]),
|
"indien": {"code": "IN", "name": "India", "lat": 28.6139, "lon": 77.2090},
|
||||||
"lon": float(best["longitude"]),
|
"india": {"code": "IN", "name": "India", "lat": 28.6139, "lon": 77.2090},
|
||||||
"country_code": best.get("countrycode", ""),
|
"usa": {"code": "US", "name": "United States", "lat": 38.9072, "lon": -77.0369},
|
||||||
"normalized_name": best["name"],
|
"vereinigte staaten": {"code": "US", "name": "United States", "lat": 38.9072, "lon": -77.0369},
|
||||||
"confidence": min(1.0, 0.6 + (best.get("population", 0) / 10_000_000)),
|
"united states": {"code": "US", "name": "United States", "lat": 38.9072, "lon": -77.0369},
|
||||||
}
|
"grossbritannien": {"code": "GB", "name": "United Kingdom", "lat": 51.5074, "lon": -0.1278},
|
||||||
|
"united kingdom": {"code": "GB", "name": "United Kingdom", "lat": 51.5074, "lon": -0.1278},
|
||||||
# 2. Laendersuche
|
"schweiz": {"code": "CH", "name": "Switzerland", "lat": 46.9480, "lon": 7.4474},
|
||||||
countries = gc.get_countries()
|
"switzerland": {"code": "CH", "name": "Switzerland", "lat": 46.9480, "lon": 7.4474},
|
||||||
for code, country in countries.items():
|
"spanien": {"code": "ES", "name": "Spain", "lat": 40.4168, "lon": -3.7038},
|
||||||
if country.get("name", "").lower() == name_lower:
|
"spain": {"code": "ES", "name": "Spain", "lat": 40.4168, "lon": -3.7038},
|
||||||
capital = country.get("capital", "")
|
"italien": {"code": "IT", "name": "Italy", "lat": 41.9028, "lon": 12.4964},
|
||||||
if capital:
|
"italy": {"code": "IT", "name": "Italy", "lat": 41.9028, "lon": 12.4964},
|
||||||
cap_result = _geocode_offline(capital)
|
"zypern": {"code": "CY", "name": "Cyprus", "lat": 35.1856, "lon": 33.3823},
|
||||||
if cap_result:
|
"cyprus": {"code": "CY", "name": "Cyprus", "lat": 35.1856, "lon": 33.3823},
|
||||||
cap_result["normalized_name"] = country["name"]
|
"aserbaidschan": {"code": "AZ", "name": "Azerbaijan", "lat": 40.4093, "lon": 49.8671},
|
||||||
cap_result["confidence"] = 0.5
|
"azerbaijan": {"code": "AZ", "name": "Azerbaijan", "lat": 40.4093, "lon": 49.8671},
|
||||||
return cap_result
|
"griechenland": {"code": "GR", "name": "Greece", "lat": 37.9838, "lon": 23.7275},
|
||||||
|
"greece": {"code": "GR", "name": "Greece", "lat": 37.9838, "lon": 23.7275},
|
||||||
return None
|
"niederlande": {"code": "NL", "name": "Netherlands", "lat": 52.3676, "lon": 4.9041},
|
||||||
|
"netherlands": {"code": "NL", "name": "Netherlands", "lat": 52.3676, "lon": 4.9041},
|
||||||
|
"ukraine": {"code": "UA", "name": "Ukraine", "lat": 50.4501, "lon": 30.5234},
|
||||||
def _geocode_location(name: str, country_code: str = "", haiku_coords: Optional[dict] = None) -> Optional[dict]:
|
}
|
||||||
"""Geocoded einen Ortsnamen. Prioritaet: geonamescache > Haiku-Koordinaten.
|
|
||||||
|
|
||||||
Args:
|
def _geocode_offline(name: str, country_code: str = "") -> Optional[dict]:
    """Offline geocoding via geonamescache.

    Lookup order: 1. known country aliases, 2. geonamescache countries,
    3. cities. Countries are ALWAYS checked before cities to avoid
    mix-ups (e.g. the city Lebanon/US vs. the country Lebanon, the city
    Jordan/HK vs. the country Jordan).
    """
    gc = _get_geonamescache()
    if gc is None:
        return None

    query = name.lower().strip()

    # 1. Known country aliases (fastest and safest path).
    alias = _COUNTRY_ALIASES.get(query)
    if alias is not None:
        return {
            "lat": alias["lat"],
            "lon": alias["lon"],
            "country_code": alias["code"],
            "normalized_name": alias["name"],
            "confidence": 0.95,
        }

    # 2. geonamescache country search (before cities!).
    for code, country in gc.get_countries().items():
        if country.get("name", "").lower() != query:
            continue
        capital = country.get("capital", "")
        if not capital:
            continue
        # Geocode the capital but label the result with the country name.
        cap_alias = _COUNTRY_ALIASES.get(capital.lower())
        if cap_alias is not None:
            return {
                "lat": cap_alias["lat"],
                "lon": cap_alias["lon"],
                "country_code": code,
                "normalized_name": country["name"],
                "confidence": 0.9,
            }
        # Otherwise resolve the capital through the city-only path.
        cap_hit = _geocode_city(capital, code)
        if cap_hit is not None:
            cap_hit["normalized_name"] = country["name"]
            cap_hit["confidence"] = 0.9
            return cap_hit

    # 3. City search (only when no country matched).
    return _geocode_city(name, country_code)
|
||||||
Antwort NUR als JSON-Array, kein anderer Text:
|
|
||||||
[{{"headline_idx": 0, "locations": [
|
|
||||||
{{"name": "Teheran", "normalized": "Tehran", "country_code": "IR",
|
def _geocode_city(name: str, country_code: str = "") -> Optional[dict]:
    """Look up a city name in geonamescache.

    Matches the primary name or any alternate name (case-insensitive).
    When several cities match, a matching country_code narrows the set
    first; the most populous remaining candidate wins.
    """
    gc = _get_geonamescache()
    if gc is None:
        return None

    query = name.lower().strip()

    candidates = []
    for city in gc.get_cities().values():
        alt_raw = city.get("alternatenames", "")
        # alternatenames may be a list or a comma-separated string,
        # depending on the geonamescache version.
        if isinstance(alt_raw, list):
            alternates = {a.strip().lower() for a in alt_raw if a.strip()}
        else:
            alternates = {a.strip().lower() for a in str(alt_raw).split(",") if a.strip()}
        if city.get("name", "").lower() == query or query in alternates:
            candidates.append(city)

    if not candidates:
        return None

    # Disambiguation: prefer the requested country, then population.
    if country_code:
        same_country = [
            c for c in candidates
            if c.get("countrycode", "").upper() == country_code.upper()
        ]
        if same_country:
            candidates = same_country

    best = max(candidates, key=lambda c: c.get("population", 0))
    return {
        "lat": float(best["latitude"]),
        "lon": float(best["longitude"]),
        "country_code": best.get("countrycode", ""),
        "normalized_name": best["name"],
        # Confidence grows with population, capped at 1.0.
        "confidence": min(1.0, 0.6 + (best.get("population", 0) / 10_000_000)),
    }
||||||
return {}
|
|
||||||
|
|
||||||
# JSON parsen (mit Regex-Fallback)
|
def _geocode_location(name: str, country_code: str = "", haiku_coords: Optional[dict] = None) -> Optional[dict]:
    """Geocode a place name. Priority: geonamescache > Haiku coordinates.

    Args:
        name: place name
        country_code: ISO-2 code (from Haiku)
        haiku_coords: {"lat": float, "lon": float} fallback from Haiku

    Returns:
        dict with lat/lon/country_code/normalized_name/confidence, or None
        when neither the offline lookup nor the fallback yields a result.
    """
    cache_key = f"{name.lower().strip()}|{(country_code or '').upper()}"

    # Fix: only the OFFLINE lookup is cached. Caching the final result
    # (as before) also memoized the Haiku fallback or its absence — a
    # cached None from a coordinate-less call shadowed the fallback for
    # later calls that DID supply haiku_coords, and a cached fallback
    # could go stale when later calls carried different coordinates.
    if cache_key in _geocode_cache:
        result = _geocode_cache[cache_key]
    else:
        result = _geocode_offline(name, country_code)
        _geocode_cache[cache_key] = result

    # Fallback: use Haiku-provided coordinates (low confidence).
    if result is None and haiku_coords:
        lat = haiku_coords.get("lat")
        lon = haiku_coords.get("lon")
        if lat is not None and lon is not None:
            result = {
                "lat": float(lat),
                "lon": float(lon),
                "country_code": country_code.upper() if country_code else "",
                "normalized_name": name,
                "confidence": 0.45,
            }

    return result
|
||||||
if not locations:
|
|
||||||
continue
|
|
||||||
|
# Prompt template for Haiku-based location extraction. Placeholders:
# {incident_context} (incident description) and {headlines} (numbered
# "[i] text" lines). Doubled braces escape the JSON example for .format().
HAIKU_GEOPARSE_PROMPT = """Extrahiere alle geographischen Orte aus diesen Nachrichten-Headlines.

Kontext der Lage: "{incident_context}"

Regeln:
- Nur echte Orte (Staedte, Laender, Regionen)
- Keine Personen, Organisationen, Gebaeude, Alltagswoerter
- Bei "US-Militaer" etc: Land (USA) extrahieren, nicht das Kompositum
- HTML-Tags ignorieren
- Jeder Ort nur einmal pro Headline
- Regionen wie "Middle East", "Gulf", "Naher Osten" NICHT extrahieren (kein einzelner Punkt auf der Karte)

Klassifiziere basierend auf dem Lage-Kontext:
- "target": Wo das Ereignis passiert / Schaden entsteht
- "response": Wo Reaktionen / Gegenmassnahmen stattfinden
- "actor": Wo Entscheidungen getroffen werden / Entscheider sitzen
- "mentioned": Nur erwaehnt, kein direkter Bezug

Headlines:
{headlines}

Antwort NUR als JSON-Array, kein anderer Text:
[{{"headline_idx": 0, "locations": [
{{"name": "Teheran", "normalized": "Tehran", "country_code": "IR",
"type": "city", "category": "target",
"lat": 35.69, "lon": 51.42}}
]}}]"""
|
||||||
return results
|
|
||||||
|
|
||||||
|
async def _extract_locations_haiku(
    headlines: list[dict], incident_context: str
) -> dict[int, list[dict]]:
    """Extract locations from headlines via Haiku.

    Args:
        headlines: [{"idx": article_id, "text": headline_text}, ...]
        incident_context: incident context used for category classification

    Returns:
        dict[article_id -> list[{name, normalized, country_code, type, category, lat, lon}]]
        Empty dict on any model or parse failure (best-effort contract).
    """
    if not headlines:
        return {}

    # Format headlines as "[i] text" so the model can reference them by index.
    headline_lines = [f"[{i}] {h['text']}" for i, h in enumerate(headlines)]

    prompt = HAIKU_GEOPARSE_PROMPT.format(
        incident_context=incident_context or "Allgemeine Nachrichtenlage",
        headlines="\n".join(headline_lines),
    )

    try:
        result_text, usage = await call_claude(prompt, tools=None, model=CLAUDE_MODEL_FAST)
    except Exception as e:
        logger.error(f"Haiku-Geoparsing fehlgeschlagen: {e}")
        return {}

    # Parse JSON (regex fallback strips prose the model may wrap around the array).
    parsed = None
    try:
        parsed = json.loads(result_text)
    except json.JSONDecodeError:
        match = re.search(r'\[.*\]', result_text, re.DOTALL)
        if match:
            try:
                parsed = json.loads(match.group())
            except json.JSONDecodeError:
                logger.warning("Haiku-Geoparsing: JSON-Parse fehlgeschlagen auch mit Regex-Fallback")
                return {}

    if not parsed or not isinstance(parsed, list):
        logger.warning("Haiku-Geoparsing: Kein gueltiges JSON-Array erhalten")
        return {}

    # Map model output back to article ids.
    results: dict[int, list[dict]] = {}
    for entry in parsed:
        if not isinstance(entry, dict):
            continue
        headline_idx = entry.get("headline_idx")
        # Fix: validate type AND range. The old check (is None / >= len)
        # let a negative index silently map to the WRONG article via
        # Python's negative indexing, and a non-int value from the model
        # raised TypeError on the comparison.
        if not isinstance(headline_idx, int) or not 0 <= headline_idx < len(headlines):
            continue

        article_id = headlines[headline_idx]["idx"]
        locations = entry.get("locations", [])
        if not locations:
            continue

        article_locs = []
        for loc in locations:
            if not isinstance(loc, dict):
                continue
            loc_type = loc.get("type", "city")
            # Regions are dropped (no meaningful single point on the map).
            if loc_type == "region":
                continue
            name = loc.get("name", "")
            if not name:
                continue
            article_locs.append({
                "name": name,
                "normalized": loc.get("normalized", name),
                "country_code": loc.get("country_code", ""),
                "type": loc_type,
                "category": loc.get("category", "mentioned"),
                "lat": loc.get("lat"),
                "lon": loc.get("lon"),
            })

        if article_locs:
            results[article_id] = article_locs

    return results
|
||||||
|
|
||||||
|
|
||||||
|
async def geoparse_articles(
    articles: list[dict],
    incident_context: str = "",
) -> dict[int, list[dict]]:
    """Geoparse a list of articles via Haiku + geonamescache.

    Args:
        articles: article dicts (with id, headline, headline_de, language)
        incident_context: incident context (title + description) for
            context-aware classification

    Returns:
        dict[article_id -> list[{location_name, location_name_normalized, country_code,
                                 lat, lon, confidence, source_text, category}]]
    """
    if not articles:
        return {}

    # Collect headlines, preferring the German translation.
    headlines = []
    for article in articles:
        article_id = article.get("id")
        if not article_id:
            continue
        text = (article.get("headline_de") or article.get("headline") or "").strip()
        if text:
            headlines.append({"idx": article_id, "text": text})

    if not headlines:
        return {}

    # Batch the Haiku calls (max 50 headlines per request).
    batch_size = 50
    all_haiku_results: dict[int, list[dict]] = {}
    for start in range(0, len(headlines), batch_size):
        chunk = headlines[start:start + batch_size]
        all_haiku_results.update(await _extract_locations_haiku(chunk, incident_context))

    if not all_haiku_results:
        return {}

    # Geocode via geonamescache, with Haiku coordinates as fallback.
    result: dict[int, list[dict]] = {}
    for article_id, haiku_locs in all_haiku_results.items():
        locations = []
        for loc in haiku_locs:
            haiku_coords = None
            if loc.get("lat") is not None and loc.get("lon") is not None:
                haiku_coords = {"lat": loc["lat"], "lon": loc["lon"]}

            geo = _geocode_location(
                loc["normalized"],
                loc.get("country_code", ""),
                haiku_coords,
            )
            if not geo:
                continue

            locations.append({
                "location_name": loc["name"],
                "location_name_normalized": geo["normalized_name"],
                "country_code": geo["country_code"],
                "lat": geo["lat"],
                "lon": geo["lon"],
                "confidence": geo["confidence"],
                "source_text": loc["name"],
                "category": loc.get("category", "mentioned"),
            })

        if locations:
            result[article_id] = locations

    return result
|
||||||
|
|||||||
In neuem Issue referenzieren
Einen Benutzer sperren