From 7e600184e85c838323cec7956c2c8a60ffa833ef Mon Sep 17 00:00:00 2001 From: claude-dev Date: Sat, 7 Mar 2026 21:00:01 +0100 Subject: [PATCH] =?UTF-8?q?Geoparsing:=20Nominatim-Plausibilit=C3=A4tspr?= =?UTF-8?q?=C3=BCfung=20+=20regionale=20Stopwords?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - SequenceMatcher prüft ob Nominatim-Ergebnis zum Suchbegriff passt (Schwelle 0.3) - Verwirft Fehlzuordnungen wie 'Golf-Staaten' -> Uganda - Regionale/vage Begriffe als Stopwords (Naher Osten, Golf-Staaten, Balkan etc.) - Falscher DB-Eintrag (Botschafter-Residenz Uganda) bereinigt Co-Authored-By: Claude Opus 4.6 --- src/agents/geoparsing.py | 71 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 70 insertions(+), 1 deletion(-) diff --git a/src/agents/geoparsing.py b/src/agents/geoparsing.py index 4ec7165..1a18569 100644 --- a/src/agents/geoparsing.py +++ b/src/agents/geoparsing.py @@ -2,6 +2,7 @@ import asyncio import logging import re +from difflib import SequenceMatcher from typing import Optional logger = logging.getLogger("osint.geoparsing") @@ -19,12 +20,70 @@ LOCATION_STOPWORDS = { "Twitter", "Facebook", "Telegram", "Signal", "WhatsApp", "Reuters", "AP", "AFP", "DPA", "dpa", "Internet", "Online", "Web", + # Regionale/vage Begriffe (kein einzelner Punkt auf der Karte) + "Naher Osten", "Mittlerer Osten", "Middle East", "Near East", + "Golf-Staaten", "Golfstaaten", "Golfregion", "Gulf States", "Persian Gulf", + "Nordafrika", "Subsahara", "Zentralasien", "Suedostasien", + "Westeuropa", "Osteuropa", "Suedeuropa", "Nordeuropa", + "Balkan", "Kaukasus", "Levante", "Maghreb", "Sahel", + "Arabische Welt", "Arab World", } # Maximale Textlaenge fuer NER-Verarbeitung MAX_TEXT_LENGTH = 10000 +# Marker-Kategorien fuer Karten-Klassifizierung +CATEGORY_KEYWORDS = { + "target": [ + "angriff", "angegriff", "bombardier", "luftschlag", "luftangriff", + "beschuss", "beschossen", "getroffen", "zerstoer", "einschlag", + "detonation", "explosion", "strike", "attack", "bombed", "hit", + "shelled", "destroyed", "targeted", "missile hit", "air strike", + "airstrike", "bombardment", "killed", "casualties", "dead", + "tote", "opfer", "getoetet", + ], + "retaliation": [ + "gegenschlag", "vergeltung", "reaktion", "gegenangriff", + "abgefeuert", "retaliat", "counter-attack", "counterattack", + "counter-strike", "response", "fired back", "launched", + "rakete abgefeuert", "vergeltungsschlag", "abfangen", + "abgefangen", "intercepted", "eskalation", "escalat", + ], + "actor": [ + "regierung", "praesident", "ministerium", "hauptquartier", + "kommando", "nato", "pentagon", "kongress", "senat", "parlament", + "government", "president", "ministry", "headquarters", "command", + "congress", "senate", "parliament", "white house", "weisses haus", + "verteidigungsminister", "aussenminister", "generalstab", + "defense secretary", "secretary of state", "general staff", + "un-sicherheitsrat", "security council", "summit", "gipfel", + "diplomati", "botschaft", "embassy", + ], +} + + +def _classify_location(source_text: str, article_text: str = "") -> str: + """Klassifiziert eine Location basierend auf dem Kontext. + + Returns: + Kategorie: 'target', 'retaliation', 'actor', oder 'mentioned' + """ + text = (source_text + " " + article_text[:500]).lower() + + scores = {"target": 0, "retaliation": 0, "actor": 0} + for category, keywords in CATEGORY_KEYWORDS.items(): + for kw in keywords: + if kw in text: + scores[category] += 1 + + best = max(scores, key=scores.get) + if scores[best] >= 1: + return best + return "mentioned" + + + def _load_spacy_model(lang: str): """Laedt ein spaCy-Modell lazy (nur beim ersten Aufruf).""" global _nlp_de, _nlp_en @@ -206,11 +265,19 @@ def _geocode_nominatim(name: str) -> Optional[dict]: if "address" in raw: country_code = raw["address"].get("country_code", "").upper() + normalized_name = location.address.split(",")[0] if location.address else name + + # Plausibilitaetspruefung: Nominatim-Ergebnis muss zum Suchbegriff passen + similarity = SequenceMatcher(None, name.lower(), normalized_name.lower()).ratio() + if similarity < 0.3: + logger.debug(f"Nominatim-Ergebnis verworfen: '{name}' -> '{normalized_name}' (Aehnlichkeit {similarity:.2f})") + return None + return { "lat": float(location.latitude), "lon": float(location.longitude), "country_code": country_code, - "normalized_name": location.address.split(",")[0] if location.address else name, + "normalized_name": normalized_name, "confidence": 0.4, # Nominatim-Ergebnis = niedrigere Konfidenz } except (GeocoderTimedOut, GeocoderServiceError) as e: @@ -276,6 +343,7 @@ async def geoparse_articles(articles: list[dict]) -> dict[int, list[dict]]: for loc in locations_raw: geo = await asyncio.to_thread(_geocode_location, loc["name"]) if geo: + category = _classify_location(loc.get("source_text", ""), text) locations.append({ "location_name": loc["name"], "location_name_normalized": geo["normalized_name"], @@ -284,6 +352,7 @@ async def geoparse_articles(articles: list[dict]) -> dict[int, list[dict]]: "lon": geo["lon"], "confidence": geo["confidence"], "source_text": loc.get("source_text", ""), + "category": category, }) if locations: