Geoparsing: Nominatim-Plausibilitätsprüfung + regionale Stopwords
- SequenceMatcher prüft, ob das Nominatim-Ergebnis zum Suchbegriff passt (Schwelle 0.3)
- Verwirft Fehlzuordnungen wie 'Golf-Staaten' -> Uganda
- Regionale/vage Begriffe als Stopwords (Naher Osten, Golf-Staaten, Balkan etc.)
- Falscher DB-Eintrag (Botschafter-Residenz Uganda) bereinigt

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Dieser Commit ist enthalten in:
@@ -2,6 +2,7 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
|
from difflib import SequenceMatcher
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
logger = logging.getLogger("osint.geoparsing")
|
logger = logging.getLogger("osint.geoparsing")
|
||||||
@@ -19,12 +20,70 @@ LOCATION_STOPWORDS = {
|
|||||||
"Twitter", "Facebook", "Telegram", "Signal", "WhatsApp",
|
"Twitter", "Facebook", "Telegram", "Signal", "WhatsApp",
|
||||||
"Reuters", "AP", "AFP", "DPA", "dpa",
|
"Reuters", "AP", "AFP", "DPA", "dpa",
|
||||||
"Internet", "Online", "Web",
|
"Internet", "Online", "Web",
|
||||||
|
# Regionale/vage Begriffe (kein einzelner Punkt auf der Karte)
|
||||||
|
"Naher Osten", "Mittlerer Osten", "Middle East", "Near East",
|
||||||
|
"Golf-Staaten", "Golfstaaten", "Golfregion", "Gulf States", "Persian Gulf",
|
||||||
|
"Nordafrika", "Subsahara", "Zentralasien", "Suedostasien",
|
||||||
|
"Westeuropa", "Osteuropa", "Suedeuropa", "Nordeuropa",
|
||||||
|
"Balkan", "Kaukasus", "Levante", "Maghreb", "Sahel",
|
||||||
|
"Arabische Welt", "Arab World",
|
||||||
}
|
}
|
||||||
|
|
||||||
# Maximale Textlaenge fuer NER-Verarbeitung
|
# Maximale Textlaenge fuer NER-Verarbeitung
|
||||||
MAX_TEXT_LENGTH = 10000
|
MAX_TEXT_LENGTH = 10000
|
||||||
|
|
||||||
|
|
||||||
|
# Marker-Kategorien fuer Karten-Klassifizierung
|
||||||
|
CATEGORY_KEYWORDS = {
|
||||||
|
"target": [
|
||||||
|
"angriff", "angegriff", "bombardier", "luftschlag", "luftangriff",
|
||||||
|
"beschuss", "beschossen", "getroffen", "zerstoer", "einschlag",
|
||||||
|
"detonation", "explosion", "strike", "attack", "bombed", "hit",
|
||||||
|
"shelled", "destroyed", "targeted", "missile hit", "air strike",
|
||||||
|
"airstrike", "bombardment", "killed", "casualties", "dead",
|
||||||
|
"tote", "opfer", "getoetet",
|
||||||
|
],
|
||||||
|
"retaliation": [
|
||||||
|
"gegenschlag", "vergeltung", "reaktion", "gegenangriff",
|
||||||
|
"abgefeuert", "retaliat", "counter-attack", "counterattack",
|
||||||
|
"counter-strike", "response", "fired back", "launched",
|
||||||
|
"rakete abgefeuert", "vergeltungsschlag", "abfangen",
|
||||||
|
"abgefangen", "intercepted", "eskalation", "escalat",
|
||||||
|
],
|
||||||
|
"actor": [
|
||||||
|
"regierung", "praesident", "ministerium", "hauptquartier",
|
||||||
|
"kommando", "nato", "pentagon", "kongress", "senat", "parlament",
|
||||||
|
"government", "president", "ministry", "headquarters", "command",
|
||||||
|
"congress", "senate", "parliament", "white house", "weisses haus",
|
||||||
|
"verteidigungsminister", "aussenminister", "generalstab",
|
||||||
|
"defense secretary", "secretary of state", "general staff",
|
||||||
|
"un-sicherheitsrat", "security council", "summit", "gipfel",
|
||||||
|
"diplomati", "botschaft", "embassy",
|
||||||
|
],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _classify_location(source_text: str, article_text: str = "") -> str:
|
||||||
|
"""Klassifiziert eine Location basierend auf dem Kontext.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Kategorie: 'target', 'retaliation', 'actor', oder 'mentioned'
|
||||||
|
"""
|
||||||
|
text = (source_text + " " + article_text[:500]).lower()
|
||||||
|
|
||||||
|
scores = {"target": 0, "retaliation": 0, "actor": 0}
|
||||||
|
for category, keywords in CATEGORY_KEYWORDS.items():
|
||||||
|
for kw in keywords:
|
||||||
|
if kw in text:
|
||||||
|
scores[category] += 1
|
||||||
|
|
||||||
|
best = max(scores, key=scores.get)
|
||||||
|
if scores[best] >= 1:
|
||||||
|
return best
|
||||||
|
return "mentioned"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def _load_spacy_model(lang: str):
|
def _load_spacy_model(lang: str):
|
||||||
"""Laedt ein spaCy-Modell lazy (nur beim ersten Aufruf)."""
|
"""Laedt ein spaCy-Modell lazy (nur beim ersten Aufruf)."""
|
||||||
global _nlp_de, _nlp_en
|
global _nlp_de, _nlp_en
|
||||||
@@ -206,11 +265,19 @@ def _geocode_nominatim(name: str) -> Optional[dict]:
|
|||||||
if "address" in raw:
|
if "address" in raw:
|
||||||
country_code = raw["address"].get("country_code", "").upper()
|
country_code = raw["address"].get("country_code", "").upper()
|
||||||
|
|
||||||
|
normalized_name = location.address.split(",")[0] if location.address else name
|
||||||
|
|
||||||
|
# Plausibilitaetspruefung: Nominatim-Ergebnis muss zum Suchbegriff passen
|
||||||
|
similarity = SequenceMatcher(None, name.lower(), normalized_name.lower()).ratio()
|
||||||
|
if similarity < 0.3:
|
||||||
|
logger.debug(f"Nominatim-Ergebnis verworfen: '{name}' -> '{normalized_name}' (Aehnlichkeit {similarity:.2f})")
|
||||||
|
return None
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"lat": float(location.latitude),
|
"lat": float(location.latitude),
|
||||||
"lon": float(location.longitude),
|
"lon": float(location.longitude),
|
||||||
"country_code": country_code,
|
"country_code": country_code,
|
||||||
"normalized_name": location.address.split(",")[0] if location.address else name,
|
"normalized_name": normalized_name,
|
||||||
"confidence": 0.4, # Nominatim-Ergebnis = niedrigere Konfidenz
|
"confidence": 0.4, # Nominatim-Ergebnis = niedrigere Konfidenz
|
||||||
}
|
}
|
||||||
except (GeocoderTimedOut, GeocoderServiceError) as e:
|
except (GeocoderTimedOut, GeocoderServiceError) as e:
|
||||||
@@ -276,6 +343,7 @@ async def geoparse_articles(articles: list[dict]) -> dict[int, list[dict]]:
|
|||||||
for loc in locations_raw:
|
for loc in locations_raw:
|
||||||
geo = await asyncio.to_thread(_geocode_location, loc["name"])
|
geo = await asyncio.to_thread(_geocode_location, loc["name"])
|
||||||
if geo:
|
if geo:
|
||||||
|
category = _classify_location(loc.get("source_text", ""), text)
|
||||||
locations.append({
|
locations.append({
|
||||||
"location_name": loc["name"],
|
"location_name": loc["name"],
|
||||||
"location_name_normalized": geo["normalized_name"],
|
"location_name_normalized": geo["normalized_name"],
|
||||||
@@ -284,6 +352,7 @@ async def geoparse_articles(articles: list[dict]) -> dict[int, list[dict]]:
|
|||||||
"lon": geo["lon"],
|
"lon": geo["lon"],
|
||||||
"confidence": geo["confidence"],
|
"confidence": geo["confidence"],
|
||||||
"source_text": loc.get("source_text", ""),
|
"source_text": loc.get("source_text", ""),
|
||||||
|
"category": category,
|
||||||
})
|
})
|
||||||
|
|
||||||
if locations:
|
if locations:
|
||||||
|
|||||||
In neuem Issue referenzieren
Einen Benutzer sperren