feat: context-dependent map categories

Four fixed color tiers (primary/secondary/tertiary/mentioned) with
variable per-incident labels generated by Haiku.

- DB: category_labels column on incidents; old categories migrated
  (target->primary, response/retaliation->secondary, actor->tertiary)
- Geoparsing: generate_category_labels() plus a new prompt using the new keys
- QC: category check switched over to the new keys
- Orchestrator: tuple return value; labels persisted to the DB
- API: category_labels in the locations and Lagebild responses
- Frontend: dynamic legend built from API labels, with fallback defaults
- Migration script for existing incidents (see the sketch below)
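
A minimal sketch of what that migration could look like, assuming the aiosqlite-style db handle and the incidents/article_locations tables that appear in the diffs below; the actual script is not part of this excerpt:

# Hypothetical migration sketch -- not the committed script.
OLD_TO_NEW = {
    "target": "primary",
    "response": "secondary",
    "retaliation": "secondary",
    "actor": "tertiary",
}

async def migrate(db):
    # New JSON column on incidents; guard against re-running as needed.
    await db.execute("ALTER TABLE incidents ADD COLUMN category_labels TEXT")
    # Remap existing map locations onto the four fixed tiers.
    for old, new in OLD_TO_NEW.items():
        await db.execute(
            "UPDATE article_locations SET category = ? WHERE category = ?",
            (new, old),
        )
    await db.commit()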

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Claude Dev
2026-03-15 15:04:02 +01:00
Parent 5fd65657c5
Commit 19da099583
9 changed files with 1315 additions and 1012 deletions

View file

@@ -209,6 +209,90 @@ def _geocode_location(name: str, country_code: str = "", haiku_coords: Optional[
return result
# Default-Labels (Fallback wenn Haiku keine generiert)
DEFAULT_CATEGORY_LABELS = {
"primary": "Hauptgeschehen",
"secondary": "Reaktionen",
"tertiary": "Beteiligte",
"mentioned": "Erwaehnt",
}
CATEGORY_LABELS_PROMPT = """Generiere kurze, praegnante Kategorie-Labels fuer Karten-Pins zu dieser Nachrichtenlage.
Lage: "{incident_context}"
Es gibt 4 Farbstufen fuer Orte auf der Karte:
1. primary (Rot): Wo das Hauptgeschehen stattfindet
2. secondary (Orange): Direkte Reaktionen/Gegenmassnahmen
3. tertiary (Blau): Entscheidungstraeger/Beteiligte
4. mentioned (Grau): Nur erwaehnt
Generiere fuer jede Stufe ein kurzes Label (1-3 Woerter), das zum Thema passt.
Wenn eine Stufe fuer dieses Thema nicht sinnvoll ist, setze null.
Beispiele:
- Militaerkonflikt Iran: {{"primary": "Kampfschauplätze", "secondary": "Vergeltungsschläge", "tertiary": "Strategische Akteure", "mentioned": "Erwähnt"}}
- Erdbeben Tuerkei: {{"primary": "Katastrophenzone", "secondary": "Hilfsoperationen", "tertiary": "Geberländer", "mentioned": "Erwähnt"}}
- Bundestagswahl: {{"primary": "Wahlkreise", "secondary": "Koalitionspartner", "tertiary": "Internationale Reaktionen", "mentioned": "Erwähnt"}}
Antworte NUR als JSON-Objekt:"""
async def generate_category_labels(incident_context: str) -> dict[str, str | None]:
"""Generiert kontextabhaengige Kategorie-Labels via Haiku.
Args:
incident_context: Lage-Titel + Beschreibung
Returns:
Dict mit Labels fuer primary/secondary/tertiary/mentioned (oder None wenn nicht passend)
"""
if not incident_context or not incident_context.strip():
return dict(DEFAULT_CATEGORY_LABELS)
prompt = CATEGORY_LABELS_PROMPT.format(incident_context=incident_context[:500])
try:
result_text, usage = await call_claude(prompt, tools=None, model=CLAUDE_MODEL_FAST)
parsed = None
try:
parsed = json.loads(result_text)
except json.JSONDecodeError:
match = re.search(r'\{.*\}', result_text, re.DOTALL)
if match:
try:
parsed = json.loads(match.group())
except json.JSONDecodeError:
pass
if not parsed or not isinstance(parsed, dict):
logger.warning("generate_category_labels: Kein gueltiges JSON erhalten")
return dict(DEFAULT_CATEGORY_LABELS)
# Validierung: Nur erlaubte Keys, Werte muessen str oder None sein
valid_keys = {"primary", "secondary", "tertiary", "mentioned"}
labels = {}
for key in valid_keys:
val = parsed.get(key)
if val is None or val == "null":
labels[key] = None
elif isinstance(val, str) and val.strip():
labels[key] = val.strip()
else:
labels[key] = DEFAULT_CATEGORY_LABELS.get(key)
# mentioned sollte immer einen Wert haben
if not labels.get("mentioned"):
labels["mentioned"] = "Erwaehnt"
logger.info(f"Kategorie-Labels generiert: {labels}")
return labels
except Exception as e:
logger.error(f"generate_category_labels fehlgeschlagen: {e}")
return dict(DEFAULT_CATEGORY_LABELS)
HAIKU_GEOPARSE_PROMPT = """Extrahiere alle geographischen Orte aus diesen Nachrichten-Headlines.
Kontext der Lage: "{incident_context}"
@@ -222,9 +306,9 @@ Regeln:
- Regionen wie "Middle East", "Gulf", "Naher Osten" NICHT extrahieren (kein einzelner Punkt auf der Karte)
Klassifiziere basierend auf dem Lage-Kontext:
- "target": Wo das Ereignis passiert / Schaden entsteht
- "response": Wo Reaktionen / Gegenmassnahmen stattfinden
- "actor": Wo Entscheidungen getroffen werden / Entscheider sitzen
- "primary": Wo das Hauptgeschehen stattfindet (z.B. Angriffsziele, Katastrophenzone, Wahlkreise)
- "secondary": Direkte Reaktionen oder Gegenmassnahmen (z.B. Vergeltung, Hilfsoperationen)
- "tertiary": Entscheidungstraeger, Beteiligte (z.B. wo Entscheidungen getroffen werden)
- "mentioned": Nur erwaehnt, kein direkter Bezug
Headlines:
@@ -233,7 +317,7 @@ Headlines:
Antwort NUR als JSON-Array, kein anderer Text:
[{{"headline_idx": 0, "locations": [
{{"name": "Teheran", "normalized": "Tehran", "country_code": "IR",
"type": "city", "category": "target",
"type": "city", "category": "primary",
"lat": 35.69, "lon": 51.42}}
]}}]"""
@@ -314,12 +398,19 @@ async def _extract_locations_haiku(
if not name:
continue
raw_cat = loc.get("category", "mentioned")
# Alte Kategorien mappen (falls Haiku sie noch generiert)
cat_map = {"target": "primary", "response": "secondary", "retaliation": "secondary", "actor": "tertiary", "context": "tertiary"}
category = cat_map.get(raw_cat, raw_cat)
if category not in ("primary", "secondary", "tertiary", "mentioned"):
category = "mentioned"
article_locs.append({
"name": name,
"normalized": loc.get("normalized", name),
"country_code": loc.get("country_code", ""),
"type": loc_type,
"category": loc.get("category", "mentioned"),
"category": category,
"lat": loc.get("lat"),
"lon": loc.get("lon"),
})
@@ -333,7 +424,7 @@ async def _extract_locations_haiku(
async def geoparse_articles(
articles: list[dict],
incident_context: str = "",
-) -> dict[int, list[dict]]:
+) -> tuple[dict[int, list[dict]], dict[str, str | None] | None]:
"""Geoparsing fuer eine Liste von Artikeln via Haiku + geonamescache.
Args:
@@ -341,11 +432,15 @@ async def geoparse_articles(
incident_context: Lage-Kontext (Titel + Beschreibung) fuer kontextbewusste Klassifizierung
Returns:
-dict[article_id -> list[{location_name, location_name_normalized, country_code,
-lat, lon, confidence, source_text, category}]]
+Tuple von (dict[article_id -> list[locations]], category_labels oder None)
"""
if not articles:
-return {}
+return {}, None
# Labels parallel zum Geoparsing generieren (nur wenn Kontext vorhanden)
labels_task = None
if incident_context:
labels_task = asyncio.create_task(generate_category_labels(incident_context))
# Headlines sammeln
headlines = []
@@ -363,7 +458,13 @@ async def geoparse_articles(
headlines.append({"idx": article_id, "text": headline})
if not headlines:
-return {}
+category_labels = None
+if labels_task:
+try:
+category_labels = await labels_task
+except Exception:
+pass
+return {}, category_labels
# Batches bilden (max 50 Headlines pro Haiku-Call)
batch_size = 50
@@ -374,7 +475,13 @@ async def geoparse_articles(
all_haiku_results.update(batch_results)
if not all_haiku_results:
-return {}
+category_labels = None
+if labels_task:
+try:
+category_labels = await labels_task
+except Exception:
+pass
+return {}, category_labels
# Geocoding via geonamescache (mit Haiku-Koordinaten als Fallback)
result = {}
@@ -406,4 +513,12 @@ async def geoparse_articles(
if locations:
result[article_id] = locations
-return result
+# Category-Labels abwarten
+category_labels = None
+if labels_task:
+try:
+category_labels = await labels_task
+except Exception as e:
+logger.warning(f"Category-Labels konnten nicht generiert werden: {e}")
+return result, category_labels
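
For illustration, a hypothetical call to the new helper; the output shape follows the examples baked into CATEGORY_LABELS_PROMPT above, and on any failure the function falls back to DEFAULT_CATEGORY_LABELS:

# Hypothetical usage -- the context string is invented:
labels = await generate_category_labels("Erdbeben Tuerkei")
# -> e.g. {"primary": "Katastrophenzone", "secondary": "Hilfsoperationen",
#          "tertiary": "Geberländer", "mentioned": "Erwähnt"}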

View file

@@ -782,7 +782,7 @@ class AgentOrchestrator:
from agents.geoparsing import geoparse_articles
incident_context = f"{title} - {description}"
logger.info(f"Geoparsing fuer {len(new_articles_for_analysis)} neue Artikel...")
-geo_results = await geoparse_articles(new_articles_for_analysis, incident_context)
+geo_results, category_labels = await geoparse_articles(new_articles_for_analysis, incident_context)
geo_count = 0
for art_id, locations in geo_results.items():
for loc in locations:
@@ -799,6 +799,15 @@ class AgentOrchestrator:
if geo_count > 0:
await db.commit()
logger.info(f"Geoparsing: {geo_count} Orte aus {len(geo_results)} Artikeln gespeichert")
# Category-Labels in Incident speichern (nur wenn neu generiert)
if category_labels:
import json as _json
await db.execute(
"UPDATE incidents SET category_labels = ? WHERE id = ? AND category_labels IS NULL",
(_json.dumps(category_labels, ensure_ascii=False), incident_id),
)
await db.commit()
logger.info(f"Category-Labels gespeichert fuer Incident {incident_id}: {category_labels}")
except Exception as e:
logger.warning(f"Geoparsing fehlgeschlagen (Pipeline laeuft weiter): {e}")

File diff suppressed because it is too large.

View file

@@ -338,8 +338,8 @@ async def get_locations(
"source_url": row["source_url"],
})
-# Dominanteste Kategorie pro Ort bestimmen (Prioritaet: target > retaliation > actor > mentioned)
-priority = {"target": 4, "retaliation": 3, "actor": 2, "mentioned": 1}
+# Dominanteste Kategorie pro Ort bestimmen (Prioritaet: primary > secondary > tertiary > mentioned)
+priority = {"primary": 4, "secondary": 3, "tertiary": 2, "mentioned": 1}
result = []
for loc in loc_map.values():
cats = loc.pop("categories")
@@ -349,7 +349,20 @@ async def get_locations(
best_cat = "mentioned"
loc["category"] = best_cat
result.append(loc)
-return result
+# Category-Labels aus Incident laden
+cursor = await db.execute(
+"SELECT category_labels FROM incidents WHERE id = ?", (incident_id,)
+)
+inc_row = await cursor.fetchone()
+category_labels = None
+if inc_row and inc_row["category_labels"]:
+try:
+category_labels = json.loads(inc_row["category_labels"])
+except (json.JSONDecodeError, TypeError):
+pass
+return {"category_labels": category_labels, "locations": result}
# Geoparse-Status pro Incident (in-memory)
@@ -395,8 +408,23 @@ async def _run_geoparse_background(incident_id: int, tenant_id: int | None):
processed = 0
for i in range(0, total, batch_size):
batch = articles[i:i + batch_size]
-geo_results = await geoparse_articles(batch, incident_context)
-for art_id, locations in geo_results.items():
+geo_result = await geoparse_articles(batch, incident_context)
+# Tuple-Rückgabe: (locations_dict, category_labels)
+if isinstance(geo_result, tuple):
+batch_geo_results, batch_labels = geo_result
+# Labels beim ersten Batch speichern
+if batch_labels and i == 0:
+try:
+await db.execute(
+"UPDATE incidents SET category_labels = ? WHERE id = ? AND category_labels IS NULL",
+(json.dumps(batch_labels, ensure_ascii=False), incident_id),
+)
+await db.commit()
+except Exception:
+pass
+else:
+batch_geo_results = geo_result
+for art_id, locations in batch_geo_results.items():
for loc in locations:
await db.execute(
"""INSERT INTO article_locations

View file

@@ -64,6 +64,14 @@ async def get_lagebild(db=Depends(db_dependency)):
raise HTTPException(status_code=404, detail="Incident not found")
incident = dict(incident)
# Category-Labels laden
category_labels = None
if incident.get("category_labels"):
try:
category_labels = json.loads(incident["category_labels"])
except (json.JSONDecodeError, TypeError):
pass
# Alle Artikel aus allen Iran-Incidents laden
cursor = await db.execute(
f"""SELECT id, headline, headline_de, source, source_url, language,
@@ -148,6 +156,7 @@ async def get_lagebild(db=Depends(db_dependency)):
"fact_checks": fact_checks,
"available_snapshots": available_snapshots,
"locations": locations,
"category_labels": category_labels,
}
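
Both endpoints now expose the labels alongside the locations. For illustration, a hypothetical payload in the new shape (values invented; field names from the code above):

# Hypothetical response body of the locations endpoint:
{
    "category_labels": {
        "primary": "Kampfschauplätze",
        "secondary": "Vergeltungsschläge",
        "tertiary": "Strategische Akteure",
        "mentioned": "Erwähnt",
    },
    "locations": [
        {"location_name": "Tehran", "lat": 35.69, "lon": 51.42, "category": "primary"},
    ],
}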

View file

@@ -1,389 +1,415 @@
"""Post-Refresh Quality Check via Haiku.
Prueft nach jedem Refresh:
1. Semantische Faktencheck-Duplikate (Haiku-Clustering mit Fuzzy-Vorfilter)
2. Falsch kategorisierte Karten-Locations (Haiku bewertet Kontext der Lage)
Regelbasierte Listen dienen als Fallback falls Haiku fehlschlaegt.
"""
import json
import logging
import re
from difflib import SequenceMatcher
from agents.claude_client import call_claude
from config import CLAUDE_MODEL_FAST
logger = logging.getLogger("osint.post_refresh_qc")
STATUS_PRIORITY = {
"confirmed": 5, "established": 5,
"contradicted": 4, "disputed": 4,
"unconfirmed": 3, "unverified": 3,
"developing": 1,
}
# ---------------------------------------------------------------------------
# 1. Faktencheck-Duplikate
# ---------------------------------------------------------------------------
_DEDUP_PROMPT = """\
Du bist ein Deduplizierungs-Agent fuer Faktenchecks eines OSINT-Monitors.
LAGE: {incident_title}
Unten stehen Faktenchecks (ID + Status + Claim). Finde Gruppen von Fakten,
die INHALTLICH DASSELBE aussagen, auch wenn sie unterschiedlich formuliert sind.
REGELN:
- Gleicher Sachverhalt = gleiche Gruppe
(z.B. "Trump fordert Kapitulation" und "US-Praesident verlangt bedingungslose Aufgabe")
- Unterschiedliche Detailtiefe zum SELBEN Fakt = gleiche Gruppe
- VERSCHIEDENE Sachverhalte = VERSCHIEDENE Gruppen
(z.B. "Angriff auf Isfahan" vs "Angriff auf Teheran" sind NICHT dasselbe)
- Eine Gruppe muss mindestens 2 Eintraege haben
Antworte NUR als JSON-Array von Gruppen. Jede Gruppe ist ein Array von IDs:
[[1,5,12], [3,8]]
Wenn keine Duplikate: antworte mit []
FAKTEN:
{facts_text}"""
async def _haiku_find_duplicate_clusters(
facts: list[dict], incident_title: str
) -> list[list[int]]:
"""Fragt Haiku welche Fakten semantische Duplikate sind."""
facts_text = "\n".join(
f'ID={f["id"]} [{f["status"]}]: {f["claim"]}'
for f in facts
)
prompt = _DEDUP_PROMPT.format(
incident_title=incident_title, facts_text=facts_text
)
try:
result, _usage = await call_claude(prompt, tools=None, model=CLAUDE_MODEL_FAST)
data = json.loads(result)
if isinstance(data, list) and all(isinstance(g, list) for g in data):
return data
except json.JSONDecodeError:
match = re.search(r'\[.*\]', result, re.DOTALL)
if match:
try:
data = json.loads(match.group())
if isinstance(data, list):
return data
except json.JSONDecodeError:
pass
except Exception as e:
logger.warning("Haiku Duplikat-Clustering fehlgeschlagen: %s", e)
return []
def _fuzzy_prefilter(all_facts: list[dict], max_candidates: int = 80) -> list[dict]:
"""Waehlt Kandidaten fuer Haiku-Check per Fuzzy-Vorfilter aus.
Findet Paare mit Aehnlichkeit >= 0.60 und gibt die betroffenen Fakten zurueck.
Begrenzt auf max_candidates um Haiku-Tokens zu sparen.
"""
from agents.factchecker import normalize_claim, _keyword_set
if len(all_facts) <= max_candidates:
return all_facts
normalized = []
for f in all_facts:
nc = normalize_claim(f["claim"])
kw = _keyword_set(f["claim"])
normalized.append((f, nc, kw))
candidate_ids = set()
recent = normalized[:60]
for i, (fact_a, norm_a, kw_a) in enumerate(recent):
for j, (fact_b, norm_b, kw_b) in enumerate(normalized):
if i >= j or fact_b["id"] == fact_a["id"]:
continue
if not norm_a or not norm_b:
continue
len_ratio = len(norm_a) / len(norm_b) if norm_b else 0
if len_ratio > 2.5 or len_ratio < 0.4:
continue
seq_ratio = SequenceMatcher(None, norm_a, norm_b).ratio()
kw_union = kw_a | kw_b
jaccard = len(kw_a & kw_b) / len(kw_union) if kw_union else 0.0
combined = 0.7 * seq_ratio + 0.3 * jaccard
if combined >= 0.60:
candidate_ids.add(fact_a["id"])
candidate_ids.add(fact_b["id"])
if len(candidate_ids) >= max_candidates:
break
if len(candidate_ids) >= max_candidates:
break
candidates = [f for f in all_facts if f["id"] in candidate_ids]
logger.info(
"Fuzzy-Vorfilter: %d/%d Fakten als Duplikat-Kandidaten identifiziert",
len(candidates), len(all_facts),
)
return candidates
async def check_fact_duplicates(db, incident_id: int, incident_title: str) -> int:
"""Prueft auf semantische Faktencheck-Duplikate via Haiku.
1. Fuzzy-Vorfilter reduziert auf relevante Kandidaten
2. Haiku clustert semantische Duplikate
3. Pro Cluster: behalte besten Fakt, loesche Rest
Returns: Anzahl entfernter Duplikate.
"""
cursor = await db.execute(
"SELECT id, claim, status, sources_count, evidence, checked_at "
"FROM fact_checks WHERE incident_id = ? ORDER BY checked_at DESC",
(incident_id,),
)
all_facts = [dict(row) for row in await cursor.fetchall()]
if len(all_facts) < 2:
return 0
# Schritt 1: Fuzzy-Vorfilter
candidates = _fuzzy_prefilter(all_facts)
if len(candidates) < 2:
return 0
# Schritt 2: Haiku-Clustering (in Batches von max 80)
all_clusters = []
batch_size = 80
for i in range(0, len(candidates), batch_size):
batch = candidates[i:i + batch_size]
clusters = await _haiku_find_duplicate_clusters(batch, incident_title)
all_clusters.extend(clusters)
if not all_clusters:
logger.info("QC Fakten: Haiku fand keine Duplikate")
return 0
# Schritt 3: Pro Cluster besten behalten, Rest loeschen
facts_by_id = {f["id"]: f for f in all_facts}
ids_to_delete = set()
for cluster_ids in all_clusters:
valid_ids = [cid for cid in cluster_ids if cid in facts_by_id]
if len(valid_ids) <= 1:
continue
cluster_facts = [facts_by_id[cid] for cid in valid_ids]
best = max(cluster_facts, key=lambda f: (
STATUS_PRIORITY.get(f["status"], 0),
f.get("sources_count", 0),
f.get("checked_at", ""),
))
for fact in cluster_facts:
if fact["id"] != best["id"]:
ids_to_delete.add(fact["id"])
logger.info(
"QC Duplikat: ID %d entfernt, behalte ID %d ('%s')",
fact["id"], best["id"], best["claim"][:60],
)
if ids_to_delete:
placeholders = ",".join("?" * len(ids_to_delete))
await db.execute(
f"DELETE FROM fact_checks WHERE id IN ({placeholders})",
list(ids_to_delete),
)
logger.info(
"QC: %d Faktencheck-Duplikate entfernt fuer Incident %d",
len(ids_to_delete), incident_id,
)
return len(ids_to_delete)
# ---------------------------------------------------------------------------
# 2. Karten-Location-Kategorien
# ---------------------------------------------------------------------------
_LOCATION_PROMPT = """\
Du bist ein Geopolitik-Experte fuer einen OSINT-Monitor.
LAGE: {incident_title}
BESCHREIBUNG: {incident_desc}
Unten stehen Orte, die auf der Karte als "target" (Angriffsziel) markiert sind.
Pruefe fuer jeden Ort, ob die Kategorie "target" korrekt ist.
KATEGORIEN:
- target: Ort wurde tatsaechlich militaerisch angegriffen oder bombardiert
- actor: Ort gehoert zu einer Konfliktpartei (z.B. Hauptstadt des Angreifers)
- response: Ort reagiert auf den Konflikt (z.B. diplomatische Reaktion, Sanktionen)
- mentioned: Ort wird nur im Kontext erwaehnt (z.B. wirtschaftliche Auswirkungen)
REGELN:
- Nur Orte die TATSAECHLICH physisch angegriffen/bombardiert wurden = "target"
- Hauptstaedte von Angreiferlaendern (z.B. Washington DC) = "actor"
- Laender die nur wirtschaftlich betroffen sind (z.B. steigende Oelpreise) = "mentioned"
- Laender die diplomatisch reagieren = "response"
- Im Zweifel: "mentioned"
Antworte als JSON-Array mit Korrekturen. Nur Eintraege die GEAENDERT werden muessen:
[{{"id": 123, "category": "mentioned"}}, {{"id": 456, "category": "actor"}}]
Wenn alle Kategorien korrekt sind: antworte mit []
ORTE (aktuell alle als "target" markiert):
{locations_text}"""
async def check_location_categories(
db, incident_id: int, incident_title: str, incident_desc: str
) -> int:
"""Prueft Karten-Location-Kategorien via Haiku.
Returns: Anzahl korrigierter Eintraege.
"""
cursor = await db.execute(
"SELECT id, location_name, latitude, longitude, category "
"FROM article_locations WHERE incident_id = ? AND category = 'target'",
(incident_id,),
)
targets = [dict(row) for row in await cursor.fetchall()]
if not targets:
return 0
# Dedupliziere nach location_name fuer den Prompt (spart Tokens)
unique_names = {}
ids_by_name = {}
for loc in targets:
name = loc["location_name"]
if name not in unique_names:
unique_names[name] = loc
ids_by_name[name] = []
ids_by_name[name].append(loc["id"])
locations_text = "\n".join(
f'ID={loc["id"]} | {loc["location_name"]} ({loc["latitude"]:.2f}, {loc["longitude"]:.2f})'
for loc in unique_names.values()
)
prompt = _LOCATION_PROMPT.format(
incident_title=incident_title,
incident_desc=incident_desc[:500] if incident_desc else "(keine Beschreibung)",
locations_text=locations_text,
)
fixes = []
try:
result, _usage = await call_claude(prompt, tools=None, model=CLAUDE_MODEL_FAST)
data = json.loads(result)
if isinstance(data, list):
fixes = data
except json.JSONDecodeError:
match = re.search(r'\[.*\]', result, re.DOTALL)
if match:
try:
data = json.loads(match.group())
if isinstance(data, list):
fixes = data
except json.JSONDecodeError:
pass
except Exception as e:
logger.warning("Haiku Location-Check fehlgeschlagen: %s", e)
return 0
if not fixes:
logger.info("QC Locations: Haiku fand keine falschen Kategorien")
return 0
# Korrekturen anwenden (auch auf alle IDs mit gleichem Namen)
total_fixed = 0
representative_ids = {loc["id"]: name for name, loc in unique_names.items()}
for fix in fixes:
fix_id = fix.get("id")
new_cat = fix.get("category")
if not fix_id or not new_cat:
continue
if new_cat not in ("target", "actor", "response", "mentioned"):
continue
# Finde den location_name fuer diese ID
loc_name = representative_ids.get(fix_id)
if not loc_name:
continue
# Korrigiere ALLE Eintraege mit diesem Namen
all_ids = ids_by_name.get(loc_name, [fix_id])
placeholders = ",".join("?" * len(all_ids))
await db.execute(
f"UPDATE article_locations SET category = ? "
f"WHERE id IN ({placeholders}) AND category = 'target'",
[new_cat] + all_ids,
)
total_fixed += len(all_ids)
logger.info(
"QC Location: '%s' (%d Eintraege): target -> %s",
loc_name, len(all_ids), new_cat,
)
if total_fixed > 0:
logger.info(
"QC: %d Karten-Location-Kategorien korrigiert fuer Incident %d",
total_fixed, incident_id,
)
return total_fixed
# ---------------------------------------------------------------------------
# 3. Hauptfunktion
# ---------------------------------------------------------------------------
async def run_post_refresh_qc(db, incident_id: int) -> dict:
"""Fuehrt den kompletten Post-Refresh Quality Check via Haiku durch.
Returns: Dict mit Ergebnissen {facts_removed, locations_fixed}.
"""
try:
# Lage-Titel und Beschreibung laden
cursor = await db.execute(
"SELECT title, description FROM incidents WHERE id = ?",
(incident_id,),
)
row = await cursor.fetchone()
if not row:
return {"facts_removed": 0, "locations_fixed": 0}
incident_title = row["title"] or ""
incident_desc = row["description"] or ""
facts_removed = await check_fact_duplicates(db, incident_id, incident_title)
locations_fixed = await check_location_categories(
db, incident_id, incident_title, incident_desc
)
if facts_removed > 0 or locations_fixed > 0:
await db.commit()
logger.info(
"Post-Refresh QC fuer Incident %d: %d Duplikate entfernt, %d Locations korrigiert",
incident_id, facts_removed, locations_fixed,
)
return {"facts_removed": facts_removed, "locations_fixed": locations_fixed}
except Exception as e:
logger.error(
"Post-Refresh QC Fehler fuer Incident %d: %s",
incident_id, e, exc_info=True,
)
return {"facts_removed": 0, "locations_fixed": 0, "error": str(e)}
"""Post-Refresh Quality Check via Haiku.
Prueft nach jedem Refresh:
1. Semantische Faktencheck-Duplikate (Haiku-Clustering mit Fuzzy-Vorfilter)
2. Falsch kategorisierte Karten-Locations (Haiku bewertet Kontext der Lage)
Regelbasierte Listen dienen als Fallback falls Haiku fehlschlaegt.
"""
import json
import logging
import re
from difflib import SequenceMatcher
from agents.claude_client import call_claude
from config import CLAUDE_MODEL_FAST
logger = logging.getLogger("osint.post_refresh_qc")
STATUS_PRIORITY = {
"confirmed": 5, "established": 5,
"contradicted": 4, "disputed": 4,
"unconfirmed": 3, "unverified": 3,
"developing": 1,
}
# ---------------------------------------------------------------------------
# 1. Faktencheck-Duplikate
# ---------------------------------------------------------------------------
_DEDUP_PROMPT = """\
Du bist ein Deduplizierungs-Agent fuer Faktenchecks eines OSINT-Monitors.
LAGE: {incident_title}
Unten stehen Faktenchecks (ID + Status + Claim). Finde Gruppen von Fakten,
die INHALTLICH DASSELBE aussagen, auch wenn sie unterschiedlich formuliert sind.
REGELN:
- Gleicher Sachverhalt = gleiche Gruppe
(z.B. "Trump fordert Kapitulation" und "US-Praesident verlangt bedingungslose Aufgabe")
- Unterschiedliche Detailtiefe zum SELBEN Fakt = gleiche Gruppe
- VERSCHIEDENE Sachverhalte = VERSCHIEDENE Gruppen
(z.B. "Angriff auf Isfahan" vs "Angriff auf Teheran" sind NICHT dasselbe)
- Eine Gruppe muss mindestens 2 Eintraege haben
Antworte NUR als JSON-Array von Gruppen. Jede Gruppe ist ein Array von IDs:
[[1,5,12], [3,8]]
Wenn keine Duplikate: antworte mit []
FAKTEN:
{facts_text}"""
async def _haiku_find_duplicate_clusters(
facts: list[dict], incident_title: str
) -> list[list[int]]:
"""Fragt Haiku welche Fakten semantische Duplikate sind."""
facts_text = "\n".join(
f'ID={f["id"]} [{f["status"]}]: {f["claim"]}'
for f in facts
)
prompt = _DEDUP_PROMPT.format(
incident_title=incident_title, facts_text=facts_text
)
try:
result, _usage = await call_claude(prompt, tools=None, model=CLAUDE_MODEL_FAST)
data = json.loads(result)
if isinstance(data, list) and all(isinstance(g, list) for g in data):
return data
except json.JSONDecodeError:
match = re.search(r'\[.*\]', result, re.DOTALL)
if match:
try:
data = json.loads(match.group())
if isinstance(data, list):
return data
except json.JSONDecodeError:
pass
except Exception as e:
logger.warning("Haiku Duplikat-Clustering fehlgeschlagen: %s", e)
return []
def _fuzzy_prefilter(all_facts: list[dict], max_candidates: int = 80) -> list[dict]:
"""Waehlt Kandidaten fuer Haiku-Check per Fuzzy-Vorfilter aus.
Findet Paare mit Aehnlichkeit >= 0.60 und gibt die betroffenen Fakten zurueck.
Begrenzt auf max_candidates um Haiku-Tokens zu sparen.
"""
from agents.factchecker import normalize_claim, _keyword_set
if len(all_facts) <= max_candidates:
return all_facts
normalized = []
for f in all_facts:
nc = normalize_claim(f["claim"])
kw = _keyword_set(f["claim"])
normalized.append((f, nc, kw))
candidate_ids = set()
recent = normalized[:60]
for i, (fact_a, norm_a, kw_a) in enumerate(recent):
for j, (fact_b, norm_b, kw_b) in enumerate(normalized):
if i >= j or fact_b["id"] == fact_a["id"]:
continue
if not norm_a or not norm_b:
continue
len_ratio = len(norm_a) / len(norm_b) if norm_b else 0
if len_ratio > 2.5 or len_ratio < 0.4:
continue
seq_ratio = SequenceMatcher(None, norm_a, norm_b).ratio()
kw_union = kw_a | kw_b
jaccard = len(kw_a & kw_b) / len(kw_union) if kw_union else 0.0
combined = 0.7 * seq_ratio + 0.3 * jaccard
if combined >= 0.60:
candidate_ids.add(fact_a["id"])
candidate_ids.add(fact_b["id"])
if len(candidate_ids) >= max_candidates:
break
if len(candidate_ids) >= max_candidates:
break
candidates = [f for f in all_facts if f["id"] in candidate_ids]
logger.info(
"Fuzzy-Vorfilter: %d/%d Fakten als Duplikat-Kandidaten identifiziert",
len(candidates), len(all_facts),
)
return candidates
async def check_fact_duplicates(db, incident_id: int, incident_title: str) -> int:
"""Prueft auf semantische Faktencheck-Duplikate via Haiku.
1. Fuzzy-Vorfilter reduziert auf relevante Kandidaten
2. Haiku clustert semantische Duplikate
3. Pro Cluster: behalte besten Fakt, loesche Rest
Returns: Anzahl entfernter Duplikate.
"""
cursor = await db.execute(
"SELECT id, claim, status, sources_count, evidence, checked_at "
"FROM fact_checks WHERE incident_id = ? ORDER BY checked_at DESC",
(incident_id,),
)
all_facts = [dict(row) for row in await cursor.fetchall()]
if len(all_facts) < 2:
return 0
# Schritt 1: Fuzzy-Vorfilter
candidates = _fuzzy_prefilter(all_facts)
if len(candidates) < 2:
return 0
# Schritt 2: Haiku-Clustering (in Batches von max 80)
all_clusters = []
batch_size = 80
for i in range(0, len(candidates), batch_size):
batch = candidates[i:i + batch_size]
clusters = await _haiku_find_duplicate_clusters(batch, incident_title)
all_clusters.extend(clusters)
if not all_clusters:
logger.info("QC Fakten: Haiku fand keine Duplikate")
return 0
# Schritt 3: Pro Cluster besten behalten, Rest loeschen
facts_by_id = {f["id"]: f for f in all_facts}
ids_to_delete = set()
for cluster_ids in all_clusters:
valid_ids = [cid for cid in cluster_ids if cid in facts_by_id]
if len(valid_ids) <= 1:
continue
cluster_facts = [facts_by_id[cid] for cid in valid_ids]
best = max(cluster_facts, key=lambda f: (
STATUS_PRIORITY.get(f["status"], 0),
f.get("sources_count", 0),
f.get("checked_at", ""),
))
for fact in cluster_facts:
if fact["id"] != best["id"]:
ids_to_delete.add(fact["id"])
logger.info(
"QC Duplikat: ID %d entfernt, behalte ID %d ('%s')",
fact["id"], best["id"], best["claim"][:60],
)
if ids_to_delete:
placeholders = ",".join("?" * len(ids_to_delete))
await db.execute(
f"DELETE FROM fact_checks WHERE id IN ({placeholders})",
list(ids_to_delete),
)
logger.info(
"QC: %d Faktencheck-Duplikate entfernt fuer Incident %d",
len(ids_to_delete), incident_id,
)
return len(ids_to_delete)
# ---------------------------------------------------------------------------
# 2. Karten-Location-Kategorien
# ---------------------------------------------------------------------------
_LOCATION_PROMPT = """\
Du bist ein Geopolitik-Experte fuer einen OSINT-Monitor.
LAGE: {incident_title}
BESCHREIBUNG: {incident_desc}
{labels_context}
Unten stehen Orte, die auf der Karte als "primary" (Hauptgeschehen) markiert sind.
Pruefe fuer jeden Ort, ob die Kategorie "primary" korrekt ist.
KATEGORIEN:
- primary: {label_primary} — Wo das Hauptgeschehen stattfindet
- secondary: {label_secondary} — Direkte Reaktionen/Gegenmassnahmen
- tertiary: {label_tertiary} — Entscheidungstraeger/Beteiligte
- mentioned: {label_mentioned} — Nur erwaehnt
REGELN:
- Nur Orte die DIREKT vom Hauptgeschehen betroffen sind = "primary"
- Orte mit Reaktionen/Gegenmassnahmen = "secondary"
- Orte von Entscheidungstraegern (z.B. Hauptstaedte) = "tertiary"
- Nur erwaehnte Orte = "mentioned"
- Im Zweifel: "mentioned"
Antworte als JSON-Array mit Korrekturen. Nur Eintraege die GEAENDERT werden muessen:
[{{"id": 123, "category": "mentioned"}}, {{"id": 456, "category": "tertiary"}}]
Wenn alle Kategorien korrekt sind: antworte mit []
ORTE (aktuell alle als "primary" markiert):
{locations_text}"""
async def check_location_categories(
db, incident_id: int, incident_title: str, incident_desc: str
) -> int:
"""Prueft Karten-Location-Kategorien via Haiku.
Returns: Anzahl korrigierter Eintraege.
"""
cursor = await db.execute(
"SELECT id, location_name, latitude, longitude, category "
"FROM article_locations WHERE incident_id = ? AND category = 'primary'",
(incident_id,),
)
targets = [dict(row) for row in await cursor.fetchall()]
if not targets:
return 0
# Category-Labels aus DB laden (fuer kontextabhaengige Prompt-Beschreibungen)
cursor = await db.execute(
"SELECT category_labels FROM incidents WHERE id = ?", (incident_id,)
)
inc_row = await cursor.fetchone()
labels = {}
if inc_row and inc_row["category_labels"]:
try:
labels = json.loads(inc_row["category_labels"])
except (json.JSONDecodeError, TypeError):
pass
label_primary = labels.get("primary") or "Hauptgeschehen"
label_secondary = labels.get("secondary") or "Reaktionen"
label_tertiary = labels.get("tertiary") or "Beteiligte"
label_mentioned = labels.get("mentioned") or "Erwaehnt"
labels_context = ""
if labels:
labels_context = f"KATEGORIE-LABELS: primary={label_primary}, secondary={label_secondary}, tertiary={label_tertiary}, mentioned={label_mentioned}\n"
# Dedupliziere nach location_name fuer den Prompt (spart Tokens)
unique_names = {}
ids_by_name = {}
for loc in targets:
name = loc["location_name"]
if name not in unique_names:
unique_names[name] = loc
ids_by_name[name] = []
ids_by_name[name].append(loc["id"])
locations_text = "\n".join(
f'ID={loc["id"]} | {loc["location_name"]} ({loc["latitude"]:.2f}, {loc["longitude"]:.2f})'
for loc in unique_names.values()
)
prompt = _LOCATION_PROMPT.format(
incident_title=incident_title,
incident_desc=incident_desc[:500] if incident_desc else "(keine Beschreibung)",
labels_context=labels_context,
label_primary=label_primary,
label_secondary=label_secondary,
label_tertiary=label_tertiary,
label_mentioned=label_mentioned,
locations_text=locations_text,
)
fixes = []
try:
result, _usage = await call_claude(prompt, tools=None, model=CLAUDE_MODEL_FAST)
data = json.loads(result)
if isinstance(data, list):
fixes = data
except json.JSONDecodeError:
match = re.search(r'\[.*\]', result, re.DOTALL)
if match:
try:
data = json.loads(match.group())
if isinstance(data, list):
fixes = data
except json.JSONDecodeError:
pass
except Exception as e:
logger.warning("Haiku Location-Check fehlgeschlagen: %s", e)
return 0
if not fixes:
logger.info("QC Locations: Haiku fand keine falschen Kategorien")
return 0
# Korrekturen anwenden (auch auf alle IDs mit gleichem Namen)
total_fixed = 0
representative_ids = {loc["id"]: name for name, loc in unique_names.items()}
for fix in fixes:
fix_id = fix.get("id")
new_cat = fix.get("category")
if not fix_id or not new_cat:
continue
if new_cat not in ("primary", "secondary", "tertiary", "mentioned"):
continue
# Finde den location_name fuer diese ID
loc_name = representative_ids.get(fix_id)
if not loc_name:
continue
# Korrigiere ALLE Eintraege mit diesem Namen
all_ids = ids_by_name.get(loc_name, [fix_id])
placeholders = ",".join("?" * len(all_ids))
await db.execute(
f"UPDATE article_locations SET category = ? "
f"WHERE id IN ({placeholders}) AND category = 'primary'",
[new_cat] + all_ids,
)
total_fixed += len(all_ids)
logger.info(
"QC Location: '%s' (%d Eintraege): primary -> %s",
loc_name, len(all_ids), new_cat,
)
if total_fixed > 0:
logger.info(
"QC: %d Karten-Location-Kategorien korrigiert fuer Incident %d",
total_fixed, incident_id,
)
return total_fixed
# ---------------------------------------------------------------------------
# Hauptfunktion
# ---------------------------------------------------------------------------
async def run_post_refresh_qc(db, incident_id: int) -> dict:
"""Fuehrt den kompletten Post-Refresh Quality Check via Haiku durch.
Returns: Dict mit Ergebnissen {facts_removed, locations_fixed}.
"""
try:
# Lage-Titel und Beschreibung laden
cursor = await db.execute(
"SELECT title, description FROM incidents WHERE id = ?",
(incident_id,),
)
row = await cursor.fetchone()
if not row:
return {"facts_removed": 0, "locations_fixed": 0}
incident_title = row["title"] or ""
incident_desc = row["description"] or ""
facts_removed = await check_fact_duplicates(db, incident_id, incident_title)
locations_fixed = await check_location_categories(
db, incident_id, incident_title, incident_desc
)
if facts_removed > 0 or locations_fixed > 0:
await db.commit()
logger.info(
"Post-Refresh QC fuer Incident %d: %d Duplikate entfernt, %d Locations korrigiert",
incident_id, facts_removed, locations_fixed,
)
return {"facts_removed": facts_removed, "locations_fixed": locations_fixed}
except Exception as e:
logger.error(
"Post-Refresh QC Fehler fuer Incident %d: %s",
incident_id, e, exc_info=True,
)
return {"facts_removed": 0, "locations_fixed": 0, "error": str(e)}

View file

@@ -698,7 +698,7 @@ const App = {
async loadIncidentDetail(id) {
try {
-const [incident, articles, factchecks, snapshots, locations] = await Promise.all([
+const [incident, articles, factchecks, snapshots, locationsResponse] = await Promise.all([
API.getIncident(id),
API.getArticles(id),
API.getFactChecks(id),
@@ -706,14 +706,27 @@ const App = {
API.getLocations(id).catch(() => []),
]);
-this.renderIncidentDetail(incident, articles, factchecks, snapshots, locations);
+// Locations-API gibt jetzt {category_labels, locations} oder Array (Rückwärtskompatibel)
+let locations, categoryLabels;
+if (Array.isArray(locationsResponse)) {
+locations = locationsResponse;
+categoryLabels = null;
+} else if (locationsResponse && locationsResponse.locations) {
+locations = locationsResponse.locations;
+categoryLabels = locationsResponse.category_labels || null;
+} else {
+locations = [];
+categoryLabels = null;
+}
+this.renderIncidentDetail(incident, articles, factchecks, snapshots, locations, categoryLabels);
} catch (err) {
console.error('loadIncidentDetail Fehler:', err);
UI.showToast('Fehler beim Laden: ' + err.message, 'error');
}
},
-renderIncidentDetail(incident, articles, factchecks, snapshots, locations) {
+renderIncidentDetail(incident, articles, factchecks, snapshots, locations, categoryLabels) {
// Header Strip
{ const _e = document.getElementById('incident-title'); if (_e) _e.textContent = incident.title; }
{ const _e = document.getElementById('incident-description'); if (_e) _e.textContent = incident.description || ''; }
@@ -845,7 +858,7 @@ const App = {
this._resizeTimelineTile();
// Karte rendern
-UI.renderMap(locations || []);
+UI.renderMap(locations || [], categoryLabels);
},
_collectEntries(filterType, searchTerm, range) {
@@ -1617,8 +1630,12 @@ const App = {
if (btn) { btn.disabled = false; btn.textContent = 'Orte erkennen'; }
if (st.status === 'done' && st.locations > 0) {
UI.showToast(`${st.locations} Orte aus ${st.processed} Artikeln erkannt`, 'success');
-const locations = await API.getLocations(incidentId).catch(() => []);
-UI.renderMap(locations);
+const locResp = await API.getLocations(incidentId).catch(() => []);
+let locs, catLabels;
+if (Array.isArray(locResp)) { locs = locResp; catLabels = null; }
+else if (locResp && locResp.locations) { locs = locResp.locations; catLabels = locResp.category_labels || null; }
+else { locs = []; catLabels = null; }
+UI.renderMap(locs, catLabels);
} else if (st.status === 'done') {
UI.showToast('Keine neuen Orte gefunden', 'info');
} else if (st.status === 'error') {

View file

@@ -639,30 +639,29 @@ const UI = {
_initMarkerIcons() {
if (this._markerIcons || typeof L === 'undefined') return;
this._markerIcons = {
-target: this._createSvgIcon('#dc3545', '#a71d2a'),
-retaliation: this._createSvgIcon('#f39c12', '#c47d0a'),
-response: this._createSvgIcon('#f39c12', '#c47d0a'),
-actor: this._createSvgIcon('#2a81cb', '#1a5c8f'),
+primary: this._createSvgIcon('#dc3545', '#a71d2a'),
+secondary: this._createSvgIcon('#f39c12', '#c47d0a'),
+tertiary: this._createSvgIcon('#2a81cb', '#1a5c8f'),
mentioned: this._createSvgIcon('#7b7b7b', '#555555'),
};
},
-_categoryLabels: {
-target: 'Angegriffene Ziele',
-retaliation: 'Vergeltung / Eskalation',
-response: 'Reaktion / Gegenmassnahmen',
-actor: 'Strategische Akteure',
+_defaultCategoryLabels: {
+primary: 'Hauptgeschehen',
+secondary: 'Reaktionen',
+tertiary: 'Beteiligte',
mentioned: 'Erwaehnt',
},
_categoryColors: {
-target: '#cb2b3e',
-retaliation: '#f39c12',
-response: '#f39c12',
-actor: '#2a81cb',
+primary: '#cb2b3e',
+secondary: '#f39c12',
+tertiary: '#2a81cb',
mentioned: '#7b7b7b',
},
-renderMap(locations) {
+_activeCategoryLabels: null,
+renderMap(locations, categoryLabels) {
const container = document.getElementById('map-container');
const emptyEl = document.getElementById('map-empty');
const statsEl = document.getElementById('map-stats');
@@ -741,6 +740,9 @@ const UI = {
// Marker hinzufuegen
const bounds = [];
this._initMarkerIcons();
// Dynamische Labels verwenden (API > Default)
const catLabels = categoryLabels || this._activeCategoryLabels || this._defaultCategoryLabels;
this._activeCategoryLabels = catLabels;
const usedCategories = new Set();
locations.forEach(loc => {
@@ -751,7 +753,7 @@ const UI = {
const marker = L.marker([loc.lat, loc.lon], markerOpts);
// Popup-Inhalt
-const catLabel = this._categoryLabels[cat] || cat;
+const catLabel = catLabels[cat] || this._defaultCategoryLabels[cat] || cat;
const catColor = this._categoryColors[cat] || '#7b7b7b';
let popupHtml = `<div class="map-popup">`;
popupHtml += `<div class="map-popup-title">${this.escape(loc.location_name)}`;
@@ -798,12 +800,13 @@ const UI = {
const legend = L.control({ position: 'bottomright' });
const self2 = this;
const legendLabels = catLabels;
legend.onAdd = function() {
const div = L.DomUtil.create('div', 'map-legend-ctrl');
let html = '<strong style="display:block;margin-bottom:6px;">Legende</strong>';
-['target', 'retaliation', 'response', 'actor', 'mentioned'].forEach(cat => {
-if (usedCategories.has(cat)) {
-html += `<div style="display:flex;align-items:center;gap:6px;margin:3px 0;"><span style="width:10px;height:10px;border-radius:50%;background:${self2._categoryColors[cat]};flex-shrink:0;"></span><span>${self2._categoryLabels[cat]}</span></div>`;
+['primary', 'secondary', 'tertiary', 'mentioned'].forEach(cat => {
+if (usedCategories.has(cat) && legendLabels[cat]) {
+html += `<div style="display:flex;align-items:center;gap:6px;margin:3px 0;"><span style="width:10px;height:10px;border-radius:50%;background:${self2._categoryColors[cat]};flex-shrink:0;"></span><span>${legendLabels[cat]}</span></div>`;
}
});
div.innerHTML = html;
@@ -853,7 +856,7 @@ const UI = {
if (this._pendingLocations && typeof L !== 'undefined') {
const locs = this._pendingLocations;
this._pendingLocations = null;
-this.renderMap(locs);
+this.renderMap(locs, this._activeCategoryLabels);
}
},