Locations: Aggregation in SQL (GROUP BY + Window)

Replaces the raw JOIN across article_locations x articles (21,814 rows and
11 MB of payload for the Iran incident) with three small aggregated queries
(sketched below):
  1. Places via GROUP BY (name, lat, lon), which directly yields the result set.
  2. Categories per place via GROUP BY, used to pick the dominant category.
  3. Sample articles (at most 10 per place) via ROW_NUMBER() OVER (PARTITION BY ...).
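
The GROUP BY plus window-function pattern is the core of the change, so here is a
minimal, runnable sketch of the two key query shapes (the per-place aggregate and the
ROW_NUMBER() sampling) against an in-memory SQLite toy schema. Table and column names
mirror the real ones in the diff below, but the schema is trimmed and the rows are
invented purely for illustration:

    import sqlite3

    con = sqlite3.connect(":memory:")
    con.executescript("""
        CREATE TABLE articles (id INTEGER PRIMARY KEY, headline TEXT, collected_at TEXT);
        CREATE TABLE article_locations (
            article_id INTEGER, incident_id INTEGER,
            location_name TEXT, latitude REAL, longitude REAL, category TEXT
        );
        INSERT INTO articles VALUES
            (1, 'A', '2026-04-01'), (2, 'B', '2026-04-02'), (3, 'C', '2026-04-03');
        INSERT INTO article_locations VALUES
            (1, 7, 'Tehran', 35.6892, 51.389, 'primary'),
            (2, 7, 'Tehran', 35.6892, 51.389, 'mentioned'),
            (3, 7, 'Isfahan', 32.6546, 51.668, 'secondary');
    """)

    # Query 1: one aggregate row per place instead of one row per (article, place) pair.
    for row in con.execute("""
        SELECT location_name AS name, ROUND(latitude, 2) AS lat, ROUND(longitude, 2) AS lon,
               COUNT(*) AS article_count
        FROM article_locations WHERE incident_id = ?
        GROUP BY name, lat, lon ORDER BY article_count DESC
    """, (7,)):
        print(row)  # e.g. ('Tehran', 35.69, 51.39, 2)

    # Query 3: at most N sample articles per place via ROW_NUMBER() (needs SQLite >= 3.25).
    for row in con.execute("""
        SELECT name, headline FROM (
            SELECT al.location_name AS name, a.headline,
                   ROW_NUMBER() OVER (
                       PARTITION BY al.location_name, ROUND(al.latitude, 2), ROUND(al.longitude, 2)
                       ORDER BY a.collected_at DESC
                   ) AS rn
            FROM article_locations al JOIN articles a ON a.id = al.article_id
            WHERE al.incident_id = ?
        ) WHERE rn <= 10
    """, (7,)):
        print(row)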

The response shape is unchanged ({category_labels, locations: [...]}), so no
frontend change is needed. The priority order primary > secondary > tertiary >
mentioned is preserved.
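
For reference, a single location entry in that unchanged response roughly looks like
the sketch below. Field names come from the handler in the diff; the values are
invented, and the contents of category_labels (loaded from the incident further down)
are omitted:

    # Illustrative shape of the unchanged response (invented values).
    example_response = {
        "category_labels": {},  # loaded from the incident, structure unchanged
        "locations": [
            {
                "location_name": "Tehran",
                "lat": 35.69,
                "lon": 51.39,
                "country_code": "IR",
                "confidence": 0.9,
                "article_count": 4321,
                "articles": [  # at most 10 sample articles, newest first
                    {"id": 123, "headline": "...", "source": "...", "source_url": "https://example.org/..."},
                ],
                "category": "primary",  # dominant category by the priority order above
            },
        ],
    }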

Expected effect: the Iran locations payload drops from 11 MB to under 500 KB, and
query time drops further because the 21k-row JOIN no longer has to be materialized.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

@@ -543,60 +543,100 @@ async def get_locations(
     current_user: dict = Depends(get_current_user),
     db: aiosqlite.Connection = Depends(db_dependency),
 ):
-    """Return the geographic locations of an incident (aggregated per place)."""
+    """Return the geographic locations of an incident (aggregated server-side per place).
+
+    Three separate queries (all small) instead of one 21k-row JOIN:
+      1. Place aggregates via GROUP BY (name, lat, lon), which directly yields the result set.
+      2. Categories per place via GROUP BY (name, lat, lon, category), used for the dominant category.
+      3. Sample articles per place via ROW_NUMBER(), at most 10 per place.
+    """
     tenant_id = current_user.get("tenant_id")
     await _check_incident_access(db, incident_id, current_user["id"], tenant_id)
+
+    # 1. Place aggregates
     cursor = await db.execute(
-        """SELECT al.location_name, al.location_name_normalized, al.country_code,
-                  al.latitude, al.longitude, al.confidence, al.category,
-                  a.id as article_id, a.headline, a.headline_de, a.source, a.source_url
-           FROM article_locations al
-           JOIN articles a ON a.id = al.article_id
-           WHERE al.incident_id = ?
-           ORDER BY al.location_name_normalized, a.collected_at DESC""",
+        """SELECT
+               COALESCE(location_name_normalized, location_name) AS name,
+               ROUND(latitude, 2) AS lat,
+               ROUND(longitude, 2) AS lon,
+               MIN(country_code) AS country_code,
+               MAX(confidence) AS confidence,
+               COUNT(*) AS article_count
+           FROM article_locations
+           WHERE incident_id = ?
+           GROUP BY name, lat, lon
+           ORDER BY article_count DESC""",
         (incident_id,),
     )
-    rows = await cursor.fetchall()
-
-    # Aggregate by normalized place name + coordinates
-    loc_map = {}
-    for row in rows:
-        row = dict(row)
-        key = (row["location_name_normalized"] or row["location_name"], round(row["latitude"], 2), round(row["longitude"], 2))
-        if key not in loc_map:
-            loc_map[key] = {
-                "location_name": row["location_name_normalized"] or row["location_name"],
-                "lat": row["latitude"],
-                "lon": row["longitude"],
-                "country_code": row["country_code"],
-                "confidence": row["confidence"],
-                "article_count": 0,
-                "articles": [],
-                "categories": {},
-            }
-        loc_map[key]["article_count"] += 1
-        cat = row["category"] or "mentioned"
-        loc_map[key]["categories"][cat] = loc_map[key]["categories"].get(cat, 0) + 1
-        # Include at most 10 articles per place
-        if len(loc_map[key]["articles"]) < 10:
-            loc_map[key]["articles"].append({
-                "id": row["article_id"],
-                "headline": row["headline_de"] or row["headline"],
-                "source": row["source"],
-                "source_url": row["source_url"],
-            })
-
-    # Determine the dominant category per place (priority: primary > secondary > tertiary > mentioned)
+    loc_rows = [dict(r) for r in await cursor.fetchall()]
+
+    # 2. Categories per place
+    cursor = await db.execute(
+        """SELECT
+               COALESCE(location_name_normalized, location_name) AS name,
+               ROUND(latitude, 2) AS lat,
+               ROUND(longitude, 2) AS lon,
+               COALESCE(category, 'mentioned') AS category,
+               COUNT(*) AS cnt
+           FROM article_locations
+           WHERE incident_id = ?
+           GROUP BY name, lat, lon, category""",
+        (incident_id,),
+    )
+    cat_map: dict[tuple, dict[str, int]] = {}
+    for r in await cursor.fetchall():
+        key = (r["name"], r["lat"], r["lon"])
+        cat_map.setdefault(key, {})[r["category"]] = r["cnt"]
+
+    # 3. Sample articles per place (at most 10, newest first)
+    cursor = await db.execute(
+        """SELECT name, lat, lon, article_id, headline, headline_de, source, source_url
+           FROM (
+               SELECT
+                   COALESCE(al.location_name_normalized, al.location_name) AS name,
+                   ROUND(al.latitude, 2) AS lat,
+                   ROUND(al.longitude, 2) AS lon,
+                   a.id AS article_id,
+                   a.headline, a.headline_de, a.source, a.source_url,
+                   ROW_NUMBER() OVER (
+                       PARTITION BY COALESCE(al.location_name_normalized, al.location_name),
+                                    ROUND(al.latitude, 2), ROUND(al.longitude, 2)
+                       ORDER BY a.collected_at DESC
+                   ) AS rn
+               FROM article_locations al
+               JOIN articles a ON a.id = al.article_id
+               WHERE al.incident_id = ?
+           )
+           WHERE rn <= 10""",
+        (incident_id,),
+    )
+    sample_map: dict[tuple, list[dict]] = {}
+    for r in await cursor.fetchall():
+        key = (r["name"], r["lat"], r["lon"])
+        sample_map.setdefault(key, []).append({
+            "id": r["article_id"],
+            "headline": r["headline_de"] or r["headline"],
+            "source": r["source"],
+            "source_url": r["source_url"],
+        })
+
+    # Assemble the result
     priority = {"primary": 4, "secondary": 3, "tertiary": 2, "mentioned": 1}
     result = []
-    for loc in loc_map.values():
-        cats = loc.pop("categories")
-        if cats:
-            best_cat = max(cats, key=lambda c: (priority.get(c, 0), cats[c]))
-        else:
-            best_cat = "mentioned"
-        loc["category"] = best_cat
-        result.append(loc)
+    for loc in loc_rows:
+        key = (loc["name"], loc["lat"], loc["lon"])
+        cats = cat_map.get(key, {})
+        best_cat = max(cats, key=lambda c: (priority.get(c, 0), cats[c])) if cats else "mentioned"
+        result.append({
+            "location_name": loc["name"],
+            "lat": loc["lat"],
+            "lon": loc["lon"],
+            "country_code": loc["country_code"],
+            "confidence": loc["confidence"],
+            "article_count": loc["article_count"],
+            "articles": sample_map.get(key, []),
+            "category": best_cat,
+        })
+
     # Load category labels from the incident
     cursor = await db.execute(