Block B: ClaudeCliError + differenzierte HTTP-Status + Rate-Limit-Retry
- Neue Exception-Klasse ClaudeCliError(error_type, message) in claude_client.py mit den Kategorien rate_limit / auth_error / timeout / cli_error.
- _classify_cli_error() als geteilter Klassifikator (Keywords für Rate-Limit- und Auth-Fehler wie "does not have access", "login again").
- call_claude() erkennt jetzt auch is_error=true im JSON bei returncode=0 (Hauptursache des Ausfalls vom 22.04.: Die CLI liefert "Your organization does not have access" mit is_error=true statt eines Fehler-Exit-Codes).
- Orchestrator: ClaudeCliError mit rate_limit/timeout wird als transient behandelt (bis zu 3 Versuche mit Backoff 0s/120s/300s, d. h. maximal 2 Retries). auth_error/cli_error brechen sofort ohne Retry ab. Behebt den bestehenden Bug, dass Rate-Limit-Fehler gar nicht wiederholt wurden.
- routers/incidents.py, Enhance-Endpoint: ClaudeCliError wird auf 503 (auth_error) / 429 (rate_limit) gemappt, TimeoutError auf 504.
- routers/chat.py, _call_claude_chat(): wirft jetzt ClaudeCliError statt eines generischen RuntimeError. Der Chat-Endpoint mappt auth_error auf 503.
- Frontend: neue ApiError-Klasse in api.js mit status+detail. generateDescription() in app.js zeigt differenzierte Toasts nach HTTP-Status (503/429/504/403).
- dashboard.html: Cache-Bust für api.js + app.js auf v=20260423a
Dieser Commit ist enthalten in:
@@ -527,8 +527,12 @@ class AgentOrchestrator:
|
||||
|
||||
RETRY_DELAYS = [0, 120, 300] # Sekunden: sofort, 2min, 5min
|
||||
TRANSIENT_ERRORS = (asyncio.TimeoutError, TimeoutError, ConnectionError, OSError)
|
||||
from agents.claude_client import ClaudeCliError
|
||||
last_error = None
|
||||
|
||||
def _is_transient_cli(err: Exception) -> bool:
|
||||
return isinstance(err, ClaudeCliError) and err.error_type in ("rate_limit", "timeout")
|
||||
|
||||
try:
|
||||
# Research-Lagen: Automatisch 3 Durchläufe nur beim ersten Refresh
|
||||
incident_type, has_summary = await self._get_incident_info(incident_id)
|
||||
@@ -557,32 +561,44 @@ class AgentOrchestrator:
|
||||
}, _vis, _cb, _tid)
|
||||
last_error = None
|
||||
break
|
||||
except TRANSIENT_ERRORS as e:
|
||||
last_error = e
|
||||
logger.warning(f"Transienter Fehler bei Lage {incident_id} (Versuch {attempt + 1}/3): {e}")
|
||||
if attempt < 2:
|
||||
await self._mark_refresh_failed(incident_id, str(e))
|
||||
delay = RETRY_DELAYS[attempt + 1]
|
||||
logger.info(f"Retry in {delay}s für Lage {incident_id}")
|
||||
# Retry-Status per WebSocket senden
|
||||
if self._ws_manager:
|
||||
try:
|
||||
_vis, _cb, _tid = await self._get_incident_visibility(incident_id)
|
||||
except Exception:
|
||||
_vis, _cb, _tid = "public", None, None
|
||||
await self._ws_manager.broadcast_for_incident({
|
||||
"type": "status_update",
|
||||
"incident_id": incident_id,
|
||||
"data": {"status": "retrying", "attempt": attempt + 1, "delay": delay},
|
||||
}, _vis, _cb, _tid)
|
||||
await asyncio.sleep(delay)
|
||||
else:
|
||||
await self._mark_refresh_failed(incident_id, f"Endgültig fehlgeschlagen nach 3 Versuchen: {e}")
|
||||
except Exception as e:
|
||||
# Auth/CLI-Fehler: sofort abbrechen, kein Retry sinnvoll
|
||||
if isinstance(e, ClaudeCliError) and e.error_type in ("auth_error", "cli_error"):
|
||||
last_error = e
|
||||
logger.error(f"Permanenter Claude-Fehler [{e.error_type}] bei Lage {incident_id}: {e}")
|
||||
await self._mark_refresh_failed(incident_id, str(e))
|
||||
break
|
||||
|
||||
# Transiente Fehler: Retry bis 3x
|
||||
if isinstance(e, TRANSIENT_ERRORS) or _is_transient_cli(e):
|
||||
last_error = e
|
||||
kind = e.error_type if isinstance(e, ClaudeCliError) else type(e).__name__
|
||||
logger.warning(f"Transienter Fehler [{kind}] bei Lage {incident_id} (Versuch {attempt + 1}/3): {e}")
|
||||
if attempt < 2:
|
||||
await self._mark_refresh_failed(incident_id, str(e))
|
||||
delay = RETRY_DELAYS[attempt + 1]
|
||||
logger.info(f"Retry in {delay}s für Lage {incident_id}")
|
||||
if self._ws_manager:
|
||||
try:
|
||||
_vis, _cb, _tid = await self._get_incident_visibility(incident_id)
|
||||
except Exception:
|
||||
_vis, _cb, _tid = "public", None, None
|
||||
await self._ws_manager.broadcast_for_incident({
|
||||
"type": "status_update",
|
||||
"incident_id": incident_id,
|
||||
"data": {"status": "retrying", "attempt": attempt + 1, "delay": delay},
|
||||
}, _vis, _cb, _tid)
|
||||
await asyncio.sleep(delay)
|
||||
continue
|
||||
else:
|
||||
await self._mark_refresh_failed(incident_id, f"Endgültig fehlgeschlagen nach 3 Versuchen: {e}")
|
||||
break
|
||||
|
||||
# Alles andere: permanent
|
||||
last_error = e
|
||||
logger.error(f"Permanenter Fehler bei Refresh für Lage {incident_id}: {e}")
|
||||
await self._mark_refresh_failed(incident_id, str(e))
|
||||
break # Permanenter Fehler, kein Retry
|
||||
break
|
||||
|
||||
if last_error and self._ws_manager:
|
||||
try:
|
||||
|
||||
In neuem Issue referenzieren
Einen Benutzer sperren