Block B: ClaudeCliError + differenzierte HTTP-Status + Rate-Limit-Retry
- Neue Exception-Klasse ClaudeCliError(error_type, message) in claude_client.py mit Kategorien rate_limit / auth_error / timeout / cli_error.
- _classify_cli_error() als geteilter Klassifikator (Keywords fuer Rate-Limit und Auth-Fehler wie "does not have access", "login again").
- call_claude() erkennt jetzt auch is_error=true im JSON bei returncode=0 (Hauptursache des Ausfalls vom 22.04.: CLI liefert "Your organization does not have access" mit is_error=true statt Exit-Code).
- Orchestrator: ClaudeCliError mit rate_limit/timeout als transient behandelt (bis zu 3 Versuche mit Backoff 0s/120s/300s). auth_error/cli_error brechen sofort ab ohne Retry. Behebt den bestehenden Bug, dass Rate-Limit-Fehler gar nicht retried wurden.
- routers/incidents.py Enhance-Endpoint: ClaudeCliError wird auf 503 (auth_error) / 429 (rate_limit) gemappt, TimeoutError auf 504.
- routers/chat.py _call_claude_chat(): wirft jetzt ClaudeCliError statt generischem RuntimeError. Chat-Endpoint mappt auth_error auf 503.
- Frontend: neue ApiError-Klasse in api.js mit status+detail. generateDescription() in app.js zeigt differenzierte Toasts nach HTTP-Status (503/429/504/403).
- dashboard.html: Cache-Bust api.js + app.js auf v=20260423a
Dieser Commit ist enthalten in:
@@ -13,6 +13,35 @@ _cancel_event_var: contextvars.ContextVar[asyncio.Event | None] = contextvars.Co
|
||||
logger = logging.getLogger("osint.claude_client")
|
||||
|
||||
|
||||
class ClaudeCliError(RuntimeError):
    """Structured error raised for a failed Claude CLI call, tagged with a category.

    Attributes:
        error_type: One of
            - "rate_limit": Anthropic rate limit or overload (transient,
              suitable for a retry)
            - "auth_error": account problem (organization has no Claude
              access, token expired/invalid) - retrying is pointless, an
              admin has to act
            - "timeout": Claude CLI timeout (transient)
            - "cli_error": any other CLI failure (unspecific, the default)
        message: The raw error text without the category prefix.
    """

    def __init__(self, error_type: str, message: str):
        self.error_type = error_type
        self.message = message
        super().__init__(f"Claude CLI [{error_type}]: {message}")

    def __reduce__(self):
        # BaseException pickles/copies itself as ``cls(*self.args)``, but
        # ``args`` only holds the single formatted string while ``__init__``
        # needs two arguments. Rebuild from the original pair instead so that
        # pickle / copy / cross-process transport do not fail with TypeError.
        return (type(self), (self.error_type, self.message), self.__dict__)
|
||||
|
||||
|
||||
def _classify_cli_error(combined_output: str) -> str:
    """Map a CLI error output to one of the error_type categories.

    The text is matched case-insensitively against keyword lists. Rate-limit
    keywords are checked before auth keywords; if nothing matches the result
    is "cli_error".
    """
    lowered = combined_output.lower()
    # Order matters: the first category with a matching keyword wins.
    categories = (
        ("rate_limit", ("hit your limit", "rate limit", "resets", "rate_limit", "overloaded")),
        ("auth_error", ("does not have access", "login again", "contact your administrator")),
    )
    for category, keywords in categories:
        for keyword in keywords:
            if keyword in lowered:
                return category
    return "cli_error"
|
||||
|
||||
|
||||
@dataclass
|
||||
class ClaudeUsage:
|
||||
"""Token-Verbrauch eines einzelnen Claude CLI Aufrufs."""
|
||||
@@ -121,19 +150,20 @@ async def call_claude(prompt: str, tools: str | None = "WebSearch,WebFetch", mod
|
||||
error_msg = stderr.decode("utf-8", errors="replace").strip()
|
||||
stdout_msg = stdout.decode("utf-8", errors="replace").strip()
|
||||
|
||||
# Rate-Limit-Fehler kommen als JSON auf stdout, nicht auf stderr
|
||||
error_type = "cli_error"
|
||||
rate_limit_keywords = ["hit your limit", "rate limit", "resets", "rate_limit", "overloaded"]
|
||||
combined_output = f"{error_msg} {stdout_msg}".lower()
|
||||
if any(kw in combined_output for kw in rate_limit_keywords):
|
||||
error_type = "rate_limit"
|
||||
# Rate-Limit/Auth-Fehler kommen teils als JSON auf stdout, nicht auf stderr
|
||||
combined_output = f"{error_msg} {stdout_msg}"
|
||||
error_type = _classify_cli_error(combined_output)
|
||||
|
||||
if error_type == "rate_limit":
|
||||
logger.warning(f"Claude CLI Rate-Limit (Exit {process.returncode}): {stdout_msg or error_msg}")
|
||||
elif error_type == "auth_error":
|
||||
logger.error(f"Claude CLI Auth-Fehler (Exit {process.returncode}): {stdout_msg or error_msg}")
|
||||
else:
|
||||
logger.error(f"Claude CLI Fehler (Exit {process.returncode}): {error_msg}")
|
||||
if stdout_msg:
|
||||
logger.error(f"Claude CLI stdout bei Fehler: {stdout_msg[:500]}")
|
||||
|
||||
raise RuntimeError(f"Claude CLI Fehler [{error_type}]: {stdout_msg or error_msg}")
|
||||
raise ClaudeCliError(error_type, stdout_msg or error_msg)
|
||||
|
||||
raw = stdout.decode("utf-8", errors="replace").strip()
|
||||
usage = ClaudeUsage()
|
||||
@@ -141,6 +171,19 @@ async def call_claude(prompt: str, tools: str | None = "WebSearch,WebFetch", mod
|
||||
|
||||
try:
|
||||
data = json.loads(raw)
|
||||
# CLI kann returncode=0 liefern und trotzdem is_error=true setzen
|
||||
# (z.B. "Your organization does not have access to Claude")
|
||||
if data.get("is_error"):
|
||||
error_text = str(data.get("result", ""))
|
||||
error_type = _classify_cli_error(error_text)
|
||||
if error_type == "rate_limit":
|
||||
logger.warning(f"Claude CLI Rate-Limit (is_error): {error_text}")
|
||||
elif error_type == "auth_error":
|
||||
logger.error(f"Claude CLI Auth-Fehler (is_error): {error_text}")
|
||||
else:
|
||||
logger.error(f"Claude CLI Fehler (is_error): {error_text}")
|
||||
raise ClaudeCliError(error_type, error_text)
|
||||
|
||||
result_text = data.get("result", raw)
|
||||
u = data.get("usage", {})
|
||||
usage = ClaudeUsage(
|
||||
|
||||
@@ -527,8 +527,12 @@ class AgentOrchestrator:
|
||||
|
||||
RETRY_DELAYS = [0, 120, 300] # Sekunden: sofort, 2min, 5min
|
||||
TRANSIENT_ERRORS = (asyncio.TimeoutError, TimeoutError, ConnectionError, OSError)
|
||||
from agents.claude_client import ClaudeCliError
|
||||
last_error = None
|
||||
|
||||
def _is_transient_cli(err: Exception) -> bool:
|
||||
return isinstance(err, ClaudeCliError) and err.error_type in ("rate_limit", "timeout")
|
||||
|
||||
try:
|
||||
# Research-Lagen: Automatisch 3 Durchläufe nur beim ersten Refresh
|
||||
incident_type, has_summary = await self._get_incident_info(incident_id)
|
||||
@@ -557,14 +561,23 @@ class AgentOrchestrator:
|
||||
}, _vis, _cb, _tid)
|
||||
last_error = None
|
||||
break
|
||||
except TRANSIENT_ERRORS as e:
|
||||
except Exception as e:
|
||||
# Auth/CLI-Fehler: sofort abbrechen, kein Retry sinnvoll
|
||||
if isinstance(e, ClaudeCliError) and e.error_type in ("auth_error", "cli_error"):
|
||||
last_error = e
|
||||
logger.warning(f"Transienter Fehler bei Lage {incident_id} (Versuch {attempt + 1}/3): {e}")
|
||||
logger.error(f"Permanenter Claude-Fehler [{e.error_type}] bei Lage {incident_id}: {e}")
|
||||
await self._mark_refresh_failed(incident_id, str(e))
|
||||
break
|
||||
|
||||
# Transiente Fehler: Retry bis 3x
|
||||
if isinstance(e, TRANSIENT_ERRORS) or _is_transient_cli(e):
|
||||
last_error = e
|
||||
kind = e.error_type if isinstance(e, ClaudeCliError) else type(e).__name__
|
||||
logger.warning(f"Transienter Fehler [{kind}] bei Lage {incident_id} (Versuch {attempt + 1}/3): {e}")
|
||||
if attempt < 2:
|
||||
await self._mark_refresh_failed(incident_id, str(e))
|
||||
delay = RETRY_DELAYS[attempt + 1]
|
||||
logger.info(f"Retry in {delay}s für Lage {incident_id}")
|
||||
# Retry-Status per WebSocket senden
|
||||
if self._ws_manager:
|
||||
try:
|
||||
_vis, _cb, _tid = await self._get_incident_visibility(incident_id)
|
||||
@@ -576,13 +589,16 @@ class AgentOrchestrator:
|
||||
"data": {"status": "retrying", "attempt": attempt + 1, "delay": delay},
|
||||
}, _vis, _cb, _tid)
|
||||
await asyncio.sleep(delay)
|
||||
continue
|
||||
else:
|
||||
await self._mark_refresh_failed(incident_id, f"Endgültig fehlgeschlagen nach 3 Versuchen: {e}")
|
||||
except Exception as e:
|
||||
break
|
||||
|
||||
# Alles andere: permanent
|
||||
last_error = e
|
||||
logger.error(f"Permanenter Fehler bei Refresh für Lage {incident_id}: {e}")
|
||||
await self._mark_refresh_failed(incident_id, str(e))
|
||||
break # Permanenter Fehler, kein Retry
|
||||
break
|
||||
|
||||
if last_error and self._ws_manager:
|
||||
try:
|
||||
|
||||
@@ -15,7 +15,7 @@ from config import CLAUDE_PATH, CLAUDE_MODEL_FAST
|
||||
from database import db_dependency
|
||||
from middleware.license_check import require_writable_license
|
||||
from services.license_service import charge_usage_to_tenant
|
||||
from agents.claude_client import ClaudeUsage
|
||||
from agents.claude_client import ClaudeUsage, ClaudeCliError, _classify_cli_error
|
||||
import aiosqlite
|
||||
|
||||
logger = logging.getLogger("osint.chat")
|
||||
@@ -59,10 +59,11 @@ async def _call_claude_chat(prompt: str) -> tuple[str, int, ClaudeUsage]:
|
||||
|
||||
if process.returncode != 0:
|
||||
err_msg = stderr.decode("utf-8", errors="replace").strip()
|
||||
logger.error(f"Chat Claude CLI Fehler (rc={process.returncode}): {err_msg[:500]}")
|
||||
if "rate_limit" in err_msg.lower() or "overloaded" in err_msg.lower():
|
||||
raise RuntimeError("rate_limit")
|
||||
raise RuntimeError(f"Claude CLI Fehler: {err_msg[:200]}")
|
||||
stdout_msg = stdout.decode("utf-8", errors="replace").strip()
|
||||
combined = f"{err_msg} {stdout_msg}"
|
||||
error_type = _classify_cli_error(combined)
|
||||
logger.error(f"Chat Claude CLI Fehler [{error_type}] (rc={process.returncode}): {(stdout_msg or err_msg)[:500]}")
|
||||
raise ClaudeCliError(error_type, stdout_msg or err_msg)
|
||||
|
||||
raw = stdout.decode("utf-8", errors="replace").strip()
|
||||
duration_ms = 0
|
||||
@@ -71,6 +72,12 @@ async def _call_claude_chat(prompt: str) -> tuple[str, int, ClaudeUsage]:
|
||||
|
||||
try:
|
||||
data = _json.loads(raw)
|
||||
if data.get("is_error"):
|
||||
error_text = str(data.get("result", ""))
|
||||
error_type = _classify_cli_error(error_text)
|
||||
logger.error(f"Chat Claude CLI Fehler [{error_type}] (is_error): {error_text[:500]}")
|
||||
raise ClaudeCliError(error_type, error_text)
|
||||
|
||||
result_text = data.get("result", raw)
|
||||
duration_ms = data.get("duration_ms", 0)
|
||||
u = data.get("usage", {})
|
||||
@@ -437,11 +444,15 @@ async def chat(
|
||||
result, duration_ms, usage = await _call_claude_chat(prompt)
|
||||
except TimeoutError:
|
||||
raise HTTPException(status_code=504, detail="Der Assistent antwortet gerade nicht. Bitte versuche es erneut.")
|
||||
except RuntimeError as e:
|
||||
error_str = str(e)
|
||||
if "rate_limit" in error_str:
|
||||
except ClaudeCliError as e:
|
||||
if e.error_type == "rate_limit":
|
||||
raise HTTPException(status_code=429, detail="Der Assistent ist gerade ausgelastet. Bitte versuche es in einer Minute erneut.")
|
||||
logger.error(f"Chat Claude-Fehler: {e}")
|
||||
if e.error_type == "auth_error":
|
||||
raise HTTPException(status_code=503, detail="KI-Zugang aktuell nicht verfuegbar. Bitte Administrator kontaktieren.")
|
||||
logger.error(f"Chat Claude-Fehler [{e.error_type}]: {e}")
|
||||
raise HTTPException(status_code=502, detail="Der Assistent ist voruebergehend nicht erreichbar.")
|
||||
except RuntimeError as e:
|
||||
logger.error(f"Chat Claude-Fehler (unspezifisch): {e}")
|
||||
raise HTTPException(status_code=502, detail="Der Assistent ist voruebergehend nicht erreichbar.")
|
||||
|
||||
# Credits buchen
|
||||
|
||||
@@ -245,7 +245,7 @@ async def enhance_description(
|
||||
db: aiosqlite.Connection = Depends(db_dependency),
|
||||
):
|
||||
"""Generiert eine strukturierte Beschreibung per KI aus dem Titel."""
|
||||
from agents.claude_client import call_claude
|
||||
from agents.claude_client import call_claude, ClaudeCliError
|
||||
from config import CLAUDE_MODEL_FAST
|
||||
from services.license_service import charge_usage_to_tenant
|
||||
|
||||
@@ -255,6 +255,22 @@ async def enhance_description(
|
||||
|
||||
try:
|
||||
result, usage = await call_claude(prompt, tools=None, model=CLAUDE_MODEL_FAST, raw_text=True)
|
||||
except ClaudeCliError as e:
|
||||
_enhance_logger.error(f"Beschreibung generieren: ClaudeCliError [{e.error_type}]: {e.message}")
|
||||
if e.error_type == "auth_error":
|
||||
raise HTTPException(status_code=503, detail="KI-Zugang aktuell nicht verfuegbar. Bitte Administrator kontaktieren.")
|
||||
if e.error_type == "rate_limit":
|
||||
raise HTTPException(status_code=429, detail="KI ist gerade ausgelastet. Bitte in einer Minute erneut versuchen.")
|
||||
raise HTTPException(status_code=500, detail="Beschreibung konnte nicht generiert werden")
|
||||
except TimeoutError:
|
||||
_enhance_logger.error("Beschreibung generieren: Timeout")
|
||||
raise HTTPException(status_code=504, detail="Die KI antwortet gerade nicht. Bitte erneut versuchen.")
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
_enhance_logger.error(f"Beschreibung generieren fehlgeschlagen: {e}")
|
||||
raise HTTPException(status_code=500, detail="Beschreibung konnte nicht generiert werden")
|
||||
|
||||
_enhance_logger.info(
|
||||
f"Beschreibung generiert fuer \"{data.title[:50]}\": "
|
||||
f"{usage.input_tokens}in/{usage.output_tokens}out"
|
||||
@@ -262,9 +278,6 @@ async def enhance_description(
|
||||
await charge_usage_to_tenant(db, current_user.get("tenant_id"), usage, source="enhance")
|
||||
await db.commit()
|
||||
return {"description": result.strip()}
|
||||
except Exception as e:
|
||||
_enhance_logger.error(f"Beschreibung generieren fehlgeschlagen: {e}")
|
||||
raise HTTPException(status_code=500, detail="Beschreibung konnte nicht generiert werden")
|
||||
|
||||
|
||||
@router.get("/{incident_id}", response_model=IncidentResponse)
|
||||
|
||||
@@ -624,11 +624,11 @@
|
||||
<script src="https://cdn.jsdelivr.net/npm/d3@7/dist/d3.min.js"></script>
|
||||
<script src="/static/vendor/leaflet.js"></script>
|
||||
<script src="/static/vendor/leaflet.markercluster.js"></script>
|
||||
<script src="/static/js/api.js?v=20260316c"></script>
|
||||
<script src="/static/js/api.js?v=20260423a"></script>
|
||||
<script src="/static/js/ws.js?v=20260316b"></script>
|
||||
<script src="/static/js/components.js?v=20260316d"></script>
|
||||
<script src="/static/js/layout.js?v=20260316b"></script>
|
||||
<script src="/static/js/app.js?v=20260316b"></script>
|
||||
<script src="/static/js/app.js?v=20260423a"></script>
|
||||
<script src="/static/js/cluster-data.js?v=20260322f"></script>
|
||||
<script src="/static/js/tutorial.js?v=20260316z"></script>
|
||||
<script src="/static/js/chat.js?v=20260422a"></script>
|
||||
|
||||
@@ -1,6 +1,16 @@
|
||||
/**
|
||||
* API-Client für den OSINT Lagemonitor.
|
||||
*/
|
||||
|
||||
/**
 * Error thrown by the API client for a failed HTTP response.
 *
 * Carries the HTTP status and the (already stringified) detail so callers
 * can branch on `err.status`. When no detail is given, the message falls
 * back to a generic text containing the status.
 */
class ApiError extends Error {
    constructor(status, detail) {
        const fallback = `Fehler ${status}`;
        super(detail ? detail : fallback);
        Object.assign(this, { name: 'ApiError', status, detail });
    }
}
|
||||
|
||||
const API = {
|
||||
baseUrl: '/api',
|
||||
|
||||
@@ -57,7 +67,7 @@ const API = {
|
||||
} else if (typeof detail === 'object' && detail !== null) {
|
||||
detail = JSON.stringify(detail);
|
||||
}
|
||||
throw new Error(detail || `Fehler ${response.status}`);
|
||||
throw new ApiError(response.status, detail);
|
||||
}
|
||||
|
||||
if (response.status === 204) return null;
|
||||
|
||||
@@ -1816,8 +1816,15 @@ async generateDescription() {
|
||||
textarea.value = result.description;
|
||||
_autoResizeTextarea(textarea);
|
||||
} catch (err) {
|
||||
if (err.name !== 'AbortError') {
|
||||
UI.showToast('Beschreibung konnte nicht generiert werden', 'error');
|
||||
if (err.name === 'AbortError') {
|
||||
// still
|
||||
} else {
|
||||
let msg = 'Beschreibung konnte nicht generiert werden';
|
||||
if (err.status === 503) msg = 'KI-Zugang aktuell nicht verfügbar. Bitte Administrator kontaktieren.';
|
||||
else if (err.status === 429) msg = 'KI ist gerade ausgelastet. Bitte kurz warten und erneut versuchen.';
|
||||
else if (err.status === 504) msg = 'KI antwortet gerade nicht. Bitte erneut versuchen.';
|
||||
else if (err.status === 403) msg = err.detail || 'Zugriff verweigert.';
|
||||
UI.showToast(msg, 'error');
|
||||
}
|
||||
} finally {
|
||||
btnText.textContent = 'Beschreibung generieren';
|
||||
|
||||
In neuem Issue referenzieren
Einen Benutzer sperren