technology
February 8, 2026 · Error Handling and Recovery
---
title: "Error Handling and Recovery"
description: "Research on resilience patterns, graceful degradation, and retry strategies in agentic systems"
date: 2026-02-06
topics: [error-handling, resilience, recovery, reliability]
sources: 0
status: initial
---
Error Handling and Recovery
Overview
Robust agentic systems must handle failures gracefully, recover automatically where possible, and escalate appropriately when human intervention is needed. This research covers error classification, recovery strategies, and resilience patterns.
Error Classification
1. By Source
/**
 * Classifies an error by the part of the system it originated from.
 *
 * Every member is a string enum, so the value that appears in logs or
 * serialized reports is the short lowercase tag on the right-hand side.
 */
enum ErrorSource {
  /** Model failures, hallucinations. */
  LLM_ERROR = 'llm',
  /** External tool failures. */
  TOOL_ERROR = 'tool',
  /** Input/output validation. */
  VALIDATION_ERROR = 'validation',
  /** Execution timeouts. */
  TIMEOUT_ERROR = 'timeout',
  /** Resource exhaustion. */
  RESOURCE_ERROR = 'resource',
  /** Auth/authorization. */
  PERMISSION_ERROR = 'permission',
  /** Business logic errors. */
  LOGIC_ERROR = 'logic',
  /** Connectivity issues. */
  NETWORK_ERROR = 'network',
  /** Uncategorized. */
  UNKNOWN_ERROR = 'unknown',
}
2. By Severity
/**
 * Classifies an error by its impact, listed from most to least severe.
 *
 * NOTE: the values are strings, so relational operators (`>=`, `<`) compare
 * them alphabetically rather than by severity (`'critical' >= 'high'` is
 * `false`). Compare against specific members instead of ordering them.
 */
enum ErrorSeverity {
  /** System unusable, immediate attention. */
  CRITICAL = 'critical',
  /** Major functionality impaired. */
  HIGH = 'high',
  /** Minor functionality impaired. */
  MEDIUM = 'medium',
  /** Cosmetic or informational. */
  LOW = 'low',
  /** Potential issue, no immediate impact. */
  WARNING = 'warning',
}
3. By Recoverability
/**
 * Classifies an error by whether, and how, the system can recover from it.
 */
enum Recoverability {
  /** Can retry/fix automatically. */
  AUTO_RECOVERABLE = 'auto',
  /** Needs human intervention. */
  MANUAL_RECOVERABLE = 'manual',
  /** Data loss, cannot recover. */
  NON_RECOVERABLE = 'none',
  /** Recovery path unclear. */
  UNKNOWN = 'unknown',
}
Error Handling Patterns
1. Circuit Breaker Pattern
/** Thrown by `CircuitBreaker.execute` when a call is rejected because the circuit is open. */
class CircuitOpenError extends Error {
  constructor(message: string) {
    super(message);
    this.name = 'CircuitOpenError';
  }
}

/**
 * Circuit breaker: stops calling a failing dependency once it has failed
 * `threshold` times in a row, then lets a trial call through after a cool-down.
 *
 * States:
 * - `closed`    – calls pass through; consecutive failures are counted.
 * - `open`      – calls are rejected immediately with `CircuitOpenError`.
 * - `half-open` – the cool-down has elapsed; the next call is a trial. Success
 *                 closes the circuit, failure re-opens it.
 *
 * The original snippet was declared as an `interface` (which cannot contain
 * method bodies) and referenced `threshold` and `shouldAttemptReset()` without
 * defining them; both are supplied here.
 */
class CircuitBreaker {
  state: 'closed' | 'open' | 'half-open' = 'closed';
  failureCount = 0;
  /** Time of the most recent failure; the epoch until a failure has been recorded. */
  lastFailureTime: Date = new Date(0);

  /**
   * @param threshold      Consecutive failures that open the circuit.
   * @param resetTimeoutMs Milliseconds the circuit stays open before a trial call is allowed.
   * @param now            Clock returning epoch milliseconds; injectable for tests.
   */
  constructor(
    private readonly threshold: number = 5,
    private readonly resetTimeoutMs: number = 30000,
    private readonly now: () => number = Date.now,
  ) {}

  /**
   * Runs `fn` through the breaker.
   *
   * @returns Whatever `fn` resolves to.
   * @throws CircuitOpenError when the circuit is open and the cool-down has not elapsed.
   * @throws Whatever `fn` rejects with (after the failure has been recorded).
   */
  async execute<T>(fn: () => Promise<T>): Promise<T> {
    if (this.state === 'open') {
      if (!this.shouldAttemptReset()) {
        throw new CircuitOpenError('Circuit is open');
      }
      this.state = 'half-open';
    }

    try {
      const result = await fn();
      this.onSuccess();
      return result;
    } catch (error) {
      this.onFailure();
      throw error;
    }
  }

  /** True once `resetTimeoutMs` has passed since the last recorded failure. */
  private shouldAttemptReset(): boolean {
    return this.now() - this.lastFailureTime.getTime() >= this.resetTimeoutMs;
  }

  private onSuccess(): void {
    this.failureCount = 0;
    this.state = 'closed';
  }

  private onFailure(): void {
    this.failureCount++;
    this.lastFailureTime = new Date(this.now());
    // A failed trial call re-opens the circuit regardless of the counter.
    if (this.state === 'half-open' || this.failureCount >= this.threshold) {
      this.state = 'open';
    }
  }
}
2. Retry with Backoff
/** Constructor of an error type; used with `instanceof` to decide whether a failure is retryable. */
type ErrorClass = new (...args: never[]) => Error;

/** Settings for {@link retryWithBackoff}. */
interface RetryConfig {
  /** Total number of calls to make, including the first one. */
  maxAttempts: number;
  /** Delay before the first retry (passed to `setTimeout`, i.e. milliseconds). */
  baseDelay: number;
  /** Upper bound applied to every delay. */
  maxDelay: number;
  /** Factor the delay is multiplied by after each retry. */
  backoffMultiplier: number;
  /** Only errors that are instances of one of these classes are retried. */
  retryableErrors: ErrorClass[];
}

/**
 * Calls `fn` until it succeeds, retrying retryable failures with an
 * exponentially growing, capped delay.
 *
 * @param fn     Operation to run; invoked at most `config.maxAttempts` times.
 * @param config Retry settings.
 * @returns The first successful result of `fn`.
 * @throws The error from `fn` as soon as it is not retryable, or the last
 *         error once `maxAttempts` calls have failed.
 * @throws Error `'Max retries exceeded'` when `maxAttempts` is less than 1,
 *         in which case `fn` is never called.
 */
async function retryWithBackoff<T>(
  fn: () => Promise<T>,
  config: RetryConfig
): Promise<T> {
  const pause = (ms: number): Promise<void> =>
    new Promise<void>((resolve) => {
      setTimeout(resolve, ms);
    });

  // Cap the very first delay as well; previously a baseDelay above maxDelay
  // was used unchanged for the first retry.
  let delay = Math.min(config.baseDelay, config.maxDelay);

  for (let attempt = 1; attempt <= config.maxAttempts; attempt++) {
    try {
      return await fn();
    } catch (error) {
      const retryable = config.retryableErrors.some(
        (errorClass) => error instanceof errorClass
      );
      if (!retryable || attempt === config.maxAttempts) {
        throw error;
      }
      await pause(delay);
      delay = Math.min(delay * config.backoffMultiplier, config.maxDelay);
    }
  }

  throw new Error('Max retries exceeded');
}
3. Fallback Chain
/**
 * Tries a list of strategies in order and returns the first result that
 * succeeds.
 *
 * The original snippet was an `interface` with a method body (not valid
 * TypeScript) yet was instantiated with `new`; it is a class here, constructed
 * from an options object exactly as the example shows.
 *
 * @example
 * const result = await new FallbackChain<string>({
 *   strategies: [
 *     () => callPrimaryAPI(),
 *     () => callBackupAPI(),
 *     () => returnCachedValue(),
 *     () => returnDefaultValue()
 *   ]
 * }).execute();
 */
class FallbackChain<T> {
  /** Strategies in priority order; earlier entries are preferred. */
  strategies: (() => Promise<T>)[];

  constructor(options: { strategies: (() => Promise<T>)[] }) {
    this.strategies = options.strategies;
  }

  /**
   * Runs the strategies one after another until one resolves.
   *
   * @returns The value of the first strategy that succeeds.
   * @throws AggregateError carrying every strategy's failure, in order, when
   *         all of them fail (also thrown, with no inner errors, when the
   *         list is empty).
   */
  async execute(): Promise<T> {
    // Anything can be thrown, so failures are collected as `unknown`.
    const errors: unknown[] = [];

    for (const strategy of this.strategies) {
      try {
        return await strategy();
      } catch (error) {
        errors.push(error);
      }
    }

    throw new AggregateError(errors, 'All fallback strategies failed');
  }
}
4. Graceful Degradation
/**
 * Three-tier execution: try the primary implementation, then a degraded one,
 * and finally fall back to a fixed value. `execute()` therefore never rejects.
 *
 * The original snippet was an `interface` with a method body, which is not
 * valid TypeScript; it is a class here, built from an options object.
 */
class GracefulDegradation<T> {
  /** Preferred implementation. */
  primary: () => Promise<T>;
  /** Reduced-functionality implementation used when `primary` fails. */
  degraded: () => Promise<T>;
  /** Value returned when both `primary` and `degraded` fail. */
  fallback: T;

  constructor(options: {
    primary: () => Promise<T>;
    degraded: () => Promise<T>;
    fallback: T;
  }) {
    this.primary = options.primary;
    this.degraded = options.degraded;
    this.fallback = options.fallback;
  }

  /**
   * @returns The primary result, else the degraded result, else `fallback`.
   *          Each failure is logged (warn for primary, error for degraded).
   */
  async execute(): Promise<T> {
    try {
      return await this.primary();
    } catch (error) {
      console.warn('Primary failed, using degraded mode:', error);
    }

    try {
      return await this.degraded();
    } catch (degradedError) {
      console.error('Degraded mode failed, using fallback:', degradedError);
      return this.fallback;
    }
  }
}
Recovery Strategies
1. Self-Healing
/**
 * Diagnoses an error and applies the matching automatic remedy.
 *
 * The original snippet was an `interface` with a method body (not valid
 * TypeScript) and called helpers it never declared. Those helpers are declared
 * here as abstract hooks for a concrete subclass to implement.
 */
abstract class SelfHealing {
  /** Determines the root cause of `error`. */
  protected abstract diagnose(error: Error): Promise<{ rootCause: string }>;
  protected abstract clearCache(): Promise<void>;
  protected abstract killBlockedProcesses(): Promise<void>;
  protected abstract restartService(): Promise<void>;
  protected abstract resetToLastKnownGood(): Promise<void>;

  /**
   * Attempts to recover from `error` based on its diagnosed root cause.
   *
   * @returns `{ recovered: true }` (with an `action` for restart/reset) once
   *          the remedy has completed, or `{ recovered: false, reason }` when
   *          the root cause is not one of the known cases.
   * @throws Whatever the diagnosis or the chosen remedy rejects with.
   */
  async attemptRecovery(error: Error): Promise<RecoveryResult> {
    const { rootCause } = await this.diagnose(error);

    switch (rootCause) {
      case 'stale_cache':
        await this.clearCache();
        return { recovered: true };

      case 'deadlock':
        await this.killBlockedProcesses();
        return { recovered: true };

      case 'memory_leak':
        await this.restartService();
        return { recovered: true, action: 'restart' };

      case 'corrupt_state':
        await this.resetToLastKnownGood();
        return { recovered: true, action: 'reset' };

      default:
        return { recovered: false, reason: 'Unknown root cause' };
    }
  }
}
2. Checkpoint and Resume
/**
 * In-memory checkpoint store supporting resume and rollback.
 *
 * Changes from the original snippet:
 * - it was an `interface` with method bodies (not valid TypeScript);
 * - `rollback` dereferenced the target checkpoint without checking it exists;
 * - `resumeFromCheckpoint` / `rollback` returned the stored state object
 *   itself, so a caller mutating the result silently corrupted the
 *   checkpoint. State is now cloned on the way out as well as on the way in;
 * - `validateRollback` was called but never defined.
 */
class CheckpointSystem {
  checkpoints: Map<CheckpointId, Checkpoint> = new Map();

  /**
   * Stores a deep copy of `state` under a freshly generated id.
   *
   * @returns The id of the new checkpoint.
   */
  async createCheckpoint(state: State): Promise<CheckpointId> {
    const id = generateId();
    this.checkpoints.set(id, {
      id,
      timestamp: new Date(),
      state: structuredClone(state)
    });
    return id;
  }

  /**
   * @returns A deep copy of the state saved in checkpoint `id`.
   * @throws Error when no checkpoint with that id exists.
   */
  async resumeFromCheckpoint(id: CheckpointId): Promise<State> {
    return structuredClone(this.requireCheckpoint(id).state);
  }

  /**
   * Rolls back from checkpoint `from` to checkpoint `to`.
   *
   * @returns A deep copy of the state saved in `to`.
   * @throws Error when either checkpoint is missing or the rollback is
   *         rejected by `validateRollback`.
   */
  async rollback(from: CheckpointId, to: CheckpointId): Promise<State> {
    const current = this.requireCheckpoint(from);
    const target = this.requireCheckpoint(to);

    // Validate rollback is safe
    await this.validateRollback(current, target);

    return structuredClone(target.state);
  }

  /**
   * Default safety rule: the target must not have been taken after the
   * checkpoint being rolled back from. Override to add stricter policies.
   */
  protected async validateRollback(current: Checkpoint, target: Checkpoint): Promise<void> {
    if (target.timestamp.getTime() > current.timestamp.getTime()) {
      throw new Error(
        `Cannot roll back from ${current.id} to ${target.id}: target is newer`
      );
    }
  }

  /** Looks up a checkpoint, throwing when it does not exist. */
  private requireCheckpoint(id: CheckpointId): Checkpoint {
    const checkpoint = this.checkpoints.get(id);
    if (!checkpoint) {
      throw new Error(`Checkpoint ${id} not found`);
    }
    return checkpoint;
  }
}
3. Transactional Compensation
/** A step that can be undone after it has completed. */
interface CompensationAction {
  /** Performs the step. */
  execute(): Promise<void>;
  /** Reverses the effect of a completed `execute()`. */
  compensate(): Promise<void>;
}

/**
 * Runs a list of actions in order; if one fails, the actions that already
 * completed are compensated in reverse order and the failure is rethrown.
 *
 * The original snippet used `async` on interface members and put a method
 * body inside an `interface`, neither of which is valid TypeScript.
 */
class Transaction {
  actions: CompensationAction[];

  constructor(actions: CompensationAction[] = []) {
    this.actions = actions;
  }

  /**
   * @throws The error from the first action that fails. The failing action
   *         itself is not compensated; only actions that completed before it are.
   */
  async execute(): Promise<void> {
    const completed: CompensationAction[] = [];

    try {
      for (const action of this.actions) {
        await action.execute();
        completed.push(action);
      }
    } catch (error) {
      // Rollback completed actions, most recent first
      for (const action of [...completed].reverse()) {
        try {
          await action.compensate();
        } catch (compensationError) {
          // Log but continue rollback
          console.error('Compensation failed:', compensationError);
        }
      }
      throw error;
    }
  }
}
Error Boundaries
1. Per-Phase Boundaries
/** What the phase-level handler decided after a failure. */
type PhaseErrorOutcome<T> =
  | { canContinue: true; result: T }
  | { canContinue: false; options: string[] };

/**
 * Contains failures inside a single phase: the phase either yields a value
 * (its own or a substitute from the handler) or a structured error result;
 * nothing thrown by `fn` is rethrown by the boundary itself.
 *
 * Changes from the original snippet:
 * - it was an `interface` with a method body (not valid TypeScript);
 * - the error result literal declared the key `error` twice, so the message
 *   overwrote the `error: true` flag. The message now lives under `message`;
 * - `error.message` was read from an untyped caught value.
 */
abstract class PhaseErrorBoundary {
  /** Decides whether the phase can continue after `error` and with what. */
  protected abstract handlePhaseError<T>(
    phase: string,
    error: unknown
  ): Promise<PhaseErrorOutcome<T>>;

  /**
   * @param phase Name of the phase, echoed in the error result.
   * @param fn    The phase body.
   * @returns The result of `fn`, the handler's substitute result, or an
   *          `ErrorResult` describing the failure and the recovery options.
   */
  async executeWithBoundary<T>(
    phase: string,
    fn: () => Promise<T>
  ): Promise<T | ErrorResult> {
    try {
      return await fn();
    } catch (error) {
      const handled = await this.handlePhaseError<T>(phase, error);
      if (handled.canContinue) {
        return handled.result;
      }

      return {
        error: true,
        phase,
        message: error instanceof Error ? error.message : String(error),
        recoveryOptions: handled.options
      };
    }
  }
}
2. Per-Agent Boundaries
/**
 * Contains failures inside a single agent: when the work fails, the agent is
 * isolated and the same work is handed to a fallback agent.
 *
 * Changes from the original snippet:
 * - it was an `interface` with a method body (not valid TypeScript);
 * - when no fallback agent was available, `fallback.execute` crashed with a
 *   TypeError that hid the real failure. The original error is now rethrown.
 */
abstract class AgentErrorBoundary {
  /** Takes `agent` out of rotation. */
  protected abstract isolateAgent(agent: Agent): Promise<void>;
  /** Finds a replacement for `agent`, or nothing when none is available. */
  protected abstract getFallbackAgent(agent: Agent): Promise<Agent | null | undefined>;

  /**
   * @param agent The agent responsible for `fn`.
   * @param fn    The work to run.
   * @returns The result of `fn`, or of the fallback agent executing `fn`.
   * @throws The original error when there is no fallback agent; otherwise
   *         whatever the fallback agent's `execute` rejects with.
   */
  async executeWithIsolation<T>(
    agent: Agent,
    fn: () => Promise<T>
  ): Promise<T> {
    try {
      return await fn();
    } catch (error) {
      // Isolate failure to this agent
      await this.isolateAgent(agent);

      // Try fallback agent
      const fallback = await this.getFallbackAgent(agent);
      if (!fallback) {
        throw error;
      }
      return await fallback.execute(fn);
    }
  }
}
3. Global Error Boundary
/**
 * Last-resort handler for errors that escaped every narrower boundary:
 * log, notify humans for severe errors, attempt recovery, and drop into safe
 * mode when recovery does not succeed.
 *
 * Changes from the original snippet:
 * - it was an `interface` with a method body (not valid TypeScript);
 * - `severity >= ErrorSeverity.HIGH` compared the enum's *string* values, so
 *   `'critical' >= 'high'` was false and critical errors never notified
 *   anyone. Severity is now matched against the explicit members;
 * - a recovery attempt that threw skipped safe mode. It is now treated like
 *   an unsuccessful recovery.
 */
abstract class GlobalErrorBoundary {
  protected abstract logError(error: Error): Promise<void>;
  protected abstract assessSeverity(error: Error): ErrorSeverity;
  protected abstract notifyHumans(error: Error): Promise<void>;
  protected abstract attemptGlobalRecovery(error: Error): Promise<{ success: boolean }>;
  protected abstract enterSafeMode(): Promise<void>;

  /**
   * Handles `error` end to end. Resolves once logging, optional notification,
   * and recovery (or safe-mode entry) have completed.
   */
  async handleGlobalError(error: Error): Promise<void> {
    // Log error
    await this.logError(error);

    // Assess severity
    const severity = this.assessSeverity(error);

    // Notify if needed (HIGH or above)
    if (severity === ErrorSeverity.CRITICAL || severity === ErrorSeverity.HIGH) {
      await this.notifyHumans(error);
    }

    // Attempt recovery; a recovery routine that throws has not recovered.
    let recovered = false;
    try {
      recovered = (await this.attemptGlobalRecovery(error)).success;
    } catch (recoveryError) {
      await this.logError(
        recoveryError instanceof Error ? recoveryError : new Error(String(recoveryError))
      );
    }

    if (!recovered) {
      // Enter safe mode
      await this.enterSafeMode();
    }
  }
}
Monitoring and Alerting
1. Error Tracking
/** The record built and stored for every tracked error. */
interface ErrorReport {
  id: string;
  timestamp: Date;
  error: {
    type: string;
    message: string;
    stack: string | undefined;
  };
  context: {
    session: Context['sessionId'];
    agent: Context['agentId'];
    phase: Context['currentPhase'];
    inputs: unknown;
  };
  /** Hash of the error type and normalized message, used to group similar errors. */
  fingerprint: string;
}

/**
 * Builds an error report, stores it, and feeds it to pattern analysis.
 *
 * The original snippet was an `interface` with method bodies (not valid
 * TypeScript) and relied on helpers it never declared; those helpers are
 * abstract hooks here. `generateId` and `hash` remain free functions supplied
 * by the surrounding module, as in the original.
 */
abstract class ErrorTracker {
  /** Strips or masks anything in `inputs` that must not be persisted. */
  protected abstract sanitize(inputs: Context['inputs']): unknown;
  /** Persists the report. */
  protected abstract store(report: ErrorReport): Promise<void>;
  /** Looks for recurring patterns that include this report. */
  protected abstract analyzeForPatterns(report: ErrorReport): Promise<void>;
  /** Reduces a message to its pattern, so that variants share a fingerprint. */
  protected abstract normalizeMessage(message: string): string;

  /**
   * Records `error` together with the session/agent/phase it occurred in.
   * Resolves after the report has been stored and analyzed.
   */
  async trackError(error: Error, context: Context): Promise<void> {
    const errorReport: ErrorReport = {
      id: generateId(),
      timestamp: new Date(),
      error: {
        type: error.constructor.name,
        message: error.message,
        stack: error.stack
      },
      context: {
        session: context.sessionId,
        agent: context.agentId,
        phase: context.currentPhase,
        inputs: this.sanitize(context.inputs)
      },
      fingerprint: this.generateFingerprint(error)
    };

    // Store error
    await this.store(errorReport);

    // Check for patterns
    await this.analyzeForPatterns(errorReport);
  }

  private generateFingerprint(error: Error): string {
    // Create hash based on error type and message pattern
    return hash(`${error.constructor.name}:${this.normalizeMessage(error.message)}`);
  }
}
2. Health Checks
/** Outcome of a single health probe. */
interface HealthCheckResult {
  healthy: boolean;
  /** Which probe produced the result; always set for probes that threw. */
  name?: string;
  /** Failure description; set when the probe itself threw. */
  error?: string;
}

/**
 * Runs all health probes concurrently and summarizes them.
 *
 * Changes from the original snippet:
 * - it was an `interface` with a method body (not valid TypeScript);
 * - the probes ran under a bare `Promise.all`, so one probe that threw made
 *   the whole health check reject, which is exactly when a report is most
 *   needed. A throwing probe is now reported as an unhealthy check instead.
 */
abstract class HealthMonitor {
  protected abstract checkLLMHealth(): Promise<HealthCheckResult>;
  protected abstract checkToolHealth(): Promise<HealthCheckResult>;
  protected abstract checkMemoryHealth(): Promise<HealthCheckResult>;
  protected abstract checkQueueHealth(): Promise<HealthCheckResult>;
  /** Turns the failing checks into human-readable advice. */
  protected abstract generateRecommendations(
    failing: HealthCheckResult[]
  ): HealthReport['recommendations'];

  /**
   * @returns A report that is `healthy` only when every probe is, plus all
   *          results, the failing subset, and recommendations for the latter.
   */
  async checkSystemHealth(): Promise<HealthReport> {
    const probes: Array<[string, () => Promise<HealthCheckResult>]> = [
      ['llm', () => this.checkLLMHealth()],
      ['tool', () => this.checkToolHealth()],
      ['memory', () => this.checkMemoryHealth()],
      ['queue', () => this.checkQueueHealth()]
    ];

    const checks = await Promise.all(
      probes.map(async ([name, probe]): Promise<HealthCheckResult> => {
        try {
          return await probe();
        } catch (error) {
          return {
            name,
            healthy: false,
            error: error instanceof Error ? error.message : String(error)
          };
        }
      })
    );

    const failing = checks.filter((check) => !check.healthy);

    return {
      healthy: failing.length === 0,
      checks,
      failing,
      recommendations: this.generateRecommendations(failing)
    };
  }
}
3. Alerting Rules
/**
 * A set of alerting rules evaluated against a metrics snapshot.
 *
 * The original snippet was an `interface` with a method body, which is not
 * valid TypeScript; it is a class here, constructed from its rule list.
 */
class AlertingRules {
  rules: {
    /** Returns true when the rule should fire for the given metrics. */
    condition: (metrics: Metrics) => boolean;
    severity: ErrorSeverity;
    message: string;
    channels: NotificationChannel[];
  }[];

  constructor(rules: AlertingRules['rules'] = []) {
    this.rules = rules;
  }

  /**
   * @param metrics Snapshot handed to every rule's `condition`.
   * @returns One alert per rule whose condition holds, in rule order, each
   *          stamped with the time it was created.
   */
  async evaluate(metrics: Metrics): Promise<Alert[]> {
    return this.rules
      .filter((rule) => rule.condition(metrics))
      .map((rule) => ({
        severity: rule.severity,
        message: rule.message,
        channels: rule.channels,
        timestamp: new Date()
      }));
  }
}
Resilience Testing
1. Chaos Engineering
/**
 * Injects failures into components and measures how the system copes.
 *
 * Changes from the original snippet:
 * - it was an `interface` with method bodies (not valid TypeScript);
 * - `runChaosExperiment` only called `recover` on the happy path, so an
 *   exception while injecting or measuring left the failure in place.
 *   Recovery now runs in a `finally`;
 * - an unrecognized failure type was silently ignored, which let an experiment
 *   "pass" without any failure having been injected. It now throws.
 */
abstract class ChaosEngineering {
  /** Amount handed to `addLatency`; presumably milliseconds (confirm against the implementation). */
  private static readonly INJECTED_LATENCY = 5000;
  /** Amount handed to `makeComponentError`; presumably a failure probability (confirm against the implementation). */
  private static readonly INJECTED_ERROR_RATE = 0.5;

  protected abstract addLatency(component: string, amount: number): Promise<void>;
  protected abstract makeComponentError(component: string, rate: number): Promise<void>;
  protected abstract crashComponent(component: string): Promise<void>;
  protected abstract exhaustResources(component: string): Promise<void>;
  protected abstract measureBaseline(): Promise<ExperimentResult['baseline']>;
  protected abstract measureResilience(): Promise<ExperimentResult['resilience']>;
  protected abstract measureRecovery(): Promise<ExperimentResult['recovery']>;
  /** Removes whatever failure was injected into `component`. */
  protected abstract recover(component: string): Promise<void>;

  /**
   * Injects one failure of the given type into `component`.
   *
   * @throws Error when `failureType` is not one of the supported types.
   */
  async injectFailure(
    component: string,
    failureType: FailureType
  ): Promise<void> {
    switch (failureType) {
      case 'latency':
        await this.addLatency(component, ChaosEngineering.INJECTED_LATENCY);
        break;
      case 'error':
        await this.makeComponentError(component, ChaosEngineering.INJECTED_ERROR_RATE);
        break;
      case 'crash':
        await this.crashComponent(component);
        break;
      case 'resource_exhaustion':
        await this.exhaustResources(component);
        break;
      default:
        throw new Error(`Unsupported failure type: ${String(failureType)}`);
    }
  }

  /**
   * Runs one experiment: baseline, inject, measure under failure, recover,
   * measure after recovery.
   *
   * @throws Whatever injection or measurement throws. `recover` has still been
   *         invoked by then if the failure happened after the baseline was taken.
   */
  async runChaosExperiment(
    experiment: ChaosExperiment
  ): Promise<ExperimentResult> {
    // Baseline
    const baseline = await this.measureBaseline();

    // Inject chaos, measure resilience, and always recover
    const resilience = await this.measureUnderFailure(experiment);

    return {
      baseline,
      resilience,
      recovery: await this.measureRecovery()
    };
  }

  /** Injects the experiment's failure, measures, and recovers even if either step throws. */
  private async measureUnderFailure(
    experiment: ChaosExperiment
  ): Promise<ExperimentResult['resilience']> {
    try {
      await this.injectFailure(experiment.target, experiment.failure);
      return await this.measureResilience();
    } finally {
      await this.recover(experiment.target);
    }
  }
}
2. Fault Injection
/**
 * Registry of named faults that can be switched on and off, e.g. from tests.
 *
 * The original snippet was an `interface` with method bodies (not valid
 * TypeScript) and its `faults` map was never initialised; it is a class with
 * an empty registry here.
 */
class FaultInjector {
  /** Registered faults by name. */
  faults: Map<string, Fault> = new Map();

  /** Registers `fault` under `name`, replacing any fault already using that name. */
  registerFault(name: string, fault: Fault): void {
    this.faults.set(name, fault);
  }

  /** Activates the fault registered under `name`; does nothing when no such fault exists. */
  async injectFault(name: string): Promise<void> {
    await this.faults.get(name)?.inject();
  }

  /** Deactivates the fault registered under `name`; does nothing when no such fault exists. */
  async removeFault(name: string): Promise<void> {
    await this.faults.get(name)?.remove();
  }
}
Best Practices
1. Fail Fast vs Fail Safe
/**
 * Fail Fast - detect errors early, stop immediately.
 *
 * Runs every validation up front and only executes the task when all of them
 * pass; the first validation that throws aborts the whole call.
 *
 * `input` and `user` are read from the enclosing scope, as in the original.
 */
function failFastApproach() {
  validateInput(input); // Throws if invalid
  validatePermissions(user); // Throws if unauthorized
  validateResources(); // Throws if unavailable

  // Only proceed if all validations pass
  return executeTask();
}

/**
 * Fail Safe - degrade gracefully, continue with reduced functionality.
 *
 * Tries the task, then the fallback, then a default value, logging a warning
 * at each step down.
 *
 * NOTE(review): the `try` blocks only catch synchronous throws. If
 * `executeTask` / `executeFallback` return promises, a rejection bypasses
 * these handlers unless the calls are awaited - confirm which is intended.
 */
function failSafeApproach() {
  try {
    return executeTask();
  } catch (error) {
    console.warn('Primary approach failed:', error);
  }

  try {
    return executeFallback();
  } catch (fallbackError) {
    console.warn('Fallback failed:', fallbackError);
    return returnDefaultValue();
  }
}
2. Error Context Preservation
/**
 * Runs `fn` and, if it fails, attaches the request context captured before
 * the call (`context` and `requestId` properties) to the error before
 * rethrowing it.
 *
 * Changes from the original snippet:
 * - a thrown primitive (e.g. a string) cannot carry properties; assigning to
 *   it raises a TypeError in strict mode and masks the real failure. Such
 *   values are now wrapped in an `Error` whose `cause` is the original value;
 * - a frozen or sealed error object is rethrown unchanged rather than being
 *   replaced by the TypeError from the failed assignment;
 * - a synchronous throw from `fn` bypassed `.catch` and lost the context. It
 *   now takes the same enrichment path and surfaces as a rejection.
 *
 * @param fn Operation to run; invoked synchronously, exactly once.
 * @returns A promise that resolves when `fn` does.
 * @throws The enriched error (the same object whenever `fn` failed with one).
 */
function preserveContext(fn: () => Promise<void>): Promise<void> {
  const context = {
    timestamp: new Date(),
    requestId: generateRequestId(),
    user: getCurrentUser(),
    session: getSessionId()
  };

  let pending: Promise<void>;
  try {
    pending = fn();
  } catch (syncError) {
    pending = Promise.reject(syncError);
  }

  return pending.catch((error: unknown) => {
    const target: object =
      typeof error === 'object' && error !== null
        ? error
        : Object.assign(new Error(String(error)), { cause: error });

    // Enrich error with context
    try {
      Object.assign(target, { context, requestId: context.requestId });
    } catch {
      // Frozen/sealed error: rethrow it as-is rather than masking it.
    }
    throw target;
  });
}
3. Idempotency
/**
 * An operation that runs at most once per idempotency key: when a result is
 * already stored for the key, that result is returned instead of executing
 * again.
 *
 * The original snippet was an `interface` with a method body (not valid
 * TypeScript) and called helpers it never declared; those helpers are abstract
 * hooks here.
 *
 * NOTE(review): the lookup and the store are separate steps, so two concurrent
 * calls with the same key can both execute unless the storage hooks prevent it.
 */
abstract class IdempotentOperation {
  idempotencyKey: string;

  constructor(idempotencyKey: string) {
    this.idempotencyKey = idempotencyKey;
  }

  /** Looks up a previously stored result for `key`; nothing when there is none. */
  protected abstract checkExisting(
    key: string
  ): Promise<{ result: Result } | null | undefined>;
  /** Performs the actual work. */
  protected abstract doExecute(): Promise<Result>;
  /** Persists `result` under `key` for later calls. */
  protected abstract storeResult(key: string, result: Result): Promise<void>;

  /**
   * @returns The stored result when the key was already executed, otherwise
   *          the fresh result (after it has been stored).
   */
  async execute(): Promise<Result> {
    // Check if already executed
    const existing = await this.checkExisting(this.idempotencyKey);
    if (existing) {
      return existing.result;
    }

    // Execute and store result
    const result = await this.doExecute();
    await this.storeResult(this.idempotencyKey, result);

    return result;
  }
}
Open Questions
- When to fail fast vs fail safe?
- Optimal retry counts and delays?
- How to test recovery procedures?
- Balancing resilience with complexity?
- Measuring system reliability accurately?