7 min
technology
February 8, 2026

Error Handling and Recovery


title: "Error Handling and Recovery"
description: "Research on resilience patterns, graceful degradation, and retry strategies in agentic systems"
date: 2026-02-06
topics: [error-handling, resilience, recovery, reliability]
sources: 0
status: initial

Error Handling and Recovery

Overview

Robust agentic systems must handle failures gracefully, recover automatically where possible, and escalate appropriately when human intervention is needed. This research covers error classification, recovery strategies, and resilience patterns.

Error Classification

1. By Source

typescript
/**
 * Classifies an error by where it originated.
 *
 * Each member's runtime value is a short lowercase string tag.
 */
enum ErrorSource {
  /** Model failures, hallucinations. */
  LLM_ERROR = 'llm',
  /** External tool failures. */
  TOOL_ERROR = 'tool',
  /** Input/output validation. */
  VALIDATION_ERROR = 'validation',
  /** Execution timeouts. */
  TIMEOUT_ERROR = 'timeout',
  /** Resource exhaustion. */
  RESOURCE_ERROR = 'resource',
  /** Auth/authorization. */
  PERMISSION_ERROR = 'permission',
  /** Business logic errors. */
  LOGIC_ERROR = 'logic',
  /** Connectivity issues. */
  NETWORK_ERROR = 'network',
  /** Uncategorized. */
  UNKNOWN_ERROR = 'unknown',
}

2. By Severity

typescript
/**
 * Classifies an error by how badly it impairs the system.
 *
 * The runtime values are strings, so relational operators (`<`, `>=`, ...)
 * compare them alphabetically, NOT by severity (for example
 * `'critical' >= 'high'` is `false`). Compare members with `===` or map them
 * to an explicit numeric rank before ordering.
 */
enum ErrorSeverity {
  /** System unusable, immediate attention. */
  CRITICAL = 'critical',
  /** Major functionality impaired. */
  HIGH = 'high',
  /** Minor functionality impaired. */
  MEDIUM = 'medium',
  /** Cosmetic or informational. */
  LOW = 'low',
  /** Potential issue, no immediate impact. */
  WARNING = 'warning',
}

3. By Recoverability

typescript
/**
 * Classifies an error by whether, and how, the system can recover from it.
 *
 * Each member's runtime value is a short lowercase string tag.
 */
enum Recoverability {
  /** Can retry/fix automatically. */
  AUTO_RECOVERABLE = 'auto',
  /** Needs human intervention. */
  MANUAL_RECOVERABLE = 'manual',
  /** Data loss, cannot recover. */
  NON_RECOVERABLE = 'none',
  /** Recovery path unclear. */
  UNKNOWN = 'unknown',
}

Error Handling Patterns

1. Circuit Breaker Pattern

typescript
/** Thrown by `CircuitBreaker.execute` when the circuit is open and the call is rejected. */
class CircuitOpenError extends Error {
  constructor(message: string) {
    super(message);
    this.name = 'CircuitOpenError';
  }
}

/**
 * Circuit breaker around an async operation.
 *
 * - `closed`: calls are forwarded to the wrapped function.
 * - `open`: calls are rejected with `CircuitOpenError` until `resetTimeoutMs`
 *   has elapsed since `lastFailureTime`.
 * - `half-open`: the next call is forwarded as a trial; success closes the
 *   circuit, failure re-opens it.
 */
class CircuitBreaker {
  state: 'closed' | 'open' | 'half-open' = 'closed';
  failureCount = 0;
  /** When the circuit was last opened; the epoch until the first opening. */
  lastFailureTime: Date = new Date(0);

  /**
   * @param threshold Number of consecutive failures that opens the circuit.
   * @param resetTimeoutMs How long the circuit stays open before a trial call is allowed.
   */
  constructor(
    private readonly threshold: number = 5,
    private readonly resetTimeoutMs: number = 30000,
  ) {}

  /**
   * Runs `fn` through the breaker.
   *
   * @returns Whatever `fn` resolves to.
   * @throws CircuitOpenError when the circuit is open and the reset timeout has not elapsed.
   * @throws Whatever `fn` rejects with (after recording the failure).
   */
  async execute<T>(fn: () => Promise<T>): Promise<T> {
    if (this.state === 'open') {
      if (!this.shouldAttemptReset()) {
        throw new CircuitOpenError('Circuit is open');
      }
      this.state = 'half-open';
    }

    try {
      const result = await fn();
      this.onSuccess();
      return result;
    } catch (error) {
      this.onFailure();
      throw error;
    }
  }

  /** True once the open circuit has waited at least `resetTimeoutMs`. */
  private shouldAttemptReset(): boolean {
    return Date.now() - this.lastFailureTime.getTime() >= this.resetTimeoutMs;
  }

  private onSuccess(): void {
    this.failureCount = 0;
    this.state = 'closed';
  }

  private onFailure(): void {
    this.failureCount++;
    // A failed trial call re-opens immediately; otherwise open at the threshold.
    if (this.state === 'half-open' || this.failureCount >= this.threshold) {
      this.state = 'open';
      this.lastFailureTime = new Date();
    }
  }
}

2. Retry with Backoff

typescript
/** Tuning knobs for `retryWithBackoff`. */
interface RetryConfig {
  /** Total number of times `fn` may be invoked. */
  maxAttempts: number;
  /** Delay passed to `sleep` before the first retry. */
  baseDelay: number;
  /** Upper bound applied to the delay after each multiplication. */
  maxDelay: number;
  /** Factor the delay is multiplied by after every retry. */
  backoffMultiplier: number;
  /** Only errors that are `instanceof` one of these classes are retried. */
  retryableErrors: ErrorClass[];
}

/**
 * Calls `fn` until it resolves, sleeping between attempts with a delay that
 * grows by `backoffMultiplier` and is capped at `maxDelay`.
 *
 * @returns The first value `fn` resolves to.
 * @throws The error from `fn` when it is not retryable or when the final
 *   attempt fails; a generic `Error` when `maxAttempts` allows no attempt.
 */
async function retryWithBackoff<T>(
  fn: () => Promise<T>,
  config: RetryConfig
): Promise<T> {
  const isRetryable = (failure: unknown): boolean =>
    config.retryableErrors.some((errorClass) => failure instanceof errorClass);

  let pause = config.baseDelay;
  for (let failures = 0; failures < config.maxAttempts; ) {
    try {
      return await fn();
    } catch (failure) {
      failures += 1;
      if (failures >= config.maxAttempts || !isRetryable(failure)) {
        throw failure;
      }
      await sleep(pause);
      pause = Math.min(pause * config.backoffMultiplier, config.maxDelay);
    }
  }

  throw new Error('Max retries exceeded');
}

3. Fallback Chain

typescript
/**
 * Tries a list of strategies in order and returns the first one that succeeds.
 *
 * @example
 * const result = await new FallbackChain<string>({
 *   strategies: [
 *     () => callPrimaryAPI(),
 *     () => callBackupAPI(),
 *     () => returnCachedValue(),
 *     () => returnDefaultValue()
 *   ]
 * }).execute();
 */
class FallbackChain<T> {
  /** Strategies in priority order; earlier entries are tried first. */
  strategies: (() => Promise<T>)[];

  constructor(options: { strategies: (() => Promise<T>)[] }) {
    this.strategies = options.strategies;
  }

  /**
   * Runs the strategies one at a time until one resolves.
   *
   * @returns The value of the first strategy that resolves.
   * @throws AggregateError carrying every strategy's failure when all of them
   *   reject (or when there are no strategies at all).
   */
  async execute(): Promise<T> {
    const errors: Error[] = [];

    for (const strategy of this.strategies) {
      try {
        return await strategy();
      } catch (error) {
        // Anything can be thrown; normalise so `errors` really holds Error objects.
        errors.push(error instanceof Error ? error : new Error(String(error)));
      }
    }

    throw new AggregateError(errors, 'All fallback strategies failed');
  }
}

4. Graceful Degradation

typescript
/**
 * Three-tier execution: try the primary operation, then a degraded one, and
 * finally fall back to a fixed value.
 */
class GracefulDegradation<T> {
  /** Preferred operation. */
  primary: () => Promise<T>;
  /** Operation used when `primary` rejects. */
  degraded: () => Promise<T>;
  /** Value returned when both operations reject. */
  fallback: T;

  constructor(options: {
    primary: () => Promise<T>;
    degraded: () => Promise<T>;
    fallback: T;
  }) {
    this.primary = options.primary;
    this.degraded = options.degraded;
    this.fallback = options.fallback;
  }

  /**
   * Never rejects: failures of `primary` and `degraded` are logged to the
   * console and the next tier is used instead.
   *
   * @returns The result of `primary`, else of `degraded`, else `fallback`.
   */
  async execute(): Promise<T> {
    try {
      return await this.primary();
    } catch (error) {
      console.warn('Primary failed, using degraded mode:', error);
    }

    try {
      return await this.degraded();
    } catch (degradedError) {
      console.error('Degraded mode failed, using fallback:', degradedError);
      return this.fallback;
    }
  }
}

Recovery Strategies

1. Self-Healing

typescript
/**
 * Maps a diagnosed root cause to an automatic repair action.
 *
 * Subclasses supply the diagnosis and the individual repair operations.
 */
abstract class SelfHealing {
  /** Hook: determines the root cause of `error`. */
  protected abstract diagnose(error: Error): Promise<{ rootCause: string }>;
  /** Hook: repair run for the `stale_cache` root cause. */
  protected abstract clearCache(): Promise<void>;
  /** Hook: repair run for the `deadlock` root cause. */
  protected abstract killBlockedProcesses(): Promise<void>;
  /** Hook: repair run for the `memory_leak` root cause. */
  protected abstract restartService(): Promise<void>;
  /** Hook: repair run for the `corrupt_state` root cause. */
  protected abstract resetToLastKnownGood(): Promise<void>;

  /**
   * Diagnoses `error` and runs the matching repair.
   *
   * @returns `recovered: true` after a known repair completes (with `action`
   *   set for restart/reset); `recovered: false` with a `reason` when the
   *   root cause is not one of the known ones.
   * @throws Whatever the diagnosis or the selected repair hook rejects with.
   */
  async attemptRecovery(error: Error): Promise<RecoveryResult> {
    const diagnosis = await this.diagnose(error);

    switch (diagnosis.rootCause) {
      case 'stale_cache':
        await this.clearCache();
        return { recovered: true };
      case 'deadlock':
        await this.killBlockedProcesses();
        return { recovered: true };
      case 'memory_leak':
        await this.restartService();
        return { recovered: true, action: 'restart' };
      case 'corrupt_state':
        await this.resetToLastKnownGood();
        return { recovered: true, action: 'reset' };
      default:
        return { recovered: false, reason: 'Unknown root cause' };
    }
  }
}

2. Checkpoint and Resume

typescript
/**
 * In-memory store of state snapshots that can be resumed from or rolled back to.
 *
 * State is deep-copied with `structuredClone` both when a checkpoint is taken
 * and when it is handed back, so neither later changes to the live state nor
 * changes to a returned state can alter a stored checkpoint.
 */
abstract class CheckpointSystem {
  checkpoints: Map<CheckpointId, Checkpoint> = new Map();

  /** Hook: rejects when rolling back from `current` to `target` is not safe. */
  protected abstract validateRollback(current: Checkpoint, target: Checkpoint): Promise<void>;

  /**
   * Stores a deep copy of `state` under a freshly generated id.
   *
   * @returns The id of the new checkpoint.
   */
  async createCheckpoint(state: State): Promise<CheckpointId> {
    const id = generateId();
    this.checkpoints.set(id, {
      id,
      timestamp: new Date(),
      state: structuredClone(state),
    });
    return id;
  }

  /**
   * @returns A copy of the state stored in checkpoint `id`.
   * @throws Error when no checkpoint with that id exists.
   */
  async resumeFromCheckpoint(id: CheckpointId): Promise<State> {
    return structuredClone(this.requireCheckpoint(id).state);
  }

  /**
   * Validates and performs a rollback between two existing checkpoints.
   *
   * @returns A copy of the state stored in checkpoint `to`.
   * @throws Error when either checkpoint is missing, or whatever
   *   `validateRollback` rejects with.
   */
  async rollback(from: CheckpointId, to: CheckpointId): Promise<State> {
    const current = this.requireCheckpoint(from);
    const target = this.requireCheckpoint(to);

    // Validate rollback is safe
    await this.validateRollback(current, target);

    return structuredClone(target.state);
  }

  /** Looks up a checkpoint, failing loudly instead of returning undefined. */
  private requireCheckpoint(id: CheckpointId): Checkpoint {
    const checkpoint = this.checkpoints.get(id);
    if (!checkpoint) {
      throw new Error(`Checkpoint ${id} not found`);
    }
    return checkpoint;
  }
}

3. Transactional Compensation

typescript
/** A unit of work paired with the operation that undoes it. */
interface CompensationAction {
  /** Performs the work. */
  execute(): Promise<void>;
  /** Undoes the work done by a completed `execute`. */
  compensate(): Promise<void>;
}

/**
 * Runs a sequence of actions and, if one fails, compensates the ones that had
 * already completed.
 */
class Transaction {
  constructor(public actions: CompensationAction[] = []) {}

  /**
   * Executes every action in order.
   *
   * On the first failure, `compensate` is called on each previously completed
   * action, most recent first. A failing compensation is logged and does not
   * stop the remaining compensations. The failing action itself is not
   * compensated.
   *
   * @throws The error of the action that failed, after rollback has run.
   */
  async execute(): Promise<void> {
    const completed: CompensationAction[] = [];

    try {
      for (const action of this.actions) {
        await action.execute();
        completed.push(action);
      }
    } catch (error) {
      // Rollback completed actions, newest first
      for (let index = completed.length - 1; index >= 0; index--) {
        try {
          await completed[index].compensate();
        } catch (compensationError) {
          // Log but continue rollback
          console.error('Compensation failed:', compensationError);
        }
      }
      throw error;
    }
  }
}

Error Boundaries

1. Per-Phase Boundaries

typescript
/** What a phase-level handler decided after a phase failed. */
type PhaseErrorOutcome<T> =
  | { canContinue: true; result: T }
  | { canContinue: false; options: unknown };

/**
 * Contains failures to the phase in which they occur: a failing phase yields
 * either a substitute result or a structured error value instead of throwing.
 */
abstract class PhaseErrorBoundary {
  /** Hook: decides whether the pipeline can continue after `error` in `phase`. */
  protected abstract handlePhaseError<T>(
    phase: string,
    error: unknown
  ): Promise<PhaseErrorOutcome<T>>;

  /**
   * Runs `fn` for the given phase.
   *
   * @returns The result of `fn`; or the handler's substitute result when it
   *   reports `canContinue`; or an error value carrying the phase name, the
   *   failure message and the handler's recovery options.
   *
   * NOTE(review): the original literal used the key `error` twice (flag and
   * message), so the flag was overwritten. The message now lives under
   * `message` - confirm this matches the project's `ErrorResult` shape.
   */
  async executeWithBoundary<T>(
    phase: string,
    fn: () => Promise<T>
  ): Promise<T | ErrorResult> {
    try {
      return await fn();
    } catch (error) {
      const handled = await this.handlePhaseError<T>(phase, error);

      if (handled.canContinue) {
        return handled.result;
      }

      return {
        error: true,
        phase,
        // The thrown value is not guaranteed to be an Error instance.
        message: error instanceof Error ? error.message : String(error),
        recoveryOptions: handled.options,
      };
    }
  }
}

2. Per-Agent Boundaries

typescript
/**
 * Contains failures to a single agent: when work run on behalf of an agent
 * fails, that agent is isolated and the work is handed to a fallback agent.
 */
abstract class AgentErrorBoundary {
  /** Hook: takes the failed agent out of service. */
  protected abstract isolateAgent(agent: Agent): Promise<void>;
  /** Hook: selects the agent that should take over from `agent`. */
  protected abstract getFallbackAgent(agent: Agent): Promise<Agent>;

  /**
   * Runs `fn`; on failure isolates `agent` and delegates `fn` to the fallback
   * agent's `execute`.
   *
   * @returns The result of `fn`, or of the fallback agent when `fn` rejects.
   * @throws Whatever the isolation hook, the fallback lookup or the fallback
   *   agent rejects with. The original failure of `fn` is not rethrown.
   */
  async executeWithIsolation<T>(
    agent: Agent,
    fn: () => Promise<T>
  ): Promise<T> {
    try {
      return await fn();
    } catch (error) {
      // Isolate failure to this agent
      await this.isolateAgent(agent);

      // Try fallback agent
      const fallback = await this.getFallbackAgent(agent);
      return await fallback.execute(fn);
    }
  }
}

3. Global Error Boundary

typescript
/**
 * Last-resort handler for errors that escaped every narrower boundary.
 */
abstract class GlobalErrorBoundary {
  /** Hook: records the error. */
  protected abstract logError(error: Error): Promise<void>;
  /** Hook: classifies how severe the error is. */
  protected abstract assessSeverity(error: Error): ErrorSeverity;
  /** Hook: alerts human operators about the error. */
  protected abstract notifyHumans(error: Error): Promise<void>;
  /** Hook: tries to recover; reports the outcome through `success`. */
  protected abstract attemptGlobalRecovery(error: Error): Promise<{ success: boolean }>;
  /** Hook: switches the system into safe mode. */
  protected abstract enterSafeMode(): Promise<void>;

  /**
   * Logs the error, notifies humans for HIGH and CRITICAL severities, attempts
   * recovery, and enters safe mode when recovery does not succeed.
   */
  async handleGlobalError(error: Error): Promise<void> {
    // Log error
    await this.logError(error);

    // Assess severity
    const severity = this.assessSeverity(error);

    // Notify if needed. ErrorSeverity values are strings, so `>=` would compare
    // them alphabetically ('critical' < 'high'); test membership explicitly.
    const needsHuman =
      severity === ErrorSeverity.HIGH || severity === ErrorSeverity.CRITICAL;
    if (needsHuman) {
      await this.notifyHumans(error);
    }

    // Attempt recovery
    const recovery = await this.attemptGlobalRecovery(error);

    if (!recovery.success) {
      // Enter safe mode
      await this.enterSafeMode();
    }
  }
}

Monitoring and Alerting

1. Error Tracking

typescript
/** The record built for every tracked error. */
interface ErrorReport {
  id: ReturnType<typeof generateId>;
  timestamp: Date;
  error: { type: string; message: string; stack: string | undefined };
  context: { session: unknown; agent: unknown; phase: unknown; inputs: unknown };
  /** Hash of the error type plus its normalized message. */
  fingerprint: string;
}

/**
 * Builds a structured report for each error, stores it, and passes it on for
 * pattern analysis.
 */
abstract class ErrorTracker {
  /** Hook: returns the form of `inputs` that is placed in the report. */
  protected abstract sanitize(inputs: unknown): unknown;
  /** Hook: persists a report. */
  protected abstract store(report: ErrorReport): Promise<void>;
  /** Hook: examines a report after it has been stored. */
  protected abstract analyzeForPatterns(report: ErrorReport): Promise<void>;
  /** Hook: rewrites a message before it is hashed into the fingerprint. */
  protected abstract normalizeMessage(message: string): string;

  /**
   * Records `error` together with the session, agent, phase and sanitized
   * inputs taken from `context`.
   */
  async trackError(error: Error, context: Context): Promise<void> {
    const errorReport: ErrorReport = {
      id: generateId(),
      timestamp: new Date(),
      error: {
        type: error.constructor.name,
        message: error.message,
        stack: error.stack,
      },
      context: {
        session: context.sessionId,
        agent: context.agentId,
        phase: context.currentPhase,
        inputs: this.sanitize(context.inputs),
      },
      fingerprint: this.generateFingerprint(error),
    };

    // Store error
    await this.store(errorReport);

    // Check for patterns
    await this.analyzeForPatterns(errorReport);
  }

  /** Create hash based on error type and message pattern. */
  private generateFingerprint(error: Error): string {
    return hash(`${error.constructor.name}:${this.normalizeMessage(error.message)}`);
  }
}

2. Health Checks

typescript
/** Result of a single health probe, as carried in `HealthReport.checks`. */
type HealthCheckResult = HealthReport['checks'][number];

/**
 * Aggregates the LLM, tool, memory and queue health probes into one report.
 */
abstract class HealthMonitor {
  /** Hook: probes the LLM. */
  protected abstract checkLLMHealth(): Promise<HealthCheckResult>;
  /** Hook: probes the tools. */
  protected abstract checkToolHealth(): Promise<HealthCheckResult>;
  /** Hook: probes memory. */
  protected abstract checkMemoryHealth(): Promise<HealthCheckResult>;
  /** Hook: probes the queue. */
  protected abstract checkQueueHealth(): Promise<HealthCheckResult>;
  /** Hook: turns the failing probes into recommendations. */
  protected abstract generateRecommendations(
    failing: HealthCheckResult[]
  ): HealthReport['recommendations'];

  /**
   * Starts all four probes together and waits for every one of them.
   *
   * @returns A report that is `healthy` only when no probe reported
   *   `healthy: false`, with the full and the failing probe lists and the
   *   recommendations generated for the failing ones.
   * @throws Whatever any probe rejects with (`Promise.all` semantics).
   */
  async checkSystemHealth(): Promise<HealthReport> {
    const checks = await Promise.all([
      this.checkLLMHealth(),
      this.checkToolHealth(),
      this.checkMemoryHealth(),
      this.checkQueueHealth(),
    ]);

    const failing = checks.filter((check) => !check.healthy);

    return {
      healthy: failing.length === 0,
      checks,
      failing,
      recommendations: this.generateRecommendations(failing),
    };
  }
}

3. Alerting Rules

typescript
/**
 * A set of metric-driven alerting rules.
 */
class AlertingRules {
  /** Rules in evaluation order. */
  rules: {
    /** Returns true when the rule should fire for the given metrics. */
    condition: (metrics: Metrics) => boolean;
    severity: ErrorSeverity;
    message: string;
    channels: NotificationChannel[];
  }[];

  constructor(rules: AlertingRules['rules'] = []) {
    this.rules = rules;
  }

  /**
   * Evaluates every rule against `metrics`.
   *
   * @returns One alert per rule whose condition holds, in rule order, each
   *   stamped with the time it was created.
   */
  async evaluate(metrics: Metrics): Promise<Alert[]> {
    return this.rules
      .filter((rule) => rule.condition(metrics))
      .map((rule) => ({
        severity: rule.severity,
        message: rule.message,
        channels: rule.channels,
        timestamp: new Date(),
      }));
  }
}

Resilience Testing

1. Chaos Engineering

typescript
/**
 * Injects controlled failures into components and measures how the system
 * copes with them.
 */
abstract class ChaosEngineering {
  /** Hook: slows a component down. NOTE(review): `latency` is presumably milliseconds - confirm. */
  protected abstract addLatency(component: string, latency: number): Promise<void>;
  /** Hook: makes a component fail. NOTE(review): `rate` is presumably a failure probability - confirm. */
  protected abstract makeComponentError(component: string, rate: number): Promise<void>;
  /** Hook: crashes a component. */
  protected abstract crashComponent(component: string): Promise<void>;
  /** Hook: exhausts a component's resources. */
  protected abstract exhaustResources(component: string): Promise<void>;
  /** Hook: measures the system before any failure is injected. */
  protected abstract measureBaseline(): Promise<ExperimentResult['baseline']>;
  /** Hook: measures the system while the failure is active. */
  protected abstract measureResilience(): Promise<ExperimentResult['resilience']>;
  /** Hook: measures the system after recovery. */
  protected abstract measureRecovery(): Promise<ExperimentResult['recovery']>;
  /** Hook: undoes the injected failure on a component. */
  protected abstract recover(component: string): Promise<void>;

  /**
   * Applies one kind of failure to `component`.
   *
   * @throws Error for a failure type that has no injection strategy, so an
   *   experiment cannot silently run without any fault applied.
   */
  async injectFailure(
    component: string,
    failureType: FailureType
  ): Promise<void> {
    switch (failureType) {
      case 'latency':
        await this.addLatency(component, 5000);
        break;
      case 'error':
        await this.makeComponentError(component, 0.5);
        break;
      case 'crash':
        await this.crashComponent(component);
        break;
      case 'resource_exhaustion':
        await this.exhaustResources(component);
        break;
      default:
        throw new Error(`Unsupported failure type: ${String(failureType)}`);
    }
  }

  /**
   * Runs one experiment: baseline, inject, measure, recover, measure again.
   *
   * Recovery runs even when injection or the resilience measurement fails, so
   * a broken experiment does not leave the fault in place.
   *
   * @returns The baseline, under-failure and post-recovery measurements.
   */
  async runChaosExperiment(
    experiment: ChaosExperiment
  ): Promise<ExperimentResult> {
    // Baseline
    const baseline = await this.measureBaseline();

    let resilience: ExperimentResult['resilience'];
    try {
      // Inject chaos
      await this.injectFailure(experiment.target, experiment.failure);

      // Measure resilience
      resilience = await this.measureResilience();
    } finally {
      // Recover
      await this.recover(experiment.target);
    }

    return {
      baseline,
      resilience,
      recovery: await this.measureRecovery(),
    };
  }
}

2. Fault Injection

typescript
/**
 * Registry of named faults that can be switched on and off.
 */
class FaultInjector {
  /** Registered faults keyed by name. */
  faults: Map<string, Fault> = new Map();

  /** Registers `fault` under `name`, replacing any fault already stored there. */
  registerFault(name: string, fault: Fault): void {
    this.faults.set(name, fault);
  }

  /** Calls `inject` on the named fault; does nothing when the name is not registered. */
  async injectFault(name: string): Promise<void> {
    await this.faults.get(name)?.inject();
  }

  /** Calls `remove` on the named fault; does nothing when the name is not registered. */
  async removeFault(name: string): Promise<void> {
    await this.faults.get(name)?.remove();
  }
}

Best Practices

1. Fail Fast vs Fail Safe

typescript
/**
 * Fail Fast - detect errors early, stop immediately.
 *
 * Runs every validation before the task; the first one that throws aborts the
 * call. `input` and `user` are expected to be in the enclosing scope.
 */
function failFastApproach() {
  validateInput(input); // Throws if invalid
  validatePermissions(user); // Throws if unauthorized
  validateResources(); // Throws if unavailable

  // Only proceed if all validations pass
  return executeTask();
}

/**
 * Fail Safe - degrade gracefully, continue with reduced functionality.
 *
 * Tries the task, then the fallback, and finally returns the default value;
 * each failure is logged with `console.warn` instead of being thrown.
 */
function failSafeApproach() {
  try {
    return executeTask();
  } catch (error) {
    console.warn('Primary approach failed:', error);
  }

  try {
    return executeFallback();
  } catch (fallbackError) {
    console.warn('Fallback failed:', fallbackError);
    return returnDefaultValue();
  }
}

2. Error Context Preservation

typescript
/**
 * Runs `fn` and, if it fails, rethrows the failure enriched with the request
 * context captured before the call.
 *
 * The rejection carries `context` (timestamp, request id, user, session) and
 * `requestId`. A thrown value that cannot hold properties (a primitive, null,
 * undefined, a function, or a non-extensible object) is wrapped in an `Error`
 * whose `cause` is the original value, so enriching never masks the real
 * failure with a TypeError. A synchronous throw from `fn` is handled the same
 * way as a rejection.
 *
 * @throws The enriched original error, or the enriched wrapper described above.
 */
async function preserveContext(fn: () => Promise<void>): Promise<void> {
  const context = {
    timestamp: new Date(),
    requestId: generateRequestId(),
    user: getCurrentUser(),
    session: getSessionId(),
  };

  try {
    await fn();
  } catch (error) {
    let target: object;
    if (typeof error === 'object' && error !== null && Object.isExtensible(error)) {
      target = error;
    } else {
      target = Object.assign(new Error(String(error)), { cause: error });
    }

    // Enrich error with context
    throw Object.assign(target, { context, requestId: context.requestId });
  }
}

3. Idempotency

typescript
/**
 * An operation whose result is remembered per idempotency key, so running it
 * again with the same key returns the stored result instead of redoing the work.
 */
abstract class IdempotentOperation {
  constructor(public idempotencyKey: string) {}

  /** Hook: looks up a previously stored result for `key`, if there is one. */
  protected abstract checkExisting(
    key: string
  ): Promise<{ result: Result } | null | undefined>;
  /** Hook: performs the actual work. */
  protected abstract doExecute(): Promise<Result>;
  /** Hook: records `result` under `key`. */
  protected abstract storeResult(key: string, result: Result): Promise<void>;

  /**
   * @returns The stored result when one exists for `idempotencyKey`;
   *   otherwise the fresh result, after it has been stored.
   */
  async execute(): Promise<Result> {
    // Check if already executed
    const existing = await this.checkExisting(this.idempotencyKey);
    if (existing) {
      return existing.result;
    }

    // Execute and store result
    const result = await this.doExecute();
    await this.storeResult(this.idempotencyKey, result);

    return result;
  }
}

Open Questions

  • When to fail fast vs fail safe?
  • Optimal retry counts and delays?
  • How to test recovery procedures?
  • Balancing resilience with complexity?
  • Measuring system reliability accurately?