Error Handling and Recovery

title: "Error Handling and Recovery"
description: "Research on resilience patterns, graceful degradation, and retry strategies in agentic systems"
date: 2026-02-06
topics: [error-handling, resilience, recovery, reliability]
sources: 0
status: initial

Error Handling and Recovery

Overview

Robust agentic systems must handle failures gracefully, recover automatically where possible, and escalate appropriately when human intervention is needed. This research covers error classification, recovery strategies, and resilience patterns.

Error Classification

1. By Source

typescript
/** Classifies an error by where it originated. */
enum ErrorSource {
  /** Model failures, hallucinations. */
  LLM_ERROR = 'llm',
  /** External tool failures. */
  TOOL_ERROR = 'tool',
  /** Input/output validation. */
  VALIDATION_ERROR = 'validation',
  /** Execution timeouts. */
  TIMEOUT_ERROR = 'timeout',
  /** Resource exhaustion. */
  RESOURCE_ERROR = 'resource',
  /** Auth/authorization. */
  PERMISSION_ERROR = 'permission',
  /** Business logic errors. */
  LOGIC_ERROR = 'logic',
  /** Connectivity issues. */
  NETWORK_ERROR = 'network',
  /** Uncategorized. */
  UNKNOWN_ERROR = 'unknown'
}

2. By Severity

typescript
/**
 * Classifies an error by how badly it affects the system.
 *
 * The values are plain strings, so relational operators (`<`, `>=`, ...)
 * compare them alphabetically rather than by severity; compare members
 * explicitly instead.
 */
enum ErrorSeverity {
  /** System unusable, immediate attention. */
  CRITICAL = 'critical',
  /** Major functionality impaired. */
  HIGH = 'high',
  /** Minor functionality impaired. */
  MEDIUM = 'medium',
  /** Cosmetic or informational. */
  LOW = 'low',
  /** Potential issue, no immediate impact. */
  WARNING = 'warning'
}

3. By Recoverability

typescript
/** Classifies an error by whether, and how, it can be recovered from. */
enum Recoverability {
  /** Can retry/fix automatically. */
  AUTO_RECOVERABLE = 'auto',
  /** Needs human intervention. */
  MANUAL_RECOVERABLE = 'manual',
  /** Data loss, cannot recover. */
  NON_RECOVERABLE = 'none',
  /** Recovery path unclear. */
  UNKNOWN = 'unknown'
}

Error Handling Patterns

1. Circuit Breaker Pattern

typescript
/** Thrown by `CircuitBreaker.execute` when the circuit is open and the call is rejected without being run. */
class CircuitOpenError extends Error {
  constructor(message: string) {
    super(message);
    this.name = 'CircuitOpenError';
  }
}

/**
 * Circuit breaker around an async operation.
 *
 * - `closed`: calls run normally; consecutive failures are counted.
 * - `open`: calls are rejected with `CircuitOpenError` until `resetTimeoutMs`
 *   has elapsed since the circuit last opened.
 * - `half-open`: the next call is let through as a probe; success closes the
 *   circuit, failure re-opens it.
 *
 * Declared as a class (not an interface) because it carries state and method
 * implementations.
 */
class CircuitBreaker {
  state: 'closed' | 'open' | 'half-open' = 'closed';
  failureCount = 0;
  /** When the circuit last opened; the epoch until it has opened at least once. */
  lastFailureTime: Date = new Date(0);

  /**
   * @param threshold Number of consecutive failures that opens the circuit.
   * @param resetTimeoutMs How long the circuit stays open before a probe call is allowed.
   */
  constructor(
    private readonly threshold: number = 5,
    private readonly resetTimeoutMs: number = 30000
  ) {}

  /**
   * Runs `fn` through the breaker.
   *
   * @returns The value `fn` resolves with.
   * @throws CircuitOpenError if the circuit is open and the reset timeout has not elapsed.
   * @throws Whatever `fn` rejects with (after recording the failure).
   */
  async execute<T>(fn: () => Promise<T>): Promise<T> {
    if (this.state === 'open') {
      if (!this.shouldAttemptReset()) {
        throw new CircuitOpenError('Circuit is open');
      }
      this.state = 'half-open';
    }

    try {
      const result = await fn();
      this.onSuccess();
      return result;
    } catch (error: unknown) {
      this.onFailure();
      throw error;
    }
  }

  /** True once `resetTimeoutMs` has passed since the circuit opened. */
  private shouldAttemptReset(): boolean {
    return Date.now() - this.lastFailureTime.getTime() >= this.resetTimeoutMs;
  }

  /** Any success fully closes the circuit and clears the failure count. */
  private onSuccess(): void {
    this.failureCount = 0;
    this.state = 'closed';
  }

  /** Counts the failure and opens the circuit when the threshold is reached or a half-open probe fails. */
  private onFailure(): void {
    this.failureCount++;
    if (this.state === 'half-open' || this.failureCount >= this.threshold) {
      this.state = 'open';
      this.lastFailureTime = new Date();
    }
  }
}

2. Retry with Backoff

typescript
/** Constructor of an error type; used to test thrown values with `instanceof`. */
type ErrorClass = new (...args: never[]) => Error;

/** Settings for {@link retryWithBackoff}. Delays are in milliseconds. */
interface RetryConfig {
  /** Total number of times `fn` may be called (first call included). */
  maxAttempts: number;
  /** Delay before the first retry. */
  baseDelay: number;
  /** Upper bound applied to every delay. */
  maxDelay: number;
  /** Factor the delay is multiplied by after each retry. */
  backoffMultiplier: number;
  /** Only errors that are instances of one of these classes are retried. */
  retryableErrors: ErrorClass[];
}

/**
 * Calls `fn`, retrying with exponentially growing delays while it fails with a
 * retryable error.
 *
 * @param fn Operation to attempt.
 * @param config Retry limits, delays and the set of retryable error classes.
 * @returns The value of the first successful call.
 * @throws The error from `fn` when it is not retryable or the last attempt failed.
 * @throws Error `'Max retries exceeded'` when `maxAttempts` is less than 1, so `fn` is never called.
 */
async function retryWithBackoff<T>(
  fn: () => Promise<T>,
  config: RetryConfig
): Promise<T> {
  const sleep = (ms: number): Promise<void> =>
    new Promise((resolve) => setTimeout(resolve, ms));

  // Cap the very first delay too, in case baseDelay exceeds maxDelay.
  let delay = Math.min(config.baseDelay, config.maxDelay);

  for (let attempt = 1; attempt <= config.maxAttempts; attempt++) {
    try {
      return await fn();
    } catch (error: unknown) {
      const isLastAttempt = attempt >= config.maxAttempts;
      const isRetryable = config.retryableErrors.some(
        (errorClass) => error instanceof errorClass
      );

      if (isLastAttempt || !isRetryable) {
        throw error;
      }

      await sleep(delay);
      delay = Math.min(delay * config.backoffMultiplier, config.maxDelay);
    }
  }

  throw new Error('Max retries exceeded');
}

3. Fallback Chain

typescript
/**
 * Tries a list of strategies in order and returns the first result that
 * succeeds.
 *
 * Declared as a class so it can be constructed with
 * `new FallbackChain({ strategies })`.
 */
class FallbackChain<T> {
  strategies: (() => Promise<T>)[];

  /** @param options.strategies Strategies to try, in priority order. */
  constructor(options: { strategies: (() => Promise<T>)[] }) {
    this.strategies = options.strategies;
  }

  /**
   * Runs the strategies one at a time until one resolves.
   *
   * @returns The value of the first strategy that succeeds.
   * @throws AggregateError carrying every strategy's failure when all of them
   *   fail (or when there are no strategies).
   */
  async execute(): Promise<T> {
    const errors: Error[] = [];

    for (const strategy of this.strategies) {
      try {
        return await strategy();
      } catch (error: unknown) {
        // Thrown values are not guaranteed to be Error instances; normalise them.
        errors.push(error instanceof Error ? error : new Error(String(error)));
      }
    }

    throw new AggregateError(
      errors,
      'All fallback strategies failed'
    );
  }
}

// Example usage: the strategies are listed in the order they should be
// tried - primary API, backup API, cached value, default value.
const fallbackChain = new FallbackChain<string>({
  strategies: [
    () => callPrimaryAPI(),
    () => callBackupAPI(),
    () => returnCachedValue(),
    () => returnDefaultValue()
  ]
});
const result = await fallbackChain.execute();

4. Graceful Degradation

typescript
/**
 * Three-tier execution: the primary operation, then a degraded alternative,
 * then a static fallback value.
 *
 * Declared as a class (not an interface) because `execute` has an
 * implementation.
 */
class GracefulDegradation<T> {
  primary: () => Promise<T>;
  degraded: () => Promise<T>;
  fallback: T;

  /**
   * @param options.primary Preferred operation.
   * @param options.degraded Alternative to run when `primary` fails.
   * @param options.fallback Value returned when both operations fail.
   */
  constructor(options: {
    primary: () => Promise<T>;
    degraded: () => Promise<T>;
    fallback: T;
  }) {
    this.primary = options.primary;
    this.degraded = options.degraded;
    this.fallback = options.fallback;
  }

  /**
   * Runs `primary`; on failure logs a warning and runs `degraded`; if that
   * fails too, logs an error and returns `fallback`. Never rejects because of
   * a failure in `primary` or `degraded`.
   */
  async execute(): Promise<T> {
    try {
      return await this.primary();
    } catch (error: unknown) {
      console.warn('Primary failed, using degraded mode:', error);
    }

    try {
      return await this.degraded();
    } catch (degradedError: unknown) {
      console.error('Degraded mode failed, using fallback:', degradedError);
      return this.fallback;
    }
  }
}

Recovery Strategies

1. Self-Healing

typescript
/**
 * Maps a diagnosed root cause to an automatic remediation.
 *
 * Declared as an abstract class (not an interface) because `attemptRecovery`
 * has an implementation; subclasses supply the diagnosis and the individual
 * remediation actions.
 */
abstract class SelfHealing {
  /** Determines the root cause of `error`. */
  protected abstract diagnose(error: Error): Promise<{ rootCause: string }>;
  protected abstract clearCache(): Promise<void>;
  protected abstract killBlockedProcesses(): Promise<void>;
  protected abstract restartService(): Promise<void>;
  protected abstract resetToLastKnownGood(): Promise<void>;

  /**
   * Diagnoses `error` and runs the remediation for its root cause.
   *
   * @returns `{ recovered: true }` (with `action` for restart/reset) once the
   *   remediation completes, or `{ recovered: false, reason }` when the root
   *   cause is not one of the handled cases.
   * @throws Whatever the diagnosis or the remediation rejects with.
   */
  async attemptRecovery(error: Error): Promise<RecoveryResult> {
    const diagnosis = await this.diagnose(error);

    switch (diagnosis.rootCause) {
      case 'stale_cache':
        await this.clearCache();
        return { recovered: true };

      case 'deadlock':
        await this.killBlockedProcesses();
        return { recovered: true };

      case 'memory_leak':
        await this.restartService();
        return { recovered: true, action: 'restart' };

      case 'corrupt_state':
        await this.resetToLastKnownGood();
        return { recovered: true, action: 'reset' };

      default:
        return { recovered: false, reason: 'Unknown root cause' };
    }
  }
}

2. Checkpoint and Resume

typescript
/**
 * In-memory store of state snapshots that execution can be resumed from or
 * rolled back to.
 *
 * Declared as an abstract class (not an interface) because its methods have
 * implementations; subclasses supply the rollback safety check.
 */
abstract class CheckpointSystem {
  checkpoints: Map<CheckpointId, Checkpoint> = new Map();

  /** Rejects if rolling back from `current` to `target` is not safe. */
  protected abstract validateRollback(current: Checkpoint, target: Checkpoint): Promise<void>;

  /**
   * Stores a deep copy of `state` under a freshly generated id.
   *
   * @returns The id of the new checkpoint.
   */
  async createCheckpoint(state: State): Promise<CheckpointId> {
    const id = generateId();
    this.checkpoints.set(id, {
      id,
      timestamp: new Date(),
      state: structuredClone(state)
    });
    return id;
  }

  /**
   * Returns a copy of the state saved in checkpoint `id`.
   *
   * @throws Error if no checkpoint with that id exists.
   */
  async resumeFromCheckpoint(id: CheckpointId): Promise<State> {
    // Hand out a copy so callers mutating the resumed state cannot corrupt the snapshot.
    return structuredClone(this.requireCheckpoint(id).state);
  }

  /**
   * Validates a rollback from checkpoint `from` to checkpoint `to` and returns
   * a copy of the target state.
   *
   * @throws Error if either checkpoint does not exist.
   * @throws Whatever `validateRollback` rejects with.
   */
  async rollback(from: CheckpointId, to: CheckpointId): Promise<State> {
    const current = this.requireCheckpoint(from);
    const target = this.requireCheckpoint(to);

    // Validate rollback is safe
    await this.validateRollback(current, target);

    return structuredClone(target.state);
  }

  /** Looks up a checkpoint, failing loudly instead of returning `undefined`. */
  private requireCheckpoint(id: CheckpointId): Checkpoint {
    const checkpoint = this.checkpoints.get(id);
    if (!checkpoint) {
      throw new Error(`Checkpoint ${id} not found`);
    }
    return checkpoint;
  }
}

3. Transactional Compensation

typescript
/**
 * A step that can be undone.
 *
 * (`async` is not allowed on interface members; returning a `Promise` is what
 * makes these asynchronous.)
 */
interface CompensationAction {
  /** Performs the step. */
  execute(): Promise<void>;
  /** Undoes a previously executed step. */
  compensate(): Promise<void>;
}

/**
 * Runs a list of compensable actions in order and undoes the completed ones
 * if a later action fails.
 *
 * Declared as a class (not an interface) because `execute` has an
 * implementation.
 */
class Transaction {
  actions: CompensationAction[];

  /** @param options.actions Actions to run, in order. */
  constructor(options: { actions: CompensationAction[] }) {
    this.actions = options.actions;
  }

  /**
   * Executes every action in order. When one fails, the actions that already
   * completed are compensated in reverse order (the failed action itself is
   * not compensated) and the original error is rethrown.
   *
   * A failing compensation is logged and does not stop the rest of the
   * rollback.
   *
   * @throws The error from the action that failed.
   */
  async execute(): Promise<void> {
    const completed: CompensationAction[] = [];

    try {
      for (const action of this.actions) {
        await action.execute();
        completed.push(action);
      }
    } catch (error: unknown) {
      // Rollback completed actions, most recent first
      for (let i = completed.length - 1; i >= 0; i--) {
        try {
          await completed[i].compensate();
        } catch (compensationError: unknown) {
          // Log but continue rollback
          console.error('Compensation failed:', compensationError);
        }
      }
      throw error;
    }
  }
}

Error Boundaries

1. Per-Phase Boundaries

typescript
/**
 * Contains failures to a single phase: a phase that throws either continues
 * with a substitute result or yields a structured error result instead of
 * propagating the exception.
 *
 * Declared as an abstract class (not an interface) because
 * `executeWithBoundary` has an implementation; subclasses decide how a phase
 * error is handled.
 */
abstract class PhaseErrorBoundary {
  /**
   * Decides what to do about an error thrown by `phase`: continue with a
   * substitute `result`, or stop and offer recovery `options`.
   */
  protected abstract handlePhaseError<T>(
    phase: string,
    error: unknown
  ): Promise<
    | { canContinue: true; result: T }
    | { canContinue: false; options: unknown }
  >;

  /**
   * Runs `fn` as the body of `phase`.
   *
   * @returns The value of `fn`; or, if `fn` fails, the substitute result from
   *   `handlePhaseError`; or, if the phase cannot continue,
   *   `{ error: true, phase, message, recoveryOptions }`.
   */
  async executeWithBoundary<T>(
    phase: string,
    fn: () => Promise<T>
  ): Promise<T | ErrorResult> {
    try {
      return await fn();
    } catch (error: unknown) {
      const handled = await this.handlePhaseError<T>(phase, error);

      if (handled.canContinue) {
        return handled.result;
      }

      return {
        // `error` is the failure flag; the text goes in `message` so the two
        // no longer collide on the same key.
        error: true,
        phase,
        message: error instanceof Error ? error.message : String(error),
        recoveryOptions: handled.options
      };
    }
  }
}

2. Per-Agent Boundaries

typescript
/**
 * Contains failures to a single agent: when the work fails, the agent is
 * isolated and the same work is handed to a fallback agent.
 *
 * Declared as an abstract class (not an interface) because
 * `executeWithIsolation` has an implementation; subclasses supply isolation
 * and fallback selection.
 */
abstract class AgentErrorBoundary {
  /** Isolates `agent` after it failed. */
  protected abstract isolateAgent(agent: Agent): Promise<void>;
  /** Picks the agent that should take over from `agent`. */
  protected abstract getFallbackAgent(agent: Agent): Promise<Agent>;

  /**
   * Runs `fn` on behalf of `agent`.
   *
   * @returns The value of `fn`, or - if `fn` fails - the value produced by the
   *   fallback agent executing `fn`.
   * @throws Whatever isolation, fallback selection or the fallback run rejects
   *   with; the original failure is not rethrown.
   */
  async executeWithIsolation<T>(
    agent: Agent,
    fn: () => Promise<T>
  ): Promise<T> {
    try {
      return await fn();
    } catch {
      // Isolate failure to this agent
      await this.isolateAgent(agent);

      // Try fallback agent
      const fallback = await this.getFallbackAgent(agent);
      return await fallback.execute(fn);
    }
  }
}

3. Global Error Boundary

typescript
/**
 * Last-resort handler for errors nothing else dealt with: log, notify people
 * for serious errors, attempt recovery, and fall back to safe mode.
 *
 * Declared as an abstract class (not an interface) because
 * `handleGlobalError` has an implementation; subclasses supply the individual
 * steps.
 */
abstract class GlobalErrorBoundary {
  protected abstract logError(error: Error): Promise<void>;
  protected abstract assessSeverity(error: Error): ErrorSeverity;
  protected abstract notifyHumans(error: Error): Promise<void>;
  protected abstract attemptGlobalRecovery(error: Error): Promise<{ success: boolean }>;
  protected abstract enterSafeMode(): Promise<void>;

  /**
   * Handles `error` end to end.
   *
   * Humans are notified for `HIGH` and `CRITICAL` severity. Safe mode is
   * entered when global recovery reports failure.
   */
  async handleGlobalError(error: Error): Promise<void> {
    // Log error
    await this.logError(error);

    // Assess severity
    const severity = this.assessSeverity(error);

    // Notify if needed. ErrorSeverity values are strings, so `>=` would
    // compare them alphabetically ('critical' < 'high'); test membership
    // explicitly instead.
    const needsHuman =
      severity === ErrorSeverity.CRITICAL || severity === ErrorSeverity.HIGH;
    if (needsHuman) {
      await this.notifyHumans(error);
    }

    // Attempt recovery
    const recovery = await this.attemptGlobalRecovery(error);

    if (!recovery.success) {
      // Enter safe mode
      await this.enterSafeMode();
    }
  }
}

Monitoring and Alerting

1. Error Tracking

typescript
/** The record `ErrorTracker.trackError` builds for each tracked error. */
interface ErrorReport {
  id: ReturnType<typeof generateId>;
  timestamp: Date;
  error: {
    type: string;
    message: string;
    stack: string | undefined;
  };
  context: {
    session: Context['sessionId'];
    agent: Context['agentId'];
    phase: Context['currentPhase'];
    inputs: unknown;
  };
  fingerprint: string;
}

/**
 * Records errors together with the context they occurred in.
 *
 * Declared as an abstract class (not an interface) because its methods have
 * implementations; subclasses supply sanitising, storage, pattern analysis
 * and message normalisation.
 */
abstract class ErrorTracker {
  /** Prepares `inputs` for inclusion in the stored report. */
  protected abstract sanitize(inputs: Context['inputs']): unknown;
  protected abstract store(report: ErrorReport): Promise<void>;
  protected abstract analyzeForPatterns(report: ErrorReport): Promise<void>;
  /** Normalises a message before it is hashed into the fingerprint. */
  protected abstract normalizeMessage(message: string): string;

  /**
   * Builds a report for `error`, stores it, then runs pattern analysis on it.
   *
   * @param error The error to record.
   * @param context Session, agent, phase and inputs at the time of the error.
   */
  async trackError(error: Error, context: Context): Promise<void> {
    const errorReport: ErrorReport = {
      id: generateId(),
      timestamp: new Date(),
      error: {
        type: error.constructor.name,
        message: error.message,
        stack: error.stack
      },
      context: {
        session: context.sessionId,
        agent: context.agentId,
        phase: context.currentPhase,
        inputs: this.sanitize(context.inputs)
      },
      fingerprint: this.generateFingerprint(error)
    };

    // Store error
    await this.store(errorReport);

    // Check for patterns
    await this.analyzeForPatterns(errorReport);
  }

  /** Hashes the error's class name together with its normalised message. */
  private generateFingerprint(error: Error): string {
    // Create hash based on error type and message pattern
    return hash(`${error.constructor.name}:${this.normalizeMessage(error.message)}`);
  }
}

2. Health Checks

typescript
/** Minimum shape a health check result needs for aggregation. */
interface HealthCheckResult {
  healthy: boolean;
}

/**
 * Aggregates component health checks into one report.
 *
 * Declared as an abstract class (not an interface) because
 * `checkSystemHealth` has an implementation; subclasses supply the individual
 * checks and the recommendation logic.
 */
abstract class HealthMonitor {
  protected abstract checkLLMHealth(): Promise<HealthCheckResult>;
  protected abstract checkToolHealth(): Promise<HealthCheckResult>;
  protected abstract checkMemoryHealth(): Promise<HealthCheckResult>;
  protected abstract checkQueueHealth(): Promise<HealthCheckResult>;
  protected abstract generateRecommendations(
    failing: HealthCheckResult[]
  ): HealthReport['recommendations'];

  /**
   * Runs all four checks concurrently and summarises them.
   *
   * @returns A report that is `healthy` only when no check failed, with all
   *   check results, the failing subset, and recommendations for the failures.
   * @throws If any check rejects (`Promise.all` rejects as a whole); checks
   *   are expected to report problems via `healthy: false` instead.
   */
  async checkSystemHealth(): Promise<HealthReport> {
    const checks = await Promise.all([
      this.checkLLMHealth(),
      this.checkToolHealth(),
      this.checkMemoryHealth(),
      this.checkQueueHealth()
    ]);

    const failing = checks.filter((check) => !check.healthy);

    return {
      healthy: failing.length === 0,
      checks,
      failing,
      recommendations: this.generateRecommendations(failing)
    };
  }
}

3. Alerting Rules

typescript
/**
 * A set of alerting rules evaluated against a metrics snapshot.
 *
 * Declared as a class (not an interface) because `evaluate` has an
 * implementation.
 */
class AlertingRules {
  rules: {
    /** Returns true when the rule should fire for the given metrics. */
    condition: (metrics: Metrics) => boolean;
    severity: ErrorSeverity;
    message: string;
    channels: NotificationChannel[];
  }[];

  /** @param options.rules Rules to evaluate, in order. */
  constructor(options: { rules: AlertingRules['rules'] }) {
    this.rules = options.rules;
  }

  /**
   * Evaluates every rule against `metrics`.
   *
   * @returns One alert per rule whose condition returned true, in rule order,
   *   each stamped with the time it was created.
   */
  async evaluate(metrics: Metrics): Promise<Alert[]> {
    return this.rules
      .filter((rule) => rule.condition(metrics))
      .map((rule) => ({
        severity: rule.severity,
        message: rule.message,
        channels: rule.channels,
        timestamp: new Date()
      }));
  }
}

Resilience Testing

1. Chaos Engineering

typescript
/**
 * Injects failures into components and measures how the system copes.
 *
 * Declared as an abstract class (not an interface) because its methods have
 * implementations; subclasses supply the fault mechanisms and measurements.
 */
abstract class ChaosEngineering {
  protected abstract addLatency(component: string, amount: number): Promise<void>;
  protected abstract makeComponentError(component: string, rate: number): Promise<void>;
  protected abstract crashComponent(component: string): Promise<void>;
  protected abstract exhaustResources(component: string): Promise<void>;
  protected abstract recover(component: string): Promise<void>;
  protected abstract measureBaseline(): Promise<ExperimentResult['baseline']>;
  protected abstract measureResilience(): Promise<ExperimentResult['resilience']>;
  protected abstract measureRecovery(): Promise<ExperimentResult['recovery']>;

  /**
   * Injects one kind of failure into `component`.
   *
   * @throws Error if `failureType` is not one of the handled kinds, so an
   *   experiment cannot silently run with nothing injected.
   */
  async injectFailure(
    component: string,
    failureType: FailureType
  ): Promise<void> {
    switch (failureType) {
      case 'latency':
        // NOTE(review): 5000 is presumably milliseconds - confirm against addLatency.
        await this.addLatency(component, 5000);
        break;
      case 'error':
        // NOTE(review): 0.5 is presumably a 50% error rate - confirm against makeComponentError.
        await this.makeComponentError(component, 0.5);
        break;
      case 'crash':
        await this.crashComponent(component);
        break;
      case 'resource_exhaustion':
        await this.exhaustResources(component);
        break;
      default:
        throw new Error(`Unsupported failure type: ${String(failureType)}`);
    }
  }

  /**
   * Runs one experiment: measure a baseline, inject the failure, measure
   * resilience, recover the target, then measure recovery.
   *
   * The target is always recovered once injection has been attempted, even if
   * injection or the resilience measurement throws.
   */
  async runChaosExperiment(
    experiment: ChaosExperiment
  ): Promise<ExperimentResult> {
    // Baseline
    const baseline = await this.measureBaseline();

    // Inject chaos, measure resilience, recover
    const resilience = await this.measureUnderFailure(experiment);

    return {
      baseline,
      resilience,
      recovery: await this.measureRecovery()
    };
  }

  /** Injects the experiment's failure, measures resilience, and always recovers the target afterwards. */
  private async measureUnderFailure(
    experiment: ChaosExperiment
  ): Promise<ExperimentResult['resilience']> {
    try {
      await this.injectFailure(experiment.target, experiment.failure);
      return await this.measureResilience();
    } finally {
      await this.recover(experiment.target);
    }
  }
}

2. Fault Injection

typescript
/**
 * Registry of named faults that can be injected and removed on demand.
 *
 * Declared as a class (not an interface) because its methods have
 * implementations.
 */
class FaultInjector {
  faults: Map<string, Fault> = new Map();

  /** Registers `fault` under `name`, replacing any fault already registered with that name. */
  registerFault(name: string, fault: Fault): void {
    this.faults.set(name, fault);
  }

  /** Injects the fault registered as `name`; does nothing if no such fault is registered. */
  async injectFault(name: string): Promise<void> {
    await this.faults.get(name)?.inject();
  }

  /** Removes the fault registered as `name`; does nothing if no such fault is registered. */
  async removeFault(name: string): Promise<void> {
    await this.faults.get(name)?.remove();
  }
}

Best Practices

1. Fail Fast vs Fail Safe

typescript
/**
 * Fail Fast - detect errors early, stop immediately.
 *
 * Runs every validation up front, in order; the task itself is only executed
 * once none of them has thrown.
 */
function failFastApproach() {
  const validations = [
    () => validateInput(input),         // Throws if invalid
    () => validatePermissions(user),    // Throws if unauthorized
    () => validateResources()           // Throws if unavailable
  ];

  for (const validate of validations) {
    validate();
  }

  // Only reached if all validations pass
  return executeTask();
}

/**
 * Fail Safe - degrade gracefully, continue with reduced functionality.
 *
 * Tries the task, then the fallback, and finally returns the default value;
 * each failure is logged as a warning before moving on to the next tier.
 */
function failSafeApproach() {
  try {
    return executeTask();
  } catch (error) {
    console.warn('Primary approach failed:', error);
  }

  try {
    return executeFallback();
  } catch (fallbackError) {
    console.warn('Fallback failed:', fallbackError);
  }

  return returnDefaultValue();
}

2. Error Context Preservation

typescript
/**
 * Runs `fn` and, if it fails, attaches request context to the failure before
 * rethrowing it.
 *
 * The context (timestamp, request id, user, session) is captured before `fn`
 * is called. Failures that are extensible objects are enriched in place with
 * `context` and `requestId` and rethrown as the same object. Anything else
 * (strings, numbers, `null`, frozen objects) cannot carry new properties, so
 * it is wrapped in an `Error` that carries the context and exposes the
 * original value as `cause`.
 *
 * @param fn Operation to run; both rejections and synchronous throws are enriched.
 * @throws The enriched (or wrapped) failure from `fn`.
 */
async function preserveContext(fn: () => Promise<void>): Promise<void> {
  const context = {
    timestamp: new Date(),
    requestId: generateRequestId(),
    user: getCurrentUser(),
    session: getSessionId()
  };

  try {
    await fn();
  } catch (error: unknown) {
    const enrichment = { context, requestId: context.requestId };

    if (typeof error === 'object' && error !== null && Object.isExtensible(error)) {
      // Enrich error with context
      throw Object.assign(error, enrichment);
    }

    // Assigning properties onto a primitive or frozen value would itself
    // throw in strict mode and mask the real failure.
    throw Object.assign(new Error(String(error)), { cause: error }, enrichment);
  }
}

3. Idempotency

typescript
/**
 * An operation that reuses the stored result of an earlier run with the same
 * idempotency key instead of executing again.
 *
 * Declared as an abstract class (not an interface) because `execute` has an
 * implementation; subclasses supply the lookup, the work itself and the
 * result storage.
 *
 * NOTE(review): the check and the store are separate steps, so two concurrent
 * `execute` calls with the same key can both run `doExecute` - confirm whether
 * callers rely on stronger guarantees.
 */
abstract class IdempotentOperation {
  /** @param idempotencyKey Key identifying this logical operation across retries. */
  constructor(public idempotencyKey: string) {}

  /** Looks up a previously stored outcome for `key`; a falsy value means "not executed yet". */
  protected abstract checkExisting(
    key: string
  ): Promise<{ result: Result } | null | undefined>;
  /** Performs the actual work. */
  protected abstract doExecute(): Promise<Result>;
  /** Persists `result` under `key` for later lookups. */
  protected abstract storeResult(key: string, result: Result): Promise<void>;

  /**
   * Returns the stored result for this key if there is one; otherwise runs
   * the operation, stores its result and returns it.
   */
  async execute(): Promise<Result> {
    // Check if already executed
    const existing = await this.checkExisting(this.idempotencyKey);
    if (existing) {
      return existing.result;
    }

    // Execute and store result
    const result = await this.doExecute();
    await this.storeResult(this.idempotencyKey, result);

    return result;
  }
}

Open Questions

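- When should a component fail fast, and when should it fail safe?
- What are the optimal retry counts and delays?
- How should recovery procedures be tested?
- How should resilience be balanced against the complexity it adds?
- How can system reliability be measured accurately?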