Overview

STARK employs a comprehensive error handling system that combines algebraic error types (Result/Option) with structured exception handling, optimized for AI/ML workflows.

Error Handling Philosophy

STARK's error handling is designed around:

  • Type Safety - Errors are part of the type system, preventing unhandled errors
  • Explicit Error Handling - Errors must be explicitly handled or propagated
  • Rich Error Context - Detailed error information for debugging and monitoring
  • Composable Error Types - Easy composition and transformation of errors
  • Performance - Zero-cost error handling for the happy path
  • ML-Specific Errors - Domain-specific error types for AI/ML operations

High-Level Error Handling Overview

// High-level error handling overview
/// End-to-end example: load a dataset, build a model, train it, then save it.
///
/// Every fallible step is propagated with `?`. The training step is annotated
/// with `.context(...)`, and a failed save is retried once against a backup path.
///
/// NOTE(review): the return type reads `Result` with no type arguments; they
/// appear to have been lost when this page was extracted — confirm the original.
/// NOTE(review): `config` is not declared in this snippet; presumably in scope.
/// NOTE(review): `model` is passed by value to `train_model` and borrowed again
/// by `save_model` — confirm `train_model` does not consume it.
async fn ml_pipeline() -> Result {
    // Type-safe error propagation with ?
    let dataset = Dataset::load("train.csv")?;
    let model = Model::from_config(&config)?;
    
    // Error context with custom error types
    let metrics = train_model(model, dataset)
        .await
        .context("Failed to train model")?;
    
    // Error recovery with fallback
    save_model(&model, "model.onnx")
        .or_else(|e| {
            // The primary failure is only logged; the backup save's error (if any)
            // is the one propagated by the trailing `?`.
            warn!("Primary save failed: {e}, trying backup location");
            save_model(&model, "backup/model.onnx")
        })?;
    
    Ok(metrics)
}

Result and Option Types

Core Types Definition

// Result type for operations that may fail
/// Outcome of an operation that may fail.
///
/// `T` is the success payload and `E` the error payload. The type parameters
/// were missing from the declaration even though both variants use them.
enum Result<T, E> {
    /// The operation succeeded and produced a value.
    Ok(T),
    /// The operation failed with an error.
    Err(E)
}

impl Result {
    // Construction
    fn ok(value: T) -> Result { Result::Ok(value) }
    fn err(error: E) -> Result { Result::Err(error) }
    
    // Query methods
    fn is_ok(&self) -> bool;
    fn is_err(&self) -> bool;
    
    // Transform methods
    fn map U>(self, op: F) -> Result;
    fn map_err F>(self, op: O) -> Result;
    
    // Boolean operations
    fn and_then Result>(self, op: F) -> Result;
    fn or_else Result>(self, op: O) -> Result;
    
    // Unwrap methods (panic on error)
    fn unwrap(self) -> T where E: Debug;
    fn expect(self, msg: &str) -> T where E: Debug;
    
    // Safe unwrap methods
    fn unwrap_or(self, default: T) -> T;
    fn unwrap_or_else T>(self, op: F) -> T;
}

// Option type for nullable values
/// A value that may be absent.
///
/// `T` is the payload type. The type parameter was missing from the
/// declaration even though `Some` uses it.
enum Option<T> {
    /// A value is present.
    Some(T),
    /// No value is present.
    None
}

impl Option {
    // Construction
    fn some(value: T) -> Option { Option::Some(value) }
    fn none() -> Option { Option::None }
    
    // Query methods
    fn is_some(&self) -> bool;
    fn is_none(&self) -> bool;
    
    // Transform methods
    fn map U>(self, f: F) -> Option;
    fn and_then Option>(self, f: F) -> Option;
    fn filter bool>(self, predicate: P) -> Option;
    
    // Conversion methods
    fn ok_or(self, err: E) -> Result;
    fn ok_or_else E>(self, err: F) -> Result;
}

The Try Operator (?)

// Basic error propagation
/// Runs dataset loading, model and optimizer creation, training, and saving in
/// sequence, returning early with `?` at the first step that fails.
///
/// NOTE(review): the return type reads `Result` with no type arguments; they
/// appear to have been lost in extraction. For every `?` below to compile, the
/// error type must be convertible from each step's error — confirm.
/// NOTE(review): `config` and `opt_config` are not declared in this snippet.
/// NOTE(review): `model` is passed by value to `train_model` and then borrowed
/// by `save_model` — confirm `train_model` does not consume it.
fn process_ml_pipeline() -> Result {
    let dataset = load_dataset("train.csv")?;          // Propagate DatasetError
    let model = create_model(&config)?;                // Propagate ModelError
    let optimizer = create_optimizer(&opt_config)?;    // Propagate OptimizerError
    
    let metrics = train_model(model, dataset, optimizer)?; // Propagate TrainingError
    save_model(&model, "output.onnx")?;                // Propagate IOError
    
    Ok(metrics)
}

// Option propagation
fn find_best_checkpoint(directory: &str) -> Option {
    let entries = std::fs::read_dir(directory).ok()?;
    let mut best_checkpoint = None;
    let mut best_metric = 0.0;
    
    for entry in entries {
        let path = entry.ok()?.path();
        let metadata = parse_checkpoint_metadata(&path)?;
        
        if metadata.validation_accuracy > best_metric {
            best_metric = metadata.validation_accuracy;
            best_checkpoint = Some(path.to_string_lossy().to_string());
        }
    }
    
    best_checkpoint
}

Error Composition and Conversion

// Automatic error conversion with From trait
/// Value-to-value conversion: builds `Self` from a `T`.
///
/// The `T` parameter was missing from the trait header even though `from`
/// takes a `T`.
trait From<T> {
    /// Converts `value` into `Self`.
    fn from(value: T) -> Self;
}

// Error composition with multiple error types
/// Sum type that wraps several unrelated error types, one variant per source,
/// so a single function can return any of them.
enum CombinedError {
    /// Wraps an `IOError`.
    Io(IOError),
    /// Wraps a `ParseError`.
    Parse(ParseError),
    /// Wraps a `ValidationError`.
    Validation(ValidationError),
    /// Wraps a `NetworkError`.
    Network(NetworkError)
}

impl From for CombinedError {
    fn from(err: IOError) -> Self { CombinedError::Io(err) }
}

impl From for CombinedError {
    fn from(err: ParseError) -> Self { CombinedError::Parse(err) }
}

// Error boxing for dynamic error types
/// Type-erased, heap-allocated error.
///
/// The boxed type was missing. `dyn Error` matches the usage note that any
/// value implementing `Error` is accepted.
/// NOTE(review): the original may also have carried `+ Send + Sync` — confirm.
type BoxError = Box<dyn Error>;

/// Chains two fallible steps and returns the second step's output, propagating
/// either step's error with `?`.
///
/// NOTE(review): the return type reads `Result` with no type arguments; they
/// appear to have been lost in extraction. Given the surrounding text the error
/// side is presumably `BoxError`; the success type cannot be determined here.
fn flexible_operation() -> Result {
    let data = risky_io_operation()?;           // Any error implementing Error
    let processed = complex_computation(data)?;  // Any error implementing Error
    Ok(processed)
}

ML-Specific Error Types

Tensor Operation Errors

// Tensor operation errors
/// Failures raised by tensor operations.
///
/// NOTE(review): every `Vec` field below has lost its element type in
/// extraction (presumably an integer dimension/index type) — confirm.
#[derive(Debug)]
enum TensorError {
    /// Operand shapes are incompatible for the named operation.
    ShapeMismatch { 
        operation: String,
        expected: Vec, 
        actual: Vec 
    },
    /// A tensor is on a different device than expected.
    DeviceMismatch { 
        expected: Device, 
        actual: Device 
    },
    /// A tensor has a different element type than expected.
    DTypeMismatch { 
        expected: DataType, 
        actual: DataType 
    },
    /// An index does not fit within the tensor's shape.
    IndexOutOfBounds { 
        index: Vec, 
        shape: Vec 
    },
    /// An error code and message reported for a CUDA failure.
    CudaError { 
        code: i32, 
        message: String 
    },
    /// An allocation request exceeded what the device had available.
    OutOfMemory { 
        requested: usize, 
        available: usize, 
        device: Device 
    }
}

/// Human-readable messages for every `TensorError` variant.
///
/// The match previously omitted `DTypeMismatch` and `IndexOutOfBounds`, so it
/// was not exhaustive; both now have messages in the style of their siblings.
impl Display for TensorError {
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        match self {
            TensorError::ShapeMismatch { operation, expected, actual } =>
                write!(f, "Shape mismatch in {operation}: expected {expected:?}, got {actual:?}"),
            TensorError::DeviceMismatch { expected, actual } =>
                write!(f, "Device mismatch: expected {expected:?}, got {actual:?}"),
            TensorError::DTypeMismatch { expected, actual } =>
                write!(f, "DType mismatch: expected {expected:?}, got {actual:?}"),
            TensorError::IndexOutOfBounds { index, shape } =>
                write!(f, "Index {index:?} out of bounds for shape {shape:?}"),
            TensorError::CudaError { code, message } =>
                write!(f, "CUDA error {code}: {message}"),
            TensorError::OutOfMemory { requested, available, device } =>
                write!(f, "Out of memory on {device:?}: requested {requested}, available {available}")
        }
    }
}

Model Errors

// Model errors
/// Failures related to loading, saving, validating, and running a model.
///
/// NOTE(review): the `Vec` shape fields below have lost their element type in
/// extraction — confirm against the original.
#[derive(Debug)]
enum ModelError {
    /// The model at `path` could not be loaded.
    LoadingFailed { 
        path: String, 
        format: String, 
        reason: String 
    },
    /// The model could not be written to `path`.
    SavingFailed { 
        path: String, 
        reason: String 
    },
    /// The architecture found differs from the one expected.
    InvalidArchitecture { 
        expected: String, 
        found: String 
    },
    /// No weights are available for the named layer.
    MissingWeights { 
        layer: String 
    },
    /// The named layer's weights have a different shape than expected.
    IncompatibleWeights { 
        layer: String, 
        expected_shape: Vec, 
        actual_shape: Vec 
    },
    /// The forward pass failed at the named layer.
    ForwardPassFailed { 
        layer: String, 
        input_shape: Vec, 
        reason: String 
    },
    /// The backward pass failed at the named layer.
    BackwardPassFailed { 
        layer: String, 
        gradient_shape: Vec, 
        reason: String 
    },
    /// Records the epoch, loss, and threshold involved in a convergence failure.
    ConvergenceError { 
        epoch: u32, 
        loss: f64, 
        threshold: f64 
    }
}

Training and Dataset Errors

// Training errors
/// Failures that can occur during a training run.
///
/// NOTE(review): `Box` and the `Vec` shape fields below have lost their type
/// arguments in extraction — confirm against the original.
#[derive(Debug)]
enum TrainingError {
    /// Wraps an underlying dataset failure.
    // NOTE(review): presumably a boxed error (e.g. a `DatasetError`) — confirm.
    DatasetError { 
        source: Box 
    },
    /// Wraps an underlying `ModelError`.
    ModelError { 
        source: ModelError 
    },
    /// The named optimizer failed.
    OptimizerError { 
        optimizer: String, 
        reason: String 
    },
    /// Records the prediction and target shapes involved in a loss-computation failure.
    LossComputation { 
        predictions_shape: Vec, 
        targets_shape: Vec 
    },
    /// A checkpoint operation failed for the given epoch and path.
    CheckpointError { 
        epoch: u32, 
        path: String, 
        reason: String 
    },
    /// Records the epoch, patience, and metrics associated with early stopping.
    EarlyStopping { 
        epoch: u32, 
        patience: u32, 
        best_metric: f64, 
        current_metric: f64 
    },
    /// Records the epoch, batch, and offending value of a numerical problem.
    NumericalInstability { 
        epoch: u32, 
        batch: u32, 
        value: f64 
    }
}

// Dataset errors
/// Failures related to loading, transforming, batching, and validating data.
///
/// NOTE(review): the `Vec` column fields below have lost their element type in
/// extraction (presumably column names) — confirm against the original.
#[derive(Debug)]
enum DatasetError {
    /// The dataset identified by `source` could not be loaded.
    LoadingFailed { 
        source: String, 
        reason: String 
    },
    /// The named transform failed on the item at `item_index`.
    TransformFailed { 
        transform: String, 
        item_index: usize, 
        reason: String 
    },
    /// Records the requested batch size and the number of items available.
    BatchingFailed { 
        batch_size: usize, 
        available_items: usize 
    },
    /// The columns found differ from the columns expected.
    SchemaValidation { 
        expected_columns: Vec, 
        found_columns: Vec 
    },
    /// The item at `item_index` is corrupted.
    CorruptedData { 
        item_index: usize, 
        details: String 
    }
}

Inference Errors

// Inference errors
/// Failures that can occur while serving predictions.
///
/// NOTE(review): the `Vec` shape fields below have lost their element type in
/// extraction — confirm against the original.
#[derive(Debug)]
enum InferenceError {
    /// No model is loaded.
    ModelNotLoaded,
    /// The input's shape differs from the shape expected.
    InvalidInput { 
        expected_shape: Vec, 
        actual_shape: Vec 
    },
    /// Post-processing of the output failed.
    PostprocessingFailed { 
        reason: String 
    },
    /// The batch size differs from the one expected.
    BatchSizeMismatch { 
        expected: usize, 
        actual: usize 
    },
    /// Records the configured timeout and the time actually elapsed.
    TimeoutExceeded { 
        timeout: Duration, 
        elapsed: Duration 
    },
    /// The named resource ran out.
    ResourceExhausted { 
        resource: String 
    }
}

Error Context and Chaining

Error Context

// Context extension trait
trait Context {
    fn context(self, context: C) -> Result;
    fn with_context C>(self, f: F) -> Result;
}

impl Context for Result {
    fn context(self, context: C) -> Result {
        self.map_err(|e| ContextError::new(e).context(context))
    }
    
    fn with_context C>(self, f: F) -> Result {
        self.map_err(|e| ContextError::new(e).context(f()))
    }
}

// Usage example with rich error context
/// Runs a training job described by `config`, attaching a human-readable
/// context message to each step's error before propagating it with `?`.
///
/// Static messages use `.context(...)`; messages that interpolate values use
/// `.with_context(|| ...)`.
///
/// NOTE(review): the return type reads `Result` with no type arguments; they
/// appear to have been lost in extraction — confirm against the original.
/// NOTE(review): `model` is passed by value to `train_model` and then borrowed
/// by `save_checkpoint` — confirm `train_model` does not consume it.
async fn train_with_context(config: &TrainingConfig) -> Result {
    let dataset = Dataset::load(&config.dataset_path)
        .context("Failed to load training dataset")?;
    
    let model = Model::from_config(&config.model)
        .with_context(|| format!("Failed to create model with architecture '{}'", 
                                config.model.architecture))?;
    
    let optimizer = create_optimizer(&config.optimizer)
        .with_context(|| format!("Failed to create {} optimizer", 
                                config.optimizer.name))?;
    
    let metrics = train_model(model, dataset, optimizer).await
        .context("Training failed")?;
    
    save_checkpoint(&model, &config.checkpoint_path)
        .with_context(|| format!("Failed to save checkpoint to '{}'", 
                                config.checkpoint_path))?;
    
    Ok(metrics)
}

Error Chain Display

// Error chain display
/// Prints `error` to stderr, followed by each of its `source()` causes on an
/// indented line numbered from 1.
fn display_error_chain(error: &dyn Error) {
    eprintln!("Error: {}", error);

    // Lazily follow `source()` links until one returns `None`.
    let causes = std::iter::successors(error.source(), |&cause| cause.source());

    for (position, cause) in causes.enumerate() {
        eprintln!("  {}: {}", position + 1, cause);
    }
}

// Chain of errors
/// Helpers for walking an error's chain of causes.
trait ErrorChain {
    /// Returns an `ErrorChainIter` over this error's chain.
    fn iter_chain(&self) -> ErrorChainIter<'_>;
    /// Returns the root cause of this error as a trait object.
    fn find_root_cause(&self) -> &dyn Error;
}

impl ErrorChain for T {
    fn iter_chain(&self) -> ErrorChainIter<'_> {
        ErrorChainIter { current: Some(self) }
    }
    
    fn find_root_cause(&self) -> &dyn Error {
        let mut current = self as &dyn Error;
        while let Some(source) = current.source() {
            current = source;
        }
        current
    }
}

Error Recovery and Fallbacks

Retry with Exponential Backoff

// Retry with exponential backoff
/// Tuning knobs for `retry_with_backoff`.
struct RetryConfig {
    /// Attempt count at which the retry loop gives up and returns the last error.
    max_attempts: u32,
    /// Delay before the first retry.
    base_delay: Duration,
    /// Upper bound applied to the delay after each multiplication.
    max_delay: Duration,
    /// Multiplier applied to the delay after each failed attempt.
    backoff_factor: f64
}

impl Default for RetryConfig {
    fn default() -> Self {
        RetryConfig {
            max_attempts: 3,
            base_delay: Duration::from_millis(100),
            max_delay: Duration::from_secs(30),
            backoff_factor: 2.0
        }
    }
}

async fn retry_with_backoff(
    mut operation: F,
    config: RetryConfig
) -> Result
where
    F: FnMut() -> Fut,
    Fut: Future>,
    E: std::fmt::Display
{
    let mut attempt = 0;
    let mut delay = config.base_delay;
    
    loop {
        attempt += 1;
        
        match operation().await {
            Ok(result) => return Ok(result),
            Err(error) => {
                if attempt >= config.max_attempts {
                    error!("Operation failed after {} attempts, last error: {}", 
                          attempt, error);
                    return Err(error);
                }
                
                warn!("Attempt {} failed: {}, retrying in {:?}", 
                     attempt, error, delay);
                
                sleep(delay).await;
                
                // Exponential backoff
                delay = std::cmp::min(
                    Duration::from_secs_f64(
                        delay.as_secs_f64() * config.backoff_factor
                    ),
                    config.max_delay
                );
            }
        }
    }
}

Circuit Breaker Pattern

// Circuit breaker pattern
struct CircuitBreaker {
    failure_threshold: u32,
    success_threshold: u32,
    timeout: Duration,
    state: CircuitState,
    failure_count: u32,
    success_count: u32,
    last_failure_time: Option
}

/// The three states of a `CircuitBreaker`.
enum CircuitState {
    Closed,    // Normal operation
    Open,      // Failing, reject requests
    HalfOpen   // Testing if service recovered
}

impl CircuitBreaker {
    fn new(failure_threshold: u32, timeout: Duration) -> Self {
        CircuitBreaker {
            failure_threshold,
            success_threshold: failure_threshold / 2,
            timeout,
            state: CircuitState::Closed,
            failure_count: 0,
            success_count: 0,
            last_failure_time: None
        }
    }
    
    async fn call(&mut self, operation: F) -> Result>
    where
        F: FnOnce() -> Fut,
        Fut: Future>
    {
        match self.state {
            CircuitState::Closed => {
                match operation().await {
                    Ok(result) => {
                        self.on_success();
                        Ok(result)
                    }
                    Err(error) => {
                        self.on_failure();
                        Err(CircuitBreakerError::OperationFailed(error))
                    }
                }
            }
            CircuitState::Open => {
                if let Some(last_failure) = self.last_failure_time {
                    if last_failure.elapsed() > self.timeout {
                        self.state = CircuitState::HalfOpen;
                        self.call(operation).await
                    } else {
                        Err(CircuitBreakerError::CircuitOpen)
                    }
                } else {
                    Err(CircuitBreakerError::CircuitOpen)
                }
            }
            CircuitState::HalfOpen => {
                match operation().await {
                    Ok(result) => {
                        self.success_count += 1;
                        if self.success_count >= self.success_threshold {
                            self.state = CircuitState::Closed;
                            self.failure_count = 0;
                            self.success_count = 0;
                        }
                        Ok(result)
                    }
                    Err(error) => {
                        self.state = CircuitState::Open;
                        self.last_failure_time = Some(Instant::now());
                        Err(CircuitBreakerError::OperationFailed(error))
                    }
                }
            }
        }
    }
}

Error Recovery with Multiple Attempts

// Error recovery with multiple attempts
async fn robust_model_loading(paths: &[String]) -> Result {
    let mut last_error = None;
    
    for (i, path) in paths.iter().enumerate() {
        match Model::load(path).await {
            Ok(model) => {
                if i > 0 {
                    warn!("Loaded model from fallback path: {}", path);
                }
                return Ok(model);
            }
            Err(e) => {
                warn!("Failed to load model from {}: {}", path, e);
                last_error = Some(e);
            }
        }
    }
    
    Err(last_error.unwrap_or_else(|| ModelError::LoadingFailed {
        path: "no paths provided".to_string(),
        format: "unknown".to_string(),
        reason: "no model paths specified".to_string()
    }))
}

// Graceful degradation
/// Runs a prediction on `primary_model` and, if that fails, retries the same
/// input on `fallback_model` when one is provided.
///
/// Returns the primary result on success. If the primary fails and there is no
/// fallback, the primary error is returned. If both fail, the fallback's error
/// is returned and both errors are logged.
///
/// NOTE(review): the return type reads `Result, InferenceError>`; its first
/// type argument (and possibly a type argument on `input: Tensor`) appears to
/// have been lost in extraction — confirm against the original.
async fn inference_with_fallback(
    input: Tensor,
    primary_model: &Model,
    fallback_model: Option<&Model>
) -> Result, InferenceError> {
    match primary_model.predict(&input).await {
        Ok(result) => Ok(result),
        Err(e) => {
            warn!("Primary model failed: {}, trying fallback", e);
            
            if let Some(fallback) = fallback_model {
                fallback.predict(&input).await
                    .map_err(|fallback_err| {
                        // Log both failures, then surface the fallback's error.
                        error!("Both primary and fallback models failed");
                        error!("Primary error: {}", e);
                        error!("Fallback error: {}", fallback_err);
                        fallback_err
                    })
            } else {
                Err(e)
            }
        }
    }
}

Production Features

Error Aggregation

// Collect multiple errors
fn validate_all_inputs(inputs: &[Input]) -> Result<(), ValidationErrors> {
    let mut errors = Vec::new();
    
    for (i, input) in inputs.iter().enumerate() {
        if let Err(e) = validate_input(input) {
            errors.push(IndexedError { index: i, error: e });
        }
    }
    
    if errors.is_empty() {
        Ok(())
    } else {
        Err(ValidationErrors { errors })
    }
}

#[derive(Debug)]
struct ValidationErrors {
    errors: Vec>
}

/// An error paired with the position of the input that produced it.
///
/// The `E` parameter was missing from the declaration even though the `error`
/// field uses it.
#[derive(Debug)]
struct IndexedError<E> {
    /// Position of the offending input in the validated slice.
    index: usize,
    /// The error produced for that input.
    error: E
}

Error Reporting and Monitoring

// Error reporting and logging
/// Bundles the services used to report an error.
struct ErrorReporter {
    /// Logging handle.
    logger: Logger,
    /// Used to increment the per-operation error counter.
    metrics: MetricsCollector,
    /// Used to send alerts for critical errors.
    alerting: AlertingService
}

impl ErrorReporter {
    /// Reports `error`: logs it and its chain, bumps the error counter for
    /// `context.operation`, and sends an alert if the severity is at least
    /// `Severity::Critical`.
    ///
    /// A failure to deliver the alert is logged as a warning and otherwise
    /// ignored.
    async fn report_error(&self, error: &dyn Error, context: ErrorContext) {
        // Log first, so the record exists even if a later step fails.
        error!("Error occurred: {}", error);
        self.log_error_chain(error);

        // Metrics are recorded for every error, whatever its severity.
        self.metrics.increment_error_counter(&context.operation, &error.to_string());

        // Only critical errors go on to alerting.
        let is_critical = context.severity >= Severity::Critical;
        if !is_critical {
            return;
        }

        let alert = Alert {
            title: format!("Critical error in {}", context.operation),
            description: error.to_string(),
            severity: context.severity,
            timestamp: Utc::now(),
            context: context.clone()
        };

        let delivery = self.alerting.send_alert(alert).await;
        if let Err(e) = delivery {
            warn!("Failed to send alert: {}", e);
        }
    }
}

/// Metadata that accompanies an error when it is reported.
///
/// NOTE(review): `Option` and `HashMap` below have lost their type arguments
/// in extraction (presumably string identifiers and string key/value pairs) —
/// confirm against the original.
#[derive(Clone, Debug)]
struct ErrorContext {
    /// Name of the operation that failed; used in metrics and alert titles.
    operation: String,
    /// Severity; alerts are sent at `Severity::Critical` and above.
    severity: Severity,
    /// Optional user identifier.
    user_id: Option,
    /// Optional request identifier.
    request_id: Option,
    /// Additional free-form context entries.
    additional_context: HashMap
}

Key Benefits

🛡️ Type Safety

Errors are part of the type system, preventing unhandled error conditions

⚡ Zero Cost

No runtime overhead for error handling in the happy path

🤖 ML-Optimized

Domain-specific error types for tensor operations, training, and inference

🔧 Composable

Easy composition and transformation of errors across system boundaries

📊 Rich Context

Detailed error information with chaining and contextual information

🏭 Production Ready

Circuit breakers, retry mechanisms, monitoring, and alerting integration