🚨 Error Handling System Specification
Type-Safe Error Handling for Production ML Systems
Overview
STARK employs a comprehensive error handling system that combines algebraic error types (Result/Option) with structured exception handling, optimized for AI/ML workflows.
Error Handling Philosophy
STARK's error handling is designed around:
- Type Safety - Errors are part of the type system, preventing unhandled errors
- Explicit Error Handling - Errors must be explicitly handled or propagated
- Rich Error Context - Detailed error information for debugging and monitoring
- Composable Error Types - Easy composition and transformation of errors
- Performance - Zero-cost error handling for the happy path
- ML-Specific Errors - Domain-specific error types for AI/ML operations
High-Level Error Handling Overview
// High-level error handling overview
// Shows three error-handling styles in one pipeline: plain `?` propagation,
// `.context(...)` annotation, and `or_else` recovery with a fallback.
// NOTE(review): the return type reads as a bare `Result`; its success and
// error type arguments are not visible here — restore them from the spec.
// NOTE(review): `config` is not declared in this function; presumably it is
// defined in an enclosing scope — confirm.
async fn ml_pipeline() -> Result {
// Type-safe error propagation with ?
let dataset = Dataset::load("train.csv")?;
let model = Model::from_config(&config)?;
// Error context with custom error types
// NOTE(review): `model` is passed by value here and borrowed again below for
// `save_model` — confirm `train_model` does not consume it.
let metrics = train_model(model, dataset)
.await
.context("Failed to train model")?;
// Error recovery with fallback
// If the first save fails, a warning is logged and the save is retried at the
// backup path; the trailing `?` propagates only the backup attempt's error.
save_model(&model, "model.onnx")
.or_else(|e| {
warn!("Primary save failed: {e}, trying backup location");
save_model(&model, "backup/model.onnx")
})?;
Ok(metrics)
}
Result and Option Types
Core Types Definition
// Result type for operations that may fail.
// `T` is the payload carried by `Ok`; `E` is the payload carried by `Err`.
enum Result<T, E> {
    Ok(T),
    Err(E)
}
impl Result {
// Construction
fn ok(value: T) -> Result { Result::Ok(value) }
fn err(error: E) -> Result { Result::Err(error) }
// Query methods
fn is_ok(&self) -> bool;
fn is_err(&self) -> bool;
// Transform methods
fn map U>(self, op: F) -> Result;
fn map_err F>(self, op: O) -> Result;
// Boolean operations
fn and_then Result>(self, op: F) -> Result;
fn or_else Result>(self, op: O) -> Result;
// Unwrap methods (panic on error)
fn unwrap(self) -> T where E: Debug;
fn expect(self, msg: &str) -> T where E: Debug;
// Safe unwrap methods
fn unwrap_or(self, default: T) -> T;
fn unwrap_or_else T>(self, op: F) -> T;
}
// Option type for nullable values.
// `T` is the payload carried by `Some`; `None` carries nothing.
enum Option<T> {
    Some(T),
    None
}
impl Option {
// Construction
fn some(value: T) -> Option { Option::Some(value) }
fn none() -> Option { Option::None }
// Query methods
fn is_some(&self) -> bool;
fn is_none(&self) -> bool;
// Transform methods
fn map U>(self, f: F) -> Option;
fn and_then Option>(self, f: F) -> Option;
fn filter bool>(self, predicate: P) -> Option;
// Conversion methods
fn ok_or(self, err: E) -> Result;
fn ok_or_else E>(self, err: F) -> Result;
}
The Try Operator (?)
// Basic error propagation
// Every fallible step is followed by `?`, so the first failure returns early;
// `metrics` is returned only when all five steps succeed.
// NOTE(review): the return type reads as a bare `Result`; its type arguments
// are not visible here. The trailing comments name five different error
// kinds, so presumably the function's error type is convertible from each of
// them — confirm and restore the type arguments.
// NOTE(review): `config` and `opt_config` are not declared in this function;
// presumably they come from an enclosing scope — confirm.
fn process_ml_pipeline() -> Result {
let dataset = load_dataset("train.csv")?; // Propagate DatasetError
let model = create_model(&config)?; // Propagate ModelError
let optimizer = create_optimizer(&opt_config)?; // Propagate OptimizerError
let metrics = train_model(model, dataset, optimizer)?; // Propagate TrainingError
// NOTE(review): `model` was passed by value on the previous line and is
// borrowed here — confirm `train_model` does not consume it.
save_model(&model, "output.onnx")?; // Propagate IOError
Ok(metrics)
}
// Option propagation
fn find_best_checkpoint(directory: &str) -> Option {
let entries = std::fs::read_dir(directory).ok()?;
let mut best_checkpoint = None;
let mut best_metric = 0.0;
for entry in entries {
let path = entry.ok()?.path();
let metadata = parse_checkpoint_metadata(&path)?;
if metadata.validation_accuracy > best_metric {
best_metric = metadata.validation_accuracy;
best_checkpoint = Some(path.to_string_lossy().to_string());
}
}
best_checkpoint
}
Error Composition and Conversion
// Automatic error conversion with From trait.
// `T` is the source type being converted; the implementing type is the target.
// Being generic over `T` is what lets one target type implement the trait
// once per source type.
trait From<T> {
    fn from(value: T) -> Self;
}
// Error composition with multiple error types
enum CombinedError {
Io(IOError),
Parse(ParseError),
Validation(ValidationError),
Network(NetworkError)
}
impl From for CombinedError {
fn from(err: IOError) -> Self { CombinedError::Io(err) }
}
impl From for CombinedError {
fn from(err: ParseError) -> Self { CombinedError::Parse(err) }
}
// Error boxing for dynamic error types.
// NOTE(review): the boxed type is reconstructed as `dyn Error`; the original
// may carry extra bounds (e.g. `Send + Sync`) — confirm.
type BoxError = Box<dyn Error>;
// Chains two fallible calls with `?` and returns the second call's value.
// NOTE(review): the return type reads as a bare `Result`; its type arguments
// are not visible here. Presumably the error type is the `BoxError` alias
// declared just above, which would explain the "any error" comments — confirm.
fn flexible_operation() -> Result {
let data = risky_io_operation()?; // Any error implementing Error
let processed = complex_computation(data)?; // Any error implementing Error
Ok(processed)
}
ML-Specific Error Types
Tensor Operation Errors
// Tensor operation errors
#[derive(Debug)]
enum TensorError {
ShapeMismatch {
operation: String,
expected: Vec,
actual: Vec
},
DeviceMismatch {
expected: Device,
actual: Device
},
DTypeMismatch {
expected: DataType,
actual: DataType
},
IndexOutOfBounds {
index: Vec,
shape: Vec
},
CudaError {
code: i32,
message: String
},
OutOfMemory {
requested: usize,
available: usize,
device: Device
}
}
// Human-readable message for each `TensorError` variant.
// Every variant has an arm, so adding a variant to the enum forces this
// match to be updated.
impl Display for TensorError {
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        match self {
            TensorError::ShapeMismatch { operation, expected, actual } =>
                write!(f, "Shape mismatch in {operation}: expected {expected:?}, got {actual:?}"),
            TensorError::DeviceMismatch { expected, actual } =>
                write!(f, "Device mismatch: expected {expected:?}, got {actual:?}"),
            TensorError::DTypeMismatch { expected, actual } =>
                write!(f, "Data type mismatch: expected {expected:?}, got {actual:?}"),
            TensorError::IndexOutOfBounds { index, shape } =>
                write!(f, "Index {index:?} out of bounds for shape {shape:?}"),
            TensorError::CudaError { code, message } =>
                write!(f, "CUDA error {code}: {message}"),
            TensorError::OutOfMemory { requested, available, device } =>
                write!(f, "Out of memory on {device:?}: requested {requested}, available {available}")
        }
    }
}
Model Errors
// Model errors: loading/saving, architecture and weight validation, and
// failures during the forward/backward pass.
// NOTE(review): shape element types are reconstructed as `usize` — confirm
// against the spec.
#[derive(Debug)]
enum ModelError {
    LoadingFailed {
        path: String,
        format: String,
        reason: String
    },
    SavingFailed {
        path: String,
        reason: String
    },
    InvalidArchitecture {
        expected: String,
        found: String
    },
    MissingWeights {
        layer: String
    },
    IncompatibleWeights {
        layer: String,
        expected_shape: Vec<usize>,
        actual_shape: Vec<usize>
    },
    ForwardPassFailed {
        layer: String,
        input_shape: Vec<usize>,
        reason: String
    },
    BackwardPassFailed {
        layer: String,
        gradient_shape: Vec<usize>,
        reason: String
    },
    ConvergenceError {
        epoch: u32,
        loss: f64,
        threshold: f64
    }
}
Training and Dataset Errors
// Training errors
#[derive(Debug)]
enum TrainingError {
DatasetError {
source: Box
},
ModelError {
source: ModelError
},
OptimizerError {
optimizer: String,
reason: String
},
LossComputation {
predictions_shape: Vec,
targets_shape: Vec
},
CheckpointError {
epoch: u32,
path: String,
reason: String
},
EarlyStopping {
epoch: u32,
patience: u32,
best_metric: f64,
current_metric: f64
},
NumericalInstability {
epoch: u32,
batch: u32,
value: f64
}
}
// Dataset errors: loading, per-item transforms, batching, schema checks, and
// corrupted items.
// NOTE(review): column lists are reconstructed as `Vec<String>` (column
// names) — confirm against the spec.
#[derive(Debug)]
enum DatasetError {
    LoadingFailed {
        source: String,
        reason: String
    },
    TransformFailed {
        transform: String,
        item_index: usize,
        reason: String
    },
    BatchingFailed {
        batch_size: usize,
        available_items: usize
    },
    SchemaValidation {
        expected_columns: Vec<String>,
        found_columns: Vec<String>
    },
    CorruptedData {
        item_index: usize,
        details: String
    }
}
Inference Errors
// Inference errors: missing model, bad input, post-processing failure,
// batch-size mismatch, timeout, and resource exhaustion.
// NOTE(review): shape element types are reconstructed as `usize` — confirm
// against the spec.
#[derive(Debug)]
enum InferenceError {
    ModelNotLoaded,
    InvalidInput {
        expected_shape: Vec<usize>,
        actual_shape: Vec<usize>
    },
    PostprocessingFailed {
        reason: String
    },
    BatchSizeMismatch {
        expected: usize,
        actual: usize
    },
    TimeoutExceeded {
        timeout: Duration,
        elapsed: Duration
    },
    ResourceExhausted {
        resource: String
    }
}
Error Context and Chaining
Error Context
// Context extension trait
trait Context {
fn context(self, context: C) -> Result;
fn with_context C>(self, f: F) -> Result;
}
impl Context for Result {
fn context(self, context: C) -> Result {
self.map_err(|e| ContextError::new(e).context(context))
}
fn with_context C>(self, f: F) -> Result {
self.map_err(|e| ContextError::new(e).context(f()))
}
}
// Usage example with rich error context
// Each step attaches a message before propagating with `?`. Fixed messages use
// `.context(...)`; messages that interpolate config values use
// `.with_context(|| format!(...))`, which takes a closure rather than an
// already-built string.
// NOTE(review): the return type reads as a bare `Result`; its type arguments
// are not visible here — restore them from the spec.
async fn train_with_context(config: &TrainingConfig) -> Result {
let dataset = Dataset::load(&config.dataset_path)
.context("Failed to load training dataset")?;
let model = Model::from_config(&config.model)
.with_context(|| format!("Failed to create model with architecture '{}'",
config.model.architecture))?;
let optimizer = create_optimizer(&config.optimizer)
.with_context(|| format!("Failed to create {} optimizer",
config.optimizer.name))?;
// NOTE(review): `model` is passed by value here and borrowed again below for
// `save_checkpoint` — confirm `train_model` does not consume it.
let metrics = train_model(model, dataset, optimizer).await
.context("Training failed")?;
save_checkpoint(&model, &config.checkpoint_path)
.with_context(|| format!("Failed to save checkpoint to '{}'",
config.checkpoint_path))?;
Ok(metrics)
}
Error Chain Display
// Error chain display
// Writes `error` to stderr, then each successive `source()` on its own line,
// numbered from 1, stopping at the first cause that reports no source.
fn display_error_chain(error: &dyn Error) {
    eprintln!("Error: {}", error);

    let mut depth = 1;
    let mut next_cause = error.source();
    loop {
        match next_cause {
            Some(cause) => {
                eprintln!(" {}: {}", depth, cause);
                next_cause = cause.source();
                depth += 1;
            }
            None => break,
        }
    }
}
// Chain of errors.
// `iter_chain` builds an `ErrorChainIter` starting at `self`;
// `find_root_cause` follows `source()` links until a cause reports none and
// returns that last error (which is `self` when there is no source at all).
trait ErrorChain {
    fn iter_chain(&self) -> ErrorChainIter<'_>;
    fn find_root_cause(&self) -> &dyn Error;
}

// Blanket impl for every error type.
// NOTE(review): the bound is reconstructed as `Error + 'static`; the original
// bound is not visible here — confirm against the spec.
impl<T: Error + 'static> ErrorChain for T {
    fn iter_chain(&self) -> ErrorChainIter<'_> {
        ErrorChainIter { current: Some(self) }
    }

    fn find_root_cause(&self) -> &dyn Error {
        let mut current = self as &dyn Error;
        while let Some(source) = current.source() {
            current = source;
        }
        current
    }
}
Error Recovery and Fallbacks
Retry with Exponential Backoff
// Retry with exponential backoff: tuning knobs.
struct RetryConfig {
    max_attempts: u32,
    base_delay: Duration,
    max_delay: Duration,
    backoff_factor: f64
}

// Defaults: 3 attempts, 100 ms base delay, 30 s maximum delay, factor 2.0.
impl Default for RetryConfig {
    fn default() -> Self {
        let base_delay = Duration::from_millis(100);
        let max_delay = Duration::from_secs(30);
        Self {
            max_attempts: 3,
            base_delay,
            max_delay,
            backoff_factor: 2.0,
        }
    }
}
// Takes a repeatable callable (`F: FnMut() -> Fut`) and a `RetryConfig`.
// NOTE(review): this declaration is incomplete as written — no generic
// parameter list follows the function name, the return type is a bare
// `Result`, the `where` clause stops at `Fut: Future` with no output binding,
// and no function body follows. Restore the full definition from the spec
// before relying on it.
async fn retry_with_backoff(
mut operation: F,
config: RetryConfig
) -> Result
where
F: FnMut() -> Fut,
Fut: Future
Circuit Breaker Pattern
// Circuit breaker pattern: thresholds, timeout, current state, and counters.
// NOTE(review): `last_failure_time` is reconstructed as `Option<Instant>`
// (a timestamp to compare against `timeout`); the original payload type is
// not visible here — confirm against the spec.
struct CircuitBreaker {
    failure_threshold: u32,
    success_threshold: u32,
    timeout: Duration,
    state: CircuitState,
    failure_count: u32,
    success_count: u32,
    last_failure_time: Option<Instant>
}

enum CircuitState {
    Closed,   // Normal operation
    Open,     // Failing, reject requests
    HalfOpen  // Testing if service recovered
}
impl CircuitBreaker {
fn new(failure_threshold: u32, timeout: Duration) -> Self {
CircuitBreaker {
failure_threshold,
success_threshold: failure_threshold / 2,
timeout,
state: CircuitState::Closed,
failure_count: 0,
success_count: 0,
last_failure_time: None
}
}
// Takes a one-shot callable (`F: FnOnce() -> Fut`) and needs `&mut self`.
// NOTE(review): this declaration is incomplete as written — no generic
// parameter list follows the method name, the return type reads `Result>`
// (its type arguments are missing), the `where` clause stops at
// `Fut: Future` with no output binding, and neither a method body nor the
// closing brace of the enclosing `impl` follows. Restore the full definition
// from the spec before relying on it.
async fn call(&mut self, operation: F) -> Result>
where
F: FnOnce() -> Fut,
Fut: Future
Error Recovery with Multiple Attempts
// Error recovery with multiple attempts
async fn robust_model_loading(paths: &[String]) -> Result {
let mut last_error = None;
for (i, path) in paths.iter().enumerate() {
match Model::load(path).await {
Ok(model) => {
if i > 0 {
warn!("Loaded model from fallback path: {}", path);
}
return Ok(model);
}
Err(e) => {
warn!("Failed to load model from {}: {}", path, e);
last_error = Some(e);
}
}
}
Err(last_error.unwrap_or_else(|| ModelError::LoadingFailed {
path: "no paths provided".to_string(),
format: "unknown".to_string(),
reason: "no model paths specified".to_string()
}))
}
// Graceful degradation
// Runs `primary_model.predict`; on failure logs a warning and, when a fallback
// model was supplied, runs `fallback.predict` on the same input.
// Errors: with no fallback, the primary model's error is returned; when both
// models fail, both errors are logged and the fallback's error is returned.
// NOTE(review): the return type reads `Result, InferenceError>`; the success
// type is missing — restore it from the spec.
async fn inference_with_fallback(
input: Tensor,
primary_model: &Model,
fallback_model: Option<&Model>
) -> Result, InferenceError> {
match primary_model.predict(&input).await {
Ok(result) => Ok(result),
Err(e) => {
warn!("Primary model failed: {}, trying fallback", e);
if let Some(fallback) = fallback_model {
fallback.predict(&input).await
.map_err(|fallback_err| {
// Both attempts failed: log each error, then surface the fallback's.
error!("Both primary and fallback models failed");
error!("Primary error: {}", e);
error!("Fallback error: {}", fallback_err);
fallback_err
})
} else {
Err(e)
}
}
}
}
Production Features
Error Aggregation
// Collect multiple errors
fn validate_all_inputs(inputs: &[Input]) -> Result<(), ValidationErrors> {
let mut errors = Vec::new();
for (i, input) in inputs.iter().enumerate() {
if let Err(e) = validate_input(input) {
errors.push(IndexedError { index: i, error: e });
}
}
if errors.is_empty() {
Ok(())
} else {
Err(ValidationErrors { errors })
}
}
#[derive(Debug)]
struct ValidationErrors {
errors: Vec>
}
// An error of type `E` paired with the position of the item that produced it.
#[derive(Debug)]
struct IndexedError<E> {
    index: usize,
    error: E
}
Error Reporting and Monitoring
// Error reporting and logging
// Bundles the three sinks an error report is sent to.
struct ErrorReporter {
logger: Logger,
metrics: MetricsCollector,
alerting: AlertingService
}
impl ErrorReporter {
// Reports `error` in three steps, in this order: log it (plus its chain via
// `log_error_chain`), bump an error counter keyed by the operation name and
// the error's string form, then — only when `context.severity` is at least
// `Severity::Critical` — send an alert. A failure to send the alert is logged
// as a warning and not returned to the caller.
// NOTE(review): `log_error_chain` is not defined in this listing; presumably
// it is another method on `ErrorReporter` — confirm.
// NOTE(review): the `logger` field is not used in this method; logging goes
// through the `error!`/`warn!` macros — confirm that is intended.
async fn report_error(&self, error: &dyn Error, context: ErrorContext) {
// Log error with full context
error!("Error occurred: {}", error);
self.log_error_chain(error);
// Collect metrics
self.metrics.increment_error_counter(&context.operation, &error.to_string());
// Send alerts for critical errors
if context.severity >= Severity::Critical {
let alert = Alert {
title: format!("Critical error in {}", context.operation),
description: error.to_string(),
severity: context.severity,
timestamp: Utc::now(),
context: context.clone()
};
if let Err(e) = self.alerting.send_alert(alert).await {
warn!("Failed to send alert: {}", e);
}
}
}
}
#[derive(Clone, Debug)]
struct ErrorContext {
operation: String,
severity: Severity,
user_id: Option,
request_id: Option,
additional_context: HashMap
}
Key Benefits
🛡️ Type Safety
Errors are part of the type system, preventing unhandled error conditions
⚡ Zero Cost
No runtime overhead for error handling in the happy path
🤖 ML-Optimized
Domain-specific error types for tensor operations, training, and inference
🔧 Composable
Easy composition and transformation of errors across system boundaries
📊 Rich Context
Detailed error information with chaining and contextual information
🏭 Production Ready
Circuit breakers, retry mechanisms, monitoring, and alerting integration