# Gauss configuration - MIMO rank 8 with variance output (using new gauss.py)
# For use with temperature scaling and variance propagation
# Core dimensions
# `dim` is the width the encoder MLP maps into (embed_dim → dim, per the
# encoder/decoder hidden-units section of this file).
dim: 128
# NOTE(review): presumably the SSM state size — confirm against the model code.
d_state: 16
embed_dim: 256                  # Token embedding dimension

# MLP encoder/decoder hidden units (set to [] to disable)
# Each list gives the hidden layer sizes; this config uses one hidden layer each.
encoder_hidden_units: [120]   # Hidden layer sizes for encoder MLP (embed_dim → dim)
# NOTE(review): the decoder is annotated as dim → dim while the encoder maps
# embed_dim → dim; confirm the decoder output width really is `dim`.
decoder_hidden_units: [240]   # Hidden layer sizes for decoder MLP (dim → dim)

# Feature map configuration (for selective_h)
# selective_h is set to true further down in this file, so this mode is
# presumably in effect — confirm.
h_feature_map_mode: 'repeat'

# State projection
# NOTE(review): `d_model` is not a key in this file; it presumably refers to
# `dim` — confirm.
state_projection_mode: 'equal'  # state_dim = d_model (no projection overhead)

# Head combination strategy
# NOTE(review): the set of accepted strategy names is not visible in this file.
multi_head_combine: 'projected_linear'

# Discretization
discretize: true
# NOTE(review): dt_min/dt_max look like the bounds of the step-size (dt)
# initialization range — confirm.
dt_min: 0.001
dt_max: 0.1
# This floor (1e-4) is smaller than dt_min (1e-3).
# NOTE(review): if it is a clamp applied to initial dt values drawn from
# [dt_min, dt_max], it can never bind — confirm this is intended.
dt_init_floor: 0.0001

# SSM parameters
# NOTE(review): 'log_space' presumably means A is stored as log-values —
# confirm against the model code.
a_parameterization: 'log_space'
process_noise_scale: 0.01

# Selective features
# h_feature_map_mode (Feature map configuration section) is the setting this
# file associates with selective_h.
selective_h: true
# NOTE(review): with this off, process_noise_scale is presumably used as a
# fixed (non input-dependent) value — confirm.
selective_process: false

# Gating
# The on/off switch (use_gating) and gating_activation are set in the
# "Gating activation" section further down in this file.
gating_mode: 'multiplicative'
# NOTE(review): it is not visible here which MLP this configures; it is a
# separate key from encoder_hidden_units / decoder_hidden_units — confirm.
mlp_hidden_units: null

# Causal convolution
use_causal_conv: true
conv_kernel_size: 4
conv_activation: 'silu'
# NOTE(review): null presumably defers to the consumer's default grouping —
# confirm what that default is.
conv_groups: null

# Gating activation and lambda skip
# gating_mode itself is set in the "Gating" section earlier in this file.
use_gating: true
gating_activation: 'silu'
# The three lambda_* keys below configure the lambda skip term.
use_lambda_skip: true
lambda_skip_mode: 'scalar'    # 'scalar' or 'vector' (per-channel like Mamba D)
lambda_init: -1.0             # Initialization value (default -1.0)

# MIMO rank
# Matches the "MIMO rank 8" variant named in the file header.
mimo_rank: 8

# QK Norm
# NOTE(review): which tensors count as Q/K in this model is not visible here.
use_qk_norm: true

# Misc
# NOTE(review): presumably disables bias terms in the model's linear layers —
# confirm which layers honour this flag.
bias: false
return_variance: true         # ENABLED for temperature scaling / variance propagation
max_length: 1280              # Required for position embeddings in autoencoder
