# Network architecture settings: layer sizes, activation function and its
# tuning knobs, and the parameters of the initialization distribution.
ARCHITECTURE:
  width: 784
  depth: 4
  input_dim: 784
  activation: 'gelu'
  activation_control: 0.1  # useful for cswish
  activation_power: 1.0  # useful for repu, cswish
  variance: [2, 0]  # list of IC distribution parameters (Cw, Cb)
  # Booleans are written in canonical lowercase so every YAML loader
  # (1.1 and 1.2) resolves them to real booleans rather than strings.
  critical: false
  renormalize: false
  
# Sampling settings: RNG seed and how many initializations/inputs to use.
SAMPLING:
  # Seed for the random number generator.
  seed: 1
  # Number of initializations to generate.
  num_samples: 1000
  # Number of inputs to test; inputs are drawn from N(0,1).
  num_inputs: 5
  
# Output locations for results and plots.
# NOTE(review): presumably directory names relative to the working
# directory -- confirm against the code that reads this section.
RESULT:
  path: 'result'
  plot: 'plots'
  
# Flat top-level settings (not nested under any section above).
# NOTE(review): these look like training hyperparameters (optimizer step
# size, regularization, momentum) plus an output directory and a class
# count -- confirm against the script that consumes them.
epochs: 20000
lr: 0.001
weight_decay: 0.0
momentum: 0.0
save_dir: 'train_results'
classes: 10

# Params for multi GPU/CPU processing. No parameters are defined yet, so the
# value is an explicit null (the same value the bare `Distributed:` key
# parsed to). Replace `null` with a nested mapping when params are added.
Distributed: null

