# Hydra defaults list: entries are composed in order, so `_self_` placed last
# lets the values in this file override those pulled in by the entries above.
defaults:
  - base
  - /step: sequential_markov
  - _self_

# Name of this model configuration.
# NOTE(review): how this value is consumed is not visible here — confirm.
name: factformer

# Model settings; `_target_` names models.factformer.Factformer.
params:
  _target_: models.factformer.Factformer
  n_layers: 4
  # Input dimension.
  d_model: 128
  # Dimension in each attention head; it is expanded by kernel_multiplier when
  # computing the kernel: d = dim_head * kernel_multiplier.
  dim_head: 128
  # Output dimension of the projection operator (interpolated from d_model).
  latent_dim: ${.d_model}
  # Number of attention heads.
  heads: 4
  # Output dimension (interpolated from d_model).
  dim_out: ${.d_model}
  # Use more function bases to compute the kernel:
  #   k(x_i, x_j) = \sum_{c}^{d} q_c(x_i) k_c(x_j)
  kernel_multiplier: 3
  # Whether to use rotary positional encoding; by default true.
  use_rope: true
  # Scaling factor that modulates the kernel, e.g. 1 / sqrt(d) as in scaled
  # dot-product attention; by default 1.
  scaling_factor: 1
  # NOTE(review): the three options below are not documented in this file; see
  # models.factformer.Factformer for their meaning.
  memory_augmented: false
  d_state: 1
  norm: true

# d_state: 128

# Optimizer settings; `_target_` names optimizers.setup_s4_optimizer.
optimizer:
  _target_: optimizers.setup_s4_optimizer
  lr: 0.001
  weight_decay: 0.0

batch_size: 32

# Scheduler selection (presumably for the learning rate — confirm against the
# training code). Commented-out alternatives:
# scheduler: None
# scheduler: cosine
scheduler: step
# NOTE(review): step_size and gamma presumably parameterize the `step`
# scheduler (interval and decay factor); their units are not visible here.
step_size: 200
gamma: 0.5

warmup_epochs: 1