# Identifier of this model configuration.
name: 'moa_spec'

# Resolved through Hydra's `_target_` mechanism: `hydra.utils.get_object` looks
# up the dotted `path` and returns the object found there, so this node is
# expected to yield the MOASpecLlamaForCausalLM class itself rather than an
# instance.
# NOTE(review): assumes the consumer calls `hydra.utils.instantiate` on this
# node — confirm against the training entry point.
model_class:
  _target_: hydra.utils.get_object
  path: moa_spec.models.train.moa_spec.MOASpecLlamaForCausalLM

# Model hyper-parameters.
# NOTE(review): presumably forwarded to the class named in `model_class` —
# confirm against the training entry point.
model_config:
  self_attention_num_key_value_heads: 4
  self_attention_intermediate_size: 6144
  cross_attention_num_key_value_heads: 4
  cross_attention_intermediate_size: 6144
  layer_self_attention_num_key_value_heads: 8
  layer_self_attention_intermediate_size: 3072
  # We tried values 0, 1 and 3 in our paper.
  target_layer_inference: 0
  # Vary the length of the future prediction.
  variable_future_length: true
  # Force the self-attention (SA) to focus on the current context at the start
  # of training.
  staircase_mask: true
