# Label for this configuration.
name: qwen3

# Model-loading options.
# NOTE(review): the key names match Hugging Face `from_pretrained` keyword
# arguments; presumably they are forwarded there -- confirm against the loader.
model:
  pretrained_model_name_or_path: 'qwen/Qwen3-8B-Base'
  # Set this to the directory where checkpoint weights should be saved.
  cache_dir: '/HF_Cache/'
  return_dict: true
  # Both quantized-loading switches are disabled in this config.
  load_in_8bit: false
  load_in_4bit: false
  device_map: cpu
  low_cpu_mem_usage: true
  torch_dtype: bfloat16
  attn_implementation: flash_attention_2
  rope_theta: 1000000

# Attention-layer options: attention variant, feature map, learned kernel,
# and the `kvquant` sub-section.
attention:
  attention_type: kv_linc
  affine_attention_factors: false
  feature_map: softmax_dim
  feature_map_kwargs:
    # Keep the explicit mantissa dot: YAML 1.1 loaders such as PyYAML resolve
    # a bare `1e-12` as the string "1e-12" instead of a float.
    eps: 1.0e-12
    fullspace: true
  learned_kernel: untied_head_einsum
  learned_kernel_kwargs:
    feature_dim: 128
    skip_connection: false
    bias: false
    zero_init: false
  # NOTE(review): presumably quantization settings for the KV cache (group
  # size, bit width, unquantized residual length) -- confirm with the consumer.
  kvquant:
    q_group_size: 128
    nbits: 2
    residual_length: 128
  # Canonical lowercase boolean, consistent with the rest of this file.
  feature_map_before_repeat: true
  apply_rotations: value