# Hydra run directory: <output_dir>/<date>/<time>, built from the top-level
# `output_dir` key and the `now` resolver.
hydra:
  run:
    dir: '${output_dir}/${now:%Y-%m-%d}/${now:%H-%M-%S}'

# Base language model settings. Key names mirror Hugging Face
# `from_pretrained` arguments — presumably forwarded there; TODO confirm.
model:
  pretrained_model_name_or_path: 'Qwen/Qwen3-1.7B'
  device_map: 'cuda'
  # NOTE(review): when passed to Hugging Face `from_pretrained`, this allows
  # Python code shipped in the model repo to run — confirm it is required.
  trust_remote_code: true
  # YAML reads this as the plain string "torch.float16", not a dtype object;
  # the consuming code has to convert it — TODO confirm where.
  torch_dtype: 'torch.float16'
# Settings for the `ue_layer` component and its head (`head_cfg`).
ue_layer:
  # Empty string: no path is configured here.
  path: ''
  pos_weight: 3  # presumably the positive-class loss weight — TODO confirm
  output_attention: true
  head_cfg:
    head_type: claim
    # Feature extractors are listed by dotted name; the other keys in each
    # entry are presumably that extractor's own options — TODO confirm.
    feature_extractor:
      - name: luh.feature_extractors.basic_attention
        layer_nums: all
        attn_history_sz: 5
        pool: false
      - name: luh.feature_extractors.token_probabilities
        top_n: 4
    # Hyperparameters grouped under the `uncertainty_head` key.
    uncertainty_head:
      head_dim: 768
      n_layers: 2
      n_heads: 8
      dropout: 0.1
      mask_future_tokens: true
# Dataset location, prompt file, split settings and label field names.
dataset:
  # Quoted because the value contains a colon (`hf:` prefix).
  path: 'hf:user/train_gsm8k_Qwen3-1.7B'
  prompt_path: 'configs/qwen3_prompt.txt'
  num_instances: 0  # NOTE(review): 0 presumably means "no limit" — confirm
  test_size: 0.1
  validation: test  # presumably the split used for validation — TODO confirm
  label_key_claim: verified
  label_key_token: uncertainty_labels
# No extra test datasets are configured. Written as an explicit `null` (the
# value a bare `key:` already parses to) so the intent is unambiguous and the
# style matches `deepspeed_config` / `local_rank` further down.
# NOTE(review): if the consumer iterates over this value, an empty list `[]`
# may be what is actually intended — confirm against the training script.
additional_test_datasets: null
# Trainer hyperparameters. Key names match Hugging Face `TrainingArguments`
# fields — presumably passed through to it; TODO confirm.
training_arguments:
  # Schedule
  num_train_epochs: 10
  eval_strategy: epoch
  save_total_limit: 1
  # Optimisation
  learning_rate: 0.0001
  warmup_ratio: 0.05
  weight_decay: 0.1
  max_grad_norm: 1.0
  # Batching: 2 samples per device with 16 accumulation steps (32 per device
  # per optimizer step under Hugging Face Trainer semantics — TODO confirm).
  per_device_train_batch_size: 2
  gradient_accumulation_steps: 16
# Pipeline stage switches.
do_train: true
# NOTE(review): `training_arguments.eval_strategy` is `epoch` while this flag
# is false — confirm which of the two governs evaluation.
do_eval: false
do_predict: false
do_hyperopt: false

# Output handling.
do_save_checkpoints: false
do_save_final_model: true
# Referenced by `hydra.run.dir` through `${output_dir}`.
output_dir: ./workdir/train
report_to: wandb

# Distributed-run settings; both are unset here.
deepspeed_config: null
local_rank: null
