# Unlearning objective and attribution settings.
# Note: `none` below is the literal string "none", not a YAML null.
forget_loss: grad_ascent # choices: [grad_ascent, grad_diff, KL, idk, dpo]
attribution: none # choices: [none, g_prod, g_norm]
unification: none # choices: [none, power, exp]
# Embedded in save_dir as `tau${tau}`.
# NOTE(review): how tau enters the loss/attribution is not visible here - confirm.
tau: 0.03
# NOTE(review): presumably a JSON file of per-sample scores read when
# `attribution` is enabled - confirm against the loading code.
score_dict_path: influence_nonDA.json

# Checkpoint directory used as the starting model. It is also the parent of
# save_dir and is forwarded to eval.model_path.
model_path: 'checkpoints/ft_epoch5_lr1e-05_phi_full_wd0.01/checkpoint-5000'
# Output directory, built from the checkpoint path plus the run's settings.
# Quoted so the interpolated value is always read as a single string.
save_dir: '${model_path}/${split}_${forget_loss}_tau${tau}_lr${lr}_wd_${weight_decay}_ep${num_epochs}_bs${batch_size}_DA_${attribution}_${unification}'

# Optimisation hyperparameters; all four are interpolated into save_dir.
# lr is written with an explicit decimal point so that every YAML loader
# (including YAML 1.1 loaders such as PyYAML, which read a bare `1e-5` as a
# string) parses it as a float. The numeric value is unchanged.
lr: 1.0e-5
weight_decay: 0.01
batch_size: 1
num_epochs: 5

# Dataset and model-family selection.
data_path: locuslab/TOFU
gradient_accumulation_steps: 1
# Also forwarded to eval.model_family.
model_family: phi
# Base split name: interpolated into save_dir and, with a `_perturbed`
# suffix, into eval.split.
split: forget01

# LoRA adapter hyperparameters.
# NOTE(review): r is 0 here; presumably that disables LoRA so the full model
# is trained - confirm in the training script.
LoRA:
  r: 0
  alpha: 32
  dropout: 0.05

# Run-control flags.
# NOTE(review): presumably allows reuse of an existing save_dir - confirm.
overwrite_dir: true

save_model: true
# NOTE(review): presumably toggles the `eval` section below during training
# (eval_while_train) or skips training entirely (eval_only) - confirm.
eval_while_train: false
eval_only: false
seed: 42
use_flash_attention_2: true

# Parallelism settings
parallelism:
  # One of "pipeline", "tensor", "zero", or null for the default behaviour.
  # Kept as null (not the boolean false) so the value is always either a
  # strategy name or absent.
  strategy: null
  # For pipeline parallelism: number of stages the model is split into.
  num_stages: 4

# Evaluation settings. Entries written as ${..key} are relative
# interpolations that take the value of the top-level key of the same name.
eval:
  # retain_result: data/retain90_llama_wd0.01/eval_results/ds_size300/eval_log_aggregated.json
  model_path: ${..model_path}
  model_family: ${..model_family}
  save_dir: ${..save_dir}
  # The list-valued keys below each have four entries.
  # NOTE(review): presumably they are index-aligned, one entry per eval_task -
  # confirm in the evaluation code.
  data_path: [locuslab/TOFU, locuslab/TOFU, locuslab/TOFU, locuslab/TOFU]
  # Perturbed variant of the top-level split (e.g. forget01 -> forget01_perturbed).
  split: ${..split}_perturbed
  split_list:
    - retain_perturbed
    - real_authors_perturbed
    - world_facts_perturbed
    # Forget-set entry: points at eval.split so it is the perturbed split like
    # the other three entries. A bare ${split} is an absolute interpolation
    # and would resolve to the top-level, unperturbed split name.
    - ${eval.split}

  eval_task: [eval_log, eval_real_author_wo_options, eval_real_world_wo_options, eval_log_forget]
  question_key: [question, question, question, question]
  answer_key: [answer, answer, answer, answer]
  base_answer_key: [paraphrased_answer, answer, answer, paraphrased_answer]
  perturbed_answer_key: [perturbed_answer, perturbed_answer, perturbed_answer, perturbed_answer]

  # Text-generation limits used during evaluation.
  generation:
    max_length: 200
    max_new_tokens: null

  save_generated_text: true

  # NOTE(review): presumably caps the number of examples evaluated per split - confirm.
  ds_size: 300

  overwrite: true
  use_pretrained: false

  batch_size: 30
  # No retain-model result file configured; see the commented-out example
  # path at the top of this section.
  retain_result: null