# Run identity and input/output locations for this experiment.
project: diverse_retrieval
########################################################
# NOTE(review): save_path and name presumably combine into the run's output
# directory — confirm against the training script.
save_path: results/ambiguous_qe_inf/
name: test_save_single
# Relative path to the training dataset directory; what it is resolved against
# depends on the consumer — confirm.
train_path: training_datasets/ambiguous_qe/inf/autoregressive_ambiguous_qe_inf_train_dataset_1b_contrastive_2_to_5_ctxs/

# save and load
# NOTE(review): presumably a checkpoint is written every 250 training steps —
# confirm how the trainer counts steps.
save_every_n_steps: 250
embedding_model_dim: 1536
# Active checkpoints: both keys point at checkpoint_70000 under
# results/nq_inf/toy_contrastive.
adapter_path: results/nq_inf/toy_contrastive/checkpoint_70000   # "(stage 1)" alternative is listed below
linear_checkpoint_path: results/nq_inf/toy_contrastive/checkpoint_70000_linear.pt  # "(stage 1)" alternative is listed below
# (stage 1) alternative for the two keys above:
# adapter_path: results/qampari_inf/toy_qemb_from_nq/checkpoint_30000
# linear_checkpoint_path: results/qampari_inf/toy_qemb_from_nq/checkpoint_30000_linear.pt
# Empty-valued alternative (a bare `key:` loads as null); presumably means
# "start without a checkpoint" — confirm against the loader.
# adapter_path:
# linear_checkpoint_path:

# training
batch_size_training: 16
# NOTE(review): a value of 1 presumably means no gradient accumulation — confirm.
gradient_accumulation_steps: 1
num_epochs: 50
# Explicit !!float tag: YAML 1.1 loaders such as PyYAML read a bare 1e-4
# (no decimal point in the mantissa) as a string, so the tag forces a float.
lr: !!float "1e-4"
weight_decay: 0.01
# NOTE(review): presumably the fraction of total training steps spent in
# warmup — confirm against the scheduler setup.
warmup_ratio: 0.05
scheduler: linear
max_grad_norm: 1.0

# lora
# NOTE(review): if these feed a PEFT-style LoRA config, the effective scaling
# would be lora_alpha / lora_r = 16 / 64 = 0.25 — confirm the consumer.
lora_alpha: 16
lora_r: 64
lora_dropout: 0.1

# some options
# Boolean switches, written as canonical lowercase true/false so every YAML
# loader and linter reads them as booleans.
shuffle_sequence: true
train_on_all_data: false
save_only_improve: true
take_first: false

# model architecture
temperature: 0.05
loss_function: Contrastive  # options: MSE, Hungarian_MSE, Contrastive, Hungarian_Contrastive
# Boolean switches use canonical lowercase true/false.
extra_q_embed: false
compute_loss_on_q: false
use_eos: false

# data loading
question_only: false

########################################################
# not used that often
# Boolean switches use canonical lowercase true/false.
only_eval: false
debug: false
# NOTE(review): the *_stage keys presumably control a staged training
# curriculum — confirm against the training script.
epochs_per_stage: 3
max_latent_stage: 3
pad_latent_to_max: true

uniform_prob: 0.0
# Base model identifier in org/name form.
model_id: meta-llama/Llama-3.2-1B-Instruct
seed: 0
# NOTE(review): presumably the epoch/step to resume from, with 0 meaning a
# fresh start — confirm against the training script.
resume: 0
reset_optimizer: true

# Optimizer settings.
beta1: 0.9
beta2: 0.98
optim: adamw
lr_min_ratio: 0.0
# Explicit !!float tag: YAML 1.1 loaders such as PyYAML read a bare 1e-6
# (no decimal point in the mantissa) as a string, so the tag forces a float.
eps: !!float "1e-6"
weight_tying: false