# Run identity and data locations for a single training run.
# NOTE(review): the code that consumes these keys is not part of this file;
# the notes below are read off the key names and values -- confirm against the loader.
project: diverse_retrieval
########################################################
# Output root for this run (presumably combined with `name` -- verify in the trainer).
save_path: results/ambiguous_qe_inf/
# Run name. It repeats settings that are also configured in this file
# (lr 2e-5, num_epochs 20, temperature 0.05, warmup_ratio 0.05);
# update it whenever one of those values changes.
name: toy_contrastive_4_gpus_from_stage2_lr2e5_ep20_temp0.05_warmup0.05
# Location of the training dataset (a directory path, judging by the trailing slash).
train_path: training_datasets/ambiguous_qe/inf/autoregressive_ambiguous_qe_inf_train_dataset_1b_contrastive_2_to_5_ctxs/

# save and load
save_every_n_steps: 50
embedding_model_dim: 1536
# Checkpoint pair currently in use: an adapter checkpoint plus its matching
# `_linear.pt` file.
# NOTE(review): the run name says "from_stage2", while the alternative below is
# labelled "(stage 1)" -- confirm this is the intended starting checkpoint.
adapter_path: results/nq_inf/toy_contrastive/checkpoint_70000
linear_checkpoint_path: results/nq_inf/toy_contrastive/checkpoint_70000_linear.pt
# (stage 1) alternative pair -- uncomment to use instead of the paths above:
# adapter_path: results/qampari_inf/toy_qemb_from_nq/checkpoint_30000
# linear_checkpoint_path: results/qampari_inf/toy_qemb_from_nq/checkpoint_30000_linear.pt
# Empty variant (presumably starts without loading a checkpoint -- confirm):
# adapter_path:
# linear_checkpoint_path:

# training
# NOTE(review): the run name mentions 4 GPUs; whether this batch size is
# per device or global is not visible in this file -- confirm in the trainer.
batch_size_training: 32
gradient_accumulation_steps: 1
num_epochs: 20
# The explicit !!float tag is deliberate: YAML 1.1 loaders such as PyYAML
# resolve a bare `2e-5` (no decimal point) as a string, not a float.
lr: !!float "2e-5"
weight_decay: 0.01
warmup_ratio: 0.05
scheduler: linear
max_grad_norm: 1.0

# Training behaviour toggles (all booleans).
# Written in canonical lowercase form so every YAML loader and linter reads
# them as booleans; the parsed values are the same as the former True/False.
# NOTE(review): the meaning of each flag is defined by the training code,
# which is not part of this file.
shuffle_sequence: true
train_on_all_data: false
save_only_improve: true
take_first: false

# model architecture
# Must match the `temp0.05` fragment of the run name.
temperature: 0.05
# Allowed values: MSE, Hungarian_MSE, Contrastive, Hungarian_Contrastive
loss_function: Hungarian_Contrastive
# Boolean switches, written in canonical lowercase form (same parsed values
# as the former True/False spelling).
extra_q_embed: false
compute_loss_on_q: false
use_eos: false

# data loading
# Boolean in canonical lowercase form (parses to the same value as `False`).
question_only: false

########################################################
# Rarely changed settings.
# Booleans use the canonical lowercase form; the parsed values are the same
# as the former True/False spelling.
debug: false
epochs_per_stage: 3
max_latent_stage: 3
pad_latent_to_max: true

uniform_prob: 0.0
model_id: meta-llama/Llama-3.2-1B-Instruct
seed: 0
# Integer, not a boolean (presumably a step or epoch to resume from -- confirm).
resume: 0
reset_optimizer: true

# Optimizer hyperparameters.
beta1: 0.9
beta2: 0.98
optim: adamw
lr_min_ratio: 0.0
# The explicit !!float tag is deliberate: YAML 1.1 loaders such as PyYAML
# resolve a bare `1e-6` (no decimal point) as a string, not a float.
eps: !!float "1e-6"
weight_tying: false