# Run identity and data locations.
project: diverse_retrieval
########################################################
# Output directory and run name. The name appears to encode settings found
# elsewhere in this file (Contrastive loss, lr 1e-4, 30 epochs, temperature
# 0.05, warmup 0.05, max grad norm 5, batch size 16, shuffle_sequence off);
# keep it in sync when those values change.
save_path: results/gaussian_synthetic_inf/
name: toy_contrastive_ordered_lr1e4_ep30_temp0.05_warmup0.05_gradnorm5_bs16_no_shuffle
# Training dataset path (relative; presumably resolved against the launch
# directory -- confirm against the data loader).
train_path: training_datasets/gaussian_synthetic/inf/gaussian_synthetic_train_dataset_1b_contrastive

# save and load
save_every_n_steps: 2500
embedding_model_dim: 1024
# The commented-out adapter_path / linear_checkpoint_path pairs are alternative
# checkpoints kept for reference. The active pair is an explicit null (no path
# configured), which parses the same as a bare empty value but cannot be
# mistaken for an accidentally truncated line.
# adapter_path: results/nq_inf/toy_contrastive/checkpoint_70000   # (stage 1) adapter_path: results/qampari_inf/toy_qemb_from_nq/checkpoint_30000
# linear_checkpoint_path: results/nq_inf/toy_contrastive/checkpoint_70000_linear.pt  # (stage 1) linear_checkpoint_path: results/qampari_inf/toy_qemb_from_nq/checkpoint_30000_linear.pt
# adapter_path: results/qampari_inf/toy_qemb_from_nq/checkpoint_30000
# linear_checkpoint_path: results/qampari_inf/toy_qemb_from_nq/checkpoint_30000_linear.pt
adapter_path: null
linear_checkpoint_path: null
# adapter_path: results/gaussian_synthetic_inf/toy_mse_lr2e5_ep20_temp0.05_warmup0.05_gradnorm1_bs16/best_model
# linear_checkpoint_path: results/gaussian_synthetic_inf/toy_mse_lr2e5_ep20_temp0.05_warmup0.05_gradnorm1_bs16/best_model_linear.pt

# training
batch_size_training: 16
gradient_accumulation_steps: 1
num_epochs: 30
# The explicit !!float tag forces a float. YAML 1.1 loaders such as PyYAML
# resolve a bare 1e-4 (no decimal point) as a string, not a number.
lr: !!float "1e-4"
weight_decay: 0.01
# NOTE(review): presumably the fraction of total steps spent warming up the
# learning rate -- confirm against the scheduler setup.
warmup_ratio: 0.05
scheduler: linear
max_grad_norm: 5.0

# lora
# NOTE(review): presumably the LoRA scaling factor (lora_alpha), rank (lora_r)
# and adapter dropout probability -- confirm against the code that builds the
# adapter config.
lora_alpha: 16
lora_r: 64
lora_dropout: 0.1

# some options
# Booleans use the canonical lowercase form, which every YAML 1.1 / 1.2 loader
# resolves to a bool.
shuffle_sequence: false
train_on_all_data: false
save_only_improve: true
take_first: false

# model architecture
temperature: 0.05
# Listed alternatives: MSE, Hungarian_MSE, Contrastive, Hungarian_Contrastive
loss_function: Contrastive
# Booleans use the canonical lowercase form (true / false).
extra_q_embed: false
compute_loss_on_q: false
use_eos: false

# data loading
question_only: false

########################################################
# not used that often
# Booleans use the canonical lowercase form (true / false).
only_eval: false
debug: false
epochs_per_stage: 3
max_latent_stage: 3
pad_latent_to_max: true

uniform_prob: 0.0
model_id: meta-llama/Llama-3.2-1B-Instruct
seed: 0
resume: 0
reset_optimizer: true

beta1: 0.9
beta2: 0.98
optim: adamw
lr_min_ratio: 0.0
# The explicit !!float tag forces a float. YAML 1.1 loaders such as PyYAML
# resolve a bare 1e-6 (no decimal point) as a string, not a number.
eps: !!float "1e-6"
weight_tying: false