# Hydra defaults list: pull in the `cdiv_cont_dem` entry of the
# `target_models` config group.
defaults:
  - target_models: cdiv_cont_dem

# Training length and warm-up.
epochs: 40 # wall-clock reference (presumably from past runs): 20 epochs ~ 8h; 40 epochs ~ 16h
# Warm-up length as a ratio (presumably of the total number of training steps).
# NOTE(review): lr_scheduler.linear_warmup_ratio is also 0.1 -- confirm whether
# the two values must be kept in sync.
warm_up_steps_ratio: 0.1

# Learning rate, batch and data-loading settings.
# The mantissa carries an explicit ".0" so that YAML 1.1 loaders (e.g. plain
# PyYAML) also resolve the value as a float instead of the string "2e-4".
base_lr: 2.0e-4
batch_size: 32
# NOTE(review): -1 looks like a "disabled / use all" sentinel -- confirm in the
# code that consumes this key.
random_target_models: -1
num_workers: 16 # alternative value: 8
# Reference batch size and world size used to scale the learning rate;
# rarely changed.
base_batch_size: 64
base_world_size: 8

# Weight decay coefficient. It is not set inside the `optimizer:` mapping, so it
# is presumably passed to the optimizer by the training code -- TODO confirm.
weight_decay: 0.01


# Optimizer spec: `_target_` names the class to build (Hydra-style
# instantiation). Only `betas` is fixed here; the learning rate and weight
# decay are separate top-level keys and are not part of this mapping.
optimizer:
  _target_: torch.optim.AdamW
  betas: [0.9, 0.999]

# Learning-rate schedule, built by the `_target_` factory below.
# NOTE(review): judging by the factory name, the phases are linear warm-up,
# a constant plateau, then cosine decay -- confirm in theia.lr_schedulers.
lr_scheduler:
  _target_: theia.lr_schedulers.get_cosine_lrs_with_linear_warmup_and_constant
  linear_warmup_ratio: 0.1
  constant_ratio: 0.3
  # Exponent floats carry an explicit ".0" so that YAML 1.1 loaders (e.g. plain
  # PyYAML) resolve them as floats rather than strings.
  warm_up_lr_start_factor: 1.0e-2
  warm_up_lr_end_factor: 1.0
  eta_min: 1.0e-5

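# Alternative scheduler, currently disabled (name suggests a constant LR after
# a linear warm-up). Uncomment to use it instead of the stanza above.
# lr_scheduler:
#   _target_: theia.lr_schedulers.get_constant_lrs_with_linear_warm_up
#   warm_up_lr_start_factor: 1e-2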


# Gradient clipping: on/off switch plus two norm thresholds.
# NOTE(review): presumably grad_clip_norm_warmup applies during warm-up and
# grad_clip_norm afterwards -- confirm in the training loop.
grad_clip: false
grad_clip_norm_warmup: 10.0
grad_clip_norm: 1.0

# Translator freezing and learning-rate scaling.
# NOTE(review): presumably the start ratio is a fraction of total training
# steps and the factor multiplies the base learning rate for the translator
# parameters -- confirm where these keys are consumed.
freeze_translator: false
freeze_translator_start_steps_ratio: 0.2
translator_lr_factor: 1.0

# Identifier of the main training loss.
# NOTE(review): "cos_l1" presumably combines a cosine term and an L1 term --
# confirm against the code that looks up this name.
main_loss: cos_l1
