# timm model identifier. Per its name: ViT-Base, patch size 14, 4 register
# tokens, DINOv2 weights (LVD-142M).
# NOTE(review): the role of this model (e.g. teacher vs. backbone) is not
# visible from this file — confirm against the code that loads it.
timm_model_name: "timm/vit_base_patch14_reg4_dinov2.lvd142m"

# Batch sizes and data-loader worker counts for the training and validation
# splits.
# NOTE(review): whether the batch sizes are global or per-device is not
# visible here — confirm.
train_batch_size: 1024
valid_batch_size: 1024
train_loader_workers: 20
valid_loader_workers: 5

# Data-augmentation settings.
# NOTE(review): "rrc" presumably means random-resized-crop and the
# auto_augment value looks like a timm RandAugment spec string — confirm the
# accepted values in the consuming code.
random_crop: "rrc"
color_jitter: 0.0  # presumably 0.0 disables color jitter — confirm
auto_augment: "rand-m9-mstd0.5-inc1"
random_erasing: 0.25
augment_repeats: 1
test_crop_ratio: 0.875

# Loss selectors: one for the main criterion, one for the zoom map.
# NOTE(review): "mse" / "kl" presumably select mean-squared-error and
# KL-divergence — confirm the accepted values in the consuming code.
criterion: "mse"
zoom_map_criterion: "kl"

# Model dimensions (ViT-Base-sized: 12 layers, width 768, 12 heads).
# num_registers matches the "reg4" in timm_model_name.
# NOTE(review): `labels` is presumably the number of output classes — confirm.
layers: 12
dim: 768
heads: 12
labels: 1000
num_registers: 4

# Input geometry. Both image sizes are exact multiples of patch_size:
# 518 = 37 * 14 and 154 = 11 * 14.
# NOTE(review): zoomer_image_size / zoomer_depth presumably set the input
# resolution and layer count of a separate "zoomer" network — confirm.
patch_size: 14
image_size: 518
zoomer_image_size: 154
zoomer_depth: 3

# Random seeds, one per source of randomness named by the key (all 0 here).
init_seed: 0
mixup_seed: 0
dropout_seed: 0
shuffle_seed: 0
patch_select_seed: 0
# Both values below are empty strings.
# NOTE(review): "" presumably means "not set" — confirm how the consumer
# treats an empty string.
pretrained_ckpt: ""
label_mapping: ""

# Gradient accumulation and optimizer hyper-parameters.
grad_accum: 1
optimizer:
  name: "adamw"
  # Written with an explicit decimal point: YAML 1.1 loaders such as PyYAML
  # only resolve exponent notation to a float when the mantissa contains a
  # ".", so a bare `2e-4` would be loaded as the string "2e-4".
  lr: 2.0e-4
  weight_decay: 0.05
  betas: [0.9, 0.999]
  eps: 1.0e-8  # explicit decimal point for the same reason as lr
  lr_decay: 1.0
  clip_grad: 0.0

# Training schedule; warmup_steps is 10% of training_steps.
# Values are written as plain digits: "_" digit separators are a YAML 1.1
# feature, and the YAML 1.2 core schema would read e.g. `50_000` as a string.
# NOTE(review): log_interval / eval_interval are presumably measured in
# steps — confirm.
warmup_steps: 50000
training_steps: 500000
log_interval: 100
eval_interval: 12511

# Dataset and patch-selection settings.
dataset_name: "imagenet"
# NOTE(review): YAML has no `None` literal — this value loads as the string
# "None", not as null. If the consumer expects a null here it should be
# written `null`; confirm how top_k is read before changing it.
top_k: None
# Lower and upper bounds.
# NOTE(review): presumably the range top_k is drawn from when top_k is not
# set — confirm.
top_k_range:
  min: 16
  max: 128
patch_selection_method: "topk-zoomer"
# One numeric value per distillation loss term (cls, reg, patch, map).
# NOTE(review): presumably multiplicative loss weights, with 0.0 disabling
# the reg term — confirm.
distillation_losses:
  cls: 1.0
  reg: 0.0
  patch: 1.0
  map: 0.1
# Token types used for conditioning.
conditioning_tokens: ["cls_token", "reg_token"]
# Feature-upsampling settings.
# NOTE(review): "NN" presumably selects a nearest-neighbour based method
# using K neighbours — confirm the accepted values in the consuming code.
upsample_features:
  type: "NN"  # quoted for consistency with the other string values here
  K: 5
  distance_power: 1

# Attention-aggregation settings: a list of token types and a layer selector.
# NOTE(review): the accepted values are defined by the consuming code —
# confirm.
attn_aggregate: ["patch_token"]
aggregate_layers: "last"

# NOT USED: the keys below are marked as unused.
# NOTE(review): confirm that no consumer still reads them before removing any.
# Booleans and floats are written in canonical form (true/false, 1.0) so
# they resolve the same way under every YAML loader.
do_mean: true
rl_coeff: 0.1
kl_coeff: 0.1
is_naive: false
gumbel_temperature: 0.5
gumbel_temperature_min: 0.1
gumbel_temperature_decay: 0.5
gumbel_temperature_decay_steps: 25000
gumbel_noise_coeff: 1.0
gumbel_noise_coeff_min: 0.1
gumbel_noise_coeff_decay: 0.999
gumbel_noise_coeff_decay_steps: 1000

# Project name.
# NOTE(review): presumably the experiment-tracking project this run is
# logged under — confirm.
project: "zoom_base"
