# Backbone identifier. NOTE(review): the "timm/" prefix suggests a timm /
# hub-style model name — confirm which loader consumes it.
timm_model_name: "timm/vit_base_patch14_reg4_dinov2.lvd142m"

# Batch sizes and data-loader worker counts for the train and valid splits.
train_batch_size: 2
valid_batch_size: 2
train_loader_workers: 10
valid_loader_workers: 2

# Augmentation settings.
# NOTE(review): "rrc" presumably means random-resized-crop — confirm against
# the data pipeline.
random_crop: "rrc"
color_jitter: 0.0
# Looks like a timm-style RandAugment spec string — confirm which parser
# consumes it.
auto_augment: "rand-m9-mstd0.5-inc1"
random_erasing: 0.25
augment_repeats: 3
test_crop_ratio: 0.875

# Loss selectors. NOTE(review): "ce" / "kl" presumably select cross-entropy /
# KL divergence — confirm against the training code.
criterion: "ce"
zoom_map_criterion: "kl"

# Model shape. dim / heads = 768 / 12 = 64.
# num_registers (4) and patch_size (14) match the "reg4" and "patch14" parts
# of timm_model_name.
layers: 12
dim: 768
heads: 12
labels: 1000
num_registers: 4

# Both image sizes are exact multiples of patch_size:
# 518 = 37 * 14 and 154 = 11 * 14.
patch_size: 14
image_size: 518
zoomer_image_size: 154
zoomer_depth: 3

# One seed per named source of randomness; all are 0 in this config.
init_seed: 0
mixup_seed: 0
dropout_seed: 0
shuffle_seed: 0
patch_select_seed: 0
# Both are empty strings. NOTE(review): presumably "" means "not set" —
# confirm how the consumer treats an empty value.
pretrained_ckpt: ""
label_mapping: ""

grad_accum: 1
optimizer:
  name: "adamw"
  # Exponent-form floats carry an explicit ".0" mantissa: YAML 1.1 loaders
  # such as PyYAML resolve a bare `1e-3` as the string "1e-3", not a float.
  lr: 1.0e-3
  weight_decay: 0.05
  betas: [0.9, 0.999]
  eps: 1.0e-8
  lr_decay: 1.0
  # NOTE(review): 0.0 presumably disables gradient clipping — confirm.
  clip_grad: 0.0

# Step-count schedule. In this config log_interval equals training_steps and
# eval_interval is one fifth of training_steps.
warmup_steps: 10
training_steps: 50
log_interval: 50
eval_interval: 10

dataset_name: "test"
# Use YAML's `null`: a bare `None` is loaded as the string "None", not a null.
# NOTE(review): presumably a null top_k means k is taken from top_k_range —
# confirm against the consumer.
top_k: null
top_k_range:
  min: 1
  max: 10
patch_selection_method: "topk-zoomer"
# One value per distillation term (cls / reg / patch / map).
# NOTE(review): presumably loss weights — confirm.
distillation_losses:
  cls: 1.0
  reg: 1.0
  patch: 1.0
  map: 1.0
# Alternative setting: ["cls_token", "reg_token"]
conditioning_tokens:
  - "cls_token"
upsample_features:
  type: "NN"
  K: 5
  distance_power: 2
attn_aggregate:
  - "cls_token"
  - "reg_token"
  - "patch_token"
aggregate_layers: "all"

# Reinforce only
rl_coeff: 0.1
kl_coeff: 0.1
standardize_advantage: true
advantage_clip_coeff: 0.03
# Clip below this value; ln(1e-6) ~ -13.8.
# Written with an explicit ".0" mantissa so YAML 1.1 loaders such as PyYAML
# resolve it as a float rather than the string "1e-6".
logprobs_min_prob: 1.0e-6
is_naive: true

do_mean: true

# Gaussian only
group_size: 4

# Shared by Gumbel and Reinforce: temperature and its decay settings
gumbel_temperature: 0.5
gumbel_temperature_min: 0.1
gumbel_temperature_decay: 0.999
gumbel_temperature_decay_steps: 1000

# Gumbel only: noise coefficient and its decay settings
gumbel_noise_coeff: 1.0
gumbel_noise_coeff_min: 0.1
gumbel_noise_coeff_decay: 0.999
gumbel_noise_coeff_decay_steps: 1000

# Empty string in this config. NOTE(review): presumably an experiment-tracker
# project name — confirm what an empty value means to the consumer.
project: ""
