# Tested on RSC: /checkpoint/dino/qas/rope/vitl16_im1k/ (same path as train.output_dir).
# Results: 82.2 im1k-knn, 83.3 im1k-linear.
# Runs with a total batch size of 2048 (64 per GPU, i.e. 32 GPUs; 4 nodes here).
# Speed: 0.57 s/iter.
# Top-level model settings: meta-architecture name, device, weights path, and dtype.
# NOTE(review): DTYPE is float32 here while compute_precision.param_dtype is bf16 —
# presumably they apply at different stages; confirm against the consumer.
MODEL:
  META_ARCHITECTURE: SSLMetaArch
  DEVICE: cuda
  # Left as an empty string in this config.
  WEIGHTS: ''
  DTYPE: float32
# Precision and sharding settings.
# NOTE(review): key names suggest FSDP-style mixed precision — confirm against the consumer.
compute_precision:
  param_dtype: bf16
  # Reduction dtype is wider than param_dtype (fp32 vs bf16).
  reduce_dtype: fp32
  sharding_strategy: SHARD_GRAD_OP
# DINO loss settings: loss weight, head hyperparameters, and KoLeo loss options.
dino:
  loss_weight: 1.0
  global_ignore_diagonal: true
  # The head_* values below are identical to the ibot section's head_* values.
  head_n_prototypes: 65536
  head_bottleneck_dim: 256
  head_norm_last_layer: false
  head_nlayers: 3
  head_hidden_dim: 2048
  koleo_loss_weight: 0.1
  # The distributed KoLeo flag is false and the replica count is 0.
  koleo_loss_distributed: false
  koleo_topk: 1
  koleo_distributed_replicas: 0
  force_weight_norm: false
# iBOT loss settings: loss weight, masking parameters, and head hyperparameters.
ibot:
  loss_weight: 1.0
  mask_sample_probability: 0.5
  # Two-element range; order follows the key name (min, then max).
  mask_ratio_min_max: [0.1, 0.5]
  mask_random_circular_shift: false
  force_masking_even_with_zero_weight: false
  # separate_head is true; the head_* values below equal the dino section's head_* values.
  separate_head: true
  head_n_prototypes: 65536
  head_bottleneck_dim: 256
  head_norm_last_layer: false
  head_nlayers: 3
  head_hidden_dim: 2048
# Training-run settings: data, output location, epoch length, and runtime toggles.
train:
  # The file header states a total batch size of 2048 at 64 per GPU.
  batch_size_per_gpu: 64
  dataset_path: ImageNet:split=TRAIN
  # Same path as the one given in the file header.
  output_dir: /checkpoint/dino/qas/rope/vitl16_im1k
  # NOTE(review): a top-level `checkpointing.period` is also set in this file —
  # confirm which of the two the trainer uses.
  saveckp_freq: 20
  seed: 0
  num_workers: 10
  # evaluation.eval_period_iterations (12500) and checkpointing.period (3750)
  # are both integer multiples of this value.
  OFFICIAL_EPOCH_LENGTH: 1250
  monitor_gradient_norm: false
  # Empty list in this config.
  chunk_schedule: []
  cache_dataset: true
  use_teacher_head: true
  learn_from_teacher_tokens: false
  centering: sinkhorn_knopp
  # NOTE(review): distinct from the top-level `checkpointing` section; the meaning
  # of this flag is not visible here — confirm.
  checkpointing: false
  compile: true
  cudagraphs: false
  # cell_augmentation is false; a type is still set, presumably unused while it is off — confirm.
  cell_augmentation: false
  cell_augmentation_type: hpa
# Student network settings: ViT-Large with patch size 16 and `rope` position embeddings.
student:
  arch: vit_large
  patch_size: 16
  drop_path_rate: 0.3
  layerscale: 1.0e-05
  patch_drop: 0.0
  # Left as an empty string in this config.
  pretrained_weights: ''
  ffn_layer: mlp
  ffn_ratio: 4.0
  # Left as an empty string in this config.
  resume_from_teacher_chkpt: ''
  qkv_bias: true
  proj_bias: true
  ffn_bias: true
  norm_layer: layernorm
  n_storage_tokens: 0
  mask_k_bias: false
  in_chans: 3
  # Position-embedding options. Keys set to null carry no explicit value in this config.
  pos_embed_type: rope
  pos_embed_rope_base: 100.0
  pos_embed_rope_min_period: null
  pos_embed_rope_max_period: null
  pos_embed_rope_normalize_coords: separate  # options: min, max, separate
  pos_embed_rope_shift_coords: null
  pos_embed_rope_jitter_coords: null
  pos_embed_rope_rescale_coords: null
  pos_embed_rope_dtype: bf16
  # Written as canonical lowercase `false`, matching every other boolean in this file.
  fp8_enabled: false  # Convert Linear layers to operate in fp8 precision
  fp8_filter: 'blocks'  # Regex that must appear in module path; empty means everything
# Teacher settings: momentum values and teacher temperature warmup values.
# How these are scheduled over training is not visible in this file.
teacher:
  momentum_teacher: 0.992
  # NOTE(review): written as the integer 1 while momentum_teacher is a float —
  # confirm the consumer accepts an int here.
  final_momentum_teacher: 1
  # Warmup temperature (0.04) is lower than teacher_temp (0.07).
  warmup_teacher_temp: 0.04
  teacher_temp: 0.07
  warmup_teacher_temp_epochs: 30
  # Same value as student.in_chans.
  in_chans: 3
# Distillation settings: `enabled` is false and both paths are empty strings in this config.
distillation:
  enabled: false
  full_cfg_path: ''
  checkpoint_path: ''
# Multi-distillation settings: `enabled` is false in this config.
multidistillation:
  enabled: false
# hrft settings: `enabled` is false and the checkpoint path is an empty string.
# NOTE(review): the acronym is not expanded anywhere in this file — confirm its meaning with the consumer.
hrft:
  enabled: false
  checkpoint_path: ''
# Optimizer settings: AdamW with learning-rate, weight-decay, and warmup values.
optim:
  epochs: 100
  optimizer: adamw
  # weight_decay_end is 10x weight_decay (0.04 -> 0.4).
  weight_decay: 0.04
  weight_decay_end: 0.4
  lr: 0.001
  # 10 of the 100 configured epochs.
  warmup_epochs: 10
  min_lr: 1.0e-06
  clip_grad: 3.0
  freeze_last_layer_epochs: 1
  # NOTE(review): the name suggests lr is scaled by sqrt(total batch size / 1024) —
  # confirm in the trainer before changing the batch size.
  scaling_rule: sqrt_wrt_1024
  patch_embed_lr_mult: 0.2
  dino_head_wd_multiplier: 1.0
  layerwise_decay: 0.9
  multi_tensor_optim: true
  # Left as an empty string in this config.
  dump_fsdp_weights_path: ''
  adamw_beta1: 0.9
  adamw_beta2: 0.999
# Crop settings: scale ranges, counts, and sizes for global and local crops.
crops:
  # The local scale range ends at 0.32, where the global scale range begins.
  global_crops_scale: [0.32, 1.0]
  local_crops_number: 8
  local_crops_scale: [0.05, 0.32]
  global_crops_size: 224
  local_crops_size: 96
  localcrops_subset_of_globalcrops: false
  share_color_jitter: false
  horizontal_flips: true
# Evaluation settings: evaluation period and the benchmark config files to use.
evaluation:
  # 12500 = 10 x train.OFFICIAL_EPOCH_LENGTH (1250).
  eval_period_iterations: 12500
  # NOTE(review): presumably the low-frequency benchmarks run on every 5th evaluation — confirm.
  low_freq_every: 5
  config_files:
    high_freq: benchmark_high_frequency.yaml
    low_freq: benchmark_low_frequency.yaml
# Checkpoint settings: save period and retention count.
# NOTE(review): train.saveckp_freq (20) and train.checkpointing (false) are separate
# keys elsewhere in this file — confirm how the trainer combines them with this section.
checkpointing:
  # 3750 = 3 x train.OFFICIAL_EPOCH_LENGTH (1250).
  period: 3750
  max_to_keep: 3