# Model-level settings: meta-architecture, target device, initial weights
# path, and dtype.
MODEL:
  # Multi-distillation meta-architecture; the `distillation` and
  # `multidistillation` sections of this file are both enabled.
  META_ARCHITECTURE: MultiDistillationMetaArch
  DEVICE: cuda
  # Empty string — presumably "no initial weights to load"; confirm against
  # the consumer of this key.
  WEIGHTS: ''
  # NOTE(review): float32 here while compute_precision.param_dtype is bf16;
  # confirm how the two settings interact.
  DTYPE: float32
# Precision and sharding options.
compute_precision:
  # bf16 for parameters, fp32 for reductions (presumably gradient
  # reductions across ranks — TODO confirm).
  param_dtype: bf16
  reduce_dtype: fp32
  # Matches the name of a PyTorch FSDP ShardingStrategy member; presumably
  # passed through to an FSDP wrapper — confirm.
  sharding_strategy: SHARD_GRAD_OP
# DINO objective: loss weight, head sizing (head_* keys), KoLeo options
# (koleo_* keys), and the weighting of the local-crop loss.
dino:
  loss_weight: 1.0
  global_ignore_diagonal: true
  # head_* keys size the DINO head: 3 layers, hidden dim 8192, bottleneck
  # dim 512, 262144 prototypes (exact layer layout not visible here).
  head_n_prototypes: 262144
  head_bottleneck_dim: 512
  head_norm_last_layer: false
  head_nlayers: 3
  head_hidden_dim: 8192
  koleo_loss_weight: 0.1
  # Distributed KoLeo is off; the koleo_distributed_* keys below presumably
  # have no effect while this is false — TODO confirm.
  koleo_loss_distributed: false
  koleo_topk: 1
  koleo_distributed_replicas: 0
  koleo_distributed_loss_group_size: null
  koleo_distributed_loss_group_data: true
  force_weight_norm: false
  # NOTE(review): presumably gates use of local_loss_weight_schedule below;
  # verify against the consumer.
  reweight_dino_local_loss: false
  # start == peak == end == 0.5 with no warmup, i.e. a constant 0.5.
  local_loss_weight_schedule:
    start: 0.5
    peak: 0.5
    end: 0.5
    warmup_epochs: 0
# iBOT objective: loss weight, patch-masking parameters, and head sizing.
ibot:
  loss_weight: 1.0
  # Presumably the probability that a given sample gets masked — confirm.
  mask_sample_probability: 0.5
  # [min, max] bounds for the mask ratio: 0.1 to 0.5.
  mask_ratio_min_max:
  - 0.1
  - 0.5
  mask_random_circular_shift: false
  force_masking_even_with_zero_weight: false
  # iBOT gets its own head rather than sharing the DINO one (per the key
  # name); the head_* keys below are smaller than dino's (98304 prototypes,
  # bottleneck 384, hidden 4096).
  separate_head: true
  head_n_prototypes: 98304
  head_bottleneck_dim: 384
  head_norm_last_layer: false
  head_nlayers: 3
  head_hidden_dim: 4096
# Coding-rate loss options. Both use_* switches are false, so the weights
# and epsilon below are presumably inert in this run — TODO confirm.
coding_rate_loss:
  use_cls_loss: false
  cls_loss_weight: 0.2
  use_masked_patches_loss: false
  masked_patches_loss_weight: 0.1
  epsilon: 8
# Gram loss options. `use_loss` and `compute_stats` are both false, so the
# remaining keys are presumably unused in this run — TODO confirm.
gram:
  use_loss: false
  compute_stats: false
  loss_weight: 1.0
  ema_teacher: false
  # No gram-teacher checkpoint is configured.
  ckpt: null
  # NOTE(review): -1 looks like a "never" sentinel; verify against the
  # consumer.
  it_load_ema_teacher: -1
  rep_update: true
  update_frequency: 50000
  it_first_update: 0
  max_updates: null
  normalized: true
  img_level: false
  remove_neg: false
  remove_only_teacher_neg: false
  tokens_used: all
  global_teacher_resize_method: bicubic
  global_teacher_resize_antialias: false
  loss_weight_schedule: null
# Training-loop settings: data, output location, seeding, and runtime
# switches.
train:
  # NOTE(review): multidistillation.global_batch_size (1920) is also set in
  # this file; confirm which value takes effect.
  batch_size_per_gpu: 3
  # Placeholders — replace with real values before launching.
  dataset_path: <TRAIN/DATASET>
  output_dir: <OUTPUT/DIR>
  # Equals optim.epochs (20) in this file.
  saveckp_freq: 20
  seed: 0
  num_workers: 2
  # Presumably iterations per epoch — TODO confirm.
  OFFICIAL_EPOCH_LENGTH: 1250
  monitor_gradient_norm: false
  chunk_schedule: []
  cache_dataset: true
  use_teacher_head: true
  learn_from_teacher_tokens: false
  centering: sinkhorn_knopp
  # NOTE(review): boolean flags here, whereas the top-level `checkpointing`
  # section holds period/max_to_keep/keep_every. Presumably these two toggle
  # activation checkpointing rather than checkpoint saving — confirm.
  checkpointing: true
  checkpointing_full: true
  compile: true
  cudagraphs: false
  cell_augmentation: false
  cell_augmentation_type: hpa
  sharded_eval_checkpoint: false
# Student backbone settings.
# NOTE(review): multidistillation.students points each student at its own
# config file; confirm how this section combines with those files.
student:
  arch: vit_large
  patch_size: 16
  # Drop-path and patch-drop rates are both 0.0.
  drop_path_rate: 0.0
  layerscale: 1.0e-05
  drop_path_uniform: true
  drop_path_shape: uniform
  patch_drop: 0.0
  pretrained_weights: ''
  # The following positional-embedding variants are all switched off;
  # pos_embed_type further down selects `ropenew`.
  sin_cos_embeddings: false
  fourier_embeddings: false
  fourier_encoding_dim: 64
  multiple_pos_embeddings: false
  cls_pos_embedding: false
  reg_pos_embedding: false
  ffn_layer: mlp
  ffn_ratio: 4.0
  # Placeholder — replace with a real path before launching.
  # NOTE(review): the placeholder mentions HRFT while hrft.enabled is false
  # in this file; confirm this key is still wanted.
  resume_from_teacher_chkpt: <PATH/TO/HRFT/TEACHER>
  block_chunks: 0
  qkv_bias: true
  proj_bias: true
  ffn_bias: true
  norm_layer: layernormbf16
  # Presumably the number of extra register/storage tokens — TODO confirm.
  n_storage_tokens: 4
  mask_attention: false
  mask_register_attention: false
  untie_cls_and_patch_norms: false
  untie_global_and_local_cls_norm: false
  interpolate_offset: 0.0
  interpolate_antialias: true
  mask_k_bias: true
  init_std_cls: 0.02
  init_std_reg: 0.02
  rescale_weights_by_layer_id: false
  # Matches teacher.in_chans (3).
  in_chans: 3
  pos_embed_grid_size: 48
  # Positional-embedding type; the pos_embed_rope_* keys below parameterize
  # it.
  pos_embed_type: ropenew
  pos_embed_rope_gamma: 1.0
  pos_embed_rope_init_multi_frequencies: false
  # base is set while min_period/max_period are null; presumably these are
  # alternative parameterizations — verify against the consumer.
  pos_embed_rope_base: 100
  pos_embed_rope_min_period: null
  pos_embed_rope_max_period: null
  pos_embed_rope_normalize_coords: separate
  pos_embed_rope_shift_coords: null
  pos_embed_rope_jitter_coords: null
  pos_embed_rope_rescale_coords: 2
  pos_embed_rope_dtype: bf16
  # 2:4 sparsity and fp8: no ranges listed, default false, fp8 disabled.
  sparse24_ranges: []
  sparse24_filter:
  - mlp
  sparse24_default: false
  fp8_enabled: false
  fp8_filter: blocks
# Teacher settings: momentum and temperature values.
# NOTE(review): the `schedules` section of this file also defines momentum
# and teacher_temp curves; confirm which source takes precedence.
teacher:
  # Same endpoints as schedules.momentum (0.994 -> 1.0).
  momentum_teacher: 0.994
  final_momentum_teacher: 1
  # Same endpoints as schedules.teacher_temp (0.04 -> 0.07).
  warmup_teacher_temp: 0.04
  teacher_temp: 0.07
  # NOTE(review): 120 exceeds optim.epochs (20), and
  # schedules.teacher_temp.warmup_epochs is 0 — confirm this key is used.
  warmup_teacher_temp_epochs: 120
  in_chans: 3
# Distillation teacher: where to find its config and checkpoint.
distillation:
  enabled: true
  # Placeholders — replace both with real paths before launching.
  full_cfg_path: <PATH/TO/TEACHER/CONFIG/config.yaml>
  checkpoint_path: <PATH/TO/TEACHER/checkpoint.pth>
# Multi-student distillation: each entry in `students` names a student,
# points at its own config file, and assigns it a range of ranks.
multidistillation:
  enabled: true
  # The rank-group sizes below (48, 48, 80, 120) each divide 1920 evenly
  # (40, 40, 24, 16 per rank) — presumably every student group consumes the
  # full global batch; TODO confirm.
  global_batch_size: 1920
  students:
  # ranks_range is a [start, end] pair. Each range starts where the previous
  # one ends (0, 48, 96, 176, 296), so `end` is presumably exclusive and the
  # job presumably expects 296 ranks in total — confirm against the launcher.
  # Every config_path is a placeholder; replace it before launching.
  - name: vits_mlp4_4
    config_path: <PATH/TO/STUDENT/CONFIG/vits_mlp4_4.yaml>
    ranks_range:
    - 0
    - 48
  - name: vitsp_swiglu6_1
    config_path: <PATH/TO/STUDENT/CONFIG/vitsp_swiglu6_1.yaml>
    ranks_range:
    - 48
    - 96
  - name: vitb_mlp4_3
    config_path: <PATH/TO/STUDENT/CONFIG/vitb_mlp4_3.yaml>
    ranks_range:
    - 96
    - 176
  - name: vitl_mlp4_1
    config_path: <PATH/TO/STUDENT/CONFIG/vitl_mlp4_1.yaml>
    ranks_range:
    - 176
    - 296
# HRFT section: disabled in this config, with no checkpoint path set.
hrft:
  enabled: false
  checkpoint_path: ''
# Optimizer settings.
# NOTE(review): the `schedules` section of this file defines its own lr and
# weight_decay curves with different values (see notes below); confirm which
# source takes precedence.
optim:
  epochs: 20
  optimizer: adamw
  # schedules.weight_decay is a constant 0.04, whereas this pair describes
  # 0.04 -> 0.2.
  weight_decay: 0.04
  weight_decay_end: 0.2
  # schedules.lr has peak 0 and end 5.0e-05, which differs from lr/min_lr
  # here.
  lr: 0.0002
  warmup_epochs: 0
  min_lr: 1.0e-06
  schedule_trunc_extra: 0.0
  clip_grad: 3.0
  freeze_last_layer_epochs: 0
  # Presumably scales lr with the square root of (batch size / 1024) —
  # TODO confirm.
  scaling_rule: sqrt_wrt_1024
  patch_embed_lr_mult: 0.2
  dino_head_wd_multiplier: 1.0
  layerwise_decay: 0.99
  multi_tensor_optim: true
  dump_fsdp_weights_path: ''
  adamw_beta1: 0.9
  adamw_beta2: 0.999
# Multi-crop augmentation settings: crop scales, counts, sizes, and
# normalization statistics.
crops:
  # Scale range for global crops: 0.32 to 1.0.
  global_crops_scale:
  - 0.32
  - 1.0
  local_crops_number: 8
  # Scale range for local crops: 0.05 to 0.32; the upper bound equals the
  # lower bound of global_crops_scale.
  local_crops_scale:
  - 0.05
  - 0.32
  # Crop sizes, presumably in pixels. With student.patch_size 16 these give
  # 16x16 and 7x7 patch grids.
  global_crops_size: 256
  local_crops_size: 112
  global_local_crop_pairs_ratios: 1.0
  gram_teacher_crops_size: 256
  localcrops_subset_of_globalcrops: false
  share_color_jitter: false
  horizontal_flips: false
  gram_teacher_no_distortions: false
  # Per-channel mean and std (three values each, matching in_chans 3).
  # These are the widely used ImageNet normalization statistics.
  rgb_mean:
  - 0.485
  - 0.456
  - 0.406
  rgb_std:
  - 0.229
  - 0.224
  - 0.225
# Checkpoint retention policy (distinct from the boolean
# train.checkpointing flag).
checkpointing:
  # 3750 = 3 x train.OFFICIAL_EPOCH_LENGTH (1250); presumably counted in
  # iterations, i.e. every 3 epochs — TODO confirm.
  period: 3750
  max_to_keep: 3
  # NOTE(review): very large value, presumably chosen so that no checkpoint
  # is ever kept permanently — verify against the consumer.
  keep_every: 99999999999999999
# Per-quantity schedules, each described by start/peak/end values and a
# warmup length in epochs.
# NOTE(review): `optim` and `teacher` in this file carry overlapping
# settings with partly different values; confirm which source is used.
schedules:
  # Constant 0.04 (start == peak == end, no warmup).
  weight_decay:
    start: 0.04
    peak: 0.04
    end: 0.04
    warmup_epochs: 0
  # start 0.04, peak and end 0.07, with warmup_epochs 0.
  teacher_temp:
    start: 0.04
    peak: 0.07
    end: 0.07
    warmup_epochs: 0
  # NOTE(review): start and peak are both 0 and end is 5.0e-05, so as
  # written the rate goes from 0 up to `end` — presumably over the
  # `cosine_epochs` (10) of the 20 total epochs. Confirm a zero peak is
  # intended.
  lr:
    start: 0
    peak: 0
    end: 5.0e-05
    warmup_epochs: 0
    freeze_last_layer_epochs: 0
    cosine_epochs: 10
  # 0.994 up to 1.0, the same endpoints as teacher.momentum_teacher and
  # teacher.final_momentum_teacher.
  momentum:
    start: 0.994
    peak: 0.994
    end: 1.0
    warmup_epochs: 0
