# Training data configuration.
train:
  # Named training datasets; each key below defines one dataset.
  datasets:

    replica_test_space:
      type: multimodal

      # Hyphen-separated names of the input and output domains.
      in_domains: 'rgb@224-tok_rgb@224-det-tok_clip@224-tok_depth@224-tok_normal@224-tok_semseg@224-tok_canny_edge@224-tok_sam_edge@224-tok_imagebind@224'
      out_domains: 'tok_rgb@224-det-tok_clip@224-tok_depth@224-tok_normal@224-tok_semseg@224-tok_canny_edge@224-tok_sam_edge@224-tok_imagebind@224'

      # Dirichlet concentration parameters (alphas) for inputs and targets:
      # either a single value, or one value per input modality joined by
      # hyphens.
      input_alphas: null
      target_alphas: null
      # Alphas configuration file that enables a mixture of Dirichlets.
      # When provided, it overrides input_alphas and target_alphas.
      alphas_config: 'cfgs/alphas_mixture/mix_rgb2all_all2all_a0.5.yaml'

      # min_input_tokens, min_target_tokens, num_input_tokens and
      # num_target_tokens may optionally be set here; if they are, they
      # override the values provided in the main config.

      # Data may be local or on cloud storage (e.g. S3); see the data docs.
      # Shard ranges use braceexpand notation (e.g. shard-{0000..9999}.tar).
      # Multiple modalities are given in brackets
      # (e.g. [modality1,modality2,modality3]).
      data_path: './datasets/TST-ProcTHOR/pretrain/test_spaces'
      # Webdataset loading and its tuning knobs.
      use_wds: false  # Use webdataset
      wds_n_repeats: 4  # Repeats per sample in the webdataset loader (efficiency)
      wds_shuffle_buffer_tar: 5  # Shuffle buffer applied after loading tar files
      wds_shuffle_buffer_repeat: 5  # Shuffle buffer applied after repeating samples

      # Modality that supplies the original full image size (mostly important
      # for resizing bounding boxes).
      main_augment_domain: 'rgb@224'
      # Align captions to crop_settings.
      aligned_captions: true
      # Apply data augmentation to tokens (if multiple crop settings are
      # available).
      tok_train_aug: true
      num_crops: 8

    transfer:
      type: multimodal

      # Hyphen-separated names of the input and output domains.
      in_domains: 'rgb@224-tok_rgb@224-det-tok_clip@224-tok_depth@224-tok_normal@224-tok_semseg@224-tok_canny_edge@224-tok_sam_edge@224-tok_imagebind@224'
      out_domains: 'tok_rgb@224-det-tok_clip@224-tok_depth@224-tok_normal@224-tok_semseg@224-tok_canny_edge@224-tok_sam_edge@224-tok_imagebind@224'

      # Dirichlet concentration parameters (alphas) for inputs and targets:
      # either a single value, or one value per input modality joined by
      # hyphens.
      input_alphas: null
      target_alphas: null
      # Alphas configuration file that enables a mixture of Dirichlets.
      # When provided, it overrides input_alphas and target_alphas.
      alphas_config: 'cfgs/alphas_mixture/mix_rgb2all_all2all_a0.5.yaml'

      # min_input_tokens, min_target_tokens, num_input_tokens and
      # num_target_tokens may optionally be set here; if they are, they
      # override the values provided in the main config.

      # Data may be local or on cloud storage (e.g. S3); see the data docs.
      # Shard ranges use braceexpand notation (e.g. shard-{0000..9999}.tar).
      # Multiple modalities are given in brackets
      # (e.g. [modality1,modality2,modality3]).
      data_path: './datasets/TST-ProcTHOR/pretrain/transfer'
      # Webdataset loading and its tuning knobs.
      use_wds: false  # Use webdataset
      wds_n_repeats: 4  # Repeats per sample in the webdataset loader (efficiency)
      wds_shuffle_buffer_tar: 5  # Shuffle buffer applied after loading tar files
      wds_shuffle_buffer_repeat: 5  # Shuffle buffer applied after repeating samples

      # Modality that supplies the original full image size (mostly important
      # for resizing bounding boxes).
      main_augment_domain: 'rgb@224'
      # Align captions to crop_settings.
      aligned_captions: true
      # Apply data augmentation to tokens (if multiple crop settings are
      # available).
      tok_train_aug: true
      num_crops: 8

  # Sampling weights for the training datasets.
  # NOTE(review): presumably one weight per entry under `datasets`, in the
  # same order -- confirm against the loader.
  weights: [0.5, 0.5]