_target_: mdt.models.mdtv_agent.MDTVAgent
_recursive_: false

defaults:
  - language_goal: clip
  - visual_goal: clip
  - img_gen: masked_transformer
  - model: mdtv_transformer 


latent_dim: 384
multistep: 10
sampler_type: 'ddim'
num_sampling_steps: 10
sigma_data: 0.5
sigma_min: 0.001
sigma_max: 80
noise_scheduler: 'exponential'
sigma_sample_density_type: 'loglogistic'
use_lr_scheduler: true
act_window_size: 10
cont_alpha: 1.0
masked_beta: 1.0
use_distributed_clip: True
use_text_not_embedding: True
ckpt_path: null
seed: ${seed}
perceiver_depth: 6
perceiver_heads: 8
perceiver_dim_head: 64
perceiver_num_time_embeds: 1
perceiver_dim: 384
num_latents: 3

voltron_cache: /data2/zehao/cache/voltron_cache

optimizer:
  _target_: torch.optim.AdamW
  transformer_weight_decay: 0.05
  obs_encoder_weight_decay: 0.05
  learning_rate: 1e-4
  betas: [0.9, 0.9]

lr_scheduler:
  lr_scheduler:
    init_lr: 1e-4  # This is the peak or maximum learning rate
    init_lr_scale: 0.1  # This is the ratio of initial learning rate to peak learning rate
    final_lr_scale: 1e-6  # This is the ratio of final learning rate to peak learning rate
    total_steps: 50000  # Example total steps, adjust as needed
    phase_ratio: "(0.02, 0.08, 0.9)"
    lr: 1e-4 