---
# Run-level settings. `log_path` is assembled from `workspace` and `name`;
# `log_path` and `world_size` are re-used by the sections below through
# ${...} interpolation.

# Placeholder path; set this before running.
workspace: '/path/to/workspace'
# Run name; becomes the last path component of `log_path`.
# NOTE(review): appears to mirror quantizer.tokens_per_data (512) and
# quantizer.num_code (16384) — confirm and keep in sync if those change.
name: '512T_NC=16384'
log_path: '${workspace}/outputs/${name}'
world_size: 1
# NOTE(review): presumably the random seed for the run — confirm.
seed: 42

# Dataset locations and loader settings.
data:
  # Placeholder paths; set these before running.
  # NOTE(review): directory names suggest a pre-converted ImageNet
  # train/val split — confirm the expected on-disk format.
  train_data_path: '/path/to/converted_imagenet_train'
  val_data_path: '/path/to/converted_imagenet_val'
  # Also read by `train.batch_size` via interpolation.
  # NOTE(review): unclear from here whether this is per-process or global.
  batch_size: 256
  # Mirrors the top-level `world_size`.
  world_size: '${world_size}'

# Model settings: VAE path, input size, quantizer and decoder.
model:
  # Resolved under the top-level `workspace`.
  vae_path: '${workspace}/ckpt/dc_ae'
  # Mirrors the top-level `log_path`.
  log_path: '${log_path}'
  # NOTE(review): presumably (channels, height, width); the leading 32
  # matches decoder.in_channels / decoder.latent_channels — confirm.
  input_data_size: [32, 8, 8]
  quantizer:
    TYPE: vq
    # NOTE(review): presumably the codebook size — confirm.
    num_code: 16384
    num_group: 1
    tokens_per_data: 512
  decoder:
    TYPE: dc_ae
    in_channels: 32
    latent_channels: 32
    attention_head_dim: 32
    # Each of the four lists below currently holds a single entry.
    # NOTE(review): presumably one entry per decoder stage, so their
    # lengths should stay equal — confirm against the decoder.
    block_type:
      - EfficientViTBlock
    block_out_channels:
      - 512
    layers_per_block:
      - 3
    qkv_multiscales:
      - [5]
    norm_type: rms_norm
    act_fn: silu
    upsample_block_type: interpolate

# Training-loop settings. Values written as ${...} are taken from the
# top-level keys and from `data`, so change them there rather than here.
train:
  log_path: '${log_path}'
  batch_size: '${data.batch_size}'
  world_size: '${world_size}'
  num_epochs: 100
  lr: 0.01
  # NOTE(review): unit (epochs vs. steps) is not visible here — confirm.
  visualize_interval: 1
  # Sub-directory of `log_path`.
  image_log_path: '${log_path}/images'