# Hyperparameters grouped under the `diffusion` key.
# NOTE(review): the key names suggest a patch-based transformer backbone, but
# the exact meaning of each field is defined by the consuming code -- confirm
# there before changing values.
diffusion:
    architecture_max_layer: 6
    architecture_n_vocab: 5
    weight_max_size: 64
    patch_size: 4
    hidden_size: 1152
    depth: 12
    num_heads: 16
    mlp_ratio: 4
    # Feature switches, written as canonical lowercase YAML booleans.
    learn_sigma: true
    use_swiglu: true
    use_swiglu_large: true


# Dataset file and environment identifier used for this run.
# NOTE(review): `path` is relative; presumably resolved against the working
# directory of the launching script -- confirm.
data:
    path: 'dataset/pushcube/train_set.h5'
    env_name: 'PushCube-v1'

# Training-run settings: run identity, output locations, loop sizing,
# logging/checkpoint cadence, and learning-rate schedule.
train:
    name: 'NNiT_PushCube'
    # NOTE(review): a bare `None` is NOT YAML null -- it loads as the string
    # "None". Left unchanged because the consumer may compare against that
    # string; switch to `null` if the code tests `is None` -- confirm.
    checkpoint_dir: None
    results_dir: 'results'
    epochs: 1000
    global_batch_size: 16
    global_seed: 42
    num_workers: 8
    log_every: 100
    ckpt_every: 100000

    # Floats carry an explicit decimal point: YAML 1.1 resolvers (e.g. PyYAML)
    # load `7e-5` (no dot) as a string rather than a number.
    lr: 7.0e-5
    warmup_steps: 500
    lr_schedule: cosine  # Options: cosine, linear
    min_lr: 3.0e-5

