# ==== benchmark / profiling setup ====
# NOTE(review): the meaning of each key is inferred from its name only;
# confirm against the code that consumes this config.

# NOTE(review): purpose of zipf_offset is not evident from this file.
zipf_offset: 5

# where results and profiler output are written (relative paths)
outputs: "exp/cogvideox/baseline"
profile_path: "exp/cogvideox/profile/baseline"

# presumably the sequence-parallel group size — TODO confirm
sp_size: 8

# presumably replaces the real dataset with dummy_data_size generated samples
dummy_dataset: true
dummy_data_size: 2000

# diagnostics
verbose: true
calculate_imbalance: true


# ==== training config ====

# preprocess embedding
# data_path is a CSV file; preprocessed_data marks it as already preprocessed
# (presumably precomputed embeddings, per this section's title — TODO confirm).
data_path: "./assets/example_data/demo_preprocess.csv"
preprocessed_data: true
drop_last: true

# train
# NOTE(review): ckpt_path looks like a model-hub id rather than a local path;
# verify how the loader resolves it.
ckpt_path: "THUDM/CogVideoX-5b"
# lowercase boolean: the canonical YAML spelling, matching every other flag here
grad_checkpoint: true
num_workers: 8
dtype: "bf16"

# run control and logging
seed: 42
epochs: 1
# Written with a decimal point and a signed exponent so that YAML 1.1 loaders
# (e.g. PyYAML) resolve it as a float; a bare `1e10` is loaded as a string there.
# NOTE(review): the huge interval presumably keeps periodic logging from ever
# firing during a benchmark run — confirm.
log_every: 1.0e+10

# optimization
# Exponent-form floats below carry an explicit decimal point: YAML 1.1 loaders
# (e.g. PyYAML) load a bare `1e-8` as a string, not a float.
grad_clip: 1.0
# NOTE(review): lr is extremely small; presumably intentional for a benchmark
# run where the weights should barely move — confirm.
lr: 1.0e-8
ema_decay: 0.99
adam_eps: 1.0e-15
warmup_steps: 10

# data
# image_mixing_frac: 50
num_bucket_build_workers: 16

# Layout: resolution -> {integer key: [float, int]}.
# NOTE(review): presumably the integer key is the number of frames and the
# pair is [keep probability, batch size] — confirm against the bucket builder.
bucket_config:
  "144p":
    1: [1.0, 345]
    25: [1.0, 48]
    49: [1.0, 25]
    73: [1.0, 12]
    97: [1.0, 6]
  "240p":
    1: [1.0, 128]
    25: [1.0, 16]
    49: [1.0, 8]
    73: [1.0, 4]
    97: [1.0, 2]
  "360p":
    1: [1.0, 64]
    25: [1.0, 7]
    49: [1.0, 4]
    73: [1.0, 2]
    97: [1.0, 1]
  "480p":
    1: [1.0, 32]
    25: [1.0, 4]
    49: [1.0, 2]
    73: [1.0, 1]
    97: [1.0, 1]
  "720p":
    1: [1.0, 14]
    25: [1.0, 1]
    49: [1.0, 1]
    73: [1.0, 1]
    97: [1.0, 1]

# Override the default common aspect ratios (AR).
# For benchmarking, a single AR is used per resolution; with more ARs the data
# would be spread too thinly.
# Layout: resolution -> {AR label: [dim, dim]} (presumably [height, width]).
# NOTE(review): the 480p entry is 480x720 (ratio ~0.67) although it is filed
# under the "0.56" label like the others; confirm this is deliberate.
common_ar:
  "144p":
    "0.56": [144, 256]
  "240p":
    "0.56": [240, 426]
  "360p":
    "0.56": [360, 640]
  "480p":
    "0.56": [480, 720]
  "720p":
    "0.56": [720, 1280]

# mask
# Mask type -> ratio. The listed ratios sum to 0.25.
# NOTE(review): the key "intepolate" is kept exactly as spelled; it presumably
# has to match the name the mask generator looks up — verify before renaming.
mask_ratios:
  random: 0.01
  intepolate: 0.002
  quarter_random: 0.002
  quarter_head: 0.002
  quarter_tail: 0.002
  quarter_head_tail: 0.002
  image_random: 0.0
  image_head: 0.22
  image_tail: 0.005
  image_head_tail: 0.005
