# Random seed for the run.
# NOTE(review): null presumably means "do not fix the seed" (non-reproducible
# runs) — confirm against the code that consumes this file.
seed: null

# Dataset selection and input image format.
data:
  # Other dataset names noted by the author: tinyimagenet, celeba, celebahq, etc.
  dataset: butterfly
  channels: 3
  image_size: 64
  random_flip: false
  # Max number of samples to use for training. The same count (702) is used
  # for eval.data_to_generate and eval.real_data in this file.
  n_samples: 702
  # Disabled key, kept for reference:
  # num_classes: 10


# Selects the active method. This file carries per-method sections named
# `dlpm` and `diffusion` (top level, and under `eval` and `training`), so
# those are presumably the accepted values — confirm against the consumer.
method: dlpm

# Settings for the `dlpm` method, which is the one selected by `method` in
# this file.
dlpm:
  # NOTE(review): presumably the stability (tail) index of the noise
  # distribution — confirm the meaning and valid range with the implementation.
  alpha: 2.0
  # Differs from eval.dlpm.reverse_steps (500); presumably this is the
  # training-time step count and the eval value is the sampling-time one —
  # TODO confirm.
  reverse_steps: 4000
  rescale_timesteps: true
  input_scaling: false


# Settings for the `diffusion` method. `method` is set to dlpm in this file,
# so this section is presumably not used by the current run — confirm.
diffusion:
  # NOTE(review): presumably the end time of the diffusion process — confirm.
  T: 1.0
  process_type: VP  # VE or VP
  schedule: cosine
  rescale_timesteps: true  # rescale the timesteps to [0, 1]

  learn_variance: false  # learn the variance
  conditional: false  # conditional diffusion

# Evaluation / sample-generation settings.
eval:
  # Both counts equal data.n_samples (702) in this file.
  data_to_generate: 702
  real_data: 702
  batch_size: 256

  # Sampler options for the `diffusion` method. Same keys and values as the
  # `dlpm` section below, plus `epsilon`.
  diffusion:
    # NOTE(review): presumably a small time/numerical cutoff used by the
    # sampler — confirm with the implementation.
    epsilon: 0.00001
    deterministic: false  # set to true for ode sampling
    reverse_steps: 500
    clip_denoised: true  # clip denoised data to [0, 1]
    ei_integrator: true

  # Sampler options for the `dlpm` method (the one selected by `method`).
  dlpm:
    deterministic: false  # set to true for ddim sampling
    # Differs from the top-level dlpm.reverse_steps (4000).
    reverse_steps: 500
    clip_denoised: true  # clip denoised data to [0, 1]
    ei_integrator: true


# normal (disabled): smaller UNet variant (model_channels: 64, 4-entry channel_mult)
# model:
#   architecture: 'unet'
#   unet:
#     model_type: ddpm
#     attn_resolutions: [4, 8, 16]
#     channel_mult: [1, 2, 4, 4]
#     dropout: 0.0 #0.1
#     model_channels: 64
#     num_heads: 4
#     num_res_blocks: 2


# improved (active): model_channels: 128 and a 5-entry channel_mult
# Active model definition. The other `model:` variants in this file are
# commented out; only one `model` key may be active at a time, because
# duplicate mapping keys are invalid YAML.
model:
  architecture: 'unet'
  unet:
    model_type: ddpm
    # NOTE(review): presumably the feature-map sizes at which attention is
    # applied — confirm how the UNet interprets these values.
    attn_resolutions: [4, 8, 16]
    # Five entries; presumably per-level multipliers applied to
    # model_channels — TODO confirm.
    channel_mult: [1, 2, 2, 2, 4]
    dropout: 0.0  # alternative value noted by the author: 0.1
    model_channels: 128
    num_heads: 4
    num_res_blocks: 2

# improved 1 (disabled): larger variant (model_channels: 192, num_res_blocks: 3)
# model:
#   architecture: 'unet'
#   unet:
#     model_type: ddpm
#     attn_resolutions: [16, 8, 4]
#     channel_mult: [1, 2, 4, 8]
#     dropout: 0.0 #0.1
#     model_channels: 192
#     num_heads: 4
#     num_res_blocks: 3


# for 128x128 inputs (disabled; data.image_size is 64 in this file)
# model:
#   architecture: 'unet'
#   unet:
#     model_type: ddpm
#     attn_resolutions: [32, 16, 8]       # attention at larger scales
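#     # NOTE(review): five levels give four 2x downsamplings if the last level
#     # does not downsample (128 -> 8x8, not 4x4) — confirm against the UNet code.
#     channel_mult: [1, 2, 4, 8, 8]       # deeper pyramid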
#     dropout: 0.1
#     model_channels: 256                 # higher capacity for 128×128
#     num_heads: 8                        # more heads for global context
#     num_res_blocks: 3

# Training-loop settings. `batch_size` and `num_workers` sit at the top
# level; the `diffusion` and `dlpm` sub-sections hold the per-method values,
# which are identical in this file.
training:
  batch_size: 64
  num_workers: 0

  diffusion:
    # List of EMA rates; a single rate is configured.
    ema_rates: [0.9999]
    grad_clip: 5.0

  dlpm:
    # List of EMA rates; a single rate is configured.
    ema_rates: [0.9999]
    grad_clip: 5.0

# Optimizer and learning-rate settings.
optim:
  optimizer: adamw
  # Written as 0.0 rather than `0.` so the float is spelled canonically; the
  # parsed value is unchanged.
  momentum: 0.0
  temperature: 1.0  # alternative value noted by the author: 0.00001
  # No LR schedule is selected; `steplr` is the alternative noted by the
  # author. NOTE(review): lr_steps, lr_step_size and lr_gamma presumably only
  # take effect when a schedule is set — confirm.
  schedule: null
  lr: 0.0001
  # NOTE(review): presumably a number of warmup steps — confirm the unit.
  warmup: 200
  lr_steps: 1000000
  lr_step_size: 5000
  lr_gamma: 0.95

# Run-level settings: length, periodic actions, and runtime switches.
run:
  steps: 10000
  # NOTE(review): null presumably disables periodic evaluation and
  # checkpointing respectively — confirm how the runner treats null.
  eval_freq: null
  checkpoint_freq: null
  progress: false  # print progress bar
  id: null  # experiment id
  fp16: false  # use fp16
  load_dataset_to_gpu: false  # load dataset to gpu

