# Top-level `model` hyperparameters.
model:
  params:
    # Schedule settings: step count, then the two "linear" endpoints
    # (presumably the range of a noise/beta schedule -- TODO confirm
    # against the code that consumes this file).
    timesteps: 1000
    linear_start: 0.00085
    linear_end: 0.012

    # How conditioning is applied, plus a scaling constant
    # (presumably applied to the latents -- TODO confirm).
    conditioning_key: crossattn
    scale_factor: 0.18215

# Settings for the `bert` component.
bert:
  params:
    n_layer: 32
    # Same value as unet.params.context_dim (1280); presumably the two
    # must stay in sync -- TODO confirm against the consumer.
    n_embed: 1280

# Settings for the `unet` component.
unet:
  params:
    image_size: 16
    in_channels: 3
    out_channels: 3
    model_channels: 224
    num_res_blocks: 2
    # Note: these values are not spatial resolutions but downsampling
    # factors. If image_size (16) is the input spatial size, factors 4 and 2
    # would correspond to attention at spatial resolutions 4 and 8
    # -- TODO confirm against the consumer.
    # NOTE(review): an earlier version of this note quoted resolutions
    # 8, 16, 32 for 64-wide f4 latents (and "16, 8, 4" inline), which does
    # not match image_size: 16 here; it looks carried over from another
    # config.
    attention_resolutions: [4, 2]
    channel_mult: [1, 2, 3]
    # num_head_channels: 32
    num_heads: 8

    # 3d
    # (presumably selects 3-D operations in the network -- TODO confirm)
    dims: 3

    # cond_model params
    use_spatial_transformer: true
    transformer_depth: 1
    # Same value as bert.params.n_embed (1280); presumably these must match
    # -- TODO confirm.
    context_dim: 1280
    use_checkpoint: true
    # Canonical lowercase boolean, consistent with the `true` values above.
    legacy: false
