
# Directory that holds the Stable Diffusion v1.4 checkpoint (put the sd1.4 ckpt here).
# NOTE(review): the `xxx` prefix in the paths below looks like a placeholder -
# replace it with a real location before use.
sd14_ckpt_path: xxx/GLIGEN_DATA

# GLIGEN input checkpoint; the value is the path to a .pth file, not a directory.
ckpt: xxx/GLIGEN_DATA/checkpoint_generation_text.pth

# Root directory of the diagram dataset.
DIAGRAM_DATA_ROOT: xxx/Diagram_DATA/

# Directory where output checkpoints are written.
OUTPUT_ROOT: xxx/ckpt/Diagram_AI2D_Training_Nov20



# Three-element list read by the consuming code; its semantics are not visible
# in this file - check the consumer of `alpha_type` before changing it.
alpha_type: [0.3, 0, 0.7]

# Image sizing parameters.
# NOTE(review): presumably pixel sizes used by the data pipeline - confirm.
new_image_size: 512
resize_min_edge: 224
resize_max_edge: 224



# Diffusion process configuration. `target` is the dotted path of the class to
# use; `params` are presumably passed to it as keyword arguments - confirm
# against the config loader.
diffusion:
  target: ldm.models.diffusion.ldm.LatentDiffusion
  params:
    # NOTE(review): linear_start / linear_end look like the endpoints of a
    # linear noise schedule spanning `timesteps` steps - confirm in the target class.
    linear_start: 0.00085
    linear_end: 0.012
    timesteps: 1000
   

# UNet model configuration. `target` is the dotted path of the class to use;
# `params` are presumably passed to it as keyword arguments - confirm against
# the config loader.
model:
  target: ldm.modules.diffusionmodules.openaimodel.UNetModel
  params:
    image_size: 64  # unused in the UNet, but used when creating xT
    in_channels: 4
    out_channels: 4
    model_channels: 320
    attention_resolutions: [4, 2, 1]
    num_res_blocks: 2
    channel_mult: [1, 2, 4, 4]
    num_heads: 8
    transformer_depth: 1
    context_dim: 768
    # gatedCA or gatedSA. This was ablated: self-attention performed better
    # than cross-attention, so this should usually be set to gatedSA.
    fuser_type: gatedSA
    use_checkpoint: true

    # Nested component that turns grounding input into tokens for the UNet.
    grounding_tokenizer:
      target: ldm.modules.diffusionmodules.text_grounding_net.PositionNet
      params:
        # Dimension of the pre-processed feature from the CLIP text encoder
        # (penultimate feature).
        in_dim: 768
        # Not constrained to this value: a linear projection is applied at
        # each gated layer to match the visual dimension.
        out_dim: 768


# Autoencoder configuration. `target` is the dotted path of the class to use;
# `params` are presumably passed to it as keyword arguments - confirm against
# the config loader.
autoencoder:
  target: ldm.models.autoencoder.AutoencoderKL
  params:
    # NOTE(review): presumably the constant used to scale latents - confirm.
    scale_factor: 0.18215
    embed_dim: 4
    # Nested settings block handed to the autoencoder.
    ddconfig:
      double_z: true
      z_channels: 4
      resolution: 256
      in_channels: 3
      out_ch: 3
      ch: 128
      ch_mult: [1, 2, 4, 4]
      num_res_blocks: 2
      attn_resolutions: []
      dropout: 0.0


# Text encoder component; `target` is the dotted path of the class to use.
# No `params` are given here, so the class is presumably built with its
# defaults - confirm against the config loader.
text_encoder:
  target: ldm.modules.encoders.modules.FrozenCLIPEmbedder


# Component that prepares the input for the grounding tokenizer; `target` is
# the dotted path of the class to use.
# NOTE: the spelling `tokinzer` is part of the module path as written and must
# match the actual module name - do not "correct" it here alone.
grounding_tokenizer_input:
  target: grounding_input.text_grounding_tokinzer_input.GroundingNetInput
