
# Inference inputs/outputs. All three are placeholders left unset (null) in
# this template.

# Checkpoint for our trained model.
pretrained_gligen_modelscope_path: null
# Folder containing the video plan generated in the first stage.
INFERENCE_DATA_ROOT: null
# Path for the output directory.
INFERENCE_DATA_OUTPUT: null

# Grounding / embedding settings.
# NOTE(review): "box" presumably selects bounding-box (rather than point)
# grounding, going by the key name — confirm against the consuming code.
position_net_point_or_box: box
# NOTE(review): presumably the feature widths of the text and image
# embeddings — confirm against the consuming code.
text_embedding_dim: 1024
image_embedding_dim: 1024

# Local paths to third-party pretrained weights. All are placeholders left
# unset (null) in this template.

# Path to sd-v1-4.ckpt:
# https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/tree/main
pretrained_sd14_path: null
# Path to model_scope_diffusers:
# https://huggingface.co/damo-vilab/text-to-video-ms-1.7b/tree/main
pretrained_modelscope_path: null
# Path to karlo-v1-alpha:
# https://huggingface.co/kakaobrain/karlo-v1-alpha/tree/main
karlo_v1_alpha_path: null


# Multi-scene switches; both are disabled in this config.
multi_scene_cross_frame_attn: false
multi_scene_single_batch: false

# NOTE(review): the meaning of the three alpha_type entries is not visible in
# this file — confirm against the sampler that reads them.
alpha_type: [0.1, 0.0, 0.9]
gpt_alpha: false

# Model / data toggles.
use_videoldm: false
use_video_data: false
use_video_bbox_data: true
enable_fuser: true
new_image_size: 256

# Intermediate and output widths for the position net (going by the key names).
position_net_mid_dim: 1024
position_net_out_dim: 1024

# NOTE(review): presumably the number of frames per generated clip — confirm.
n_sample_frames: 16

# Half-precision toggle (enabled).
use_fp16: true


# Diffusion wrapper configuration.
# `target` is a dotted class path and `params` holds its settings.
# NOTE(review): presumably `params` is passed to the target's constructor by
# an instantiate-from-config helper — confirm against the loader.
diffusion:
  target: ldm.models.diffusion.ldm.LatentDiffusion
  params:
    # NOTE(review): presumably the endpoints of the noise (beta) schedule and
    # the number of diffusion steps — confirm against LatentDiffusion.
    linear_start: 0.00085
    linear_end: 0.012
    timesteps: 1000
   

# UNet configuration: `target` is the dotted class path, `params` its settings.
model:
  target: ldm.modules.diffusionmodules.openaimodel.UNetModel
  params:
    # Unused inside the UNet itself, but used when creating xT.
    image_size: 32
    in_channels: 4
    out_channels: 4
    model_channels: 320
    attention_resolutions: [4, 2, 1]
    num_res_blocks: 2
    channel_mult: [1, 2, 4, 4]
    num_heads: 8
    transformer_depth: 1
    context_dim: 768
    # Either gatedCA or gatedSA. We have ablated this: self-attention is
    # better than cross-attention, so this should usually be gatedSA.
    fuser_type: gatedSA
    use_checkpoint: true

    grounding_tokenizer:
      target: ldm.modules.diffusionmodules.text_grounding_net.PositionNet
      params:
        # Pre-processed feature dim from the CLIP text encoder (penultimate
        # feature).
        in_dim: 768
        # Not constrained to this value, as one linear projection is applied
        # at each gated layer to match the visual dimension.
        out_dim: 768


# Autoencoder configuration: `target` is the dotted class path, `params` its
# settings.
autoencoder:
  target: ldm.models.autoencoder.AutoencoderKL
  params:
    scale_factor: 0.18215
    embed_dim: 4
    ddconfig:
      double_z: true
      z_channels: 4
      resolution: 256
      in_channels: 3
      out_ch: 3
      ch: 128
      # Short leaf list written in flow style, like model.params.channel_mult.
      ch_mult: [1, 2, 4, 4]
      num_res_blocks: 2
      attn_resolutions: []
      dropout: 0.0


# Text encoder: dotted class path only; no `params` are supplied here.
text_encoder:
  target: ldm.modules.encoders.modules.FrozenCLIPEmbedder


# Builder for the grounding tokenizer's input: dotted class path only.
# NOTE(review): "tokinzer" looks like a typo but is part of the import path;
# it must match the actual module filename — verify before renaming.
grounding_tokenizer_input:
  target: grounding_input.text_grounding_tokinzer_input.GroundingNetInput_ImgTextEmb
