# Model settings. The code that consumes these keys is not part of this file,
# so the notes on individual keys below are assumptions to confirm against it.
model:
  # Vision encoder variant identifier (presumably a CLIP ViT-L backbone; confirm).
  vit_model: 'clip_L'

  # Q-Former settings. The names suggest the number of query tokens and the
  # cross-attention frequency; verify against the model code.
  qformer_num_query_token: 16
  qformer_cross_attention_freq: 1

  # Stable Diffusion base model, and a flag that by its name controls whether
  # the text encoder is trained.
  sd_train_text_encoder: false
  sd_pretrained_model_name_or_path: 'runwayml/stable-diffusion-v1-5'

  # Checkpoint loading: the pretrained flag is on, the finetuned flag is off.
  load_finetuned: false
  load_pretrained: true
  # Disabled alternative: a checkpoint directory on a local filesystem.
  # pretrained: "/export/share/dongxuli/zerobooth/500000-renamed/"
  pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP-Diffusion/blip-diffusion-openimage.tar.gz'

  # ControlNet weights identifier (looks like a model-hub repo id for a depth
  # ControlNet; confirm how the loader resolves it).
  controlnet_pretrained_model_name_or_path: 'lllyasviel/sd-controlnet-depth'

# Processor selection for the visual and text inputs. Each entry names a
# processor by its string identifier, once per split (train / eval).
preprocess:
  # NOTE(review): train and eval both select the "..._eval" image processor.
  # Left unchanged here; confirm this is intended.
  vis_processor:
    train:
      name: 'blip_diffusion_inp_image_eval'
    eval:
      name: 'blip_diffusion_inp_image_eval'
  # The same text processor is used for both splits.
  text_processor:
    train:
      name: 'blip_caption'
    eval:
      name: 'blip_caption'
