# ==================== 1. set up data paths ==================== #

# all training checkpoints are saved under this path
DATA_PATH: xxx  ##### ACTION NEEDED: set this path before training #####

# folder that holds the training data
train_data_path: sample_data/images  ##### ACTION NEEDED: path for the training image folder #####

# csv file that holds the training prompts
train_prompt_path: sample_data/image_captions.csv  ##### ACTION NEEDED: path for the training prompt csv file #####

# per-gpu batch size (1 by default);
# adjust it according to the GPU memory you have available
train_batch_size: 1

# indicates that the input image is resized to 512 * 512 before being given to the SDv1.5 ControlNet
# (support for different resolutions is left for future work)
use_size_512: true

# images are trained on and generated at a resolution of 1024 by 1024
height: 1024
width: 1024

# image generation models (e.g., SDXL) need both of the following set to 1
n_sample_frames: 1
output_fps: 1

# image generation models (e.g., SDXL) need both of the following set to images
input_data_type: images  # the corresponding image data loader will be used
eval_input_type: images  # evaluation will run on the specified image dataset



# ==================== 2. Ctrl-Adapter configurations ==================== #
model_name: sdxl

# cross_attention_dim must be set to the channel dimension of the backbone model's prompt/image embedding
# e.g., for SDXL the prompt dimension (encoder_hidden_states) is 2048
cross_attention_dim: 2048

# the SDXL Ctrl-Adapter activates the spatial resnet and spatial attention/transformer blocks by default
# (the Ablation Studies section of the main paper compares the different architecture designs)
add_spatial_resnet: true
add_temporal_resnet: false
add_spatial_transformer: true
add_temporal_transformer: false

# the SDXL Ctrl-Adapter uses one spatial resnet block and one spatial attention/transformer block by default
# (the Ablation Studies section of the main paper compares the different architecture designs)
num_blocks: 1

# the SDXL Ctrl-Adapter uses output blocks A, B, C by default
adapter_locations: [A, B, C]

# three Ctrl-Adapters are added for each output block by default
# (see Appendix Additional Quantitative Analysis for details)
num_adapters_per_location: 3

# controlnet-related parameters
skip_conv_in: false  # set this to true only for sparse-control training and for models trained with a continuous noise scheduler (e.g. SVD)
skip_time_emb: false  # false by default
fixed_controlnet_timestep: -1  # -1 by default. when set to a value within range [0, 1000], only this fixed_controlnet_timestep is given to the SDv1.5 ControlNet during training and inference


# ==================== 3. specify the control type you want to train ==================== #
# the control condition that we want to train and evaluate
control_types: [depth]  # can be one of depth, canny, normal, segmentation, softedge, lineart, openpose

# to train a single adapter that works for multiple control conditions, set the following to a list,
# such as [depth, canny, normal, segmentation, softedge, lineart, openpose];
# a control condition is then randomly selected from the list in each training step.
# to train an adapter that only works for a single control condition (e.g., depth), leave this as an empty list.
# NOTE(review): this list is non-empty, so per the description above mixed-control training is active
# even though control_types names only depth; `scribble` is also not among the control types listed
# in this file's comments - confirm that both are intended.
mixed_control_types_training:
  - depth
  - canny
  - normal
  - segmentation
  - softedge
  - lineart
  - openpose
  - scribble
