# config for prompt tuning: 
# vision_depth: 12
# language_depth: 12
# vision_ctx: 2
# language_ctx: 2
# Optimizer settings.
optimizer:
    # opt: sgd
    opt: adamp
    betas: [0.9, 0.999]
    # Written with an explicit mantissa: YAML 1.1 loaders such as PyYAML
    # resolve a bare `1e-8` (no decimal point) as a string, not a float.
    eps: 1.0e-8
    lr: 0.0001
    weight_decay: 0.0001
    # NOTE(review): presumably only read by momentum-based optimizers such as
    # the commented-out `sgd` option - confirm against the optimizer factory.
    momentum: 0.9
# Learning-rate schedule settings.
# NOTE(review): the key is spelled `schedular`; it is kept as-is because the
# code reading this file presumably looks it up under that exact name.
schedular:
    sched: cosine
    # Same value as optimizer.lr; keep the two in sync when tuning.
    lr: 0.0001
    epochs: 5
    # Exponent values carry an explicit mantissa (`1.0e-6`, not `1e-6`):
    # YAML 1.1 loaders such as PyYAML resolve the bare form as a string.
    min_lr: 1.0e-6
    decay_rate: 1
    warmup_lr: 1.0e-5
    # Fractional value: less than one full epoch of warmup.
    warmup_epochs: 0.3
    cooldown_epochs: 0
    freeze_backbone_epochs: 0

# Input image size.
# NOTE(review): presumably [height, width] in pixels - confirm with the loader.
INPUT:
    SIZE: [224, 224]

# probability distribution encoder
# NOTE(review): pcmepp.criterion also defines init_negative_scale / init_shift
# with different values; confirm which set the training code actually reads.
pde:
    negative_scale: 0.005
    shift: 4
    # NOTE(review): presumably a learning-rate multiplier for this module -
    # confirm against the optimizer parameter-group setup.
    mul_lr: 10

# attack config
# Settings for the MMA attack, grouped by the three steps named below.
# Booleans use the canonical lowercase `true` / `false`, matching the rest of
# this file and avoiding YAML 1.1 / 1.2 truthy-value ambiguity.
attack:
    MMA:
        # step 1: prepare texts for text supervision in image attack
        is_use_gt_caps: false
        txt_sup_k: 5
        alpha_sr: 0.1
        alpha_ri: 0.1
        alpha_rs: 0.1
        p_rd: 0.1
        # step 2: image attack
        scale_ver: 0
        alpha_unsup: 0
        alpha_sup: 1
        # step 3: text attack
        is_txt_aug: true
        txt_aug: sr

# PCME++ settings, split into augmentation, model and criterion sections.
pcmepp:
    # Data augmentation.
    augment:
        img_size_augment: 1.0
    # Model architecture.
    model:
        is_probabilistic_model: true
        n_unc_layers: 1
        backbone_source: clip_ViT-B/16
        precomp_enc_type: backbone
        embed_size: 1024
        img_dim: 768
        no_imgnorm: false
        no_txtnorm: false
        # Explicit null: no value is supplied here.
        sigma_ln_init: null
    # Training criterion (loss).
    criterion:
        name: pcmepp
        init_negative_scale: 5
        init_shift: 5
        prob_distance: csd
        vib_beta: 0.0001
        smoothness_alpha: 0.1