# Top-level run settings.
# `resume` and `pretrain` are intentionally unset. They are written as an explicit `null`
# rather than a bare `key:` so the empty value is visibly deliberate (parsed value is unchanged).
# NOTE(review): presumably both accept a checkpoint path — confirm against the training script.
resume: null
pretrain: null
# Random seed. NOTE(review): verify where the training script applies it.
seed: 1024
# Dataset and data-loading settings.
data:
    dataset: SSv2
    modality: video
    num_segments: 8
    seg_length: 1       # unused
    batch_size: 2
    workers: 8
    # NOTE(review): presumably must match the number of entries in label_list — verify.
    num_classes: 87
    image_tmpl: 'image_{:06d}.jpg'  # unused
    # train_root / val_root are machine-specific absolute paths; adjust them to the local dataset location.
    train_root: '/media/h/HH/DATASET/SSv2/video_mp4/train_mp4/'
    train_list: 'lists/SSv2/B2N/SSv2_base_trainlist.txt'
    val_root: '/media/h/HH/DATASET/SSv2/video_mp4/val_mp4/'
    val_list: 'lists/SSv2/B2N/SSv2_base_vallist.txt'
    label_list: 'lists/SSv2/B2N/SSv2_base_labels.csv'
    input_size: 224
    random_shift: true
    output_path: exps_
    shot: 16            # B2N setting
# Model settings.
network:
    arch: ViT-B/16      # listed options: ViT-B/32, ViT-B/16
    init: true
    tm: false           # unused
    drop_out: 0.0
    emb_dropout: 0.0
    # Options: [Transf, None]. 'Transf': 6-layer temporal transformer; 'None': mean temporal pooling.
    # Quoted to make explicit that this is the string 'None', not a YAML null (which is spelled `null`).
    sim_header: 'None'
    # Options: [DP]. 'DP': described as mean temporal pooling.
    # NOTE(review): that description is identical to the one for sim_header's 'None' — confirm what 'DP' selects.
    interaction: DP
    joint_st: false     # whether to use joint space-time attention in the transformer (default: False)
    drop: 0
    fix_text: true
    fix_video: false
    temporal_layer: 4
# Optimization and schedule settings.
solver:
    type: cosine
    epochs: 30
    start_epoch: 0
    epoch_offset: 0
    optim: adamw
    # Keep both the decimal point and the signed exponent: YAML 1.1 loaders such as PyYAML
    # read a dot-less `5e-5` as a string rather than a float.
    lr: 5.0e-5
    lr_warmup_step: 5
    weight_decay: 0.2
    loss_type: CE
    evaluate: false     # only run evaluation
    clip_ratio: 0.07
    grad_accumulation_steps: 1
# Logging and evaluation cadence.
# NOTE(review): units are not stated here (iterations vs. epochs) — confirm against the training loop.
logging:
    print_freq: 10
    eval_freq: 1