--model openllama_peft --stage 3y --data_path null --val_data_path null --audio_data_path /mnt/bn/audio-visual-llm-data/datasets/multitask_json/bs_how2_300h_train_longest200.json --audio_val_data_path /mnt/bn/audio-visual-llm-data/datasets/multitask_json/bs_how2_300h_val.json --image_root_path null --data_type audio --imagebind_ckpt_path /mnt/bn/audio-visual-llm-data/guangzhisun/audio_visual_llm/pandagpt2/pretrained_ckpt/imagebind_ckpt --vicuna_ckpt_path /mnt/bn/audio-visual-llm-data/yuwenyi/ckpt/vicuna/vicuna.13b --max_tgt_len 2000 --save_path /mnt/bn/audio-visual-llm-data/yuwenyi/playground/pandagpt/code/output/debug/ --log_path /mnt/bn/audio-visual-llm-data/yuwenyi/playground/pandagpt/code/output/debug/log/ --use_lora true --image_data_path null --image_val_data_path null --llava_root_path null --qformer true --use_blip true --use_whisper true --instructblip true --early_align false --alignmode 2 --num_video_query 32 --groupsize 10 --causal_attention false --diversity_loss true --diversity_loss_factor 0.01 --divsche 1 --pure_aud True --speech_qformer true --num_speech_query 1 --second_per_frame 0.333333 --second_stride 0.333333