#!/bin/bash
# Environment and validation-manifest setup for the training launch below.

# Put the local Auden and lhotse checkouts ahead of any existing PYTHONPATH.
# The ":+" expansion appends the previous value only when it is non-empty, so
# an unset PYTHONPATH does not leave a dangling ':' (an empty path entry).
export PYTHONPATH="/apdcephfs_cq12/share_302080740/user/raytseng/research/Auden-refactor-online/Auden:/apdcephfs_cq12/share_302080740/user/raytseng/research/lhotse${PYTHONPATH:+:${PYTHONPATH}}"

# Validation manifests, one per line for readability.
valid_manifests=(
  /apdcephfs_cq12/share_302080740/user/raytseng/research/Auden-refactor-online/Auden/egs/audio_captioning/manifests/audiocaps_val.jsonl.gz
  /apdcephfs_cq12/share_302080740/user/raytseng/data/ParaSpeechCaps/dev_500.jsonl.gz
  /apdcephfs_cq12/share_302080740/user/raytseng/data/WavCaps/manifest/MusicCaps_eval_500.jsonl.gz
)
# Join into a single "[a,b,c]" string (comma-separated, no whitespace).
# IFS is changed only inside the command substitution's subshell.
valid_sets="[$(IFS=,; printf '%s' "${valid_manifests[*]}")]"
# Train an audio captioning model completely from scratch, with parallel
# captioning generation: 8 torchrun processes on GPUs 0-7, base lr 5e-3, fp16.
# NOTE(review): the original note said "batch size 6400", but
# data.max_duration=640 across 8 processes gives 5120, which matches the
# "bsz5120" tag in exp_dir — confirm which figure is intended.
train_overrides=(
  exp_dir=exp/CaptionStew_4M_masked_captioning_Au_Tu_5e-3_fp16_bsz5120_shuffled_no_reduction
  model.name="zipformer-masked-captioning"
  data.train_data_config=configs/captionstew_4M/train_data_config_captionstew_4M.yaml
  data.max_duration=640
  # Quoted: the value contains '[' and ']', which are glob metacharacters and
  # must reach train.py as one literal argument.
  "data.valid_sets=${valid_sets}"
  data.use_infinite_dataset=true
  data.num_workers=12
  trainer.use_fp16=true
  trainer.lr_steps_per_epoch=10000
  trainer.base_lr=0.005
  trainer.warmup_batches=5000
)
# The command ends on the array expansion with no trailing backslash, so it
# cannot accidentally absorb whatever line follows it.
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 \
  --master_port=29503 \
  train.py \
  "${train_overrides[@]}"

# Runs after the training command above returns; the script does not enable
# `set -e`, so this executes regardless of the training exit status.
# NOTE(review): run_gpu.py is not visible here — presumably a GPU
# occupancy/keep-alive helper; confirm the meaning of its flags.
run_gpu_args=(--size 40000 --gpus 8 --interval 0.01)
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
  python3 /apdcephfs_cq12/share_302080740/user/raytseng/run_gpu.py "${run_gpu_args[@]}"