#!/bin/bash

# Environment setup: activate conda, enter the ASR recipe directory, and put
# the local auden and lhotse checkouts ahead of any existing PYTHONPATH entries.
. /opt/conda/bin/activate # activate conda env

# train.py and the configs/, data/ and exp/ arguments used by the launch
# command are relative paths, so abort if the recipe directory is unreachable
# instead of starting a job from the wrong working directory.
cd /apdcephfs_cq10/share_1603164/user/yiwenyshao/independent/auden/egs/asr || {
  echo "error: cannot cd to the auden egs/asr recipe directory" >&2
  exit 1
}

# Prepend the project checkouts. ${PYTHONPATH:+:$PYTHONPATH} appends the old
# value only when it is non-empty, which avoids a trailing ':' (an empty path
# entry, which Python may resolve to the current directory).
export PYTHONPATH="/apdcephfs_cq10/share_1603164/user/yiwenyshao/independent/auden${PYTHONPATH:+:$PYTHONPATH}"
export PYTHONPATH="/apdcephfs_cq10/share_1603164/user/yiwenyshao/lhotse${PYTHONPATH:+:$PYTHONPATH}"

# export TORCH_DISTRIBUTED_DEBUG=OFF  # Reduce log overhead
# export NCCL_DEBUG=WARN  # Log only important NCCL issues
# export NCCL_SOCKET_IFNAME=eth1  # Use the correct network interface
# export NCCL_IB_DISABLE=0  # Enable InfiniBand if available
# export NCCL_P2P_DISABLE=0  # Allow GPU peer-to-peer
# export NCCL_ASYNC_ERROR_HANDLING=1  # Keep this for stability
# export NCCL_TIMEOUT=600  # Keep this in case of slow nodes

# export NCCL_DEBUG=INFO
# NCCL settings exported into the environment of everything launched after
# this point. See the NCCL environment-variable reference for exact semantics.

# Network interface selection, plus the two "disable" switches left at 0 so
# that GPU peer-to-peer and InfiniBand stay enabled.
export NCCL_SOCKET_IFNAME=eth1 \
       NCCL_P2P_DISABLE=0 \
       NCCL_IB_DISABLE=0

# InfiniBand-specific values (GID index, service level, CUDA support flag).
# NOTE(review): these look cluster-specific -- confirm before reusing elsewhere.
export NCCL_IB_GID_INDEX=3 \
       NCCL_IB_SL=3 \
       NCCL_IB_CUDA_SUPPORT=1
# Explicit HCA list, currently not applied:
#export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1

# Remaining knobs: argument-check switch and the LL threshold value.
export NCCL_CHECK_DISABLE=1 \
       NCCL_LL_THRESHOLD=16384


# export TORCH_DISTRIBUTED_DEBUG=INFO
# torchrun --nnodes=2 --nproc_per_node=8 \
#     --rdzv_id=100 --rdzv_backend=c10d \
#     --rdzv_endpoint="11.215.65.151:23456" \
#     train.py exp_dir=exp/auden_test_qy


# Run `torchrun` using automatically detected MASTER_ADDR
# MASTER_ADDR=9.206.62.130
# MASTER_PORT=12345
# torchrun --nnodes=2 --nproc_per_node=8 \
#          --rdzv_id=100 \
#          --rdzv_backend=c10d \
#          --rdzv_endpoint="$MASTER_ADDR:$MASTER_PORT" \
#          train.py \
#          +exp_dir=exp/auden_zh_en_r3large_fp16_16gpu_icefall_standard_scaledadam \
#          ++data.max_duration=600 \
#          ++model.name=zipformer_adam \

# torchrun --nnodes=3 --nproc_per_node=8 \
#          --rdzv_id=100 \
#          --rdzv_backend=c10d \
#          --rdzv_endpoint="$MASTER_ADDR:$MASTER_PORT" \
#         train.py \
#         ++exp_dir=exp/auden_zh_dialect_r3large_mid_24gpu \
#         data.train_data_config=configs/train_data_dialect_config.yaml \
#         data.valid_sets="[data/test/aishell/cuts_fbank.jsonl.gz]" \


# Launch training with torchrun: 3 nodes x 8 processes per node, using the
# c10d rendezvous backend at $MASTER_ADDR:$MASTER_PORT.
#
# Required env vars: MASTER_ADDR, MASTER_PORT. The only assignments to them in
# this script are commented out, so they have to come from the calling
# environment (presumably the job scheduler -- confirm). Stop here with a
# clear message if either is missing or empty, rather than passing torchrun an
# empty ":" endpoint.
: "${MASTER_ADDR:?MASTER_ADDR must be set to the rendezvous host}"
: "${MASTER_PORT:?MASTER_PORT must be set to the rendezvous port}"

# Arguments after train.py are config overrides forwarded to the training
# script: experiment directory, training-data config, validation set list,
# the model's special-token list, and the attention-decoder switch.
torchrun --nnodes=3 --nproc_per_node=8 \
  --rdzv_id=100 \
  --rdzv_backend=c10d \
  --rdzv_endpoint="$MASTER_ADDR:$MASTER_PORT" \
  train.py \
  ++exp_dir=exp/auden_zh_dialect_r3large_st_english \
  data.train_data_config=configs/train_data_dialect_config.yaml \
  data.valid_sets="[data/test/aishell/cuts_fbank.jsonl.gz]" \
  ++model.special_tokens='[mandarin,min,wu,yue,english,unk_lang]' \
  ++model.use_attention_decoder=true