#!/bin/bash

# Activate the conda env. A failed activation is reported on stderr but does
# not stop the script, matching the original keep-going behaviour.
. /opt/conda/bin/activate ||
  printf 'warning: could not source /opt/conda/bin/activate\n' >&2

# Root of the auden checkout; both the working directory and PYTHONPATH are
# derived from it so the long path is spelled out only once.
auden_root=/apdcephfs_cq10/share_1603164/user/yiwenyshao/independent/auden

# The launch command further down uses paths relative to the recipe directory
# (train_ds.py, configs/, exp/), so abort if that directory is unreachable
# rather than launching from the wrong place.
cd "${auden_root}/egs/asr" || {
  printf 'error: cannot cd to %s\n' "${auden_root}/egs/asr" >&2
  exit 1
}

# Put the checkout first on PYTHONPATH. The ':' separator is appended only when
# PYTHONPATH already has a value, which avoids a trailing empty path entry.
export PYTHONPATH="${auden_root}${PYTHONPATH:+:${PYTHONPATH}}"

# export TORCH_DISTRIBUTED_DEBUG=OFF  # Reduce log overhead
# export NCCL_DEBUG=WARN  # Log only important NCCL issues
# export NCCL_SOCKET_IFNAME=eth1  # Use the correct network interface
# export NCCL_IB_DISABLE=0  # Enable InfiniBand if available
# export NCCL_P2P_DISABLE=0  # Allow GPU peer-to-peer
# export NCCL_ASYNC_ERROR_HANDLING=1  # Keep this for stability
# export NCCL_TIMEOUT=600  # Keep this in case of slow nodes

# export NCCL_DEBUG=INFO
# NCCL settings exported to every process started from this script.
# Going by the notes kept in the commented-out block earlier in this file:
#   NCCL_SOCKET_IFNAME  selects the network interface,
#   NCCL_IB_DISABLE=0   keeps InfiniBand enabled if available,
#   NCCL_P2P_DISABLE=0  allows GPU peer-to-peer.
# The remaining values are cluster-specific tuning carried over unchanged;
# NOTE(review): confirm them against the NCCL documentation before editing.
#
# Disabled HCA selection, kept for reference:
#   export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1
export \
  NCCL_SOCKET_IFNAME=eth1 \
  NCCL_IB_DISABLE=0 \
  NCCL_IB_GID_INDEX=3 \
  NCCL_IB_SL=3 \
  NCCL_IB_CUDA_SUPPORT=1 \
  NCCL_P2P_DISABLE=0 \
  NCCL_CHECK_DISABLE=1 \
  NCCL_LL_THRESHOLD=16384


# export TORCH_DISTRIBUTED_DEBUG=INFO
# torchrun --nnodes=2 --nproc_per_node=8 \
#     --rdzv_id=100 --rdzv_backend=c10d \
#     --rdzv_endpoint="11.215.65.151:23456" \
#     train.py exp_dir=exp/auden_test_qy


# Run `torchrun`. MASTER_ADDR and MASTER_PORT are expected to come from the
# calling environment; the hard-coded values below are disabled.
# MASTER_ADDR=9.206.62.130
# MASTER_PORT=12345
# Launch training with torchrun: 3 nodes x 8 processes per node, using the
# c10d rendezvous backend.
#
# MASTER_ADDR and MASTER_PORT are not assigned in the visible part of this
# script, so they must be provided by the calling environment. Abort with a
# clear message if either is missing or empty, instead of passing torchrun a
# malformed rendezvous endpoint.
: "${MASTER_ADDR:?MASTER_ADDR must be set to the rendezvous host address}"
: "${MASTER_PORT:?MASTER_PORT must be set to the rendezvous port}"

# Options consumed by torchrun itself.
launcher_args=(
  --nnodes=3
  --nproc_per_node=8
  --rdzv_id=100
  --rdzv_backend=c10d
  --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}"
)

# Overrides handed to train_ds.py (presumably Hydra-style `+key=value` /
# `++key=value` syntax -- confirm against the training script). Keeping them
# in an array lets entries be commented out without breaking a chain of
# backslash continuations.
train_overrides=(
  +exp_dir=exp/auden_zh_en_r3large_noam_24gpu
  ++trainer.save_every_n=4
  ++data.max_duration=400
  ++trainer.num_epochs=100
  ++deepspeed_config=configs/ds_config_zero2_new.json
  ++trainer.start_epoch=3
  ++trainer.scheduler=noam
  ++trainer.base_lr=0.001
  # ++model.feedforward_dim="[2304, 3072, 6144, 9216, 6144, 3072]"
)

# Last command in the script, so its exit status becomes the script's status.
torchrun "${launcher_args[@]}" train_ds.py "${train_overrides[@]}"