#!/usr/bin/env bash
#
# Preprocess one MuST-C en-><lang> language pair in three steps:
#   1. run preprocess_scripts/clean_mustc.py on the corpus,
#   2. run examples/speech_to_text/prep_mustc_data_raw.py to build tsv manifests,
#   3. run examples/speech_to_text/seg_mustc_data.py to split the audio.
#
# The script paths are relative, so run this from the directory that contains
# preprocess_scripts/ and examples/.
#
# Environment (both optional; defaults are the original hard-coded values):
#   MUSTC_ROOT  dataset root directory
#   lang        target language code used in the en-<lang> directory name

# Stop at the first failing step or unset variable so that later stages never
# run on partial output. (pipefail is not needed: there are no pipelines.)
set -eu

MUSTC_ROOT=${MUSTC_ROOT:-/path/to/datasets/MuST-C/fine}
lang=${lang:-fr}

echo "1. clean the source sentences in the corpus"
python3 preprocess_scripts/clean_mustc.py --data-root "${MUSTC_ROOT}" --lang "${lang}"

echo "2. convert raw data into tsv manifest"
python3 examples/speech_to_text/prep_mustc_data_raw.py --data-root "${MUSTC_ROOT}" --tgt-lang "${lang}"

echo "3. split audio files"
# Disabled: segmentation of every split into en-${lang}/segment/<split>.
# NOTE(review): presumably kept for reference; only the tst-COMMON command
# below is currently active.
# mkdir -p "${MUSTC_ROOT}/en-${lang}/segment/"
# python3 examples/speech_to_text/seg_mustc_data.py --data-root "${MUSTC_ROOT}/" --task st --lang "${lang}" --output "${MUSTC_ROOT}/en-${lang}/segment/train" --split train
# python3 examples/speech_to_text/seg_mustc_data.py --data-root "${MUSTC_ROOT}/" --task st --lang "${lang}" --output "${MUSTC_ROOT}/en-${lang}/segment/dev" --split dev
# python3 examples/speech_to_text/seg_mustc_data.py --data-root "${MUSTC_ROOT}/" --task st --lang "${lang}" --output "${MUSTC_ROOT}/en-${lang}/segment/tst-COMMON" --split tst-COMMON
# python3 examples/speech_to_text/seg_mustc_data.py --data-root "${MUSTC_ROOT}/" --task st --lang "${lang}" --output "${MUSTC_ROOT}/en-${lang}/segment/tst-HE" --split tst-HE

# Segment only the tst-COMMON split, writing under data/tst-COMMON/test_data.
python3 examples/speech_to_text/seg_mustc_data.py \
  --data-root "${MUSTC_ROOT}/" \
  --task st \
  --lang "${lang}" \
  --output "${MUSTC_ROOT}/en-${lang}/data/tst-COMMON/test_data" \
  --split tst-COMMON
