# ------------------ Chinese data ------------------
# Each list item describes one manifest:
#   hours    - integer (presumably the corpus duration in hours; TODO confirm
#              how the consumer uses it)
#   manifest - absolute path to a .jsonl.gz file
#   lang     - optional language tag; every entry in this group sets `zh`
# Same `hours` value as the wenetspeech entry that follows; the path suggests a
# denoised variant of the same corpus - TODO confirm.
- hours: 10005
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/disturb/wenetspeech_denoise/wenetspeech_denoise_cuts.jsonl.gz
  lang: zh
- hours: 10005
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/open_source/wenetspeech/wenetspeech_cuts.jsonl.gz
  lang: zh
- hours: 1000
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/open_source/aishell2/aishell2_cuts.jsonl.gz
  lang: zh
- hours: 150
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/open_source/aishell1/aishell1_cuts.jsonl.gz
  lang: zh
- hours: 237
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/multilingual/commonvoice/cv-corpus-20.0/zh-CN/zh-CN_train_cuts.jsonl.gz
  lang: zh
- hours: 222
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/multilingual/yodas/yodas_zh_filtered_train_cuts.jsonl.gz
  lang: zh

# ---------------------- ailab private code-switch data ----------------------
# All manifests in this group live under code_switch/.
# NOTE(review): no entry here sets `lang`. Presumably intentional for
# mixed-language audio - confirm how the consumer handles a missing `lang`.
- hours: 125
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/code_switch/ManEng_King-ASR-065/ManEng_King-ASR-065_cuts.jsonl.gz
- hours: 2328
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/code_switch/chonglang_7000h_maneng_part1/chonglang_7000h_maneng_part1_cuts.jsonl.gz
- hours: 199
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/code_switch/datatang_maneng_2019_200h_traindata/datatang_maneng_2019_200h_traindata_cuts.jsonl.gz
- hours: 3985
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/code_switch/chonglang_7000h_maneng_part2/chonglang_7000h_maneng_part2_cuts.jsonl.gz
- hours: 1000
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/code_switch/ManEng_201712/ManEng_201712_cuts.jsonl.gz
- hours: 732
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/code_switch/datatang_maneng_2019_700h_traindata/datatang_maneng_2019_700h_traindata_cuts.jsonl.gz

# ------------------ ailab old private Chinese data ------------------
# NOTE(review): entries in this section do not set `lang`, although every
# manifest lives under chinese/. Confirm the consumer's default for a missing
# `lang` before relying on it.

# --- manifests under chinese/buy/ ---
- hours: 863
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/buy/aishu_spontaneous_MDT2018S019_part1/aishu_spontaneous_MDT2018S019_part1_cuts.jsonl.gz
- hours: 117
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/buy/younger_children_mobile_1611/younger_children_mobile_1611_cuts.jsonl.gz
- hours: 714
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/buy/datatang_accent_132_662/datatang_accent_132_662_cuts.jsonl.gz
- hours: 1476
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/buy/datatang_1505h/datatang_1505h_cuts.jsonl.gz
- hours: 5219
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/buy/aishu_spontaneous_MDT2017S019/aishu_spontaneous_MDT2017S019_cuts.jsonl.gz
- hours: 1980
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/buy/datatang_chinese_sopn_1980h/datatang_chinese_sopn_1980h_cuts.jsonl.gz
- hours: 3015
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/buy/shichen_kouyin_3000h/shichen_kouyin_3000h_cuts.jsonl.gz
- hours: 236
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/buy/xiaoshuo_ximalay_2022/xiaoshuo_ximalay_2022_cuts.jsonl.gz
- hours: 2475
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/buy/aishu_spontaneous_201911_2500h/aishu_spontaneous_201911_2500h_cuts.jsonl.gz
- hours: 2095
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/buy/datatang_chinese_sopn_1200h/datatang_chinese_sopn_1200h_cuts.jsonl.gz
- hours: 187
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/buy/chinese_children_1611/chinese_children_1611_cuts.jsonl.gz
- hours: 11993
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/buy/data_split/split_all_cuts.jsonl.gz
- hours: 1045
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/buy/shicen_spon_1180_202007/shicen_spon_1180_202007_cuts.jsonl.gz
- hours: 997
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/buy/datatang_accent_1000h/datatang_accent_1000h_cuts.jsonl.gz
- hours: 461
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/buy/spontaneous_datatang/spontaneous_datatang_cuts.jsonl.gz
- hours: 1927
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/buy/spon_King-ASR-113/spon_King-ASR-113_cuts.jsonl.gz
- hours: 419
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/buy/spontaneous_demao/spontaneous_demao_cuts.jsonl.gz
# --- manifests under chinese/oteam/ ---
- hours: 40
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/oteam/infosec_corpus_4/infosec_corpus_4_cuts.jsonl.gz
- hours: 767
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/oteam/wxzl_corpus_2/wxzl_corpus_2_cuts.jsonl.gz
- hours: 997
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/oteam/sppd_corpus_1/sppd_corpus_1_cuts.jsonl.gz
- hours: 517
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/oteam/wxzl_corpus_6/wxzl_corpus_6_cuts.jsonl.gz
- hours: 247
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/oteam/infosec_corpus_3/infosec_corpus_3_cuts.jsonl.gz
- hours: 999
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/oteam/sppd_corpus_3/sppd_corpus_3_cuts.jsonl.gz
- hours: 5500
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/oteam/teg_ailab_corpus_3/teg_ailab_corpus_3_cuts.jsonl.gz
- hours: 2067
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/oteam/wxzl_corpus_4/wxzl_corpus_4_cuts.jsonl.gz
- hours: 698
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/oteam/wxzl_corpus_7/wxzl_corpus_7_cuts.jsonl.gz
- hours: 3961
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/oteam/infosec_corpus_2/infosec_corpus_2_cuts.jsonl.gz
- hours: 999
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/oteam/sppd_corpus_2/sppd_corpus_2_cuts.jsonl.gz
- hours: 1523
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/oteam/wxzl_corpus_5/wxzl_corpus_5_cuts.jsonl.gz
- hours: 1476
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/oteam/teg_ailab_corpus_2/teg_ailab_corpus_2_cuts.jsonl.gz
- hours: 141
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/oteam/wxzl_corpus_3/wxzl_corpus_3_cuts.jsonl.gz
- hours: 1000
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/oteam/teg_ailab_corpus_1/teg_ailab_corpus_1_cuts.jsonl.gz
- hours: 731
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/oteam/infosec_corpus_1/infosec_corpus_1_cuts.jsonl.gz
# --- manifests under chinese/online/ ---
- hours: 22
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/online/500h_label_mig_musicbox_20180316/500h_label_mig_musicbox_20180316_cuts.jsonl.gz
- hours: 1016
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/online/tencent_ad_202104_part1/tencent_ad_202104_part1_cuts.jsonl.gz
- hours: 349
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/online/wangzheshouban_20181206/wangzheshouban_20181206_cuts.jsonl.gz
- hours: 1798
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/online/qq_data_202012_part1/qq_data_202012_part1_cuts.jsonl.gz
- hours: 141
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/online/wangzheshouban_20181130/wangzheshouban_20181130_cuts.jsonl.gz
- hours: 3
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/online/wangzheshouban_20181030/wangzheshouban_20181030_cuts.jsonl.gz
- hours: 2267
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/online/tencent_ad_2020_part2/tencent_ad_2020_part2_cuts.jsonl.gz
- hours: 183
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/online/qq_data_201906_3/qq_data_201906_3_cuts.jsonl.gz
- hours: 594
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/online/tencent_ad_202104_part2/tencent_ad_202104_part2_cuts.jsonl.gz
- hours: 194
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/online/qq_data_201906_2/qq_data_201906_2_cuts.jsonl.gz
- hours: 60
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/online/500h_label_mig_musicbox_20180329/500h_label_mig_musicbox_20180329_cuts.jsonl.gz
- hours: 6328
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/online/qq_data_202101/qq_data_202101_cuts.jsonl.gz
- hours: 100
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/online/wangzheshouban_20181220/wangzheshouban_20181220_cuts.jsonl.gz
- hours: 67
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/online/wangzheshouban_1120/wangzheshouban_1120_cuts.jsonl.gz
- hours: 14
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/online/500h_label_mig_musicbox_20180312/500h_label_mig_musicbox_20180312_cuts.jsonl.gz
- hours: 165
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/online/wangzheshouban_20181116/wangzheshouban_20181116_cuts.jsonl.gz
- hours: 27
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/online/wangzheshouban_20181227/wangzheshouban_20181227_cuts.jsonl.gz

# High-WER parts: start
# The entries between the start/end markers are active list items (they are not
# commented out). NOTE(review): the markers alone do not say how these entries
# are treated differently, if at all - confirm with the consumer/owner.
- hours: 1178
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/oteam/wxzl_corpus_1/wxzl_corpus_1_cuts.jsonl.gz
- hours: 216
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/online/hepingjingying_202108/hepingjingying_202108_cuts.jsonl.gz

# --- `yinxiang` manifests under chinese/online/ ---
- hours: 426
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/online/htrs_yinxiang_0611/htrs_yinxiang_0611_cuts.jsonl.gz
- hours: 443
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/online/htrs_yinxiang_0817/htrs_yinxiang_0817_cuts.jsonl.gz
- hours: 283
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/online/1000h_label_yinxiang_0326/1000h_label_yinxiang_0326_cuts.jsonl.gz
- hours: 219
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/online/htrs_yinxiang_0530/htrs_yinxiang_0530_cuts.jsonl.gz
- hours: 350
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/online/htrs_yinxiang_0522/htrs_yinxiang_0522_cuts.jsonl.gz
- hours: 222
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/online/htrs_yinxiang_0716/htrs_yinxiang_0716_cuts.jsonl.gz
# NOTE(review): `yinxinag` (sic) in the path below looks like a transposition of
# `yinxiang`; verify the on-disk name before changing it.
- hours: 267
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/online/aishu_yinxinag_20181207/aishu_yinxinag_20181207_cuts.jsonl.gz
- hours: 112
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/online/htrs_yinxiang_0523_vad/htrs_yinxiang_0523_vad_cuts.jsonl.gz
- hours: 262
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/online/htrs_yinxiang_0725/htrs_yinxiang_0725_cuts.jsonl.gz
- hours: 129
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/online/htrs_yinxiang_0517/htrs_yinxiang_0517_cuts.jsonl.gz
- hours: 359
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/online/htrs_yinxiang_0523/htrs_yinxiang_0523_cuts.jsonl.gz
- hours: 422
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/online/htrs_yinxiang_0827/htrs_yinxiang_0827_cuts.jsonl.gz

# --- `htrs_tv` manifests under chinese/online/ ---
- hours: 6
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/online/htrs_tv_20181123/htrs_tv_20181123_cuts.jsonl.gz
- hours: 173
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/online/htrs_tv_20181128/htrs_tv_20181128_cuts.jsonl.gz
- hours: 33
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/online/htrs_tv_20181203/htrs_tv_20181203_cuts.jsonl.gz
- hours: 349
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/online/htrs_tv_20181211/htrs_tv_20181211_cuts.jsonl.gz
# High-WER parts: end

# --- manifests under chinese/online/ ---
# NOTE(review): none of the entries down to the next section banner set `lang`.
- hours: 20
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/online/500h_label_mig_musicbox_20180305/500h_label_mig_musicbox_20180305_cuts.jsonl.gz
- hours: 273
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/online/qq_data_201906_1/qq_data_201906_1_cuts.jsonl.gz
- hours: 2403
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/online/qq_data_202012_part2/qq_data_202012_part2_cuts.jsonl.gz
- hours: 1052
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/online/tencent_ad_2020_part1/tencent_ad_2020_part1_cuts.jsonl.gz
- hours: 29
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/online/wangzheshouban_20190103/wangzheshouban_20190103_cuts.jsonl.gz
# --- manifests under chinese/disturb/ ---
- hours: 1837
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/disturb/chongliang_net_data_2000h_20200710/chongliang_net_data_2000h_20200710_cuts.jsonl.gz
- hours: 1792
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/disturb/chongliang_net_data_2000h_202003/chongliang_net_data_2000h_202003_cuts.jsonl.gz
- hours: 551
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/disturb/zhongkouyin_ten_city_201210/zhongkouyin_ten_city_201210_cuts.jsonl.gz
# --- manifests under chinese/precise/ ---
- hours: 2921
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/precise/qq_data/qq_data_add_patch_cuts.jsonl.gz
- hours: 2525
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/precise/spon_talk/spon_talk_cuts.jsonl.gz
- hours: 3671
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/precise/normal_reading/normal_reading_cuts.jsonl.gz
- hours: 7118
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/precise/collection_reading/collection_reading_cuts.jsonl.gz
# --- manifests under chinese/net_data/ ---
- hours: 751
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/net_data/net_video_191107_1/net_video_191107_1_cuts.jsonl.gz
- hours: 1289
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/net_data/net_video_xinxi_202006/net_video_xinxi_202006_cuts.jsonl.gz
- hours: 89
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/net_data/net_video_191223/net_video_191223_cuts.jsonl.gz
- hours: 163
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/net_data/net_video_kuaishou_20190610/net_video_kuaishou_20190610_cuts.jsonl.gz
- hours: 370
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/net_data/htrs_wangyi_xueyuan_20180704/htrs_wangyi_xueyuan_20180704_cuts.jsonl.gz
- hours: 179
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/net_data/net_video_ocr_1111/net_video_ocr_1111_cuts.jsonl.gz
- hours: 118
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/net_data/biaobei_xueyuan_0401/biaobei_xueyuan_0401_cuts.jsonl.gz
- hours: 83
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/net_data/net_video_ocr_1029/net_video_ocr_1029_cuts.jsonl.gz
- hours: 918
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/net_data/net_video_xinxi_202007/net_video_xinxi_202007_cuts.jsonl.gz
- hours: 1566
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/net_data/net_video_xinxi_20191221/net_video_xinxi_20191221_cuts.jsonl.gz
- hours: 913
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/net_data/3000h_label_xueyuan_0524/3000h_label_xueyuan_0524_cuts.jsonl.gz
- hours: 876
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/net_data/net_video_191107_2/net_video_191107_2_cuts.jsonl.gz
- hours: 163
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/net_data/datatang_xueyuan_0402/datatang_xueyuan_0402_cuts.jsonl.gz
- hours: 751
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/net_data/net_video_xinxi_202004/net_video_xinxi_202004_cuts.jsonl.gz
- hours: 42
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/net_data/net_video_liyongle_20191118/net_video_liyongle_20191118_cuts.jsonl.gz
- hours: 815
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/net_data/net_video_from_auszhang_202003/net_video_from_auszhang_202003_cuts.jsonl.gz
- hours: 83
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/net_data/net_video_ocr_1105/net_video_ocr_1105_cuts.jsonl.gz
- hours: 1239
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/net_data/htrs_xueyuan_201901/htrs_xueyuan_201901_cuts.jsonl.gz
# ---------------------------- ailab new private Chinese data ----------------------------
# Both manifests live under chinese/buy/buy_20240331_zhibo/ and neither entry
# sets `lang`. NOTE(review): confirm the consumer's default for a missing `lang`.
- hours: 2000
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/buy/buy_20240331_zhibo/zhibo_22k_2000h.jsonl.gz
- hours: 5000
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/buy/buy_20240331_zhibo/zhibo_16k_5000h.jsonl.gz
# ---------------------------- yuanbao private data ----------------------------
# Both manifests live under chinese/yuanbao/ and neither entry sets `lang`.
# NOTE(review): confirm the consumer's default for a missing `lang`.
- hours: 40
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/yuanbao/yuanbao_20241102_1st_part1.jsonl.gz
- hours: 352
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/yuanbao/yuanbao_20241204_2st_3st.jsonl.gz

# ------------------ Japanese data ------------------
# Every entry in this group sets `lang: ja`.
- hours: 35389
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/japanese/ReazonSpeech/ReazonSpeech_cuts.jsonl.gz
  lang: ja
- hours: 3578
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/japanese/csig_ytb_2022/filtered/japanese_csig_ytb_2022.jsonl.gz
  lang: ja
- hours: 941
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/japanese/mdt_2020/japanese_mdt_2020_filtered_cuts_fix.jsonl.gz
  lang: ja
- hours: 19
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/multilingual/commonvoice/cv-corpus-20.0/ja/ja_train_cuts.jsonl.gz
  lang: ja
- hours: 499
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/multilingual/yodas/yodas_ja_train_cuts_filtered.jsonl.gz
  lang: ja

# ------------------ Korean data ------------------
# Every entry in this group sets `lang: ko`.
- hours: 4324
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/korean/csig_ytb_2022/filtered/korean_csig_ytb_2022.jsonl.gz
  lang: ko
- hours: 965
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/korean/KsponSpeech/KsponSpeech_cuts.jsonl.gz
  lang: ko
- hours: 2906
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/korean/KrespSpeech/KrespSpeech_cuts.jsonl.gz
  lang: ko
- hours: 2928
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/korean/KconfSpeech/KconfSpeech_cuts.jsonl.gz
  lang: ko
- hours: 4962
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/korean/MeetingSpeech/MeetingSpeech_cuts.jsonl.gz
  lang: ko
- hours: 2481
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/korean/GyeongsangSpeech/GyeongsangSpeech_cuts.jsonl.gz
  lang: ko
- hours: 1
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/multilingual/commonvoice/cv-corpus-20.0/ko/ko_train_cuts.jsonl.gz
  lang: ko
- hours: 1528
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/multilingual/yodas/yodas_ko_train_cuts_filtered.jsonl.gz
  lang: ko

# ------------------ English data ------------------
# Every entry in this group sets `lang: en`.
# NOTE(review): only the first four entries (libriheavy, mls_english,
# commonvoice, yodas) set `weights`; the remaining English entries omit it.
# The meaning of `weights` and the default applied when it is absent are
# defined by the consumer - confirm before adding or removing values.
- hours: 45751
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/english/libriheavy/libriheavy_large_cuts.jsonl.gz
  lang: en
  weights: 0.8
- hours: 44659
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/multilingual/mls/mls_english/mls_english_train_cuts.jsonl.gz
  lang: en
  weights: 0.8
- hours: 1778
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/multilingual/commonvoice/cv-corpus-20.0/en/en_train_cuts.jsonl.gz
  lang: en
  weights: 7
- hours: 3426
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/multilingual/yodas/yodas_en_filtered_train_cuts.jsonl.gz
  lang: en
  weights: 2
# Entries below carry no `weights` key.
- hours: 960
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/english/librispeech/librispeech_train.jsonl.gz
  lang: en
- hours: 10000
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/english/gigaspeech/gigaspeech_train.jsonl.gz
  lang: en
- hours: 522
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/multilingual/voxpopuli/en_train_cuts.jsonl.gz
  lang: en
- hours: 453
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/english/TEDLIUM/TEDLIUM_train_cuts.jsonl.gz
  lang: en
- hours: 77
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/english/AMI/AMI_train_cuts.jsonl.gz
  lang: en

# ------------------ French data ------------------
# Every entry in this group sets `lang: fr`.
- hours: 589
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/french/datatang_french_500h/datatang_french_500h_cuts_fix.jsonl.gz
  lang: fr
- hours: 1076
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/multilingual/mls/mls_french/mls_french_train_cuts.jsonl.gz
  lang: fr
- hours: 831
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/multilingual/commonvoice/cv-corpus-20.0/fr/fr_train_cuts.jsonl.gz
  lang: fr
- hours: 1423
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/multilingual/yodas/yodas_fr_train_cuts_filtered.jsonl.gz
  lang: fr
- hours: 205
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/multilingual/voxpopuli/fr_train_cuts.jsonl.gz
  lang: fr

# ------------------ Spanish data ------------------
# Every entry in this group sets `lang: es`.
- hours: 627
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/spanish/datatang_spanish_500h/datatang_spanish_500h_cuts_fix.jsonl.gz
  lang: es
- hours: 917
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/multilingual/mls/mls_spanish/mls_spanish_train_cuts.jsonl.gz
  lang: es
- hours: 502
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/multilingual/commonvoice/cv-corpus-20.0/es/es_train_cuts.jsonl.gz
  lang: es
- hours: 2399
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/multilingual/yodas/yodas_es_train_cuts_filtered.jsonl.gz
  lang: es
- hours: 151
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/multilingual/voxpopuli/es_train_cuts.jsonl.gz
  lang: es

# ------------------ Portuguese data ------------------
# Every entry in this group sets `lang: pt`.
- hours: 565
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/portuguese/datatang_portuguese_500h/datatang_portuguese_500h_cuts_fix.jsonl.gz
  lang: pt
- hours: 160
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/multilingual/mls/mls_portuguese/mls_portuguese_train_cuts.jsonl.gz
  lang: pt
- hours: 25
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/multilingual/commonvoice/cv-corpus-20.0/pt/pt_train_cuts.jsonl.gz
  lang: pt
- hours: 852
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/multilingual/yodas/yodas_pt_train_cuts_filtered.jsonl.gz
  lang: pt

# ------------------ Russian data ------------------
# Every entry in this group sets `lang: ru`.
- hours: 8048
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/russian/csig_ytb_2022/filtered/russian_csig_ytb_2022.jsonl.gz
  lang: ru
- hours: 1221
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/russian/Golos/Golos_cuts.jsonl.gz
  lang: ru
- hours: 874
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/russian/buriy_audiobook_2/buriy_audiobook_2_train_cuts_filtered.jsonl.gz
  lang: ru
- hours: 1651
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/russian/public_speech_and_radio/public_speech_and_radio_train_cuts_filtered.jsonl.gz
  lang: ru
- hours: 809
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/russian/public_youtube1120/public_youtube1120_train_cuts_filtered.jsonl.gz
  lang: ru
- hours: 37
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/multilingual/commonvoice/cv-corpus-20.0/ru/ru_train_cuts.jsonl.gz
  lang: ru
- hours: 2606
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/multilingual/yodas/yodas_ru_train_cuts_filtered.jsonl.gz
  lang: ru

# ------------------ Vietnamese data ------------------
# Every entry in this group sets `lang: vi`.
- hours: 1614
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/vietnamese/csig_ytb_2022/filtered/vietnamese_csig_ytb_2022.jsonl.gz
  lang: vi
- hours: 101
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/vietnamese/vlsp2020_vinai_100h/vlsp2020_vinai_100h_cuts.jsonl.gz
  lang: vi
- hours: 324
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/vietnamese/viet_bud500/viet_bud500_train_cuts_filtered.jsonl.gz
  lang: vi
- hours: 80
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/vietnamese/LSVSC/LSVSC_cuts.jsonl.gz
  lang: vi
- hours: 81
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/vietnamese/ViMD/ViMD_cuts.jsonl.gz
  lang: vi
- hours: 2
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/multilingual/commonvoice/cv-corpus-20.0/vi/vi_train_cuts.jsonl.gz
  lang: vi
- hours: 6048
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/multilingual/gigaspeech2/vi/vi_train_cuts.jsonl.gz
  lang: vi
- hours: 140
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/multilingual/yodas/yodas_vi_train_cuts_filtered.jsonl.gz
  lang: vi

# ------------------ Indonesian data ------------------
# Every entry in this group sets `lang: id`.
- hours: 1437
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/indonesian/csig_ytb_2022/filtered/indonesian_csig_ytb_2022.jsonl.gz
  lang: id
- hours: 7
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/multilingual/commonvoice/cv-corpus-20.0/id/id_train_cuts.jsonl.gz
  lang: id
- hours: 6352
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/multilingual/gigaspeech2/id/id_train_cuts.jsonl.gz
  lang: id
- hours: 442
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/multilingual/yodas/yodas_id_train_cuts_filtered.jsonl.gz
  lang: id