# ------------------ public dialect data --------------------------
# Each list item describes one training manifest through the keys `hours`,
# `manifest`, `weights`, `source`, and optionally `note`.
# NOTE(review): `hours` is presumably the audio duration of the manifest and
# `weights` a sampling weight -- confirm against the code that loads this file.
# NOTE(review): both KeSpeech entries use `source: wav fbank`, a two-word
# value; presumably the loader treats it as a space-separated list -- confirm.
- hours: 860
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/kespeech/kespeech.jsonl.gz
  weights: 1
  source: wav fbank
- hours: 536
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/kespeech/kespeech_phase2.jsonl.gz
  weights: 1
  source: wav fbank
# Disabled entry, kept for reference only; its `note` gives the reason.
# - hours: 6
#   manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/gan/gan_cuts_train.jsonl.gz
#   weights: 1
#   note: GCDC https://aclanthology.org/L18-1036.pdf Bad transliteration, don't use!

# ------------------ public chinese data ------------------
# Three manifests, all with `source: patch`. The first one is stored under
# chinese/disturb/wenetspeech_denoise/, the other two under chinese/open_source/.
- hours: 10005
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/disturb/wenetspeech_denoise/wenetspeech_denoise_cuts.jsonl.gz
  weights: 1
  source: patch
- hours: 1000
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/open_source/aishell2/aishell2_cuts.jsonl.gz
  weights: 1
  source: patch
- hours: 150
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/chinese/open_source/aishell1/aishell1_cuts.jsonl.gz
  weights: 1
  source: patch

# ------------------- ailab private dialect data -------------------
# Single manifest, stored under dialect/cantonese/.
- hours: 4528
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/cantonese/cantonese_cuts.jsonl.gz
  weights: 1
  source: patch
# ------------------- private dialect data purchased by hunyuan/ailab -------------
# Every manifest in this section is stored under dialect/fangyan_buy/.
# Manifest named fangyan_guan_*:
- hours: 189
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_guan_郑州_TF-B202302151728236755.jsonl.gz
  weights: 1
  source: patch

# Manifests named fangyan_jin_* under dialect/fangyan_buy/.
- hours: 544
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_jin_兰州_King-ASR-384-18.jsonl.gz
  weights: 1
  source: patch

- hours: 141
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_jin_宁夏_King-ASR-384-23.jsonl.gz
  weights: 1
  source: patch

- hours: 490
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_jin_山西_King-ASR-384-16.jsonl.gz
  weights: 1
  source: patch

- hours: 86
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_jin_山西_TF-B202107131525220981.jsonl.gz
  weights: 1
  source: patch

# Manifests named fangyan_mandarin_* under dialect/fangyan_buy/.
# The filename segment after the prefix is a place name; several places have
# more than one manifest (for example a second file ending in `_dialogue`).
- hours: 1495
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_mandarin_东北_Liaoning_384_14_21.jsonl.gz
  weights: 1
  source: patch

- hours: 141
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_mandarin_东北_dongbei_dialogue.jsonl.gz
  weights: 1
  source: patch

- hours: 993
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_mandarin_云南_163.jsonl.gz
  weights: 1
  source: patch

- hours: 401
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_mandarin_云南_TF-B202302151734369686.jsonl.gz
  weights: 1
  source: patch

- hours: 31
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_mandarin_全国_132.jsonl.gz
  weights: 1
  source: patch

- hours: 47
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_mandarin_全国_662.jsonl.gz
  weights: 1
  source: patch

- hours: 163
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_mandarin_合肥_TF-B202302081451278939.jsonl.gz
  weights: 1
  source: patch

- hours: 491
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_mandarin_四川_sichuan.jsonl.gz
  weights: 1
  source: patch

- hours: 569
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_mandarin_四川_raw_data.jsonl.gz
  weights: 1
  source: patch

- hours: 225
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_mandarin_天津_King-ASR-384-10.jsonl.gz
  weights: 1
  source: patch

- hours: 427
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_mandarin_天津_tianjin.jsonl.gz
  weights: 1
  source: patch

- hours: 530
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_mandarin_山东_King-ASR-384-12.jsonl.gz
  weights: 1
  source: patch

- hours: 190
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_mandarin_桂林_TF-B202302151729522297.jsonl.gz
  weights: 1
  source: patch

- hours: 190
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_mandarin_武汉_1235.jsonl.gz
  weights: 1
  source: patch

- hours: 209
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_mandarin_武汉_wuhan_dialogue.jsonl.gz
  weights: 1
  source: patch

- hours: 489
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_mandarin_江西_jiangxi.jsonl.gz
  weights: 1
  source: patch

- hours: 86
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_mandarin_江西_jiangxi_dialogue.jsonl.gz
  weights: 1
  source: patch

- hours: 269
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_mandarin_河北_King-ASR-384-20.jsonl.gz
  weights: 1
  source: patch

- hours: 144
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_mandarin_河北_hebei_dialogue.jsonl.gz
  weights: 1
  source: patch

- hours: 283
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_mandarin_济南_jinan_dialogue.jsonl.gz
  weights: 1
  source: patch

# NOTE(review): the filename below contains full-width commas (U+FF0C), not
# ASCII commas. Presumably this matches the name on disk -- verify before
# normalizing it.
- hours: 496
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_mandarin_浙，江苏，安徽_wuyu.jsonl.gz
  weights: 1
  source: patch

- hours: 234
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_mandarin_湖南_159.jsonl.gz
  weights: 1
  source: patch

- hours: 496
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_mandarin_湖南_hunan.jsonl.gz
  weights: 1
  source: patch

- hours: 495
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_mandarin_福建_fujian.jsonl.gz
  weights: 1
  source: patch

- hours: 186
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_mandarin_西安_TF-B202303201443492179.jsonl.gz
  weights: 1
  source: patch

- hours: 421
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_mandarin_长沙_TF-B202302151434423506.jsonl.gz
  weights: 1
  source: patch

# Manifests named fangyan_min_* under dialect/fangyan_buy/.
- hours: 352
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_min_潮汕_TF-B202107131524463806.jsonl.gz
  weights: 1
  source: patch

- hours: 968
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_min_闽南_TF202212221533215938.jsonl.gz
  weights: 1
  source: patch

- hours: 141
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_min_闽南_raw_data.jsonl.gz
  weights: 1
  source: patch

# Manifests named fangyan_wu_* under dialect/fangyan_buy/.
- hours: 525
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_wu_上海_157.jsonl.gz
  weights: 1
  source: patch

- hours: 493
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_wu_上海_TF-B202107131523103677.jsonl.gz
  weights: 1
  source: patch

- hours: 234
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_wu_杭州_159.jsonl.gz
  weights: 1
  source: patch

# NOTE(review): the filename below mentions "176h" while `hours` is 162 --
# confirm which figure is the intended one.
- hours: 162
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_wu_苏州_APY161101007_176h_suzhou.jsonl.gz
  weights: 1
  source: patch

# Manifests named fangyan_yue_* under dialect/fangyan_buy/.
- hours: 635
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_yue_香港_A202103101615378105.jsonl.gz
  weights: 1
  source: patch

- hours: 381
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_yue_香港_1643.jsonl.gz
  weights: 1
  source: patch

# Disabled entry, kept for reference; its `note` records a sampling rate of
# 48000. NOTE(review): presumably excluded for that reason -- confirm.
# - hours: 500
#   manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_yue_香港_1644.jsonl.gz
#   weights: 1
#   source: patch
#   note: sampling rate 48000

# A third fangyan_mandarin_武汉_* manifest; it is listed here, after the
# fangyan_yue_* entries, rather than next to the other two 武汉 manifests.
- hours: 321
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_mandarin_武汉_2094.jsonl.gz
  weights: 1
  source: patch


# # ------------- has alignment issue, DON'T USE! -----------------
# NOTE: `hours` was never recorded for this entry; fill it in before re-enabling.
# - hours:
#   manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/dialect/fangyan_buy/fangyan_yue_香港_2087.jsonl.gz
#   weights: 1
#   source: patch
# # ------------------------------------------------------------------------

# ----------------------------------- Public English --------------------------------------------
# All entries in this section use `source: wav`.
# The first three manifests are the LibriSpeech train-other-500,
# train-clean-360 and train-clean-100 files under english/librispeech/.
- hours: 500
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/english/librispeech/librispeech_train-other-500.jsonl.gz
  weights: 1
  source: wav
- hours: 360
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/english/librispeech/librispeech_train-clean-360.jsonl.gz
  weights: 1
  source: wav
- hours: 100
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/english/librispeech/librispeech_train-clean-100.jsonl.gz
  weights: 1
  source: wav
# This manifest is stored under code_switch/talcs/ and tagged
# `note: code-switch`, although it sits under the English header.
- hours: 555
  manifest: /apdcephfs_cq12/share_302080740/data/asr_train_data/manifests/code_switch/talcs/tal_csasr_train_set.jsonl.gz
  weights: 1
  source: wav
  note: code-switch
