import json
import gzip

input_path = "/apdcephfs_cq12/share_302080740/user/raytseng/data/ParaSpeechCaps/dev_1000.jsonl.gz"
output_path = "/apdcephfs_cq12/share_302080740/user/raytseng/data/ParaSpeechCaps/dev_1000_merged.jsonl.gz"


def merge_supervision_captions(cut: dict) -> dict:
    """Collapse all supervisions of one record into a single supervision.

    The returned record has exactly one entry in ``"supervisions"``: a copy
    of the first supervision whose ``"custom"`` field is replaced by
    ``{"caption": [<caption of every supervision, in order>]}``.

    Args:
        cut: A decoded JSON record. Each supervision is expected to provide
            ``supervision["custom"]["caption"]``.

    Returns:
        A new record; ``cut`` itself is not modified. A record whose
        ``"supervisions"`` is missing or empty is returned unchanged because
        there is nothing to merge.

    Raises:
        KeyError: If a supervision lacks ``"custom"`` or ``"caption"``.
    """
    supervisions = cut.get("supervisions")
    if not supervisions:
        # Nothing to merge; indexing [0] below would raise IndexError.
        return cut

    captions = [sup["custom"]["caption"] for sup in supervisions]

    # NOTE: "custom" is replaced wholesale, so any other keys stored in the
    # first supervision's "custom" dict are dropped from the output.
    merged_sup = {**supervisions[0], "custom": {"caption": captions}}
    return {**cut, "supervisions": [merged_sup]}


def main() -> None:
    """Rewrite ``input_path`` into ``output_path`` with merged supervisions.

    Both files are gzip-compressed JSON Lines (one JSON object per line).
    """
    # Pin the encoding: without it, text-mode gzip falls back to the locale's
    # preferred encoding, which is not guaranteed to be UTF-8.
    with gzip.open(input_path, "rt", encoding="utf-8") as fin, \
            gzip.open(output_path, "wt", encoding="utf-8") as fout:
        for line in fin:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline) instead of
                # failing in json.loads.
                continue
            merged = merge_supervision_captions(json.loads(line))
            fout.write(json.dumps(merged) + "\n")


if __name__ == "__main__":
    main()