#!/bin/bash
#
# Tokenizes a JSON corpus by invoking tools/preprocess_data.py with the
# GPT2BPETokenizer tokenizer type, a GPT-2 vocab file and a merges file.
# NOTE(review): the flags look like Megatron-LM's preprocessing tool; confirm.
#
# Usage:
#   Run from the directory that contains tools/ (the tool path is relative).
#
# Optional environment overrides (the defaults are the values this script
# has always used, so running it with no overrides is unchanged):
#   PREPROCESS_INPUT          path passed to --input
#   PREPROCESS_OUTPUT_PREFIX  value passed to --output-prefix
#   PREPROCESS_VOCAB_FILE     path passed to --vocab-file
#   PREPROCESS_MERGE_FILE     path passed to --merge-file
#   PREPROCESS_WORKERS        value passed to --workers
#
# Exit status: that of the python process, or 1 if a required file is missing.
set -euo pipefail

readonly tool_path="tools/preprocess_data.py"
readonly input_json="${PREPROCESS_INPUT:-/workspace/dataset/dataset/pile_OpenWebText2_onefile_clean_after_dedup.json}"
readonly output_prefix="${PREPROCESS_OUTPUT_PREFIX:-my-gpt2}"
readonly vocab_file="${PREPROCESS_VOCAB_FILE:-/workspace/dataset/gpt2_vocabulary/gpt2-vocab.json}"
readonly merge_file="${PREPROCESS_MERGE_FILE:-/workspace/dataset/gpt2_vocabulary/gpt2-merges.txt}"
readonly workers="${PREPROCESS_WORKERS:-20}"

# Fail fast with a readable message instead of a traceback from the tool.
for required_path in "$tool_path" "$input_json" "$vocab_file" "$merge_file"; do
  if [[ ! -e "$required_path" ]]; then
    printf 'error: required file not found: %s\n' "$required_path" >&2
    exit 1
  fi
done

# Build the argument list as an array so values containing spaces stay intact.
preprocess_args=(
  --input "$input_json"
  --output-prefix "$output_prefix"
  --vocab-file "$vocab_file"
  --tokenizer-type GPT2BPETokenizer
  --merge-file "$merge_file"
  --workers "$workers"
  --append-eod
)

python "$tool_path" "${preprocess_args[@]}"