# Runs train.py for the gpt2 model on the WikiText-103 raw train/validation/test
# text files. CUDA_VISIBLE_DEVICES=1 is set for this one command only (it is a
# command-prefix assignment, not an export).

# Dataset location: the three split files share a directory and a name stem.
corpus_dir=../wikitext103
corpus_stem=wikitext103_raw_v1

# Batch-size settings, kept together because they are numerically related:
# with the values below, gpu_count * per_gpu_batch * accum_steps = 1 * 8 * 16,
# which is exactly effective_batch (128).
# NOTE(review): presumably train.py expects that relationship to hold --
# confirm before changing any one of these in isolation.
gpu_count=1
per_gpu_batch=8
accum_steps=16
effective_batch=128

# Where train.py is told to write its output (passed as --save_path_prefix).
ckpt_prefix=./mle_wikitext103/

CUDA_VISIBLE_DEVICES=1 python train.py \
  --model_name gpt2 \
  --train_path "${corpus_dir}/${corpus_stem}_train.txt" \
  --dev_path "${corpus_dir}/${corpus_stem}_validation.txt" \
  --test_path "${corpus_dir}/${corpus_stem}_test.txt" \
  --margin 0.0 \
  --max_len 256 \
  --number_of_gpu "${gpu_count}" \
  --batch_size_per_gpu "${per_gpu_batch}" \
  --gradient_accumulation_steps "${accum_steps}" \
  --effective_batch_size "${effective_batch}" \
  --total_steps 40000 \
  --print_every 200 \
  --save_every 1000 \
  --learning_rate 2e-5 \
  --save_path_prefix "${ckpt_prefix}"