#!/usr/bin/env bash
#
# Launch train_1dformer.py through torchrun on a single node (--nnodes=1)
# with four worker processes (--nproc_per_node=4).
#
# NOTE(review): `model_path` and `dataset_path` look like placeholders;
#   substitute the real model / dataset locations before running.
# NOTE(review): `--output_dir /outputs` is an absolute path at the filesystem
#   root; confirm it exists and is writable on the target machine.
# NOTE(review): MIXED_PRECISION="fp16" is passed via the environment in
#   addition to `--fp16 True`; presumably one of them is consumed by the
#   launcher/training script -- confirm both are still needed.

# Exported so the training processes inherit it (presumably read by the
# Weights & Biases integration as the project name -- confirm).
export WANDB_PROJECT="ZETA"

# Keep --nproc_per_node in sync with the number of GPU ids listed in
# CUDA_VISIBLE_DEVICES (four ids, four processes).
#
# Every argument line except the LAST ends with a backslash. The last line
# must not: a trailing backslash there would splice whatever line follows
# into this command as extra arguments.
CUDA_VISIBLE_DEVICES="0,1,2,3" MIXED_PRECISION="fp16" torchrun --nproc_per_node=4 --nnodes=1 train_1dformer.py \
        --model_name_or_path model_path \
        --dataset_name dataset_path \
        --dataset_config_name wikitext-103-raw-v1 \
        --per_device_train_batch_size 8 \
        --per_device_eval_batch_size 12 \
        --do_train True \
        --do_eval True \
        --do_predict True \
        --fp16 True \
        --model_max_position_embeddings 1024 \
        --output_dir /outputs \
        --gradient_accumulation_steps 1 \
        --evaluation_strategy "steps" \
        --eval_steps 2000 \
        --save_steps 1000 \
        --num_train_epochs 8 \
        --seed=2222 \
        --save_total_limit 3 \
        --warmup_steps 4000 \
        --learning_rate 2e-4
