#!/usr/bin/env bash
#
# Launch /Code/Continued_Pretrain.py through `accelerate launch` with
# 8 processes on GPUs 0-7 (W&B project "Obscura", run "CP_2794M").
#
# Usage:
#   [HF_TOKEN=...] [WANDB_TOKEN=...] bash <this-script>
#
# Optional environment:
#   HF_TOKEN     value passed as --token       (fallback: literal "HF_TOKEN")
#   WANDB_TOKEN  value passed as --wandb_token (fallback: literal "WANDB_TOKEN")
#
# The fallbacks reproduce the placeholder strings that used to be
# hard-coded here, so running without those variables set behaves exactly
# as before.
#
# NOTE(review): both tokens still travel on the command line and are
# therefore visible in `ps` output; consider having the Python script read
# them from the environment directly.

set -euo pipefail

readonly TRAIN_SCRIPT="/Code/Continued_Pretrain.py"

# Options consumed by `accelerate launch` itself.
# Keep --num_processes in sync with the number of GPU ids listed in
# CUDA_VISIBLE_DEVICES on the final command.
launch_args=(
  --num_processes=8
  --main_process_port=29699
)

# Options passed through, in the original order, to the training script.
# Arrays are used instead of one long backslash-continued line so that a
# stray trailing space cannot silently truncate the argument list.
train_args=(
  # --- Data, credentials and experiment tracking -------------------------
  --dataset_name "cp_dataset_tok"
  --token "${HF_TOKEN:-HF_TOKEN}"
  --wandb_token "${WANDB_TOKEN:-WANDB_TOKEN}"
  --project_name "Obscura"
  --run_name "CP_2794M"

  # --- Run mode / model loading ------------------------------------------
  --do_eval True
  --do_train True
  --trust_remote_code True
  --low_cpu_mem_usage True

  # --- Optimisation ------------------------------------------------------
  # NOTE(review): Hugging Face TrainingArguments spells this field
  # `gradient_accumulation_steps` (plural). The singular form is kept
  # exactly as it was; confirm which spelling the script defines, since it
  # may only be accepted thanks to argparse prefix matching.
  --gradient_accumulation_step 2
  --optim "adamw_torch"
  --learning_rate 5e-5
  --weight_decay 0.1
  --tf32 True

  # --- Logging / evaluation cadence --------------------------------------
  --logging_steps 50
  --logging_strategy "steps"
  --eval_steps 1000
  --evaluation_strategy "steps"
  --lr_scheduler_type "cosine"
  --max_eval_samples 16384

  # --- Model, duration and output ----------------------------------------
  --model_name_or_path "BASE_2794M"
  --num_train_epochs 1.0
  # NOTE(review): "Pretraning" looks like a misspelling of "Pretraining";
  # left untouched because existing outputs/checkpoints may live there.
  --output_dir "/Outputs/Continued_Pretraning/Obscura/BASE_2794M"
  --overwrite_output_dir True

  # --- Batch sizes and preprocessing -------------------------------------
  --per_device_eval_batch_size 16
  --per_device_train_batch_size 16
  --preprocessing_num_workers 12

  # --- Reporting, checkpointing, reproducibility -------------------------
  --report_to "wandb"
  --save_strategy "steps"
  --save_steps 1000
  --seed 7789
  --validation_split_percentage 10
  --warmup_ratio 0.05

  # --- Dataloader / DDP --------------------------------------------------
  --dataloader_drop_last True
  --dataloader_num_workers 4
  --dataloader_pin_memory True
  --dataloader_persistent_workers True
  --ddp_find_unused_parameters False

  # --- Quantisation / LoRA -----------------------------------------------
  --llm_int8_threshold 6.0
  --lora_alpha 128
  --lora_r 256
  --lora_dropout 0.05
)

# The GPU list is set only for the launched command, as in the original.
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
  accelerate launch "${launch_args[@]}" "${TRAIN_SCRIPT}" "${train_args[@]}"
