{
    "data_paths": "data/processed/pretrain/arrow/all",
    "valid_paths": null,
    "resume_path": null,
    "dataloader_num_workers": 8,
    "block_size": 4096,
    "global_rope_theta": 10000.0,
    "enhanced": false,
    "encoder_layers_num": 10,
    "decoder_layers_num": 2,
    "hidden_size": 768,
    "intermediate_size": 3072,
    "num_attention_heads": 8,
    "num_key_value_heads": 4,
    "head_dim": 128,
    "data_size": 1.0,
    "output_dir": "./models/pretrain/",
    "overwrite_output_dir": true,
    "max_steps": 40000,
    "num_train_epochs": 1,
    "save_steps": 5000,
    "logging_steps": 10,
    "eval_steps": 5000,
    "per_device_train_batch_size": 4,
    "gradient_accumulation_steps": 4,
    "lr_scheduler_type": "cosine",
    "learning_rate": 3e-4,
    "warmup_steps": 2500,
    "save_total_limit": 5,
    "prediction_loss_only": true,
    "bf16": true,
    "report_to": "wandb",
    "logging_dir": "~/tf-logs/",
    "eval_strategy": "steps",
    "save_strategy": "steps",
    "logging_strategy": "steps",
    "logging_first_step": true
}
