{
    "train_batch_size": 256,
    "gradient_accumulation_steps": 1,
    "steps_per_print": 50,

    "optimizer": {
	"type": "Adam",
	"params": {
	    "lr": 0.001,
	    "betas": [
		0.8,
		0.999
	    ],
	    "eps": 1e-8,
	    "weight_decay": 3e-7
	}
    },

    "zero_optimization": {
	"stage": 0
    },
    "zero_allow_untested_optimizer": true,
    "fp16": {
	"enabled": true,
	"auto_cast": true
    },
    "gradient_clipping": 0,
    "prescale_gradients": false,
    "cuda_visible_devices": 0,
    "wall_clock_breakdown" : false
}
