# Flat key/value settings file. The per-key descriptions below are inferred
# from the key names only — TODO confirm against the code that loads this file.

# Presumably the initial learning rate — confirm with the consumer.
# Written with an explicit decimal point on purpose: YAML 1.1 loaders (e.g.
# PyYAML) only resolve scientific notation to a float when the mantissa
# contains a "."; a bare `1e-3` is loaded as the string "1e-3" instead.
BASE_LR: 1.0e-3
# Presumably the number of samples per training batch — confirm.
BATCH_SIZE: 65536
# Presumably the width of the hidden layer(s) — confirm.
HIDDEN_DIM: 2048
# Presumably a scale (in steps) for the learning-rate decay schedule; the exact
# schedule formula is not visible from this file — confirm.
LEARNING_RATE_DECAY_SCALE: 200000
# Presumably the total number of training steps — confirm.
NUM_STEPS: 1500000
# Presumably the count of inner (hidden) layers — confirm.
N_INNER_LAYERS: 1
# Presumably how often (in steps) metrics are recorded — confirm.
RECORD_INTERVAL: 100
# Relative path; what it is resolved against (working directory, repo root,
# ...) is not visible here. Presumably points at a saved teacher model — confirm.
TEACHER_PATH: teachers/teacher_16_1
