# Run configuration: batch sizes, optimizer settings, schedule, model name
# and dataset paths. How each key is interpreted is decided by the consuming
# script, which is not visible here.

# --- Batching ---
batch_size: 8
gradient_accumulation_steps: 1
val_batch_size: 32

# --- Optimizer ---
# Floats in scientific notation are written with an explicit decimal point
# ("1.0e-8", not "1e-8"). YAML 1.1 loaders such as PyYAML only resolve a
# scalar as a float when the mantissa contains a ".", otherwise the value is
# loaded as a string.
adam_epsilon: 1.0e-8
learning_rate: 5.0e-5
weight_decay: 5.0e-6

# --- Schedule (values presumably counted in optimizer steps; confirm) ---
checkpoint_every_step: 400
num_training_steps: 4000
max_length: 512
warmup_step: 20

# --- Model ---
lm_type: bert-base-uncased

# --- Data ---
label_path: datasets/rte/labels.txt
# NOTE(review): test_path and dev_path point to the same file. If that is
# intentional, reported "test" numbers are computed on the dev data; confirm.
test_path: datasets/rte/valid_whole.txt
dev_path: datasets/rte/valid_whole.txt
