# Training schedule: both modifiers below span epochs 0 through 8.
training_modifiers:
  # Presumably bounds the overall training run to the given epoch range
  # (inferred from the modifier name; confirm against the consuming library).
  - !EpochRangeModifier
    start_epoch: 0
    end_epoch: 8

  # Learning-rate schedule: `linear` function from init_lr (1.5e-4) to
  # final_lr (1.5e-6) between start_epoch and end_epoch.
  - !LearningRateFunctionModifier
    start_epoch: 0
    end_epoch: 8
    lr_func: linear
    init_lr: 1.5e-4
    final_lr: 1.5e-6

# Pruning stanza: one modifier applied over epochs 0 through 8 to the
# parameters listed under `params`.
pruning_modifiers:
  # Presumably holds the existing sparsity pattern of the matched weights
  # fixed during training (inferred from the modifier name; confirm).
  - !ConstantPruningModifier
    start_epoch: 0
    end_epoch: 8
    # Patterns carrying the `re:` prefix; these look like regular expressions
    # matched against parameter names, with `.*` standing in for the layer
    # index. NOTE(review): if so, the last pattern would also match the
    # attention.output.dense weights already listed; verify this is harmless.
    params:
      # Self-attention projections.
      - 're:bert.encoder.layer.*.attention.self.query.weight'
      - 're:bert.encoder.layer.*.attention.self.key.weight'
      - 're:bert.encoder.layer.*.attention.self.value.weight'
      - 're:bert.encoder.layer.*.attention.output.dense.weight'
      # Dense layers outside the attention sub-module.
      - 're:bert.encoder.layer.*.intermediate.dense.weight'
      - 're:bert.encoder.layer.*.output.dense.weight'

# Distillation stanza: a single modifier with no explicit epoch range set here.
distillation_modifiers:
  - !DistillationModifier
    # Presumably the weight given to the distillation loss relative to the
    # original task loss; 1.0 would then mean distillation loss only. TODO confirm.
    hardness: 1.0
    # Presumably the softmax temperature applied to the logits before they
    # are compared; TODO confirm.
    temperature: 5.5
    # Model output keys selected for distillation.
    distill_output_keys: [start_logits, end_logits]