# Epoch range entry: start_epoch 0, end_epoch 30.
# The `!EpochRangeModifier` tag is application-defined, so this file can only
# be read by a loader that registers a constructor for it.
modifiers:
  - !EpochRangeModifier
    start_epoch: 0
    end_epoch: 30

# Learning-rate schedule plus the pruning step.
# The `!...` tags are application-defined; the consuming loader must register
# constructors for them.
#
# Exponent-form numbers are written with an explicit decimal point
# (`8.0e-5`, not `8e-5`): YAML 1.1 resolvers such as PyYAML only recognise a
# float when the mantissa contains a `.`, and otherwise yield a string.
training_modifiers:
  # Epochs 0.0-2.0: `linear` LR function, 8.0e-5 -> 8.0e-6.
  - !LearningRateFunctionModifier
    start_epoch: 0.0
    end_epoch: 2.0
    lr_func: linear
    init_lr: 8.0e-5
    final_lr: 8.0e-6

  # Epochs 2.0-30.0: `cyclic_linear` LR function with 4.0-epoch cycles between
  # the same two learning rates as above.
  # NOTE(review): cycle_epochs equals the pruning update_frequency below and
  # both start at epoch 2 -- presumably so each LR cycle lines up with one
  # pruning update; confirm.
  - !LearningRateFunctionModifier
    start_epoch: 2.0
    end_epoch: 30.0
    lr_func: cyclic_linear
    cycle_epochs: 4.0
    init_lr: 8.0e-5
    final_lr: 8.0e-6

  - !OBSPruningModifier
    # Parameter-name patterns selecting the weights this entry applies to.
    # NOTE(review): if the `re:` prefix marks a regular expression, the `.*`
    # in the last pattern can also span `<n>.attention`, so it overlaps with
    # the `attention.output.dense.weight` pattern -- confirm that is intended.
    params:
      - "re:bert.encoder.layer.*.attention.self.query.weight"
      - "re:bert.encoder.layer.*.attention.self.key.weight"
      - "re:bert.encoder.layer.*.attention.self.value.weight"
      - "re:bert.encoder.layer.*.attention.output.dense.weight"
      - "re:bert.encoder.layer.*.intermediate.dense.weight"
      - "re:bert.encoder.layer.*.output.dense.weight"
    # Sparsity schedule: 0.3 -> 0.8 over epochs 2-26, update_frequency 4.0,
    # `cubic` inter_func.
    init_sparsity: 0.3
    final_sparsity: 0.8
    start_epoch: 2
    end_epoch: 26
    update_frequency: 4.0
    inter_func: cubic
    # Canonical lowercase boolean (was `True`).
    global_sparsity: true
    mask_type: block4
    num_grads: 1024
    damp: 1.0e-7
    fisher_block_size: 32
    grad_sampler_kwargs:
      batch_size: 8

# Distillation settings: hardness 1.0, temperature 2.0, applied to the
# `start_logits` and `end_logits` output keys.
# The `!DistillationModifier` tag is application-defined; the consuming loader
# must register a constructor for it.
distillation_modifiers:
  - !DistillationModifier
    hardness: 1.0
    temperature: 2.0
    distill_output_keys: [start_logits, end_logits]
