# Recipe-wide epoch range: start at epoch 0, end at epoch 3.
# The end value matches the end_epoch (3.0) of the learning-rate schedule
# declared under `training_modifiers`.
modifiers:
  - !EpochRangeModifier
    start_epoch: 0
    end_epoch: 3

# Learning-rate schedule and pruning schedule applied during training.
training_modifiers:
  # Learning rate: `cyclic_linear` function over epochs 0.0-3.0, bounded by
  # init_lr and final_lr, with a cycle length of 0.25 epochs (the same value
  # as the pruning modifier's update_frequency).
  - !LearningRateFunctionModifier
    start_epoch: 0.0
    end_epoch: 3.0
    lr_func: cyclic_linear
    cycle_epochs: 0.25
    # Exponent floats carry an explicit decimal point: YAML 1.1 resolvers
    # (e.g. PyYAML) read a bare `5e-4` as a string rather than a float.
    init_lr: 5.0e-4
    final_lr: 5.0e-6

  # Pruning: sparsity goes from 0.7 to 0.97 between epoch 0 and epoch 2.75,
  # updated every 0.25 epochs with cubic interpolation. The schedule ends
  # 0.25 epochs before the learning-rate schedule does (2.75 vs 3.0).
  - !OBSPruningModifier
    # Entries prefixed with `re:` are patterns over parameter names; they
    # select the attention (query/key/value/output) and feed-forward
    # (intermediate/output) dense weights of every `bert.encoder.layer`.
    params:
      - "re:bert.encoder.layer.*.attention.self.query.weight"
      - "re:bert.encoder.layer.*.attention.self.key.weight"
      - "re:bert.encoder.layer.*.attention.self.value.weight"
      - "re:bert.encoder.layer.*.attention.output.dense.weight"
      - "re:bert.encoder.layer.*.intermediate.dense.weight"
      - "re:bert.encoder.layer.*.output.dense.weight"
    init_sparsity: 0.7
    final_sparsity: 0.97
    start_epoch: 0
    end_epoch: 2.75
    update_frequency: 0.25
    inter_func: cubic
    # NOTE(review): presumably the sparsity target is applied across all
    # matched params together rather than per layer - confirm in the
    # modifier's documentation.
    global_sparsity: true
    mask_type: unstructured
    # NOTE(review): num_grads, damp and fisher_block_size look like the
    # settings of the modifier's Fisher/Hessian approximation - confirm
    # exact semantics before tuning.
    num_grads: 1024
    # Explicit decimal point for the same YAML 1.1 float-resolution reason
    # as the learning rates above.
    damp: 1.0e-7
    fisher_block_size: 50
    grad_sampler_kwargs:
      batch_size: 16

# Knowledge-distillation settings for the run.
distillation_modifiers:
  - !DistillationModifier
    # NOTE(review): hardness presumably weights the distillation loss against
    # the original task loss (1.0 = distillation only) - confirm in the
    # modifier's documentation.
    hardness: 1.0
    temperature: 5.5
    # Only the `logits` output key is listed for distillation.
    distill_output_keys: [logits]
