# NOTE(review): this looks like a Hydra-style defaults list, in which case the
# settings in this file are layered on top of `training-default-attention-test`
# -- confirm against how the config is loaded.
defaults:
  - training-default-attention-test

# Epoch counts for training and pre-training (per the key names).
# The commented-out lines are much smaller alternative values
# (presumably for quick debug runs -- confirm).
# nb_epochs_train: 4
nb_epochs_train: 800
# nb_epochs_pretrain: 2
nb_epochs_pretrain: 800

# Early stopping
# Alternative settings, currently disabled:
# earlystop: True
# earlystop_patience: 120
# earlystop_min_delta: 0.00001
# restore_best_weights: True
earlystop: loss
earlystop_patience: 120
# earlystop_patience: 2
earlystop_min_delta: 0.00002
restore_best_weights: true
earlystop_min_epochs: 2
earlystop_ema_alpha: 0.5  # EMA smoothing coefficient (0-1)
earlystop_restart_epochs: 0  # number of fine-tuning epochs after early stopping
earlystop_restart_lr_factor: 0.1  # learning-rate ratio used for the restart

# Regularizers
# NOTE(review): "LB" / "UB" presumably stand for lower / upper bound, each with
# an L2 penalty strength and a threshold -- confirm against the training code.
LB_L2_strength: 100
# Written with an explicit decimal point (1.0e-3 rather than 1e-3) so that
# YAML 1.1 loaders such as PyYAML resolve it as a float instead of a string.
LB_L2_thresh: 1.0e-3
UB_L2_strength: 0
UB_L2_thresh: 10

# LR scheduler
# 'cosine' or None / 'null' are the only options for now.
# NOTE(review): in YAML a quoted 'null' is a string while a bare null is a real
# null value -- confirm which form the consuming code expects.
lr_scheduler: 'cosine'

# Learning rate
# Written with an explicit decimal point (1.0e-2 rather than 1e-2) so that
# YAML 1.1 loaders such as PyYAML resolve it as a float instead of a string.
lr: 1.0e-2

# Pruning
is_prune: false
nb_epochs_retrain: 200
# nb_epochs_retrain: 2
prune_percentage_start: 0.40
# Tolerable performance drop, expressed as a fraction (e.g. 0.03 -> 3%).
tolerance: 0.02
# Pruning precisions, processed one by one in list order. For example, with
# [0.1, 0.05] the process first finds the most heavily pruned model (in terms
# of connection sparsity) at 0.1 precision (e.g. 50%); once the performance
# drop exceeds the configured tolerance, it moves on to the next precision,
# i.e. 0.05 (5%) in this case.
prune_precision: [0.1, 0.05]
max_prune_percentage: 0.8
is_plot_pruning: true
is_pruning_ver: true