# Model selection and architecture hyperparameters.
# NOTE(review): the exact meaning of each field is defined by the consuming
# code, which is not visible in this file -- confirm against the model class.
model_type: TransformerDistiller
model_conf:
    pos_enc: sinusoidal # choices: null, sinusoidal
    embed_unit: 256 # equal to att_unit
    # NOTE(review): the commented-out att_unit below (512) differs from
    # embed_unit (256) even though the note above says the two are equal --
    # confirm the intended value before re-enabling it.
    # att_unit: 512
    # presumably the number of attention heads -- TODO confirm
    head: 8
    # presumably the feed-forward hidden size -- TODO confirm
    unit: 1024
    # presumably the number of transformer layers -- TODO confirm
    layer: 6
    dropout_rate: 0.0
    tie_weights: false
    
# Dataset pipeline settings: tokenization, shuffling, sorting, batching and
# data augmentation.
lm_dataset_conf:
    # Tokenizer settings.
    token_conf:
        # presumably the accepted sequence-length range -- TODO confirm units
        min_len: 5
        max_len: 100
        bpe_model: data/bpe_model/spm_giga_xmly_1500.model
        token_type: 'bpe' # choices: chara, bpe, word_en
        to_lower: false # the BPE model only supports upper case; default: false
        cn_en_symbol: true # whether to add the cn/en symbol; default: true
    shuffle: true
    shuffle_conf:
        shuffle_size: 50000
    sort: true
    sort_conf:
        sort_size: 30000  # sort_size must be less than shuffle_size
    batch_conf:
        batch_type: dynamic
        # NOTE(review): both batch_size and max_tokens_in_batch are set; which
        # one applies for batch_type "dynamic" is decided by the loader --
        # confirm.
        batch_size: 64
        max_tokens_in_batch: 20000
    batch_shuffle: true
    batch_shuffle_conf:
        batch_shuffle_size: 100
    # Data augmentation is switched off here.
    # NOTE(review): sub_prob / del_prob are presumably only used when data_aug
    # is true -- confirm.
    data_aug: false
    sub_prob: 0.01  # probability of randomly substituting a homophone character
    del_prob: 0.01  # probability of randomly deleting a token

# Training-loop settings: run length, gradient handling, logging and
# checkpoint retention.
max_epoch: 50
grad_clip: 5.0
accum_grad: 4  # gradient accumulation
# NOTE(review): log_interval and save_steps are presumably counted in training
# steps -- confirm.
log_interval: 200
save_steps: 100
keep_last_k_ckpt: 10
# presumably the number of checkpoints averaged into the final model -- confirm
avg_ckpt_num: 5
keep_topk_dir: true

# Optimizer and learning-rate scheduler settings.
optim: Adam
optim_conf:
    lr: 0.001
    # betas is left commented out, so the optimizer's own default applies.
    # betas: [0.9, 0.999]
    weight_decay: 1.0e-06
scheduler_conf:
    # presumably: stop when the monitored metric has not improved for this
    # many epochs -- TODO confirm
    early_stop_patient_n_epochs: 10
    # presumably: a lower value of the monitored metric counts as better --
    # TODO confirm
    lower_better: true
    warmup_n_steps: 5000
    noam: true
    save_checkpoints_topk: 5

# Freeze the intermediate parameters of the student model; default: false.
# (The key keeps its original spelling, "intermedia".)
freeze_intermedia_params: false

# Distillation settings: method, loss weighting and distiller switches.
distill_type: Tiny
distill_loss_conf:
    # NOTE(review): presumably the epoch at which the loss composition
    # changes -- confirm against the trainer.
    convert_epoch: 10
    # Loss weights. alpha_nwp is 0.0 in this configuration.
    # NOTE(review): "soft" presumably refers to the soft-label (teacher) loss
    # and "nwp" to the next-word-prediction loss -- confirm.
    alpha_soft: 1.0
    alpha_nwp: 0.0
distiller_conf:
    # Only the "tiny" switch is enabled, in line with distill_type: Tiny above.
    using_minilm_distill: false
    using_tiny_distill: true

# Initialization of the model being trained, and the teacher used for
# distillation.
# NOTE(review): the paths are relative; presumably resolved against the
# working directory of the training script -- confirm.
model_init_type: finetune
pretrained_checkpoint: checkpoints/backbone/finetune/transformer_v6_3_ft_v5_4_no_drop/TransformerLM_9/TransformerLM_avg_5.pt

# Teacher checkpoint and the training config it was produced with.
teacher_model: checkpoints/distill/teacher/finetune/transformer_256_1024_18_v6_3_ft_v5_4/TransformerLM_avg_5.pt
teacher_config: checkpoints/distill/teacher/finetune/transformer_256_1024_18_v6_3_ft_v5_4/train.yaml
