{
    "n_filters": 64,
    "strides": [8,5,4,2],
    "dimension": 1024,
    "time_dimension": 150,
    "semantic_dimension": 768,
    "bidirectional": true,
    "dilation_base": 2,
    "residual_kernel_size": 3,
    "n_residual_layers": 1,
    "lstm_layers": 2,
    "activation": "ELU",
    "codebook_size": 1024,
    "n_q": 8,
    "attention_heads": 8,

    "num_mels": 80,
    "num_freq": 1025,
    "n_fft": 1024,
    "hop_size": 240,
    "win_size": 1024,
    "fmin": 0,
    "fmax": 8000,
    "fmax_for_loss": null,
    "mel_loss_lambdas": [45, 1, 1, 1],
    "recon_loss_lambda": 500,
    "commitment_loss_lambda": 10,

    "sample_rate": 16000,
    "betas":[0.9, 0.99],
    "adam_b2": 0.9,
    "lr_decay": 0.98,
    "seed": 1234,
    "segment_size": 48000,
    "wd": 0,
    "num_workers": 4,
    "showpiece_num": 6,
    
    "project_name": "speechtokenizer-dooku",
    "epochs": 10, 
    "num_warmup_steps": 0,
    "num_ckpt_keep": 3,
    "batch_size": 6, 
    "learning_rate": 1e-4, 
    "intial_learning_rate":1e-3, 
    "log_steps": 10,
    "stdout_steps": 10,
    "save_model_steps": 1000,
    "distill_type": "t_axis",
    "distill_loss_lambda": 120
}
