# Input data settings: audio segment parameters and text tokenization.
data:
    # NOTE(review): presumably samples per second (Hz) — confirm against the audio loader.
    sampling_rate: 32000
    # Length of one audio segment; the key name gives the unit as seconds.
    segment_seconds: 10
    # Same identifier as `model.decoder.text_decoder`; presumably the two must
    # be kept in sync — confirm with the consumer.
    tokenizer_type: "HuggingFaceTB/SmolLM2-135M"
    # NOTE(review): presumably the token length text is padded/truncated to — confirm.
    text_tokenization_len: 129

# Model settings: audio encoder, text decoder, and the model type selector.
model:
    # Audio encoder settings.
    encoder:
        audioenc_name: "HTSAT"
        transformer_embed_dim: 768
        out_emb: 768
        # NOTE(review): presumably the width the encoder output is projected to
        # before reaching the decoder — confirm it matches the hidden size of
        # `decoder.text_decoder`.
        d_proj: 576
    # Text decoder settings.
    decoder:
        # Same identifier as `data.tokenizer_type`; presumably the two must be
        # kept in sync — confirm with the consumer.
        text_decoder: "HuggingFaceTB/SmolLM2-135M"
        # NOTE(review): presumably the number of prefix positions passed to the
        # decoder — confirm how this value is derived before changing it.
        prefix_length: 389
    model_type: "Mellow"
