# @package __global__
# Defaults list with the single entry `_self_`, i.e. this file's own content;
# no other config files are pulled in from here.
defaults:
  - _self_

# Identifier of the language model to use.
# NOTE(review): the name is resolved by code outside this file -- confirm the
# accepted values there.
lm_model: transformer_hier_lm_tts

# Codebook pattern settings for the "long" variant.
# NOTE(review): `modeling: delay` appears to select the `delay:` sub-mapping
# below as the active option set -- confirm against the code that reads it.
codebooks_pattern_long:
  modeling: delay
  delay:
    # Both options are set to 0 here; their meaning is defined by the consumer.
    flatten_first: 0
    empty_initial: 0

# Hyper-parameters grouped under `transformer_lm_hier_long`.
# The code that reads this mapping is not part of this file, so notes on keys
# that previously had no comment are marked NOTE(review) and should be
# confirmed against that code.
transformer_lm_hier_long:
  # NOTE(review): presumably model width, attention head count, layer count and
  # feed-forward expansion factor -- confirm.
  dim: 512
  num_heads: 8
  num_layers: 8
  hidden_scale: 4
  # NOTE(review): presumably the number of codebooks and the number of entries
  # per codebook -- confirm.
  n_q: 8
  card: 1024
  # `0.` is parsed as the float 0.0.
  dropout: 0.
  # null here; NOTE(review): presumably means "no separate learning rate for
  # the embeddings" -- confirm.
  emb_lr: null
  activation: gelu
  norm_first: false        # use pre-norm instead of post-norm
  bias_ff: true            # use bias in the feedforward layers
  bias_attn: true          # use bias in the attention layers
  bias_proj: true          # use bias in the output projections
  past_context: null
  causal: true
  custom: false                 # use the custom MHA (multi-head attention) implementation
  memory_efficient: false       # use flash attention
  attention_as_float32: false   # use float32 for the attention part;
                                # currently recommended when memory_efficient is true.
  layer_scale: null
  positional_embedding: sin     # positional embedding strategy (sin, rope, or sin_rope).
  xpos: false                   # apply xpos decay (rope only).
  sep_pos: true                # use separate positional embeddings for text and audio.
  checkpointing: none      # layer checkpointing method: none, torch or xformers_default.
                           # torch is the slowest but uses the least memory,
                           # xformers_default is somewhere in between.
                           # Note: `none` is the plain string "none", not YAML null.
  weight_init: null     # weight initialization (null, gaussian or uniform)
  depthwise_init: null  # depthwise initialization mode (null, current or global)
  zero_bias_init: false # initialize biases to zero; only applies when the linears
                        # have a bias and a weight_init method is set.
  norm: layer_norm             # normalization method to use in the transformer.
  # Cross-attention is disabled in this configuration.
  cross_attention: false
  qk_layer_norm: false
  qk_layer_norm_cross: false
  attention_dropout: null
  kv_repeat: 1
