# Model definition: `_target_` names the class to build (tasks.mt3_net.MT3Net).
# NOTE(review): presumably resolved by a Hydra-style instantiate call - confirm.
_target_: tasks.mt3_net.MT3Net
# Workaround: these keys mirror the T5 config fields so this mapping can be
# handed to the T5 class.
config:
  architectures:
    - T5ForConditionalGeneration
  d_ff: 1024
  d_kv: 64
  d_model: 512
  # Special token ids; the decoder start token shares id 0 with the pad token.
  decoder_start_token_id: 0
  dropout_rate: 0.1
  pad_token_id: 0
  eos_token_id: 1
  unk_token_id: 2
  feed_forward_proj: gated-gelu
  initializer_factor: 1.0
  is_encoder_decoder: true
  # Written with a decimal point so YAML 1.1 loaders (e.g. PyYAML) resolve it
  # as a float; the bare form `1e-06` is read as a string by those loaders.
  layer_norm_epsilon: 1.0e-06
  model_type: t5
  # num_heads * d_kv = 384, which is not equal to d_model (512).
  # NOTE(review): presumably intentional - confirm.
  num_heads: 6
  num_decoder_layers: 8
  num_layers: 8
  output_past: true
  tie_word_embeddings: false
  vocab_size: 1536
  # NOTE(review): encoder_vocab_size and xl_context_length look like
  # project-specific additions to the T5 field set - confirm where they are read.
  encoder_vocab_size: 1024
  xl_context_length: 2048
  use_cache: false