model:
  _target_: tnp.models.tnpa.TNPA
  encoder: ${tnp_encoder}
  decoder: ${tnp_decoder}
  likelihood: ${likelihood}
  no_samples_rollout_pred: ${params.num_samples_pred}

tnp_encoder:
  _target_: tnp.models.tnpa.ARTNPEncoder
  transformer_encoder: ${transformer_encoder}
  xy_encoder: ${xy_encoder}

transformer_encoder:
  _target_: tnp.networks.transformer.TransformerEncoder
  mhsa_layer: ${mhsa_layer}
  num_layers: ${params.num_layers}

mhsa_layer:
  _target_: tnp.networks.attention_layers.MultiHeadSelfAttentionLayer
  embed_dim: ${params.embed_dim}
  num_heads: ${params.num_heads}
  head_dim: ${params.head_dim}
  feedforward_dim: ${params.embed_dim}
  norm_first: ${params.norm_first}

xy_encoder:
  _target_: tnp.networks.mlp.MLP
  in_dim: ${eval:'1 + ${params.dim_y} + ${params.dim_x}'}
  out_dim: ${params.embed_dim}
  num_layers: 2
  width: ${params.embed_dim}

tnp_decoder:
  _target_: tnp.models.tnp.TNPDecoder
  z_decoder: ${z_decoder}

z_decoder:
  _target_: tnp.networks.mlp.MLP
  in_dim: ${params.embed_dim}
  out_dim: ${eval:'2 * ${params.dim_y}'}
  num_layers: 2
  width: ${params.embed_dim}

likelihood:
  _target_: tnp.likelihoods.gaussian.HeteroscedasticNormalLikelihood

optimiser:
  _target_: torch.optim.AdamW
  _partial_: True
  lr: 5.0e-4

params:
  # Model + Training Params
  epochs: 200
  embed_dim: 128
  num_heads: 8
  head_dim: 16
  norm_first: True
  num_layers: 5

  # Samples to be used when using monte carlo prediction (ie unrolling loop without teacher forcing for tnpa)
  num_samples_pred: 50


misc:
  project: tnpa-rbf-rangesame
  name: TNPA-L${params.num_layers}-H${params.num_heads}-D${params.embed_dim}
  resume_from_checkpoint: null
  gradient_clip_val: 0.5
  plot_interval: 10
