# Global parameters shared by the model sections in this file via YAML aliases.
globals:
  # `&latent_cond_len` defines the anchor; sections reuse the value with
  # the alias `*latent_cond_len`.
  latent_cond_len: &latent_cond_len 2

# Dataset, data-loading and evaluation settings for the Wiki dataset.
dataset:
  nice_name: Wiki
  name: wiki
  dataset_size: 10000  # the actual dataset is far larger than this value
  dim: 1
  # 390 = 360 + 30, i.e. old_dataloader.context_length plus
  # old_dataloader.prediction_length (presumably by design -- TODO confirm).
  seq_length: 390
  pred_length: 30
  # The two proportions sum to 1.0.
  train_proportion: 0.8
  val_proportion: 0.2
  seed: 42
  train_batch_size: 64
  gradient_accumulation_batch_size_multiplier: 4
  val_batch_size: 64

  noise_std: 0.0001
  latent_sigma: 0.1

  # Settings kept under the `old_dataloader` key.
  # Booleans are written in canonical lowercase form (`true`/`false`).
  old_dataloader:
    model: unconditional
    diffusion_config: diffusion_small_config
    normalization: mean
    use_features: false
    use_lags: false
    dataset: wiki2000_nips
    freq: 1D
    context_length: 360  # 360 for `D`
    prediction_length: 30  # 30 for `D`
    lr: 1.0e-3
    init_skip: false
    gradient_clip_val: 0.5
    max_epochs: 1000
    num_batches_per_epoch: 128
    batch_size: 64
    # Used only in the callback;
    # the final evaluation uses 100 samples.
    num_samples: 4
    sampler: ddpm
    sampler_params:
      guidance: quantile
      scale: 2
    use_validation_set: true
    eval_every: 50
    device: cuda:0
    setup: forecasting

  # Names of the metrics to compute.
  metric_to_compute:
    - crps
    - nll
    - nrmse

  # Names of the evaluation settings.
  evaluation_settings:
    - future_latent
    - future_observation


# Section `baseline_diffusion_model`: model_type MyBaselineDiffusionModel
# with objective flow_matching.
baseline_diffusion_model:

  objective: flow_matching
  model_type: MyBaselineDiffusionModel

  model:
    n_layers: 10
    filter_width: 4
    hidden_channel_size: 128
    num_transformer_heads: 32

  # Alias of globals.latent_cond_len.
  latent_cond_len: *latent_cond_len

  optimizer:
    lr: 1.0e-4
    # No `_` digit separator: `300_000` is an integer only under YAML 1.1
    # resolvers and a string under the YAML 1.2 core schema.
    max_train_steps: 300000
    warmup_steps: 1000


# Section `true_baseline_autoregressive`: model_type
# MyBaselineAutoregressiveModel with objective ml.
true_baseline_autoregressive:

  objective: ml
  model_type: MyBaselineAutoregressiveModel

  model:
    n_layers: 10
    filter_width: 4
    hidden_channel_size: 128
    num_transformer_heads: 32
    parametrization: std
    predict_cov: true  # canonical lowercase boolean

  # Alias of globals.latent_cond_len.
  latent_cond_len: *latent_cond_len

  optimizer:
    lr: 1.0e-4
    # No `_` digit separator: `300_000` is an integer only under YAML 1.1
    # resolvers and a string under the YAML 1.2 core schema.
    max_train_steps: 300000
    warmup_steps: 1000

# Section `baseline_autoregressive`: model_type MyAutoregressiveModel with
# objective ml. Differs from `my_autoregressive` only in `objective`
# (ml vs mse) and `predict_cov` (true vs false).
baseline_autoregressive:

  objective: ml
  model_type: MyAutoregressiveModel

  model:
    n_layers: 10
    filter_width: 4
    hidden_channel_size: 128
    num_transformer_heads: 32
    parametrization: std
    predict_cov: true  # canonical lowercase boolean

  # Alias of globals.latent_cond_len.
  latent_cond_len: *latent_cond_len

  optimizer:
    lr: 1.0e-4
    # No `_` digit separator: `300_000` is an integer only under YAML 1.1
    # resolvers and a string under the YAML 1.2 core schema.
    max_train_steps: 300000
    warmup_steps: 1000


# Section `my_autoregressive`: model_type MyAutoregressiveModel with
# objective mse and covariance prediction disabled.
my_autoregressive:

  objective: mse
  model_type: MyAutoregressiveModel

  model:
    n_layers: 10
    filter_width: 4
    hidden_channel_size: 128
    num_transformer_heads: 32
    parametrization: std
    predict_cov: false  # canonical lowercase boolean

  # Alias of globals.latent_cond_len.
  latent_cond_len: *latent_cond_len

  optimizer:
    lr: 1.0e-4
    # No `_` digit separator: `300_000` is an integer only under YAML 1.1
    # resolvers and a string under the YAML 1.2 core schema.
    max_train_steps: 300000
    warmup_steps: 1000

# Section `my_non_probabilistic`: model_type MyNonProbabilisticModel with
# objective mse and covariance prediction disabled.
my_non_probabilistic:

  objective: mse
  model_type: MyNonProbabilisticModel

  model:
    n_layers: 10
    filter_width: 4
    hidden_channel_size: 128
    num_transformer_heads: 32
    parametrization: std
    predict_cov: false  # canonical lowercase boolean

  # Alias of globals.latent_cond_len.
  latent_cond_len: *latent_cond_len

  optimizer:
    lr: 1.0e-4
    # No `_` digit separator: `300_000` is an integer only under YAML 1.1
    # resolvers and a string under the YAML 1.2 core schema.
    max_train_steps: 300000
    warmup_steps: 1000

# Section `my_neural_sde`: model_type MyNeuralSDE with objective
# drift_matching; the model is set to predict the drift.
my_neural_sde:

  objective: drift_matching
  model_type: MyNeuralSDE

  model:
    n_layers: 10
    filter_width: 4
    hidden_channel_size: 128
    num_transformer_heads: 32
    predict_flow_or_drift: drift

  # Alias of globals.latent_cond_len.
  latent_cond_len: *latent_cond_len

  optimizer:
    lr: 1.0e-4
    # No `_` digit separator: `300_000` is an integer only under YAML 1.1
    # resolvers and a string under the YAML 1.2 core schema.
    max_train_steps: 300000
    warmup_steps: 1000

# Section `my_neural_ode`: objective flow_matching with
# `predict_flow_or_drift: flow`.
my_neural_ode:

  objective: flow_matching
  # Same model_type as `my_neural_sde`; the two sections differ in
  # `objective` and `predict_flow_or_drift`.
  # NOTE(review): presumably the class is shared on purpose -- confirm.
  model_type: MyNeuralSDE

  model:
    n_layers: 10
    filter_width: 4
    hidden_channel_size: 128
    num_transformer_heads: 32
    predict_flow_or_drift: flow

  # Alias of globals.latent_cond_len.
  latent_cond_len: *latent_cond_len

  optimizer:
    lr: 1.0e-4
    # No `_` digit separator: `300_000` is an integer only under YAML 1.1
    # resolvers and a string under the YAML 1.2 core schema.
    max_train_steps: 300000
    warmup_steps: 1000


# Section `my_diffusion_model`: model_type MyDiffusionModel with objective
# flow_matching.
my_diffusion_model:

  objective: flow_matching
  model_type: MyDiffusionModel

  model:
    n_layers: 10
    filter_width: 4
    hidden_channel_size: 128
    num_transformer_heads: 32

  # Alias of globals.latent_cond_len.
  latent_cond_len: *latent_cond_len

  optimizer:
    # NOTE(review): 1.0e-5 here, while the other model sections in this
    # file use 1.0e-4 -- confirm this is intentional.
    lr: 1.0e-5
    # No `_` digit separator: `300_000` is an integer only under YAML 1.1
    # resolvers and a string under the YAML 1.2 core schema.
    max_train_steps: 300000
    warmup_steps: 1000

# Section `my_autoregressive_rnn`: model_type
# MyReparameterizedAutoregressiveRNNModel with objective mse.
my_autoregressive_rnn:

  objective: mse
  model_type: MyReparameterizedAutoregressiveRNNModel

  model:
    hidden_size: 128
    parametrization: mixed
    predict_cov: false  # canonical lowercase boolean

  # Alias of globals.latent_cond_len.
  latent_cond_len: *latent_cond_len

  optimizer:
    lr: 1.0e-4
    # No `_` digit separator: `300_000` is an integer only under YAML 1.1
    # resolvers and a string under the YAML 1.2 core schema.
    max_train_steps: 300000
    warmup_steps: 1000

# Section `true_baseline_autoregressive_rnn`: model_type
# MyBaselineAutoregressiveRNNModel with objective ml.
true_baseline_autoregressive_rnn:

  objective: ml
  model_type: MyBaselineAutoregressiveRNNModel

  model:
    hidden_size: 128
    parametrization: std
    predict_cov: true  # canonical lowercase boolean

  # Alias of globals.latent_cond_len.
  latent_cond_len: *latent_cond_len

  optimizer:
    lr: 1.0e-4
    # No `_` digit separator: `300_000` is an integer only under YAML 1.1
    # resolvers and a string under the YAML 1.2 core schema.
    max_train_steps: 300000
    warmup_steps: 1000

# Section `baseline_autoregressive_rnn`: model_type
# MyBaselineAutoregressiveRNNModel with objective ml.
# NOTE(review): every value here matches `true_baseline_autoregressive_rnn`,
# whereas the non-RNN sections `baseline_autoregressive` and
# `true_baseline_autoregressive` use different model_type values --
# confirm that model_type below is the intended one.
baseline_autoregressive_rnn:

  objective: ml
  model_type: MyBaselineAutoregressiveRNNModel

  model:
    hidden_size: 128
    parametrization: std
    predict_cov: true  # canonical lowercase boolean

  # Alias of globals.latent_cond_len.
  latent_cond_len: *latent_cond_len

  optimizer:
    lr: 1.0e-4
    # No `_` digit separator: `300_000` is an integer only under YAML 1.1
    # resolvers and a string under the YAML 1.2 core schema.
    max_train_steps: 300000
    warmup_steps: 1000

# Section `my_diffusion_model_rnn`: model_type MyDiffusionRNNModel with
# objective flow_matching.
my_diffusion_model_rnn:

  objective: flow_matching
  model_type: MyDiffusionRNNModel

  model:
    hidden_size: 128

  # Alias of globals.latent_cond_len.
  latent_cond_len: *latent_cond_len

  optimizer:
    lr: 1.0e-4
    # No `_` digit separator: `300_000` is an integer only under YAML 1.1
    # resolvers and a string under the YAML 1.2 core schema.
    max_train_steps: 300000
    warmup_steps: 1000

# Section `my_non_probabilistic_rnn`: model_type MyNonProbabilisticRNNModel
# with objective mse and covariance prediction disabled.
my_non_probabilistic_rnn:

  objective: mse
  model_type: MyNonProbabilisticRNNModel

  model:
    hidden_size: 128
    parametrization: mixed
    predict_cov: false  # canonical lowercase boolean

  # Alias of globals.latent_cond_len.
  latent_cond_len: *latent_cond_len

  optimizer:
    lr: 1.0e-4
    # No `_` digit separator: `300_000` is an integer only under YAML 1.1
    # resolvers and a string under the YAML 1.2 core schema.
    max_train_steps: 300000
    warmup_steps: 1000

# Section `baseline_diffusion_model_rnn`: model_type
# MyBaselineDiffusionRNNModel with objective flow_matching.
baseline_diffusion_model_rnn:

  objective: flow_matching
  model_type: MyBaselineDiffusionRNNModel

  model:
    hidden_size: 128

  # Alias of globals.latent_cond_len.
  latent_cond_len: *latent_cond_len

  optimizer:
    lr: 1.0e-4
    # No `_` digit separator: `300_000` is an integer only under YAML 1.1
    # resolvers and a string under the YAML 1.2 core schema.
    max_train_steps: 300000
    warmup_steps: 1000

# Section `my_neural_sde_rnn`: model_type MyNeuralSDERNN with objective
# drift_matching; the model is set to predict the drift.
my_neural_sde_rnn:

  objective: drift_matching
  model_type: MyNeuralSDERNN

  model:
    hidden_size: 128
    predict_flow_or_drift: drift

  # Alias of globals.latent_cond_len.
  latent_cond_len: *latent_cond_len

  optimizer:
    lr: 1.0e-4
    # No `_` digit separator: `300_000` is an integer only under YAML 1.1
    # resolvers and a string under the YAML 1.2 core schema.
    max_train_steps: 300000
    warmup_steps: 1000