# Global parameters. Each value carries an anchor (`&name`) so that the
# sections further down can reuse it through the matching alias (`*name`).
globals:
  latent_cond_len: &latent_cond_len 2
  use_encoder_prior: &use_encoder_prior false

# Reusable templates, one per hyper-parameter type. Sections below pull them
# in with the merge key (`<<: *template`) and add or override individual keys.
# NOTE(review): the sections that merge these templates set `min_val` /
# `max_val`, not `min` / `max`, so the default bounds below are merged in
# unchanged next to them -- confirm which key names the consumer reads.
definitions:
  int_param: &int_param
    type: int
    min: 0  # default lower bound
    max: 100  # default upper bound
    step: 1
    log: false

  float_param: &float_param
    type: float
    min: 0.0  # default lower bound
    max: 1.0  # default upper bound
    step: null  # no step by default (continuous)
    log: false

  categorical_param: &categorical_param
    type: categorical
    choices: []  # empty by default; users supply their own list


##############################################
##############################################
##############################################
##############################################
##############################################
##############################################
##############################################
##############################################
##############################################


# Dataset, batching and evaluation settings for `noisy_double_pendulum`.
dataset:
  nice_name: Double Pendulum
  name: noisy_double_pendulum
  dataset_size: 10000
  dim: 4
  seq_length: 64
  pred_length: 32
  # 0.8 + 0.1 leaves 0.1 unassigned -- presumably the test split; confirm.
  train_proportion: 0.8
  val_proportion: 0.1
  seed: 42
  train_batch_size: 128
  # 128 * 2 = 256 effective training batch size
  gradient_accumulation_batch_size_multiplier: 2
  val_batch_size: 128

  noise_std: 0.3
  tracking_sigma: 0.1
  time_scale_mult: 8.0

  sample_rate: 8

  hyper_params:
    train_batch_size:
      optuna_hyper_param:
        # Merges type/min/max/step/log from `int_param`; the explicit
        # `step` below takes precedence over the merged one.
        <<: *int_param
        min_val: 16
        max_val: 1024
        step: 16

  metric_to_compute:
    - crps
    - nll
    - nrmse

  evaluation_settings:
    - future_latent
    - future_observation
    - future_denoised_observation

##############################################
# Small `MyAutoregressiveModel` configuration (2 layers, 8 hidden channels)
# with the `mse` objective, plus a hyper-parameter search space.
my_autoregressive_small:

  objective: mse
  model_type: MyAutoregressiveModel

  model:
    n_layers: 2
    filter_width: 4
    hidden_channel_size: 8
    num_transformer_heads: 4
    parametrization: std
    predict_cov: false

  # Shared values, defined once under `globals`.
  latent_cond_len: *latent_cond_len
  use_encoder_prior: *use_encoder_prior

  optimizer:
    name: adam
    lr: 1.0e-4
    max_train_steps: 300000  # 300k
    warmup_steps: 1000

  # Values given here use the same key names as the top-level `dataset`
  # section (presumably applied on top of it for this model -- confirm).
  override_params:
    dataset:
      train_batch_size: 16
      gradient_accumulation_batch_size_multiplier: 1

  hyper_params:
    optimizer:

      name:
        optuna_hyper_param:
          <<: *categorical_param
          choices: ['adam', 'adamw']

      lr:
        optuna_hyper_param:
          <<: *float_param
          # Bounds are written with a decimal point on purpose: YAML 1.1
          # loaders such as PyYAML resolve a bare `1e-6` as a string.
          min_val: 1.0e-6
          max_val: 1.0e-3
          log: true

    model:

      n_layers:
        optuna_hyper_param:
          <<: *int_param
          min_val: 1
          max_val: 10
          # NOTE(review): 10 - 1 is not a multiple of this step, so the
          # upper bound 10 is not reachable from 1 in steps of 2 --
          # confirm the intended range.
          step: 2
##############################################
# `MyReparameterizedAutoregressiveModel` configuration (2 layers, 128 hidden
# channels) with the `mse` objective, plus a hyper-parameter search space.
my_reparameterized_autoregressive_small:

  objective: mse
  model_type: MyReparameterizedAutoregressiveModel

  model:
    n_layers: 2
    filter_width: 4
    hidden_channel_size: 128
    num_transformer_heads: 32
    parametrization: mixed
    predict_cov: false

  # Shared values, defined once under `globals`.
  latent_cond_len: *latent_cond_len
  use_encoder_prior: *use_encoder_prior

  optimizer:
    name: adam
    lr: 1.0e-4
    max_train_steps: 300000  # 300k
    warmup_steps: 1000

  # Values given here use the same key names as the top-level `dataset`
  # section (presumably applied on top of it for this model -- confirm).
  override_params:
    dataset:
      train_batch_size: 128
      gradient_accumulation_batch_size_multiplier: 1

  hyper_params:
    optimizer:

      lr:
        optuna_hyper_param:
          <<: *float_param
          # Bounds are written with a decimal point on purpose: YAML 1.1
          # loaders such as PyYAML resolve a bare `1e-6` as a string.
          min_val: 1.0e-6
          max_val: 1.0e-2
          log: true

    model:

      n_layers:
        optuna_hyper_param:
          <<: *int_param
          min_val: 1
          max_val: 10
          step: 1

      # NOTE(review): `hidden_channel_size` and `num_transformer_heads` are
      # searched independently; if the model needs one to divide the
      # other, that constraint is not expressed here -- confirm.
      hidden_channel_size:
        optuna_hyper_param:
          <<: *int_param
          min_val: 16
          max_val: 1024
          step: 16

      num_transformer_heads:
        optuna_hyper_param:
          <<: *int_param
          min_val: 4
          max_val: 64
          step: 4

##############################################


# `MyBaselineDiffusionModel` configuration with the `flow_matching` objective.
baseline_diffusion_model:

  objective: flow_matching
  model_type: MyBaselineDiffusionModel

  model:
    n_layers: 10
    filter_width: 4
    hidden_channel_size: 128
    num_transformer_heads: 32

  # Shared values, defined once under `globals`.
  latent_cond_len: *latent_cond_len
  use_encoder_prior: *use_encoder_prior

  optimizer:
    lr: 1.0e-4
    max_train_steps: 300000  # 300k
    warmup_steps: 1000

# `MyBaselineAutoregressiveModel` configuration with the `ml` objective and
# covariance prediction switched on.
true_baseline_autoregressive:

  objective: ml
  model_type: MyBaselineAutoregressiveModel

  model:
    n_layers: 10
    filter_width: 4
    hidden_channel_size: 128
    num_transformer_heads: 32
    parametrization: std
    predict_cov: true

  # Shared values, defined once under `globals`.
  latent_cond_len: *latent_cond_len
  use_encoder_prior: *use_encoder_prior

  optimizer:
    lr: 1.0e-4
    max_train_steps: 300000  # 300k
    warmup_steps: 1000


# `MyAutoregressiveModel` configuration with the `ml` objective and
# covariance prediction switched on.
baseline_autoregressive:

  objective: ml
  model_type: MyAutoregressiveModel

  model:
    n_layers: 10
    filter_width: 4
    hidden_channel_size: 128
    num_transformer_heads: 32
    parametrization: std
    predict_cov: true

  # Shared values, defined once under `globals`.
  latent_cond_len: *latent_cond_len
  use_encoder_prior: *use_encoder_prior

  optimizer:
    lr: 1.0e-4
    max_train_steps: 300000  # 300k
    warmup_steps: 1000


# Full-size `MyAutoregressiveModel` configuration (10 layers, 128 hidden
# channels) with the `mse` objective.
my_autoregressive:

  objective: mse
  model_type: MyAutoregressiveModel

  model:
    n_layers: 10
    filter_width: 4
    hidden_channel_size: 128
    num_transformer_heads: 32
    parametrization: std
    predict_cov: false

  # Shared values, defined once under `globals`.
  latent_cond_len: *latent_cond_len
  use_encoder_prior: *use_encoder_prior

  optimizer:
    lr: 1.0e-4
    max_train_steps: 300000  # 300k
    warmup_steps: 1000

# Full-size `MyReparameterizedAutoregressiveModel` configuration with the
# `mse` objective and the `mixed` parametrization.
my_autoregressive_reparam:

  objective: mse
  model_type: MyReparameterizedAutoregressiveModel

  model:
    n_layers: 10
    filter_width: 4
    hidden_channel_size: 128
    num_transformer_heads: 32
    parametrization: mixed
    predict_cov: false

  # Shared values, defined once under `globals`.
  latent_cond_len: *latent_cond_len
  use_encoder_prior: *use_encoder_prior

  optimizer:
    lr: 1.0e-4
    max_train_steps: 300000  # 300k
    warmup_steps: 1000


# `MyNonProbabilisticModel` configuration with the `mse` objective.
my_non_probabilistic:

  objective: mse
  model_type: MyNonProbabilisticModel

  model:
    n_layers: 10
    filter_width: 4
    hidden_channel_size: 128
    num_transformer_heads: 32
    # Needed to handle 0 covariance for 0 velocity certainty
    parametrization: mixed
    predict_cov: false

  latent_cond_len: *latent_cond_len
  # Hack to make things stable: hard-coded to true here instead of using
  # the shared alias.
  use_encoder_prior: true
  # use_encoder_prior: *use_encoder_prior

  optimizer:
    lr: 1.0e-4
    max_train_steps: 300000  # 300k
    warmup_steps: 1000

# `MyNeuralSDE` configuration with the `drift_matching` objective; the model
# is set to predict the drift.
my_neural_sde:

  objective: drift_matching
  model_type: MyNeuralSDE

  model:
    n_layers: 10
    filter_width: 4
    hidden_channel_size: 128
    num_transformer_heads: 32
    predict_flow_or_drift: drift

  # Shared values, defined once under `globals`.
  latent_cond_len: *latent_cond_len
  use_encoder_prior: *use_encoder_prior

  optimizer:
    lr: 1.0e-4
    max_train_steps: 300000  # 300k
    warmup_steps: 1000

# Same `MyNeuralSDE` model type as `my_neural_sde`, but with the
# `flow_matching` objective and the model set to predict the flow.
my_neural_ode:

  objective: flow_matching
  model_type: MyNeuralSDE

  model:
    n_layers: 10
    filter_width: 4
    hidden_channel_size: 128
    num_transformer_heads: 32
    predict_flow_or_drift: flow

  # Shared values, defined once under `globals`.
  latent_cond_len: *latent_cond_len
  use_encoder_prior: *use_encoder_prior

  optimizer:
    lr: 1.0e-4
    max_train_steps: 300000  # 300k
    warmup_steps: 1000



# `MyDiffusionModel` configuration with the `flow_matching` objective.
my_diffusion_model:

  objective: flow_matching
  model_type: MyDiffusionModel

  model:
    n_layers: 10
    filter_width: 4
    hidden_channel_size: 128
    num_transformer_heads: 32

  # Shared values, defined once under `globals`.
  latent_cond_len: *latent_cond_len
  use_encoder_prior: *use_encoder_prior

  optimizer:
    lr: 1.0e-4
    max_train_steps: 300000  # 300k
    warmup_steps: 1000



# Tiny `MyNonProbabilisticModel` configuration (1 layer, 4 hidden channels)
# with the `mse` objective.
my_small_non_probabilistic:

  objective: mse
  model_type: MyNonProbabilisticModel

  model:
    n_layers: 1
    filter_width: 4
    hidden_channel_size: 4
    num_transformer_heads: 4
    parametrization: std
    predict_cov: false

  # Shared values, defined once under `globals`.
  latent_cond_len: *latent_cond_len
  use_encoder_prior: *use_encoder_prior

  optimizer:
    lr: 1.0e-4
    max_train_steps: 300000  # 300k
    warmup_steps: 1000

##############################################
##############################################
##############################################
##############################################
##############################################
##############################################
##############################################
##############################################
##############################################
##############################################

# my_autoregressive_rnn:

#   objective: mse
#   model_type: MyReparameterizedAutoregressiveRNNModel

#   model:
#     hidden_size: 128
#     parametrization: mixed
#     predict_cov: False

#   latent_cond_len: *latent_cond_len
#   use_encoder_prior: *use_encoder_prior

#   optimizer:
#     lr: 1.0e-4
#     max_train_steps: 300_000
#     warmup_steps: 1000

# `MyReparameterizedAutoregressiveRNNModel` configuration with the `mse`
# objective and the `mixed` parametrization.
my_autoregressive_reparam_rnn:

  objective: mse
  model_type: MyReparameterizedAutoregressiveRNNModel

  model:
    hidden_size: 128
    parametrization: mixed
    predict_cov: false

  # Shared values, defined once under `globals`.
  latent_cond_len: *latent_cond_len
  use_encoder_prior: *use_encoder_prior

  optimizer:
    lr: 1.0e-3
    max_train_steps: 30000  # 30k
    warmup_steps: 100

# `MyBaselineAutoregressiveRNNModel` configuration with the `ml` objective
# and covariance prediction switched on.
true_baseline_autoregressive_rnn:

  objective: ml
  model_type: MyBaselineAutoregressiveRNNModel

  model:
    hidden_size: 128
    parametrization: std
    predict_cov: true

  # Shared values, defined once under `globals`.
  latent_cond_len: *latent_cond_len
  use_encoder_prior: *use_encoder_prior

  optimizer:
    lr: 1.0e-3
    max_train_steps: 30000  # 30k
    warmup_steps: 100

# `MyBaselineAutoregressiveRNNModel` configuration with the `ml` objective
# and covariance prediction switched on.
# NOTE(review): every value here equals `true_baseline_autoregressive_rnn`,
# whereas the non-RNN pair (`baseline_autoregressive` vs
# `true_baseline_autoregressive`) differs in `model_type` -- confirm the
# duplication is intended.
baseline_autoregressive_rnn:

  objective: ml
  model_type: MyBaselineAutoregressiveRNNModel

  model:
    hidden_size: 128
    parametrization: std
    predict_cov: true

  # Shared values, defined once under `globals`.
  latent_cond_len: *latent_cond_len
  use_encoder_prior: *use_encoder_prior

  optimizer:
    lr: 1.0e-3
    max_train_steps: 30000  # 30k
    warmup_steps: 100

# `MyDiffusionRNNModel` configuration with the `flow_matching` objective.
my_diffusion_model_rnn:

  objective: flow_matching
  model_type: MyDiffusionRNNModel

  model:
    hidden_size: 128

  # Shared values, defined once under `globals`.
  latent_cond_len: *latent_cond_len
  use_encoder_prior: *use_encoder_prior

  optimizer:
    lr: 1.0e-3
    max_train_steps: 30000  # 30k
    warmup_steps: 100

# `MyNonProbabilisticRNNModel` configuration with the `mse` objective.
my_non_probabilistic_rnn:

  objective: mse
  model_type: MyNonProbabilisticRNNModel

  model:
    hidden_size: 128
    parametrization: mixed
    predict_cov: false

  latent_cond_len: *latent_cond_len
  # Hack to make things stable: hard-coded to true here instead of using
  # the shared alias.
  use_encoder_prior: true

  optimizer:
    lr: 1.0e-3
    max_train_steps: 30000  # 30k
    warmup_steps: 100

# `MyBaselineDiffusionRNNModel` configuration with the `flow_matching`
# objective.
baseline_diffusion_model_rnn:

  objective: flow_matching
  model_type: MyBaselineDiffusionRNNModel

  model:
    hidden_size: 128

  # Shared values, defined once under `globals`.
  latent_cond_len: *latent_cond_len
  use_encoder_prior: *use_encoder_prior

  optimizer:
    lr: 1.0e-3
    max_train_steps: 30000  # 30k
    warmup_steps: 100

# `MyNeuralSDERNN` configuration with the `drift_matching` objective; the
# model is set to predict the drift.
my_neural_sde_rnn:

  objective: drift_matching
  model_type: MyNeuralSDERNN

  model:
    hidden_size: 128
    predict_flow_or_drift: drift

  # Shared values, defined once under `globals`.
  latent_cond_len: *latent_cond_len
  use_encoder_prior: *use_encoder_prior

  optimizer:
    lr: 1.0e-3
    max_train_steps: 30000  # 30k
    warmup_steps: 100

# Same `MyNeuralSDERNN` model type as `my_neural_sde_rnn`, but with the
# `flow_matching` objective and the model set to predict the flow.
my_neural_ode_rnn:

  objective: flow_matching
  model_type: MyNeuralSDERNN

  model:
    hidden_size: 128
    predict_flow_or_drift: flow

  # Shared values, defined once under `globals`.
  latent_cond_len: *latent_cond_len
  use_encoder_prior: *use_encoder_prior

  optimizer:
    lr: 1.0e-3
    max_train_steps: 30000  # 30k
    warmup_steps: 100


# Variant of `my_neural_ode_rnn` that additionally sets `n_layers` and
# `intermediate_channels` on the model.
my_neural_ode_rnn2:

  objective: flow_matching
  model_type: MyNeuralSDERNN

  model:
    hidden_size: 128
    predict_flow_or_drift: flow
    n_layers: 3
    intermediate_channels: 128

  # Shared values, defined once under `globals`.
  latent_cond_len: *latent_cond_len
  use_encoder_prior: *use_encoder_prior

  optimizer:
    lr: 1.0e-3
    max_train_steps: 30000  # 30k
    warmup_steps: 100

# python main.py --config_file=Config/double_pendulum.yaml --freq=0 --sde_type=brownian --model_name=baseline_autoregressive --group=exp_april_7 --debug

# python main.py --config_file=Config/double_pendulum.yaml --freq=0 --sde_type=brownian --model_name=my_autoregressive --group=check_ar --retrain --train
# python main.py --config_file=Config/double_pendulum.yaml --freq=0 --sde_type=brownian --model_name=my_non_probabilistic --group=check_ar --retrain --train
# python main.py --config_file=Config/double_pendulum.yaml --freq=1 --sde_type=brownian --model_name=my_neural_sde --group=check_ar --retrain --train
# python main.py --config_file=Config/double_pendulum.yaml --freq=1 --sde_type=brownian --model_name=my_neural_ode --group=check_ar --retrain --train
# python main.py --config_file=Config/double_pendulum.yaml --freq=1 --sde_type=brownian --model_name=my_diffusion_model --group=check_ar --retrain --train


# python main.py --config_file=Config/double_pendulum.yaml --freq=1 --sde_type=brownian --model_name=my_small_non_probabilistic --group=asdf --retrain --train



##############################################

# `MyReparameterizedAutoregressiveRNNBwdModel` configuration with the `mse`
# objective and the `mixed` parametrization.
my_autoregressive_reparam_rnn_bwd:

  objective: mse
  model_type: MyReparameterizedAutoregressiveRNNBwdModel

  model:
    hidden_size: 128
    parametrization: mixed
    predict_cov: false

  # Shared values, defined once under `globals`.
  latent_cond_len: *latent_cond_len
  use_encoder_prior: *use_encoder_prior

  optimizer:
    lr: 1.0e-3
    max_train_steps: 30000  # 30k
    warmup_steps: 100

# `MyNeuralSDERNNBwd` configuration with the `mse` objective.
# NOTE(review): unlike `my_neural_sde_rnn` (objective `drift_matching`,
# model key `predict_flow_or_drift`), this section uses `mse` together with
# `parametrization` / `predict_cov`, exactly as
# `my_autoregressive_reparam_rnn_bwd` does -- confirm this is intended and
# not a copy-paste leftover.
my_neural_sde_rnn_bwd:

  objective: mse
  model_type: MyNeuralSDERNNBwd

  model:
    hidden_size: 128
    parametrization: mixed
    predict_cov: false

  # Shared values, defined once under `globals`.
  latent_cond_len: *latent_cond_len
  use_encoder_prior: *use_encoder_prior

  optimizer:
    lr: 1.0e-3
    max_train_steps: 30000  # 30k
    warmup_steps: 100
