# Medium Dataset - Pen Environment
# Optimized MMD parameters for stable Q-Learning training

# Top-level run settings: environment/model selection, output location,
# training schedule, and evaluation options.
env_name: pen-human-v1
model: mmd_ql
ms: offline
exp: 2  # Updated experiment version
seed: 42
device: cuda
# NOTE(review): the last path component mirrors env_name; presumably the two
# should be changed together — confirm against the launcher.
dir: ./logs/pen-human-v1
save_best_model: true
# NOTE(review): the unit of eval_freq (epochs vs. steps) is not visible in
# this file — confirm against the trainer.
eval_freq: 10
# 1000 steps/epoch x 500 epochs = 500,000 steps nominal (early_stop is false).
num_steps_per_epoch: 1000
num_epochs: 500
batch_size: 256
lr_decay: false
early_stop: false
eval_episodes: 10
top_k: 1

# Model hyperparameters.
# NOTE: this is a sequence of single-key mappings (every entry starts with
# "- "), not a plain mapping. The shape is kept exactly as it was.
model_args:
  - model: MLP
  - lr: 0.0005
  - eta: 0.0005
  - T: 0.9994
  - q_norm: false
  - max_q_backup: false
  # Quoted on purpose: a bare `no` is resolved to boolean false by YAML 1.1
  # loaders, so the string "no" would never reach the consumer.
  - reward_tune: "no"
  - gn: 8.0
  - noise_schedule: "fm"
  # NOTE(review): this was annotated "TUNED: Reduced from 0.5", but the value
  # is 0.5 — confirm the intended setting.
  - sigma_data: 0.5
  - f_type: "euler_fm"
  - eps: 0.001
  - temb_type: "identity"
  - time_scale: 1000.0
  - mmd_sigma: 1.2  # TUNED: Increased from 0.8 for smoother gradients
  - sample_t_mode: "lognormal"
  - P_mean: -0.8  # TUNED: Adjusted from -1.1 for balanced noise sampling
  # NOTE(review): this was annotated "TUNED: Adjusted from 1.5", but the value
  # is 1.5 — confirm the intended setting.
  - P_std: 1.5
  - matrix_size: 16
  - sample_repeat: 1
  - k: 8  # TUNED: Adjusted from 12 for better time intervals
  - a: 4  # TUNED: Changed from 2 (linear weighting) for balanced training
  - b: 2
  - min_tr_gap: null
  - cfg_scale: 0.0
  - cfg_dropout_prob: 0.1  # State dropout probability during training
  - use_cfg: false
  - action_select: "softmax"  # "greedy" | "softmax"
  - softmax_temperature: 0.6  # Temperature for softmax selection
  - softmax_topk: 3  # Optionally apply top-k filter before softmax

  # Optimization
  - ema_decay: 0.995
  - step_start_ema: 1000
  - update_ema_every: 5
  - grad_norm: 1.0