# Hyperparameters for the pen-expert configuration ("iql" on the "adroit" domain).
# Not set in this dict (presumably supplied by the caller -- TODO confirm):
#   "task" (was "pen-expert-v1"), "seed" (was 0), "device" (was "cuda").
pen_expert_args = {
    "algo_name": "iql",
    "domain": "adroit",
    "update_reward": True,
    "select_data": True,

    # ---- transition parameters ----
    "load_transition_path": None,
    "transition_lr": 1e-3,
    "max_epochs_since_update": 5,
    "transition_max_epochs": None,
    "transition_hidden_dims": [200, 200, 200, 200],
    # NOTE(review): five decay values for four hidden dims -- presumably one
    # per layer including an output layer; confirm against the consumer.
    "transition_weight_decay": [2.5e-5, 5e-5, 7.5e-5, 7.5e-5, 1e-4],
    "n_transition_ensemble": 7,
    "n_elites": 5,
    "rollout_length": 20,
    "rollout_batch_size": 10_000,
    "rollout_freq": 20_000,
    "model_retain_epochs": 5,

    # ---- reward parameters ----
    "load_reward_path": None,
    "n_reward_ensemble": 3,
    "reward_lr": 3e-4,
    "reward_pretrain_batch_size": 64,
    "reward_pretrain_epoch": 100,
    "reward_train_epoch": 100,
    "reward_update_freq": 100_000,
    "max_reward_steps": 100_000,
    "reward_train_batch_size": 256,

    # ---- preference data parameters ----
    "collect_preference_data": True,
    "preference_data_path": "../dataset/preference_dataset",
    # Number of queries in the (public) offline preference dataset.
    "num_query": 2_000,
    # Trajectory length of each offline preference query.
    "len_query": 50,
    # Amount of preference data used for training.
    "reward_data_size": 100,
    # Confidence threshold.
    "mean_probs": 0.95,
    # Uncertainty threshold.
    "std_probs": 0.08,
    # Ratio of generated preference data.
    "fake_ratio": 0.95,

    # ---- policy parameters ----
    "hidden_dims": [256, 256],
    "actor_lr": 3e-4,
    "critic_q_lr": 3e-4,
    "critic_v_lr": 3e-4,
    "dropout_rate": 0.1,
    "gamma": 0.99,
    "tau": 5e-3,
    "expectile": 0.8,
    "temperature": 3.0,
    "lr_scheduler": True,
    "max_episode_length": 1_000,
    "epoch": 500,
    "step_per_epoch": 1_000,
    "eval_episodes": 10,
    "policy_batch_size": 256,
}
