# --- PbRL options ---
# Preference-source, path and reward-model update settings.
# NOTE(review): meanings noted below are inferred from the key names only -
# confirm against the code that reads this file.
use_human_pref : False
use_llm_pref : True
sparse : False
use_scripted_teacher : False
# The *_path values below are bare relative names; what they are resolved
# against is not visible in this file - TODO confirm.
reward_model_path : "save_reward_model"
preference_save_path : "get_llm_pref"
replay_buffer_save_path : "replay_buffer"
preference_example_path : "prompt_example"
processing_path : "processing"
reward_model_info : "LLM_1"
update_reward_model_interval : 1500 # reward model update interval (unit not stated here - TODO confirm)
LLM_model : "gpt-35-turbo" # presumably the model/deployment name requested from the LLM API - confirm
compare_agents : True
compare_team : True
# NOTE(review): "maxium" looks like a misspelling of "maximum". The key is left
# unchanged because renaming it would require updating every reader of it.
maxium_update_num : 20000000
# Empty by default. NOTE(review): avoid committing a real API key to version control.
gpt_api_key : ''
replay_small : False

# Preference-gathering settings.
n_pref : 75 # number of preferences gathered per update
# NOTE(review): same value as n_pref; how the two keys relate is not documented here - confirm.
step_n_pref : 75
max_replay_num : 200
test_only : False
stop_epoch : 5 # maximum number of reward-function updates / preference-gathering rounds
n_repeat : 1 # number of times preference gathering is repeated
ind_q_learning : False
use_intrinsic_reward_as_contribution : False
scalability_test : False
compare_agents_num : 3

# Reward-model training and reward-mixing settings.
# NOTE(review): most keys here are undocumented; the hedged notes below are
# inferred from key names - confirm against the consuming code.
use_kendall : True
use_ori_reward : False
use_extrinsic_reward : False
ori_percent : 0.5
reward_model_lr : 0.001 # presumably the reward-model learning rate - confirm
reward_model_epoch : 50 # presumably the number of training epochs per reward-model update - confirm
use_team_reward : False
epsilon_reset : False
gather_threshold : 1.0 # when set to a value other than 1.0, e.g. 0.0: save data if kendall < 0.0
n_reward_functions : 3
update_threshold : 0.3
use_std_to_reward : False
use_kendalltau_as_reward : False
save_test_data : False
step_weight : 0.5
sc2_random : False

# --- Defaults ---

# --- GRF ---
# NOTE(review): presumably the number of controlled agents for the GRF environment - confirm.
num_agents : 4

# --- pymarl options ---
runner: "episode" # Runs 1 env for an episode
mac: "basic_mac" # Basic controller
env: "sc2" # Environment name
env_args: {} # Arguments for the environment (empty mapping by default)
batch_size_run: 1 # Number of environments to run in parallel
test_nepisode: 20 # Number of episodes to test for
test_interval: 2000 # Test after {} timesteps have passed
test_greedy: True # Use greedy evaluation (if False, will set epsilon floor to 0)
log_interval: 2000 # Log summary of stats after every {} timesteps
runner_log_interval: 2000 # Log runner stats (not test stats) every {} timesteps
learner_log_interval: 2000 # Log training stats every {} timesteps
t_max: 10000 # Stop running after this many timesteps
use_cuda: True # Use GPU by default unless it isn't available
buffer_cpu_only: True # If true, we won't keep all of the replay buffer in VRAM

# --- Logging options ---
use_wandb : True # NOTE(review): presumably enables Weights & Biases logging - confirm
project_name : 'default_project_name' # placeholder value; presumably the wandb project name - confirm
wandb_group_info : 'default_group_info' # placeholder value; presumably the wandb run group - confirm
use_tensorboard: True # Log results to tensorboard
save_model: False # Save the models to disk
save_model_interval: 2000000 # Save models after this many timesteps
checkpoint_path: "" # Load a checkpoint from this path (empty string by default)
evaluate: False # Evaluate model for test_nepisode episodes and quit (no training)
load_step: 0 # Load model trained on this many timesteps (0 if choose max possible)
save_replay: False # Save the replay of the model loaded from checkpoint_path
local_results_path: "results" # Path for local results

# --- RL hyperparameters ---
gamma: 0.99 # NOTE(review): presumably the discount factor - confirm
batch_size: 32 # Number of episodes to train on
buffer_size: 32 # Size of the replay buffer (same value as batch_size)
lr: 0.0005 # Learning rate for agents
critic_lr: 0.0005 # Learning rate for critics
optim_alpha: 0.99 # RMSProp alpha
optim_eps: 0.00001 # RMSProp epsilon
grad_norm_clip: 10 # Reduce magnitude of gradients above this L2 norm

# --- Agent parameters ---
agent: "rnn" # Default RNN agent
rnn_hidden_dim: 64 # Size of the hidden state for the default RNN agent
obs_agent_id: True # Include the agent's one-hot id in the observation
obs_last_action: True # Include the agent's last action (one-hot) in the observation

# --- Experiment running params ---
repeat_id: 1 # NOTE(review): presumably an index distinguishing repeated runs - confirm
label: "default_label" # placeholder value; presumably a free-form experiment label - confirm

run: "default" # NOTE(review): presumably selects which run routine is used - confirm
