# Top-level experiment settings: identifier, environment name, number of runs,
# checkpoint iterations, benchmarking episode count, and the lists of
# self-play training schemes, algorithms and fixed agents.
experiment:
    experiment_id: 'Delta-vs-DeltaLimit-UniformTest-CH1e4'
    environment: 'RockPaperScissors-v0'
    number_of_runs: 5
    # Ten checkpoints: every 1000 iterations from 1000 up to 10000.
    checkpoint_at_iterations:
        - 1000
        - 2000
        - 3000
        - 4000
        - 5000
        - 6000
        - 7000
        - 8000
        - 9000
        - 10000
    benchmarking_episodes: 10
    self_play_training_schemes:
        - 'fullhistoryselfplay'
        - 'fullhistorylimitselfplay'
    # NOTE(review): the `agents` section defines `ppo_h200` and `ppo_h40` but no
    # key named exactly `ppo` -- presumably the consumer matches agent names by
    # prefix; confirm against the loader.
    algorithms:
        - 'ppo'
    fixed_agents:
        - 'rockagent'
        - 'paperagent'
        - 'scissorsagent'
        - 'randomagent'

# Per-agent settings, keyed by agent configuration name.
agents:
    # Deep Q-learning configuration.
    deepqlearning:
        learning_rate: 1.0e-3
        # Epsilon start / end / decay values.
        # Positive exponents are written with an explicit '+': YAML 1.1 loaders
        # (e.g. PyYAML) only resolve a float when the exponent carries a sign,
        # so a bare `1.0e3` would be loaded as the string '1.0e3'.
        epsstart: 0.8
        epsend: 0.05
        epsdecay: 1.0e+3
        # Feature switches; all disabled in this configuration.
        double: false
        dueling: false
        use_cuda: false
        use_PER: false
        # NOTE(review): presumably only consulted when use_PER is true -- confirm.
        PER_alpha: 0.07
        # Same explicit-sign exponent form as `epsdecay` (50 and 25000).
        min_memory: 5.0e+1
        memoryCapacity: 25.0e+3
        nbrTrainIteration: 32
        batch_size: 256
        gamma: 0.99
        tau: 1.0e-2

    # PPO configuration with `horizon: 200`; every other value is the same as
    # in `ppo_h40`.
    ppo_h200:
        use_cuda: true
        nbr_actor: 1
        horizon: 200

        # Discount and GAE-related keys.
        discount: 0.99
        use_gae: true
        gae_tau: 0.95

        # Clipping and entropy keys.
        ppo_ratio_clip: 0.2
        gradient_clip: 5
        entropy_weight: 0.01

        # Optimisation keys.
        optimization_epochs: 10
        mini_batch_size: 8
        learning_rate: 3.0e-4
        adam_eps: 1.0e-5

    # PPO configuration with `horizon: 40`; every other value is the same as
    # in `ppo_h200`.
    ppo_h40:
        use_cuda: true
        nbr_actor: 1
        horizon: 40

        # Discount and GAE-related keys.
        discount: 0.99
        use_gae: true
        gae_tau: 0.95

        # Clipping and entropy keys.
        ppo_ratio_clip: 0.2
        gradient_clip: 5
        entropy_weight: 0.01

        # Optimisation keys.
        optimization_epochs: 10
        mini_batch_size: 8
        learning_rate: 3.0e-4
        adam_eps: 1.0e-5

    # Tabular Q-learning configuration with `use_repeated_update_q_learning`
    # switched off; otherwise identical to `tabularqlearning_ruql`.
    tabularqlearning:
        use_repeated_update_q_learning: false
        learning_rate: 0.5
        discount_factor: 0.99
        epsilon_greedy: 0.1
        temperature: 1

    # Tabular Q-learning configuration with `use_repeated_update_q_learning`
    # switched on; otherwise identical to `tabularqlearning`.
    tabularqlearning_ruql:
        use_repeated_update_q_learning: true
        learning_rate: 0.5
        discount_factor: 0.99
        epsilon_greedy: 0.1
        temperature: 1
