# Experiment-level settings: identifier, environment name, run count,
# checkpoint iterations, benchmarking episodes, and the lists of training
# schemes, algorithms and fixed agents named for this experiment.
experiment:
    experiment_id: 'SeedsTest'
    environment: 'RockPaperScissors-v0'
    number_of_runs: 5
    checkpoint_at_iterations: [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    benchmarking_episodes: 10
    # The names in the three lists below are plain strings; presumably they
    # are looked up by the consuming code -- confirm before renaming any.
    self_play_training_schemes:
        - 'halfhistorylimitselfplay'
        - 'lastquarterhistorylimitselfplay'
    algorithms:
        - 'ppo'
    fixed_agents:
        - 'rockagent'

agents:
    # Settings for the 'deepqlearning' agent.
    # Floats in exponent notation carry an explicit exponent sign (e+3, not
    # e3): YAML 1.1 resolvers such as PyYAML only recognise a float when the
    # exponent is signed, and otherwise load the scalar as a string.
    deepqlearning:
        learning_rate: 1.0e-3
        epsstart: 0.8
        epsend: 0.05
        epsdecay: 1.0e+3
        double: false
        dueling: false
        use_cuda: false
        use_PER: false
        PER_alpha: 0.07
        min_memory: 5.0e+1
        memoryCapacity: 25.0e+3
        nbrTrainIteration: 32
        batch_size: 256
        gamma: 0.99
        tau: 1.0e-2

    # Settings for the 'ppo' agent.
    # Booleans use the canonical lowercase spelling; they load as the same
    # boolean values as the capitalised forms.
    ppo:
        horizon: 32
        nbr_actor: 1
        discount: 0.99
        use_gae: true
        use_cuda: true
        gae_tau: 0.95
        entropy_weight: 0.01
        gradient_clip: 5
        optimization_epochs: 10
        mini_batch_size: 32
        ppo_ratio_clip: 0.2
        learning_rate: 3.0e-4
        adam_eps: 1.0e-5
        phi_arch: 'MLP'
        # 'None' is the quoted literal string, not YAML null; it is kept
        # as-is because the consumer presumably compares against that string.
        actor_arch: 'None'
        critic_arch: 'None'

    # Settings for the 'tabularqlearning' agent, with
    # use_repeated_update_q_learning switched off.
    tabularqlearning:
        learning_rate: 0.5
        discount_factor: 0.99
        epsilon_greedy: 0.1
        use_repeated_update_q_learning: false
        temperature: 1

    # Settings for the 'tabularqlearning_ruql' agent, with
    # use_repeated_update_q_learning switched on.
    tabularqlearning_ruql:
        learning_rate: 0.5
        discount_factor: 0.99
        epsilon_greedy: 0.1
        use_repeated_update_q_learning: true
        temperature: 1
