# Top-level settings for one experiment.
experiment:
    # Name of this experiment and of the environment it uses.
    experiment_id: 'RoboDohyoTest'
    environment: 'RoboschoolSumoWithRewardShaping-v0'
    number_of_runs: 2
    # Checkpoints: iteration 2, then every 10th iteration from 10 to 1000.
    checkpoint_at_iterations: [
        2, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100,
        110, 120, 130, 140, 150, 160, 170, 180, 190, 200,
        210, 220, 230, 240, 250, 260, 270, 280, 290, 300,
        310, 320, 330, 340, 350, 360, 370, 380, 390, 400,
        410, 420, 430, 440, 450, 460, 470, 480, 490, 500,
        510, 520, 530, 540, 550, 560, 570, 580, 590, 600,
        610, 620, 630, 640, 650, 660, 670, 680, 690, 700,
        710, 720, 730, 740, 750, 760, 770, 780, 790, 800,
        810, 820, 830, 840, 850, 860, 870, 880, 890, 900,
        910, 920, 930, 940, 950, 960, 970, 980, 990, 1000]
    benchmarking_episodes: 20
    self_play_training_schemes:
        - 'lastquarterhistorylimitselfplay'
        - 'naiveselfplay'
    # NOTE(review): 'ppo' is listed here, while the `agents` section of this
    # file names its PPO entry 'ppo_h2048lr4' -- confirm how the loader matches
    # algorithm names to agent entries.
    algorithms:
        - 'ppo'
    # Explicit empty list (a bare `fixed_agents:` would load as null instead).
    fixed_agents: []

agents:
    # Hyperparameters for the 'deepqlearning' agent entry.
    deepqlearning:
        learning_rate: 1.0e-3
        # Epsilon settings (start / end / decay).
        epsstart: 0.8
        epsend: 0.05
        # The exponent sign is written explicitly on purpose: YAML 1.1 loaders
        # such as PyYAML only resolve scientific notation as a float when the
        # exponent carries a sign; `1.0e3` would be loaded as the string
        # '1.0e3', whereas `1.0e+3` is a float under both YAML 1.1 and 1.2.
        epsdecay: 1.0e+3
        # Feature switches, written as canonical lowercase booleans.
        double: false
        dueling: false
        use_cuda: false
        use_PER: false
        # NOTE(review): presumably only consulted when use_PER is true -- confirm.
        PER_alpha: 0.07
        # Memory settings; explicit exponent sign for the same reason as epsdecay.
        min_memory: 5.0e+1
        memoryCapacity: 25.0e+3
        nbrTrainIteration: 32
        batch_size: 256
        gamma: 0.99
        tau: 1.0e-2
    
    # Hyperparameters for the 'ppo_h2048lr4' agent entry. The entry name
    # mirrors two of its values: horizon 2048 and a learning rate with
    # exponent -4.
    ppo_h2048lr4:
        # Data-collection settings.
        horizon: 2048
        nbr_actor: 16
        discount: 0.99
        # GAE switch and its tau value.
        use_gae: true
        use_cuda: true
        gae_tau: 0.95
        # Loss and optimisation settings.
        entropy_weight: 0.01
        gradient_clip: 5
        optimization_epochs: 15
        mini_batch_size: 1024
        ppo_ratio_clip: 0.2
        # Signed exponents, so these resolve as floats under YAML 1.1 and 1.2.
        learning_rate: 3.0e-4
        adam_eps: 1.0e-5
                                
                                
    # Hyperparameters for the 'tabularqlearning' agent entry.
    tabularqlearning:
        learning_rate: 0.5
        discount_factor: 0.99
        epsilon_greedy: 0.1
        # Canonical lowercase boolean.
        use_repeated_update_q_learning: false
        # Integer literal (loads as int, not float).
        temperature: 1


