This file lists the hyperparameter settings used for each dataset to reproduce the results reported in the paper:

halfcheetah-random:

	NUM_EPOCHS = 50000
	REWARD_PENALTY_COEFF = 0.5
	ROLLOUT_HORIZON = 4
	ROLLOUT_BATCH_SIZE = 50000
	ROLLOUT_FREQ = 50
	UPDATES_PER_STEP = 50
	LOG_STD_BOUNDS = (-5, 0.5)
	alpha = 0.01
	
halfcheetah-medium:

	NUM_EPOCHS = 50000
	REWARD_PENALTY_COEFF = 1.5
	ROLLOUT_HORIZON = 1
	ROLLOUT_BATCH_SIZE = 50000
	ROLLOUT_FREQ = 50
	UPDATES_PER_STEP = 30
	LOG_STD_BOUNDS = (-5, 0.25)
	alpha = 0.01
	
halfcheetah-medium-replay:

	NUM_EPOCHS = 50000
	REWARD_PENALTY_COEFF = 1
	ROLLOUT_HORIZON = 3
	ROLLOUT_BATCH_SIZE = 50000
	ROLLOUT_FREQ = 50
	UPDATES_PER_STEP = 30
	LOG_STD_BOUNDS = (-5, 1)
	alpha = 0.01
	
halfcheetah-medium-expert:

	NUM_EPOCHS = 9000
	REWARD_PENALTY_COEFF = 10
	ROLLOUT_HORIZON = 4
	ROLLOUT_BATCH_SIZE = 20000
	ROLLOUT_FREQ = 1
	UPDATES_PER_STEP = 250
	LOG_STD_BOUNDS = (-5, 0.25)
	alpha = 0.1
	
hopper-random:

	NUM_EPOCHS = 50000
	REWARD_PENALTY_COEFF = 1.5
	ROLLOUT_HORIZON = 4
	ROLLOUT_BATCH_SIZE = 100000
	ROLLOUT_FREQ = 50
	UPDATES_PER_STEP = 30
	LOG_STD_BOUNDS = (-5, 0.25)
	alpha = 0.01
			

hopper-medium:

	NUM_EPOCHS = 20000
	REWARD_PENALTY_COEFF = 5
	ROLLOUT_HORIZON = 1
	ROLLOUT_BATCH_SIZE = 20000
	ROLLOUT_FREQ = 100
	UPDATES_PER_STEP = 30
	LOG_STD_BOUNDS = (-5, 1)
	alpha = 0.1
		
hopper-medium-replay:

	NUM_EPOCHS = 50000
	REWARD_PENALTY_COEFF = 1
	ROLLOUT_HORIZON = 1
	ROLLOUT_BATCH_SIZE = 100000
	ROLLOUT_FREQ = 50
	UPDATES_PER_STEP = 30
	LOG_STD_BOUNDS = (-5, 0.25)
	alpha = 0.01
	
	
hopper-medium-expert:

	NUM_EPOCHS = 50000
	REWARD_PENALTY_COEFF = 5
	ROLLOUT_HORIZON = 1
	ROLLOUT_BATCH_SIZE = 100000
	ROLLOUT_FREQ = 50
	UPDATES_PER_STEP = 30
	LOG_STD_BOUNDS = (-5, 1)
	alpha = 0.1
	
walker2d-random:

	NUM_EPOCHS = 2000
	REWARD_PENALTY_COEFF = 1
	ROLLOUT_HORIZON = 2
	ROLLOUT_BATCH_SIZE = 100000
	ROLLOUT_FREQ = 1
	UPDATES_PER_STEP = 1000
	LOG_STD_BOUNDS = (-5, 2)
	alpha = 0.01
	
walker2d-medium:

	NUM_EPOCHS = 2000
	REWARD_PENALTY_COEFF = 1
	ROLLOUT_HORIZON = 4
	ROLLOUT_BATCH_SIZE = 100000
	ROLLOUT_FREQ = 1
	UPDATES_PER_STEP = 1000
	LOG_STD_BOUNDS = (-5, 1)
	alpha = 1
	
walker2d-medium-replay:

	NUM_EPOCHS = 2000
	REWARD_PENALTY_COEFF = 1
	ROLLOUT_HORIZON = 4
	ROLLOUT_BATCH_SIZE = 100000
	ROLLOUT_FREQ = 1
	UPDATES_PER_STEP = 1000
	LOG_STD_BOUNDS = (-5, 1)
	alpha = 0.01
	
walker2d-medium-expert:

	NUM_EPOCHS = 2000
	REWARD_PENALTY_COEFF = 1
	ROLLOUT_HORIZON = 2
	ROLLOUT_BATCH_SIZE = 100000
	ROLLOUT_FREQ = 1
	UPDATES_PER_STEP = 1000
	LOG_STD_BOUNDS = (-5, 2)
	alpha = 1
