

# Launches one learn_reward.learn_reward_ab_initio_methods run for the
# "tomato" environment with the configuration overrides listed below.
#
# Environment variables:
#   PBRR_UNIQUE_ID  Unique ID for this experiment. When unset or empty it is
#                   set to this shell's PID ($$) and exported so that child
#                   processes inherit it.
#   SEED            Value passed to the run as seed=...; defaults to 1 when
#                   unset or empty.
#
# NOTE(review): the relative checkpoint path below presumably requires running
# from the repository root -- confirm.

# ${VAR:-} keeps the emptiness check safe if the caller runs with `set -u`.
if [ -z "${PBRR_UNIQUE_ID:-}" ]; then
	export PBRR_UNIQUE_ID="$$"
fi

SEED="${SEED:-1}"

# Overrides containing brackets or double quotes are written as ONE
# single-quoted word each, so the shell passes them through verbatim (no
# globbing, no word splitting). seed="$SEED" is double-quoted so a SEED value
# with whitespace or glob characters still arrives as a single argument.
# NOTE(review): `with key=value ...` looks like Sacred-style config
# overrides -- confirm against the learn_reward entry point.
python -m learn_reward.learn_reward_ab_initio_methods with \
	env_to_run=tomato \
	level=7 \
	move_ref_policy=False \
	has_pi_ref_constraint=True \
	reward_fun=proxy \
	exp_algo=ORPO \
	checkpoint_to_load_current_policy=None \
	'checkpoint_to_load_policies=["data/base_policy_checkpoints/tomato_base_policy/checkpoint_000300"]' \
	'om_divergence_coeffs=[0.08]' \
	experiment_tag=state-action \
	'om_divergence_type=["kl"]' \
	seed="$SEED" \
	num_rollout_workers=10 \
	num_gpus=1 \
	num_training_iters=100


