# Run train.py with the _4_countdown config. The overrides set
# system.CUDA_VISIBLE_DEVICES to 0 and name the experiment countdown-base.
python train.py \
    --config-name _4_countdown \
    system.CUDA_VISIBLE_DEVICES=0 \
    trainer.experiment_name=countdown-base
# Run train.py with the _5_metamathqa config. The overrides set
# system.CUDA_VISIBLE_DEVICES to 0 and name the experiment metamathqa-base.
python train.py \
    --config-name _5_metamathqa \
    system.CUDA_VISIBLE_DEVICES=0 \
    trainer.experiment_name=metamathqa-base


# model_names:
#   Qwen/Qwen2.5-0.5B
#   Qwen/Qwen2.5-0.5B-Instruct
#   Qwen/Qwen2.5-1.5B
#   deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
#   Qwen/Qwen2.5-1.5B-Instruct
#   Qwen/Qwen2.5-3B
#   Qwen/Qwen2.5-3B-Instruct
#   Qwen/Qwen2.5-7B
#   deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
#   Qwen/Qwen2.5-7B-Instruct
#   Qwen/Qwen2.5-14B
#   deepseek-ai/DeepSeek-R1-Distill-Qwen-14B
#   Qwen/Qwen2.5-14B-Instruct





# Run the ragen.llm_agent.agent_proxy module with the _2_sokoban config.
#
# The device override is single-quoted so the program receives the double
# quotes literally: system.CUDA_VISIBLE_DEVICES="1,2". Presumably this makes
# the config parser read the comma-separated IDs as one string -- TODO confirm.
#
# The command deliberately does not end in a line continuation: a dangling
# backslash would absorb the next non-blank line as extra arguments if the
# blank lines that follow were ever removed.
python -m ragen.llm_agent.agent_proxy \
    --config-name _2_sokoban \
    'system.CUDA_VISIBLE_DEVICES="1,2"'




# Run train.py with the _1_bandit config as experiment bandit-base, with
# system.CUDA_VISIBLE_DEVICES set to 1.
# The remaining overrides set the lo/hi arm names for the Bandit and
# BanditTest entries under custom_envs; the two entries get different names.
python train.py \
    --config-name _1_bandit \
    'system.CUDA_VISIBLE_DEVICES=1' \
    'trainer.experiment_name=bandit-base' \
    'custom_envs.Bandit.env_config.lo_arm_name=Phoenix' \
    'custom_envs.Bandit.env_config.hi_arm_name=Dragon' \
    'custom_envs.BanditTest.env_config.lo_arm_name=Trader' \
    'custom_envs.BanditTest.env_config.hi_arm_name=Librarian'


# Run the ragen.llm_agent.agent_proxy module with the
# _1_bandit_generalization config. The overrides set
# system.CUDA_VISIBLE_DEVICES to 1 and the experiment name to bandit-base.
python -m ragen.llm_agent.agent_proxy \
    --config-name _1_bandit_generalization \
    system.CUDA_VISIBLE_DEVICES=1 \
    trainer.experiment_name=bandit-base




# Run the ragen.llm_agent.agent_proxy module with the _2_sokoban_base config,
# using the model named in MODEL_NAME as model_path.
MODEL_NAME='Qwen/Qwen2.5-32B-Instruct'

# The device override is single-quoted so the program receives the double
# quotes literally: system.CUDA_VISIBLE_DEVICES="0,1,2,3". Presumably this
# makes the config parser read the ID list as one string -- TODO confirm.
# trainer.n_gpus_per_node=4 matches the four device IDs listed.
# "$MODEL_NAME" is quoted so the value always reaches the program as a single
# argument, even if it is later changed to something containing spaces or
# glob characters.
python -m ragen.llm_agent.agent_proxy \
    --config-name _2_sokoban_base \
    'system.CUDA_VISIBLE_DEVICES="0,1,2,3"' \
    trainer.n_gpus_per_node=4 \
    trainer.experiment_name=sokoban-test_32b \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
    model_path="$MODEL_NAME"