# # label with rewards and reward model representations
# accelerate launch -m inference_rlhf.code.label_rewards policy=llama-3-3b user=anonymousanonymous task=math evaluation.batch_size=64 reward=armo-rm evaluation.model_type=reward collator.name=armo-rm plot.subsample_size=100
# accelerate launch -m inference_rlhf.code.label_rewards policy=mistral-7b user=anonymousanonymous task=math evaluation.batch_size=64 reward=armo-rm evaluation.model_type=reward collator.name=armo-rm plot.subsample_size=100
# accelerate launch -m inference_rlhf.code.label_rewards policy=phi-35-mini user=anonymousanonymous task=math evaluation.batch_size=64 reward=armo-rm evaluation.model_type=reward collator.name=armo-rm plot.subsample_size=100
# accelerate launch -m inference_rlhf.code.label_rewards policy=qwen-25-3b user=anonymousanonymous task=math evaluation.batch_size=64 reward=armo-rm evaluation.model_type=reward collator.name=armo-rm plot.subsample_size=100

# # label with policy model representations (llama, mistral, phi, qwen)
# accelerate launch --config_file default_config.yaml -m inference_rlhf.code.label_rewards policy=llama-3-3b user=anonymousanonymous task=math evaluation.batch_size=128 evaluation.model_type=policy collator.name=llama task.TASK_DESC='' evaluation.collect_gradients=false
# accelerate launch -m inference_rlhf.code.label_rewards policy=mistral-7b user=anonymousanonymous task=math evaluation.batch_size=32 evaluation.model_type=policy collator.name=mistral task.TASK_DESC='' evaluation.collect_gradients=false
# accelerate launch -m inference_rlhf.code.label_rewards policy=phi-35-mini user=anonymousanonymous task=math evaluation.batch_size=32 evaluation.model_type=policy collator.name=phi task.TASK_DESC='' evaluation.collect_gradients=false
# accelerate launch -m inference_rlhf.code.label_rewards policy=qwen-25-3b user=anonymousanonymous task=math evaluation.batch_size=128 evaluation.model_type=policy collator.name=qwen task.TASK_DESC='' evaluation.collect_gradients=false

# # label with llama gradients
# accelerate launch -m inference_rlhf.code.label_rewards \
#                 policy=llama-3-3b \
#                 user=anonymousanonymous \
#                 task=math \
#                 evaluation.batch_size=1 \
#                 reward=armo-rm \
#                 evaluation.model_type=policy \
#                 collator.name=llama \
#                 task.TASK_DESC='' \
#                 evaluation.collect_gradients=true

# # label with the last hidden state of the qwen 25 3b sft checkpoint
# accelerate launch -m inference_rlhf.code.label_rewards policy=qwen-25-3b user=anonymousanonymous task=math evaluation.batch_size=128 evaluation.model_type=policy collator.name=qwen task.TASK_DESC='' evaluation.collect_gradients=false checkpoint_dir=checkpoints/yi1zme99/checkpoint-160

# # label with qwen 25 3b gradients
# accelerate launch -m inference_rlhf.code.label_rewards \
#                 policy=qwen-25-3b \
#                 user=anonymousanonymous \
#                 task=math \
#                 evaluation.batch_size=1 \
#                 reward=armo-rm \
#                 evaluation.model_type=policy \
#                 collator.name=qwen \
#                 task.TASK_DESC='' \
#                 evaluation.collect_gradients=true \
#                 plot.subsample_size=100

# # label with gradients of the qwen 25 3b sft checkpoint
# accelerate launch -m inference_rlhf.code.label_rewards \
#                 policy=qwen-25-3b \
#                 user=anonymousanonymous \
#                 task=math \
#                 evaluation.batch_size=1 \
#                 reward=armo-rm \
#                 evaluation.model_type=policy \
#                 collator.name=qwen \
#                 task.TASK_DESC='' \
#                 evaluation.collect_gradients=true \
#                 checkpoint_dir=checkpoints/yi1zme99/checkpoint-160


# # label code contests with llama 3 8b policy representations
# accelerate launch -m inference_rlhf.code.label_rewards \
#             policy=llama-3-8b \
#             user=anonymousanonymous \
#             task=code_contests \
#             evaluation.batch_size=32 \
#             reward=armo-rm \
#             evaluation.model_type=policy \
#             collator.name=llama \
#             task.TASK_DESC='' \
#             +prompts@policy=code_contests