#!/usr/bin/env bash
#
# Launches a GRPO training run of Qwen2.5-VL-7B-Instruct through
# `verl.trainer.main` (using the tvg data config), then runs
# scripts/model_merger.py on one saved actor checkpoint.
#
# Environment:
#   WANDB_API_KEY  optional; Weights & Biases API key. If unset, wandb is
#                  expected to use credentials from a prior `wandb login`.
#
# SECURITY NOTE: a W&B API key used to be hard-coded in this script. Secrets
# must not live in version control; any key that was committed here should be
# revoked and re-issued.

# Print a message to stderr and abort the script.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# Strict mode is enabled only after conda activation; conda's shell hooks are
# not guaranteed to be clean under `set -u`, so they are checked explicitly.
source ~/miniconda3/etc/profile.d/conda.sh || die "cannot source conda.sh"
conda activate tgrpo || die "cannot activate conda env 'tgrpo'"
# pip install flash-attn --no-build-isolation
# pip install trl

# Fail fast: without this, a failed training run would still fall through to
# the merge step below.
set -euo pipefail
set -x

export VLLM_ATTENTION_BACKEND=XFORMERS
export VLLM_USE_V1=0

MODEL_PATH=Qwen/Qwen2.5-VL-7B-Instruct  # replace it with your local file path
EXPERIMENT_NAME=qwen2_5_vl_7b_tvg_charades_source_all
CHECKPOINT_DIR="checkpoints/v3/easy_r1/${EXPERIMENT_NAME}/global_step_1500/actor"

# The key is taken from the caller's environment instead of being assigned
# here, so it is neither stored in the repository nor echoed by `set -x`.
if [[ -n "${WANDB_API_KEY:-}" ]]; then
  export WANDB_API_KEY
else
  printf 'warning: WANDB_API_KEY is not set; relying on existing wandb login credentials\n' >&2
fi

# Single-quoted so the prompt is passed through literally. The embedded newline
# and the leading space on the second line are part of the prompt text.
SYSTEM_PROMPT='You FIRST think about the reasoning process as an internal monologue and then provide the final answer.
 The reasoning process MUST BE enclosed within <think> </think> tags, including analysis with either specific timestamps (xx.xx) or time ranges (xx.xx to xx.xx). The final answer MUST BE put in <answer> </answer> tags, providing the start and end times (in seconds, precise to two decimal places). Output the final answer in JSON format.'

# Overrides are collected in an array so every value reaches the trainer as
# exactly one argument, regardless of spaces or glob characters.
train_args=(
  config=examples/grpo_example.yaml
  data.train_files=examples/data_config/tvg.yaml
  data.val_files=examples/data_config/tvg.yaml
  data.max_prompt_length=4096
  data.max_response_length=2048
  data.rollout_batch_size=16
  worker.actor.global_batch_size=16
  worker.actor.entropy_coeff=1e-3
  worker.actor.kl_loss_coef=1e-2
  worker.actor.micro_batch_size_per_device_for_update=4
  worker.actor.micro_batch_size_per_device_for_experience=8
  "worker.actor.model.model_path=${MODEL_PATH}"
  worker.rollout.n=8
  worker.rollout.tensor_parallel_size=1
  worker.rollout.enable_chunked_prefill=false
  "trainer.experiment_name=${EXPERIMENT_NAME}"
  trainer.n_gpus_per_node=8
  trainer.val_generations_to_log=10
  trainer.save_freq=500
  trainer.val_before_train=false
  # Quoted: the unquoted [...] form is a glob pattern and could be expanded
  # to a matching filename in the working directory.
  'trainer.logger=["console","wandb"]'
  data.min_pixels=3136
  data.max_pixels=1605632
  "data.system_prompt=${SYSTEM_PROMPT}"
)

python3 -m verl.trainer.main "${train_args[@]}"

# Training may stop before the step referenced by CHECKPOINT_DIR was saved;
# report that clearly instead of letting the merger fail on a missing path.
[[ -d "${CHECKPOINT_DIR}" ]] || die "checkpoint directory not found: ${CHECKPOINT_DIR}"

python3 scripts/model_merger.py --local_dir "${CHECKPOINT_DIR}"

