#!/bin/bash
# Launcher for a training run of the meta_researcher agent.
# code_root is relative, so start this script from a directory in which
# ../MetaResearcher exists.
code_root=../MetaResearcher
# Stop right away when the project directory is missing; otherwise every
# relative path used later (model, data, logs) would be resolved against the
# wrong working directory.
cd "$code_root" || {
  echo "error: cannot cd to ${code_root}" >&2
  exit 1
}
# Re-anchor code_root to the directory we are now in. The original relative
# value would otherwise be re-resolved against the new working directory and
# only point back here by coincidence.
code_root=$(pwd)

# Environment switches for the training process (names suggest Hydra, CUDA
# and vLLM settings).
export HYDRA_FULL_ERROR=1
export CUDA_LAUNCH_BLOCKING=1
export VLLM_ATTENTION_BACKEND=XFORMERS

# Model location (relative to the working directory set by the cd above) and
# run identifiers.
export BASE_MODEL='../QwQ-32B'
export PROJECT_NAME='MetaResearcher'
# NOTE(review): "metaesearcher" looks like it is missing an "r"; left as-is
# because this value names the run and its log file — confirm before renaming.
export EXPERIMENT_NAME='grpo-metaesearcher-qwq-32b-gpqa-tool'

# ------------ Logging configuration ------------
# Directory that receives the run log and the wandb output.
log_path=$code_root/logs/
mkdir -p -- "$log_path"

# ------------ Wandb configuration ------------
export WANDB_MODE="offline"
# Keep a key that the caller already exported; fall back to the placeholder
# only when none is set, instead of overwriting a real key with it.
export WANDB_API_KEY="${WANDB_API_KEY:-XXXXXXXXXXXXXXXXXXXXX}"

export WANDB_DIR="${log_path}/wandb"
mkdir -p -- "${WANDB_DIR}"

# ------------ TensorBoard configuration ------------
# One TensorBoard directory per experiment name.
export TENSORBOARD_DIR="${code_root}/logs/tensorboard/${EXPERIMENT_NAME}"
mkdir -p -- "${TENSORBOARD_DIR}"

# ------------ Search engine configuration ------------
export SEARCH_ENGINES="GoogleBingOptionalSearch"
export SEARCH_BASE_URL="http://localhost:7575"
export OPTIONAL_SEARCH_ENGINE="GoogleSerperSearch"

echo "Running training…"

# Without pipefail the pipeline at the bottom reports tee's exit status, so a
# crashed trainer would still make this script exit 0.
set -o pipefail

# key=value overrides passed to the trainer entry point (presumably Hydra
# overrides, given HYDRA_FULL_ERROR is exported by this script). They are
# kept in an array so every override stays a single word and can be commented.
train_args=(
  algorithm.adv_estimator=grpo

  # --- data ---
  'data.train_files=["../data/train_data/Toolcall_Research_train_tool.jsonl"]'
  'data.val_files=["../data/test_data/Toolcall_dev_GPQA_tool.jsonl"]'
  data.train_batch_size=64
  data.max_prompt_length=32768
  data.max_response_length=32768
  data.max_response_length_single_turn=4096
  data.use_custom_system_prompt=False
  # NOTE(review): "avaiable" looks like a misspelling of "available"; left
  # unchanged because the accepted values are defined by the project's
  # config code, which is not visible here — confirm.
  data.truncation="avaiable"

  # --- actor / model ---
  actor_rollout_ref.model.path="${BASE_MODEL}"
  actor_rollout_ref.actor.optim.lr=2e-7
  actor_rollout_ref.model.use_remove_padding=True
  actor_rollout_ref.actor.ppo_mini_batch_size=32
  actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1
  actor_rollout_ref.actor.use_kl_loss=True
  actor_rollout_ref.actor.kl_loss_coef=0.001
  actor_rollout_ref.actor.kl_loss_type=low_var_kl
  actor_rollout_ref.model.enable_gradient_checkpointing=True
  actor_rollout_ref.actor.fsdp_config.param_offload=False
  actor_rollout_ref.actor.fsdp_config.optimizer_offload=False

  # --- rollout ---
  actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1
  actor_rollout_ref.rollout.tensor_model_parallel_size=4
  actor_rollout_ref.rollout.name=vllm
  'actor_rollout_ref.rollout.stop_token_ids=[151658]'
  'actor_rollout_ref.rollout.stop=[]'
  actor_rollout_ref.rollout.gpu_memory_utilization=0.6
  actor_rollout_ref.rollout.n_repeat=4
  # NOTE(review): key spelled "belta" as in the original; presumably matches
  # the project's config schema — confirm before renaming.
  actor_rollout_ref.rollout.think_len_belta=1e-4
  actor_rollout_ref.rollout.think_len_threshold=2048

  # --- reference model ---
  actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1
  actor_rollout_ref.ref.fsdp_config.param_offload=True
  algorithm.kl_ctrl.kl_coef=0.001

  # --- trainer ---
  'trainer.logger=["console","wandb","tensorboard"]'
  # WANDB_PROJECT is never defined in this script, so the project name used
  # to expand to an empty string. Honour it when the caller exports it,
  # otherwise fall back to PROJECT_NAME, which this script does define.
  trainer.project_name="${WANDB_PROJECT:-${PROJECT_NAME}}"
  trainer.experiment_name="${EXPERIMENT_NAME}"
  trainer.n_gpus_per_node=8
  trainer.nnodes=4
  trainer.save_freq=1
  trainer.test_freq=3
  trainer.total_epochs=10
  trainer.val_before_train=True
  trainer.log_val_generations=0

  # --- tools ---
  tool.max_turns=40
  tool.max_step_num=3
  'tool.tools=["websearch", "Plan", "Reflect"]'
  tool.max_tool_response_length=1024
)

# Merge stderr into stdout so the log file captures both, while still
# echoing everything to the terminal.
python3 -m meta_researcher.src.main_agent "${train_args[@]}" 2>&1 \
  | tee "${log_path}/${EXPERIMENT_NAME}.log"
