# Additional plot-specific configuration
plot:
  # Root directories to search. NOTE(review): presumably checkpoint roots,
  # judging by the path name - confirm against the consuming script.
  base_paths:
    - '/oss/public/user/chefengdi/ckpts'
  # Experiment names to include in the plot.
  exp_names:
    - 'grpo_8nodes_5ksft_3B_spec_refine_one_score-ensures_fixed'
  # Training steps and shot counts to cover; short leaf lists stay in flow style.
  global_steps: [20, 40, 60, 80]
  shot_numbers: [8]

# Configuration from verl/inference/config/generation.yaml
trainer:
  # Hardware layout: 1 node x 8 GPUs.
  nnodes: 1
  n_gpus_per_node: 8
  # Run labels.
  # NOTE(review): experiment_name is "gsm8k" while data.path points at a
  # math test set - confirm the label is intentional.
  project_name: verl_examples
  experiment_name: gsm8k
  # Output handling.
  save_output: false
  output_dir: /nas/shared/sys2/chefengdi/eval_log/

data:
  # Input dataset and the column holding the prompt.
  path: '~/data/rlhf/math/test.parquet'
  prompt_key: prompt
  n_samples: 5
  # NOTE(review): this path lives outside trainer.output_dir - confirm which
  # of the two the consumer actually writes to.
  output_path: /opt/tiger/math_Qwen2-7B-Instruct.parquet
  dataloader_num_workers: 0
  batch_size: 32
  # Set to true when the policy and reward-model (rm) tokenizers differ.
  return_raw_input_ids: false
  return_raw_chat: false
  shuffle: true
  filter_overlong_prompts: false
  filter_overlong_prompts_workers: 1
  # NOTE(review): presumably fails on over-long prompts instead of truncating
  # them - confirm against the dataset loader.
  truncation: error
  image_key: images
  prompt_type: sft
  num_cpu: 8

# Parallelism and GPU memory settings.
# NOTE(review): rollout also defines tensor_model_parallel_size (1) and
# gpu_memory_utilization (0.6) with different values - confirm which section
# the consumer reads.
parallel_processing:
  tensor_parallel_size: 4
  gpu_memory_utilization: 0.8

# Model location.
# NOTE(review): the leading "~" is a literal character in YAML; presumably the
# consumer expands it to the home directory - confirm.
model:
  path: ~/models/Qwen2-7B-Instruct
  # No external library configured.
  external_lib: null

# Reward settings.
# NOTE(review): the accepted reward_type values are not visible in this file -
# confirm "one_score" against the consumer.
reward_model:
  reward_type: one_score

rollout:
  name: vllm
  # This will be overridden per evaluation.
  shot_number: 1

  # Sampling parameters.
  temperature: 1.0
  top_k: 50
  top_p: 0.7
  do_sample: true
  use_fire_sampling: false
  ignore_eos: false

  # Sequence-length limits (in tokens).
  max_prompt_length: 2048
  max_response_length: 2048
  max_model_len: null

  # Engine settings.
  # NOTE(review): parallel_processing sets gpu_memory_utilization to 0.8 and
  # tensor_parallel_size to 4 - confirm which values take effect.
  dtype: bfloat16
  gpu_memory_utilization: 0.6
  enforce_eager: true
  free_cache_engine: true
  load_format: dummy_dtensor
  tensor_model_parallel_size: 1
  max_num_batched_tokens: 8192
  max_num_seqs: 1024
  enable_chunked_prefill: true
  disable_log_stats: true

  # Log-prob micro-batching; only the per-GPU value is set.
  log_prob_micro_batch_size: null
  log_prob_micro_batch_size_per_gpu: 8

actor:
  strategy: fsdp
  ulysses_sequence_parallel_size: 1
  fsdp_config:
    # NOTE(review): -1 is presumably a sentinel for "use the default/full
    # size" - confirm against the FSDP worker code.
    fsdp_size: -1