# Hydra runtime settings.
hydra:
  run:
    # Use the current working directory as the run directory.
    dir: .
  # null turns off Hydra's per-run config subdirectory.
  output_subdir: null

# Experiment settings
# exp_name is interpolated into checkpoint_config.output_dir as ${exp_name}.
exp_name: "LHRL-VGR"
seed: 42
logging_dir: ./output/logs
output_dir: ./output
# Presumably exported as environment variables for the workers (TODO confirm).
# The value is quoted so it stays the string '1' rather than an integer.
system_envs:
  USE_MODELSCOPE: '1'

# # openlm_hub model download (disabled alternative)
# model_download_type: OPENLM_HUB

# checkpoint_config:
#   type: mos

# Active checkpoint settings: file_system type with a per-experiment directory.
checkpoint_config:
  type: file_system
  # ${exp_name} resolves to the top-level exp_name value.
  output_dir: /data/cpfs_0/rl_examples/models/${exp_name}

# Disabled alternative tracker (wandb); kept as a template.
#track_with: wandb
#tracker_kwargs:
#  api_key:
#  project: roll_examples
#  notes: roll_examples
#  tags:
#    - rlvr
#    - baseline

# Active tracker: tensorboard, writing to the relative log_dir below.
track_with: tensorboard
tracker_kwargs:
  log_dir: ./rl_examples/llm/tensorboard/roll_exp/rlvr

# GPUs per node. The device_mapping entries in this file use indices 0-7,
# which fits a single 8-GPU node.
num_gpus_per_node: 8

# Step-based schedule: total steps, then checkpoint / log / eval intervals
# (units presumably pipeline steps — TODO confirm).
max_steps: 500
save_steps: 50
logging_steps: 1
eval_steps: 20
resume_from_checkpoint: false

# Batch sizes
rollout_batch_size: 16  # counted in prompts
# prompt_length + response_length = 5120, below actor_infer's max_model_len (8000).
prompt_length: 4096
# Referenced as ${response_length} by the validation and actor_infer
# generating_args (max_new_tokens).
response_length: 1024

# RL algorithm settings
# The 2-stage / mid-reward switches are project-specific; their exact semantics
# are not visible in this file (TODO confirm against the pipeline code).
if_2_stage: true
if_share_reward: false
if_mid_reward: true
# Matches the `sotopia-policy` entry under `rewards`.
mid_domain: sotopia-policy
stage1_sample_mode: 1

# Referenced by actor_infer.generating_args.num_return_sequences.
num_return_sequences_in_group: 8
ppo_epochs: 1
adv_estimator: "grpo"
# NOTE(review): k1 / k2 are undocumented here; both equal
# num_return_sequences_in_group — confirm whether they must stay in sync.
k1: 8
k2: 8


# clip
value_clip: 0.5
reward_clip: 10
advantage_clip: 2.0
dual_clip_loss: true

# normalize
# Reward normalization is fully disabled: no norm, no shift, no scale.
reward_norm: null
reward_shift: false
reward_scale: false

# data mask
max_len_mask: true
difficulty_mask: true
# Thresholds used with difficulty_mask; presumably bounds on a per-prompt
# success rate outside which samples are masked (TODO confirm).
difficulty_low_threshold: 0.1
difficulty_high_threshold: 0.95
error_max_len_clip: false

# data weight
difficulty_loss_weight: false
length_loss_weight: false

# reward
add_token_level_kl: false

# advantage
whiten_advantages: true

# dynamic sampling scheduler
# use_additional_prompts: true
# max_running_requests: 256
# is_num_return_sequences_expand: false

# Model paths
# NOTE(review): `pretrain` looks like a model-hub identifier with a version
# suffix rather than a filesystem path — confirm it resolves in the target
# environment.
pretrain: idlefish_llm.sotopia-sft/version=1
reward_pretrain: Qwen/Qwen2.5-0.5B-instruct

# Validation data and generation settings.
validation:
  data_args:
    template: qwen2_5
    file_name:
      - data/sotopia/sotopia_2_stage_0911_val.jsonl
  generating_args:
    # Same response budget as training rollouts.
    max_new_tokens: ${response_length}
    # Lower top_p / temperature than actor_infer (0.99 / 0.7), and a single
    # sequence per prompt.
    top_p: 0.6
    top_k: 50
    num_beams: 1
    temperature: 0.6
    num_return_sequences: 1

# Actor Train Worker
# Trains the policy with the megatron_train strategy on devices 0-7.
actor_train:
  model_args:
    disable_gradient_checkpointing: false
    dtype: bf16
    model_type: null

  training_args:
    learning_rate: 2.0e-7
    weight_decay: 0
    # 1 per device x 16 accumulation steps x 8 devices = 128 samples, which
    # equals rollout_batch_size (16) x num_return_sequences_in_group (8).
    # Presumably intentional — TODO confirm before changing either side.
    per_device_train_batch_size: 1
    gradient_accumulation_steps: 16
    warmup_steps: 20
    num_train_epochs: 50

  data_args:
    template: qwen2_5
    file_name:
      - data/LHRL/sotopia_2_stage_0911.jsonl
    domain_interleave_probs:
      sotopia: 1.0
    dataset_dir: data
    messages: messages
    # Deliberately left as a quoted string; the expected type is not visible
    # in this file.
    interleave_probs: "1.0"
    preprocessing_num_workers: 8

  strategy_args:
    strategy_name: megatron_train
    strategy_config:
      tensor_model_parallel_size: 1
      pipeline_model_parallel_size: 1
      expert_model_parallel_size: 1
      use_distributed_optimizer: true
      recompute_granularity: full

  # Unquoted string expression, presumably evaluated by the launcher into
  # device indices 0-7 (TODO confirm).
  device_mapping: list(range(0,8))
  infer_batch_size: 4

# Actor Infer Worker
# Generates rollouts with the vllm strategy on the same devices (0-7) as
# actor_train and reference.
actor_infer:
  model_args:
    disable_gradient_checkpointing: true
    dtype: bf16

  generating_args:
    max_new_tokens: ${response_length}
    top_p: 0.99
    top_k: 100
    num_beams: 1
    temperature: 0.7
    # One group of sampled responses per prompt.
    num_return_sequences: ${num_return_sequences_in_group}

  data_args:
    template: qwen2_5

  strategy_args:
    strategy_name: vllm
    strategy_config:
      gpu_memory_utilization: 0.8
      block_size: 16
      # Must cover prompt_length + response_length (4096 + 1024 = 5120).
      max_model_len: 8000

  device_mapping: list(range(0,8))
  infer_batch_size: 1

# Reference Worker
# Inference-only worker (megatron_infer strategy) on devices 0-7.
reference:
  model_args:
    disable_gradient_checkpointing: true
    dtype: bf16
    model_type: null

  data_args:
    template: qwen2_5

  strategy_args:
    strategy_name: megatron_infer
    strategy_config:
      # Same parallel sizes as actor_train's strategy_config.
      tensor_model_parallel_size: 1
      pipeline_model_parallel_size: 1
      expert_model_parallel_size: 1

  device_mapping: list(range(0,8))
  infer_batch_size: 8

# Reward Workers
# One entry per reward domain. Both entries use an API judge
# (judge_model_type: api) and differ only in worker_cls and tag_included.
#
# NOTE(review): ${your_api_url} and ${your_api_key} interpolate top-level keys
# that this file does not define. They must be supplied elsewhere (for example
# as a command-line override), otherwise resolving these values will fail.
# Keep the real API key out of version control.
#
# NOTE(review): each entry sets both world_size (16) and a device_mapping of
# two devices (6-7) that are also in the actor/reference mappings (0-7).
# Confirm which of the two settings the launcher honours for API-based judges.
rewards:
  sotopia:
    worker_cls: reward.SotopiaRewardWorker
    judge_model_type: api
    judge_model_name: "Claude-3.5-Sonnet"
    judge_api_url: ${your_api_url}
    judge_api_key: ${your_api_key}
    tag_included: [sotopia]
    world_size: 16
    # Explicit null: no local model arguments are configured. To use the
    # template below, replace `null` with the uncommented keys.
    model_args: null
      # model_name_or_path: "Qwen/Qwen2.5-0.5B-instruct"
      # flash_attn: fa2
      # disable_gradient_checkpointing: true
      # dtype: bf16
      # model_type: trl
    generating_args:
      max_new_tokens: 200
      top_p: 0.8
      top_k: 50
      num_beams: 1
      temperature: 0
      num_return_sequences: 1
    data_args:
      template: qwen2_5
    strategy_args:
      strategy_name: hf_infer
      strategy_config: null
    device_mapping: list(range(6,8))
    infer_batch_size: 4
  # Key matches the top-level mid_domain value.
  sotopia-policy:
    worker_cls: reward.SotopiaPolicyRewardWorker
    judge_model_type: api
    judge_model_name: "Claude-3.5-Sonnet"
    judge_api_url: ${your_api_url}
    judge_api_key: ${your_api_key}
    tag_included: [sotopia-policy]
    world_size: 16
    # Explicit null: no local model arguments are configured. To use the
    # template below, replace `null` with the uncommented keys.
    model_args: null
      # model_name_or_path: "Qwen/Qwen2.5-0.5B-instruct"
      # flash_attn: fa2
      # disable_gradient_checkpointing: true
      # dtype: bf16
      # model_type: trl
    generating_args:
      max_new_tokens: 200
      top_p: 0.8
      top_k: 50
      num_beams: 1
      temperature: 0
      num_return_sequences: 1
    data_args:
      template: qwen2_5
    strategy_args:
      strategy_name: hf_infer
      strategy_config: null
    device_mapping: list(range(6,8))
    infer_batch_size: 4
