# Hydra runtime settings: the run directory is the current directory and
# output_subdir is null (presumably so Hydra does not create its own per-run
# output folder — confirm against the Hydra documentation).
hydra:
  run:
    dir: .
  output_subdir: null

# Experiment identity and local output locations.
# exp_name is reused through ${exp_name} in checkpoint_config.output_dir.
exp_name: bargain-r1-llm-judge-rewards
seed: 42
logging_dir: ./output/logs/bargain_mixed
output_dir: ./output/bargain-r1-mixed
# Presumably exported as environment variables for the workers — confirm.
# The value is quoted so it stays the string "1" instead of the integer 1.
system_envs:
  USE_MODELSCOPE: "1"

# Checkpoint storage. The target directory embeds the experiment name via
# the ${exp_name} interpolation.
checkpoint_config:
  type: file_system
  output_dir: /data/cpfs_0/rl_examples/models/${exp_name}

# Experiment tracking.
# Alternative wandb setup, kept commented out for reference:
# track_with: wandb
# tracker_kwargs:
#   api_key:
#   project: roll_examples
#   notes: roll_examples
#   tags:
#     - rlvr
#     - baseline

# Active tracker: tensorboard, writing to the log_dir below.
track_with: tensorboard
tracker_kwargs:
  log_dir: /data/oss_bucket_0/rl_examples/llm/tensorboard/roll_exp/rlvr

# Cluster size.
# NOTE(review): actor_train, actor_infer and reference all set
# device_mapping to list(range(0,7)), i.e. seven device ids (0..6), while a
# single GPU on a single node is declared here — confirm which is intended.
num_nodes: 1
num_gpus_per_node: 1

# Step budget and the save / logging / evaluation intervals.
max_steps: 500
save_steps: 100
logging_steps: 1
eval_steps: 10
resume_from_checkpoint: false

# Batch sizes
# rollout_batch_size is counted in prompts.
rollout_batch_size: 32
prompt_length: 2048
# Reused as max_new_tokens by validation and actor_infer via ${response_length}.
response_length: 4096

# RL algorithm settings
# num_return_sequences_in_group is reused by
# actor_infer.generating_args.num_return_sequences.
num_return_sequences_in_group: 1
ppo_epochs: 1
adv_estimator: reinforce

# Clipping
value_clip: 0.5
reward_clip: 10
advantage_clip: 2.0
dual_clip_loss: true

# Reward normalization (norm is null, shift and scale are off)
reward_norm: null
reward_shift: false
reward_scale: false

# Data masking
# NOTE(review): num_return_sequences_in_group is 1 in this file. If the
# difficulty of a prompt is estimated from the samples of its group, the
# 0.1 / 0.95 thresholds only ever see one sample per prompt — confirm that
# difficulty_mask is meant to be enabled with this group size.
max_len_mask: true
difficulty_mask: true
difficulty_low_threshold: 0.1
difficulty_high_threshold: 0.95
error_max_len_clip: false

# Loss weighting by data properties (both off)
difficulty_loss_weight: false
length_loss_weight: false

# Reward shaping
add_token_level_kl: false

# Advantage post-processing
whiten_advantages: true

# Dynamic sampling scheduler (disabled; kept for reference)
# use_additional_prompts: true
# max_running_requests: 256
# is_num_return_sequences_expand: false

# Model paths
# Both entries currently point at the same Qwen2.5-7B-Instruct directory.
# pretrain is referenced by rewards.bargain_llm_judge, reward_pretrain by
# rewards.bargain.
pretrain: ../ROLL/pretrain_model/Qwen2.5-7B-Instruct
reward_pretrain: ../ROLL/pretrain_model/Qwen2.5-7B-Instruct  
# TODO: this needs to be switched to idelab ChatGPT.

# Validation: data source and sampling parameters for validation generation.
validation:
  data_args:
    template: qwen2_5
    file_name:  # TODO: validation data — this file is also listed in actor_train.data_args.file_name; presumably a dedicated validation file should replace it
      - data/sample_llm_judge.jsonl
  generating_args:
    # Generation budget follows the global response_length.
    max_new_tokens: ${response_length}
    top_p: 0.6
    top_k: 50
    num_beams: 1
    temperature: 0.6
    num_return_sequences: 1

# Actor Train Worker
# Policy model being optimized; uses the megatron_train strategy.
actor_train:
  model_args:
    disable_gradient_checkpointing: false
    dtype: bf16
    model_type: null

  training_args:
    learning_rate: 1.0e-6
    weight_decay: 0
    per_device_train_batch_size: 1
    gradient_accumulation_steps: 1
    warmup_steps: 20
    num_train_epochs: 50

  data_args:
    template: qwen2_5
    # TODO: decide whether mixed-data training should be used.
    file_name:
      - data/sample_llm_judge.jsonl
      - data/sample_data.jsonl
    # Per-domain probabilities; the keys are the same names used for the
    # reward workers under `rewards` and in their tag_included lists.
    domain_interleave_probs:
      bargain_llm_judge: 0.7
      bargain: 0.3
    dataset_dir: data
    messages: messages
    # Kept as a quoted string on purpose; unquoted it would parse as a float.
    interleave_probs: "1.0"
    preprocessing_num_workers: 16

  strategy_args:
    strategy_name: megatron_train
    strategy_config:
      tensor_model_parallel_size: 1
      pipeline_model_parallel_size: 1
      expert_model_parallel_size: 1
      use_distributed_optimizer: true
      recompute_granularity: full

  # TODO: debug value; adjust to the actual machine before real training.
  # NOTE(review): list(range(0,7)) is seven device ids (0..6). That does not
  # match num_gpus_per_node: 1, and 7 does not divide rollout_batch_size: 32
  # — confirm whether range(0,8) or a single device was intended.
  device_mapping: list(range(0,7))
  infer_batch_size: 2

# Actor Infer Worker
# Rollout generation for the policy; uses the vllm strategy.
actor_infer:
  model_args:
    disable_gradient_checkpointing: true
    dtype: bf16

  generating_args:
    # Generation budget and group size follow the global settings.
    max_new_tokens: ${response_length}
    top_p: 0.99
    top_k: 100
    num_beams: 1
    temperature: 0.99
    num_return_sequences: ${num_return_sequences_in_group}

  data_args:
    template: qwen2_5

  strategy_args:
    strategy_name: vllm
    strategy_config:
      gpu_memory_utilization: 0.8
      block_size: 16
      # prompt_length (2048) + response_length (4096) = 6144, which is
      # within this limit.
      max_model_len: 8000

  # Same device list as actor_train and reference.
  device_mapping: list(range(0,7))
  infer_batch_size: 1

# Reference Worker
# Inference-only model served through the megatron_infer strategy, with the
# same parallel sizes as actor_train.
reference:
  model_args:
    disable_gradient_checkpointing: true
    dtype: bf16
    model_type: null

  data_args:
    template: qwen2_5

  strategy_args:
    strategy_name: megatron_infer
    strategy_config:
      tensor_model_parallel_size: 1
      pipeline_model_parallel_size: 1
      expert_model_parallel_size: 1

  # Same device list as actor_train and actor_infer.
  device_mapping: list(range(0,7))
  infer_batch_size: 1

# Critic Worker (disabled)  # TODO: what is the difference between Critic and Judge?
# critic:
#   name: critic
#   worker_cls: roll.distributed.worker.learner.critic_learner_worker.CriticLearnerWorker
#   world_size: 8
  
  # model_args:
  #   model_type: value_model
  #   dtype: bf16
    
  # training_args:
  #   per_device_train_batch_size: 4
  #   gradient_accumulation_steps: 16
  #   learning_rate: 1e-5
  #   weight_decay: 0.01
  #   max_grad_norm: 1.0
    
  # strategy_args:
  #   strategy_name: deepspeed_train
  #   strategy_config: ${deepspeed_zero2}
    
#   device_mapping:
#     critic: [[0, 3, 0], [0, 3, 1], [0, 3, 2], [0, 3, 3], [0, 3, 4], [0, 3, 5], [0, 3, 6], [0, 3, 7]]

# Reward Workers
# One entry per reward domain. Each worker lists the data tag it handles in
# tag_included; the tags match the keys of
# actor_train.data_args.domain_interleave_probs.
rewards:
  # LLM-as-judge reward; judge_model_type is `api`.
  bargain_llm_judge:
    judge_model_type: api
    # Quoted so the id stays a string rather than an integer.
    judge_bot_id: "1614001"
    worker_cls: reward.BargainLLMJudgeRewardWorker

    model_args:
      model_name_or_path: ${pretrain}

    world_size: 4
    tag_included:
      - bargain_llm_judge

    query_filter_config:
      type: no_filter

  # Rule-based reward.
  bargain:
    worker_cls: reward.BargainRuleRewardWorker
    reward_type: soft
    response_length_penalty_coef: 0.0
    model_args:
      model_name_or_path: ${reward_pretrain}
    data_args:
      template: qwen2_5
    tag_included:
      - bargain
    world_size: 8
    infer_batch_size: 4

# TODO: rule-based only; the LLM-based part is still missing.