# Training Configuration for DeepSeek R1 GRPO Training
# Based on parameters from deepseek_r1_0528_qwen3_(8b)_grpo.py
# This file contains all configurable parameters for training

# Project identifiers: wandb project and dataset
project:
  wandb_project: "CompleteSystem-0618"
  dataset_name: "SimuAgent/CompleteSystem"

# Base model and how it is loaded
model:
  # Other model names that have been used here:
  #   "Qwen/Qwen3-8B", "unsloth/Qwen3-4B-Base"
  name: "unsloth/DeepSeek-R1-0528-Qwen3-8B"
  max_seq_length: 4096
  # 90% quantile, as calculated in the notebook
  max_prompt_length: 512
  load_in_4bit: true
  fast_inference: true
  gpu_memory_utilization: 0.7

# LoRA adapter settings
lora:
  rank: 32
  # Notebook convention: alpha = 2 * rank
  alpha: 64
  use_gradient_checkpointing: "unsloth"
  # Modules that receive LoRA adapters
  target_modules:
    - "q_proj"
    - "k_proj"
    - "v_proj"
    - "o_proj"
    - "gate_proj"
    - "up_proj"
    - "down_proj"

# Training parameters
training:
  seed: 3407
  # Written with an explicit decimal point: YAML 1.1 loaders such as PyYAML
  # resolve a bare "5e-6" as a string instead of a float.
  learning_rate: 5.0e-6
  lr_scheduler_type: "linear"
  warmup_ratio: 0.1
  weight_decay: 0.01
  optim: "adamw_8bit"
  # NOTE(review): if these keys are forwarded to Hugging Face
  # TrainingArguments, a positive max_steps overrides num_train_epochs - confirm.
  num_train_epochs: 1
  max_steps: 100
  temperature: 1.0
  bf16: true  # Based on is_bfloat16_supported() from notebook
  fp16: false  # Complementary to bf16
  max_grad_norm: 1.0  # Standard value, not specified in notebook
  num_iterations: 1  # GRPO specific
  beta: 0.01  # GRPO specific, standard value
  # NOTE(review): TRL's GRPO config expects the effective batch size to be a
  # multiple of num_generations; verify that the trainer in use adjusts
  # per_device_train_batch_size (1) for num_generations (4) - confirm.
  num_generations: 4
  per_device_train_batch_size: 1
  gradient_accumulation_steps: 1
  gradient_checkpointing: true
  
# Saving and logging
saving:
  save_strategy: "steps"
  save_steps: 100
  # NOTE(review): if this is forwarded to Hugging Face TrainingArguments,
  # model-only checkpoints omit optimizer/scheduler/RNG state and cannot be
  # used to resume training - confirm before relying on the resume settings.
  save_only_model: true
  logging_steps: 1
  log_on_each_node: false
  log_completions: true
  report_to: "wandb"  # The notebook uses "none"; set to "none" to disable wandb reporting

# vLLM generation settings
vllm:
  use_vllm: true
  # Kept equal to model.gpu_memory_utilization
  gpu_memory_utilization: 0.7
  sampling_params:
    temperature: 1.0
    top_p: 1.0
    top_k: -1
    min_p: 0.1
    max_tokens: 1024
    seed: 3407

# Environment settings
environment:
  suppress_logs: true
  use_reflection: false
  max_steps: 1

# Evaluation settings
evaluation:
  eval_steps: 100

# Weight of each reward component (based on the DeepSeek R1 notebook reward functions)
reward_weights:
  # Components of the power system evaluation
  connectivity: 0.0  # Generator-to-load connectivity
  validation: 0.0  # Basic graph validation (errors, warnings, unconnected ports)
  parameter: 0.0  # Correctness of block parameters
  conversion: 0.0  # Success of converting to Pandapower format
  diagnostic: 0.0  # Power flow and electrical validity
  load_satisfaction: 1.0  # Primary reward: whether loads are adequately supplied
  structure: 0.0  # Overall network structure quality
  tool_execution: 0.1  # Success rate of tool calls
  format: 0.05  # XML formatting correctness
  xml: 0.05  # XML structure validity
  connection_addition: 0.0  # Successfully added connections (max: 3)
  block_addition: 0.0  # Successfully added blocks (max: 2)
  frequency_coherence: 0.0  # Consistency of frequency values across blocks
  voltage_coherence: 0.5  # Voltage-level compatibility between connected blocks
  block_effectiveness: 0.5  # Per-block effectiveness based on connectivity
  
# Settings for resuming a training run
resume:
  # Set to true to resume from the latest checkpoint
  resume_from_checkpoint: false
  # Checkpoint path to resume from; fill in when resuming
  checkpoint_path: ""

# Files that are uploaded to wandb
wandb_upload_files:
  - "envs/environments/tool_environment.py"
  - "rewards/power_system_reward.py"
  - "configs/training_config.yaml"