# Training Configuration for SimuAgent
# This file contains all configurable parameters for training

# Project settings: dataset name and wandb project name
project:
  dataset_name: 'SimuAgent/CompleteSystem'
  wandb_project: 'CompleteSystem-0618'

# Model configuration
model:
  # Model identifier, written in "organisation/name" form.
  name: 'Qwen/Qwen2.5-7B-Instruct'
  # Length limits, both expressed as multiples of 512.
  max_seq_length: 4096  # 8 * 512
  max_prompt_length: 512  # 1 * 512
  load_in_4bit: true
  fast_inference: true
  # NOTE(review): a key with the same name is also set under `vllm` (0.9);
  # confirm which of the two values the training code actually reads.
  gpu_memory_utilization: 0.6

# LoRA configuration
lora:
  rank: 32
  alpha: 32  # Same value as rank
  # Names of the modules targeted by LoRA.
  target_modules:
    - "q_proj"
    - "k_proj"
    - "v_proj"
    - "o_proj"
    - "gate_proj"
    - "up_proj"
    - "down_proj"
  # A string mode rather than a boolean.
  # NOTE(review): `training.gradient_checkpointing` is set as well; confirm
  # how the consumer combines the two settings.
  use_gradient_checkpointing: "unsloth"

# Training parameters
training:
  seed: 1000
  # Written with an explicit decimal point on purpose: YAML 1.1 loaders such
  # as PyYAML resolve a bare `5e-6` as the string "5e-6", not as a float.
  learning_rate: 5.0e-6
  lr_scheduler_type: "constant_with_warmup"
  warmup_steps: 10
  # NOTE(review): both an epoch count and a step cap are set; confirm which
  # one the trainer treats as the stopping criterion.
  num_train_epochs: 1
  max_steps: 2000
  temperature: 1.0
  bf16: true
  max_grad_norm: 0.1
  num_iterations: 2
  beta: 0.002
  num_generations: 4
  per_device_train_batch_size: 4  # Same as num_generations
  gradient_accumulation_steps: 1
  gradient_checkpointing: true
  
# Saving and logging
saving:
  # Checkpointing: step-based strategy with an interval of 100.
  save_strategy: 'steps'
  save_steps: 100
  save_only_model: true
  # Logging: interval of 1, reported to wandb.
  logging_steps: 1
  log_on_each_node: false
  log_completions: true
  report_to: 'wandb'

# VLLM configuration
vllm:
  use_vllm: true
  # NOTE(review): `model.gpu_memory_utilization` is set to 0.6 in this file;
  # confirm which of the two values the consumer applies.
  gpu_memory_utilization: 0.9

# Environment configuration
environment:
  # A separate key from `training.max_steps` (2000); the two share a name
  # but live in different sections.
  max_steps: 3
  use_reflection: false
  suppress_logs: true

# Evaluation
evaluation:
  # Same interval as `saving.save_steps` (100).
  # NOTE(review): no evaluation strategy key is set in this file; confirm the
  # consumer enables step-based evaluation elsewhere.
  eval_steps: 100

# Reward weights (default values from PowerSystemReward)
# NOTE(review): the "default values" claim cannot be checked from this file;
# verify against rewards/power_system_reward.py.
# The weights are not normalised: the non-zero entries below sum to 3.7.
reward_weights:
  # Power System Evaluation Components
  connectivity: 0.0                    # Generator-to-load connectivity
  validation: 0.0                      # Basic graph validation (errors, warnings, unconnected ports)
  parameter: 0.0                       # Correctness of block parameters
  conversion: 0.0                      # Success of converting to Pandapower format
  diagnostic: 0.0                      # Power flow and electrical validity
  load_satisfaction: 1.0               # Whether loads are adequately supplied (primary reward)
  structure: 0.0                       # Overall network structure quality
  tool_execution: 0.10                 # Success rate of tool calls
  format: 0.05                         # XML formatting correctness
  xml: 0.05                            # XML structure validity
  connection_addition: 0.5             # Reward for successfully adding connections (max: 3)
  block_addition: 0.5                  # Reward for successfully adding blocks (max: 2)
  frequency_coherence: 0.5             # Consistency of frequency values across blocks
  voltage_coherence: 0.5               # Compatibility of voltage levels between connected blocks
  port_connectivity: 0.0               # Connectivity of ports
  block_effectiveness: 0.5             # Effectiveness of individual blocks based on connectivity
  
# Resume training settings
resume:
  # Checkpoint location to resume from; set this when resuming a run.
  checkpoint_path: 'outputs/CompleteSystem0611/grpo-seed1000-AW/checkpoint-1000'
  # Whether to resume from the latest checkpoint.
  # NOTE(review): checkpoint_path above names one specific checkpoint
  # (checkpoint-1000); confirm how the consumer combines the two keys.
  resume_from_checkpoint: true

# Files to upload to wandb
# All entries are relative paths.
# NOTE(review): confirm which directory the uploader resolves them against.
wandb_upload_files:
  - "envs/environments/tool_environment.py"
  - "rewards/power_system_reward.py"
  - "configs/training_config.yaml"