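# NOTE: unsloth is imported first, ahead of torch and the other ML libraries.
# Random seeds are NOT set here; they are set further down, after the project imports.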
import unsloth
import os, random, numpy as np, torch
import logging
import sys
from pathlib import Path

# Put the project root (the directory one level above this script's folder)
# at the front of the import path so the project packages imported below can
# be found.
# The script path is resolved to an absolute path first: when __file__ is a
# bare relative filename, `.parent.parent` collapses to "." instead of the
# real parent directory.
project_root = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(project_root))

# Log-noise control. With suppress_logs enabled, every logger listed below is
# raised to its threshold and the root logger only lets CRITICAL records
# through; with it disabled, only system_parser.system_graph is silenced.
suppress_logs = True

# system_parser / pandapower loggers limited to CRITICAL records.
_CRITICAL_ONLY_LOGGERS = (
    'system_parser',
    'system_parser.system_graph',
    'system_parser.pandapower',
    'system_parser.pandapower.pandapower_converter',
    'pandapower',
    'pandapower.diagnostic_reports',
)

# Other potentially noisy libraries limited to ERROR and above.
_ERROR_LEVEL_LOGGERS = ('transformers', 'torch', 'unsloth')

if not suppress_logs:
    logging.getLogger('system_parser.system_graph').setLevel(logging.CRITICAL)
else:
    for _logger_name in _CRITICAL_ONLY_LOGGERS:
        logging.getLogger(_logger_name).setLevel(logging.CRITICAL)
    for _logger_name in _ERROR_LEVEL_LOGGERS:
        logging.getLogger(_logger_name).setLevel(logging.ERROR)
    # getLogger() without a name returns the root logger.
    logging.getLogger().setLevel(logging.CRITICAL)

# Import the refactored ToolEnvironment class
from envs.environments import ToolEnvironment
from envs.validation_config import ValidationConfig
from trainers.grpo_env_trainer_unsloth import UnslothGRPOEnvTrainer
from rewards.power_system_reward import PowerSystemReward

# Reproducibility: a single seed shared by every random number generator
# this script touches.
SEED = 42


def _seed_everything(seed):
    """Seed the Python, NumPy and PyTorch RNGs and pin cuDNN behaviour."""
    # NOTE: the interpreter reads PYTHONHASHSEED at startup, so this
    # assignment does not change string hashing in the current process; it is
    # only inherited by child processes started from here.
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seed_fn in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seed_fn(seed)
    # Ask cuDNN for deterministic kernels and disable its autotuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


_seed_everything(SEED)

# Import dependencies
from trl import GRPOConfig
from unsloth import FastLanguageModel
from utils.data_utils import preprocess_dataset
from utils.wandb_callbacks import create_script_upload_callback
from tools.search_blocks import search_blocks

# Experiment identity: dataset, Weights & Biases project and run label.
dataset_name = "simuagent_dataset"
wandb_project = "simuagent_qwen3_training"
# The run name is a fixed label; nothing is generated at runtime.
run_name = "qwen3_grpo_run_001"

# Export the project name through the WANDB_PROJECT environment variable
# (presumably picked up by the wandb reporter enabled in the training args).
os.environ["WANDB_PROJECT"] = wandb_project

print(f"Run name: {run_name}")

# System prompt for tool-augmented reasoning: the model thinks inside
# <think> tags, calls tools through JSON inside <tool> tags, reads tool output
# from <result> tags and gives its final answer inside <answer> tags.
# `{tool_descriptions}` is a format placeholder; it is presumably filled in by
# ToolEnvironment from the registered tools -- TODO confirm.
# Every <answer> tag in the text is balanced, so the model is never shown a
# dangling closing tag.
SYSTEM_PROMPT = """
Think step-by-step inside <think>...</think> tags. Provide your final answer inside <answer>...</answer> tags.

You have access to tools to help solve problems:
{tool_descriptions}

Call tools using a JSON command within <tool> tags, including:

"name": tool name
"args": tool arguments
Tool output will appear in <result> tags. Multiple tool calls are allowed if needed.
<answer>...</answer> tags must contain only the final answer.
"""

# Training data: run the project's preprocessing for the configured dataset.
# The second argument is presumably the split name -- verify against
# utils.data_utils.preprocess_dataset.
_DATASET_SPLIT = "train"
dataset = preprocess_dataset(dataset_name, _DATASET_SPLIT)

# Validation settings, constructed with ValidationConfig's defaults.
validation_config = ValidationConfig()

# Per-criterion weights handed to PowerSystemReward; the six values sum to 1.
reward_weights = dict(
    convergence=0.3,
    voltage_violations=0.2,
    thermal_violations=0.2,
    power_balance=0.15,
    reactive_power=0.1,
    frequency=0.05,
)

# Reward object for the environment. It receives its own tool list (a
# separate list object from the one given to the environment) and the
# per-criterion weights defined earlier in this script.
_power_system_reward = PowerSystemReward(
    tools=[search_blocks],
    power_system_weights=reward_weights,
)

# Tool-calling environment wiring together dataset, prompt, tools, reward and
# validation settings.
# max_steps is presumably the cap on environment steps -- confirm in
# envs.environments.ToolEnvironment.
env = ToolEnvironment(
    dataset=dataset,
    system_prompt=SYSTEM_PROMPT,
    tools=[search_blocks],
    max_steps=2000,
    reward=_power_system_reward,
    validation_config=validation_config,
)

# Show the system prompt as exposed by the environment.
print(env.system_prompt)

# Base model and LoRA adapter setup through Unsloth's FastLanguageModel.
# NOTE(review): the checkpoint is Qwen2.5-Coder-3B while the wandb project and
# run name in this script say "qwen3" -- confirm which one is intended.
_BASE_CHECKPOINT = "unsloth/Qwen2.5-Coder-3B-Instruct-bnb-4bit"
# One rank value shared by the loader (max_lora_rank) and the adapter (r).
_LORA_RANK = 64

# Load the 4-bit checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=_BASE_CHECKPOINT,
    max_seq_length=8192,
    load_in_4bit=True,
    fast_inference=True,
    max_lora_rank=_LORA_RANK,
    gpu_memory_utilization=0.8,
)

# Wrap the loaded model with LoRA adapters on the listed projection modules.
model = FastLanguageModel.get_peft_model(
    model,
    r=_LORA_RANK,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=128,  # twice the LoRA rank
    use_gradient_checkpointing="unsloth",
    random_state=SEED,
)

# GRPO training configuration, grouped by concern.
training_args = GRPOConfig(
    # --- run identity / output location ---
    seed=SEED,
    run_name=run_name,
    output_dir=f"outputs/{wandb_project}/{run_name}",
    # --- optimisation ---
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    max_grad_norm=1.0,
    bf16=True,
    gradient_checkpointing=True,
    # --- training length ---
    # NOTE(review): both values are set; in Hugging Face TrainingArguments a
    # positive max_steps takes precedence over num_train_epochs -- confirm
    # this is intended.
    num_train_epochs=2,
    max_steps=1500,
    # --- batching ---
    # NOTE(review): TRL's GRPO normally expects the generation batch to be
    # divisible by num_generations (6 below), which a per-device batch of 2
    # with 2 accumulation steps is not -- confirm the Unsloth trainer
    # reconciles this.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    # --- GRPO sampling and objective ---
    num_generations=6,
    num_iterations=15,
    beta=0.1,
    temperature=0.7,
    # 4096 + 4096 equals the 8192 max_seq_length used when loading the model.
    max_prompt_length=4096,
    max_completion_length=4096,
    reward_weights=env.get_reward_weights(),
    # --- generation backend ---
    # NOTE(review): vllm_gpu_memory_utilization presumably has no effect
    # while use_vllm is False -- confirm.
    use_vllm=False,
    vllm_gpu_memory_utilization=0.8,
    # --- checkpointing ---
    save_strategy="steps",
    save_steps=150,
    save_only_model=True,
    # --- logging / reporting ---
    logging_steps=15,
    log_on_each_node=False,
    log_completions=True,
    report_to=["wandb"],
)

# Callback that uploads this script to wandb; no extra files are attached.
script_callback = create_script_upload_callback(
    script_path=__file__,
    additional_files=[],
)

# Pull the environment-provided pieces up front, in the same order the
# trainer call originally evaluated them.
_reward_funcs = env.get_reward_funcs()
_train_data = env.get_dataset()
_eval_data = env.get_eval_dataset()

# Assemble the GRPO trainer around the LoRA model and the tool environment.
# my_eval_steps is a custom trainer argument (presumably the evaluation
# interval in steps -- confirm in trainers.grpo_env_trainer_unsloth).
trainer = UnslothGRPOEnvTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=_reward_funcs,
    env=env,
    args=training_args,
    train_dataset=_train_data,
    eval_dataset=_eval_data,
    my_eval_steps=75,
    callbacks=[script_callback],
)

# Everything above runs at import time; only the training loop itself is
# gated on direct execution.
if __name__ == "__main__":
    trainer.train()