import os
import sys
from tqdm import tqdm
import torch

# Optionally extend the import search path with the directory named by the
# PROJECT_ROOT environment variable (presumably so the project-local `env`
# import that follows resolves). Nothing is added when the variable is unset,
# empty, or already present in sys.path.
project_root = os.environ.get("PROJECT_ROOT")
needs_path_entry = bool(project_root) and project_root not in sys.path
if needs_path_entry:
    sys.path.append(project_root)
from env import Agent, Math500Dataset

# Root directory under which the model checkpoint, the dataset and the xVerify
# checkpoint are looked up; supplied through the DATA_ROOT environment variable.
data_root = os.environ.get("DATA_ROOT")
if data_root is None:
    # Without this guard the f-strings below would silently produce paths such
    # as "None/DeepSeek-R1-Distill-Qwen-7B" and fail much later, inside the
    # model/dataset loaders, with an error unrelated to the real cause.
    raise RuntimeError(
        "The DATA_ROOT environment variable is not set; "
        "it must point to the directory containing the models and datasets."
    )

# Checkpoint to evaluate. The directory's base name is used as the model label.
model_path = f"{data_root}/DeepSeek-R1-Distill-Qwen-7B"
# model_path = f"{data_root}/Qwen3-8B"
model_name = os.path.basename(model_path)

# Device string passed to both the Agent and the Math500Dataset.
device = "cuda:3"

model = Agent(model_path=model_path, is_anyprecision=False, device=device)

dataset = Math500Dataset(
    dataset_path=f"{data_root}/efficient-reasoning/MATH-500",
    prompt_type="better",
    shuffle=False,
    xverify_path=f"{data_root}/xVerify-9B-C",
    device=device,
)
dataset_name = "MATH-500"

# Prompts to feed to the model, in dataset order (shuffle=False above).
prompt = dataset.get_prompt()

# Run the model once per prompt, collecting each returned answer together
# with the length value returned alongside it.
answers = []
lengths = []
progress = tqdm(range(len(prompt)), desc="Processing prompt without kv-cache")
# NOTE(review): index-based access is kept on purpose; the container type
# returned by get_prompt() is not visible here, so only len() and [] are assumed.
for idx in progress:
    reply, reply_length = model(prompt[idx])
    print(reply)
    answers.append(reply)
    lengths.append(reply_length)

# NOTE: clearing the KV-cache after each inference (model.model.clear_kv_cache(),
# falling back to torch.cuda.empty_cache()) is currently disabled.

# Score the collected answers with the dataset's xVerify evaluation entry point.
# results = dataset.result_eval(answers)
results = dataset.eval_xverify(answers)
# Arithmetic mean of the per-prompt lengths. When no prompts were processed,
# report 0.0 instead of raising ZeroDivisionError.
mean_length = sum(lengths) / len(lengths) if lengths else 0.0

# Create the output directory if it does not exist yet.
results_dir = "./results"
os.makedirs(results_dir, exist_ok=True)

# One report per dataset/model pair; an existing report is overwritten ("w" mode).
result_file = os.path.join(results_dir, f"{dataset_name}_{model_name}_results_xverify.txt")

# Assemble the whole report first so it is written with a single call.
separator = "=" * 50
report_lines = [
    "Evaluation Results:",
    separator,
    f"Dataset: {dataset_name}",
    f"Model: {model_name}",
    f"Accuracy: {results}",
    f"Mean Length: {mean_length}",
    "",
    separator,
]

# Explicit UTF-8 keeps the report's encoding independent of the platform's
# default locale encoding.
with open(result_file, "w", encoding="utf-8") as f:
    f.write("\n".join(report_lines) + "\n")
