import sys
import time
sys.path.append('src/rocketeval')

from rocketeval.data.data_loader import load_target_models


# Experiment configuration: data/config locations, the benchmark to use and
# the judge model passed to the scoring and ranking tasks.
data_dir = "data/"
config_dir = "config/"
dataset = "wildbench"
judge = "Qwen2.5-0.5B-Instruct"

# Load the "full" split first and the "test" split second; the two calls
# differ only in the `split` argument.
train_model_names, test_model_names = (
    load_target_models(
        data_dir=data_dir,
        config_dir=config_dir,
        dataset_name=dataset,
        split=split_name,
    )
    for split_name in ("full", "test")
)

# Keep only the models that are not part of the test split, so the training
# pool and the test set are disjoint.
train_model_names = [
    name for name in train_model_names if name not in test_model_names
]

# Position of every test model inside `test_model_names`.
# NOTE(review): this position is later treated as the reference rank, which
# presumes the loader returns the test models in reference order - confirm.
test_model_rank = {
    name: position for position, name in enumerate(test_model_names)
}

import random
import pandas as pd
from tqdm import tqdm
from scipy.stats import spearmanr, kendalltau
from rocketeval.task import score_task, ranking_task

# Sweep the number of training models K and, for each K, measure how well the
# ranking produced by the judge agrees with the reference order of the test
# models. One summary (mean/std/min/max over 10 random subsets) is appended to
# "n_training_models.csv" per K.

random.seed(0)  # fixed seed so the sampled training subsets are reproducible

# Reference ranks 0..N-1 of the test models; identical for every run, so it
# is built once instead of twice per iteration.
reference_rank = list(range(len(test_model_names)))
results = []

for k in tqdm(range(5, 50), leave=False):
    # Reset per K: without this the statistics written for a given K would
    # also include every run of all smaller K values.
    results = []

    for i in tqdm(range(10), leave=False):
        # random.sample raises ValueError if k exceeds the number of
        # available training models.
        score_task(
            dataset=dataset,
            train_model_names=random.sample(train_model_names, k),
            test_model_names=test_model_names,
            task_id="",
            judge=judge,
            data_dir=data_dir
        )

        # NOTE(review): ranking_task receives no training models, so it
        # presumably reads the scores that score_task just produced - confirm.
        ranking = ranking_task(
            dataset=dataset,
            model_names=test_model_names,
            judge=judge,
            data_dir=data_dir
        )['model_test'].to_list()

        # Translate the returned model order into reference positions.
        result_rank = [test_model_rank[model] for model in ranking]

        # Keep only the correlation coefficient ([0]) of each statistic.
        results.append(
            (spearmanr(result_rank, reference_rank)[0],
             kendalltau(result_rank, reference_rank)[0])
        )
        time.sleep(1)

    # Four rows in the order mean, std, min, max; columns are spearmanr,
    # kendalltau and K. The row labels are not written (index=False) and no
    # header is emitted because the file is opened in append mode.
    df = pd.DataFrame(results, columns=["spearmanr", "kendalltau"]).agg(['mean', 'std', 'min', 'max'], axis=0)
    df['K'] = k
    df.to_csv("n_training_models.csv", index=False, mode='a', header=False, sep=",")


