
from openai import OpenAI
import copy
import os
from typing import Dict, List
import os
from bs4 import BeautifulSoup
import re
import requests
import json
from collections import defaultdict
from json_repair import repair_json
from metric import eval_acc, eval_em, eval_f1
import numpy as np

def printf(*texts):
    """Print each argument on its own line, ANSI-coloured by the first one's prefix.

    Nothing is printed when the ``DEBUG`` environment variable is exactly the
    string ``'False'`` (it is treated as ``'True'`` when unset), or when the
    function is called without arguments.

    The colour is selected from the prefix of the *first* argument and then
    applied to every argument:

    * ``TASK>``                          -> bold blue
    * ``AGENT>``                         -> green
    * ``ROUND>`` or ``===``              -> bold yellow
    * ``ERROR>``, ``ANSWER>`` or ``ENV>`` -> bold red
    * anything else                      -> bold white

    Args:
        *texts: Values to print. The first one is converted with ``str()``
            for the prefix check, so non-string values are accepted.
    """
    if os.environ.get('DEBUG', 'True') == 'False':
        return
    if not texts:
        # Nothing to print; avoids an IndexError on texts[0].
        return

    # str() so that a non-string first argument does not break startswith().
    head = str(texts[0])
    if head.startswith('TASK>'):
        template = "\033[1;34m{text}\033[0m"
    elif head.startswith("AGENT>"):
        template = "\033[32m{text}\033[0m"
    elif head.startswith(('ROUND>', '===')):
        template = "\033[1;33m{text}\033[0m"
    elif head.startswith(('ERROR>', 'ANSWER>', 'ENV>')):
        template = "\033[1;31m{text}\033[0m"
    else:
        template = "\033[1;37m{text}\033[0m"

    for text in texts:
        print(template.format(text=text))


def load_data(filename, file_type=['json']):
    """Load data from a file or directory, dispatching on the path's suffix.

    * ``*.jsonl`` -> list with one parsed JSON value per non-blank line
    * ``*.json``  -> the parsed JSON document
    * directory   -> list of ``load_data`` results, one per directory entry
      whose name ends with one of ``file_type``
    * ``*.txt``   -> list of whitespace-stripped lines

    Args:
        filename: Path of the file or directory to load.
        file_type: Name suffixes used to select entries when ``filename`` is
            a directory. The default list is only read, never mutated.

    Returns:
        The loaded data; its shape depends on the branch taken (see above).

    Raises:
        ValueError: If ``filename`` matches none of the supported kinds.
    """
    if filename.endswith('.jsonl'):
        with open(filename) as f:
            # Skip blank lines (e.g. a trailing newline) instead of failing.
            return [json.loads(line) for line in f if line.strip()]
    elif filename.endswith('.json'):
        with open(filename, 'r') as f:
            return json.load(f)
    elif os.path.isdir(filename):
        files = [
            os.path.join(filename, f)
            for f in os.listdir(filename)
            if any(f.endswith(ext) for ext in file_type)
        ]
        return [load_data(f, file_type) for f in files]
    elif filename.endswith('.txt'):
        with open(filename, 'r') as f:
            return [line.strip() for line in f]
    else:
        # Raising a plain string is itself a TypeError; raise a real exception.
        raise ValueError(f"no suitable function to load data: {filename!r}")


def write_file(filename, data):
    """Write ``data`` to ``filename`` in a format chosen by the file suffix.

    * ``*.jsonl`` -> each item of ``data`` is JSON-encoded on its own line
    * ``*.txt``   -> each item of ``data`` (a string) is written on its own line
    * ``*.json``  -> ``data`` is dumped as one JSON document, indented by 4

    Args:
        filename: Destination path; its suffix selects the format.
        data: An iterable of items for ``.jsonl``/``.txt``, or any
            JSON-serialisable object for ``.json``.

    Raises:
        ValueError: If the suffix is not one of the supported ones. No file
            is created in that case.
    """
    if filename.endswith('.jsonl'):
        with open(filename, 'w') as f:
            for line in data:
                f.write(json.dumps(line) + '\n')
    elif filename.endswith('.txt'):
        with open(filename, 'w') as f:
            for line in data:
                f.write(line + '\n')
    elif filename.endswith('.json'):
        with open(filename, 'w') as f:
            json.dump(data, f, indent=4)
    else:
        # Raising a plain string is itself a TypeError; raise a real exception.
        raise ValueError(f"no suitable function to write data: {filename!r}")

def _parse(outputs: str, tag="task"):
    """Extract the text of ``<tag>``/``<tagN>`` elements from model output.

    ``outputs`` is parsed with BeautifulSoup (``lxml`` parser) and every
    element whose tag name matches ``{tag}\\d*`` (case-insensitive) contributes
    its stripped text.

    When no such element is found, the function falls back to splitting
    ``outputs`` on newlines and removing the literal markers ``agent N`` /
    ``agentN`` (N = 0..9) from each line.

    Args:
        outputs: Raw text to parse.
        tag: Base tag name to look for.

    Returns:
        A list of strings: the extracted element texts, or the cleaned lines
        of ``outputs`` in the fallback case.
    """
    soup = BeautifulSoup(outputs, 'lxml')

    agents = soup.find_all(re.compile(rf'{tag}\d*', re.IGNORECASE))

    extracted_texts = [agent.get_text(strip=True).strip() for agent in agents]
    if extracted_texts:
        return extracted_texts

    # Fallback: no matching tags. Previously the cleaned value was assigned to
    # the loop variable only and the raw lines were returned, so the marker
    # stripping had no effect; collect the cleaned lines instead.
    cleaned_lines = []
    for line in outputs.split('\n'):
        for i in range(0, 10):
            line = line.replace(f'agent {i}', '').replace(f'agent{i}', '').strip()
        cleaned_lines.append(line)
    return cleaned_lines

def extract_json(content):
    """Try to recover a JSON object (dict) from ``content``.

    ``content`` is passed through ``repair_json`` and then ``json.loads``.

    Args:
        content: Text expected to contain (possibly malformed) JSON.

    Returns:
        A ``(ok, value)`` tuple:

        * ``(True, dict)`` when the parsed value is a dict, or is a list
          containing at least one dict (the first dict is returned);
        * ``(False, content)`` otherwise, including when repair/parsing raises.
    """
    try:
        parsed = json.loads(repair_json(content))
    except Exception:
        # Best-effort helper: any repair/parse failure means "no JSON found".
        # ``Exception`` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate.
        return False, content

    if isinstance(parsed, dict):
        return True, parsed
    if isinstance(parsed, list):
        # Return the first dict element, if any.
        for item in parsed:
            if isinstance(item, dict):
                return True, item
    return False, content


def extract_boxed_answer(solution_string: str) -> list:
    """Return the contents of every ``\\boxed{...}`` in ``solution_string``.

    Braces are matched by depth, so nested groups such as
    ``\\boxed{\\frac{1}{2}}`` are returned whole (``\\frac{1}{2}``) and the
    content may span several lines. A ``\\boxed{`` whose braces never balance
    is skipped.

    Args:
        solution_string: Text that may contain ``\\boxed{...}`` expressions.

    Returns:
        The boxed contents, in order of appearance; empty if there are none.
    """
    marker = "\\boxed{"
    length = len(solution_string)
    answers = []
    cursor = 0
    while True:
        start = solution_string.find(marker, cursor)
        if start == -1:
            break
        body_start = start + len(marker)

        # Walk forward until the brace opened by the marker is closed.
        depth = 1
        idx = body_start
        while idx < length and depth:
            ch = solution_string[idx]
            if ch == '{':
                depth += 1
            elif ch == '}':
                depth -= 1
            idx += 1

        if depth == 0:
            # idx is one past the closing brace.
            answers.append(solution_string[body_start:idx - 1])
            cursor = idx
        else:
            # Unbalanced: skip this marker but keep looking for later ones.
            cursor = body_start

    return answers


api_search = "http://11.219.2.7:8893"  # default retriever port


def wiki_search(query, top_k=3, timeout=None):
    """Query the retriever service and format the returned documents.

    Sends ``GET {api_search}/api/search`` with ``query`` and ``k`` as query
    parameters and reads the ``topk`` list from the JSON response. Each
    document's ``text`` field is split at the first ``|`` into a title (before)
    and content (after); double quotes are removed from both.

    Args:
        query: Search string. It is URL-encoded by ``requests``, so characters
            such as ``&``, ``#`` or ``=`` no longer corrupt the request.
        top_k: Number of documents to request.
        timeout: Optional timeout in seconds passed to ``requests.get``;
            ``None`` (the default) waits indefinitely, as before.

    Returns:
        A list of strings of the form ``"Title: <title>. Content: <text>"``.

    Raises:
        requests.HTTPError: If the service answers with an error status.
    """
    response = requests.get(
        f'{api_search}/api/search',
        params={'query': query, 'k': top_k},
        timeout=timeout,
    )
    # Fail with a clear HTTP error instead of an opaque JSON/KeyError later.
    response.raise_for_status()
    res = response.json()

    knowledge = []
    for doc in res['topk']:
        title, separator, text = doc['text'].partition('|')
        if not separator:
            # No "title | content" separator: treat the whole field as content.
            title, text = '', doc['text']
        title = title.replace('"', '').strip()
        text = text.replace('"', '').strip()
        knowledge.append(f"Title: {title}. Content: {text}")
    return knowledge



class LLMServer:
    """Callable wrapper around an ``OpenAI`` client's chat-completions endpoint.

    The instance is configured from a ``model_config`` mapping and, when
    called with a list of messages, returns the text content of the first
    choice in the response.
    """

    def __init__(self, model_config):
        """Build the client from ``model_config``.

        Args:
            model_config: Mapping with required keys ``'model_name'`` and
                ``'base_url'`` and an optional ``'api_key'`` (defaults to
                ``'EMPTY'``).
        """
        self.model_config = model_config
        self.model_name = model_config['model_name']
        api_key = model_config.get('api_key', 'EMPTY')
        endpoint = model_config['base_url']
        self.client = OpenAI(api_key=api_key, base_url=endpoint)

    def __call__(self, messages, *args, **kwargs):
        """Send ``messages`` to the model and return the reply text.

        Args:
            messages: Chat messages forwarded as the ``messages`` argument.
            *args: Accepted but not used.
            **kwargs: Extra keyword arguments forwarded to
                ``chat.completions.create``; they are deep-copied first, and
                ``model``/``messages`` always override same-named entries.

        Returns:
            ``response.choices[0].message.content`` of the completion.
        """
        request = copy.deepcopy(kwargs)
        request.update(model=self.model_name, messages=messages)
        completion = self.client.chat.completions.create(**request)
        first_choice = completion.choices[0]
        return first_choice.message.content


def get_command(
        file: str,
        arguments: Dict,
        env_var: Dict,
        output_file: str
):
    """Build a ``python`` command line as a single string.

    The result has the form ``"<ENV> python <file> <ARGS>"`` where ``<ENV>``
    is the space-joined ``KEY=VALUE`` pairs of ``env_var`` and ``<ARGS>`` is
    the space-joined ``--key value`` pairs of ``arguments`` plus
    ``--output_file <output_file>``. Values are inserted verbatim (no shell
    quoting), and ``arguments`` itself is not modified.

    Args:
        file: Script path placed after ``python``.
        arguments: Option name -> value mapping for the script.
        env_var: Environment variable name -> value mapping for the prefix.
        output_file: Value of the ``output_file`` option; overrides any
            same-named entry in ``arguments``.

    Returns:
        The assembled command string.
    """
    # Work on a deep copy so the caller's mapping is left untouched.
    options = {**copy.deepcopy(arguments), 'output_file': output_file}

    option_parts = []
    for key, value in options.items():
        option_parts.append(f"--{key} {value}")
    option_text = ' '.join(option_parts)

    env_text = ' '.join(f"{name}={value}" for name, value in env_var.items())

    # Assemble the command
    return f"{env_text} python {file} {option_text}"


MODELS = [
    {
        "model_name": "api_azure_openai_o3",
        "name": "openai-o3",
        "index": "SSSO3", 
        "description": "openai-o3 is powerful model, offering solid performance at a lower cost. Though its leaderboard ranking is lower compared to the gpt-4o models, it still excels in various tasks like conversational AI and text generation, making it an excellent choice for cost-sensitive projects without sacrificing much on quality.",
        "cost": None,
        'base_url': [],
        'api_key': "EMPTY",
        "speed": [(1000, 8)],
        "Country": "United States American (USA)",
        "Origin": "OpenAI"
    },    
    {
        "model_name": "gpt-4o",
        "name": "gpt-4o",
        "description": "gpt-4o is the most powerful model in our lineup, offering state-of-the-art performance across various tasks. With its advanced architecture, it excels in complex problem-solving, natural language understanding, and generation tasks. It consistently ranks high on the leaderboard, delivering accurate and efficient results in a wide range of applications.",
        "cost": 10,
        "Country": "United States American (USA)",
        "Origin": "OpenAI"
    },
    {
        "model_name": "gpt-4o-mini",
        "name": "gpt-4o-mini",
        "description": "gpt-4o-mini is a compact and optimized version of gpt-4o, providing a balance of high performance and lower resource consumption. While it has a slightly reduced leaderboard ranking compared to its full version, it still delivers exceptional results for most use cases, making it a cost-effective choice for users needing a powerful model with lower computational demands.",
        "cost": 175,
        "Country": "United States American (USA)",
        "Origin": "OpenAI"
    },
    {
        "model_name": "gpt-3.5-turbo",
        "name": "gpt-3.5-turbo",
        "description": "gpt-3.5-turbo is a highly efficient model, offering solid performance at a lower cost. Though its leaderboard ranking is lower compared to the gpt-4o models, it still excels in various tasks like conversational AI and text generation, making it an excellent choice for cost-sensitive projects without sacrificing much on quality.",
        "cost": 175,
        "Country": "United States American (USA)",
        "Origin": "OpenAI"
    },
    {
        "model_name": "DeepSeek-V3.1",
        "name": "V3.1",
        "index": "XXXR1",
        "description": "DeepSeek-V3.1 is powerful model, offering solid performance at a lower cost. Though its leaderboard ranking is lower compared to the gpt-4o models, it still excels in various tasks like conversational AI and text generation, making it an excellent choice for cost-sensitive projects without sacrificing much on quality.",
        "cost": 671,
        'base_url': [],
        'api_key': "EMPTY",
        "speed": [(1000, 8)],
        "Country": "China",
        "Origin": "DeepSeek Company"
    },
    {
        "model_name": "api_ali_qwen3-max-preview",
        "name": "api_ali_qwen3-max-preview",
        "index": "api_ali_qwen3-max-preview",
        "description": "api_ali_qwen3-max-preview is powerful model, offering solid performance at a lower cost. Though its leaderboard ranking is lower compared to the gpt-4o models, it still excels in various tasks like conversational AI and text generation, making it an excellent choice for cost-sensitive projects without sacrificing much on quality.",
        "cost": 671,
        'base_url': [],
        'api_key': "EMPTY",
        "speed": [(1000, 8)],
        "Country": "China",
        "Origin": "Qwn Company"
    },
    {
        "model_name": "DeepSeek-R1",
        "name": "R1",
        "index": "XXXR1", 
        "description": "DeepSeek-R1 is powerful model, offering solid performance at a lower cost. Though its leaderboard ranking is lower compared to the gpt-4o models, it still excels in various tasks like conversational AI and text generation, making it an excellent choice for cost-sensitive projects without sacrificing much on quality.",
        "cost": 671,
        'base_url': ['http://28.12.129.184:8081/v1',],
        'api_key': "EMPTY",
        "speed": [(1000, 8)],
        "Country": "China",
        "Origin": "DeepSeek Company"
    },
    {
        "model_name": "api_openai_gpt-5",
        "name": "api_openai_gpt-5",
        "index": "HYHYHY", 
        "description": "api_openai_gpt-5 is powerful model, offering solid performance at a lower cost. Though its leaderboard ranking is lower compared to the gpt-4o models, it still excels in various tasks like conversational AI and text generation, making it an excellent choice for cost-sensitive projects without sacrificing much on quality.",
        "cost": None,
        'base_url': [],
        'api_key': "EMPTY",
        "speed": [(1000, 8)],
        "Country": "China",
        "Origin": "Tencent"

    },
    {
        "model_name": "hunyuan-turbos-latest",
        "name": "hunyuan-turbo",
        "index": "HYHYHY", 
        "description": "hunyuan-turbos-latest is powerful model, offering solid performance at a lower cost. Though its leaderboard ranking is lower compared to the gpt-4o models, it still excels in various tasks like conversational AI and text generation, making it an excellent choice for cost-sensitive projects without sacrificing much on quality.",
        "cost": None,
        'base_url': [],
        'api_key': "EMPTY",
        "speed": [(1000, 8)],
        "Country": "China",
        "Origin": "Tencent"

    },
    {
        "model_name": "api_azure_openai_gpt-4.1",
        "name": "api_azure_openai_gpt-4.1",
        "index": "GGGPRO", 
        "description": "api_azure_openai_gpt-4.1 is powerful model, offering solid performance at a lower cost. Though its leaderboard ranking is lower compared to the gpt-4o models, it still excels in various tasks like conversational AI and text generation, making it an excellent choice for cost-sensitive projects without sacrificing much on quality.",
        "cost": None,
        'base_url': [],
        'api_key': "EMPTY",
        "speed": [(1000, 8)],
        "Country": "United States American (USA)",
        "Origin": "OpenAI"
    },
    {
        "model_name": "api_xai_grok-4-0709",
        "name": "api_xai_grok-4-0709",
        "index": "GGGPRORKX4", 
        "description": "api_xai_grok-4-0709 is powerful model, offering solid performance at a lower cost. Though its leaderboard ranking is lower compared to the gpt-4o models, it still excels in various tasks like conversational AI and text generation, making it an excellent choice for cost-sensitive projects without sacrificing much on quality.",
        "cost": None,
        'base_url': [],
        'api_key': "EMPTY",
        "speed": [(1000, 8)],
        "Country": "United States American (USA)",
        "Origin": "XAI Company"
    },
    {
        "model_name": "api_azure_openai_gpt-5",
        "name": "api_azure_openai_gpt-5",
        "index": "api_azure_openai_gpt-5", 
        "description": "api_azure_openai_gpt-5 is powerful model, offering solid performance at a lower cost. Though its leaderboard ranking is lower compared to the gpt-4o models, it still excels in various tasks like conversational AI and text generation, making it an excellent choice for cost-sensitive projects without sacrificing much on quality.",
        "cost": None,
        'base_url': [],
        'api_key': "EMPTY",
        "speed": [(1000, 8)],
        "Country": "United States American (USA)",
        "Origin": "OpenAI"
    },   
    {
        "model_name": "api_azure_openai_gpt-5-chat-latest",
        "name": "api_azure_openai_gpt-5-chat-latest",
        "index": "api_azure_openai_gpt-5-chat-latest", 
        "description": "api_azure_openai_gpt-5-chat-latest is powerful model, offering solid performance at a lower cost. Though its leaderboard ranking is lower compared to the gpt-4o models, it still excels in various tasks like conversational AI and text generation, making it an excellent choice for cost-sensitive projects without sacrificing much on quality.",
        "cost": None,
        'base_url': [],
        'api_key': "EMPTY",
        "speed": [(1000, 8)],
        "Country": "United States American (USA)",
        "Origin": "OpenAI"
    },
    {
        "model_name": "api_openai_gpt-5-chat-latest",
        "name": "api_openai_gpt-5-chat-latest",
        "index": "api_openai_gpt-5-chat-latest", 
        "description": "api_openai_gpt-5-chat-latest is powerful model, offering solid performance at a lower cost. Though its leaderboard ranking is lower compared to the gpt-4o models, it still excels in various tasks like conversational AI and text generation, making it an excellent choice for cost-sensitive projects without sacrificing much on quality.",
        "cost": None,
        'base_url': [],
        'api_key': "EMPTY",
        "speed": [(1000, 8)],
        "Country": "United States American (USA)",
        "Origin": "OpenAI"
    },
    {
        "model_name": "api_google_gemini-2.5-pro",
        "name": "api_google_gemini-2.5-pro",
        "index": "GGGPRO", 
        "description": "api_google_gemini-2.5-pro is powerful model, offering solid performance at a lower cost. Though its leaderboard ranking is lower compared to the gpt-4o models, it still excels in various tasks like conversational AI and text generation, making it an excellent choice for cost-sensitive projects without sacrificing much on quality.",
        "cost": None,
        'base_url': [],
        'api_key': "EMPTY",
        "speed": [(1000, 8)],
        "Country": "United States American (USA)",
        "Origin": "Google & DeepMind"
    },
    {
        "model_name": "api_google_gemini-2.5-flash",
        "name": "api_google_gemini-2.5-flash",
        "index": "GGGPRO", 
        "description": "api_google_gemini-2.5-flash is powerful model, offering solid performance at a lower cost. Though its leaderboard ranking is lower compared to the gpt-4o models, it still excels in various tasks like conversational AI and text generation, making it an excellent choice for cost-sensitive projects without sacrificing much on quality.",
        "cost": None,
        'base_url': [],
        'api_key': "EMPTY",
        "speed": [(1000, 8)],
        "Country": "United States American (USA)",
        "Origin": "Google & DeepMind"
    },
    {
        "model_name": "GLM-4.5",
        "name": "GLM-4.5",
        "index": "GLMM4.5", 
        "description": "GLM-4.5 is powerful model, offering solid performance at a lower cost. Though its leaderboard ranking is lower compared to the gpt-4o models, it still excels in various tasks like conversational AI and text generation, making it an excellent choice for cost-sensitive projects without sacrificing much on quality.",
        "cost": None,
        'base_url': ['http://28.12.24.153:8081/v1',],
        'api_key': "EMPTY",
        "speed": [(1000, 8)],
        "Country": "China",
        "Origin": "Zhipu AI"
    },
    {
        "model_name": "api_google_gemini-2.5-pro-preview-06-05",
        "name": "google_gemini-2.5",
        "index": "GGGPRO", 
        "description": "google_gemini-2.5 is powerful model, offering solid performance at a lower cost. Though its leaderboard ranking is lower compared to the gpt-4o models, it still excels in various tasks like conversational AI and text generation, making it an excellent choice for cost-sensitive projects without sacrificing much on quality.",
        "cost": None,
        'base_url': [],
        'api_key': "EMPTY",
        "speed": [(1000, 8)],
        "Country": "United States American (USA)",
        "Origin": "Google & DeepMind"
    },
    {
        "model_name": "api_openai_chatgpt-4o-latest",
        "name": "api_openai_chatgpt-4o-latest",
        "index": "CCC4oLastest", 
        "description": "api_openai_chatgpt-4o-latest is powerful model, offering solid performance at a lower cost. Though its leaderboard ranking is lower compared to the gpt-4o models, it still excels in various tasks like conversational AI and text generation, making it an excellent choice for cost-sensitive projects without sacrificing much on quality.",
        "cost": None,
        'base_url': [],
        'api_key': "EMPTY",
        "speed": [(1000, 8)],
        "Country": "United States American (USA)",
        "Origin": "OpenAI"
    },
    {
        "model_name": "api_aws_anthropic.claude-opus-4-20250514-v1:0",
        "name": "opus",
        "index": "WWWOPUS", 
        "description": "opus is powerful model, offering solid performance at a lower cost. Though its leaderboard ranking is lower compared to the gpt-4o models, it still excels in various tasks like conversational AI and text generation, making it an excellent choice for cost-sensitive projects without sacrificing much on quality.",
        "cost": None,
        'base_url': [],
        'api_key': "EMPTY",
        "speed": [(1000, 8)],
        "Country": "United States American (USA)",
        "Origin": "Anthropic"
    },
    {
        "model_name": "api_azure_openai_o4-mini",
        "name": "api_azure_openai_o4-mini",
        "index": "WWWSON", 
        "description": "api_azure_openai_o4-mini is powerful model, offering solid performance at a lower cost. Though its leaderboard ranking is lower compared to the gpt-4o models, it still excels in various tasks like conversational AI and text generation, making it an excellent choice for cost-sensitive projects without sacrificing much on quality.",
        "cost": None,
        'base_url': [],
        'api_key': "EMPTY",
        "speed": [(1000, 8)],
        "Country": "United States American (USA)",
        "Origin": "OpenAI"
    }, 
    
    {
        "model_name": "api_anthropic_claude-opus-4-1-20250805",
        "name": "api_anthropic_claude-opus-4-1-20250805",
        "index": "AAA20250805", 
        "description": "api_anthropic_claude-opus-4-1-20250805 is powerful model, offering solid performance at a lower cost. Though its leaderboard ranking is lower compared to the gpt-4o models, it still excels in various tasks like conversational AI and text generation, making it an excellent choice for cost-sensitive projects without sacrificing much on quality.",
        "cost": None,
        'base_url': [],
        'api_key': "EMPTY",
        "speed": [(1000, 8)],
        "Country": "United States American (USA)",
        "Origin": "Anthropic"
    },
    # meta-ll    
    {
        "model_name": "api_aws_anthropic.claude-sonnet-4-20250514-v1:0",
        "name": "sonnet",
        "index": "WWWSON", 
        "description": "sonnet is powerful model, offering solid performance at a lower cost. Though its leaderboard ranking is lower compared to the gpt-4o models, it still excels in various tasks like conversational AI and text generation, making it an excellent choice for cost-sensitive projects without sacrificing much on quality.",
        "cost": None,
        'base_url': [],
        'api_key': "EMPTY",
        "speed": [(1000, 8)],
        "Country": "United States American (USA)",
        "Origin": "Anthropic"
    },
    {
        "model_name": "api_moonshot_kimi-k2-0711-preview",
        "name": "kimi-k2-0711-preview",
        "index": "XXX102", 
        "description": "api_moonshot_kimi-k2-0711-preview is powerful model, offering solid performance at a lower cost. Though its leaderboard ranking is lower compared to the gpt-4o models, it still excels in various tasks like conversational AI and text generation, making it an excellent choice for cost-sensitive projects without sacrificing much on quality.",
        "cost": 405, 
        'base_url': [],
        'api_key': "EMPTY",
        "speed": [(1000, 8)],
        "Country": "China",
        "Origin": "Moonshot from China"
    },
    {
        "model_name": "Llama-4-Scout-17B-16E-Instruct",
        "name": "Llama102B",
        "index": "XXX102", 
        "description": "Llama102B is powerful model, offering solid performance at a lower cost. Though its leaderboard ranking is lower compared to the gpt-4o models, it still excels in various tasks like conversational AI and text generation, making it an excellent choice for cost-sensitive projects without sacrificing much on quality.",
        "cost": 405, 
        'base_url': [
            'http://29.164.185.253:8081/v1'
            ,],
        'api_key': "EMPTY",
        "speed": [(1000, 8)],
        "Country": "United States American (USA)",
        "Origin": "Meta Company"
    },
    {
        "model_name": "Llama-4-Maverick-17B-128E-Instruct",
        "name": "Llama402B",
        "index": "XXX402", 
        "description": "Llama405B is powerful model, offering solid performance at a lower cost. Though its leaderboard ranking is lower compared to the gpt-4o models, it still excels in various tasks like conversational AI and text generation, making it an excellent choice for cost-sensitive projects without sacrificing much on quality.",
        "cost": 405,
        'base_url': ['http://28.12.24.153:8081/v1',],
        'api_key': "EMPTY",
        "speed": [(1000, 8)],
        "Country": "United States American (USA)",
        "Origin": "Meta Company"
    },
    {
        "model_name": "Llama-3.1-405B-Instruct",
        "name": "Llama405B",
        "index": "XXX405", 
        "description": "Llama405B is powerful model, offering solid performance at a lower cost. Though its leaderboard ranking is lower compared to the gpt-4o models, it still excels in various tasks like conversational AI and text generation, making it an excellent choice for cost-sensitive projects without sacrificing much on quality.",
        "cost": 405,
        'base_url': ['http://28.12.24.153:8081/v1',],
        'api_key': "EMPTY",
        "speed": [(1000, 8)],
        "Country": "United States American (USA)",
        "Origin": "Meta Company"
    },
    {
        "model_name": "Mistral-Small-Instruct-2409",
        "name": "Mistral24B",
        "index": "AAA",
        "description": "Mistral-Small-Instruct-2409 is a lightweight model that balances performance with efficiency. It costs 24 per token.",
        "cost": 24,
        "Average": 29.92,
        "IFEval": 62.83,
        "BBH": 40.56,
        "MMLU": 20.39,
        "GPQA": 11.07,
        "MuSR": 10.23,
        "MATH": 34.43,
        'base_url': ['http://11.219.2.7:9081/v1'],
        'api_key': "EMPTY",
        "speed": [(4557.78, 4)],
        "Country": "France (Europe)",
        "Origin": "Mixtral Company"
    },
    {
        "model_name": "Qwen3-235B-A22B",
        "name": "Qwen235B",
        "index": "BBB",
        "description": "Qwen235B is a highly advanced model, excelling in complex problem-solving and deep analysis tasks. With a high leaderboard ranking, it provides top-tier performance, making it ideal for scenarios where accuracy and reliability are paramount. Its cost is slightly higher, reflecting its superior capabilities.",
        "cost": 37,
        "Average": 50.00,
        "IFEval": 70,
        "BBH": 70,
        "MMLU": 70,
        "GPQA": 70,
        "MuSR": 70,
        "MATH": 70,
        'base_url': [
        'http://29.81.244.19:8082/v1',
        #   'http://28.12.24.153:8082/v1'
          ],
        'api_key': "EMPTY",
        "Country": "China",
        "Origin": "Alibaba Cloud"
    },
    {
        "model_name": "Qwen3-235B-A22B-Instruct-2507",
        "name": "Qwen3-235B-A22B-Instruct-2507",
        "index": "BBB",
        "description": "Qwen235B is a highly advanced model, excelling in complex problem-solving and deep analysis tasks. With a high leaderboard ranking, it provides top-tier performance, making it ideal for scenarios where accuracy and reliability are paramount. Its cost is slightly higher, reflecting its superior capabilities.",
        "cost": 37,
        "Average": 50.00,
        "IFEval": 70,
        "BBH": 70,
        "MMLU": 70,
        "GPQA": 70,
        "MuSR": 70,
        "MATH": 70,
        'base_url': [
            'http://29.164.185.253:8082/v1',
          ],
        'api_key': "EMPTY",
        "Country": "China",
        "Origin": "Alibaba Cloud"
    },
    {
        "model_name": "DeepSeek-V3",
        "name": "DeepSeek671B",
        "index": "CCC",
        "description": "DeepSeek-V3 is a highly advanced model, excelling in complex problem-solving and deep analysis tasks. With a high leaderboard ranking, it provides top-tier performance, making it ideal for scenarios where accuracy and reliability are paramount. Its cost is slightly higher, reflecting its superior capabilities.",
        "cost": 37,
        "Average": 50.00,
        "IFEval": 70,
        "BBH": 70,
        "MMLU": 70,
        "GPQA": 70,
        "MuSR": 70,
        "MATH": 70,
        'base_url': ['http://28.12.131.215:8081/v1'],
        'api_key': "EMPTY",
        "Country": "China",
        "Origin": "DeepSeek Company"
    },
    {
        "model_name": "Qwen2-1.5B-Instruct",
        "name": "Qwen1.5B",
        "index": "DDD",
        "description": "Qwen2-1.5B-Instruct is a small model that performs relatively well across various tasks.",
        "cost": 1.5,
        "Average": 14.14,
        "IFEval": 33.71,
        "BBH": 13.70 ,
        "MATH": 7.18,
        "GPQA": 1.57,
        "MuSR": 12.03,
        "MMLU": 16.68,
        'base_url': ['http://11.222.165.18:8081/v1'],
        'api_key': "EMPTY",
        "speed": [(19952.18, 1), (17497.67 , 1)],
        "Country": "China",
        "Origin": "Alibaba cloud"
    },
    {
        "model_name": "Qwen2.5-3B-Instruct",
        "name": "Qwen3B",
        "index": "EEE",
        "description": "Qwen2.5-3B-Instruct is a small model that performs relatively well across various tasks.",
        "cost": 3,
        "Average": 27.16,
        "IFEval": 64.75,
        "BBH": 25.80,
        "MATH": 36.78,
        "GPQA": 3.02,
        "MuSR": 7.57,
        "MMLU": 25.05,
        'base_url': ['http://11.222.165.18:8082/v1'],
        'api_key': "EMPTY",
        "speed": [(12680.62, 1), (12583.20, 1)],
        "Country": "China",
        "Origin": "Alibaba cloud"
    },
    {
        "model_name": "phi-4",
        "name": "phi4",
        "index": "FFF",
        "description": "phi-4 is a small model that performs relatively well across various tasks.",
        "cost": 7,
        "Average":  41.76,
        "IFEval": 69.00,
        "BBH": 55.80,
        "MATH": 46.37,
        "GPQA": 13.53,
        "MuSR": 16.68,
        "MMLU": 49.15,
        'base_url': ['http://11.222.165.18:8084/v1'],
        'api_key': "EMPTY",
        "speed": [(4523.51, 2)],
        "Country": "United States American (USA)",
        "Origin": "Microsoft"
    },
    {
        "model_name": "Phi-4-reasoning-plus",
        "name": "phi4plus",
        "index": "GGG",
        "description": "phi-4 is a small model that performs relatively well across various tasks.",
        "cost": 7,
        "Average": 40.95,
        "IFEval": 67.36,
        "BBH": 55.88,
        "MATH": 45.69,
        "GPQA": 12.53,
        "MuSR": 15.14,
        "MMLU": 49.12,
        'base_url': ['http://11.219.0.41:8083/v1'],
        'api_key': "EMPTY",
        "speed": [],
        "Country": "United States American (USA)",
        "Origin": "Microsoft"
    },
    {
        "model_name": "Qwen2.5-7B-Instruct",
        "name": "Qwen7B",
        "index": "HHH",
        "description": "Qwen2.5-7B-Instruct is a small model that performs relatively well across various tasks.",
        "cost": 7,
        "Average": 35.20,
        "IFEval": 75.85,
        "BBH": 34.89,
        "MATH": 50.00,
        "GPQA": 5.48,
        "MuSR": 8.45,
        "MMLU": 36.52,
        'base_url': ['http://11.219.2.7:8081/v1'],
        'api_key': "EMPTY",
        "speed": [(7942.57, 1)],
        "Country": "China",
        "Origin": "Alibaba cloud"
    },
    { 
        "model_name": "Llama-3.1-8B-Instruct",
        "name": "Llama8B",
        "index": "III",
        "description": "Llama-3.1-8B-Instruct is a small model that performs relatively well across various tasks.",
        "cost": 8,
        "Average": 23.76,
        "IFEval": 49.22,
        "BBH": 29.38,
        "MATH": 15.56,
        "GPQA": 8.72,
        "MuSR": 8.61,
        "MMLU": 31.09,
        'base_url': [
                      'http://11.219.2.7:9082/v1',
                     ],
        'api_key': "EMPTY",
        "speed": [(7005.68, 1)],
        "Country": "United States American (USA)",
        "Origin": "Meta"
    },
    {
        "model_name": "gemma-2-9b-it",
        "name": "Gemma9B",
        "index": "JJJ",
        "description": "Gemma9B is a model that performs relatively well across various tasks.",
        "cost": 9,
        "Average":  32.07,
        "IFEval": 74.36,
        "BBH": 42.14,
        "MATH": 19.49,
        "GPQA": 14.77,
        "MuSR": 9.74,
        "MMLU": 31.95,
        'base_url': ['http://11.222.165.18:8085/v1'],
        'api_key': "EMPTY",
        "speed": [(6518.22, 2), (8170.07, 2)],
        "Country": "United States American (USA)",
        "Origin": "Google & DeepMind"
    },
    {
        "model_name": "gemma-3-12b-it",
        "name": "Gemma12B",
        "description": "Gemma12B is a model that performs relatively well across various tasks.",
        "cost": 12,
        "Average":  32.07,
        "IFEval": 74.36,
        "BBH": 42.14,
        "MATH": 19.49,
        "GPQA": 14.77,
        "MuSR": 9.74,
        "MMLU": 31.95,
        'base_url': ['http://11.222.165.18:8086/v1'],
        'api_key': "EMPTY",
        "speed": [],
        "Country": "United States American (USA)",
        "Origin": "Google & DeepMind"
    },
    {
        "model_name": "Qwen2.5-14B-Instruct",
        "name": "Qwen14B",
        "index": "KKK",
        "description": "Qwen2.5-14B-Instruct is a model that performs relatively well across various tasks.",
        "cost": 14,
        "Average": 41.31,
        "IFEval": 81.58,
        "BBH": 48.36,
        "MATH": 54.76,
        "GPQA": 9.62,
        "MuSR": 10.16,
        "MMLU": 43.38,
        'base_url': [],
        'api_key': "EMPTY",
        "speed": [(6518.22, 2), ],
        "Country": "China",
        "Origin": "Alibaba cloud"
    },
    {
        "model_name": "Qwen3-235B-A22B-Thinking-2507",
        "name": "Qwen3-235B-A22B-Thinking-2507",
        "index": "Qwen3-235B-A22B-Thinking-2507",
        "description": "Qwen3-235B-A22B-Thinking-2507 is a model that performs relatively well across various tasks.",
        "cost": 32,
        "Average": 46.60,
        "IFEval": 83.46,
        "BBH": 56.49,
        "MATH": 62.54,
        "GPQA": 11.74,
        "MuSR": 13.50,
        "MMLU": 51.85,
        'base_url': ['http://29.164.185.253:8082/v1', 'http://28.12.129.242:8082/v1', 'http://29.81.244.19:8082/v1'],
        'api_key': "EMPTY",
        "speed": [(5478.04, 4), (5901.12, 4)],
        "Country": "China",
        "Origin": "Alibaba cloud"
    },
    {
        "model_name": "Qwen2.5-32B-Instruct",
        "name": "Qwen32B",
        "index": "LLL",
        "description": "Qwen2.5-32B-Instruct is a model that performs relatively well across various tasks.",
        "cost": 32,
        "Average": 46.60,
        "IFEval": 83.46,
        "BBH": 56.49,
        "MATH": 62.54,
        "GPQA": 11.74,
        "MuSR": 13.50,
        "MMLU": 51.85,
        'base_url': ['http://11.222.154.165:8081/v1', 'http://11.222.154.165:8082/v1'],
        'api_key': "EMPTY",
        "speed": [(5478.04, 4), (5901.12, 4)],
        "Country": "China",
        "Origin": "Alibaba cloud"
    },
    {
        "model_name": "hunyuan-t1-latest",
        "name": "hunyuan-t1-latest",
        "index": "hunyuan-t1-latest",
        "description": "hunyuan-t1-latest is a model that performs well across various tasks.",
        "cost": 32,
        "Average": 22.96,
        "IFEval": 41.86,
        "BBH": 17.15,
        "MATH": 17.07,
        "GPQA":4.59,
        "MuSR":16.14,
        "MMLU": 40.96,
        'base_url': [],
        'api_key': "EMPTY",
        "speed": [(3947.77, 4), (5624.25, 4), (6095.65, 4)],
        "Country": "China",
        "Origin": "Tencent Company"
    },    
    {
        "model_name": "DeepSeek-R1-Distill-Qwen-32B",
        "name": "DSQwen32B",
        "index": "MMM",
        "description": "Qwen32B is a model that performs well across various tasks.",
        "cost": 32,
        "Average": 22.96,
        "IFEval": 41.86,
        "BBH": 17.15,
        "MATH": 17.07,
        "GPQA":4.59,
        "MuSR":16.14,
        "MMLU": 40.96,
        'base_url': ['http://11.219.2.7:8082/v1'],
        'api_key': "EMPTY",
        "speed": [(3947.77, 4), (5624.25, 4), (6095.65, 4)],
        "Country": "China",
        "Origin": "DeepSeek Company"
    },
    {
        "model_name": "QwQ-32B-Preview",
        "name": "QwQ32B",
        "index": "NNN",
        "description": "QwQ-32B-Preview is a model that performs well across various tasks.",
        "cost": 32,
        "Average": 34.12 ,
        "IFEval": 40.35,
        "BBH": 53.39,
        "MATH": 44.94,
        "GPQA": 4.25,
        "MuSR": 9.81,
        "MMLU": 51.98,
        'base_url': ['http://11.219.9.133:8081/v1', 'http://11.219.9.133:8082/v1'],
        'api_key': "EMPTY",
        "speed": [(4157.55, 8), (4201.77, 8)],
        "Country": "United States American (USA)",
        "Origin": "Huggingface Community"
    },
   {
        "model_name": "api_doubao_DeepSeek-V3.1-250821",
        "name": "api_doubao_DeepSeek-V3.1-250821",
        "index": "OOO",
        "description": "api_doubao_DeepSeek-V3.1-250821 is a model that performs well across various tasks. It costs 72 per token.",
        "cost": 72,
        "Average": 43.59 ,
        "IFEval": 79.89,
        "BBH": 57.48 ,
        "MATH": 41.77,
        "GPQA": 16.33,
        "MuSR": 17.17,
        "MMLU": 48.92,
        'base_url': ["http://11.222.185.154:8081/v1"],
        'api_key': "EMPTY",
        "speed": [(4157.55, 8), (4201.77, 8)],
        "Country": "China",
        "Origin": "Alibaba cloud"
    },
    {
        "model_name": "Qwen2-72B-Instruct",
        "name": "Qwen72B",
        "index": "OOO",
        "description": "Qwen72B-Instruct is a model that performs well across various tasks. It costs 72 per token.",
        "cost": 72,
        "Average": 43.59 ,
        "IFEval": 79.89,
        "BBH": 57.48 ,
        "MATH": 41.77,
        "GPQA": 16.33,
        "MuSR": 17.17,
        "MMLU": 48.92,
        'base_url': ["http://11.222.185.154:8081/v1"],
        'api_key': "EMPTY",
        "speed": [(4157.55, 8), (4201.77, 8)],
        "Country": "China",
        "Origin": "Alibaba cloud"
    },
    {
        "model_name": "Llama-3.1-70B-Instruct",
        "name": "Llama70B",
        "index": "PPP",
        "description": "Llama70B is a model that performs well across various tasks. It costs 72 per token.",
        "cost": 70,
        "Average":  43.41,
        "IFEval": 86.69,
        "BBH": 55.93,
        "MATH": 38.07,
        "GPQA": 14.21,
        "MuSR": 17.69,
        "MMLU": 47.88,
        'base_url': ['http://30.159.162.134:8081/v1'],
        'api_key': "EMPTY",
        "speed": [(5279.23, 8), (5097.09, 8)],
        "Country": "United States American (USA)",
        "Origin": "Meta"
    },
    {
        "model_name": "Llama-3.3-70B-Instruct",
        "name": "Llama70B1",
        "index": "PPP",
        "description": "Llama70B is a model that performs well across various tasks. It costs 72 per token.",
        "cost": 70,
        "Average":  43.41,
        "IFEval": 86.69,
        "BBH": 55.93,
        "MATH": 38.07,
        "GPQA": 14.21,
        "MuSR": 17.69,
        "MMLU": 47.88,
        'base_url': ['http://30.159.163.237:8081/v1'],
        'api_key': "EMPTY",
        "speed": [(5279.23, 8), (5097.09, 8)],
        "Country": "United States American (USA)",
        "Origin": "Meta"
    },
    {
        "model_name": "Mixtral-8x7B-v0.1",
        "name": "Mixtral56B",
        "index": "QQQ",
        "description": "Mixtral is a model that performs well across various tasks. It costs 20 per token.",
        "leaderboard": "19.67",
        "cost": 22,
        "Average": 19.67,
        "IFEval": 23.26,
        "BBH": 30.40,
        "MATH": 9.37,
        "GPQA": 9.40,
        "MuSR": 13.66,
        "MMLU": 31.90,
        'base_url': [],
        'api_key': "EMPTY",
        "device": 8,
        "Country": "France (Europe)",
        "Origin": "Mixtral Company"
    }
]

# Identifier mode, chosen at import time through the ID environment variable
# (default 'name'). When ID == 'index', each model's display name is
# overwritten in place with its 'index' alias; entries without an 'index' key
# keep their current name. Because MODELS is mutated here, the name-keyed
# lookups built further down in this module see the aliased names.
# NOTE(review): several visible entries share the same index (e.g. "OOO",
# "PPP"), so in index mode name-keyed dicts keep only the last such entry.
ID = os.environ.get('ID', 'name')
if ID == 'index':
    printf("AGENT> Using the index instead of name as ID...")
    for config in MODELS:
        # The fallback config['name'] is evaluated eagerly, so every entry
        # must already carry a 'name' key even when 'index' is present.
        config['name'] = config.get('index', config['name'])
        printf(config['name'])

# Country anonymisation, chosen at import time through the COUNTRY environment
# variable (default 'true'). When COUNTRY == 'pseudo', each model's 'Country'
# is replaced in place by a neutral group label and its 'Origin' is rebuilt
# from that label.
country = os.environ.get('COUNTRY', 'true')
if country == 'pseudo':
    printf("AGENT> Using the pseudo country of true country...")
    # Real country string -> anonymised label. Only these three values are
    # mapped; any other 'Country' value raises KeyError in the loop below.
    pseudo_map = {
        "France (Europe)": 'Group 1 Country',
        "China": 'Group 2 Country',
        "United States American (USA)": 'Group 3 Country',
    }
    for config in MODELS:
        config['Country'] = pseudo_map[config['Country']]
        # Origin is derived from the already-anonymised country label, so the
        # original company name is discarded.
        config['Origin'] = config['Country'] + "' s Company"
        printf(config['Country'], config['Origin'])


# Lookup from the full model identifier ('model_name') to its config entry.
# The values are the same dict objects that live in MODELS (no copies).
DOCUMENTATIONS = dict((entry['model_name'], entry) for entry in MODELS)

# Placeholder entry for baseline runs. It is stored under the key 'baseline'
# while its own name fields read "baselines".
DOCUMENTATIONS.update(
    baseline=dict(
        model_name="baselines",
        name="baselines",
        Country="None",
        Origin="None",
    )
)

# Lookup from the display name ('name') to the same config entries. The
# baseline placeholder is not part of this table.
DOCUMENTATIONS_R = dict((entry['name'], entry) for entry in MODELS)

# Human-readable caption for every leaderboard column, keyed by column name.
# Benchmark captions are two lines: a description followed by a scoring rule
# (the CO2 caption pairs an overview with a "why it matters" line); the
# metadata columns ('Average', 'Country', 'Origin') are single-line.
LEADERBOARD = {
    'IFEval': (
        "Test the model's ability to follow explicit formatting instructions.\n"
        "Scoring: Accuracy – Was the format strictly followed as requested?"
    ),
    'BBH': (
        "A collection of challenging tasks for members across domains, such as language understanding, mathematical reasoning, common sense, and world knowledge.\n"
        "Scoring: Accuracy – Was the correct choice selected from the available options?"
    ),
    'MATH': (
        "High school-level mathematical problems, including topics such as complex algebra, geometry, and advanced calculus.\n"
        "Scoring: Exact match – Was the solution both correct and in the expected format?"
    ),
    'GPQA': (
        "PhD-level multiple choice questions across science domains, such as chemistry, biology, and physics.\n"
        "Scoring: Accuracy – Was the correct choice selected from the available options?"
    ),
    'MuSR': (
        "Multistep soft reasoning (MuSR) task. Involves reasoning and understanding of long texts, such as language understanding, reasoning capabilities, and long-context reasoning.\n"
        "Scoring: Accuracy – Was the correct choice selected from the available options?"
    ),
    'MMLU': (
        "Massive Multitask Language Understanding – Professional (MMLU-Pro). A collection of expertly reviewed multiple choice questions across domains such as medicine, healthcare, law, ethics, engineering, and mathematics.\n"
        "Scoring: Accuracy – Was the correct choice selected from the available options?"
    ),
    'CO2': (
        "Overview: Evaluation of CO2 emissions associated with members inference. This score focuses on model inference within our specific setup, considering factors like data center location and energy mix. It allows for the equivalent comparison of models in our use case.\n"
        "Why it matters: Large models can have significant carbon footprints, and this evaluation helps make informed choices about model selection based on their environmental impact."
    ),
    'Average': "Purpose: General capability evaluation score.",
    'Country': "Purpose: Country of the members.",
    'Origin': "Purpose: Origin of the members.",
}



def load_leaderboard(
        names: List[str],
        leaderboards: List[str] = None,
        width: int = 10,
    ):
    """Render a plain-text leaderboard table for the given models.

    The result has three parts: a fixed-width table (one header row plus one
    row per entry of ``names``), a caption paragraph per selected column taken
    from ``LEADERBOARD``, and a closing instruction line listing ``names``.

    Args:
        names: Display names to include; each must be a key of
            ``DOCUMENTATIONS_R``.
        leaderboards: Column names to show. Each must be a key of both the
            model config and ``LEADERBOARD``. ``None`` (the default) selects
            IFEval, MATH, GPQA, MuSR, MMLU and Average. Any sequence is
            accepted.
        width: Column width; the name column is right-aligned to
            ``width + 2`` characters, the others left-aligned to ``width``.

    Returns:
        The rendered table, captions and instruction line as a single string.

    Raises:
        KeyError: If a name or a column is unknown.
    """
    # A None sentinel replaces the former list default so no mutable object is
    # shared between calls.
    if leaderboards is None:
        leaderboards = ["IFEval", "MATH", "GPQA", "MuSR", "MMLU", "Average"]

    header = ["name", *leaderboards]
    rows = [header]
    rows.extend([DOCUMENTATIONS_R[name][col] for col in header] for name in names)

    table = '\n'.join(
        ' '.join([row[0].rjust(width + 2)] + [str(cell).ljust(width) for cell in row[1:]])
        for row in rows
    )
    captions = '\n\n'.join(f'**{col}**: ' + LEADERBOARD[col] for col in leaderboards)

    return (
        table
        + "\n\nCaption: **Overview** of each column." + '\n\n' + captions
        + f"\nYou can only select from {names} with HP > 0 (enclosed in <agent> <agent_name>)."
    )


def gini_coefficient(wealth):
    """Return the Gini coefficient of a wealth distribution.

    The values are sorted in ascending order and the coefficient is computed
    from the sum of their running totals:
    ``(n + 1 - 2 * sum(cumsum) / total) / n``.

    Args:
        wealth: Sequence or array of numeric values (assumed non-negative;
            this is not checked).

    Returns:
        The coefficient as a float, or the integer ``0`` when the values sum
        to zero (which includes empty input).
    """
    ordered = np.sort(wealth)
    total = ordered.sum()
    # Guard first: a zero total would otherwise divide by zero below.
    if total == 0:
        return 0

    count = len(ordered)
    running_total = ordered.cumsum().sum()
    return (count + 1 - 2 * running_total / total) / count

def theil_index(incomes):
    """Return the Theil T index of an income distribution.

    Computed as ``(1 / N) * sum((x_i / mean) * ln(x_i / mean))``. The index is
    0 for perfect equality and ``ln(N)`` when a single member holds
    everything. It equals ``ln(N)`` minus the Shannon entropy of the income
    shares; the entropy alone (largest for perfect equality) is not an
    inequality measure.

    Args:
        incomes: Sequence or array of numeric incomes. Zero entries contribute
            nothing (the ``0 * ln(0) = 0`` convention). Negative entries are
            outside the index's domain and lead to ``nan``.

    Returns:
        The index as a float; ``0.0`` for empty input or when the incomes sum
        to zero, where there is nothing to distribute unequally.
    """
    incomes = np.asarray(incomes, dtype=float)
    count = incomes.size
    if count == 0:
        return 0.0

    total_income = np.sum(incomes)
    if total_income == 0:
        return 0.0

    mean_income = total_income / count
    # Drop exact zeros before taking the log: 0 * ln(0) is treated as 0, while
    # evaluating it numerically would give nan.
    ratios = incomes[incomes != 0] / mean_income

    return np.sum(ratios * np.log(ratios)) / count

def coefficient_of_variation(data):
    """Return the coefficient of variation of ``data`` as a percentage.

    Computed as ``std / mean * 100`` with the population standard deviation
    (NumPy's default ``ddof=0``).

    Args:
        data: Sequence or array of numeric values.

    Returns:
        The coefficient in percent. Empty input and constant all-zero input
        return ``0.0`` (no dispersion) instead of ``nan``. A zero mean with
        non-zero spread returns ``inf`` because the ratio is unbounded.
    """
    data = np.asarray(data, dtype=float)
    if data.size == 0:
        return 0.0

    mean = np.mean(data)
    std_dev = np.std(data)
    if mean == 0:
        # Avoid 0/0 -> nan (and the accompanying RuntimeWarning) when every
        # value is zero.
        return 0.0 if std_dev == 0 else np.inf

    return (std_dev / mean) * 100

# Import-time connectivity smoke test, enabled by setting the TEST environment
# variable to 'True'. The model is chosen by the MODEL environment variable and
# looked up in DOCUMENTATIONS by its full model name (KeyError if unknown).
# One "hello world" chat completion is sent to every endpoint listed in the
# model's 'base_url' and the reply is printed; a model with an empty
# 'base_url' list sends nothing.
if os.environ.get('TEST', 'False') == 'True':
    model = os.environ.get('MODEL', 'Qwen2.5-7B-Instruct')
    print(model)
    config = DOCUMENTATIONS[model]
    for base_url in config['base_url']:
        print(base_url)
        # NOTE(review): the key is hard-coded here; the model's own 'api_key'
        # config field is not used by this check.
        client = OpenAI(
            base_url=base_url,
            api_key='empty'
        )

        messages=[{"role": "user", "content": "hello world"}]
        full_reply = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=0.6,
            top_p=0.95
        )
        print(full_reply.choices[0].message.content)

# Reverse lookups from a model's 'index' alias / display 'name' to its full
# 'model_name'. Entries lacking the respective key are skipped; when several
# entries share a key, the last one in MODELS wins.
index2model = dict(
    (entry['index'], entry['model_name']) for entry in MODELS if 'index' in entry
)
name2model = dict(
    (entry['name'], entry['model_name']) for entry in MODELS if 'name' in entry
)

def dazzy_map(folder):
    """Map a result-folder name to the model identifier it refers to.

    The rules are substring heuristics evaluated in order; the first match
    wins. Most checks are case-insensitive; the bare ``'google'`` and
    ``'GLM-4.5'`` checks are case-sensitive.

    Args:
        folder: Folder name (or path fragment) to classify.

    Returns:
        The model identifier string associated with the first matching rule.

    Raises:
        ValueError: If no rule matches ``folder``.
    """
    lowered = folder.lower()

    if 'r1' in lowered:
        return 'DeepSeek-R1'
    if 'deepseek-v3' in lowered and '3.1' not in lowered:
        return 'DeepSeek-V3'
    if 'v3.1' in lowered:
        return 'api_doubao_DeepSeek-V3.1-250821'
    if 'api_ali_qwen3-max-preview' in lowered:
        return 'api_ali_qwen3-max-preview'
    # Must run before the bare '72'/'70' heuristics: the "0709" suffix of this
    # identifier contains "70" and would otherwise be classified as Llama-70B.
    if 'api_xai_grok-4-0709' in lowered:
        return 'api_xai_grok-4-0709'
    if '72' in folder:
        return 'Qwen2-72B-Instruct'
    if '70' in folder:
        return 'Llama-3.1-70B-Instruct'
    if 'google' in folder:
        return "api_google_gemini-2.5-pro-preview-06-05"
    if '405' in folder:
        return 'Llama-3.1-405B-Instruct'
    if 'maverick' in lowered:
        return "Llama-4-Maverick-17B-128E-Instruct"
    if 'scout' in lowered:
        return "Llama-4-Scout-17B-16E-Instruct"
    if 'hunyuan' in lowered:
        return "hunyuan-turbos-latest"
    if '235' in lowered:
        return 'Qwen3-235B-A22B'
    if 'baseline' in lowered:
        return 'baseline'
    # 'pro' is excluded so that o3-pro folders are not mapped to plain o3.
    if 'o3' in lowered and 'pro' not in lowered:
        return 'api_azure_openai_o3'
    if 'opus' in lowered:
        return 'api_aws_anthropic.claude-opus-4-20250514-v1:0'
    if 'GLM-4.5' in folder:
        return 'GLM-4.5'
    if 'api_azure_openai_gpt-4.1' in lowered:
        return 'api_azure_openai_gpt-4.1'
    if 'sonnet' in lowered:
        return "api_aws_anthropic.claude-sonnet-4-20250514-v1:0"
    if 'api_moonshot_kimi-k2-0711-preview' in lowered:
        return 'api_moonshot_kimi-k2-0711-preview'
    if 'api_azure_openai_gpt-5' in lowered:
        return 'api_azure_openai_gpt-5'
    if "api_openai_chatgpt-4o-latest" in lowered:
        return 'api_openai_chatgpt-4o-latest'
    if 'api_openai_gpt-5-chat-latest' in lowered:
        return 'api_openai_gpt-5-chat-latest'
    if 'api_azure_openai_o4-mini' in lowered:
        return 'api_azure_openai_o4-mini'
    # Folder names that omit the "api_moonshot_" prefix.
    if 'kimi-k2-0711-preview' in lowered:
        return 'api_moonshot_kimi-k2-0711-preview'
    raise ValueError(f"Unknown folder name: {folder}")
