
















import re
from typing import Callable, List, Optional, Tuple, Dict
from collections import Counter, defaultdict
from math_eval import agent_evaluation



class Agent:
    """Math-solving agent that samples several LLM answers and majority-votes.

    The agent sends the same prompt to the LLM ``num_samples`` times, pulls
    the boxed integer answer out of each response, and returns one full
    response whose extracted answer is the most common one.
    """

    # Integer answer wrapped in a LaTeX \boxed{...} command. Compiled once
    # because it is applied to every sampled response.
    _BOXED_ANSWER_RE = re.compile(r"\\boxed\{(\d+)\}")

    def __init__(
        self,
        query_llm: Callable,
        temperature=0.7,
        num_samples: int = 5,
    ):
        """Store the LLM callable and the sampling configuration.

        Args:
            query_llm: Callable invoked as
                ``query_llm(prompt=..., system=..., temperature=...)`` and
                expected to return a ``(response, cost)`` pair.
            temperature: Sampling temperature forwarded to ``query_llm``.
            num_samples: How many responses to sample per problem before
                voting. Defaults to 5.
        """
        # Plain (non-f) string, so braces must NOT be doubled here: the text
        # is sent to the model verbatim.
        self.output_format_instructions = "On the final line output only the digits of the answer (0‑999). Provide your final answer enclosed in a LaTeX \\boxed{...} command."
        self.query_llm = query_llm
        self.temperature = temperature
        self.num_samples = num_samples

    def forward(self, problem: str) -> tuple[str, float]:
        """Sample several solutions and return one backing the majority answer.

        Args:
            problem: The problem statement to solve.

        Returns:
            A ``(response, total_cost)`` pair. ``response`` is the first
            sampled response whose extracted boxed answer equals the most
            common answer. If no response contains a boxed integer, the first
            response is returned (or ``""`` when nothing was sampled).
            ``total_cost`` is the sum of the costs reported by ``query_llm``.
        """
        system_prompt, task_prompt = self.get_prompt_for_task(problem)

        responses = []
        votes = Counter()
        # Maps each extracted answer to the first response that produced it,
        # so the winning answer can be traced back to a full response.
        first_response_for = {}
        total_cost = 0.0

        for _ in range(self.num_samples):
            response, cost = self.query_llm(
                prompt=task_prompt,
                system=system_prompt,
                temperature=self.temperature,
            )
            total_cost += cost
            responses.append(response)

            match = self._BOXED_ANSWER_RE.search(response)
            if match:
                answer = match.group(1)
                votes[answer] += 1
                first_response_for.setdefault(answer, response)

        if not votes:
            # Nothing parseable: fall back to the first raw response.
            return (responses[0] if responses else ""), total_cost

        # Ties are resolved in favour of the answer seen first, because
        # Counter.most_common orders equal counts by first insertion.
        majority_answer = votes.most_common(1)[0][0]
        return first_response_for[majority_answer], total_cost

    def get_prompt_for_task(self, problem: str) -> tuple[str, str]:
        """Build the ``(system_prompt, task_prompt)`` pair for ``problem``.

        The task prompt contains the problem, a step-by-step cue, and the
        output format instructions stored on the instance.
        """
        system_prompt = "You are a skilled mathematician who is an expert in thinking step-by-step to reach a solution."
        task_prompt = (
            f"Solve the following math problem: {problem}\n\n"
            f"Let's think step by step.\n\n"
            f"{self.output_format_instructions}"
        )
        return system_prompt, task_prompt





def run_experiment(**kwargs):
    """Evaluate ``Agent`` with a call-limited LLM and return the metrics.

    Required keyword arguments (read directly from ``kwargs``):
        model_name: Bound as ``model_name`` on ``utils.query_llm``.
        max_calls: Passed as ``max_calls`` to
            ``create_call_limited_query_llm``.
        year: Forwarded as ``year`` to ``agent_evaluation``.

    Returns:
        The 5-tuple ``(accuracy, cost_total, processed, num_llm_calls, df)``
        produced by ``agent_evaluation``.

    Raises:
        KeyError: If any of the required keyword arguments is missing.
    """
    # Imported lazily so that merely importing this module does not need
    # the ``utils`` module to be available.
    from utils import query_llm, create_call_limited_query_llm
    from functools import partial

    # Pin the model so downstream callers only supply per-request arguments.
    model_bound_llm = partial(query_llm, model_name=kwargs["model_name"])

    # Wrap the model-bound callable with the configured call limit.
    call_budget = kwargs["max_calls"]
    capped_llm = create_call_limited_query_llm(model_bound_llm, max_calls=call_budget)

    evaluation = agent_evaluation(Agent, capped_llm, year=kwargs["year"])

    # Unpack explicitly so an unexpected result shape fails here.
    accuracy, cost_total, processed, num_llm_calls, df = evaluation
    return accuracy, cost_total, processed, num_llm_calls, df
