


















import logging
import re
from collections import Counter, defaultdict
from typing import Callable, List, Optional, Tuple, Dict

from math_eval import agent_evaluation



import re
from collections import Counter


class Agent:
    """Solve an AIME problem with a generate -> review -> synthesize pipeline.

    ``forward`` runs three stages, each through the injected ``query_llm``:

    1. ``num_experts`` independent solution attempts, one per persona.
    2. One critical review per attempt.
    3. A single synthesis call that sees every attempt and review.

    If the synthesis output carries no parsable ``\\boxed{N}`` answer, a
    majority vote over the reviews (then over the original attempts) is
    appended, and ``\\boxed{000}`` is used as the last resort.
    """

    # An integer of one to three digits inside \boxed{...}; whitespace padding
    # inside the braces is tolerated.
    _BOXED_ANSWER_RE = re.compile(r"\\boxed\{\s*(\d{1,3})\s*\}")

    _REVIEWER_SYSTEM_PROMPT = "You are a skeptical peer reviewer examining a proposed solution to an AIME problem. Your task is to be extremely critical. Do not accept any statement at face value. Re-read the original problem carefully. Check calculations. Scrutinize the logical flow. **Pattern Verification:** If the solution relies on a pattern, you MUST test it on several new examples. If you find an error, clearly explain the flaw and provide a corrected line of reasoning and a final corrected answer. If the solution is completely sound, state that and re-state the final answer."
    _SYNTHESIZER_SYSTEM_PROMPT = "You are a master mathematician and editor, synthesizing multiple reviewed solutions into one canonical, correct answer."

    _logger = logging.getLogger(__name__)

    def __init__(
        self,
        query_llm: Callable,
        temperature: float = 0.0,
    ):
        """Store the LLM callable and configure the pipeline.

        Args:
            query_llm: Callable invoked as
                ``query_llm(prompt=..., system=..., temperature=...)`` and
                expected to return a ``(response, cost)`` pair.
            temperature: Accepted for interface compatibility; the per-stage
                temperatures below are used instead.
        """
        self.query_llm = query_llm
        # Plain (non-f) string: braces must NOT be doubled here, otherwise the
        # model is literally told to write "\boxed{{...}}".
        self.output_format_instructions = "On the final line output only the digits of the answer (0-999). Provide your final answer enclosed in a LaTeX \\boxed{...} command."

        # Per-stage sampling temperatures.
        self.generation_temperature = 0.7
        self.review_temperature = 0.1
        self.synthesis_temperature = 0.0

        # Personas are used as the system prompt of each expert attempt.
        self.num_experts = 3
        self.expert_personas = [
            "You are a meticulous and cautious mathematician. Your guiding principle is 'slow and steady wins the race'. You solve problems by breaking them down into the smallest possible steps based on fundamental principles. You avoid leaps of logic and verify each step before proceeding.",
            "You are a brilliant and intuitive mathematician, known for finding elegant, non-obvious solutions. You look for symmetries, invariants, or a change of perspective that radically simplifies the problem. You trust your insights but explain them clearly.",
            "You are a mathematician with a strong background in computer science. You approach problems by trying to frame them algorithmically. You think in terms of states, transitions, and recurrence relations, and you analyze the behavior of these systems to find the solution.",
        ]

    def _extract_answer(self, text: str) -> Optional[str]:
        """Return the digits of the last ``\\boxed{N}`` in *text*, or ``None``.

        Only integers of one to three digits are recognised; empty or missing
        text yields ``None``.
        """
        if not text:
            return None
        found = self._BOXED_ANSWER_RE.findall(text)
        return found[-1] if found else None

    def _majority_answer(self, texts: List[str]) -> Optional[str]:
        """Return the most frequent boxed answer across *texts*, or ``None``.

        Ties resolve to the answer seen first (``Counter.most_common`` order).
        """
        votes = [
            answer
            for answer in map(self._extract_answer, texts)
            if answer is not None
        ]
        if not votes:
            return None
        return Counter(votes).most_common(1)[0][0]

    def _query(
        self, stage: str, prompt: str, system: str, temperature: float
    ) -> Tuple[Optional[str], float]:
        """Call the LLM once, best-effort.

        Returns ``(response, cost)`` on success. Any exception (including a
        malformed return value) is logged and reported as ``(None, 0.0)`` so
        the pipeline can continue with a placeholder.
        """
        try:
            response, cost = self.query_llm(
                prompt=prompt,
                system=system,
                temperature=temperature,
            )
            return response, float(cost)
        except Exception:
            # Deliberately broad: the pipeline is best-effort, but the failure
            # should be visible rather than silently swallowed.
            self._logger.warning("LLM call failed during %s", stage, exc_info=True)
            return None, 0.0

    def forward(self, problem: str) -> tuple[str, float]:
        """Run the full pipeline on *problem*.

        Args:
            problem: The problem statement inserted into every prompt.

        Returns:
            ``(final_response, total_cost)`` where ``final_response`` is the
            synthesis text (plus an appended fallback answer when the
            synthesis had no parsable boxed answer) and ``total_cost`` is the
            sum of the costs reported by the successful LLM calls.
        """
        total_cost = 0.0

        # Stage 1: independent attempts, one per expert persona.
        solutions = []
        for index in range(self.num_experts):
            persona = self.expert_personas[index % len(self.expert_personas)]
            prompt = f"Solve the following AIME problem by thinking step-by-step. {self.output_format_instructions}\n\nPROBLEM:\n{problem}\n\nSOLUTION:"
            response, cost = self._query(
                "expert generation", prompt, persona, self.generation_temperature
            )
            total_cost += cost
            if response is None:
                response = f"Expert {index + 1} failed to generate a solution."
            solutions.append(response)

        # Stage 2: one critical review per attempt (placeholders included).
        critiques = []
        for solution in solutions:
            prompt = f"Original Problem:\n{problem}\n\nProposed Solution to Review:\n{solution}\n\nYour Critical Review and Corrected Solution:"
            review, cost = self._query(
                "peer review",
                prompt,
                self._REVIEWER_SYSTEM_PROMPT,
                self.review_temperature,
            )
            total_cost += cost
            if review is None:
                review = "Reviewer failed to provide a critique."
            critiques.append(review)

        # Stage 3: synthesis over every (attempt, review) pair.
        synthesis_prompt_parts = [
            f"You are the Editor-in-Chief of a prestigious mathematics journal, responsible for publishing the final, canonical solution to this AIME problem. You have received {self.num_experts} independent attempts and their corresponding critical reviews. Your task is to produce the definitive solution.\n\nProblem:\n{problem}"
        ]
        synthesis_prompt_parts.extend(
            f"\n--- ATTEMPT {number} ---\nSolution: {solution}\nCritique: {critique}\n---"
            for number, (solution, critique) in enumerate(
                zip(solutions, critiques), start=1
            )
        )
        synthesis_prompt_parts.append(
            f"\nSYNTHESIS AND FINAL JUDGEMENT:\n1. First, briefly state the final numerical answer proposed by each of the reviewed attempts.\n2. Based on the critiques, determine which approach is the most reliable, or if all are flawed. Explain your reasoning.\n3. Construct the final, clear, step-by-step, correct solution. Leverage insights from the valid parts of the attempts and correct any identified errors. {self.output_format_instructions}"
        )

        synthesis, cost = self._query(
            "synthesis",
            "\n".join(synthesis_prompt_parts),
            self._SYNTHESIZER_SYSTEM_PROMPT,
            self.synthesis_temperature,
        )
        total_cost += cost
        # A failed (or empty) synthesis degrades to "" so a fallback can be appended.
        final_response = synthesis if synthesis is not None else ""

        # Fallback chain, used only when the synthesis has no parsable answer.
        if self._extract_answer(final_response) is None:
            reviewed_answer = self._majority_answer(critiques)
            if reviewed_answer is not None:
                # "{{" / "}}" are literal braces; the middle pair interpolates.
                final_response += f"\n\n[Fallback to Majority Vote on Reviewed Solutions]\n\\boxed{{{reviewed_answer}}}"
            else:
                original_answer = self._majority_answer(solutions)
                if original_answer is not None:
                    final_response += f"\n\n[Fallback to Majority Vote on Original Solutions]\n\\boxed{{{original_answer}}}"
                else:
                    final_response += "\n\n[Fallback] Could not determine a final answer from any stage.\n\\boxed{000}"

        return final_response, total_cost





def run_experiment(**kwargs):
    """Evaluate ``Agent`` through ``agent_evaluation`` and return its results.

    Required keyword arguments:
        model_name: Bound as ``model_name`` on the project's ``query_llm``.
        max_calls: Forwarded as ``max_calls`` to
            ``create_call_limited_query_llm`` (presumably a cap on LLM
            calls; confirm against ``utils``).
        year: Forwarded as ``year`` to ``agent_evaluation``.

    Returns:
        The 5-tuple ``(accuracy, cost_total, processed, num_llm_calls, df)``
        produced by ``agent_evaluation``.
    """
    # Imported lazily, so ``utils`` is only needed when an experiment runs.
    from utils import query_llm, create_call_limited_query_llm
    from functools import partial

    # Pin the model so downstream callers need not pass it.
    model_bound_query = partial(query_llm, model_name=kwargs["model_name"])

    # Wrap the model-bound callable with the call limiter.
    capped_query = create_call_limited_query_llm(
        model_bound_query, max_calls=kwargs["max_calls"]
    )

    outcome = agent_evaluation(Agent, capped_query, year=kwargs["year"])
    # Unpack explicitly so an unexpected result shape still fails here.
    accuracy, cost_total, processed, num_llm_calls, df = outcome
    return accuracy, cost_total, processed, num_llm_calls, df
