{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3138cc71",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "import subprocess\n",
    "import tempfile\n",
    "import pathlib\n",
    "import logging\n",
    "import argparse\n",
    "import time\n",
    "from concurrent.futures import ProcessPoolExecutor, TimeoutError as FuturesTimeoutError\n",
    "from typing import List, Dict, Any, Tuple, Callable, Optional\n",
    "from datasets import load_dataset # For Hugging Face's MBPP\n",
    "# You'll need to clone the human-eval repository from OpenAI:\n",
    "# git clone https://github.com/openai/human-eval.git\n",
    "# And install it, or add its path to sys.path\n",
    "# For example:\n",
    "# import sys\n",
    "# sys.path.append(\"path/to/human-eval\")\n",
    "from human_eval.data import read_problems, write_jsonl, stream_jsonl\n",
    "from human_eval.execution import check_correctness, swallow_io, time_limit\n",
    "\n",
    "# --- Configuration ---\n",
    "logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')\n",
    "DEFAULT_TIMEOUT_SECONDS = 10 # Default timeout for code execution\n",
    "\n",
    "# --- Benchmark Loading ---\n",
    "\n",
    "def load_mbpp_dataset(split=\"test\", N=None) -> List[Dict[str, Any]]:\n",
    "    \"\"\"\n",
    "    Loads the MBPP (Mostly Basic Python Problems) dataset from Hugging Face.\n",
    "    Each problem includes: task_id, text (prompt), code (solution), test_list (assertions).\n",
    "    \"\"\"\n",
    "    logging.info(\"Loading MBPP dataset...\")\n",
    "    try:\n",
    "        # The 'google/mbpp' dataset on Hugging Face has multiple configurations.\n",
    "        # 'full' or 'sanitized' are common choices. Let's try 'full' first.\n",
    "        # Sometimes the direct 'google/mbpp' might have issues or specific subsets.\n",
    "        # 'mbpp' is another common identifier.\n",
    "        try:\n",
    "            dataset = load_dataset(\"mbpp\", \"full\", split=split, trust_remote_code=True)\n",
    "        except Exception:\n",
    "            logging.warning(\"Failed to load 'mbpp' with 'full' config, trying 'sanitized'...\")\n",
    "            try:\n",
    "                dataset = load_dataset(\"mbpp\", \"sanitized\", split=split, trust_remote_code=True)\n",
    "            except Exception as e_sanitized:\n",
    "                logging.warning(f\"Failed to load 'mbpp' with 'sanitized' config, trying 'google/mbpp' directly...\")\n",
    "                try:\n",
    "                     dataset = load_dataset(\"google/mbpp\", split=split, trust_remote_code=True) # Original dataset name\n",
    "                except Exception as e_google:\n",
    "                    logging.error(f\"Could not load MBPP dataset. Errors: {e_sanitized}, {e_google}\")\n",
    "                    raise\n",
    "\n",
    "        problems = [problem for problem in dataset]\n",
    "        if N is not None:\n",
    "            problems = problems[:N]\n",
    "        logging.info(f\"Loaded {len(problems)} problems from MBPP ({split} split).\")\n",
    "        # Standardize structure slightly for consistency if needed\n",
    "        # MBPP from HF already has 'text', 'test_list', 'code', 'task_id'\n",
    "        # Example problem:\n",
    "        # {'task_id': 1, 'text': 'Write a function to find the Nth Fibonacci number.',\n",
    "        #  'code': 'def fib(n):\\n  ...', 'test_list': ['assert fib(0) == 0', 'assert fib(1) == 1'],\n",
    "        #  'test_setup_code': '', 'challenge_test_list': []}\n",
    "        # We primarily need 'text' (as prompt), 'task_id', and 'test_list'.\n",
    "        # The prompt for the agent might just be 'text'.\n",
    "        formatted_problems = []\n",
    "        for i, prob in enumerate(problems):\n",
    "            formatted_problems.append({\n",
    "                \"task_id\": prob.get(\"task_id\", f\"mbpp/{i}\"),\n",
    "                \"prompt\": prob[\"text\"],\n",
    "                \"tests\": prob[\"test_list\"],\n",
    "                \"entry_point\": None, # MBPP often defines function name in prompt/solution\n",
    "                \"canonical_solution\": prob.get(\"code\")\n",
    "            })\n",
    "        return formatted_problems\n",
    "    except Exception as e:\n",
    "        logging.error(f\"Error loading MBPP dataset: {e}\")\n",
    "        return []\n",
    "\n",
    "def load_humaneval_dataset(N=None) -> List[Dict[str, Any]]:\n",
    "    \"\"\"\n",
    "    Loads the HumanEval dataset.\n",
    "    Problems are dictionaries with keys: task_id, prompt, entry_point, canonical_solution, test.\n",
    "    \"\"\"\n",
    "    logging.info(\"Loading HumanEval dataset...\")\n",
    "    try:\n",
    "        problems = read_problems() # Reads from human_eval/data/HumanEval.jsonl.gz\n",
    "        problem_list = [problems[task_id] for task_id in problems]\n",
    "        if N is not None:\n",
    "            problem_list = problem_list[:N]\n",
    "        logging.info(f\"Loaded {len(problem_list)} problems from HumanEval.\")\n",
    "        # HumanEval problems already have a good structure:\n",
    "        # 'task_id', 'prompt', 'entry_point', 'canonical_solution', 'test'\n",
    "        # We will reformat slightly for consistency with MBPP's loaded structure\n",
    "        formatted_problems = []\n",
    "        for prob in problem_list:\n",
    "            formatted_problems.append({\n",
    "                \"task_id\": prob[\"task_id\"],\n",
    "                \"prompt\": prob[\"prompt\"],\n",
    "                \"tests\": prob[\"test\"], # This is a string containing the test function\n",
    "                \"entry_point\": prob[\"entry_point\"],\n",
    "                \"canonical_solution\": prob[\"canonical_solution\"]\n",
    "            })\n",
    "        return formatted_problems\n",
    "    except FileNotFoundError:\n",
    "        logging.error(\"HumanEval dataset not found. Make sure you've cloned the openai/human-eval repository and it's accessible.\")\n",
    "        return []\n",
    "    except Exception as e:\n",
    "        logging.error(f\"Error loading HumanEval dataset: {e}\")\n",
    "        return []\n",
    "\n",
    "# --- Agent Interaction ---\n",
    "\n",
    "def generate_code_with_multi_agent_framework(problem_prompt: str, task_id: str, entry_point: Optional[str]) -> List[str]:\n",
    "    \"\"\"\n",
    "    Placeholder function to interact with your multi-agent framework.\n",
    "    This function needs to be implemented based on your specific framework's API.\n",
    "\n",
    "    Args:\n",
    "        problem_prompt (str): The problem description/prompt.\n",
    "        task_id (str): Unique ID for the task.\n",
    "        entry_point (Optional[str]): The required function name (especially for HumanEval).\n",
    "\n",
    "    Returns:\n",
    "        List[str]: A list of K generated code solutions.\n",
    "    \"\"\"\n",
    "    logging.info(f\"Task {task_id}: Requesting code generation from multi-agent framework...\")\n",
    "    # --- REPLACE THIS WITH YOUR AGENT FRAMEWORK CALL ---\n",
    "    # Example:\n",
    "    # solutions = my_agent_framework.generate(prompt=problem_prompt, num_solutions=3, entry_point=entry_point)\n",
    "    # For now, returning a dummy solution\n",
    "    dummy_solution = f\"# Solution for {task_id} with prompt: {problem_prompt[:50]}...\\n\"\n",
    "    if entry_point:\n",
    "        dummy_solution += f\"def {entry_point}():\\n    pass\\n\"\n",
    "    else: # For MBPP, function name is often in the prompt\n",
    "        # Try to extract function name if possible or let the agent figure it out\n",
    "        dummy_solution += \"def solve():\\n    pass\\n\"\n",
    "\n",
    "    # Simulate generating 3 solutions (for pass@3)\n",
    "    solutions = [\n",
    "        dummy_solution + f\"# Attempt 1\",\n",
    "        dummy_solution + f\"# Attempt 2 (potentially better)\",\n",
    "        dummy_solution + f\"# Attempt 3 (maybe incorrect)\"\n",
    "    ]\n",
    "    # --- END OF PLACEHOLDER ---\n",
    "    logging.info(f\"Task {task_id}: Received {len(solutions)} solutions from agent.\")\n",
    "    return solutions\n",
    "\n",
    "# --- Code Extraction and Sanitization ---\n",
    "\n",
    "def extract_python_code(agent_output: str) -> str:\n",
    "    \"\"\"\n",
    "    Extracts Python code from the agent's raw output.\n",
    "    Often, models output code within markdown blocks (```python ... ```).\n",
    "    \"\"\"\n",
    "    if \"```python\" in agent_output:\n",
    "        code = agent_output.split(\"```python\")[1].split(\"```\")[0]\n",
    "        return code.strip()\n",
    "    elif \"```\" in agent_output: # Generic markdown block\n",
    "        code = agent_output.split(\"```\")[1].split(\"```\")[0]\n",
    "        return code.strip()\n",
    "    # Add more sophisticated extraction logic if needed\n",
    "    return agent_output # Assume raw output is code if no markdown found\n",
    "\n",
    "# --- Evaluation Engine ---\n",
    "\n",
    "def _execute_code_sandboxed(code_to_run: str, timeout_seconds: int) -> Tuple[bool, str, Any, float]:\n",
    "    \"\"\"\n",
    "    Executes arbitrary Python code in a sandboxed environment (separate process).\n",
    "    Returns: (success, status_message, result, execution_time)\n",
    "             success: True if code ran and all assertions passed (for MBPP) or returned normally.\n",
    "             status_message: 'OK', 'Timeout', 'RuntimeError', 'AssertionError', etc.\n",
    "             result: The result of the execution (e.g., for HumanEval's check_correctness)\n",
    "                     or None if not applicable/error.\n",
    "             execution_time: Time taken for execution in seconds.\n",
    "    \"\"\"\n",
    "    start_time = time.time()\n",
    "    # Using a temporary file to write the code and execute it\n",
    "    with tempfile.NamedTemporaryFile(mode=\"w\", suffix=\".py\", delete=False) as tmpf:\n",
    "        tmpf.write(code_to_run)\n",
    "        tmp_file_name = tmpf.name\n",
    "\n",
    "    process = None\n",
    "    try:\n",
    "        # Execute the temporary file as a script\n",
    "        process = subprocess.Popen(\n",
    "            [sys.executable, tmp_file_name],\n",
    "            stdout=subprocess.PIPE,\n",
    "            stderr=subprocess.PIPE,\n",
    "            text=True\n",
    "        )\n",
    "        stdout, stderr = process.communicate(timeout=timeout_seconds)\n",
    "        exec_time = time.time() - start_time\n",
    "\n",
    "        if process.returncode == 0:\n",
    "            # For MBPP, success means no stderr and specific output if expected (usually just no errors)\n",
    "            # For HumanEval, this basic check is not enough; check_correctness is needed.\n",
    "            # Here, we assume if returncode is 0, the *script* ran without crashing.\n",
    "            # Assertions within the script will cause non-zero return if they raise unhandled exceptions.\n",
    "            return True, \"OK\", stdout, exec_time\n",
    "        else:\n",
    "            error_type = \"RuntimeError\" # Default\n",
    "            if \"AssertionError\" in stderr:\n",
    "                error_type = \"AssertionError\"\n",
    "            # You might want to parse stderr for more specific error types\n",
    "            return False, f\"{error_type}: {stderr[:200]}...\", None, exec_time\n",
    "\n",
    "    except subprocess.TimeoutExpired:\n",
    "        exec_time = time.time() - start_time\n",
    "        if process:\n",
    "            process.kill()\n",
    "            process.communicate() # Clean up\n",
    "        return False, \"Timeout\", None, exec_time\n",
    "    except Exception as e:\n",
    "        exec_time = time.time() - start_time\n",
    "        return False, f\"ExecutionSetupError: {str(e)}\", None, exec_time\n",
    "    finally:\n",
    "        if os.path.exists(tmp_file_name):\n",
    "            os.remove(tmp_file_name)\n",
    "\n",
    "\n",
    "def evaluate_mbpp_solution(task_id: str, generated_code: str, tests: List[str], timeout_seconds: int) -> Dict[str, Any]:\n",
    "    \"\"\"\n",
    "    Evaluates a generated solution for an MBPP problem.\n",
    "    Combines the generated code with test assertions and executes them.\n",
    "    \"\"\"\n",
    "    logging.debug(f\"Task {task_id} (MBPP): Evaluating solution.\")\n",
    "\n",
    "    # MBPP tests are usually a list of assert statements.\n",
    "    # The generated code should define the function(s) being tested.\n",
    "    # We need to ensure the function name in generated_code matches what tests expect.\n",
    "    # This can be tricky if the agent doesn't follow instructions perfectly.\n",
    "    # For simplicity, we assume the generated code defines the necessary functions.\n",
    "\n",
    "    full_code_to_run = generated_code + \"\\n\\n\" + \"\\n\".join(tests)\n",
    "\n",
    "    # It's crucial to run this in a sandboxed/restricted environment.\n",
    "    # Using multiprocessing.Process for better isolation than subprocess for complex cases,\n",
    "    # but subprocess is simpler for direct script execution.\n",
    "    # For now, using the _execute_code_sandboxed with subprocess.\n",
    "\n",
    "    success, status, _, exec_time = _execute_code_sandboxed(full_code_to_run, timeout_seconds)\n",
    "\n",
    "    return {\n",
    "        \"task_id\": task_id,\n",
    "        \"passed\": success,\n",
    "        \"status\": status,\n",
    "        \"execution_time\": exec_time\n",
    "    }\n",
    "\n",
    "\n",
    "def evaluate_humaneval_solution(task_id: str, prompt: str, generated_code_completion: str, test_code: str, entry_point: str, timeout_seconds: int) -> Dict[str, Any]:\n",
    "    \"\"\"\n",
    "    Evaluates a generated solution for a HumanEval problem using OpenAI's utilities.\n",
    "    \"\"\"\n",
    "    logging.debug(f\"Task {task_id} (HumanEval): Evaluating solution for entry point '{entry_point}'.\")\n",
    "\n",
    "    # The `human_eval` library expects a \"completion\" which is the body of the function.\n",
    "    # The `prompt` usually contains the function signature up to the docstring.\n",
    "    # The `generated_code_completion` should be what the LLM generates to complete the function.\n",
    "    # We need to ensure the `generated_code_completion` is just the completion, not the full function again.\n",
    "    # If your agent returns the full function, you might need to strip the signature.\n",
    "\n",
    "    # For this example, let's assume `generated_code_completion` is the full function code.\n",
    "    # The `check_correctness` function handles constructing the full program.\n",
    "    # It expects the `completion` to be the part that fills in the `prompt`.\n",
    "    # If `generated_code_completion` is the *entire* function, we might need to adjust.\n",
    "    # Let's assume `generated_code_completion` IS the full function code provided by the agent.\n",
    "    # The `check_correctness` function in `human_eval` is a bit intricate.\n",
    "    # It writes the solution to a file and runs it.\n",
    "\n",
    "    # A common way to use check_correctness:\n",
    "    # problem = {\"prompt\": prompt, \"test\": test_code, \"entry_point\": entry_point}\n",
    "    # completion_id = 0 # or some unique id for this completion\n",
    "    # temp_dir = tempfile.mkdtemp()\n",
    "    # sample = {\"task_id\": task_id, \"completion\": generated_code_completion} # `completion` is key\n",
    "    #\n",
    "    # try:\n",
    "    #     # This is a simplified path; check_correctness often used within a larger script.\n",
    "    #     # The `evaluate_functional_correctness` in human_eval.evaluation is the main entry point.\n",
    "    #     # It expects samples written to a .jsonl file.\n",
    "    #     # For a single evaluation, we can try to replicate parts of its logic or use a simpler exec.\n",
    "    #\n",
    "    #     # Simpler approach: construct the full code and run tests.\n",
    "    #     # The `test_code` in HumanEval is a string that defines a `check` function.\n",
    "    #     # e.g., test = f\"check({entry_point})\"\n",
    "    #     # The `prompt` contains the function signature.\n",
    "    #     # `generated_code_completion` is the agent's attempt.\n",
    "    #\n",
    "    #     # Full code: prompt + generated_code_completion + test_code\n",
    "    #     # The human_eval prompt often ends with \"```python\\n\" or similar,\n",
    "    #     # so the generated_code_completion should start with the function body.\n",
    "    #     # If your agent provides the full function def, you might need to adjust.\n",
    "    #\n",
    "    #     # Let's assume `generated_code_completion` IS the full function code.\n",
    "    #     # And `test_code` is the `check` function.\n",
    "\n",
    "    program_to_check = generated_code_completion + \"\\n\" + test_code\n",
    "    # The `check` function in HumanEval's test_code will call the `entry_point`.\n",
    "    # It typically raises an AssertionError if a test fails.\n",
    "\n",
    "    # Using a slightly more robust execution similar to human_eval's own logic\n",
    "    # This is a simplified version. The actual human_eval script is more robust.\n",
    "    start_time = time.time()\n",
    "    result_queue = multiprocessing.Queue()\n",
    "\n",
    "    def unsafe_execute():\n",
    "        with swallow_io(): # Suppress stdout/stderr from the executed code\n",
    "            try:\n",
    "                # The HumanEval test string is often `check(entry_point)` or similar.\n",
    "                # We need to make the generated function and entry_point available.\n",
    "                # `program_to_check` contains the generated function and the test harness.\n",
    "                # The test harness usually defines a `check` function and calls it.\n",
    "                exec_globals = {}\n",
    "                exec(program_to_check, exec_globals) # Execute the generated code + test harness\n",
    "                                                    # The `check` function should run here.\n",
    "                result_queue.put(None) # Signal success\n",
    "            except AssertionError:\n",
    "                result_queue.put(\"AssertionError\")\n",
    "            except Exception as e:\n",
    "                result_queue.put(e)\n",
    "\n",
    "\n",
    "    process = multiprocessing.Process(target=unsafe_execute)\n",
    "    process.start()\n",
    "    process.join(timeout=timeout_seconds)\n",
    "    exec_time = time.time() - start_time\n",
    "\n",
    "    if process.is_alive():\n",
    "        process.kill()\n",
    "        process.join()\n",
    "        return {\"task_id\": task_id, \"passed\": False, \"status\": \"Timeout\", \"execution_time\": exec_time}\n",
    "\n",
    "    if result_queue.empty(): # Should not happen if join completed unless timeout\n",
    "        return {\"task_id\": task_id, \"passed\": False, \"status\": \"UnknownExecutionError\", \"execution_time\": exec_time}\n",
    "\n",
    "    result = result_queue.get()\n",
    "\n",
    "    if result is None: # Means execution finished without exceptions caught by unsafe_execute\n",
    "        return {\"task_id\": task_id, \"passed\": True, \"status\": \"OK\", \"execution_time\": exec_time}\n",
    "    elif isinstance(result, str) and result == \"AssertionError\":\n",
    "        return {\"task_id\": task_id, \"passed\": False, \"status\": \"AssertionError\", \"execution_time\": exec_time}\n",
    "    else: # Some other exception\n",
    "        return {\"task_id\": task_id, \"passed\": False, \"status\": f\"RuntimeError: {type(result).__name__}\", \"execution_time\": exec_time}\n",
    "\n",
    "\n",
    "# --- Pass@k Calculation ---\n",
    "\n",
    "def calculate_pass_at_k(results_per_problem: Dict[str, List[Dict[str, Any]]], k: int) -> float:\n",
    "    \"\"\"\n",
    "    Calculates pass@k.\n",
    "    A problem is considered passed if at least one of the k generated solutions passes.\n",
    "\n",
    "    Args:\n",
    "        results_per_problem (Dict[str, List[Dict[str, Any]]]):\n",
    "            A dictionary where keys are task_ids and values are lists of evaluation results\n",
    "            for each of the k attempts for that task.\n",
    "            Each result dict should have a \"passed\": True/False key.\n",
    "        k (int): The number of solutions generated per problem.\n",
    "\n",
    "    Returns:\n",
    "        float: The pass@k score.\n",
    "    \"\"\"\n",
    "    if not results_per_problem:\n",
    "        return 0.0\n",
    "\n",
    "    total_problems = len(results_per_problem)\n",
    "    passed_problems_at_k = 0\n",
    "\n",
    "    for task_id, attempts in results_per_problem.items():\n",
    "        if len(attempts) == 0: # Should not happen if pipeline is correct\n",
    "            logging.warning(f\"No attempts found for task {task_id} in pass@k calculation.\")\n",
    "            continue\n",
    "        if len(attempts) > k:\n",
    "            logging.warning(f\"Task {task_id} has {len(attempts)} attempts, expected up to {k}. Using first {k}.\")\n",
    "        \n",
    "        # Check if any of the first k attempts passed\n",
    "        # (In case more than k were generated/evaluated for some reason)\n",
    "        if any(attempt[\"passed\"] for attempt in attempts[:k]):\n",
    "            passed_problems_at_k += 1\n",
    "\n",
    "    return passed_problems_at_k / total_problems if total_problems > 0 else 0.0\n",
    "\n",
    "\n",
    "# --- Main Pipeline Orchestration ---\n",
    "\n",
    "def run_evaluation_pipeline(\n",
    "    benchmark_name: str,\n",
    "    problems: List[Dict[str, Any]],\n",
    "    agent_interaction_func: Callable, # Takes prompt, task_id, entry_point -> returns List[str]\n",
    "    evaluation_func: Callable,       # Takes task_id, generated_code, tests/prompt, etc. -> returns Dict (eval result)\n",
    "    num_samples_per_problem: int = 1, # For pass@k, this is k\n",
    "    timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS,\n",
    "    max_workers: Optional[int] = None # For parallel execution\n",
    "    ):\n",
    "    \"\"\"\n",
    "    Runs the full evaluation pipeline for a given benchmark.\n",
    "    \"\"\"\n",
    "    logging.info(f\"Starting evaluation for {benchmark_name} with {num_samples_per_problem} samples per problem.\")\n",
    "    all_results_per_problem: Dict[str, List[Dict[str, Any]]] = {} # task_id -> list of attempt results\n",
    "    summary_results = [] # List of dicts, one per (task, attempt)\n",
    "\n",
    "    # Using ProcessPoolExecutor for parallel evaluation of problems or samples\n",
    "    # Note: If your agent_interaction_func is not thread-safe or process-safe,\n",
    "    # you might need to run that part sequentially or adapt.\n",
    "    # Evaluation (code execution) is often the bottleneck and good for parallelization.\n",
    "\n",
    "    # For simplicity, this example processes problems sequentially,\n",
    "    # but samples for a given problem could be generated/evaluated in parallel.\n",
    "    # A more advanced setup would use the executor for tasks *and* samples.\n",
    "\n",
    "    for i, problem in enumerate(problems):\n",
    "        task_id = problem[\"task_id\"]\n",
    "        prompt = problem[\"prompt\"]\n",
    "        tests = problem[\"tests\"] # For MBPP, list of asserts; for HumanEval, test harness string\n",
    "        entry_point = problem.get(\"entry_point\")\n",
    "\n",
    "        logging.info(f\"Processing {benchmark_name} problem {i+1}/{len(problems)}: {task_id}\")\n",
    "\n",
    "        # 1. Generate k code solutions using the multi-agent framework\n",
    "        try:\n",
    "            generated_solutions_raw = agent_interaction_func(prompt, task_id, entry_point)\n",
    "            if len(generated_solutions_raw) == 0:\n",
    "                logging.warning(f\"Agent returned no solutions for {task_id}.\")\n",
    "                all_results_per_problem[task_id] = []\n",
    "                continue\n",
    "            if len(generated_solutions_raw) < num_samples_per_problem:\n",
    "                logging.warning(f\"Agent returned {len(generated_solutions_raw)} solutions for {task_id}, less than requested {num_samples_per_problem}.\")\n",
    "            \n",
    "            # Ensure we only evaluate up to num_samples_per_problem\n",
    "            generated_solutions_raw = generated_solutions_raw[:num_samples_per_problem]\n",
    "\n",
    "        except Exception as e:\n",
    "            logging.error(f\"Error during agent interaction for {task_id}: {e}\")\n",
    "            all_results_per_problem[task_id] = [{\"task_id\": task_id, \"passed\": False, \"status\": f\"AgentError: {e}\", \"execution_time\": 0.0} for _ in range(num_samples_per_problem)]\n",
    "            continue\n",
    "\n",
    "        all_results_per_problem[task_id] = []\n",
    "\n",
    "        for attempt_idx, raw_solution in enumerate(generated_solutions_raw):\n",
    "            logging.info(f\"  Attempt {attempt_idx+1}/{len(generated_solutions_raw)} for {task_id}\")\n",
    "\n",
    "            # 2. Extract code\n",
    "            extracted_code = extract_python_code(raw_solution)\n",
    "            if not extracted_code.strip():\n",
    "                logging.warning(f\"    Empty code extracted for {task_id}, attempt {attempt_idx+1}.\")\n",
    "                eval_result = {\"task_id\": task_id, \"passed\": False, \"status\": \"EmptyCode\", \"execution_time\": 0.0}\n",
    "            else:\n",
    "                # 3. Evaluate\n",
    "                try:\n",
    "                    if benchmark_name.lower() == \"mbpp\":\n",
    "                        eval_result = evaluation_func(task_id, extracted_code, tests, timeout_seconds)\n",
    "                    elif benchmark_name.lower() == \"humaneval\":\n",
    "                        eval_result = evaluation_func(task_id, prompt, extracted_code, tests, entry_point, timeout_seconds)\n",
    "                    else:\n",
    "                        raise ValueError(f\"Unknown benchmark: {benchmark_name}\")\n",
    "                except Exception as e:\n",
    "                    logging.error(f\"    Error during evaluation for {task_id}, attempt {attempt_idx+1}: {e}\")\n",
    "                    eval_result = {\"task_id\": task_id, \"passed\": False, \"status\": f\"EvaluationError: {e}\", \"execution_time\": 0.0}\n",
    "\n",
    "            eval_result[\"attempt\"] = attempt_idx\n",
    "            eval_result[\"raw_solution\"] = raw_solution # Store for inspection\n",
    "            eval_result[\"extracted_code\"] = extracted_code # Store for inspection\n",
    "            all_results_per_problem[task_id].append(eval_result)\n",
    "            summary_results.append(eval_result)\n",
    "            logging.info(f\"    Result for {task_id}, attempt {attempt_idx+1}: {'Pass' if eval_result['passed'] else 'Fail'} ({eval_result['status']})\")\n",
    "\n",
    "    # 4. Calculate pass@k metrics\n",
    "    pass_at_k_scores = {}\n",
    "    for k_val in range(1, num_samples_per_problem + 1):\n",
    "        pass_at_k_scores[f\"pass@{k_val}\"] = calculate_pass_at_k(all_results_per_problem, k_val)\n",
    "        logging.info(f\"{benchmark_name} Pass@{k_val}: {pass_at_k_scores[f'pass@{k_val}']:.4f}\")\n",
    "\n",
    "    return summary_results, pass_at_k_scores, all_results_per_problem\n",
    "\n",
    "# --- Reporting ---\n",
    "\n",
    "def save_results(output_dir: str, benchmark_name: str, detailed_results: List[Dict], pass_at_k_scores: Dict, all_results_per_problem: Dict):\n",
    "    \"\"\"Saves evaluation results to JSON files.\"\"\"\n",
    "    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "    # Save detailed results (list of all attempts)\n",
    "    detailed_path = os.path.join(output_dir, f\"{benchmark_name}_detailed_results.jsonl\")\n",
    "    with open(detailed_path, \"w\") as f:\n",
    "        for result in detailed_results:\n",
    "            f.write(json.dumps(result) + \"\\n\")\n",
    "    logging.info(f\"Detailed results saved to {detailed_path}\")\n",
    "\n",
    "    # Save pass@k scores\n",
    "    pass_at_k_path = os.path.join(output_dir, f\"{benchmark_name}_pass_at_k.json\")\n",
    "    with open(pass_at_k_path, \"w\") as f:\n",
    "        json.dump(pass_at_k_scores, f, indent=4)\n",
    "    logging.info(f\"Pass@k scores saved to {pass_at_k_path}\")\n",
    "\n",
    "    # Save results grouped by problem (useful for pass@k calculation source)\n",
    "    all_results_path = os.path.join(output_dir, f\"{benchmark_name}_all_results_per_problem.json\")\n",
    "    with open(all_results_path, \"w\") as f:\n",
    "        json.dump(all_results_per_problem, f, indent=4)\n",
    "    logging.info(f\"All results per problem saved to {all_results_path}\")\n",
    "\n",
    "\n",
    "# --- Main Execution ---\n",
    "if __name__ == \"__main__\":\n",
    "    parser = argparse.ArgumentParser(description=\"Code Generation Evaluation Pipeline\")\n",
    "    parser.add_argument(\"--benchmark\", type=str, required=True, choices=[\"mbpp\", \"humaneval\", \"all\"], help=\"Benchmark to run.\")\n",
    "    parser.add_argument(\"--num_samples\", type=int, default=1, help=\"Number of samples to generate per problem (k for pass@k).\")\n",
    "    parser.add_argument(\"--timeout\", type=int, default=DEFAULT_TIMEOUT_SECONDS, help=\"Timeout for code execution in seconds.\")\n",
    "    parser.add_argument(\"--output_dir\", type=str, default=\"eval_results\", help=\"Directory to save results.\")\n",
    "    parser.add_argument(\"--max_problems\", type=int, default=None, help=\"Maximum number of problems to evaluate from each benchmark (for quick testing).\")\n",
    "    # Add any arguments needed for your specific multi-agent framework\n",
    "    # parser.add_argument(\"--agent_config\", type=str, help=\"Path to agent configuration file.\")\n",
    "\n",
    "    args = parser.parse_args()\n",
    "\n",
    "    # --- CONFIGURE YOUR AGENT INTERACTION FUNCTION ---\n",
    "    # This is where you'd initialize your multi-agent framework\n",
    "    # and pass its generation function to the pipeline.\n",
    "    # For example:\n",
    "    # my_agent = MyAgentFramework(config=args.agent_config)\n",
    "    # agent_func = my_agent.generate_solutions_for_prompt\n",
    "    # For this script, we use the placeholder:\n",
    "    \n",
    "    # This lambda makes `num_samples_per_problem` available to the agent interaction func\n",
    "    # if your agent needs to know how many solutions to generate.\n",
    "    # The pipeline itself will also limit evaluation to `num_samples_per_problem`.\n",
    "    agent_func_configured = lambda prompt, task_id, entry_point: generate_code_with_multi_agent_framework(\n",
    "        prompt, task_id, entry_point # The placeholder doesn't use num_samples, but your real one might\n",
    "    )\n",
    "    # -------------------------------------------------\n",
    "\n",
    "    benchmarks_to_run = []\n",
    "    if args.benchmark == \"all\" or args.benchmark == \"mbpp\":\n",
    "        benchmarks_to_run.append({\n",
    "            \"name\": \"mbpp\",\n",
    "            \"loader\": lambda: load_mbpp_dataset(N=args.max_problems),\n",
    "            \"evaluator\": evaluate_mbpp_solution\n",
    "        })\n",
    "    if args.benchmark == \"all\" or args.benchmark == \"humaneval\":\n",
    "        benchmarks_to_run.append({\n",
    "            \"name\": \"humaneval\",\n",
    "            \"loader\": lambda: load_humaneval_dataset(N=args.max_problems),\n",
    "            \"evaluator\": evaluate_humaneval_solution\n",
    "        })\n",
    "\n",
    "    for bench_config in benchmarks_to_run:\n",
    "        benchmark_name = bench_config[\"name\"]\n",
    "        logging.info(f\"--- Running Benchmark: {benchmark_name.upper()} ---\")\n",
    "\n",
    "        problems_data = bench_config[\"loader\"]()\n",
    "        if not problems_data:\n",
    "            logging.error(f\"No problems loaded for {benchmark_name}. Skipping.\")\n",
    "            continue\n",
    "\n",
    "        detailed_results, pass_at_k_scores, all_results_per_problem = run_evaluation_pipeline(\n",
    "            benchmark_name=benchmark_name,\n",
    "            problems=problems_data,\n",
    "            agent_interaction_func=agent_func_configured,\n",
    "            evaluation_func=bench_config[\"evaluator\"],\n",
    "            num_samples_per_problem=args.num_samples,\n",
    "            timeout_seconds=args.timeout\n",
    "        )\n",
    "\n",
    "        save_results(args.output_dir, benchmark_name, detailed_results, pass_at_k_scores, all_results_per_problem)\n",
    "\n",
    "        logging.info(f\"--- Finished Benchmark: {benchmark_name.upper()} ---\")\n",
    "        logging.info(f\"Final Pass@k Scores for {benchmark_name}: {pass_at_k_scores}\")\n",
    "\n",
    "    logging.info(\"Evaluation pipeline finished.\")\n",
    "\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "llm",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
