{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "512513cb",
   "metadata": {},
   "source": [
    "### LLM calling test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "9f4c2a0a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "from openai import OpenAI\n",
    "from concurrent.futures import ThreadPoolExecutor, as_completed\n",
    "\n",
    "class LLMModel():\n",
    "    # \"https://api.deepseek.com/v1\", \"https://api.openai.com/v1/\", \"https://dashscope.aliyuncs.com/compatible-mode/v1\"\n",
    "    def __init__(self, api_key, embedding_key=None, model=\"gpt-3.5-turbo\", base_url=\"https://api.openai.com/v1/\", use_web_search=False, enable_thinking=None):\n",
    "        self.api_key = api_key\n",
    "        self.model = model\n",
    "        self.client = OpenAI(api_key=self.api_key, base_url=base_url)\n",
    "        if embedding_key is None:\n",
    "            embedding_key = api_key\n",
    "        self.embedding_client = OpenAI(api_key=embedding_key, base_url=\"https://api.openai.com/v1/\")\n",
    "        self.use_web_search = use_web_search\n",
    "        self.enable_thinking = enable_thinking\n",
    "    \n",
    "    def LLM_response(self, prompt, gen_kwargs={}, model=None, full_response=False, enable_thinking=None):\n",
    "        if model is None:\n",
    "            model = self.model\n",
    "        if enable_thinking is None:\n",
    "            enable_thinking = self.enable_thinking\n",
    "\n",
    "        if type(prompt) == str:\n",
    "            input_messages = [\n",
    "                {\"role\": \"user\", \"content\": prompt}\n",
    "                ]\n",
    "        elif type(prompt) == list:\n",
    "            input_messages = prompt\n",
    "        else:\n",
    "            print(\"prompt must be a string or a list of messages, current type: \", type(prompt))\n",
    "            raise ValueError(\"prompt must be a string or a list of messages\")\n",
    "        \n",
    "        if self.use_web_search:\n",
    "            model = \"gpt-4o-search-preview\"\n",
    "            gen_kwargs[\"web_search_options\"] = {}\n",
    "\n",
    "        if self.enable_thinking is not None:\n",
    "            if enable_thinking == True:\n",
    "                gen_kwargs[\"extra_body\"] = {\"enable_thinking\": True}\n",
    "            elif enable_thinking == False:\n",
    "                gen_kwargs[\"extra_body\"] = {\"enable_thinking\": False}\n",
    "\n",
    "        # print(model)\n",
    "        # print(\"input_messages: \", input_messages)\n",
    "        # print(gen_kwargs)\n",
    "\n",
    "        completion = self.client.chat.completions.create(\n",
    "            model=model,\n",
    "            messages=input_messages,\n",
    "            **gen_kwargs\n",
    "            )\n",
    "\n",
    "        if full_response:\n",
    "            return completion\n",
    "        \n",
    "        return completion.choices[0].message.content\n",
    "\n",
    "    def LLM_response_async(self, prompts, gen_kwargs={}, model=None, max_workers=20, full_response=False, max_retries=None, current_retry=0):\n",
    "        if model is None:\n",
    "            model = self.model\n",
    "\n",
    "        all_success = True\n",
    "        \n",
    "        # check if prompts is a list of strings or list of list of messages\n",
    "        if not isinstance(prompts, list) or not all(isinstance(p, (str, list)) for p in prompts):\n",
    "            raise ValueError(\"prompts must be a list of strings or a list of list of messages\")\n",
    "\n",
    "        results = [None] * len(prompts)\n",
    "        \n",
    "        future_to_index = {}\n",
    "\n",
    "        with ThreadPoolExecutor(max_workers=max_workers) as executor:\n",
    "            for index, prompt in enumerate(prompts):\n",
    "                future = executor.submit(\n",
    "                    self.LLM_response,\n",
    "                    prompt=prompt,\n",
    "                    gen_kwargs=gen_kwargs,\n",
    "                    model=model,\n",
    "                    full_response=full_response\n",
    "                )\n",
    "                future_to_index[future] = index\n",
    "\n",
    "            for future in as_completed(future_to_index):\n",
    "                original_index = future_to_index[future] # get the original index of the prompt\n",
    "                try:\n",
    "                    data = future.result()\n",
    "                    results[original_index] = (True, data, prompts[original_index])\n",
    "                except Exception as exc:\n",
    "                    print(f'Prompt at index {original_index} generated an exception: {exc}')\n",
    "                    results[original_index] = (False, exc if full_response else f\"Error: {str(exc)}\", prompts[original_index])\n",
    "                    all_success = False\n",
    "\n",
    "        if isinstance(max_retries, int) and max_retries > 0 and not all_success and current_retry < max_retries:\n",
    "            print(f\"Retry attempt {current_retry + 1} of {max_retries}\")\n",
    "            # Collect prompts that failed\n",
    "            retry_items_index = [i for i, (success, _, _) in enumerate(results) if not success]\n",
    "            retry_prompts = [prompts[i] for i in retry_items_index]\n",
    "\n",
    "            # Recursive call for retries\n",
    "            retry_results, retry_all_success = self.LLM_response_async(\n",
    "                prompts=retry_prompts,\n",
    "                gen_kwargs=gen_kwargs,\n",
    "                model=model,\n",
    "                max_workers=max_workers,\n",
    "                full_response=full_response,\n",
    "                max_retries=max_retries,\n",
    "                current_retry=current_retry + 1\n",
    "            )\n",
    "\n",
    "            # Update the original results with retry results\n",
    "            for idx, original_idx in enumerate(retry_items_index):\n",
    "                results[original_idx] = retry_results[idx]\n",
    "\n",
    "            all_success = retry_all_success\n",
    "\n",
    "        return results, all_success\n",
    "    \n",
    "    def Embedding_response(self, input_texts, model=\"text-embedding-3-large\"):\n",
    "        if type(input_texts) == str:\n",
    "            input_texts = [input_texts]\n",
    "        elif type(input_texts) == list:\n",
    "            if not all(isinstance(text, str) for text in input_texts):\n",
    "                raise ValueError(\"All items in input_texts list must be strings\")\n",
    "        else:\n",
    "            print(\"input_texts must be a string or a list of strings, current type: \", type(input_texts))\n",
    "            raise ValueError(\"input_texts must be a string or a list of strings\")\n",
    "        \n",
    "        embedding_client = self.embedding_client\n",
    "\n",
    "        response = embedding_client.embeddings.create(\n",
    "            model=model,\n",
    "            input=input_texts\n",
    "        )\n",
    "\n",
    "        embeddings = [data.embedding for data in response.data]\n",
    "\n",
    "        return embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c3ce62b4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Einstein's theory of relativity actually consists of two related theories that changed how we understand space, time, and gravity.\n",
      "\n",
      "## Special Relativity (1905)\n",
      "This theory is based on two key ideas:\n",
      "- **Nothing travels faster than light** - Light speed is the universal speed limit\n",
      "- **The laws of physics are the same everywhere** - Whether you're standing still or moving at constant speed\n",
      "\n",
      "The surprising consequences:\n",
      "- **Time slows down** when you move very fast (time dilation)\n",
      "- **Objects get shorter** in the direction they're moving (length contraction)\n",
      "- **Time and space are connected** as \"spacetime\"\n",
      "- **Mass and energy are related**: E=mc²\n",
      "\n",
      "## General Relativity (1915)\n",
      "This expanded the theory to include gravity:\n",
      "- **Gravity isn't a force** pulling objects together\n",
      "- Instead, **massive objects bend spacetime** itself\n",
      "- Objects follow the curved paths through this bent spacetime\n",
      "- Think of a bowling ball placed on a stretched rubber sheet - it creates a dip that causes marbles to roll toward it\n",
      "\n",
      "## Real-world examples:\n",
      "- GPS satellites must account for time running slightly faster in space\n",
      "- Particle accelerators confirm that objects get more massive as they speed up\n",
      "- We've observed the bending of light around massive stars\n",
      "- Gravitational waves (ripples in spacetime) have been detected\n",
      "\n",
      "The theory essentially tells us that space and time aren't fixed - they're flexible and interconnected, shaped by matter and energy.\n"
     ]
    }
   ],
   "source": [
    "from openai import OpenAI\n",
    "\n",
    "# One-off request sent straight through the OpenAI client (bypasses LLMModel).\n",
    "# NOTE(review): `key` is None here - presumably the client then falls back to an\n",
    "# environment variable; confirm before running on a fresh kernel.\n",
    "key = None\n",
    "model = \"claude-sonnet-4-20250514\"\n",
    "prompt = \"Explain the theory of relativity in simple terms.\"\n",
    "temperature = 0.7\n",
    "\n",
    "client = OpenAI(api_key=key, base_url=\"http://35.220.164.252:3888/v1/\")\n",
    "\n",
    "# System instruction followed by the user's question.\n",
    "chat_messages = [\n",
    "    {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
    "    {\"role\": \"user\", \"content\": prompt},\n",
    "]\n",
    "completion = client.chat.completions.create(\n",
    "    model=model,\n",
    "    temperature=temperature,\n",
    "    messages=chat_messages,\n",
    ")\n",
    "\n",
    "first_choice = completion.choices[0]\n",
    "print(first_choice.message.content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "4b37254d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "LLM response:  Hello! How are you doing today? Is there anything I can help you with?\n"
     ]
    }
   ],
   "source": [
    "# Smoke test: send one short prompt through LLM_response and print the reply.\n",
    "# NOTE(review): `llm_model` is not created in any cell shown here - confirm where it is defined.\n",
    "test_prompt = \"Hi\"\n",
    "llm_response = llm_model.LLM_response(test_prompt)\n",
    "print(\"LLM response: \", llm_response)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "cbb22335",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "LLM response:  Hello! How can I assist you today?\n"
     ]
    }
   ],
   "source": [
    "# Same smoke test as the cell with id 4b37254d: one short prompt through LLM_response.\n",
    "# NOTE(review): `llm_model` is not created in any cell shown here - confirm where it is defined.\n",
    "test_prompt = \"Hi\"\n",
    "llm_response = llm_model.LLM_response(test_prompt)\n",
    "print(\"LLM response: \", llm_response)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "64d4c38a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Embedding 0 shape:  3072\n",
      "Embedding 1 shape:  3072\n"
     ]
    }
   ],
   "source": [
    "# Smoke test for Embedding_response: embed two strings and print each vector's length.\n",
    "# NOTE(review): `llm_model` is not created in any cell shown here - confirm where it is defined.\n",
    "embedding_test = [\"Hello, world!\", \"Another text to embed.\"]\n",
    "embeddings = llm_model.Embedding_response(embedding_test)\n",
    "for i, emb in enumerate(embeddings):\n",
    "    print(f\"Embedding {i} shape: \", len(emb))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "0f610808",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "LLM response:  你好！有什么我可以帮助你的吗？😊\n"
     ]
    }
   ],
   "source": [
    "# Smoke test with a Chinese greeting against a second model instance.\n",
    "# NOTE(review): `llm_model_2` is not created in any cell shown here - confirm where it is defined.\n",
    "test_prompt = \"你好\"\n",
    "llm_response_2 = llm_model_2.LLM_response(test_prompt)\n",
    "print(\"LLM response: \", llm_response_2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "e9083721",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Response 0: Hello! How can I assist you today?\n",
      "Response 1: Hello! How can I assist you today?\n",
      "Response 2: I'm just a computer program, so I don't have feelings or emotions like humans do. I'm here to help answer any questions or assist with any tasks you may have. How can I assist you today?\n",
      "Response 3: I am an AI digital assistant and I do not have a personal name. You can simply refer to me as Assistant. How can I assist you today?\n"
     ]
    }
   ],
   "source": [
    "# Smoke test for LLM_response_async: four prompts on a pool of 5 worker threads.\n",
    "# Each result is a (success, payload, prompt) tuple, so resp[1] is the payload.\n",
    "# NOTE(review): `llm_model` is not created in any cell shown here - confirm where it is defined.\n",
    "test_prompts_async = [\"Hi\", \"Hello\", \"How are you?\", \"What is your name?\"]\n",
    "llm_responses_async, all_success = llm_model.LLM_response_async(test_prompts_async, max_workers=5)\n",
    "for i, resp in enumerate(llm_responses_async):\n",
    "    print(f\"Response {i}: {resp[1]}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "e4aef58e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "True\n",
      "[(True, 'Hello! How can I assist you today?', 'Hi'), (True, 'Hello! How can I assist you today?', 'Hello'), (True, \"Thank you for asking! I'm just a virtual assistant, so I don't have feelings or emotions, but I'm here and ready to assist you with anything you need. How can I help you today?\", 'How are you?'), (True, 'I am an AI assistant and I do not have a personal name. You can just call me Assistant. How can I help you today?', 'What is your name?')]\n"
     ]
    }
   ],
   "source": [
    "# Inspect the raw return values of the LLM_response_async call (cell id e9083721):\n",
    "# the overall success flag, then the full list of (success, payload, prompt) tuples.\n",
    "print(all_success)\n",
    "print(llm_responses_async)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "304b3014",
   "metadata": {},
   "source": [
    "### LLM_TM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "7ecddeae",
   "metadata": {},
   "outputs": [],
   "source": [
    "class LLMTM():\n",
    "    def __init__(self, LLM_model):\n",
    "        self.LLM_model = LLM_model\n",
    "    \n",
    "    def task_knowledge_refinement(self, task_description, web_search=False):\n",
    "        # 1. Create the prompt for knowledge refinement \n",
    "        prompt = f\"\"\"\n",
    "You are a world-class subject matter expert with deep knowledge related to the following task.\n",
    "Your goal is to provide essential background knowledge, expert insights, and context that will help someone better understand and execute this task.\n",
    "\n",
    "Please analyze the task description below and generate a concise, informative expansion of the key concepts.\n",
    "Do NOT repeat the original task description in your output. Focus only on adding new, relevant expert knowledge or detailed explanations for the key concepts.\n",
    "\n",
    "**Original Task Description:**\n",
    "---\n",
    "{task_description}\n",
    "---\n",
    "\n",
    "**Your Expert Knowledge Expansion:**\n",
    "\"\"\"\n",
    "\n",
    "        # 2. Use LLM to generate output\n",
    "        additional_knowledge = self.LLM_model.LLM_response(prompt)\n",
    "        extracted_knowledge = additional_knowledge.strip()\n",
    "\n",
    "        # 3. Combine the original task description and the additional knowledge\n",
    "        # We format the output clearly using Markdown for better readability.\n",
    "        refined_task_description = (\n",
    "            \"\\n\\n## Task Description\\n\\n\"\n",
    "            f\"{task_description}\"\n",
    "            \"\\n\\n\"\n",
    "            \"## Expert Knowledge & Context\\n\\n\"\n",
    "            f\"{extracted_knowledge}\"\n",
    "        )\n",
    "\n",
    "        # 4. Return the refined task description\n",
    "        return refined_task_description\n",
    "\n",
    "    def get_plan(self, task_description, num_plans):\n",
    "        print(f\"INFO: Generating {num_plans} initial plans...\")\n",
    "        plans = {f\"plan_{i}\": f\"Initial plan content {i}\" for i in range(num_plans)}\n",
    "        return plans, plans # Returning raw plans as well\n",
    "    \n",
    "    def get_plan(self, task_description, num_plans=1):\n",
    "        if num_plans < 1:\n",
    "            return []\n",
    "\n",
    "        # 1. Generate the planning prompt from the helper function\n",
    "        plan_prompt = self.get_planning_prompt(task_description)\n",
    "\n",
    "        # 2. Create a list of prompts to generate multiple plans in parallel\n",
    "        prompts_list = [plan_prompt] * num_plans\n",
    "\n",
    "        # 3. Call the asynchronous LLM function to get plans efficiently\n",
    "        results, all_successful = self.LLM_model.LLM_response_async(prompts=prompts_list)\n",
    "\n",
    "        if not all_successful:\n",
    "            print(\"Warning: some plan generation requests failed.\")\n",
    "\n",
    "        # 4. Collect and return the successfully generated plans\n",
    "        final_plans = [response for success, response, _ in results if success]\n",
    "\n",
    "        return final_plans\n",
    "    \n",
    "    def summarize_plan(self, plans, single_plan=True):\n",
    "        if isinstance(plans, str):\n",
    "            plans = [plans]\n",
    "\n",
    "        if single_plan and len(plans) > 1:\n",
    "            plans = [plans[0]]\n",
    "\n",
    "        # 1. Generate the detailed prompt using the helper function\n",
    "        prompt = self.plan_summarize_prompt(plans)\n",
    "\n",
    "        # 2. Call the LLM to get the summarization.\n",
    "        try:\n",
    "            summarized_plan = self.LLM_model.LLM_response(prompt)\n",
    "            return summarized_plan.strip()\n",
    "        except Exception as e:\n",
    "            # Handle potential exceptions during the LLM call\n",
    "            print(f\"An error occurred during LLM call for plan summarization: {e}\")\n",
    "            return \"\"\n",
    "\n",
    "    def plan_refinement_loop(self, plan, max_refine_rounds=3):\n",
    "        current_plan = \"\"\n",
    "\n",
    "        # --- Step 1: Handle Input Type and Select a Single Plan ---\n",
    "        if isinstance(plan, list):\n",
    "            if not plan:\n",
    "                print(\"Error: The provided plan list is empty.\")\n",
    "                return \"\"\n",
    "            \n",
    "            # If only one plan is in the list, select it automatically\n",
    "            if len(plan) == 1:\n",
    "                current_plan = plan[0]\n",
    "            else:\n",
    "                # Translate all plans in the list for user review\n",
    "                translation_prompts = [self.get_translation_prompt(p) for p in plan]\n",
    "                results, _ = self.LLM_model.LLM_response_async(prompts=translation_prompts)\n",
    "\n",
    "                # Correlate successful translations with their original technical plans\n",
    "                valid_options = [(plan[i], resp) for i, (succ, resp, _) in enumerate(results) if succ]\n",
    "\n",
    "                if not valid_options:\n",
    "                    print(\"Error: Failed to translate any of the provided plans.\")\n",
    "                    return \"\"\n",
    "\n",
    "                # Present the translated plans to the user for selection\n",
    "                print(\"\\nPlease select one of the following plans to proceed with:\")\n",
    "                for i, (_, readable_plan) in enumerate(valid_options):\n",
    "                    print(f\"\\n--- [Option {i + 1}] ---\")\n",
    "                    print(readable_plan)\n",
    "                    print(\"--------------------\")\n",
    "\n",
    "                # Get and validate user's choice\n",
    "                choice = -1\n",
    "                while True:\n",
    "                    try:\n",
    "                        user_input = input(f\"\\nEnter the number of the plan you want to refine (1-{len(valid_options)}): \")\n",
    "                        choice = int(user_input)\n",
    "                        if 1 <= choice <= len(valid_options):\n",
    "                            break\n",
    "                        else:\n",
    "                            print(f\"Invalid selection. Please enter a number between 1 and {len(valid_options)}.\")\n",
    "                    except ValueError:\n",
    "                        print(\"Invalid input. Please enter a number.\")\n",
    "                \n",
    "                # Set the chosen technical plan as the one to be refined\n",
    "                current_plan = valid_options[choice - 1][0]\n",
    "\n",
    "        elif isinstance(plan, str):\n",
    "            current_plan = plan\n",
    "        else:\n",
    "            raise TypeError(\"The 'plan' argument must be a string or a list of strings.\")\n",
    "\n",
    "        # --- Step 2: Begin the Refinement Loop on the Selected Plan ---\n",
    "        for i in range(max_refine_rounds):\n",
    "            print(f\"\\n--- [Refinement Round {i + 1}/{max_refine_rounds}] ---\")\n",
    "\n",
    "            # Translate the current technical plan into a readable format\n",
    "            translation_prompt = self.get_translation_prompt(current_plan)\n",
    "            readable_plan = self.LLM_model.LLM_response(translation_prompt)\n",
    "\n",
    "            # Show the readable plan to the user\n",
    "            print(\"\\nHere is the current plan for your review:\")\n",
    "            print(\"-----------------------------------------\")\n",
    "            print(readable_plan)\n",
    "            print(\"-----------------------------------------\")\n",
    "\n",
    "            # Ask if the user wants to refine it further\n",
    "            user_input = input(\"Are you satisfied with this plan, or would you like to refine it? (yes/no refine): \").lower().strip()\n",
    "\n",
    "            if user_input.startswith('y'):\n",
    "                print(\"\\nPlan approved. Finalizing the process.\")\n",
    "                return current_plan # Return the approved technical plan\n",
    "\n",
    "            # Get user feedback for refinement\n",
    "            feedback = input(\"\\nPlease provide your feedback for refinement:\\n> \")\n",
    "\n",
    "            # Refine the plan based on the feedback\n",
    "            print(\"\\nRefining the plan based on your feedback...\")\n",
    "            refinement_prompt = self.get_refinement_prompt(\n",
    "                original_plan=current_plan,\n",
    "                user_feedback=feedback\n",
    "            )\n",
    "            refined_plan = self.LLM_model.LLM_response(refinement_prompt)\n",
    "\n",
    "            # The refined plan becomes the new \"current\" plan for the next loop\n",
    "            current_plan = refined_plan\n",
    "\n",
    "        print(f\"\\nMaximum refinement rounds ({max_refine_rounds}) reached. Using the last generated plan.\")\n",
    "        return current_plan\n",
    "\n",
    "    def get_planning_prompt(self, task_description):\n",
    "        prompt = f\"\"\"\n",
    "You are an expert software architect and planning agent. Your primary role is to analyze a user's coding request and generate a comprehensive, step-by-step implementation plan. This plan will be used as a blueprint by a separate code generation agent to write the actual code.\n",
    "\n",
    "Your plan must be clear, logical, and detailed enough for the code agent to understand and implement without further clarification.\n",
    "\n",
    "**User's Task Description:**\n",
    "\"{task_description}\"\n",
    "\n",
    "---\n",
    "\n",
    "Please generate the implementation plan by structuring your response into the following three sections:\n",
    "\n",
    "### 1. Overall Task Definition\n",
    "Summarize the core goal of the task. Clearly and precisely define the expected **input format** for the entire program and the **final output format** that the program must produce. This ensures the final solution can be tested correctly.\n",
    "\n",
    "### 2. Component Breakdown\n",
    "Break the problem down into logical, self-contained components (e.g., functions, classes). For each component, you must provide:\n",
    "- **Name:** A descriptive name for the component (e.g., `parse_user_data`).\n",
    "- **Purpose:** A brief explanation of what this component does.\n",
    "- **Inputs:** A clear description of the arguments it takes, including their expected data types.\n",
    "- **Outputs:** A clear description of what it returns, including its data type.\n",
    "\n",
    "### 3. Main Function (Dataflow)\n",
    "Describe the logic and workflow for the `main` function. This function will serve as the entry point for the entire script and will be used for testing and evaluation. Your description should detail the dataflow:\n",
    "- How the initial input is received by the `main` function.\n",
    "- The sequence in which the components defined in Part 2 are called.\n",
    "- How the output of one component becomes the input for another.\n",
    "- How the final result is assembled and returned, matching the format specified in the \"Overall Task Definition\".\"\"\"\n",
    "        return prompt\n",
    "\n",
    "    def get_refinement_prompt(self, original_plan, user_feedback):\n",
    "        prompt = f\"\"\"\n",
    "You are an expert software architect. Your task is to refine an existing implementation plan based on user feedback. You must generate a new, complete plan that incorporates the requested changes while maintaining the original structure.\n",
    "\n",
    "**Original Plan:**\n",
    "---\n",
    "{original_plan}\n",
    "---\n",
    "\n",
    "**User's Feedback for Refinement:**\n",
    "---\n",
    "{user_feedback}\n",
    "---\n",
    "\n",
    "Please generate the **full, updated plan** that addresses the user's feedback. The new plan must be a complete replacement for the original and follow the same three-part format:\n",
    "\n",
    "### 1. Overall Task Definition\n",
    "(Update the input/output definitions if the feedback requires it.)\n",
    "\n",
    "### 2. Component Breakdown\n",
    "(Modify, add, or remove components based on the feedback.)\n",
    "\n",
    "### 3. Main Function (Dataflow)\n",
    "(Update the workflow to reflect the changes in the components and logic.)\n",
    "\"\"\"\n",
    "        return prompt\n",
    "\n",
    "    def get_translation_prompt(self, technical_plan):\n",
    "        prompt = f\"\"\"\n",
    "You are an expert communicator specializing in translating technical software plans into plain language for non-technical clients.\n",
    "\n",
    "The user is an expert in their subject matter but has no coding experience. Your goal is to explain the proposed software's workflow clearly and concisely.\n",
    "\n",
    "**Instructions:**\n",
    "1.  Read the provided technical plan carefully.\n",
    "2.  Do NOT use programming jargon (e.g., 'function', 'class', 'API', 'data type', 'return value').\n",
    "3.  Instead, use analogies: a 'component' or 'function' can be a 'processing step' or a 'tool'; 'input' is 'information needed'; 'output' is the 'result produced'.\n",
    "4.  Structure your explanation in a clear, step-by-step format that is easy to follow.\n",
    "\n",
    "**Technical Plan to Translate:**\n",
    "---\n",
    "{technical_plan}\n",
    "---\n",
    "\n",
    "**Generate the user-friendly explanation below, focusing on:**\n",
    "1.  **Overall Goal:** A brief, one-sentence summary of what the program will accomplish.\n",
    "2.  **Required Information:** What information the program needs to start its work.\n",
    "3.  **The Process (Step-by-Step):** Describe each component from the plan as a distinct step in a workflow. For each step, explain its purpose, what information it uses, and what result it produces.\n",
    "4.  **Final Result:** Describe the final output the user will receive from the program.\n",
    "\"\"\"\n",
    "        return prompt\n",
    "\n",
    "    def generate_new_plan(self, best_plan, task_desc, codes, tests):\n",
    "        print(f\"INFO: Generating new plans based on the current best: {best_plan}\")\n",
    "        return {f\"{best_plan}_v2\": \"A new, improved plan version\"}\n",
    "    \n",
    "    def plan_summarize_prompt(self, plans):\n",
    "        # Ensure plans is a list for consistent processing\n",
    "        if isinstance(plans, str):\n",
    "            plans = [plans]\n",
    "\n",
    "        # Format the input plans for inclusion in the prompt\n",
    "        formatted_plans = \"\"\n",
    "        if len(plans) == 1:\n",
    "            formatted_plans = f\"## Plan to be Summarized:\\n\\n{plans[0]}\"\n",
    "        else:\n",
    "            # If multiple plans, number them for clarity\n",
    "            for i, plan in enumerate(plans):\n",
    "                formatted_plans += f\"## Plan {i+1}:\\n\\n{plan}\\n\\n---\\n\\n\"\n",
    "\n",
    "        # The core prompt template instructing the LLM on its task\n",
    "        prompt = f\"\"\"\n",
    "You are an expert technical analyst. Your task is to read and synthesize the following software development plan(s). Your output will be a single, consolidated summary that will be used by another AI agent to generate comprehensive test cases.\n",
    "\n",
    "The summary must be **neutral and objective**. If multiple plans are provided, combine their requirements without showing preference for any single implementation strategy. Focus strictly on the **\"what\" (requirements)**, not the \"how\" (implementation details).\n",
    "\n",
    "From the provided plan(s), extract and structure your summary with the following sections:\n",
    "\n",
    "1.  **Main Task:** A concise, one-sentence description of the overall goal.\n",
    "2.  **Input Specification:** Detail all expected inputs. For each input, specify its name, data type (e.g., string, integer, list of dictionaries), format (e.g., JSON object, CSV string), and any known constraints or validation rules (e.g., \"integer must be positive\", \"string cannot be empty\").\n",
    "3.  **Output Specification:** Detail the expected output. Specify its data type, format, and any conditions it must satisfy (e.g., \"returns a dictionary with 'status' and 'data' keys\", \"returns -1 on error\").\n",
    "4.  **Core Functionalities & Logic:** List the key behaviors, logical steps, or calculations the code must perform. This helps identify different scenarios to test.\n",
    "5.  **Constraints & Edge Cases:** Explicitly list any system limitations, assumptions, or specific edge cases mentioned in the plans (e.g., handling of null/empty inputs, maximum file size, error handling conditions).\n",
    "\n",
    "Do **not** suggest how to write test cases. Only provide the factual specifications derived from the plan(s) below.\n",
    "\n",
    "---\n",
    "{formatted_plans}\n",
    "---\n",
    "\n",
    "Provide the structured summary now.\n",
    "\"\"\"\n",
    "        return prompt.strip()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f70f07dc",
   "metadata": {},
   "source": [
    "### LLM_CG"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c2926688",
   "metadata": {},
   "outputs": [],
   "source": [
    "import ast\n",
    "import re\n",
    "import inspect\n",
    "import uuid\n",
    "import traceback\n",
    "import random\n",
    "import json\n",
    "\n",
    "class LLMCG():\n",
    "    def __init__(self, LLM_model):\n",
    "        self.LLM_model = LLM_model\n",
    "\n",
    "    def refine_codes(self, generated_codes, code_results, test_cases, error_test_num):\n",
    "        print(\"INFO: Refining codes based on evaluation results...\")\n",
    "        # Mock refinement: just adds a comment\n",
    "        for code_id, code_info in generated_codes.items():\n",
    "            code_info[\"content\"] += \"\\n  # refined\"\n",
    "        return generated_codes\n",
    "\n",
    "    \n",
    "    def _validate_and_create_function(self, code_string):\n",
    "        \"\"\"\n",
    "        Validates the generated Python code string for syntax and correctness.\n",
    "        If valid, it returns the compiled function object. Otherwise, returns None.\n",
    "        \"\"\"\n",
    "        try:\n",
    "            # Safely compile the code to check for syntax errors\n",
    "            compile(code_string, '<string>', 'exec')\n",
    "            \n",
    "            # Execute the code in a temporary namespace to define the function\n",
    "            local_namespace = {}\n",
    "            exec(code_string, {}, local_namespace)\n",
    "            \n",
    "            # Ensure 'test_function' was defined and is a callable function\n",
    "            if 'test_function' in local_namespace and callable(local_namespace['test_function']):\n",
    "                return local_namespace['test_function']\n",
    "            else:\n",
    "                print(f\"Warning: 'test_function' not found in executed code.\\nCode: {code_string}\")\n",
    "                return None\n",
    "        except Exception as e:\n",
    "            print(f\"Warning: Failed to validate or create function due to {e}.\\nCode: {code_string}\")\n",
    "            return None\n",
    "\n",
    "    def generate_test_cases_from_raw(self, raw_llm_outputs, test_type):\n",
    "\n",
    "        # 1. Normalize the input to always be a list of strings\n",
    "        if isinstance(raw_llm_outputs, str):\n",
    "            outputs_list = [raw_llm_outputs]\n",
    "        else:\n",
    "            outputs_list = raw_llm_outputs\n",
    "\n",
    "        if not outputs_list:\n",
    "            return []\n",
    "\n",
    "        # 2. Generate an extraction prompt for each raw output\n",
    "        prompts = [self._get_extraction_prompt(raw_output) for raw_output in outputs_list]\n",
    "\n",
    "        # 3. Call the LLM asynchronously to format all prompts\n",
    "        llm_results, all_successful = self.LLM_model.LLM_response_async(prompts)\n",
    "\n",
    "        # if not all_successful:\n",
    "        #     print(\"Warning: At least one LLM call failed during test formatting.\")\n",
    "\n",
    "        # 4. Process each LLM result\n",
    "        all_test_cases = []\n",
    "        for i, (success, formatted_response, _original_prompt) in enumerate(llm_results):\n",
    "            if not success:\n",
    "                print(f\"Warning: Failed to get formatted response. Error: {formatted_response}\")\n",
    "                continue\n",
    "\n",
    "            # Retrieve the original description for this test\n",
    "            original_raw_output = outputs_list[i]\n",
    "\n",
    "            # Parse the callable test functions from the formatted response\n",
    "            test_functions, code_strs = self._extract_and_parse_tests(formatted_response)\n",
    "\n",
    "            # 5. Create a dictionary for each successfully parsed function\n",
    "            for test_func, code_str in zip(test_functions, code_strs):\n",
    "                all_test_cases.append({\n",
    "                    \"test_function\": test_func,\n",
    "                    \"test_function_string\": code_str,\n",
    "                    \"description\": original_raw_output,\n",
    "                    \"type\": test_type,\n",
    "                    \"weight\": 1.0\n",
    "                })\n",
    "\n",
    "        return all_test_cases\n",
    "\n",
    "    def adding_new_test_case(self, test_cases, new_test_case):\n",
    "        next_id = max(test_cases.keys() or [-1]) + 1\n",
    "\n",
    "        # Normalize input to a list to handle both single and multiple additions\n",
    "        if isinstance(new_test_case, dict):\n",
    "            cases_to_add = [new_test_case]\n",
    "        elif isinstance(new_test_case, list):\n",
    "            cases_to_add = new_test_case\n",
    "        else:\n",
    "            raise TypeError(\"new_test_case must be a dictionary or a list of dictionaries.\")\n",
    "\n",
    "        # Iterate through the cases to add them to the main dictionary\n",
    "        for case in cases_to_add:\n",
    "            if isinstance(case, dict):\n",
    "                test_cases[next_id] = case\n",
    "                next_id += 1 # Increment the ID for the next case\n",
    "            else:\n",
    "                # Handle cases where a list contains non-dictionary items\n",
    "                print(f\"Warning: Skipping non-dictionary item in list: {case}\")\n",
    "\n",
    "        return test_cases\n",
    "\n",
    "    def generate_tests(self, num, task_description, plan=None, original_test_cases=None, debug=False):\n",
    "        if original_test_cases is None:\n",
    "            # Ensure we start with a mutable list\n",
    "            test_cases = {}\n",
    "        else:\n",
    "            # Create a copy to avoid modifying the original list in place\n",
    "            test_cases = original_test_cases\n",
    "\n",
    "        # get test case generation prompts for different purposes\n",
    "        prompt_in_content = self.get_test_prompt_in_content(task_description, plan)\n",
    "        prompt_correctness = self.get_test_prompt_correctness(task_description, plan)\n",
    "        prompt_runnable_check = self.get_test_prompt_runnable(task_description, plan)\n",
    "\n",
    "        # use LLM to generate test cases\n",
    "        test_in_contents_raw = self.LLM_model.LLM_response(prompt_in_content)\n",
    "        test_runnable_checks_raw = self.LLM_model.LLM_response(prompt_runnable_check)\n",
    "        test_correctness_raw_list_generation, success_result = self.LLM_model.LLM_response_async([prompt_correctness]*num, max_workers=5)\n",
    "\n",
    "        if not success_result:\n",
    "            print(\"Warning: Some correctness test case generations failed.\")\n",
    "        test_correctness_raw_list = [\"\" for _ in range(num)]\n",
    "        for i, temp_list in enumerate(test_correctness_raw_list_generation):\n",
    "            if temp_list[0]:\n",
    "                test_correctness_raw_list[i] = temp_list[1]\n",
    "\n",
    "        # --- Extract test cases by using LLM calls ---\n",
    "        # print(task_description)\n",
    "        # print(\"#############################################################\")\n",
    "        # print(test_in_contents_raw)\n",
    "        # print(\"#############################################################\")\n",
    "        # print(test_runnable_checks_raw)\n",
    "        # print(\"#############################################################\")\n",
    "        # print(test_correctness_raw_list)\n",
    "        # raise NotImplementedError(\"debug\")\n",
    "    \n",
    "        # 1. Extract from task description\n",
    "        test_cases = self.adding_new_test_case(test_cases, self.generate_test_cases_from_raw(test_in_contents_raw, 'in_content'))\n",
    "        \n",
    "        # 2. Extract from runnable checks\n",
    "        # print(test_runnable_checks_raw)\n",
    "        test_cases = self.adding_new_test_case(test_cases, self.generate_test_cases_from_raw(test_runnable_checks_raw, 'runnable'))\n",
    "\n",
    "        # 3. Extract from correctness checks\n",
    "        test_cases = self.adding_new_test_case(test_cases, self.generate_test_cases_from_raw(test_correctness_raw_list, 'correctness'))\n",
    "\n",
    "        if debug:\n",
    "            debug_info = {\"prompt_in_content\": prompt_in_content,\n",
    "                          \"prompt_correctness\": prompt_correctness,\n",
    "                          \"prompt_runnable_check\": prompt_runnable_check,\n",
    "                          \"raw_in_content\": test_in_contents_raw,\n",
    "                          \"raw_runnable_check\": test_runnable_checks_raw,\n",
    "                          \"raw_correctness\": test_correctness_raw_list,\n",
    "                          \"test_cases\": test_cases}\n",
    "            return test_cases, debug_info\n",
    "\n",
    "        return test_cases\n",
    "\n",
    "    def _get_extraction_prompt(self, raw_llm_output):\n",
    "        prompt = f\"\"\"\n",
    "You are an expert programmer specializing in writing Python unit tests.\n",
    "Your task is to extract all test cases from the provided raw text and convert them into executable Python test functions.\n",
    "\n",
    "**Instructions:**\n",
    "\n",
    "1.  **Function Signature**: Each test function must have the exact signature `def test_case(func_to_test):`. The `func_to_test` parameter is the function that will be under test.\n",
    "2.  **Return Value**: Each test function must return a tuple `(bool, str)`.\n",
    "    * The first element is a boolean: `True` if the test passes, `False` otherwise.\n",
    "    * The second element is a string message. If the test fails, this message must be informative, explaining the reason for failure (e.g., \"Expected: <expected_value>, Got: <actual_value>\").\n",
    "3.  **Error Handling**: Wrap the call to `func_to_test` in a `try...except` block to catch any exceptions during its execution. If an exception occurs, the test should fail and the message should include the exception details.\n",
    "4.  **Multiple Tests**: If the raw text contains multiple test cases, generate a separate `test_case` function for each.\n",
    "5.  **Formatting**:\n",
    "    * Enclose all the generated Python code within a single markdown code block (e.g., ```python ... ```).\n",
    "    * Use the exact separator `---TEST-CASE-SEPARATOR---` on its own line between each distinct `test_case` function.\n",
    "\n",
    "**Example:**\n",
    "\n",
    "* **Raw Text:**\n",
    "    \"The function must handle edge cases like an empty list. For `[]`, it should return `0`. Also, check for a list with a single element like `[5]`, which should return `5`.\"\n",
    "\n",
    "* **Expected Output:**\n",
    "    ```python\n",
    "    import traceback\n",
    "\n",
    "    def test_case(func_to_test):\n",
    "        try:\n",
    "            input_val = []\n",
    "            expected_output = 0\n",
    "            actual_output = func_to_test(input_val)\n",
    "            if actual_output == expected_output:\n",
    "                return True, \"Test passed for empty list.\"\n",
    "            else:\n",
    "                return False, f\"Test failed for input {{input_val}}. Expected: {{expected_output}}, Got: {{actual_output}}\"\n",
    "        except Exception as e:\n",
    "            return False, f\"Test failed for input {{input_val}} with exception: {{e}}\\\\n{{traceback.format_exc()}}\"\n",
    "\n",
    "    ---TEST-CASE-SEPARATOR---\n",
    "\n",
    "    import traceback\n",
    "\n",
    "    def test_case(func_to_test):\n",
    "        try:\n",
    "            input_val = [5]\n",
    "            expected_output = 5\n",
    "            actual_output = func_to_test(input_val)\n",
    "            if actual_output == expected_output:\n",
    "                return True, \"Test passed for single-element list.\"\n",
    "            else:\n",
    "                return False, f\"Test failed for input {{input_val}}. Expected: {{expected_output}}, Got: {{actual_output}}\"\n",
    "        except Exception as e:\n",
    "            return False, f\"Test failed for input {{input_val}} with exception: {{e}}\\\\n{{traceback.format_exc()}}\"\n",
    "    ```\n",
    "\n",
    "Now, please process the following raw text and generate the corresponding Python test functions.\n",
    "\n",
    "**Raw Text to Process:**\n",
    "\n",
    "---\n",
    "{raw_llm_output}\n",
    "---\n",
    "\"\"\"\n",
    "        return prompt.strip()\n",
    "\n",
    "    def _extract_and_parse_tests(self, llm_output):\n",
    "\n",
    "        # 1. Find the Python code block using a regular expression\n",
    "        code_block_match = re.search(r\"```python\\n(.*?)```\", llm_output, re.DOTALL)\n",
    "        if not code_block_match:\n",
    "            print(\"Warning: Could not find a python code block in the LLM output.\")\n",
    "            print(f\"LLM Output:\\n{llm_output}\")\n",
    "            return [], []\n",
    "\n",
    "        code_block = code_block_match.group(1).strip()\n",
    "        \n",
    "        # 2. Split the code block into individual test case strings\n",
    "        test_case_strings = code_block.split(\"---TEST-CASE-SEPARATOR---\")\n",
    "\n",
    "        test_functions = []\n",
    "        code_strs = []\n",
    "        for i, code_str in enumerate(test_case_strings):\n",
    "            code_str = code_str.strip()\n",
    "            code_strs.append(code_str)\n",
    "            if not code_str:\n",
    "                continue\n",
    "\n",
    "            try:\n",
    "                # 3. Execute the code in an isolated namespace to define the function\n",
    "                local_namespace = {}\n",
    "                exec(code_str, {}, local_namespace)\n",
    "\n",
    "                # 4. Retrieve the compiled function from the namespace\n",
    "                if 'test_case' in local_namespace and callable(local_namespace['test_case']):\n",
    "                    test_functions.append(local_namespace['test_case'])\n",
    "                else:\n",
    "                    print(f\"Warning: 'test_case' function not found in code snippet #{i+1}.\")\n",
    "\n",
    "            except SyntaxError as e:\n",
    "                print(f\"Warning: Syntax error parsing test case #{i+1}: {e}\")\n",
    "                print(f\"Problematic code:\\n---\\n{code_str}\\n---\")\n",
    "            except Exception as e:\n",
    "                print(f\"Warning: An unexpected error occurred while parsing test case #{i+1}: {e}\")\n",
    "                print(f\"Problematic code:\\n---\\n{code_str}\\n---\")\n",
    "\n",
    "        return test_functions, code_strs\n",
    "    \n",
    "    def get_test_prompt_in_content(self, task_description, plan=None):\n",
    "        # Conditionally define the text and section for the plan\n",
    "        if plan:\n",
    "            context_description = \"task description and plan\"\n",
    "            plan_section = f\"\"\"**Plan:**\n",
    "---\n",
    "{plan}\n",
    "---\"\"\"\n",
    "        else:\n",
    "            context_description = \"task description\"\n",
    "            plan_section = \"\"\n",
    "\n",
    "        # Construct the prompt, inserting the plan section only if it exists\n",
    "        prompt_temp = f\"\"\"\n",
    "You are a meticulous assistant. Your task is to carefully read the following {context_description} and extract any explicit examples or test cases mentioned within it.\n",
    "\n",
    "**Task Description:**\n",
    "---\n",
    "{task_description}\n",
    "---\n",
    "{plan_section}\n",
    "**Your Instructions:**\n",
    "1.  Read the {context_description} carefully.\n",
    "2.  Identify every example that demonstrates the function's behavior, usually in the form of an input and its corresponding expected output.\n",
    "3.  List each input-output pair you find.\n",
    "4.  **Crucially**, if you find absolutely no examples or test cases in the text, you must return the single word: **None**.\n",
    "\n",
    "Do not invent any new test cases. Only extract what is explicitly written.\n",
    "\"\"\"\n",
    "        return prompt_temp.strip()\n",
    "\n",
    "    def get_test_prompt_correctness(self, task_description, plan=None):\n",
    "        if plan:\n",
    "            context_description = \"task description and plan\"\n",
    "            plan_section = f\"\"\"**Plan:**\n",
    "---\n",
    "{plan}\n",
    "---\"\"\"\n",
    "        else:\n",
    "            context_description = \"task description\"\n",
    "            plan_section = \"\"\n",
    "\n",
    "        prompt_temp = f\"\"\"\n",
    "You are a senior software quality engineer. Your goal is to design a single, high-quality test case to verify the correctness of a function based on the {context_description} provided. This test case must be **novel** and **not** one of the examples already mentioned in the description.\n",
    "\n",
    "**Task Description:**\n",
    "---\n",
    "{task_description}\n",
    "---\n",
    "{plan_section}\n",
    "**Your Thought Process (Follow these steps meticulously):**\n",
    "\n",
    "1.  **Analyze the Task:**\n",
    "    * What is the primary goal of the function?\n",
    "    * What are the specified inputs and their data types?\n",
    "    * What is the specified output and its data type?\n",
    "    * What are the constraints and edge cases (e.g., empty inputs, negative numbers, large values, specific formats)?\n",
    "\n",
    "2.  **Identify a Novel Test Scenario:**\n",
    "    * Review the examples in the description (if any) and consciously choose a different category of input. Consider edge cases, corner cases, or common failure points for this type of problem. For example, if the description tests positive numbers, consider testing negative numbers, zero, or a mix.\n",
    "\n",
    "3.  **Generate the Test Case (Show your work):**\n",
    "    * **Step A: Propose the Input.** State the exact input you will use for your test.\n",
    "    * **Step B: Reason Step-by-Step.** Walk through the logic required by the task description, applying it to your chosen input. Explain how you arrive at the final output. This reasoning is the most important part.\n",
    "    * **Step C: State the Expected Output.** Clearly state the final, correct output based on your reasoning.\n",
    "\n",
    "Provide your analysis and the final test case (Input, Reasoning, and Output).\n",
    "\"\"\"\n",
    "        return prompt_temp.strip()\n",
    "    \n",
    "    def get_test_prompt_runnable(self, task_description, plan=None):\n",
    "        if plan:\n",
    "            context_description = \"task description and plan\"\n",
    "            plan_section = f\"\"\"**Plan:**\n",
    "---\n",
    "{plan}\n",
    "---\"\"\"\n",
    "        else:\n",
    "            context_description = \"task description\"\n",
    "            plan_section = \"\"\n",
    "\n",
    "        prompt_temp = f\"\"\"\n",
    "You are a build engineer creating a \"smoke test\". The goal is **not** to check for correctness, but simply to ensure a function is runnable (i.e., it can be called without crashing due to syntax errors or basic type mismatches).\n",
    "\n",
    "**Task Description:**\n",
    "---\n",
    "{task_description}\n",
    "---\n",
    "{plan_section}\n",
    "**Your Instructions:**\n",
    "\n",
    "1.  **Analyze the Function Signature:** Based on the {context_description}, determine the expected data type and structure of the input arguments. For example, does it take a single integer, a list of strings, two arguments?\n",
    "\n",
    "2.  **Propose a Mock Input:** Create the simplest possible, valid input that conforms to the signature you identified.\n",
    "\n",
    "3.  **Describe the Test Logic:** Explain that the test involves calling the function with this mock input. The test passes if the function executes and returns *anything* without raising an exception. The actual return value does not matter for this specific test. Describe the necessary components for a test that would run the function and catch any potential errors.\n",
    "\"\"\"\n",
    "        return prompt_temp.strip()\n",
    "    \n",
    "    def _sample_test_case(self, test_cases, num_test):\n",
    "        if not test_cases or num_test <= 0:\n",
    "            return []\n",
    "\n",
    "        all_test_strings = [\n",
    "            data['test_function_string']\n",
    "            for data in test_cases.values()\n",
    "            if 'test_function_string' in data\n",
    "        ]\n",
    "\n",
    "        if not all_test_strings:\n",
    "            return []\n",
    "\n",
    "        # Ensure we don't try to sample more items than exist\n",
    "        num_to_sample = min(num_test, len(all_test_strings))\n",
    "\n",
    "        return random.sample(all_test_strings, num_to_sample)\n",
    "\n",
    "    def _construct_prompt(self, plan, task_description, sampled_tests, use_task_description=True):\n",
    "\n",
    "        prompt_parts = [\n",
    "            \"You are an expert Python algorithm engineer. Your task is to generate a complete and runnable Python script based on the provided plan and context.\",\n",
    "            \"Follow these instructions carefully:\",\n",
    "            \"1. **Reasoning First**: Before writing any code, provide a step-by-step reasoning of your approach. Explain the chosen algorithms, data structures, and the logic for the main function. This thought process is critical.\",\n",
    "            \"2. **Code Generation**: After your reasoning, provide the complete Python code in a single block. The code must be fully functional and self-contained.\",\n",
    "            \"3. **Main Function**: The script MUST include a `main` function that serves as the entry point. This function must accept inputs and return outputs exactly as described in the plan, as it will be used for automated evaluation.\",\n",
    "            \"4. **No Type Hints**: Do not use type hints from the `typing` module in your code.\",\n",
    "            \"\\n---\\n\"\n",
    "        ]\n",
    "\n",
    "        # Add the core plan\n",
    "        prompt_parts.append(\"## Plan to Implement\\n\" + plan)\n",
    "\n",
    "        # Conditionally add the task description\n",
    "        if use_task_description:\n",
    "            prompt_parts.append(\"## Task Description\\n\" + task_description)\n",
    "\n",
    "        # Conditionally add sampled test cases for context\n",
    "        if sampled_tests:\n",
    "            test_cases_str = \"\\n\".join(sampled_tests)\n",
    "            prompt_parts.append(\n",
    "                \"## Example Test Cases\\n\"\n",
    "                \"Here are some example test cases to help you understand the required input/output format. Your solution should be able to pass these.\\n\"\n",
    "                + test_cases_str\n",
    "            )\n",
    "        \n",
    "        prompt_parts.append(\"\\n---\\n\")\n",
    "        prompt_parts.append(\"Now, begin with your reasoning, followed by the complete code.\")\n",
    "\n",
    "        return \"\\n\".join(prompt_parts)\n",
    "\n",
    "    def generate_code_with_reasoning(self, task_description, plans, num_codes, test_cases=None, num_test=0, use_task_description=True):\n",
    "        sampled_tests = []\n",
    "        if test_cases and num_test > 0:\n",
    "            sampled_tests = self._sample_test_case(test_cases, num_test)\n",
    "\n",
    "        prompt = self._construct_prompt(\n",
    "            plans,\n",
    "            task_description,\n",
    "            use_task_description,\n",
    "            sampled_tests\n",
    "        )\n",
    "\n",
    "        if num_codes <= 0:\n",
    "            return []\n",
    "\n",
    "        if num_codes == 1:\n",
    "            try:\n",
    "                # Use synchronous call for a single request\n",
    "                response = self.LLM_model.LLM_response(prompt)\n",
    "                return [response] if response else []\n",
    "            except Exception as e:\n",
    "                print(f\"An error occurred during single LLM call: {e}\")\n",
    "                return []\n",
    "        else:\n",
    "            # Use asynchronous call for multiple requests for efficiency\n",
    "            prompts_list = [prompt] * num_codes\n",
    "            try:\n",
    "                results, all_successful = self.LLM_model.LLM_response_async(prompts_list)\n",
    "                if not all_successful:\n",
    "                    print(\"Warning: Not all asynchronous LLM calls were successful.\")\n",
    "                \n",
    "                # Extract successful responses\n",
    "                successful_responses = [res[1] for res in results if res[0]]\n",
    "                return successful_responses\n",
    "            except Exception as e:\n",
    "                print(f\"An error occurred during asynchronous LLM calls: {e}\")\n",
    "                return []\n",
    "\n",
    "    def _code_extract_prompt(self, raw_llm_output):\n",
    "        prompt = f\"\"\"\n",
    "You are a precise code parsing tool. Your task is to extract the complete Python code block and the name of the main function from the text provided below. The text contains reasoning followed by the code.\n",
    "\n",
    "Respond ONLY with a JSON object in the following format:\n",
    "{{\n",
    "  \"code_str\": \"...\",\n",
    "  \"main_func_name\": \"...\"\n",
    "}}\n",
    "\n",
    "- The value for \"code_str\" should be the entire, clean Python code as a single string. This includes all necessary imports and functions.\n",
    "- The value for \"main_func_name\" should be the name of the main entry point function as a string.\n",
    "- If you cannot find a valid Python code block or a main function, return a JSON object with empty strings for both values. Do not add any explanation.\n",
    "\n",
    "--- TEXT TO PARSE ---\n",
    "{raw_llm_output}\n",
    "--- END OF TEXT ---\n",
    "\"\"\"\n",
    "        return prompt\n",
    "\n",
    "    def code_extraction(self, stage_one_outputs, plan_id):\n",
    "        if not stage_one_outputs:\n",
    "            return []\n",
    "\n",
    "        # Create a list of prompts for the async call\n",
    "        prompts = [self._code_extract_prompt(output) for output in stage_one_outputs]\n",
    "        \n",
    "        try:\n",
    "            results, all_successful = self.LLM_model.LLM_response_async(prompts)\n",
    "        except Exception as e:\n",
    "            print(f\"An error occurred during asynchronous LLM calls for extraction: {e}\")\n",
    "            # On catastrophic failure, return a list of failure records\n",
    "            return [\n",
    "                {\"code_str\": \"\", \"main_func_name\": \"\", \"reasoning\": original_output, \"plan_id\": plan_id}\n",
    "                for original_output in stage_one_outputs\n",
    "            ]\n",
    "\n",
    "        extracted_data = []\n",
    "        \n",
    "        # Correlate original outputs with their corresponding results\n",
    "        output_result_pairs = zip(stage_one_outputs, results)\n",
    "\n",
    "        for original_output, result_tuple in output_result_pairs:\n",
    "            success, response, _ = result_tuple\n",
    "\n",
    "            if not success:\n",
    "                # Handle failure of the LLM call itself\n",
    "                extracted_data.append({\n",
    "                    \"code_str\": \"\",\n",
    "                    \"main_func_name\": \"\",\n",
    "                    \"reasoning\": original_output,\n",
    "                    \"plan_id\": plan_id\n",
    "                })\n",
    "                continue\n",
    "\n",
    "            try:\n",
    "                # The primary success path: parsing the JSON response\n",
    "                parsed_json = json.loads(response)\n",
    "                \n",
    "                # Use .get() for safe dictionary access, providing \"\" as a default\n",
    "                code_str = parsed_json.get(\"code_str\", \"\")\n",
    "                main_func_name = parsed_json.get(\"main_func_name\", \"\")\n",
    "\n",
    "                if not isinstance(code_str, str) or not isinstance(main_func_name, str):\n",
    "                    # Handle case where JSON is valid but types are wrong\n",
    "                    raise TypeError(\"JSON values are not strings.\")\n",
    "\n",
    "                extracted_data.append({\n",
    "                    \"code_str\": code_str,\n",
    "                    \"main_func_name\": main_func_name,\n",
    "                    \"reasoning\": original_output,\n",
    "                    \"plan_id\": plan_id\n",
    "                })\n",
    "\n",
    "            except (json.JSONDecodeError, TypeError) as e:\n",
    "                # Handle cases where the LLM response is not valid JSON or has wrong types\n",
    "                print(f\"Failed to parse LLM response for extraction: {e}\")\n",
    "                extracted_data.append({\n",
    "                    \"code_str\": \"\",\n",
    "                    \"main_func_name\": \"\",\n",
    "                    \"reasoning\": original_output,\n",
    "                    \"plan_id\": plan_id\n",
    "                })\n",
    "\n",
    "        return extracted_data\n",
    "\n",
    "    def generate_codes(self, num_codes, task_description, plan_id, plan, test_cases=None, num_test_cases=1, use_task_description=True):\n",
    "\n",
    "        raw_outputs = self.stage_one_generator.generate_code_with_reasoning(\n",
    "            task_description=task_description,\n",
    "            plans=plan,\n",
    "            num_codes=num_codes,\n",
    "            test_cases=test_cases,\n",
    "            num_test=num_test_cases,\n",
    "            use_task_description=use_task_description\n",
    "        )\n",
    "\n",
    "        # If the first stage failed to produce any output, terminate and return.\n",
    "        if not raw_outputs:\n",
    "            print(\"Code Generation Warning: Stage One did not produce any output.\")\n",
    "            return []\n",
    "\n",
    "        # Pass the raw outputs from stage one to the second stage for parsing and structuring.\n",
    "        extracted_codes = self.code_extraction(stage_one_outputs=raw_outputs, plan_id=plan_id)\n",
    "\n",
    "        # restructured as a dictionary with unique IDs\n",
    "        extracted_codes = {str(uuid.uuid4()): code_info for code_info in extracted_codes}\n",
    "\n",
    "        # Return the final list of structured code objects.\n",
    "        return extracted_codes\n",
    "    \n",
    "    def preprocess_codes(self, codes):\n",
    "        updated_codes = codes.copy()\n",
    "        \n",
    "        # --- Step 1: Identify codes that need preprocessing ---\n",
    "        ids_to_embed = []\n",
    "        strings_to_embed = []\n",
    "\n",
    "        for code_id, info in updated_codes.items():\n",
    "            # Process AST if not present\n",
    "            if 'ast' not in info:\n",
    "                try:\n",
    "                    code_str = info.get('code_str', '')\n",
    "                    # Parse the code into an AST object\n",
    "                    parsed_ast = ast.parse(code_str)\n",
    "                    # Dump the AST into a serializable string format\n",
    "                    info['ast'] = ast.dump(parsed_ast)\n",
    "                except SyntaxError:\n",
    "                    info['ast'] = \"Error: Invalid Python syntax\"\n",
    "            \n",
    "            # Identify codes that need an embedding\n",
    "            if 'embedding' not in info:\n",
    "                ids_to_embed.append(code_id)\n",
    "                strings_to_embed.append(info.get('code_str', ''))\n",
    "\n",
    "        # --- Step 2: Batch process embeddings if any are needed ---\n",
    "        if strings_to_embed:\n",
    "            # Call the embedding API once for all unprocessed codes\n",
    "            embeddings = self.LLM_model.Embedding_response(strings_to_embed)\n",
    "            \n",
    "            # --- Step 3: Update the records with the new embeddings ---\n",
    "            for code_id, embedding_vector in zip(ids_to_embed, embeddings):\n",
    "                updated_codes[code_id]['embedding'] = embedding_vector\n",
    "\n",
    "        return updated_codes"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "086fe0e1",
   "metadata": {},
   "source": [
    "### Evaluator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "349a8008",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import subprocess\n",
    "import tempfile\n",
    "import json\n",
    "import concurrent.futures\n",
    "from tqdm import tqdm\n",
    "\n",
    "class Evaluator():\n",
    "    def __init__(self):\n",
    "        pass\n",
    "\n",
    "    def evaluate_codes(self, codes, test_cases, timeout):\n",
    "        print(f\"INFO: Evaluating {len(codes)} codes against {len(test_cases)} test cases...\")\n",
    "        # Mock evaluation: assign random scores\n",
    "        import random\n",
    "        return {code_id: {\"score\": random.random(), \"passed\": random.randint(0, len(test_cases))} for code_id in codes}\n",
    "    \n",
    "    def static_code_analyzer(self, codes):\n",
    "        print(\"INFO: Running static code analysis...\")\n",
    "        code_id_list = list(codes.keys())\n",
    "        code_strs = [code_info[\"content\"] for code_info in codes.values()]\n",
    "\n",
    "        with concurrent.futures.ThreadPoolExecutor() as executor:\n",
    "            static_scores = list(tqdm(\n",
    "                executor.map(self._compute_static_scores, code_strs),\n",
    "                total=len(code_strs),\n",
    "                desc=\"Analyzing codes\"\n",
    "            ))\n",
    "\n",
    "        return {code_id: {\"pylint_score\": scores[0], \"radon_mi_score\": scores[1]} for code_id, scores in zip(code_id_list, static_scores)}\n",
    "\n",
    "    def _compute_static_scores(self, code_str):\n",
    "        return (\n",
    "            self.pylint_code_score(code_str),\n",
    "            self.radon_mi_code_score(code_str)\n",
    "        )\n",
    "\n",
    "    def pylint_code_score(self, code):\n",
    "        try:\n",
    "            with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as tmp:\n",
    "                tmp.write(code)\n",
    "                tmp_path = tmp.name\n",
    "            \n",
    "            result = subprocess.run(\n",
    "                [\"pylint\", \"--output-format=text\", tmp_path],\n",
    "                capture_output=True,\n",
    "                text=True,\n",
    "                check=False\n",
    "            )\n",
    "            os.unlink(tmp_path)\n",
    "            \n",
    "            match = re.search(r\"rated at (\\d+\\.?\\d*)/10\", result.stdout)\n",
    "            return float(match.group(1))/10 if match else -1\n",
    "        \n",
    "        except Exception as e:\n",
    "            print(f\"Pylint analysis failed: {e}\")\n",
    "            return -1\n",
    "\n",
    "    def radon_mi_code_score(self, code):\n",
    "        try:\n",
    "            with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as tmp:\n",
    "                tmp.write(code)\n",
    "                tmp_path = tmp.name\n",
    "            \n",
    "            result = subprocess.run(\n",
    "                [\"radon\", \"mi\", \"--json\", tmp_path],\n",
    "                capture_output=True,\n",
    "                text=True,\n",
    "                check=False\n",
    "            )\n",
    "            os.unlink(tmp_path)\n",
    "            \n",
    "            data = json.loads(result.stdout)\n",
    "            if data and isinstance(data, dict):\n",
    "                return list(data.values())[0][\"mi\"] / 100\n",
    "            return -1\n",
    "        except Exception as e:\n",
    "            print(f\"Radon analysis failed: {e}\")\n",
    "            return -1"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "aa75a5d5",
   "metadata": {},
   "source": [
    "### CodeRunner"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9f436b92",
   "metadata": {},
   "outputs": [],
   "source": [
    "class CodeRunner:\n",
    "    def __init__(self, max_workers=5):\n",
    "        self.max_workers = max_workers\n",
    "\n",
    "    def update_weights(self, codes, test_cases, last_results):\n",
    "        print(\"INFO: Updating weights for test cases and scores for codes...\")\n",
    "        # This method would analyze which tests are better at discriminating\n",
    "        # good code from bad, and update their weights accordingly.\n",
    "        # It would also re-calculate code scores based on new test case weights.\n",
    "        return test_cases, codes\n",
    "\n",
    "    def filter_by_weights(self, test_cases, codes, code_threshold=0.2, test_threshold=0.2):\n",
    "        print(\"INFO: Filtering low-score codes and low-confidence test cases...\")\n",
    "        original_code_count = len(codes)\n",
    "        original_test_count = len(test_cases)\n",
    "\n",
    "        codes = {k: v for k, v in codes.items() if v.get('score', 0) > code_threshold}\n",
    "        test_cases = {k: v for k, v in test_cases.items() if v.get('weight', 0) > test_threshold}\n",
    "\n",
    "        print(f\"INFO: Codes reduced from {original_code_count} to {len(codes)}\")\n",
    "        print(f\"INFO: Test cases reduced from {original_test_count} to {len(test_cases)}\")\n",
    "        return test_cases, codes\n",
    "\n",
    "    def select_best_plan(self, plans, codes):\n",
    "        print(\"INFO: Selecting the best performing plan...\")\n",
    "        plan_scores = {}\n",
    "        for code_id, code_info in codes.items():\n",
    "            plan_id = code_info.get(\"plan_id\")\n",
    "            if plan_id:\n",
    "                plan_scores.setdefault(plan_id, []).append(code_info.get(\"score\", 0))\n",
    "\n",
    "        if not plan_scores:\n",
    "            return list(plans.keys())[0] if plans else \"default_plan\"\n",
    "\n",
    "        # Average score per plan\n",
    "        avg_scores = {p_id: sum(s) / len(s) for p_id, s in plan_scores.items()}\n",
    "        best_plan_id = max(avg_scores, key=avg_scores.get)\n",
    "        print(f\"INFO: Best plan is '{best_plan_id}' with average score {avg_scores[best_plan_id]:.2f}\")\n",
    "        return best_plan_id"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "939ee899",
   "metadata": {},
   "source": [
    "### LCDP"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2d8ce09f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import copy\n",
    "import logging\n",
    "import pandas as pd\n",
    "from tqdm.asyncio import tqdm_asyncio\n",
    "from pathlib import Path\n",
    "from datetime import datetime\n",
    "# from LLM_call import LLMModel\n",
    "# from LLM_TM import LLMTM\n",
    "# from LLM_CG import LLMCG, CodeRunner\n",
    "# from Evaluator import Evaluator\n",
    "from collections import defaultdict\n",
    "from typing import Dict, List\n",
    "from tqdm import tqdm\n",
    "import numpy as np\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "from scipy.linalg import solve\n",
    "import collections\n",
    "\n",
    "# # 创建Logger实例\n",
    "# logger = logging.getLogger(__name__)\n",
    "# logger.setLevel(logging.DEBUG)\n",
    "\n",
    "# # 定义日志格式\n",
    "# formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')\n",
    "\n",
    "# # 输出到文件的Handler\n",
    "# log_dir = Path(\"log\")\n",
    "# log_dir.mkdir(exist_ok=True)\n",
    "# timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
    "# log_filename = f\"{timestamp}.log\"\n",
    "# log_path = log_dir / log_filename\n",
    "# file_handler = logging.FileHandler(log_path)\n",
    "# file_handler.setLevel(logging.DEBUG)\n",
    "# file_handler.setFormatter(formatter)\n",
    "\n",
    "# # 输出到控制台的Handler\n",
    "# console_handler = logging.StreamHandler()\n",
    "# console_handler.setLevel(logging.INFO)\n",
    "# console_handler.setFormatter(formatter)\n",
    "\n",
    "# # 将Handler添加到Logger\n",
    "# logging.basicConfig(\n",
    "#     level=logging.DEBUG,\n",
    "#     datefmt=\"%Y-%m-%d %H:%M:%S\",\n",
    "#     handlers=[\n",
    "#         file_handler,   # 文件输出\n",
    "#         console_handler # 控制台输出\n",
    "#     ]\n",
    "# )\n",
    "\n",
    "\n",
    "class LCDP():\n",
    "    def __init__(self, api_key, model=\"gpt-3.5-turbo\", base_url=\"https://api.openai.com/v1/\", max_workers=5, ignore_advice=False, use_web_search=False):\n",
    "        self.llm_model = LLMModel(api_key, model, base_url=base_url, use_web_search=use_web_search)\n",
    "        self.llmtm = LLMTM(self.llm_model)\n",
    "        self.llmcg = LLMCG(self.llm_model)\n",
    "        self.code_runner = CodeRunner(max_workers=max_workers)\n",
    "        self.evaluator = Evaluator()\n",
    "        self.task_description = None\n",
    "        self.current_plan = {}\n",
    "        self.test_cases = {}\n",
    "        self.codes = {}\n",
    "        self.test_timeout = None\n",
    "        self.ignore_advice = ignore_advice\n",
    "        self.gp_model = None\n",
    "\n",
    "    def initialize(self):\n",
    "        self.task_description = None\n",
    "        self.current_plan = {}\n",
    "        self.test_cases = {}\n",
    "        self.codes = {}\n",
    "        self.test_timeout = None\n",
    "        self.gp_model = None\n",
    "        self.k = 1\n",
    "\n",
    "    def run(self, \n",
    "            task_description, \n",
    "            max_iterations=3, \n",
    "            stop_t=0.8, \n",
    "            num_plans=3, \n",
    "            num_tests=5, \n",
    "            num_codes=5,\n",
    "            num_codes_select=3, \n",
    "            refine_rounds=3,\n",
    "            code_refine_rounds=3,\n",
    "            test_timeout=None, \n",
    "            min_tests=15, \n",
    "            max_tests=20, \n",
    "        #   use_async_generation=True, \n",
    "            knowledge_refine=False,\n",
    "            best_only=False, \n",
    "            error_test_num=3, \n",
    "            record_all_results=False, \n",
    "            forced_test_cases=None):\n",
    "        \n",
    "        self.initialize()\n",
    "\n",
    "        if knowledge_refine:\n",
    "            print(\"\\n########################################################################\")\n",
    "            print(\"### Phase 0: Refine Task Description\")\n",
    "            self.task_description = self.llmtm.task_knowledge_refinement(task_description)\n",
    "\n",
    "        # generate plans\n",
    "        print(\"\\n########################################################################\")\n",
    "        print(\"### Phase 1: Plan Generation and Refinement\")\n",
    "        plan = self.llmtm.get_plan(self.task_description, num_plans=num_plans)\n",
    "\n",
    "        if self.ignore_advice:\n",
    "            # skip plan refinement by ignoring user feedback\n",
    "            self.current_plan = plan\n",
    "        else:\n",
    "            # User iterative refinement to improve/select the plan\n",
    "            self.current_plan = self.llmtm.plan_refinement_loop(plan, refine_rounds)\n",
    "\n",
    "        self.plan_summary = self.llmtm.summarize_plan(self.current_plan)\n",
    "        # print(self.plan_summary)\n",
    "\n",
    "        # generate test cases\n",
    "        print(\"\\n########################################################################\")\n",
    "        print(\"### Phase 2: Test Case Generation and Weighting\")\n",
    "        \n",
    "        if forced_test_cases is not None:\n",
    "            self.test_cases = forced_test_cases\n",
    "        \n",
    "        print(\"initialize test cases and weights...\")\n",
    "        self.test_cases, debug_info = self.llmcg.generate_tests(num_tests, self.task_description, plan=self.plan_summary, original_test_cases=self.test_cases, debug=True)\n",
    "\n",
    "        # print(self.test_cases)\n",
    "        # raise NotImplementedError(\"filter_test_cases method is not implemented yet.\")\n",
    "        # self.test_cases = self.llmcg.filter_test_cases(self.test_cases)\n",
    "\n",
    "        print(\"\\n########################################################################\")\n",
    "        print(\"### Phase 3: First generation and evaluation\")\n",
    "        \n",
    "        for plan_id, plan_i in self.current_plan.items():\n",
    "            # generate codes for each plan\n",
    "            print(\"generate code for plan: \", plan_id)\n",
    "            generated_codes = self.llmcg.generate_codes(num_codes, self.task_description, plan_id, plan_i, self.test_cases, use_task_description=True)\n",
    "\n",
    "            # evaluate codes and refine code by self-reflection and error analysis\n",
    "            for _ in range(code_refine_rounds):\n",
    "                print(\"\\n--- Code Evaluation ---\")\n",
    "                code_results = self.evaluator.evaluate_codes(generated_codes, self.test_cases, self.test_timeout)\n",
    "                print(\"\\n--- Code Refinement ---\")\n",
    "                generated_codes = self.llmcg.refine_codes(generated_codes, code_results, self.test_cases, error_test_num)\n",
    "\n",
    "            # evaluate codes based on static code analysis tools\n",
    "            static_analysis_results = self.evaluator.static_code_analyzer(generated_codes)\n",
    "\n",
    "            # update code storage with results\n",
    "            self.codes = self.update_codes(self.codes, generated_codes, code_results, static_analysis_results)\n",
    "\n",
    "        # preprocess code information (AST, embedding, etc.)\n",
    "        self.codes = self.llmcg.preprocess_codes(self.codes)\n",
    "\n",
    "        print(\"\\n########################################################################\")\n",
    "        print(\"### Phase 4: Iterative Code Generation and Evaluation\")\n",
    "        for iteration in range(max_iterations):\n",
    "            # update test cases' and codes' weight based on previous results\n",
    "            self.test_cases, self.codes = self.update_weights(self.codes, self.test_cases)\n",
    "\n",
    "            # filter test cases and codes based on weight\n",
    "            self.test_cases, self.codes = self.filter_by_weights(self.test_cases, self.codes, test_threshold=0.2, code_threshold=0.2)\n",
    "\n",
    "            # add more test cases if needed\n",
    "            if len(self.test_cases) < min_tests:\n",
    "                print(f\"INFO: Test cases below threshold ({len(self.test_cases)} < {min_tests}). Generating more.\")\n",
    "                required_tests = min_tests - len(self.test_cases)\n",
    "                self.test_cases, debug_info = self.llmcg.generate_tests(required_tests, self.task_description, plan=self.plan_summary, original_test_cases=self.test_cases, debug=True)\n",
    "\n",
    "            # select best plan based on current codes' performance\n",
    "            self.current_best_plan = self.select_best_plan(self.current_plan, self.codes)\n",
    "\n",
    "            # generate new plan based on current best plan\n",
    "            self.current_plan = self.llmtm.generate_new_plan(self.current_best_plan, self.task_description, self.codes, self.test_cases)\n",
    "            \n",
    "            # Optional user refinement for new plans\n",
    "            if not self.ignore_advice:\n",
    "                 new_plans = self.llmtm.plan_refinement_loop(self.llmtm, new_plans, refine_rounds)\n",
    "\n",
    "            all_candidate_codes = {}\n",
    "            for plan_id, plan_i in new_plans.items():\n",
    "                # 5. Generate several new \"candidate\" codes for each new plan\n",
    "                candidate_codes = self.llmcg.generate_codes(num_codes, self.task_description, plan_id, plan_i, self.test_cases, use_task_description=True)\n",
    "                all_candidate_codes.update(candidate_codes)\n",
    "\n",
    "            # 6. Preprocess all new codes \n",
    "            all_current_codes = self.llmcg.preprocess_codes(all_current_codes)\n",
    "\n",
    "            # 7. Build Gaussian Process model\n",
    "            self.gp_model = self.build_gp_model(self.codes)\n",
    "\n",
    "            # 8. Select the most promising codes using the model and an acquisition function\n",
    "            # k initial value is 1, each round times 0.75\n",
    "            self.k *= 0.75\n",
    "            codes_to_evaluate = self.select_codes_with_acquisition(all_candidate_codes, num_code_select=num_codes_select, k=self.k)\n",
    "\n",
    "            # 9. Evaluate ONLY the selected codes\n",
    "            print(f\"INFO: Evaluating {len(codes_to_evaluate)} selected codes...\")\n",
    "            for _ in range(code_refine_rounds):\n",
    "                code_results = self.evaluator.evaluate_codes(codes_to_evaluate, self.test_cases, test_timeout)\n",
    "                codes_to_evaluate = self.llmcg.refine_codes(codes_to_evaluate, code_results, self.test_cases, 3)\n",
    "\n",
    "            final_results = self.evaluator.evaluate_codes(codes_to_evaluate, self.test_cases, test_timeout)\n",
    "            static_analysis_results = self.evaluator.static_code_analyzer(codes_to_evaluate)\n",
    "\n",
    "            # 10. Update the main code repository with the newly evaluated codes\n",
    "            self.codes = self.update_codes(self.codes, codes_to_evaluate, final_results)\n",
    "            \n",
    "            # 11. Check for stopping condition\n",
    "            best_score = max(c.get('score', 0) for c in self.codes.values())\n",
    "            print(f\"INFO: Best score after iteration {iteration + 1} is {best_score:.2f}\")\n",
    "            if best_score >= stop_t:\n",
    "                print(f\"INFO: Stopping condition met. Score {best_score:.2f} >= {stop_t}\")\n",
    "                break\n",
    "        \n",
    "        # Final Output\n",
    "        if not self.codes:\n",
    "            print(\"ERROR: No code was generated.\")\n",
    "            return None\n",
    "            \n",
    "        best_code = max(self.codes.values(), key=lambda x: x.get('score', 0))\n",
    "        print(\"\\n########################################################################\")\n",
    "        print(\"### Process Finished\")\n",
    "        print(f\"Best code found with score: {best_code['score']:.2f}\")\n",
    "        print(\"Content:\")\n",
    "        print(best_code['content'])\n",
    "        print(\"########################################################################\")\n",
    "\n",
    "        return best_code\n",
    "\n",
    "    def update_codes(self, existing_codes, generated_codes, code_results, static_analysis_results):\n",
    "        # Make a copy to avoid modifying the original dict in place unexpectedly\n",
    "        updated_codes = existing_codes.copy()\n",
    "\n",
    "        # Iterate over each newly generated code\n",
    "        for code_id, code_info in generated_codes.items():\n",
    "            # Generate a new, guaranteed unique ID for our storage\n",
    "            new_unique_id = str(uuid.uuid4())\n",
    "\n",
    "            # Safely get test results, defaulting to an empty dict if not found\n",
    "            test_results = code_results.get(code_id, {})\n",
    "            \n",
    "            # Calculate the pass rate from test results\n",
    "            pass_rate = 0.0\n",
    "            if test_results:\n",
    "                passed_count = sum(1 for result in test_results.values() if result is True)\n",
    "                total_count = len(test_results)\n",
    "                if total_count > 0:\n",
    "                    pass_rate = (passed_count / total_count) * 100\n",
    "\n",
    "            # Create the consolidated record for the new code\n",
    "            updated_codes[new_unique_id] = {\n",
    "                'source_id': code_id,  # Keep track of the original ID\n",
    "                'code_str': code_info.get('code_str'),\n",
    "                'main_func_name': code_info.get('main_func_name'),\n",
    "                'reasoning': code_info.get('reasoning'),\n",
    "                'plan_id': code_info.get('plan_id'),\n",
    "                'test_results': test_results,\n",
    "                'pass_rate_percent': round(pass_rate, 2),\n",
    "                'static_analysis': static_analysis_results.get(code_id, {}) \n",
    "            }\n",
    "        \n",
    "        return updated_codes\n",
    "\n",
    "    def select_best_plan(self, plans, codes):\n",
    "        print(\"INFO: Selecting the best performing plan...\")\n",
    "        plan_scores = {}\n",
    "        for code_id, code_info in codes.items():\n",
    "            plan_id = code_info.get(\"plan_id\")\n",
    "            if plan_id:\n",
    "                plan_scores.setdefault(plan_id, []).append(code_info.get(\"score\", 0))\n",
    "\n",
    "        if not plan_scores:\n",
    "            return list(plans.keys())[0] if plans else \"default_plan\"\n",
    "\n",
    "        # Average score per plan\n",
    "        avg_scores = {p_id: sum(s) / len(s) for p_id, s in plan_scores.items()}\n",
    "        best_plan_id = max(avg_scores, key=avg_scores.get)\n",
    "        print(f\"INFO: Best plan is '{best_plan_id}' with average score {avg_scores[best_plan_id]:.2f}\")\n",
    "        return best_plan_id\n",
    "    \n",
    "    def update_weights(self,codes,test_cases,alpha = 0.9):\n",
    "        # --- 1. Calculate Code Scores ---\n",
    "        # The score is the weighted pass rate.\n",
    "        # S(x) = sum(I(x passes Ti) * C(Ti)) / sum(C(Ti))\n",
    "        \n",
    "        sum_of_all_weights = sum(t_info.get('weight', 0) for t_info in test_cases.values())\n",
    "\n",
    "        if sum_of_all_weights == 0:\n",
    "            # If all weights are zero, assign a default score of 0 to all codes\n",
    "            for code_id in codes:\n",
    "                codes[code_id]['score'] = 0.0\n",
    "        else:\n",
    "            for code_id, code_info in codes.items():\n",
    "                weighted_passes = 0\n",
    "                for test_id, passed in code_info.get('test_results', {}).items():\n",
    "                    if passed and test_id in test_cases:\n",
    "                        weighted_passes += test_cases[test_id].get('weight', 0)\n",
    "                \n",
    "                score = weighted_passes / sum_of_all_weights\n",
    "                codes[code_id]['score'] = round(score, 4)\n",
    "\n",
    "        # --- 2. Update Test Case Weights ---\n",
    "        # C(Ti)_new = (1-a)*C(Ti)_old + a * (avg_score_pass - avg_score_fail)\n",
    "        \n",
    "        for test_id, test_info in test_cases.items():\n",
    "            passing_code_scores = []\n",
    "            failing_code_scores = []\n",
    "\n",
    "            # Segregate codes based on whether they passed or failed this test case\n",
    "            for code_info in codes.values():\n",
    "                # Ensure the code has a result for the current test case\n",
    "                if test_id in code_info.get('test_results', {}):\n",
    "                    if code_info['test_results'][test_id]:\n",
    "                        passing_code_scores.append(code_info.get('score', 0))\n",
    "                    else:\n",
    "                        failing_code_scores.append(code_info.get('score', 0))\n",
    "            \n",
    "            # Calculate the average score for codes that passed the test\n",
    "            avg_pass_score = (\n",
    "                sum(passing_code_scores) / len(passing_code_scores)\n",
    "                if passing_code_scores else 0.0\n",
    "            )\n",
    "\n",
    "            # Calculate the average score for codes that failed the test\n",
    "            avg_fail_score = (\n",
    "                sum(failing_code_scores) / len(failing_code_scores)\n",
    "                if failing_code_scores else 0.0\n",
    "            )\n",
    "\n",
    "            discriminative_power = avg_pass_score - avg_fail_score\n",
    "            old_weight = test_info.get('weight', 1.0)\n",
    "\n",
    "            # Update the weight using the provided formula\n",
    "            new_weight = (1 - alpha) * old_weight + alpha * discriminative_power\n",
    "            test_cases[test_id]['weight'] = round(new_weight, 4)\n",
    "\n",
    "        return test_cases, codes\n",
    "\n",
    "    def filter_by_weights(self,test_cases,codes,test_threshold=0.2,code_threshold=0.2):\n",
    "        # --- 1. Filter Test Cases ---\n",
    "        filtered_test_cases = {\n",
    "            test_id: test_info\n",
    "            for test_id, test_info in test_cases.items()\n",
    "            if test_info.get('weight', 0) >= test_threshold\n",
    "        }\n",
    "        kept_test_ids = set(filtered_test_cases.keys())\n",
    "\n",
    "        # --- 2. Filter Codes ---\n",
    "        filtered_codes = {\n",
    "            code_id: code_info\n",
    "            for code_id, code_info in codes.items()\n",
    "            if code_info.get('score', 0) >= code_threshold\n",
    "        }\n",
    "\n",
    "        # --- 3. Clean 'test_results' in Surviving Codes ---\n",
    "        for code_id, code_info in filtered_codes.items():\n",
    "            original_results = code_info.get('test_results', {})\n",
    "            # Keep results only for test cases that were not filtered out\n",
    "            cleaned_results = {\n",
    "                test_id: result\n",
    "                for test_id, result in original_results.items()\n",
    "                if test_id in kept_test_ids\n",
    "            }\n",
    "            filtered_codes[code_id]['test_results'] = cleaned_results\n",
    "\n",
    "        return filtered_test_cases, filtered_codes\n",
    "    \n",
    "    def _rbf_kernel(self, X1, X2):\n",
    "        # 1. 计算余弦相似度\n",
    "        sim_matrix = cosine_similarity(X1, X2)\n",
    "\n",
    "        # 2. 计算余弦距离的平方\n",
    "        dist_sq = np.square(1 - sim_matrix)\n",
    "\n",
    "        # 3. 计算RBF核\n",
    "        if self.l is not None:\n",
    "            kernel = np.exp(-dist_sq / (2 * self.l**2))\n",
    "        else:\n",
    "            kernel = np.exp(-dist_sq)\n",
    "\n",
    "        return kernel\n",
    "\n",
    "    def build_gp_model(self, codes):\n",
    "        if not codes:\n",
    "            raise ValueError(\"Input 'codes' dictionary cannot be empty.\")\n",
    "\n",
    "        # 1. 从字典中提取数据\n",
    "        code_ids = list(codes.keys())\n",
    "        X_train = np.array([codes[cid][\"embedding\"] for cid in code_ids])\n",
    "        y_train = np.array([codes[cid][\"score\"] for cid in code_ids])\n",
    "\n",
    "        if X_train.ndim == 1:\n",
    "            X_train = X_train.reshape(1, -1)\n",
    "            y_train = y_train.reshape(-1)\n",
    "\n",
    "\n",
    "        # 2. 计算训练集的核矩阵 K\n",
    "        K = self._rbf_kernel(X_train, X_train)\n",
    "        # 加上噪声项以保证数值稳定性\n",
    "        K_stable = K + self.sigma_n * np.eye(len(X_train))\n",
    "\n",
    "        # 3. 预计算用于预测的关键部分\n",
    "        # 我们需要求解 (K + sigma*I) * alpha = y\n",
    "        # alpha = (K + sigma*I)^-1 * y\n",
    "        # 使用 solve 比直接求逆更稳定、更高效\n",
    "        try:\n",
    "            alpha = solve(K_stable, y_train, assume_a='pos')\n",
    "            K_inv = solve(K_stable, np.eye(len(X_train)), assume_a='pos')\n",
    "        except np.linalg.LinAlgError:\n",
    "            print(\"Warning: Kernel matrix is singular. Using pseudo-inverse.\")\n",
    "            pseudo_inv = np.linalg.pinv(K_stable)\n",
    "            alpha = pseudo_inv @ y_train\n",
    "            K_inv = pseudo_inv\n",
    "\n",
    "\n",
    "        self.gp_model = {\n",
    "            \"X_train\": X_train,\n",
    "            \"y_train\": y_train,\n",
    "            \"code_ids\": code_ids,\n",
    "            \"alpha\": alpha, # 预计算的 (K+sigma*I)^-1 * y\n",
    "            \"K_inv\": K_inv, # 预计算的 (K+sigma*I)^-1\n",
    "        }\n",
    "        print(f\"GP model built successfully with {len(code_ids)} data points.\")\n",
    "        return self.gp_model\n",
    "\n",
    "    def select_codes_with_acquisition(self,\n",
    "                                      all_candidate_codes: dict,\n",
    "                                      num_code_select: int,\n",
    "                                      k: float):\n",
    "        if self.gp_model is None:\n",
    "            raise RuntimeError(\"GP model has not been built. Please call 'build_gp_model' first.\")\n",
    "        if not all_candidate_codes:\n",
    "            return []\n",
    "\n",
    "        # 1. 从模型和输入中提取数据\n",
    "        X_train = self.gp_model[\"X_train\"]\n",
    "        alpha = self.gp_model[\"alpha\"]\n",
    "        K_inv = self.gp_model[\"K_inv\"]\n",
    "\n",
    "        candidate_ids = list(all_candidate_codes.keys())\n",
    "        X_candidate = np.array([all_candidate_codes[cid][\"embedding\"] for cid in candidate_ids])\n",
    "\n",
    "        # 2. 计算候选代码与训练集之间的核向量 k_*\n",
    "        k_star = self._rbf_kernel(X_candidate, X_train)\n",
    "\n",
    "        # 3. 计算每个候选代码的预测均值和方差\n",
    "        # 预测均值: mu = k_* @ alpha\n",
    "        mu_s = k_star @ alpha\n",
    "\n",
    "        # 预测方差: sigma^2 = k_** - k_*^T @ K_inv @ k_*\n",
    "        # k_** (候选点自身的核) 对角线上的值为1，因为d=0, exp(0)=1\n",
    "        k_star_star = 1\n",
    "        # var = k_** - np.diag(k_star @ K_inv @ k_star.T)\n",
    "        var_s = k_star_star - np.einsum('ij,jk,ik->i', k_star, K_inv, k_star)\n",
    "        # 确保方差非负，避免数值计算误差\n",
    "        var_s = np.maximum(var_s, 1e-8)\n",
    "        std_s = np.sqrt(var_s)\n",
    "\n",
    "        # 4. 计算UCB分数\n",
    "        ucb_scores = mu_s + k * std_s\n",
    "\n",
    "        # 5. 选择分数最高的代码\n",
    "        selected_indices = np.argsort(-ucb_scores)[:num_code_select]\n",
    "        selected_code_ids = [candidate_ids[i] for i in selected_indices]\n",
    "\n",
    "        # for i in selected_indices:\n",
    "        #     print(f\"Code {candidate_ids[i]}: UCB={ucb_scores[i]:.4f} (mu={mu_s[i]:.4f}, std={std_s[i]:.4f})\")\n",
    "\n",
    "        return selected_code_ids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "94d5af1f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'prompt_in_content': \"You are a meticulous assistant. Your task is to carefully read the following task description and extract any explicit examples or test cases mentioned within it.\\n\\n**Task Description:**\\n---\\n\\n\\n## Task Description\\n\\nWrite a Python function to compute the factorial of a number.\\n\\n## Expert Knowledge & Context\\n\\nTo compute the factorial of a number in Python, you can create a function that takes an integer input and calculates the product of all positive integers up to that number. The factorial of a non-negative integer n is denoted as n! and is the product of all positive integers less than or equal to n.\\n\\nOne common way to compute the factorial is using a recursive function that calls itself with a decreasing value of n until it reaches 1, at which point it returns 1. The function then multiplies the current value of n with the result of the recursive call.\\n\\nAnother approach is to use a loop to iterate from 1 to n and multiply the current result with the loop index in each iteration. This method can be more efficient for large values of n as it avoids the overhead of function calls.\\n\\nIt's important to consider edge cases such as handling negative inputs (which should result in an error or an exception), handling the factorial of 0 (which is defined as 1), and ensuring the function can handle large numbers without overflowing.\\n\\nOverall, understanding the concept of factorials and implementing an efficient algorithm in Python can be beneficial for solving various mathematical and programming problems that require the computation of factorials.\\n---\\n\\n**Your Instructions:**\\n1.  Read the task description carefully.\\n2.  Identify every example that demonstrates the function's behavior, usually in the form of an input and its corresponding expected output.\\n3.  List each input-output pair you find.\\n4.  
**Crucially**, if you find absolutely no examples or test cases in the text, you must return the single word: **None**.\\n\\nDo not invent any new test cases. Only extract what is explicitly written.\", 'prompt_correctness': \"You are a senior software quality engineer. Your goal is to design a single, high-quality test case to verify the correctness of a function based on the task description provided. This test case must be **novel** and **not** one of the examples already mentioned in the description.\\n\\n**Task Description:**\\n---\\n\\n\\n## Task Description\\n\\nWrite a Python function to compute the factorial of a number.\\n\\n## Expert Knowledge & Context\\n\\nTo compute the factorial of a number in Python, you can create a function that takes an integer input and calculates the product of all positive integers up to that number. The factorial of a non-negative integer n is denoted as n! and is the product of all positive integers less than or equal to n.\\n\\nOne common way to compute the factorial is using a recursive function that calls itself with a decreasing value of n until it reaches 1, at which point it returns 1. The function then multiplies the current value of n with the result of the recursive call.\\n\\nAnother approach is to use a loop to iterate from 1 to n and multiply the current result with the loop index in each iteration. 
This method can be more efficient for large values of n as it avoids the overhead of function calls.\\n\\nIt's important to consider edge cases such as handling negative inputs (which should result in an error or an exception), handling the factorial of 0 (which is defined as 1), and ensuring the function can handle large numbers without overflowing.\\n\\nOverall, understanding the concept of factorials and implementing an efficient algorithm in Python can be beneficial for solving various mathematical and programming problems that require the computation of factorials.\\n---\\n\\n**Your Thought Process (Follow these steps meticulously):**\\n\\n1.  **Analyze the Task:**\\n    * What is the primary goal of the function?\\n    * What are the specified inputs and their data types?\\n    * What is the specified output and its data type?\\n    * What are the constraints and edge cases (e.g., empty inputs, negative numbers, large values, specific formats)?\\n\\n2.  **Identify a Novel Test Scenario:**\\n    * Review the examples in the description (if any) and consciously choose a different category of input. Consider edge cases, corner cases, or common failure points for this type of problem. For example, if the description tests positive numbers, consider testing negative numbers, zero, or a mix.\\n\\n3.  **Generate the Test Case (Show your work):**\\n    * **Step A: Propose the Input.** State the exact input you will use for your test.\\n    * **Step B: Reason Step-by-Step.** Walk through the logic required by the task description, applying it to your chosen input. Explain how you arrive at the final output. This reasoning is the most important part.\\n    * **Step C: State the Expected Output.** Clearly state the final, correct output based on your reasoning.\\n\\nProvide your analysis and the final test case (Input, Reasoning, and Output).\", 'prompt_runnable_check': 'You are a build engineer creating a \"smoke test\". 
The goal is **not** to check for correctness, but simply to ensure a function is runnable (i.e., it can be called without crashing due to syntax errors or basic type mismatches).\\n\\n**Task Description:**\\n---\\n\\n\\n## Task Description\\n\\nWrite a Python function to compute the factorial of a number.\\n\\n## Expert Knowledge & Context\\n\\nTo compute the factorial of a number in Python, you can create a function that takes an integer input and calculates the product of all positive integers up to that number. The factorial of a non-negative integer n is denoted as n! and is the product of all positive integers less than or equal to n.\\n\\nOne common way to compute the factorial is using a recursive function that calls itself with a decreasing value of n until it reaches 1, at which point it returns 1. The function then multiplies the current value of n with the result of the recursive call.\\n\\nAnother approach is to use a loop to iterate from 1 to n and multiply the current result with the loop index in each iteration. This method can be more efficient for large values of n as it avoids the overhead of function calls.\\n\\nIt\\'s important to consider edge cases such as handling negative inputs (which should result in an error or an exception), handling the factorial of 0 (which is defined as 1), and ensuring the function can handle large numbers without overflowing.\\n\\nOverall, understanding the concept of factorials and implementing an efficient algorithm in Python can be beneficial for solving various mathematical and programming problems that require the computation of factorials.\\n---\\n\\n**Your Instructions:**\\n\\n1.  **Analyze the Function Signature:** Based on the task description, determine the expected data type and structure of the input arguments. For example, does it take a single integer, a list of strings, two arguments?\\n\\n2.  
**Propose a Mock Input:** Create the simplest possible, valid input that conforms to the signature you identified.\\n\\n3.  **Describe the Test Logic:** Explain that the test involves calling the function with this mock input. The test passes if the function executes and returns *anything* without raising an exception. The actual return value does not matter for this specific test. Describe the necessary components for a test that would run the function and catch any potential errors.', 'raw_in_content': '**Examples:**\\n- Example 1:\\n    - Input: 3\\n    - Output: 6\\n    \\n- Example 2:\\n    - Input: 5\\n    - Output: 120\\n    \\n- Example 3:\\n    - Input: 0\\n    - Output: 1', 'raw_runnable_check': '**Function Signature:**\\n\\nThe function should take a single integer input representing the number for which we want to calculate the factorial.\\n\\n```python\\ndef factorial(n: int) -> int:\\n    ...\\n```\\n\\n**Mock Input:**\\n\\nA valid mock input to test the function would be the number 5.\\n\\n**Test Logic:**\\n\\n1. Define the function `factorial(n: int) -> int`.\\n2. Call the function with the mock input `factorial(5)`.\\n3. If the function executes without any syntax errors or exceptions, the smoke test is successful.\\n4. 
The specific return value of the function does not matter for this test.', 'raw_correctness': [\"**Analysis:**\\n\\n- **Primary Goal:** The primary goal of the function is to compute the factorial of a given number.\\n  \\n- **Specified Inputs and Data Types:** The function should take an integer input `n` representing the number for which the factorial needs to be computed.\\n  \\n- **Specified Output and Data Type:** The output should be an integer, the factorial of the given number `n`.\\n  \\n- **Constraints and Edge Cases:**\\n  - The function should handle negative inputs by raising an exception or providing an error message.\\n  - The factorial of 0 is defined as 1.\\n  - The function should be able to handle large values without overflowing.\\n\\n**Identify a Novel Test Scenario:**\\nI will test the function with a large positive integer as input to ensure it can handle and compute the factorial correctly for extensive numbers.\\n\\n**Generate the Test Case:**\\n\\n**Step A:**\\n- Input: 15\\n\\n**Step B:**\\n- This test case will check the function's ability to compute the factorial of a very large positive integer.\\n- Factorial of 15: 15! = 15 x 14 x 13 x ... 
x 1\\n\\n**Step C:**\\n- Expected Output: 1307674368000 (15 factorial)\\n\\nTherefore, the test case is as follows:\\n```python\\nassert factorial(15) == 1307674368000\\n```\", '**Analysis:**\\n\\n- The primary goal of the function is to compute the factorial of a given number.\\n- The specified input is an integer representing the number for which we need to compute the factorial.\\n- The specified output is an integer representing the computed factorial value.\\n- Constraints and edge cases to consider: handling negative inputs (should result in an error or an exception), factorial of 0 (defined as 1), handling large numbers without overflowing.\\n\\n**Identify a Novel Test Scenario:**\\n\\nFor this test case, we will consider testing the maximum possible positive integer value for the input to see how the function handles a large number.\\n\\n**Generate the Test Case:**\\n\\n- **Step A: Propose the Input:** Input integer = 20 (the maximum positive integer value for testing)\\n\\n- **Step B: Reasoning:**\\n    - Factorial of 20 can be calculated as 20! = 20 x 19 x 18 x ... x 1\\n    - Utilizing a loop to compute the factorial can be more efficient for larger values\\n    - Therefore, calculate factorial of 20 using a loop\\n\\n- **Step C: State the Expected Output:** Expected output = 2432902008176640000\\n\\nThus, the test case is as follows:\\n\\n**Test Case:**\\n- Input: 20\\n- Expected Output: 2432902008176640000\\n\\nThis test case will verify how well the function can handle and compute the factorial of a large input number such as 20, ensuring that the implementation can handle computations efficiently and correctly even with significant values.', '**Analysis:**\\n\\n1. **Primary Goal:** The primary goal of the function is to calculate the factorial of a given number.\\n\\n2. **Specified Inputs:**\\n   - Input: An integer representing the number for which the factorial needs to be calculated.\\n\\n3. 
**Specified Output:**\\n   - Output: An integer representing the factorial of the input number.\\n\\n4. **Constraints and Edge Cases:**\\n   - The input should be a non-negative integer.\\n   - The factorial of 0 is defined as 1.\\n\\n**Identify a Novel Test Scenario:**\\n\\nFor this test case, we will consider testing a large input number to ensure that the function can handle large values without overflowing.\\n\\n**Generate the Test Case:**\\n\\n*Step A: Input*\\n- Input: 10\\n\\n*Step B: Reasoning*\\n- We will calculate the factorial of 10 using the loop method.\\n- Iterating from 1 to 10, we will multiply the current result with the loop index.\\n- Calculation: 1 x 2 x 3 x 4 x 5 x 6 x 7 x 8 x 9 x 10\\n\\n*Step C: Expected Output*\\n- Output: 3628800\\n\\nTherefore, the test case is as follows:\\n- Input: 10\\n- Expected Output: 3628800', '**Analysis:**\\n\\n1. **Primary Goal:** The function should compute the factorial of a given number.\\n\\n2. **Specified Inputs:**\\n   - Input: An integer representing the number for which the factorial needs to be computed.\\n\\n3. **Specified Output:**\\n   - Output: An integer representing the factorial of the input number.\\n\\n4. **Constraints:**\\n   - The input number should be a non-negative integer.\\n   - The factorial of 0 is defined as 1.\\n\\n**Identify a Novel Test Scenario:**\\nI will choose the edge case of calculating the factorial of the maximum allowed integer value in Python (sys.maxsize). 
This test will help verify if the function can handle large values without causing an integer overflow.\\n\\n**Generate the Test Case:**\\n\\n*Step A:*\\n- Input: sys.maxsize\\n\\n*Step B:*\\n- Since sys.maxsize represents the largest integer value supported by the system, calculating the factorial of this number will involve a large number of multiplication operations.\\n- The factorial of any number n is the product of all positive integers up to n.\\n- For sys.maxsize, which is already a very large number, calculating its factorial should result in an extremely large number.\\n\\n*Step C:*\\n- Expected Output: A very large integer representing the factorial of sys.maxsize.', \"**1. Analyze the Task:**\\n- Primary Goal: The function should compute the factorial of a given number.\\n- Inputs: The input is a non-negative integer.\\n- Output: The output is the factorial of the input number, which is also an integer.\\n- Constraints: Handle negative inputs and the factorial of 0 appropriately.\\n\\n**2. Identify a Novel Test Scenario:**\\nLet's consider testing the function with a large input number to ensure it can handle computation efficiently for big factorials.\\n\\n**3. Generate the Test Case:**\\n\\n**Step A: Propose the Input**\\nInput: 20\\n\\n**Step B: Reasoning Step-by-Step**\\nFor the input of 20:\\nFactorial(20) = 20 * Factorial(19)\\n              = 20 * 19 * Factorial(18)\\n              = 20 * 19 * 18 * Factorial(17)\\n                             ...\\n              = 20 * 19 * 18 * ... 
* 1\\n\\n**Step C: State the Expected Output**\\nExpected Output: 2432902008176640000\\n\\nThis test case ensures that the function can handle larger inputs and compute their factorials correctly without overflowing or being inefficient.\"], 'test_cases': {0: {'test_function': <function test_case at 0x000001FFCDCAB1C0>, 'test_function_string': 'import traceback\\n\\ndef test_case(func_to_test):\\n    try:\\n        input_val = 3\\n        expected_output = 6\\n        actual_output = func_to_test(input_val)\\n        if actual_output == expected_output:\\n            return True, \"Test passed for input 3.\"\\n        else:\\n            return False, f\"Test failed for input {input_val}. Expected: {expected_output}, Got: {actual_output}\"\\n    except Exception as e:\\n        return False, f\"Test failed for input {input_val} with exception: {e}\\\\n{traceback.format_exc()}\"', 'description': '**Examples:**\\n- Example 1:\\n    - Input: 3\\n    - Output: 6\\n    \\n- Example 2:\\n    - Input: 5\\n    - Output: 120\\n    \\n- Example 3:\\n    - Input: 0\\n    - Output: 1', 'type': 'in_content'}, 1: {'test_function': <function test_case at 0x000001FFCDCAAEF0>, 'test_function_string': 'import traceback\\n\\ndef test_case(func_to_test):\\n    try:\\n        input_val = 5\\n        expected_output = 120\\n        actual_output = func_to_test(input_val)\\n        if actual_output == expected_output:\\n            return True, \"Test passed for input 5.\"\\n        else:\\n            return False, f\"Test failed for input {input_val}. 
Expected: {expected_output}, Got: {actual_output}\"\\n    except Exception as e:\\n        return False, f\"Test failed for input {input_val} with exception: {e}\\\\n{traceback.format_exc()}\"', 'description': '**Examples:**\\n- Example 1:\\n    - Input: 3\\n    - Output: 6\\n    \\n- Example 2:\\n    - Input: 5\\n    - Output: 120\\n    \\n- Example 3:\\n    - Input: 0\\n    - Output: 1', 'type': 'in_content'}, 2: {'test_function': <function test_case at 0x000001FFCDCAA950>, 'test_function_string': 'import traceback\\n\\ndef test_case(func_to_test):\\n    try:\\n        input_val = 0\\n        expected_output = 1\\n        actual_output = func_to_test(input_val)\\n        if actual_output == expected_output:\\n            return True, \"Test passed for input 0.\"\\n        else:\\n            return False, f\"Test failed for input {input_val}. Expected: {expected_output}, Got: {actual_output}\"\\n    except Exception as e:\\n        return False, f\"Test failed for input {input_val} with exception: {e}\\\\n{traceback.format_exc()}\"', 'description': '**Examples:**\\n- Example 1:\\n    - Input: 3\\n    - Output: 6\\n    \\n- Example 2:\\n    - Input: 5\\n    - Output: 120\\n    \\n- Example 3:\\n    - Input: 0\\n    - Output: 1', 'type': 'in_content'}, 3: {'test_function': <function test_case at 0x000001FFCDCAAC20>, 'test_function_string': 'import traceback\\n\\ndef test_case(func_to_test):\\n    try:\\n        actual_output = func_to_test(5)\\n        return True, \"Smoke test passed.\"\\n    except Exception as e:\\n        return False, f\"Smoke test failed with exception: {e}\\\\n{traceback.format_exc()}\"', 'description': '**Function Signature:**\\n\\nThe function should take a single integer input representing the number for which we want to calculate the factorial.\\n\\n```python\\ndef factorial(n: int) -> int:\\n    ...\\n```\\n\\n**Mock Input:**\\n\\nA valid mock input to test the function would be the number 5.\\n\\n**Test Logic:**\\n\\n1. 
Define the function `factorial(n: int) -> int`.\\n2. Call the function with the mock input `factorial(5)`.\\n3. If the function executes without any syntax errors or exceptions, the smoke test is successful.\\n4. The specific return value of the function does not matter for this test.', 'type': 'runnable'}, 4: {'test_function': <function test_case at 0x000001FFCDCAB0A0>, 'test_function_string': 'import traceback\\n\\ndef test_case(func_to_test):\\n    try:\\n        input_val = 15\\n        expected_output = 1307674368000\\n        actual_output = func_to_test(input_val)\\n        if actual_output == expected_output:\\n            return True, \"Test passed for large positive integer input.\"\\n        else:\\n            return False, f\"Test failed for input {input_val}. Expected: {expected_output}, Got: {actual_output}\"\\n    except Exception as e:\\n        return False, f\"Test failed for input {input_val} with exception: {e}\\\\n{traceback.format_exc()}\"', 'description': \"**Analysis:**\\n\\n- **Primary Goal:** The primary goal of the function is to compute the factorial of a given number.\\n  \\n- **Specified Inputs and Data Types:** The function should take an integer input `n` representing the number for which the factorial needs to be computed.\\n  \\n- **Specified Output and Data Type:** The output should be an integer, the factorial of the given number `n`.\\n  \\n- **Constraints and Edge Cases:**\\n  - The function should handle negative inputs by raising an exception or providing an error message.\\n  - The factorial of 0 is defined as 1.\\n  - The function should be able to handle large values without overflowing.\\n\\n**Identify a Novel Test Scenario:**\\nI will test the function with a large positive integer as input to ensure it can handle and compute the factorial correctly for extensive numbers.\\n\\n**Generate the Test Case:**\\n\\n**Step A:**\\n- Input: 15\\n\\n**Step B:**\\n- This test case will check the function's ability to compute the 
factorial of a very large positive integer.\\n- Factorial of 15: 15! = 15 x 14 x 13 x ... x 1\\n\\n**Step C:**\\n- Expected Output: 1307674368000 (15 factorial)\\n\\nTherefore, the test case is as follows:\\n```python\\nassert factorial(15) == 1307674368000\\n```\", 'type': 'correctness'}, 5: {'test_function': <function test_case at 0x000001FFCDCAA680>, 'test_function_string': 'import traceback\\n\\ndef test_case(func_to_test):\\n    try:\\n        input_val = 20\\n        expected_output = 2432902008176640000\\n        actual_output = func_to_test(input_val)\\n        if actual_output == expected_output:\\n            return True, \"Test passed for factorial of 20.\"\\n        else:\\n            return False, f\"Test failed for input {input_val}. Expected: {expected_output}, Got: {actual_output}\"\\n    except Exception as e:\\n        return False, f\"Test failed for input {input_val} with exception: {e}\\\\n{traceback.format_exc()}\"', 'description': '**Analysis:**\\n\\n- The primary goal of the function is to compute the factorial of a given number.\\n- The specified input is an integer representing the number for which we need to compute the factorial.\\n- The specified output is an integer representing the computed factorial value.\\n- Constraints and edge cases to consider: handling negative inputs (should result in an error or an exception), factorial of 0 (defined as 1), handling large numbers without overflowing.\\n\\n**Identify a Novel Test Scenario:**\\n\\nFor this test case, we will consider testing the maximum possible positive integer value for the input to see how the function handles a large number.\\n\\n**Generate the Test Case:**\\n\\n- **Step A: Propose the Input:** Input integer = 20 (the maximum positive integer value for testing)\\n\\n- **Step B: Reasoning:**\\n    - Factorial of 20 can be calculated as 20! = 20 x 19 x 18 x ... 
x 1\\n    - Utilizing a loop to compute the factorial can be more efficient for larger values\\n    - Therefore, calculate factorial of 20 using a loop\\n\\n- **Step C: State the Expected Output:** Expected output = 2432902008176640000\\n\\nThus, the test case is as follows:\\n\\n**Test Case:**\\n- Input: 20\\n- Expected Output: 2432902008176640000\\n\\nThis test case will verify how well the function can handle and compute the factorial of a large input number such as 20, ensuring that the implementation can handle computations efficiently and correctly even with significant values.', 'type': 'correctness'}, 6: {'test_function': <function test_case at 0x000001FFCDCABB50>, 'test_function_string': 'import traceback\\n\\ndef test_case(func_to_test):\\n    try:\\n        input_val = 10\\n        expected_output = 3628800\\n        actual_output = func_to_test(input_val)\\n        if actual_output == expected_output:\\n            return True, \"Test passed for factorial of 10.\"\\n        else:\\n            return False, f\"Test failed for input {input_val}. Expected: {expected_output}, Got: {actual_output}\"\\n    except Exception as e:\\n        return False, f\"Test failed for input {input_val} with exception: {e}\\\\n{traceback.format_exc()}\"', 'description': '**Analysis:**\\n\\n1. **Primary Goal:** The primary goal of the function is to calculate the factorial of a given number.\\n\\n2. **Specified Inputs:**\\n   - Input: An integer representing the number for which the factorial needs to be calculated.\\n\\n3. **Specified Output:**\\n   - Output: An integer representing the factorial of the input number.\\n\\n4. 
**Constraints and Edge Cases:**\\n   - The input should be a non-negative integer.\\n   - The factorial of 0 is defined as 1.\\n\\n**Identify a Novel Test Scenario:**\\n\\nFor this test case, we will consider testing a large input number to ensure that the function can handle large values without overflowing.\\n\\n**Generate the Test Case:**\\n\\n*Step A: Input*\\n- Input: 10\\n\\n*Step B: Reasoning*\\n- We will calculate the factorial of 10 using the loop method.\\n- Iterating from 1 to 10, we will multiply the current result with the loop index.\\n- Calculation: 1 x 2 x 3 x 4 x 5 x 6 x 7 x 8 x 9 x 10\\n\\n*Step C: Expected Output*\\n- Output: 3628800\\n\\nTherefore, the test case is as follows:\\n- Input: 10\\n- Expected Output: 3628800', 'type': 'correctness'}, 7: {'test_function': <function test_case at 0x000001FFCDCAA8C0>, 'test_function_string': 'import sys\\nimport traceback\\n\\ndef test_case(func_to_test):\\n    try:\\n        input_val = sys.maxsize\\n        expected_output = 0  # Placeholder for the expected large factorial value\\n        actual_output = func_to_test(input_val)\\n        if actual_output > expected_output:\\n            return True, \"Test passed for calculating factorial of sys.maxsize.\"\\n        else:\\n            return False, f\"Test failed for input {input_val}. Expected: > {expected_output}, Got: {actual_output}\"\\n    except Exception as e:\\n        return False, f\"Test failed for input {input_val} with exception: {e}\\\\n{traceback.format_exc()}\"', 'description': '**Analysis:**\\n\\n1. **Primary Goal:** The function should compute the factorial of a given number.\\n\\n2. **Specified Inputs:**\\n   - Input: An integer representing the number for which the factorial needs to be computed.\\n\\n3. **Specified Output:**\\n   - Output: An integer representing the factorial of the input number.\\n\\n4. 
**Constraints:**\\n   - The input number should be a non-negative integer.\\n   - The factorial of 0 is defined as 1.\\n\\n**Identify a Novel Test Scenario:**\\nI will choose the edge case of calculating the factorial of the maximum allowed integer value in Python (sys.maxsize). This test will help verify if the function can handle large values without causing an integer overflow.\\n\\n**Generate the Test Case:**\\n\\n*Step A:*\\n- Input: sys.maxsize\\n\\n*Step B:*\\n- Since sys.maxsize represents the largest integer value supported by the system, calculating the factorial of this number will involve a large number of multiplication operations.\\n- The factorial of any number n is the product of all positive integers up to n.\\n- For sys.maxsize, which is already a very large number, calculating its factorial should result in an extremely large number.\\n\\n*Step C:*\\n- Expected Output: A very large integer representing the factorial of sys.maxsize.', 'type': 'correctness'}, 8: {'test_function': <function test_case at 0x000001FFCDCAB760>, 'test_function_string': 'import traceback\\n\\ndef test_case(func_to_test):\\n    try:\\n        input_val = 20\\n        expected_output = 2432902008176640000\\n        actual_output = func_to_test(input_val)\\n        if actual_output == expected_output:\\n            return True, \"Test passed for large input number (20).\"\\n        else:\\n            return False, f\"Test failed for input {input_val}. Expected: {expected_output}, Got: {actual_output}\"\\n    except Exception as e:\\n        return False, f\"Test failed for input {input_val} with exception: {e}\\\\n{traceback.format_exc()}\"', 'description': \"**1. 
Analyze the Task:**\\n- Primary Goal: The function should compute the factorial of a given number.\\n- Inputs: The input is a non-negative integer.\\n- Output: The output is the factorial of the input number, which is also an integer.\\n- Constraints: Handle negative inputs and the factorial of 0 appropriately.\\n\\n**2. Identify a Novel Test Scenario:**\\nLet's consider testing the function with a large input number to ensure it can handle computation efficiently for big factorials.\\n\\n**3. Generate the Test Case:**\\n\\n**Step A: Propose the Input**\\nInput: 20\\n\\n**Step B: Reasoning Step-by-Step**\\nFor the input of 20:\\nFactorial(20) = 20 * Factorial(19)\\n              = 20 * 19 * Factorial(18)\\n              = 20 * 19 * 18 * Factorial(17)\\n                             ...\\n              = 20 * 19 * 18 * ... * 1\\n\\n**Step C: State the Expected Output**\\nExpected Output: 2432902008176640000\\n\\nThis test case ensures that the function can handle larger inputs and compute their factorials correctly without overflowing or being inefficient.\", 'type': 'correctness'}}}\n"
     ]
    }
   ],
   "source": [
    "print(best_code)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "ec04bac3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "7\n"
     ]
    }
   ],
   "source": [
    "# Count the top-level entries held by `best_code`.\n",
    "print(len(best_code))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "e042356d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dict_keys(['prompt_in_content', 'prompt_correctness', 'prompt_runnable_check', 'raw_in_content', 'raw_runnable_check', 'raw_correctness', 'test_cases'])\n"
     ]
    }
   ],
   "source": [
    "# List the top-level keys of `best_code`.\n",
    "# NOTE(review): the recorded keys (prompt_*, raw_*, test_cases) look like a test-generation\n",
    "# result rather than source code - confirm, and consider a clearer name where it is assigned.\n",
    "print(best_code.keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "b115ae39",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{0: {'test_function': <function test_case at 0x000001FFCDCAB1C0>, 'test_function_string': 'import traceback\\n\\ndef test_case(func_to_test):\\n    try:\\n        input_val = 3\\n        expected_output = 6\\n        actual_output = func_to_test(input_val)\\n        if actual_output == expected_output:\\n            return True, \"Test passed for input 3.\"\\n        else:\\n            return False, f\"Test failed for input {input_val}. Expected: {expected_output}, Got: {actual_output}\"\\n    except Exception as e:\\n        return False, f\"Test failed for input {input_val} with exception: {e}\\\\n{traceback.format_exc()}\"', 'description': '**Examples:**\\n- Example 1:\\n    - Input: 3\\n    - Output: 6\\n    \\n- Example 2:\\n    - Input: 5\\n    - Output: 120\\n    \\n- Example 3:\\n    - Input: 0\\n    - Output: 1', 'type': 'in_content'}, 1: {'test_function': <function test_case at 0x000001FFCDCAAEF0>, 'test_function_string': 'import traceback\\n\\ndef test_case(func_to_test):\\n    try:\\n        input_val = 5\\n        expected_output = 120\\n        actual_output = func_to_test(input_val)\\n        if actual_output == expected_output:\\n            return True, \"Test passed for input 5.\"\\n        else:\\n            return False, f\"Test failed for input {input_val}. 
Expected: {expected_output}, Got: {actual_output}\"\\n    except Exception as e:\\n        return False, f\"Test failed for input {input_val} with exception: {e}\\\\n{traceback.format_exc()}\"', 'description': '**Examples:**\\n- Example 1:\\n    - Input: 3\\n    - Output: 6\\n    \\n- Example 2:\\n    - Input: 5\\n    - Output: 120\\n    \\n- Example 3:\\n    - Input: 0\\n    - Output: 1', 'type': 'in_content'}, 2: {'test_function': <function test_case at 0x000001FFCDCAA950>, 'test_function_string': 'import traceback\\n\\ndef test_case(func_to_test):\\n    try:\\n        input_val = 0\\n        expected_output = 1\\n        actual_output = func_to_test(input_val)\\n        if actual_output == expected_output:\\n            return True, \"Test passed for input 0.\"\\n        else:\\n            return False, f\"Test failed for input {input_val}. Expected: {expected_output}, Got: {actual_output}\"\\n    except Exception as e:\\n        return False, f\"Test failed for input {input_val} with exception: {e}\\\\n{traceback.format_exc()}\"', 'description': '**Examples:**\\n- Example 1:\\n    - Input: 3\\n    - Output: 6\\n    \\n- Example 2:\\n    - Input: 5\\n    - Output: 120\\n    \\n- Example 3:\\n    - Input: 0\\n    - Output: 1', 'type': 'in_content'}, 3: {'test_function': <function test_case at 0x000001FFCDCAAC20>, 'test_function_string': 'import traceback\\n\\ndef test_case(func_to_test):\\n    try:\\n        actual_output = func_to_test(5)\\n        return True, \"Smoke test passed.\"\\n    except Exception as e:\\n        return False, f\"Smoke test failed with exception: {e}\\\\n{traceback.format_exc()}\"', 'description': '**Function Signature:**\\n\\nThe function should take a single integer input representing the number for which we want to calculate the factorial.\\n\\n```python\\ndef factorial(n: int) -> int:\\n    ...\\n```\\n\\n**Mock Input:**\\n\\nA valid mock input to test the function would be the number 5.\\n\\n**Test Logic:**\\n\\n1. 
Define the function `factorial(n: int) -> int`.\\n2. Call the function with the mock input `factorial(5)`.\\n3. If the function executes without any syntax errors or exceptions, the smoke test is successful.\\n4. The specific return value of the function does not matter for this test.', 'type': 'runnable'}, 4: {'test_function': <function test_case at 0x000001FFCDCAB0A0>, 'test_function_string': 'import traceback\\n\\ndef test_case(func_to_test):\\n    try:\\n        input_val = 15\\n        expected_output = 1307674368000\\n        actual_output = func_to_test(input_val)\\n        if actual_output == expected_output:\\n            return True, \"Test passed for large positive integer input.\"\\n        else:\\n            return False, f\"Test failed for input {input_val}. Expected: {expected_output}, Got: {actual_output}\"\\n    except Exception as e:\\n        return False, f\"Test failed for input {input_val} with exception: {e}\\\\n{traceback.format_exc()}\"', 'description': \"**Analysis:**\\n\\n- **Primary Goal:** The primary goal of the function is to compute the factorial of a given number.\\n  \\n- **Specified Inputs and Data Types:** The function should take an integer input `n` representing the number for which the factorial needs to be computed.\\n  \\n- **Specified Output and Data Type:** The output should be an integer, the factorial of the given number `n`.\\n  \\n- **Constraints and Edge Cases:**\\n  - The function should handle negative inputs by raising an exception or providing an error message.\\n  - The factorial of 0 is defined as 1.\\n  - The function should be able to handle large values without overflowing.\\n\\n**Identify a Novel Test Scenario:**\\nI will test the function with a large positive integer as input to ensure it can handle and compute the factorial correctly for extensive numbers.\\n\\n**Generate the Test Case:**\\n\\n**Step A:**\\n- Input: 15\\n\\n**Step B:**\\n- This test case will check the function's ability to compute the 
factorial of a very large positive integer.\\n- Factorial of 15: 15! = 15 x 14 x 13 x ... x 1\\n\\n**Step C:**\\n- Expected Output: 1307674368000 (15 factorial)\\n\\nTherefore, the test case is as follows:\\n```python\\nassert factorial(15) == 1307674368000\\n```\", 'type': 'correctness'}, 5: {'test_function': <function test_case at 0x000001FFCDCAA680>, 'test_function_string': 'import traceback\\n\\ndef test_case(func_to_test):\\n    try:\\n        input_val = 20\\n        expected_output = 2432902008176640000\\n        actual_output = func_to_test(input_val)\\n        if actual_output == expected_output:\\n            return True, \"Test passed for factorial of 20.\"\\n        else:\\n            return False, f\"Test failed for input {input_val}. Expected: {expected_output}, Got: {actual_output}\"\\n    except Exception as e:\\n        return False, f\"Test failed for input {input_val} with exception: {e}\\\\n{traceback.format_exc()}\"', 'description': '**Analysis:**\\n\\n- The primary goal of the function is to compute the factorial of a given number.\\n- The specified input is an integer representing the number for which we need to compute the factorial.\\n- The specified output is an integer representing the computed factorial value.\\n- Constraints and edge cases to consider: handling negative inputs (should result in an error or an exception), factorial of 0 (defined as 1), handling large numbers without overflowing.\\n\\n**Identify a Novel Test Scenario:**\\n\\nFor this test case, we will consider testing the maximum possible positive integer value for the input to see how the function handles a large number.\\n\\n**Generate the Test Case:**\\n\\n- **Step A: Propose the Input:** Input integer = 20 (the maximum positive integer value for testing)\\n\\n- **Step B: Reasoning:**\\n    - Factorial of 20 can be calculated as 20! = 20 x 19 x 18 x ... 
x 1\\n    - Utilizing a loop to compute the factorial can be more efficient for larger values\\n    - Therefore, calculate factorial of 20 using a loop\\n\\n- **Step C: State the Expected Output:** Expected output = 2432902008176640000\\n\\nThus, the test case is as follows:\\n\\n**Test Case:**\\n- Input: 20\\n- Expected Output: 2432902008176640000\\n\\nThis test case will verify how well the function can handle and compute the factorial of a large input number such as 20, ensuring that the implementation can handle computations efficiently and correctly even with significant values.', 'type': 'correctness'}, 6: {'test_function': <function test_case at 0x000001FFCDCABB50>, 'test_function_string': 'import traceback\\n\\ndef test_case(func_to_test):\\n    try:\\n        input_val = 10\\n        expected_output = 3628800\\n        actual_output = func_to_test(input_val)\\n        if actual_output == expected_output:\\n            return True, \"Test passed for factorial of 10.\"\\n        else:\\n            return False, f\"Test failed for input {input_val}. Expected: {expected_output}, Got: {actual_output}\"\\n    except Exception as e:\\n        return False, f\"Test failed for input {input_val} with exception: {e}\\\\n{traceback.format_exc()}\"', 'description': '**Analysis:**\\n\\n1. **Primary Goal:** The primary goal of the function is to calculate the factorial of a given number.\\n\\n2. **Specified Inputs:**\\n   - Input: An integer representing the number for which the factorial needs to be calculated.\\n\\n3. **Specified Output:**\\n   - Output: An integer representing the factorial of the input number.\\n\\n4. 
**Constraints and Edge Cases:**\\n   - The input should be a non-negative integer.\\n   - The factorial of 0 is defined as 1.\\n\\n**Identify a Novel Test Scenario:**\\n\\nFor this test case, we will consider testing a large input number to ensure that the function can handle large values without overflowing.\\n\\n**Generate the Test Case:**\\n\\n*Step A: Input*\\n- Input: 10\\n\\n*Step B: Reasoning*\\n- We will calculate the factorial of 10 using the loop method.\\n- Iterating from 1 to 10, we will multiply the current result with the loop index.\\n- Calculation: 1 x 2 x 3 x 4 x 5 x 6 x 7 x 8 x 9 x 10\\n\\n*Step C: Expected Output*\\n- Output: 3628800\\n\\nTherefore, the test case is as follows:\\n- Input: 10\\n- Expected Output: 3628800', 'type': 'correctness'}, 7: {'test_function': <function test_case at 0x000001FFCDCAA8C0>, 'test_function_string': 'import sys\\nimport traceback\\n\\ndef test_case(func_to_test):\\n    try:\\n        input_val = sys.maxsize\\n        expected_output = 0  # Placeholder for the expected large factorial value\\n        actual_output = func_to_test(input_val)\\n        if actual_output > expected_output:\\n            return True, \"Test passed for calculating factorial of sys.maxsize.\"\\n        else:\\n            return False, f\"Test failed for input {input_val}. Expected: > {expected_output}, Got: {actual_output}\"\\n    except Exception as e:\\n        return False, f\"Test failed for input {input_val} with exception: {e}\\\\n{traceback.format_exc()}\"', 'description': '**Analysis:**\\n\\n1. **Primary Goal:** The function should compute the factorial of a given number.\\n\\n2. **Specified Inputs:**\\n   - Input: An integer representing the number for which the factorial needs to be computed.\\n\\n3. **Specified Output:**\\n   - Output: An integer representing the factorial of the input number.\\n\\n4. 
**Constraints:**\\n   - The input number should be a non-negative integer.\\n   - The factorial of 0 is defined as 1.\\n\\n**Identify a Novel Test Scenario:**\\nI will choose the edge case of calculating the factorial of the maximum allowed integer value in Python (sys.maxsize). This test will help verify if the function can handle large values without causing an integer overflow.\\n\\n**Generate the Test Case:**\\n\\n*Step A:*\\n- Input: sys.maxsize\\n\\n*Step B:*\\n- Since sys.maxsize represents the largest integer value supported by the system, calculating the factorial of this number will involve a large number of multiplication operations.\\n- The factorial of any number n is the product of all positive integers up to n.\\n- For sys.maxsize, which is already a very large number, calculating its factorial should result in an extremely large number.\\n\\n*Step C:*\\n- Expected Output: A very large integer representing the factorial of sys.maxsize.', 'type': 'correctness'}, 8: {'test_function': <function test_case at 0x000001FFCDCAB760>, 'test_function_string': 'import traceback\\n\\ndef test_case(func_to_test):\\n    try:\\n        input_val = 20\\n        expected_output = 2432902008176640000\\n        actual_output = func_to_test(input_val)\\n        if actual_output == expected_output:\\n            return True, \"Test passed for large input number (20).\"\\n        else:\\n            return False, f\"Test failed for input {input_val}. Expected: {expected_output}, Got: {actual_output}\"\\n    except Exception as e:\\n        return False, f\"Test failed for input {input_val} with exception: {e}\\\\n{traceback.format_exc()}\"', 'description': \"**1. 
Analyze the Task:**\\n- Primary Goal: The function should compute the factorial of a given number.\\n- Inputs: The input is a non-negative integer.\\n- Output: The output is the factorial of the input number, which is also an integer.\\n- Constraints: Handle negative inputs and the factorial of 0 appropriately.\\n\\n**2. Identify a Novel Test Scenario:**\\nLet's consider testing the function with a large input number to ensure it can handle computation efficiently for big factorials.\\n\\n**3. Generate the Test Case:**\\n\\n**Step A: Propose the Input**\\nInput: 20\\n\\n**Step B: Reasoning Step-by-Step**\\nFor the input of 20:\\nFactorial(20) = 20 * Factorial(19)\\n              = 20 * 19 * Factorial(18)\\n              = 20 * 19 * 18 * Factorial(17)\\n                             ...\\n              = 20 * 19 * 18 * ... * 1\\n\\n**Step C: State the Expected Output**\\nExpected Output: 2432902008176640000\\n\\nThis test case ensures that the function can handle larger inputs and compute their factorials correctly without overflowing or being inefficient.\", 'type': 'correctness'}}\n"
     ]
    }
   ],
   "source": [
    "print(best_code['test_cases'])  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "id": "5f4ba597",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Test case 0: \n",
      "import traceback\n",
      "\n",
      "def test_case(func_to_test):\n",
      "    try:\n",
      "        input_val = 3\n",
      "        expected_output = 6\n",
      "        actual_output = func_to_test(input_val)\n",
      "        if actual_output == expected_output:\n",
      "            return True, \"Test passed for input 3.\"\n",
      "        else:\n",
      "            return False, f\"Test failed for input {input_val}. Expected: {expected_output}, Got: {actual_output}\"\n",
      "    except Exception as e:\n",
      "        return False, f\"Test failed for input {input_val} with exception: {e}\\n{traceback.format_exc()}\", \n",
      "Type: in_content\n",
      "Test case 1: \n",
      "import traceback\n",
      "\n",
      "def test_case(func_to_test):\n",
      "    try:\n",
      "        input_val = 5\n",
      "        expected_output = 120\n",
      "        actual_output = func_to_test(input_val)\n",
      "        if actual_output == expected_output:\n",
      "            return True, \"Test passed for input 5.\"\n",
      "        else:\n",
      "            return False, f\"Test failed for input {input_val}. Expected: {expected_output}, Got: {actual_output}\"\n",
      "    except Exception as e:\n",
      "        return False, f\"Test failed for input {input_val} with exception: {e}\\n{traceback.format_exc()}\", \n",
      "Type: in_content\n",
      "Test case 2: \n",
      "import traceback\n",
      "\n",
      "def test_case(func_to_test):\n",
      "    try:\n",
      "        input_val = 0\n",
      "        expected_output = 1\n",
      "        actual_output = func_to_test(input_val)\n",
      "        if actual_output == expected_output:\n",
      "            return True, \"Test passed for input 0.\"\n",
      "        else:\n",
      "            return False, f\"Test failed for input {input_val}. Expected: {expected_output}, Got: {actual_output}\"\n",
      "    except Exception as e:\n",
      "        return False, f\"Test failed for input {input_val} with exception: {e}\\n{traceback.format_exc()}\", \n",
      "Type: in_content\n",
      "Test case 3: \n",
      "import traceback\n",
      "\n",
      "def test_case(func_to_test):\n",
      "    try:\n",
      "        actual_output = func_to_test(5)\n",
      "        return True, \"Smoke test passed.\"\n",
      "    except Exception as e:\n",
      "        return False, f\"Smoke test failed with exception: {e}\\n{traceback.format_exc()}\", \n",
      "Type: runnable\n",
      "Test case 4: \n",
      "import traceback\n",
      "\n",
      "def test_case(func_to_test):\n",
      "    try:\n",
      "        input_val = 15\n",
      "        expected_output = 1307674368000\n",
      "        actual_output = func_to_test(input_val)\n",
      "        if actual_output == expected_output:\n",
      "            return True, \"Test passed for large positive integer input.\"\n",
      "        else:\n",
      "            return False, f\"Test failed for input {input_val}. Expected: {expected_output}, Got: {actual_output}\"\n",
      "    except Exception as e:\n",
      "        return False, f\"Test failed for input {input_val} with exception: {e}\\n{traceback.format_exc()}\", \n",
      "Type: correctness\n",
      "Test case 5: \n",
      "import traceback\n",
      "\n",
      "def test_case(func_to_test):\n",
      "    try:\n",
      "        input_val = 20\n",
      "        expected_output = 2432902008176640000\n",
      "        actual_output = func_to_test(input_val)\n",
      "        if actual_output == expected_output:\n",
      "            return True, \"Test passed for factorial of 20.\"\n",
      "        else:\n",
      "            return False, f\"Test failed for input {input_val}. Expected: {expected_output}, Got: {actual_output}\"\n",
      "    except Exception as e:\n",
      "        return False, f\"Test failed for input {input_val} with exception: {e}\\n{traceback.format_exc()}\", \n",
      "Type: correctness\n",
      "Test case 6: \n",
      "import traceback\n",
      "\n",
      "def test_case(func_to_test):\n",
      "    try:\n",
      "        input_val = 10\n",
      "        expected_output = 3628800\n",
      "        actual_output = func_to_test(input_val)\n",
      "        if actual_output == expected_output:\n",
      "            return True, \"Test passed for factorial of 10.\"\n",
      "        else:\n",
      "            return False, f\"Test failed for input {input_val}. Expected: {expected_output}, Got: {actual_output}\"\n",
      "    except Exception as e:\n",
      "        return False, f\"Test failed for input {input_val} with exception: {e}\\n{traceback.format_exc()}\", \n",
      "Type: correctness\n",
      "Test case 7: \n",
      "import sys\n",
      "import traceback\n",
      "\n",
      "def test_case(func_to_test):\n",
      "    try:\n",
      "        input_val = sys.maxsize\n",
      "        expected_output = 0  # Placeholder for the expected large factorial value\n",
      "        actual_output = func_to_test(input_val)\n",
      "        if actual_output > expected_output:\n",
      "            return True, \"Test passed for calculating factorial of sys.maxsize.\"\n",
      "        else:\n",
      "            return False, f\"Test failed for input {input_val}. Expected: > {expected_output}, Got: {actual_output}\"\n",
      "    except Exception as e:\n",
      "        return False, f\"Test failed for input {input_val} with exception: {e}\\n{traceback.format_exc()}\", \n",
      "Type: correctness\n",
      "Test case 8: \n",
      "import traceback\n",
      "\n",
      "def test_case(func_to_test):\n",
      "    try:\n",
      "        input_val = 20\n",
      "        expected_output = 2432902008176640000\n",
      "        actual_output = func_to_test(input_val)\n",
      "        if actual_output == expected_output:\n",
      "            return True, \"Test passed for large input number (20).\"\n",
      "        else:\n",
      "            return False, f\"Test failed for input {input_val}. Expected: {expected_output}, Got: {actual_output}\"\n",
      "    except Exception as e:\n",
      "        return False, f\"Test failed for input {input_val} with exception: {e}\\n{traceback.format_exc()}\", \n",
      "Type: correctness\n"
     ]
    }
   ],
   "source": [
    "# print each test case with its type\n",
    "for i, (id, test_case) in enumerate(best_code['test_cases'].items()):\n",
    "    print(f\"Test case {i}: \\n{test_case['test_function_string']}, \\nType: {test_case['type']}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "0f222807",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Input Data (LCP on GPT-4): \"96.95 88.41 92.51 89.70\"\n",
      "Formatted LaTeX Output: ['(43.4\\\\%)', '(74.7\\\\%)', '(35.4\\\\%)', '(71.8\\\\%)']\n",
      "--------------------\n"
     ]
    }
   ],
   "source": [
    "def format_improvement_rate_latex(data_string, base_model):\n",
    "    \"\"\"\n",
    "    Calculates improvement rates and formats them as a list of LaTeX strings.\n",
    "\n",
    "    Args:\n",
    "        data_string (str): A string containing four space-separated float values.\n",
    "        base_model (str): The baseline model to compare against, either '3.5' or '4'.\n",
    "\n",
    "    Returns:\n",
    "        list: A list of strings, with each string being the LaTeX-formatted \n",
    "              improvement rate (e.g., '(29.1\\\\%)').\n",
    "              Returns an empty list if the inputs are invalid.\n",
    "    \"\"\"\n",
    "    # Baseline data for comparison\n",
    "    baseline_data = {\n",
    "        '3.5': [57.3, 42.7, 52.2, 36.8],\n",
    "        '4': [67.6, 50.6, 68.3, 52.2]\n",
    "    }\n",
    "\n",
    "    # Validate the base_model input\n",
    "    if base_model not in baseline_data:\n",
    "        print(f\"Error: Invalid base_model '{base_model}'. Please use '3.5' or '4'.\")\n",
    "        return []\n",
    "\n",
    "    # Select the correct baseline\n",
    "    baseline_scores = baseline_data[base_model]\n",
    "\n",
    "    try:\n",
    "        # Parse the input string into a list of floats\n",
    "        new_scores = [float(score) for score in data_string.split()]\n",
    "        if len(new_scores) != 4:\n",
    "            print(f\"Error: Expected 4 numbers in the data string, but got {len(new_scores)}.\")\n",
    "            return []\n",
    "    except ValueError:\n",
    "        print(\"Error: The data string contains non-numeric values.\")\n",
    "        return []\n",
    "\n",
    "    latex_rates = []\n",
    "    # Calculate the improvement rate for each score\n",
    "    for new_score, baseline_score in zip(new_scores, baseline_scores):\n",
    "        if baseline_score == 0:\n",
    "            # Avoid division by zero\n",
    "            rate = 0.0\n",
    "        else:\n",
    "            # Calculate the percentage change and round it\n",
    "            rate = round(((new_score - baseline_score) / baseline_score) * 100, 1)\n",
    "        \n",
    "        # Format the rate into the specified LaTeX string \"(rate\\%)\"\n",
    "        # The extra backslash is needed to escape the backslash in the f-string\n",
    "        latex_string = f\"({rate}\\%)\"\n",
    "        latex_rates.append(latex_string)\n",
    "\n",
    "    return latex_rates\n",
    "\n",
    "lcp_3_5_data = \"96.95 88.41 92.51 89.70\"\n",
    "print(f\"Input Data (LCP on GPT-4): \\\"{lcp_3_5_data}\\\"\")\n",
    "rates_3_5 = format_improvement_rate_latex(lcp_3_5_data, '4')\n",
    "print(\"Formatted LaTeX Output:\", rates_3_5)\n",
    "print(\"-\" * 20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e84f21f5",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "test01",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
