{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "     code  score\n",
      "0  code_1    0.9\n",
      "1  code_2    0.8\n",
      "2  code_3    0.7\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "new_data = {\n",
    "    \"code_id_1\": {\"code\": \"code_1\", \"score\":0.9},\n",
    "    \"code_id_2\": {\"code\": \"code_2\", \"score\":0.8},\n",
    "    \"code_id_3\": {\"code\": \"code_3\", \"score\":0.7},\n",
    "}\n",
    "new_data = pd.DataFrame.from_dict(new_data, orient='index').reset_index(drop=True)\n",
    "print(new_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def find_median_sorted_arrays(nums1, nums2):\n",
    "    # Ensure nums1 is the smaller array for efficient binary search\n",
    "    if len(nums1) > len(nums2):\n",
    "        nums1, nums2 = nums2, nums1\n",
    "\n",
    "    m, n = len(nums1), len(nums2)\n",
    "    total_length = m + n\n",
    "    left, right = 0, m\n",
    "\n",
    "    while left <= right:\n",
    "        # Partition nums1\n",
    "        partition1 = (left + right) // 2\n",
    "        # Partition nums2 is calculated based on partition1\n",
    "        partition2 = (total_length + 1) // 2 - partition1\n",
    "        \n",
    "        # Boundary values for partitions; if partition is at the edge, we give -inf or +inf\n",
    "        maxLeft1 = float('-inf') if partition1 == 0 else nums1[partition1 - 1]\n",
    "        minRight1 = float('inf') if partition1 == m else nums1[partition1]\n",
    "\n",
    "        maxLeft2 = float('-inf') if partition2 == 0 else nums2[partition2 - 1]\n",
    "        minRight2 = float('inf') if partition2 == n else nums2[partition2]\n",
    "\n",
    "        # Check if we've partitioned correctly\n",
    "        if maxLeft1 <= minRight2 and maxLeft2 <= minRight1:\n",
    "            # We have found the correct partition\n",
    "            if total_length % 2 == 0:  # Even length\n",
    "                print(maxLeft1, maxLeft2, minRight1, minRight2)\n",
    "                return (max(maxLeft1, maxLeft2) + min(minRight1, minRight2)) / 2\n",
    "            else:  # Odd length\n",
    "                return max(maxLeft1, maxLeft2)\n",
    "        \n",
    "        # If maxLeft1 is greater than minRight2, we need to move towards left in nums1\n",
    "        elif maxLeft1 > minRight2:\n",
    "            right = partition1 - 1\n",
    "        \n",
    "        # If maxLeft2 is greater than minRight1, we need to move towards right in nums1\n",
    "        else:\n",
    "            left = partition1 + 1\n",
    "\n",
    "def median_of_two_sorted_arrays(nums1, nums2):\n",
    "    return find_median_sorted_arrays(nums1, nums2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2 -inf inf 3\n",
      "2.5\n"
     ]
    }
   ],
   "source": [
    "print(median_of_two_sorted_arrays([1, 2], [3, 4]))  # Output: 2.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "True\n"
     ]
    }
   ],
   "source": [
    "def test_case_2(func):\n",
    "    nums1 = [1, 2]\n",
    "    nums2 = [3, 4]\n",
    "    result = func(nums1, nums2)\n",
    "    return result == 2.5\n",
    "\n",
    "print(test_case_2(find_median_sorted_arrays))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "import concurrent.futures\n",
    "import json\n",
    "from tqdm import tqdm\n",
    "\n",
    "def run_test(func_obj, test_func):\n",
    "    try:\n",
    "        return test_func(func_obj)\n",
    "    except Exception as e:\n",
    "        return False\n",
    "\n",
    "# def compile_code(code_str, main_function_name=None):\n",
    "#     try:\n",
    "#         exec(code_str, {}, local_vars := {})\n",
    "#         # 优先根据指定的主函数名称查找\n",
    "#         if main_function_name is not None:\n",
    "#             func = local_vars.get(main_function_name)\n",
    "#             return func if callable(func) else None\n",
    "#         # 未指定名称时查找第一个可调用对象\n",
    "#         return next((obj for obj in local_vars.values() if callable(obj)), None)\n",
    "#     except Exception as e:\n",
    "#         print(f\"Compilation Error: {str(e)}, code_str:\\n {code_str}\")\n",
    "#         return None\n",
    "\n",
    "def compile_code(code_str, main_function_name=None):\n",
    "    try:\n",
    "        local_vars = {}\n",
    "        exec(code_str, local_vars)  # Use one dict for globals and locals\n",
    "        if main_function_name is not None:\n",
    "            func = local_vars.get(main_function_name)\n",
    "            return func if callable(func) else None\n",
    "        return next((obj for obj in local_vars.values() if callable(obj)), None)\n",
    "    except Exception as e:\n",
    "        print(f\"Compilation Error: {str(e)}, code_str:\\n {code_str}\")\n",
    "        return None\n",
    "\n",
    "def run_all_tests(functions, test_cases, max_workers=5):\n",
    "    \"\"\"\n",
    "    Updated to handle new function structure with main_function_name\n",
    "    \"\"\"\n",
    "    # 编译函数（处理带主函数名称的情况）\n",
    "    compiled_functions = {\n",
    "        fid: compile_code(\n",
    "            code_info['code'],\n",
    "            main_function_name=code_info.get('main_function_name')\n",
    "        )\n",
    "        for fid, code_info in functions.items()\n",
    "    }\n",
    "    \n",
    "    # 编译测试用例（保持原有逻辑）\n",
    "    compiled_tests = {\n",
    "        tid: compile_code(code_info['test_function'])\n",
    "        for tid, code_info in test_cases.items()\n",
    "    }\n",
    "\n",
    "    # 准备结果字典\n",
    "    fun_results = {fid: {} for fid in functions}\n",
    "    test_results = {tid: {} for tid in test_cases}\n",
    "\n",
    "    total_tests = len(compiled_functions) * len(compiled_tests)\n",
    "    \n",
    "    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:\n",
    "        futures = {}\n",
    "        pbar = tqdm(total=total_tests, desc=\"Running tests\")\n",
    "        \n",
    "        # 提交测试任务\n",
    "        for func_id, func_obj in compiled_functions.items():\n",
    "            for test_id, test_func in compiled_tests.items():\n",
    "                # 处理编译失败的情况\n",
    "                if func_obj is None or test_func is None:\n",
    "                    fun_results[func_id][test_id] = False\n",
    "                    test_results[test_id][func_id] = False\n",
    "                    pbar.update(1)\n",
    "                    continue\n",
    "                \n",
    "                # 提交并发任务\n",
    "                future = executor.submit(run_test, func_obj, test_func)\n",
    "                futures[future] = (func_id, test_id)\n",
    "\n",
    "        # 处理测试结果\n",
    "        for future in concurrent.futures.as_completed(futures):\n",
    "            func_id, test_id = futures[future]\n",
    "            try:\n",
    "                result = future.result()\n",
    "            except Exception:\n",
    "                result = False\n",
    "            fun_results[func_id][test_id] = result\n",
    "            test_results[test_id][func_id] = result\n",
    "            pbar.update(1)\n",
    "        \n",
    "        pbar.close()\n",
    "    \n",
    "    return fun_results, test_results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Running tests: 100%|██████████| 26/26 [00:00<00:00, 12639.30it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Function Results:\n",
      "{'code_id_2': {'test_case_3_1': False, 'test_case_2_2': False, 'test_case_4': True, 'test_case_1_1': False, 'test_case_5': True, 'test_case_4_1': False, 'test_case_3': True, 'test_case_5_1': False, 'test_case_4_2': True, 'test_case_2_1': False, 'test_case_3_2': False, 'test_case_1': True, 'test_case_2': True, 'test_case_1_2': False, 'test_case_1_3': False, 'test_case_2_3': False, 'test_case_3_3': False, 'test_case_4_3': False, 'test_case_5_2': False, 'test_case_6': True, 'test_case_1_4': False, 'test_case_2_4': False, 'test_case_3_4': False, 'test_case_4_4': False, 'test_case_6_1': True, 'test_case_5_3': False}}\n",
      "Test Results:\n",
      "{'test_case_1': {'code_id_2': True}, 'test_case_2': {'code_id_2': True}, 'test_case_3': {'code_id_2': True}, 'test_case_4': {'code_id_2': True}, 'test_case_1_1': {'code_id_2': False}, 'test_case_2_1': {'code_id_2': False}, 'test_case_3_1': {'code_id_2': False}, 'test_case_4_1': {'code_id_2': False}, 'test_case_5': {'code_id_2': True}, 'test_case_1_2': {'code_id_2': False}, 'test_case_2_2': {'code_id_2': False}, 'test_case_3_2': {'code_id_2': False}, 'test_case_4_2': {'code_id_2': True}, 'test_case_5_1': {'code_id_2': False}, 'test_case_1_3': {'code_id_2': False}, 'test_case_2_3': {'code_id_2': False}, 'test_case_3_3': {'code_id_2': False}, 'test_case_4_3': {'code_id_2': False}, 'test_case_5_2': {'code_id_2': False}, 'test_case_6': {'code_id_2': True}, 'test_case_1_4': {'code_id_2': False}, 'test_case_2_4': {'code_id_2': False}, 'test_case_3_4': {'code_id_2': False}, 'test_case_4_4': {'code_id_2': False}, 'test_case_5_3': {'code_id_2': False}, 'test_case_6_1': {'code_id_2': True}}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "code_1 = \"\"\"# Component: merge_sorted_arrays\n",
    "def merge_sorted_arrays(nums1, nums2):\n",
    "    merged = []\n",
    "    i = j = 0\n",
    "    while i < len(nums1) and j < len(nums2):\n",
    "        if nums1[i] < nums2[j]:\n",
    "            merged.append(nums1[i])\n",
    "            i += 1\n",
    "        else:\n",
    "            merged.append(nums2[j])\n",
    "            j += 1\n",
    "    merged += nums1[i:]\n",
    "    merged += nums2[j:]\n",
    "    return merged\n",
    "\n",
    "# Component: calculate_median\n",
    "def calculate_median(nums):\n",
    "    n = len(nums)\n",
    "    if n % 2 == 1:\n",
    "        return float(nums[n // 2])\n",
    "    else:\n",
    "        return (nums[n // 2 - 1] + nums[n // 2]) / 2\n",
    "\n",
    "# Main function: find_median_sorted_arrays\n",
    "def find_median_sorted_arrays(nums1, nums2):\n",
    "    merged_array = merge_sorted_arrays(nums1, nums2)\n",
    "    return calculate_median(merged_array)\"\"\"\n",
    "\n",
    "code_2 = \"\"\"def merge_sorted_arrays(nums1, nums2):\n",
    "    merged = []\n",
    "    i, j = 0, 0\n",
    "    while i < len(nums1) and j < len(nums2):\n",
    "        if nums1[i] < nums2[j]:\n",
    "            merged.append(nums1[i])\n",
    "            i += 1\n",
    "        else:\n",
    "            merged.append(nums2[j])\n",
    "            j += 1\n",
    "    merged += nums1[i:]\n",
    "    merged += nums2[j:]\n",
    "    return merged\n",
    "\n",
    "def calculate_median(arr):\n",
    "    n = len(arr)\n",
    "    if n % 2 == 1:\n",
    "        return arr[n // 2]\n",
    "    else:\n",
    "        return (arr[n // 2 - 1] + arr[n // 2]) / 2\n",
    "\n",
    "def find_median_sorted_arrays(nums):\n",
    "    merged = merge_sorted_arrays(nums[0], nums[1])\n",
    "    return calculate_median(merged)\"\"\"\n",
    "\n",
    "functions = {\n",
    "    # \"code_id_1\": {\n",
    "    #     \"code\": code_1,\n",
    "    #     \"main_function_name\": \"find_median_sorted_arrays\"\n",
    "    # },\n",
    "    \"code_id_2\": {\n",
    "        \"code\": code_2,\n",
    "        \"main_function_name\": \"find_median_sorted_arrays\"\n",
    "    }\n",
    "}\n",
    "\n",
    "test_case_1 = \"\"\"def test_case(func):\n",
    "    nums1 = [1, 3]\n",
    "    nums2 = [2]\n",
    "    return func(nums1, nums2) == 2.0\"\"\"\n",
    "\n",
    "test_case_2 = \"\"\"def test_case(func):\n",
    "    nums1 = [1, 2]\n",
    "    nums2 = [3, 4]\n",
    "    return func(nums1, nums2) == 2.5\"\"\"\n",
    "\n",
    "# test_functions = {\n",
    "#     \"test_case_1\": {\n",
    "#         \"test_function\": test_case_1,\n",
    "#         'test_type': 'correctness'\n",
    "#     },\n",
    "#     \"test_case_2\": {\n",
    "#         \"test_function\": test_case_2,\n",
    "#         'test_type': 'correctness'\n",
    "#     }\n",
    "# }\n",
    "\n",
    "# 运行所有测试用例并获取结果\n",
    "fun_results, test_results = run_all_tests(functions, test_functions)\n",
    "print(\"Function Results:\")\n",
    "print(fun_results)\n",
    "print(\"Test Results:\")\n",
    "print(test_results)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5\n",
      "False\n"
     ]
    }
   ],
   "source": [
    "import re\n",
    "\n",
    "raw_text = \"\"\"< type >\n",
    "correctness\n",
    "< / type >\n",
    "< planning >\n",
    "I will design a test function to verify the correctness of the solution by using a predefined input where the expected output is known. The test will ensure that the function correctly calculates the number of operations needed to make the input array non-decreasing. The array will contain various elements that will require multiple operations to be combined in order to achieve a non-decreasing order.\n",
    "< / planning >\n",
    "< code >\n",
    "def test_case(func):\n",
    "    # Test case where multiple operations are needed\n",
    "    nums = [5, 2, 3, 1]\n",
    "    expected_output = 2  # Should require 2 operations\n",
    "    result = func(nums)\n",
    "    return result == expected_output\n",
    "< / code >\n",
    "\n",
    "< type >\n",
    "edge_case\n",
    "< / type >\n",
    "< planning >\n",
    "I will create a test function that inputs an already sorted array. As the array is non-decreasing, the expected output should be zero. This test helps in determining if the function can properly identify when no operations are needed.\n",
    "< / planning >\n",
    "< code >\n",
    "def test_case(func):\n",
    "    # Test case for an already sorted array\n",
    "    nums = [1, 2, 2]\n",
    "    expected_output = 0  # No operations needed\n",
    "    result = func(nums)\n",
    "    return result == expected_output\n",
    "< / code >\n",
    "\n",
    "< type >\n",
    "runtime\n",
    "< / type >\n",
    "< planning >\n",
    "This test case will measure the execution time of the function with a large array input. The purpose is to ensure that the function performs efficiently and does not exceed a specific time limit. I will implement a timer to verify that the function runs within an acceptable duration.\n",
    "< / planning >\n",
    "< code >\n",
    "import time \n",
    "\n",
    "def test_case(func):\n",
    "    # Measure runtime for a large array\n",
    "    nums = list(range(10000, 0, -1))  # A large descending array\n",
    "    start_time = time.time()\n",
    "    func(nums)\n",
    "    end_time = time.time()\n",
    "    return (end_time - start_time) < 1  # Ensure it executes in less than 1 second\n",
    "< / code >\n",
    "\n",
    "< type >\n",
    "component_check\n",
    "< / type >\n",
    "< planning >\n",
    "This test ensures that the solution uses the required components by checking if the code contains the functions named \"find_adjacent_min_sum_pair\", \"replace_pair_with_sum\", \"is_non_decreasing\", and \"minimum_operations_to_non_decreasing\". It will inspect the source code of the target function.\n",
    "< / planning >\n",
    "< code >\n",
    "def test_case(func):\n",
    "    source = inspect.getsource(func)\n",
    "    required_components = [\n",
    "        \"find_adjacent_min_sum_pair\",\n",
    "        \"replace_pair_with_sum\",\n",
    "        \"is_non_decreasing\",\n",
    "        \"minimum_operations_to_non_decreasing\"\n",
    "    ]\n",
    "    return all(component in source for component in required_components)\n",
    "< / code >\n",
    "\n",
    "< type >\n",
    "error_handling\n",
    "< / type >\n",
    "< planning >\n",
    "This test case will check how the function handles invalid inputs such as empty lists and non-integer elements within the array. The expected behavior is that the function should raise an appropriate exception for these invalid cases, ensuring robustness.\n",
    "< / planning >\n",
    "< code >\n",
    "def test_case(func):\n",
    "    # Test case for invalid input (empty list)\n",
    "    try:\n",
    "        func([])\n",
    "        return False  # Should raise an error, if it doesn't we fail the test\n",
    "    except ValueError:  # Assuming the function raises ValueError for empty input\n",
    "        pass\n",
    "\n",
    "    # Test case for invalid input (non-integer values)\n",
    "    try:\n",
    "        func([\"a\", \"b\"])\n",
    "        return False  # Should raise an error, if it doesn't we fail the test\n",
    "    except TypeError:  # Assuming the function raises TypeError for non-integer input\n",
    "        return True  # If both errors are raised correctly, we pass.\n",
    "< / code >\"\"\"\n",
    "\n",
    "def extract_test_cases(output_text):\n",
    "    \"\"\"\n",
    "    Extracts test cases from LLM output text with flexible tag handling.\n",
    "    Supports case-insensitive tags, missing <Type> tags, and multi-separators.\n",
    "    \"\"\"\n",
    "    import re\n",
    "    test_cases = {}\n",
    "    \n",
    "    # Split blocks by <separator> or empty lines\n",
    "    split_pattern = r'(?:<\\s*/\\s*separator\\s*>|<\\s*separator\\s*>|<\\s*separator\\s*/>|\\n\\s*\\n\\s*)'\n",
    "    test_case_blocks = re.split(split_pattern, output_text, flags=re.IGNORECASE)\n",
    "    test_case_blocks = [b.strip() for b in test_case_blocks if b.strip()]\n",
    "    print(len(test_case_blocks))\n",
    "    for idx, block in enumerate(test_case_blocks, 1):\n",
    "        # 1. Extract test_type\n",
    "        test_type = None\n",
    "        \n",
    "        # Case 1: <Type>value</Type>\n",
    "        type_match = re.search(\n",
    "            r'<\\s*type\\s*>(.*?)</\\s*type\\s*>', \n",
    "            block, \n",
    "            re.IGNORECASE | re.DOTALL\n",
    "        )\n",
    "        if type_match:\n",
    "            test_type = type_match.group(1).strip()\n",
    "        else:\n",
    "            # Case 2: <test_type> like <correctness>\n",
    "            known_tags = {'type', 'planning', 'code', 'reasoning', 'test_function', 'separator'}\n",
    "            for tag_match in re.finditer(r'<\\s*([^\\s>/]+)\\s*.*?>', block, re.IGNORECASE):\n",
    "                tag_name = tag_match.group(1).lower()\n",
    "                if tag_name not in known_tags:\n",
    "                    test_type = tag_name\n",
    "                    break  # Take first unknown tag\n",
    "\n",
    "        if not test_type:  # Skip invalid blocks\n",
    "            continue\n",
    "\n",
    "        # 2. Extract reasoning (support <planning> and <reasoning>)\n",
    "        reasoning_match = re.search(\n",
    "            r'<\\s*(?:reasoning|planning)\\s*>(.*?)</\\s*(?:reasoning|planning)\\s*>',\n",
    "            block, \n",
    "            re.IGNORECASE | re.DOTALL\n",
    "        )\n",
    "        reasoning = reasoning_match.group(1).strip() if reasoning_match else \"\"\n",
    "\n",
    "        # 3. Extract test_function (priority: test_function > code tags > standalone)\n",
    "        test_func = None\n",
    "        \n",
    "        # Check <test_function> tags\n",
    "        test_func_match = re.search(\n",
    "            r'<\\s*test_function\\s*>(.*?)</\\s*test_function\\s*>',\n",
    "            block, \n",
    "            re.IGNORECASE | re.DOTALL\n",
    "        )\n",
    "        if test_func_match:\n",
    "            content = test_func_match.group(1).strip()\n",
    "            code_block = re.search(r'```python\\s*(.*?)\\s*```', content, re.DOTALL)\n",
    "            test_func = code_block.group(1).strip() if code_block else content\n",
    "        else:\n",
    "            # Check <Code> tags\n",
    "            code_match = re.search(\n",
    "                r'<\\s*code\\s*>(.*?)</\\s*code\\s*>',\n",
    "                block,\n",
    "                re.IGNORECASE | re.DOTALL\n",
    "            )\n",
    "            if code_match:\n",
    "                content = code_match.group(1).strip()\n",
    "                code_block = re.search(r'```python\\s*(.*?)\\s*```', content, re.DOTALL)\n",
    "                test_func = code_block.group(1).strip() if code_block else content\n",
    "            else:\n",
    "                # Check standalone code blocks\n",
    "                code_block = re.search(r'```python\\s*(.*?)\\s*```', block, re.DOTALL)\n",
    "                if code_block:\n",
    "                    test_func = code_block.group(1).strip()\n",
    "\n",
    "        if test_type and test_func:\n",
    "            test_cases[f'test_case_{idx}'] = {\n",
    "                'test_type': test_type,\n",
    "                'purpose': reasoning,\n",
    "                'test_function': test_func\n",
    "            }\n",
    "    if test_cases == {}:\n",
    "        # If no test cases were found, return False\n",
    "        # logging.warning(\"Failed to extract test cases, llm_output:\\n{output_text}\")\n",
    "        return False\n",
    "\n",
    "    return test_cases\n",
    "\n",
    "print(extract_test_cases(raw_text))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "共分出 5 个块\n",
      "{'test_case_1': {'test_type': 'correctness', 'purpose': 'I will design a test function to verify the correctness of the solution by using a predefined input where the expected output is known. The test will ensure that the function correctly calculates the number of operations needed to make the input array non-decreasing. The array will contain various elements that will require multiple operations to be combined in order to achieve a non-decreasing order.', 'test_function': 'def test_case(func):\\n    # Test case where multiple operations are needed\\n    nums = [5, 2, 3, 1]\\n    expected_output = 2  # Should require 2 operations\\n    result = func(nums)\\n    return result == expected_output'}, 'test_case_2': {'test_type': 'edge_case', 'purpose': 'I will create a test function that inputs an already sorted array. As the array is non-decreasing, the expected output should be zero. This test helps in determining if the function can properly identify when no operations are needed.', 'test_function': 'def test_case(func):\\n    # Test case for an already sorted array\\n    nums = [1, 2, 2]\\n    expected_output = 0  # No operations needed\\n    result = func(nums)\\n    return result == expected_output'}, 'test_case_3': {'test_type': 'runtime', 'purpose': 'This test case will measure the execution time of the function with a large array input. The purpose is to ensure that the function performs efficiently and does not exceed a specific time limit. 
I will implement a timer to verify that the function runs within an acceptable duration.', 'test_function': 'import time \\ndef test_case(func):\\n    # Measure runtime for a large array\\n    nums = list(range(10000, 0, -1))  # A large descending array\\n    start_time = time.time()\\n    func(nums)\\n    end_time = time.time()\\n    return (end_time - start_time) < 1  # Ensure it executes in less than 1 second'}, 'test_case_4': {'test_type': 'component_check', 'purpose': 'This test ensures that the solution uses the required components by checking if the code contains the functions named \"find_adjacent_min_sum_pair\", \"replace_pair_with_sum\", \"is_non_decreasing\", and \"minimum_operations_to_non_decreasing\". It will inspect the source code of the target function.', 'test_function': 'def test_case(func):\\n    source = inspect.getsource(func)\\n    required_components = [\\n        \"find_adjacent_min_sum_pair\",\\n        \"replace_pair_with_sum\",\\n        \"is_non_decreasing\",\\n        \"minimum_operations_to_non_decreasing\"\\n    ]\\n    return all(component in source for component in required_components)'}, 'test_case_5': {'test_type': 'error_handling', 'purpose': 'This test case will check how the function handles invalid inputs such as empty lists and non-integer elements within the array. 
The expected behavior is that the function should raise an appropriate exception for these invalid cases, ensuring robustness.', 'test_function': 'def test_case(func):\\n    # Test case for invalid input (empty list)\\n    try:\\n        func([])\\n        return False  # Should raise an error, if it doesn\\'t we fail the test\\n    except ValueError:  # Assuming the function raises ValueError for empty input\\n        pass\\n    # Test case for invalid input (non-integer values)\\n    try:\\n        func([\"a\", \"b\"])\\n        return False  # Should raise an error, if it doesn\\'t we fail the test\\n    except TypeError:  # Assuming the function raises TypeError for non-integer input\\n        return True  # If both errors are raised correctly, we pass.'}}\n"
     ]
    }
   ],
   "source": [
    "def extract_test_cases(output_text):\n",
    "    \"\"\"\n",
    "    Extracts test cases from LLM output text with flexible tag handling.\n",
    "    Supports case-insensitive tags, missing <Type> tags, and multi-separators.\n",
    "    \"\"\"\n",
    "    import re\n",
    "    test_cases = {}\n",
    "\n",
    "    def preprocess_text(text):\n",
    "        # 定义一个占位符，避免选中正常文本的内容\n",
    "        placeholder = \"###NL###\"\n",
    "        \n",
    "        # 定义替换函数：将匹配到的代码块内的换行符替换为占位符\n",
    "        def repl_code(match):\n",
    "            block = match.group(0)\n",
    "            return block.replace(\"\\n\", placeholder)\n",
    "        \n",
    "        # 对 <code>...</code> 块进行替换（不区分大小写，多行匹配）\n",
    "        text = re.sub(r'(<\\s*code\\s*>.*?</\\s*code\\s*>)', repl_code, text, flags=re.IGNORECASE | re.DOTALL)\n",
    "        # 对 ```python ... ``` 块进行替换\n",
    "        text = re.sub(r'(```python.*?```)', repl_code, text, flags=re.IGNORECASE | re.DOTALL)\n",
    "        \n",
    "        # 如果还需要保护其他块，也可以在这里加上类似处理\n",
    "        return text, placeholder\n",
    "\n",
    "    # 预处理：隐藏代码块内的换行符\n",
    "    modified_text, placeholder = preprocess_text(output_text)\n",
    "    \n",
    "    # 分块：使用<separator>标签 或 连续空行分块\n",
    "    split_pattern = r'(?:<\\s*/\\s*separator\\s*>|<\\s*separator\\s*>|<\\s*separator\\s*/>|\\n\\s*\\n\\s*)'\n",
    "    test_case_blocks = re.split(split_pattern, modified_text, flags=re.IGNORECASE)\n",
    "    test_case_blocks = [b.strip() for b in test_case_blocks if b.strip()]\n",
    "    \n",
    "    # 还原各块内被隐藏的换行符\n",
    "    test_case_blocks = [b.replace(placeholder, \"\\n\") for b in test_case_blocks]\n",
    "\n",
    "    print(f\"共分出 {len(test_case_blocks)} 个块\")\n",
    "    for idx, block in enumerate(test_case_blocks, 1):\n",
    "        # 1. 提取 test_type\n",
    "        test_type = None\n",
    "        \n",
    "        # Case 1：通过 <type>value</type>\n",
    "        type_match = re.search(\n",
    "            r'<\\s*type\\s*>(.*?)<\\s*/\\s*type\\s*>', \n",
    "            block, \n",
    "            re.IGNORECASE | re.DOTALL\n",
    "        )\n",
    "        if type_match:\n",
    "            test_type = type_match.group(1).strip()\n",
    "        else:\n",
    "            # Case 2：判断是否有其他非已知标签标记的类型\n",
    "            known_tags = {'type', 'planning', 'code', 'reasoning', 'test_function', 'separator'}\n",
    "            for tag_match in re.finditer(r'<\\s*([^\\s>/]+)\\s*.*?>', block, re.IGNORECASE):\n",
    "                tag_name = tag_match.group(1).lower()\n",
    "                if tag_name not in known_tags:\n",
    "                    test_type = tag_name\n",
    "                    break  # 取第一个不在已知标签中的\n",
    "            \n",
    "        if not test_type:  # 若无 test_type 则跳过该块\n",
    "            continue\n",
    "        \n",
    "        # 2. 提取 reasoning（支持 <planning> 和 <reasoning>）\n",
    "        reasoning_match = re.search(\n",
    "            r'<\\s*(?:reasoning|planning)\\s*>(.*?)<\\s*/\\s*(?:reasoning|planning)\\s*>',\n",
    "            block, \n",
    "            re.IGNORECASE | re.DOTALL\n",
    "        )\n",
    "        reasoning = reasoning_match.group(1).strip() if reasoning_match else \"\"\n",
    "        \n",
    "        # 3. 提取 test_function（优先顺序：test_function 标签 > code 标签 > 独立代码块）\n",
    "        test_func = None\n",
    "        \n",
    "        # 检查 <test_function> 标签\n",
    "        test_func_match = re.search(\n",
    "            r'<\\s*test_function\\s*>(.*?)<\\s*/\\s*test_function\\s*>',\n",
    "            block, \n",
    "            re.IGNORECASE | re.DOTALL\n",
    "        )\n",
    "        if test_func_match:\n",
    "            content = test_func_match.group(1).strip()\n",
    "            code_block = re.search(r'```python\\s*(.*?)\\s*```', content, re.DOTALL)\n",
    "            test_func = code_block.group(1).strip() if code_block else content\n",
    "        else:\n",
    "            # 检查 <code> 标签\n",
    "            code_match = re.search(\n",
    "                r'<\\s*code\\s*>(.*?)<\\s*/\\s*code\\s*>',\n",
    "                block,\n",
    "                re.IGNORECASE | re.DOTALL\n",
    "            )\n",
    "            if code_match:\n",
    "                content = code_match.group(1).strip()\n",
    "                code_block = re.search(r'```python\\s*(.*?)\\s*```', content, re.DOTALL)\n",
    "                test_func = code_block.group(1).strip() if code_block else content\n",
    "            else:\n",
    "                # 检查独立代码块 (```python ... ```)\n",
    "                code_block = re.search(r'```python\\s*(.*?)\\s*```', block, re.DOTALL)\n",
    "                if code_block:\n",
    "                    test_func = code_block.group(1).strip()\n",
    "        \n",
    "        if test_type and test_func:\n",
    "            test_cases[f'test_case_{idx}'] = {\n",
    "                'test_type': test_type,\n",
    "                'purpose': reasoning,\n",
    "                'test_function': test_func\n",
    "            }\n",
    "    \n",
    "    if not test_cases:\n",
    "        # 如果没有提取到测试用例，则返回 False\n",
    "        return False\n",
    "\n",
    "    return test_cases\n",
    "\n",
    "print(extract_test_cases(raw_text))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_test_cases(output_text):\n",
    "    \"\"\"\n",
    "    提取测试用例，支持标签大小写不敏感、缺少 <type> 标签、以及多分隔符分块。\n",
    "    修改点：修改关闭标签的匹配规则，使其能够处理标签内的额外空格。\n",
    "    \"\"\"\n",
    "    import re\n",
    "    test_cases = {}\n",
    "    \n",
    "    # 修改关闭标签的匹配规则，允许在 < 与 / 之间有空格\n",
    "    type_match = re.search(\n",
    "        r'<\\s*type\\s*>(.*?)<\\s*/\\s*type\\s*>', \n",
    "        output_text, \n",
    "        re.IGNORECASE | re.DOTALL\n",
    "    )\n",
    "    # 后续同理，针对 code、planning 以及其他标签的匹配，\n",
    "    # 都需要将对应关闭标签部分修改为 <\\s*/\\s*xxx\\s*>\n",
    "    \n",
    "    # 示例：对 planning 标签的提取\n",
    "    planning_match = re.search(\n",
    "        r'<\\s*(?:reasoning|planning)\\s*>(.*?)<\\s*/\\s*(?:reasoning|planning)\\s*>',\n",
    "        output_text, \n",
    "        re.IGNORECASE | re.DOTALL\n",
    "    )\n",
    "    \n",
    "    # 此处省略其他逻辑...\n",
    "    \n",
    "    # 分块与其他代码不变...\n",
    "    \n",
    "    return test_cases\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import signal\n",
    "from concurrent.futures import ProcessPoolExecutor\n",
    "from tqdm import tqdm\n",
    "\n",
    "class TestRunner:\n",
    "    def run_all_tests(self, functions, test_cases, max_workers=5, timeout=3):\n",
    "        # 编译函数和测试用例（保持原有逻辑）\n",
    "        compiled_functions = {\n",
    "            fid: self.compile_code(\n",
    "                code_info['code'],\n",
    "                main_function_name=code_info.get('main_function_name')\n",
    "            )\n",
    "            for fid, code_info in functions.items()\n",
    "        }\n",
    "        \n",
    "        compiled_tests = {\n",
    "            tid: self.compile_code(code_info['test_function'])\n",
    "            for tid, code_info in test_cases.items()\n",
    "        }\n",
    "\n",
    "        # 准备结果字典\n",
    "        fun_results = {fid: {} for fid in functions}\n",
    "        test_results = {tid: {} for tid in test_cases}\n",
    "\n",
    "        total_tests = len(compiled_functions) * len(compiled_tests)\n",
    "        \n",
    "        with ProcessPoolExecutor(max_workers=max_workers) as executor:\n",
    "            futures = {}\n",
    "            pid_map = {}  # 存储任务ID与进程PID的映射\n",
    "            pbar = tqdm(total=total_tests, desc=\"Running tests\")\n",
    "            \n",
    "            # 提交测试任务\n",
    "            for func_id, func_obj in compiled_functions.items():\n",
    "                for test_id, test_func in compiled_tests.items():\n",
    "                    if func_obj is None or test_func is None:\n",
    "                        fun_results[func_id][test_id] = False\n",
    "                        test_results[test_id][func_id] = False\n",
    "                        pbar.update(1)\n",
    "                        continue\n",
    "                    \n",
    "                    # 提交任务并记录future\n",
    "                    future = executor.submit(\n",
    "                        self._process_wrapper,  # 新增的进程包装器\n",
    "                        func_obj, \n",
    "                        test_func\n",
    "                    )\n",
    "                    futures[future] = (func_id, test_id)\n",
    "                    pid_map[future] = None  # 初始化PID映射\n",
    "\n",
    "            # 建立PID映射（通过回调函数）\n",
    "            for future in futures:\n",
    "                future.add_done_callback(\n",
    "                    lambda f: pid_map.pop(f, None)\n",
    "                )\n",
    "\n",
    "            # 处理测试结果\n",
    "            for future in concurrent.futures.as_completed(futures):\n",
    "                func_id, test_id = futures[future]\n",
    "                try:\n",
    "                    # 获取结果并设置超时\n",
    "                    result = future.result(timeout=timeout)\n",
    "                except concurrent.futures.TimeoutError:\n",
    "                    result = False\n",
    "                    # 强制终止对应进程\n",
    "                    pid = pid_map.get(future)\n",
    "                    if pid:\n",
    "                        try:\n",
    "                            os.kill(pid, signal.SIGTERM)\n",
    "                        except ProcessLookupError:\n",
    "                            pass\n",
    "                except Exception as e:\n",
    "                    result = False\n",
    "                \n",
    "                fun_results[func_id][test_id] = result\n",
    "                test_results[test_id][func_id] = result\n",
    "                pbar.update(1)\n",
    "            \n",
    "            pbar.close()\n",
    "        \n",
    "        return fun_results, test_results\n",
    "\n",
    "    def _process_wrapper(self, func_obj, test_func):\n",
    "        \"\"\"进程包装器，用于捕获进程PID\"\"\"\n",
    "        pid = os.getpid()\n",
    "        try:\n",
    "            result = self.run_test(func_obj, test_func)\n",
    "        except Exception as e:\n",
    "            result = False\n",
    "        return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Running tests: 100%|██████████| 1/1 [00:02<00:00,  2.00s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\n",
      "Test test_sleep for function func1 completed successfully.\n",
      "Function Results: {'func1': {'test_sleep': True}}\n",
      "Test Results: {'test_sleep': {'func1': True}}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "import time\n",
    "import logging\n",
    "import concurrent.futures\n",
    "from tqdm import tqdm  # Assuming you have tqdm installed\n",
    "\n",
    "# Your CodeRunner class definition here...\n",
    "class CodeRunner:\n",
    "    def __init__(self, max_workers=5):\n",
    "        self.max_workers = max_workers\n",
    "\n",
    "    def run_test(self, func_obj, test_func):\n",
    "        try:\n",
    "            return test_func(func_obj)\n",
    "        except Exception as e:\n",
    "            return False\n",
    "\n",
    "    def compile_code(self, code_str, main_function_name=None):\n",
    "        try:\n",
    "            local_vars = {}\n",
    "            exec(code_str, local_vars)\n",
    "            if main_function_name is not None:\n",
    "                func = local_vars.get(main_function_name)\n",
    "                return func if callable(func) else None\n",
    "            return next((obj for obj in local_vars.values() if callable(obj)), None)\n",
    "        except Exception as e:\n",
    "            print(f\"Compilation Error: {str(e)}, code_str:\\n {code_str}\")\n",
    "            return None\n",
    "\n",
    "    def run_all_tests(self, functions, test_cases, max_workers=5, timeout=1):\n",
    "        compiled_functions = {\n",
    "            fid: self.compile_code(\n",
    "                code_info['code'],\n",
    "                main_function_name=code_info.get('main_function_name')\n",
    "            )\n",
    "            for fid, code_info in functions.items()\n",
    "        }\n",
    "        \n",
    "        compiled_tests = {\n",
    "            tid: self.compile_code(code_info['test_function'])\n",
    "            for tid, code_info in test_cases.items()\n",
    "        }\n",
    "\n",
    "        fun_results = {fid: {} for fid in functions}\n",
    "        test_results = {tid: {} for tid in test_cases}\n",
    "\n",
    "        total_tests = len(compiled_functions) * len(compiled_tests)\n",
    "        \n",
    "        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:\n",
    "            futures = {}\n",
    "            pbar = tqdm(total=total_tests, desc=\"Running tests\")\n",
    "            \n",
    "            # Submit all test tasks\n",
    "            for func_id, func_obj in compiled_functions.items():\n",
    "                for test_id, test_func in compiled_tests.items():\n",
    "                    if func_obj is None or test_func is None:\n",
    "                        fun_results[func_id][test_id] = False\n",
    "                        test_results[test_id][func_id] = False\n",
    "                        pbar.update(1)\n",
    "                        continue\n",
    "                    \n",
    "                    future = executor.submit(self.run_test, func_obj, test_func)\n",
    "                    futures[future] = (func_id, test_id)\n",
    "\n",
    "            # Process task results with timeout control\n",
    "            for future in concurrent.futures.as_completed(futures):\n",
    "                func_id, test_id = futures[future]\n",
    "                try:\n",
    "                    result = future.result(timeout=timeout)\n",
    "                    print(f\"Test {test_id} for function {func_id} completed successfully.\")\n",
    "                except TimeoutError:\n",
    "                    print(f\"Test {test_id} for function {func_id} timed out.\")\n",
    "                    result = False\n",
    "                except Exception:\n",
    "                    result = False\n",
    "                \n",
    "                fun_results[func_id][test_id] = result\n",
    "                test_results[test_id][func_id] = result\n",
    "                pbar.update(1)\n",
    "            \n",
    "            pbar.close()\n",
    "        \n",
    "        return fun_results, test_results\n",
    "\n",
    "# Configure logging to show warnings.\n",
    "logging.basicConfig(level=logging.WARNING)\n",
    "\n",
    "# Define a simple function that will be tested.\n",
    "functions = {\n",
    "    \"func1\": {\n",
    "        \"code\": \"def func1():\\n    return 'ok'\", \n",
    "        \"main_function_name\": \"func1\"\n",
    "    }\n",
    "}\n",
    "\n",
    "# Define a test case that purposefully delays execution.\n",
    "test_cases = {\n",
    "    \"test_sleep\": {\n",
    "         \"test_function\": (\n",
    "             \"def test_sleep(func):\\n\"\n",
    "             \"    import time\\n\"\n",
    "             \"    time.sleep(2)  # Delay longer than the timeout value\\n\"\n",
    "             \"    return func() == 'ok'\"\n",
    "         )\n",
    "    }\n",
    "}\n",
    "\n",
    "# Instantiate the CodeRunner and run the test\n",
    "runner = CodeRunner(max_workers=2)\n",
    "fun_results, test_results = runner.run_all_tests(functions, test_cases, timeout=1)\n",
    "\n",
    "print(\"Function Results:\", fun_results)\n",
    "print(\"Test Results:\", test_results)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Timed out as expected\n"
     ]
    }
   ],
   "source": [
    "import concurrent.futures\n",
    "import time\n",
    "\n",
    "def long_running_function():\n",
    "    time.sleep(2)  # Sleep longer than our timeout\n",
    "    return True\n",
    "\n",
    "with concurrent.futures.ThreadPoolExecutor() as executor:\n",
    "    future = executor.submit(long_running_function)\n",
    "    try:\n",
    "        # Immediately try to get the result with a short timeout\n",
    "        result = future.result(timeout=1)\n",
    "        print(\"Result:\", result)\n",
    "    except concurrent.futures.TimeoutError:\n",
    "        print(\"Timed out as expected\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Running tests: 100%|██████████| 1/1 [00:01<00:00,  1.01s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Test test_sleep for function func1 timed out.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Function Results: {'func1': {'test_sleep': False}}\n",
      "Test Results: {'test_sleep': {'func1': False}}\n"
     ]
    }
   ],
   "source": [
    "import time\n",
    "import logging\n",
    "import concurrent.futures\n",
    "from tqdm import tqdm\n",
    "\n",
    "class CodeRunner:\n",
    "    def __init__(self, max_workers=5):\n",
    "        self.max_workers = max_workers\n",
    "\n",
    "    def run_test(self, func_obj, test_func):\n",
    "        try:\n",
    "            return test_func(func_obj)\n",
    "        except Exception as e:\n",
    "            return False\n",
    "\n",
    "    def compile_code(self, code_str, main_function_name=None):\n",
    "        try:\n",
    "            local_vars = {}\n",
    "            exec(code_str, local_vars)\n",
    "            if main_function_name is not None:\n",
    "                func = local_vars.get(main_function_name)\n",
    "                return func if callable(func) else None\n",
    "            return next((obj for obj in local_vars.values() if callable(obj)), None)\n",
    "        except Exception as e:\n",
    "            print(f\"Compilation Error: {str(e)}, code_str:\\n {code_str}\")\n",
    "            return None\n",
    "\n",
    "    def run_all_tests(self, functions, test_cases, max_workers=5, timeout=1):\n",
    "        compiled_functions = {\n",
    "            fid: self.compile_code(\n",
    "                code_info['code'],\n",
    "                main_function_name=code_info.get('main_function_name')\n",
    "            )\n",
    "            for fid, code_info in functions.items()\n",
    "        }\n",
    "        \n",
    "        compiled_tests = {\n",
    "            tid: self.compile_code(code_info['test_function'])\n",
    "            for tid, code_info in test_cases.items()\n",
    "        }\n",
    "\n",
    "        fun_results = {fid: {} for fid in functions}\n",
    "        test_results = {tid: {} for tid in test_cases}\n",
    "\n",
    "        total_tests = len(functions) * len(test_cases)\n",
    "        \n",
    "        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:\n",
    "            futures = {}\n",
    "            pbar = tqdm(total=total_tests, desc=\"Running tests\")\n",
    "            \n",
    "            # Submit all test tasks\n",
    "            for func_id, func_obj in compiled_functions.items():\n",
    "                for test_id, test_func in compiled_tests.items():\n",
    "                    if func_obj is None or test_func is None:\n",
    "                        fun_results[func_id][test_id] = False\n",
    "                        test_results[test_id][func_id] = False\n",
    "                        pbar.update(1)\n",
    "                        continue\n",
    "                    \n",
    "                    future = executor.submit(self.run_test, func_obj, test_func)\n",
    "                    futures[future] = (func_id, test_id)\n",
    "\n",
    "            # Process each future with individual timeout\n",
    "            for future in futures:\n",
    "                func_id, test_id = futures[future]\n",
    "                try:\n",
    "                    result = future.result(timeout=timeout)\n",
    "                    print(f\"Test {test_id} for function {func_id} completed successfully.\")\n",
    "                except concurrent.futures.TimeoutError:\n",
    "                    print(f\"Test {test_id} for function {func_id} timed out.\")\n",
    "                    result = False\n",
    "                except Exception as e:\n",
    "                    print(f\"Test {test_id} for function {func_id} encountered an error: {e}\")\n",
    "                    result = False\n",
    "                \n",
    "                fun_results[func_id][test_id] = result\n",
    "                test_results[test_id][func_id] = result\n",
    "                pbar.update(1)\n",
    "            \n",
    "            pbar.close()\n",
    "        \n",
    "        return fun_results, test_results\n",
    "\n",
    "# Configure logging\n",
    "logging.basicConfig(level=logging.WARNING)\n",
    "\n",
    "# Sample data\n",
    "functions = {\n",
    "    \"func1\": {\n",
    "        \"code\": \"def func1():\\n    return 'ok'\", \n",
    "        \"main_function_name\": \"func1\"\n",
    "    }\n",
    "}\n",
    "\n",
    "test_cases = {\n",
    "    \"test_sleep\": {\n",
    "         \"test_function\": (\n",
    "             \"def test_sleep(func):\\n\"\n",
    "             \"    import time\\n\"\n",
    "             \"    time.sleep(2)\\n\"\n",
    "             \"    return func() == 'ok'\"\n",
    "         )\n",
    "    }\n",
    "}\n",
    "\n",
    "# Run the tests\n",
    "runner = CodeRunner(max_workers=2)\n",
    "fun_results, test_results = runner.run_all_tests(functions, test_cases, timeout=1)\n",
    "\n",
    "print(\"Function Results:\", fun_results)\n",
    "print(\"Test Results:\", test_results)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Running tests: 100%|██████████| 6/6 [00:05<00:00,  1.20it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Function Results: {'func1': {'test_sleep_2': True, 'test_sleep_1': True, 'test_sleep_3': True, 'test_sleep_11': False, 'test_sleep_10': False, 'test_sleep_5': False}}\n",
      "Test Results: {'test_sleep_11': {'func1': False}, 'test_sleep_2': {'func1': True}, 'test_sleep_1': {'func1': True}, 'test_sleep_3': {'func1': True}, 'test_sleep_5': {'func1': False}, 'test_sleep_10': {'func1': False}}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "class CodeRunner:\n",
    "    def __init__(self, max_workers=5):\n",
    "        self.max_workers = max_workers\n",
    "\n",
    "    def run_test(self, func_obj, test_func):\n",
    "        try:\n",
    "            return test_func(func_obj)\n",
    "        except Exception as e:\n",
    "            return False\n",
    "\n",
    "    def compile_code(self, code_str, main_function_name=None):\n",
    "        try:\n",
    "            local_vars = {}\n",
    "            exec(code_str, local_vars)\n",
    "            if main_function_name is not None:\n",
    "                func = local_vars.get(main_function_name)\n",
    "                return func if callable(func) else None\n",
    "            return next((obj for obj in local_vars.values() if callable(obj)), None)\n",
    "        except Exception as e:\n",
    "            print(f\"Compilation Error: {str(e)}, code_str:\\n {code_str}\")\n",
    "            return None\n",
    "\n",
    "    def run_all_tests(self, functions, test_cases, max_workers=5, timeout=1):\n",
    "        compiled_functions = {\n",
    "            fid: self.compile_code(\n",
    "                code_info['code'],\n",
    "                main_function_name=code_info.get('main_function_name')\n",
    "            )\n",
    "            for fid, code_info in functions.items()\n",
    "        }\n",
    "        \n",
    "        compiled_tests = {\n",
    "            tid: self.compile_code(code_info['test_function'])\n",
    "            for tid, code_info in test_cases.items()\n",
    "        }\n",
    "\n",
    "        fun_results = {fid: {} for fid in functions}\n",
    "        test_results = {tid: {} for tid in test_cases}\n",
    "\n",
    "        total_tests = len(functions) * len(test_cases)\n",
    "        \n",
    "        # 创建线程池时不使用 with 语句，避免隐式等待\n",
    "        executor = concurrent.futures.ThreadPoolExecutor(max_workers=max_workers)\n",
    "        try:\n",
    "            futures = {}\n",
    "            pbar = tqdm(total=total_tests, desc=\"Running tests\")\n",
    "            \n",
    "            # 提交所有任务\n",
    "            for func_id, func_obj in compiled_functions.items():\n",
    "                for test_id, test_func in compiled_tests.items():\n",
    "                    if func_obj is None or test_func is None:\n",
    "                        fun_results[func_id][test_id] = False\n",
    "                        test_results[test_id][func_id] = False\n",
    "                        pbar.update(1)\n",
    "                        continue\n",
    "                    \n",
    "                    future = executor.submit(self.run_test, func_obj, test_func)\n",
    "                    futures[future] = (func_id, test_id)\n",
    "\n",
    "            # 等待所有任务完成或达到全局超时\n",
    "            done, not_done = concurrent.futures.wait(\n",
    "                futures.keys(),\n",
    "                timeout=timeout,\n",
    "                return_when=concurrent.futures.ALL_COMPLETED\n",
    "            )\n",
    "\n",
    "            # 处理已完成任务\n",
    "            for future in done:\n",
    "                func_id, test_id = futures[future]\n",
    "                try:\n",
    "                    result = future.result()\n",
    "                except Exception:\n",
    "                    result = False\n",
    "                fun_results[func_id][test_id] = result\n",
    "                test_results[test_id][func_id] = result\n",
    "                pbar.update(1)\n",
    "\n",
    "            # 标记未完成任务为超时\n",
    "            for future in not_done:\n",
    "                func_id, test_id = futures[future]\n",
    "                fun_results[func_id][test_id] = False\n",
    "                test_results[test_id][func_id] = False\n",
    "                pbar.update(1)\n",
    "\n",
    "            pbar.close()\n",
    "            return fun_results, test_results\n",
    "\n",
    "        finally:\n",
    "            # 强制关闭执行器，不等待后台线程完成\n",
    "            executor.shutdown(wait=False)\n",
    "\n",
    "# Configure logging\n",
    "logging.basicConfig(level=logging.WARNING)\n",
    "\n",
    "# Sample data\n",
    "functions = {\n",
    "    \"func1\": {\n",
    "        \"code\": \"def func1():\\n    return 'ok'\", \n",
    "        \"main_function_name\": \"func1\"\n",
    "    }\n",
    "}\n",
    "\n",
    "test_cases = {\n",
    "    \"test_sleep_2\": {\n",
    "         \"test_function\": (\n",
    "             \"def test_sleep(func):\\n\"\n",
    "             \"    import time\\n\"\n",
    "             \"    time.sleep(2)\\n\"\n",
    "             \"    return func() == 'ok'\"\n",
    "         )\n",
    "    },\n",
    "    \"test_sleep_5\": {\n",
    "         \"test_function\": (\n",
    "             \"def test_sleep(func):\\n\"\n",
    "             \"    import time\\n\"\n",
    "             \"    time.sleep(5)\\n\"\n",
    "             \"    return func() == 'ok'\"\n",
    "         )\n",
    "    },\n",
    "        \"test_sleep_10\": {\n",
    "         \"test_function\": (\n",
    "             \"def test_sleep(func):\\n\"\n",
    "             \"    import time\\n\"\n",
    "             \"    time.sleep(10)\\n\"\n",
    "             \"    return func() == 'ok'\"\n",
    "         )\n",
    "    }\n",
    "}\n",
    "\n",
    "# Run the tests\n",
    "runner = CodeRunner(max_workers=10)\n",
    "fun_results, test_results = runner.run_all_tests(functions, test_cases, timeout=5)\n",
    "\n",
    "print(\"Function Results:\", fun_results)\n",
    "print(\"Test Results:\", test_results)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "import logging\n",
    "import concurrent.futures\n",
    "from tqdm import tqdm\n",
    "\n",
    "class CodeRunner:\n",
    "    def __init__(self, max_workers=5):\n",
    "        self.max_workers = max_workers\n",
    "\n",
    "    def run_test(self, func_obj, test_func):\n",
    "        try:\n",
    "            return test_func(func_obj)\n",
    "        except Exception as e:\n",
    "            return False\n",
    "\n",
    "    def compile_code(self, code_str, main_function_name=None):\n",
    "        try:\n",
    "            local_vars = {}\n",
    "            exec(code_str, local_vars)\n",
    "            if main_function_name is not None:\n",
    "                func = local_vars.get(main_function_name)\n",
    "                return func if callable(func) else None\n",
    "            return next((obj for obj in local_vars.values() if callable(obj)), None)\n",
    "        except Exception as e:\n",
    "            print(f\"Compilation Error: {str(e)}, code_str:\\n {code_str}\")\n",
    "            return None\n",
    "\n",
    "    def run_all_tests(self, functions, test_cases, max_workers=5, timeout=1):\n",
    "        compiled_functions = {\n",
    "            fid: self.compile_code(\n",
    "                code_info['code'],\n",
    "                main_function_name=code_info.get('main_function_name')\n",
    "            )\n",
    "            for fid, code_info in functions.items()\n",
    "        }\n",
    "        \n",
    "        compiled_tests = {\n",
    "            tid: self.compile_code(code_info['test_function'])\n",
    "            for tid, code_info in test_cases.items()\n",
    "        }\n",
    "\n",
    "        fun_results = {fid: {} for fid in functions}\n",
    "        test_results = {tid: {} for tid in test_cases}\n",
    "\n",
    "        total_tests = len(functions) * len(test_cases)\n",
    "        \n",
    "        # TODO: Use ThreadPoolExecutor or ProcessPoolExecutor or Asyncio and other functions in this class to run all the tests and record the results.\n",
    "        \n",
    "        return fun_results, test_results\n",
    "\n",
    "# Sample data\n",
    "functions = {\n",
    "    \"func1\": {\n",
    "        \"code\": \"def func1():\\n    return 'ok'\", \n",
    "        \"main_function_name\": \"func1\"\n",
    "    }\n",
    "}\n",
    "\n",
    "test_cases = {\n",
    "    \"test_sleep\": {\n",
    "         \"test_function\": (\n",
    "             \"def test_sleep(func):\\n\"\n",
    "             \"    import time\\n\"\n",
    "             \"    time.sleep(2)\\n\"\n",
    "             \"    return func() == 'ok'\"\n",
    "         )\n",
    "    }\n",
    "}\n",
    "\n",
    "# Run the tests\n",
    "runner = CodeRunner(max_workers=2)\n",
    "fun_results, test_results = runner.run_all_tests(functions, test_cases, timeout=1)\n",
    "\n",
    "print(\"Function Results:\", fun_results)\n",
    "print(\"Test Results:\", test_results)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Function Results:\n",
      "  func1:\n",
      "    test_ok: False\n",
      "\n",
      "Test Results:\n",
      "  test_ok:\n",
      "    func1: False\n"
     ]
    }
   ],
   "source": [
    "import multiprocessing\n",
    "from multiprocessing import Process, Queue\n",
    "import time\n",
    "\n",
    "class CodeRunner:\n",
    "    def __init__(self, max_workers=5):\n",
    "        self.max_workers = max_workers\n",
    "\n",
    "    def run_all_tests(self, functions, test_cases, max_workers=5, timeout=1):\n",
    "        # 创建任务列表 (函数ID, 测试ID, 代码, 主函数名, 测试代码)\n",
    "        tasks = [\n",
    "            (fid, tid, \n",
    "             func_info['code'], \n",
    "             func_info.get('main_function_name'),\n",
    "             test_info['test_function'])\n",
    "            for fid, func_info in functions.items()\n",
    "            for tid, test_info in test_cases.items()\n",
    "        ]\n",
    "\n",
    "        # 初始化结果存储结构\n",
    "        fun_results = {fid: {tid: None for tid in test_cases} for fid in functions}\n",
    "        test_results = {tid: {fid: None for fid in functions} for tid in test_cases}\n",
    "\n",
    "        running = []  # 存储运行中的进程信息\n",
    "        \n",
    "        # 任务调度主循环\n",
    "        for task in tasks:\n",
    "            # 控制并发数量\n",
    "            while len(running) >= max_workers:\n",
    "                self._check_processes(running, fun_results, test_results, timeout)\n",
    "            self._start_process(task, running)\n",
    "\n",
    "        # 等待剩余进程完成\n",
    "        while len(running) > 0:\n",
    "            self._check_processes(running, fun_results, test_results, timeout)\n",
    "\n",
    "        return fun_results, test_results\n",
    "\n",
    "    def _start_process(self, task, running):\n",
    "        \"\"\"启动单个测试进程\"\"\"\n",
    "        fid, tid, func_code, func_main, test_code = task\n",
    "        q = Queue()\n",
    "        p = Process(target=self._worker, args=(func_code, func_main, test_code, q))\n",
    "        p.start()\n",
    "        running.append((p, time.time(), fid, tid, q))\n",
    "\n",
    "    def _check_processes(self, running, fun_res, test_res, timeout):\n",
    "        \"\"\"检查进程状态并更新结果\"\"\"\n",
    "        current_time = time.time()\n",
    "        finished = []\n",
    "\n",
    "        for idx, (proc, start, fid, tid, q) in enumerate(running):\n",
    "            # 处理超时进程\n",
    "            if proc.is_alive():\n",
    "                if current_time - start > timeout:\n",
    "                    proc.terminate()\n",
    "                    proc.join()\n",
    "                    self._update_results(fun_res, test_res, fid, tid, False)\n",
    "                    finished.append(idx)\n",
    "            # 处理已完成进程\n",
    "            else:\n",
    "                result = q.get() if not q.empty() else False\n",
    "                self._update_results(fun_res, test_res, fid, tid, result)\n",
    "                finished.append(idx)\n",
    "                proc.close()\n",
    "\n",
    "        # 逆向删除已完成项\n",
    "        for idx in reversed(finished):\n",
    "            del running[idx]\n",
    "\n",
    "    def _update_results(self, fun_res, test_res, fid, tid, result):\n",
    "        \"\"\"统一更新结果字典\"\"\"\n",
    "        fun_res[fid][tid] = result\n",
    "        test_res[tid][fid] = result\n",
    "\n",
    "    @staticmethod\n",
    "    def _worker(func_code, func_main, test_code, queue):\n",
    "        \"\"\"子进程执行函数\"\"\"\n",
    "        try:\n",
    "            # 动态编译被测试函数\n",
    "            func_globals = {}\n",
    "            exec(func_code, func_globals)\n",
    "            func = func_globals.get(func_main) if func_main else next(\n",
    "                (v for k,v in func_globals.items() \n",
    "                 if callable(v) and not k.startswith('__')), None)\n",
    "\n",
    "            # 动态编译测试函数\n",
    "            test_globals = {}\n",
    "            exec(test_code, test_globals)\n",
    "            test = next(\n",
    "                (v for k,v in test_globals.items() \n",
    "                 if callable(v) and not k.startswith('__')), None)\n",
    "\n",
    "            # 执行测试逻辑\n",
    "            if func and test:\n",
    "                queue.put(test(func))\n",
    "            else:\n",
    "                queue.put(False)\n",
    "        except:\n",
    "            queue.put(False)\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    # 测试数据\n",
    "    functions = {\n",
    "        \"func1\": {\n",
    "            \"code\": \"def func1():\\n    return 'ok'\", \n",
    "            \"main_function_name\": \"func1\"\n",
    "        }\n",
    "    }\n",
    "\n",
    "    test_cases = {\n",
    "        \"test_ok\": {\n",
    "            \"test_function\": (\n",
    "                \"def test_ok(func):\\n\"\n",
    "                \"    return func() == 'ok'\"\n",
    "            )\n",
    "        }\n",
    "    }\n",
    "\n",
    "    # 执行测试\n",
    "    runner = CodeRunner()\n",
    "    func_results, test_results = runner.run_all_tests(\n",
    "        functions, \n",
    "        test_cases,\n",
    "        timeout=3\n",
    "    )\n",
    "\n",
    "    print(\"Function Results:\")\n",
    "    for func_id, results in func_results.items():\n",
    "        print(f\"  {func_id}:\")\n",
    "        for test_id, result in results.items():\n",
    "            print(f\"    {test_id}: {result}\")\n",
    "\n",
    "    print(\"\\nTest Results:\")\n",
    "    for test_id, results in test_results.items():\n",
    "        print(f\"  {test_id}:\")\n",
    "        for func_id, result in results.items():\n",
    "            print(f\"    {func_id}: {result}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Function Results: {'func1': {'test_sleep_10': False, 'test_sleep_2': False, 'test_sleep_5': False}}\n",
      "Test Results: {'test_sleep_2': {'func1': False}, 'test_sleep_5': {'func1': False}, 'test_sleep_10': {'func1': False}}\n"
     ]
    }
   ],
   "source": [
    "import time\n",
    "from multiprocessing import Process, Queue\n",
    "from concurrent.futures import ThreadPoolExecutor\n",
    "\n",
    "def run_test_in_process(func_code, main_function_name, test_code, output_queue):\n",
    "    \"\"\"\n",
    "    在子进程中运行单个测试：\n",
    "      1. 通过 exec 加载函数代码，并获取待测试的主函数；\n",
    "      2. 加载测试代码，并从中获取测试函数（这里假定测试函数的名称以 \"test_\" 开头）；\n",
    "      3. 调用测试函数，将主函数传入，并将返回值放入结果队列；\n",
    "      4. 出现异常时返回 False。\n",
    "    \"\"\"\n",
    "    try:\n",
    "        local_env = {}\n",
    "        # 加载被测试函数代码\n",
    "        exec(func_code, local_env)\n",
    "        main_func = local_env[main_function_name]\n",
    "        # 加载测试用例代码\n",
    "        exec(test_code, local_env)\n",
    "        test_func = None\n",
    "        # 这里假设测试代码中仅定义了一个以“test_”开头的函数\n",
    "        for key, value in local_env.items():\n",
    "            if callable(value) and key.startswith(\"test_\"):\n",
    "                test_func = value\n",
    "                break\n",
    "        if test_func is None:\n",
    "            output_queue.put(False)\n",
    "            return\n",
    "        # 执行测试：若返回 True 表示通过，否则不通过\n",
    "        result = test_func(main_func)\n",
    "        output_queue.put(result)\n",
    "    except Exception as e:\n",
    "        output_queue.put(False)\n",
    "\n",
    "def run_single_test(func_code, main_function_name, test_code, timeout):\n",
    "    \"\"\"\n",
    "    为单个测试用例创建子进程运行，并在超时（或执行异常）时返回 False 。\n",
    "    \"\"\"\n",
    "    output_queue = Queue()\n",
    "    p = Process(target=run_test_in_process, args=(func_code, main_function_name, test_code, output_queue))\n",
    "    start_time = time.time()\n",
    "    p.start()\n",
    "    # 等待指定的 timeout 秒\n",
    "    p.join(timeout)\n",
    "    elapsed = time.time() - start_time\n",
    "    # 如果仍在运行或耗时超过 timeout，则结束进程，并返回 False\n",
    "    if p.is_alive() or elapsed > timeout:\n",
    "        p.terminate()\n",
    "        p.join()\n",
    "        return False\n",
    "    # 否则从队列中获取结果\n",
    "    if not output_queue.empty():\n",
    "        return output_queue.get()\n",
    "    return False\n",
    "\n",
    "class CodeRunner:\n",
    "    def __init__(self, max_workers=10):\n",
    "        self.max_workers = max_workers\n",
    "\n",
    "    def run_all_tests(self, functions, test_cases, timeout):\n",
    "        \"\"\"\n",
    "        对每个函数与每个测试用例组合并行运行测试：\n",
    "            - functions: 格式为 { 'func_name': {'code': ..., 'main_function_name': ...} }\n",
    "            - test_cases: 格式为 { 'test_name': {'test_function': ...} }\n",
    "            - timeout: 每个测试的最大运行时间（秒）\n",
    "            \n",
    "        返回两个字典：fun_results 和 test_results\n",
    "        \"\"\"\n",
    "        # 构造存放测试结果的字典\n",
    "        fun_results = {func_name: {} for func_name in functions}\n",
    "        test_results = {test_name: {} for test_name in test_cases}\n",
    "\n",
    "        # 为每个（函数，测试）组合创建任务\n",
    "        tasks = []\n",
    "        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:\n",
    "            for func_name, func_info in functions.items():\n",
    "                func_code = func_info['code']\n",
    "                main_function_name = func_info['main_function_name']\n",
    "                for test_name, test_info in test_cases.items():\n",
    "                    test_code = test_info['test_function']\n",
    "                    tasks.append((\n",
    "                        func_name, test_name,\n",
    "                        executor.submit(run_single_test, func_code, main_function_name, test_code, timeout)\n",
    "                    ))\n",
    "            # 收集各个任务的测试结果\n",
    "            for func_name, test_name, future in tasks:\n",
    "                try:\n",
    "                    result = future.result()\n",
    "                except Exception:\n",
    "                    result = False\n",
    "                fun_results[func_name][test_name] = result\n",
    "                test_results[test_name][func_name] = result\n",
    "        return fun_results, test_results\n",
    "\n",
    "# ---------------------------\n",
    "# 以下为数据样例（包含6个测试用例，用以匹配预期输出）\n",
    "\n",
    "functions = {\n",
    "    \"func1\": {\n",
    "        \"code\": \"def func1():\\n    return 'ok'\",\n",
    "        \"main_function_name\": \"func1\"\n",
    "    }\n",
    "}\n",
    "\n",
    "test_cases = {\n",
    "    \"test_sleep_1\": {\n",
    "         \"test_function\": (\n",
    "             \"def test_sleep(func):\\n\"\n",
    "             \"    import time\\n\"\n",
    "             \"    time.sleep(1)\\n\"\n",
    "             \"    return func() == 'ok'\"\n",
    "         )\n",
    "    },\n",
    "    \"test_sleep_2\": {\n",
    "         \"test_function\": (\n",
    "             \"def test_sleep(func):\\n\"\n",
    "             \"    import time\\n\"\n",
    "             \"    time.sleep(2)\\n\"\n",
    "             \"    return func() == 'ok'\"\n",
    "         )\n",
    "    },\n",
    "    \"test_sleep_3\": {\n",
    "         \"test_function\": (\n",
    "             \"def test_sleep(func):\\n\"\n",
    "             \"    import time\\n\"\n",
    "             \"    time.sleep(3)\\n\"\n",
    "             \"    return func() == 'ok'\"\n",
    "         )\n",
    "    },\n",
    "    \"test_sleep_5\": {\n",
    "         \"test_function\": (\n",
    "             \"def test_sleep(func):\\n\"\n",
    "             \"    import time\\n\"\n",
    "             \"    time.sleep(5)\\n\"\n",
    "             \"    return func() == 'ok'\"\n",
    "         )\n",
    "    },\n",
    "    \"test_sleep_10\": {\n",
    "         \"test_function\": (\n",
    "             \"def test_sleep(func):\\n\"\n",
    "             \"    import time\\n\"\n",
    "             \"    time.sleep(10)\\n\"\n",
    "             \"    return func() == 'ok'\"\n",
    "         )\n",
    "    },\n",
    "    \"test_sleep_11\": {\n",
    "         \"test_function\": (\n",
    "             \"def test_sleep(func):\\n\"\n",
    "             \"    import time\\n\"\n",
    "             \"    time.sleep(11)\\n\"\n",
    "             \"    return func() == 'ok'\"\n",
    "         )\n",
    "    }\n",
    "}\n",
    "\n",
    "# 运行代码示例\n",
    "if __name__ == \"__main__\":\n",
    "    # 设置超时时间为5秒\n",
    "    runner = CodeRunner(max_workers=10)\n",
    "    fun_results, test_results = runner.run_all_tests(functions, test_cases, timeout=5)\n",
    "\n",
    "    print(\"Function Results:\", fun_results)\n",
    "    print(\"Test Results:\", test_results)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Function Results: {'func1': {'test_sleep_10': False, 'test_sleep_2': False, 'test_sleep_5': False}}\n",
      "Test Results: {'test_sleep_2': {'func1': False}, 'test_sleep_5': {'func1': False}, 'test_sleep_10': {'func1': False}}\n"
     ]
    }
   ],
   "source": [
    "import concurrent.futures\n",
    "import multiprocessing\n",
    "import threading\n",
    "import time\n",
    "\n",
    "class CodeRunner:\n",
    "    def __init__(self, max_workers=10):\n",
    "        self.max_workers = max_workers\n",
    "    \n",
    "    def run_all_tests(self, functions, test_cases, timeout):\n",
    "        tasks = []\n",
    "        for func_name in functions:\n",
    "            func_info = functions[func_name]\n",
    "            for test_name in test_cases:\n",
    "                tasks.append((\n",
    "                    func_name,\n",
    "                    test_name,\n",
    "                    func_info['code'],\n",
    "                    func_info['main_function_name'],\n",
    "                    test_cases[test_name]['test_function']\n",
    "                ))\n",
    "        \n",
    "        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:\n",
    "            futures = []\n",
    "            for task in tasks:\n",
    "                futures.append(executor.submit(\n",
    "                    self._run_single_test,\n",
    "                    *task[2:],  # func_code, main_func_name, test_code\n",
    "                    timeout\n",
    "                ))\n",
    "            \n",
    "            function_results = {fn: {} for fn in functions}\n",
    "            test_results = {tn: {} for tn in test_cases}\n",
    "            \n",
    "            for (func_name, test_name, *_), future in zip(tasks, futures):\n",
    "                result = future.result()\n",
    "                function_results[func_name][test_name] = result\n",
    "                test_results[test_name][func_name] = result\n",
    "        \n",
    "        return function_results, test_results\n",
    "    \n",
    "    def _run_single_test(self, func_code, main_func_name, test_code, timeout):\n",
    "        manager = multiprocessing.Manager()\n",
    "        queue = manager.Queue()\n",
    "        process = multiprocessing.Process(\n",
    "            target=self._worker_process,\n",
    "            args=(func_code, main_func_name, test_code, timeout, queue)\n",
    "        )\n",
    "        process.start()\n",
    "        process.join(timeout + 1)  # 稍等片刻确保进程清理\n",
    "        \n",
    "        if process.is_alive():\n",
    "            process.terminate()\n",
    "            process.join()\n",
    "            return False\n",
    "        \n",
    "        return queue.get() if not queue.empty() else False\n",
    "    \n",
    "    def _worker_process(self, func_code, main_func_name, test_code, timeout, queue):\n",
    "        try:\n",
    "            # 动态加载被测试函数\n",
    "            func_namespace = {}\n",
    "            exec(func_code, func_namespace)\n",
    "            func = func_namespace[main_func_name]\n",
    "            \n",
    "            # 动态加载测试函数\n",
    "            test_namespace = {}\n",
    "            exec(test_code, test_namespace)\n",
    "            test_func = test_namespace['test_sleep']\n",
    "            \n",
    "            # 使用线程控制执行时间\n",
    "            result_container = []\n",
    "            event = threading.Event()\n",
    "            \n",
    "            def worker_thread():\n",
    "                try:\n",
    "                    result_container.append(test_func(func))\n",
    "                except:\n",
    "                    result_container.append(False)\n",
    "                finally:\n",
    "                    event.set()\n",
    "            \n",
    "            thread = threading.Thread(target=worker_thread)\n",
    "            thread.daemon = True\n",
    "            thread.start()\n",
    "            \n",
    "            event.wait(timeout)\n",
    "            \n",
    "            if not event.is_set() or not result_container:\n",
    "                queue.put(False)\n",
    "            else:\n",
    "                queue.put(bool(result_container[0]))\n",
    "        except:\n",
    "            queue.put(False)\n",
    "\n",
    "# 测试样例\n",
    "if __name__ == \"__main__\":\n",
    "    functions = {\n",
    "        \"func1\": {\n",
    "            \"code\": \"def func1():\\n    return 'ok'\", \n",
    "            \"main_function_name\": \"func1\"\n",
    "        }\n",
    "    }\n",
    "\n",
    "    test_cases = {\n",
    "        \"test_sleep_2\": {\n",
    "            \"test_function\": (\n",
    "                \"def test_sleep(func):\\n\"\n",
    "                \"    import time\\n\"\n",
    "                \"    time.sleep(2)\\n\"\n",
    "                \"    return func() == 'ok'\"\n",
    "            )\n",
    "        },\n",
    "        \"test_sleep_5\": {\n",
    "            \"test_function\": (\n",
    "                \"def test_sleep(func):\\n\"\n",
    "                \"    import time\\n\"\n",
    "                \"    time.sleep(5)\\n\"\n",
    "                \"    return func() == 'ok'\"\n",
    "            )\n",
    "        },\n",
    "        \"test_sleep_10\": {\n",
    "            \"test_function\": (\n",
    "                \"def test_sleep(func):\\n\"\n",
    "                \"    import time\\n\"\n",
    "                \"    time.sleep(10)\\n\"\n",
    "                \"    return func() == 'ok'\"\n",
    "            )\n",
    "        }\n",
    "    }\n",
    "\n",
    "    runner = CodeRunner(max_workers=10)\n",
    "    fun_results, test_results = runner.run_all_tests(functions, test_cases, timeout=5)\n",
    "    \n",
    "    print(\"Function Results:\", {k: dict(sorted(v.items())) for k, v in fun_results.items()})\n",
    "    print(\"Test Results:\", {k: dict(sorted(v.items())) for k, v in test_results.items()})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Running tests: 100%|██████████| 3/3 [00:00<00:00, 2994.51it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Function Results: {'func1': {'test_sleep_5': {'status': 'error', 'message': \"'tuple' object is not callable\"}, 'test_sleep_2': {'status': 'error', 'message': \"'tuple' object is not callable\"}, 'test_sleep_10': {'status': 'error', 'message': \"'tuple' object is not callable\"}}}\n",
      "Test Results: {'test_sleep_2': {'func1': {'status': 'error', 'message': \"'tuple' object is not callable\"}}, 'test_sleep_5': {'func1': {'status': 'error', 'message': \"'tuple' object is not callable\"}}, 'test_sleep_10': {'func1': {'status': 'error', 'message': \"'tuple' object is not callable\"}}}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "class CodeRunner:\n",
    "    def __init__(self, max_workers=5):\n",
    "        self.max_workers = max_workers\n",
    "\n",
    "    def run_test(self, func_obj, test_func):\n",
    "        try:\n",
    "            result = test_func(func_obj)\n",
    "            return {'status': 'passed', 'result': result}\n",
    "        except AssertionError as e:\n",
    "            return {'status': 'failed', 'message': str(e)}\n",
    "        except Exception as e:\n",
    "            return {'status': 'error', 'message': str(e)}\n",
    "\n",
    "    def compile_code(self, code_str, main_function_name=None):\n",
    "        try:\n",
    "            local_vars = {}\n",
    "            exec(code_str, local_vars)\n",
    "            if main_function_name is not None:\n",
    "                func = local_vars.get(main_function_name)\n",
    "                return func if callable(func) else None\n",
    "            return next((obj for obj in local_vars.values() if callable(obj)), None)\n",
    "        except Exception as e:\n",
    "            print(f\"Compilation Error: {str(e)}, code_str:\\n {code_str}\")\n",
    "            return None\n",
    "\n",
    "    def run_all_tests(self, functions, test_cases, max_workers=5, timeout=1):\n",
    "        compiled_functions = {\n",
    "            fid: self.compile_code(\n",
    "                code_info['code'],\n",
    "                main_function_name=code_info.get('main_function_name')\n",
    "            )\n",
    "            for fid, code_info in functions.items()\n",
    "        }\n",
    "        \n",
    "        compiled_tests = {\n",
    "            tid: self.compile_code(code_info['test_function'])\n",
    "            for tid, code_info in test_cases.items()\n",
    "        }\n",
    "\n",
    "        fun_results = {fid: {} for fid in functions}\n",
    "        test_results = {tid: {} for tid in test_cases}\n",
    "\n",
    "        total_tests = len(functions) * len(test_cases)\n",
    "        \n",
    "        # 创建线程池时不使用 with 语句，避免隐式等待\n",
    "        executor = concurrent.futures.ThreadPoolExecutor(max_workers=max_workers)\n",
    "        try:\n",
    "            futures = {}\n",
    "            pbar = tqdm(total=total_tests, desc=\"Running tests\")\n",
    "            \n",
    "            # 提交所有任务\n",
    "            for func_id, func_obj in compiled_functions.items():\n",
    "                for test_id, test_func in compiled_tests.items():\n",
    "                    if func_obj is None or test_func is None:\n",
    "                        fun_results[func_id][test_id] = False\n",
    "                        test_results[test_id][func_id] = False\n",
    "                        pbar.update(1)\n",
    "                        continue\n",
    "                    \n",
    "                    future = executor.submit(self.run_test, func_obj, test_func)\n",
    "                    futures[future] = (func_id, test_id)\n",
    "\n",
    "            # 等待所有任务完成或达到全局超时\n",
    "            done, not_done = concurrent.futures.wait(\n",
    "                futures.keys(),\n",
    "                timeout=timeout,\n",
    "                return_when=concurrent.futures.ALL_COMPLETED\n",
    "            )\n",
    "\n",
    "            # 处理已完成任务\n",
    "            for future in done:\n",
    "                func_id, test_id = futures[future]\n",
    "                try:\n",
    "                    result = future.result()\n",
    "                except Exception:\n",
    "                    result = False\n",
    "                fun_results[func_id][test_id] = result\n",
    "                test_results[test_id][func_id] = result\n",
    "                pbar.update(1)\n",
    "\n",
    "            # 标记未完成任务为超时\n",
    "            for future in not_done:\n",
    "                func_id, test_id = futures[future]\n",
    "                fun_results[func_id][test_id] = False\n",
    "                test_results[test_id][func_id] = False\n",
    "                pbar.update(1)\n",
    "\n",
    "            pbar.close()\n",
    "            return fun_results, test_results\n",
    "\n",
    "        finally:\n",
    "            # 强制关闭执行器，不等待后台线程完成\n",
    "            executor.shutdown(wait=False)\n",
    "\n",
    "# Configure logging\n",
    "logging.basicConfig(level=logging.WARNING)\n",
    "\n",
    "# Sample data\n",
    "functions = {\n",
    "    \"func1\": {\n",
    "        \"code\": \"def func1():\\n    return 'ok'\", \n",
    "        \"main_function_name\": \"func1\"\n",
    "    }\n",
    "}\n",
    "\n",
    "test_cases = {\n",
    "    \"test_sleep_2\": {\n",
    "         \"test_function\": (\n",
    "             \"def test_sleep(func):\\n\"\n",
    "             \"    import time\\n\"\n",
    "             \"    time.sleep(2)\\n\"\n",
    "             \"    return func() == 'ok'\"\n",
    "         )\n",
    "    },\n",
    "    \"test_sleep_5\": {\n",
    "         \"test_function\": (\n",
    "             \"def test_sleep(func):\\n\"\n",
    "             \"    import time\\n\"\n",
    "             \"    time.sleep(5)\\n\"\n",
    "             \"    return func() == 'ok'\"\n",
    "         )\n",
    "    },\n",
    "        \"test_sleep_10\": {\n",
    "         \"test_function\": (\n",
    "             \"def test_sleep(func):\\n\"\n",
    "             \"    import time\\n\"\n",
    "             \"    time.sleep(10)\\n\"\n",
    "             \"    return func() == 'ok'\"\n",
    "         )\n",
    "    }\n",
    "}\n",
    "\n",
    "# Run the tests\n",
    "runner = CodeRunner(max_workers=10)\n",
    "fun_results, test_results = runner.run_all_tests(functions, test_cases, timeout=5)\n",
    "\n",
    "print(\"Function Results:\", fun_results)\n",
    "print(\"Test Results:\", test_results)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Task A failed: A process in the process pool was terminated abruptly while the future was running or pending.\n",
      "Task B failed: A process in the process pool was terminated abruptly while the future was running or pending.\n"
     ]
    }
   ],
   "source": [
    "import concurrent.futures\n",
    "import multiprocessing\n",
    "import os\n",
    "import time\n",
    "import signal\n",
    "\n",
    "def task(name, duration, pid_dict):\n",
    "    pid_dict[name] = os.getpid()  # 存储当前进程PID\n",
    "    print(f\"Task {name} started (PID: {pid_dict[name]})\")\n",
    "    time.sleep(duration)\n",
    "    print(f\"Task {name} completed\")\n",
    "    return f\"Result of {name}\"\n",
    "\n",
    "def run_with_timeout():\n",
    "    manager = multiprocessing.Manager()\n",
    "    pid_dict = manager.dict()  # 共享字典存储PID\n",
    "    \n",
    "    tasks = [(\"A\", 5), (\"B\", 2)]\n",
    "    timeout = 3  # 3秒超时\n",
    "    \n",
    "    with concurrent.futures.ProcessPoolExecutor() as executor:\n",
    "        futures = {\n",
    "            executor.submit(task, name, duration, pid_dict): name\n",
    "            for name, duration in tasks\n",
    "        }\n",
    "        \n",
    "        try:\n",
    "            for future in concurrent.futures.as_completed(futures, timeout=timeout):\n",
    "                name = futures[future]\n",
    "                try:\n",
    "                    result = future.result()\n",
    "                    print(result)\n",
    "                except Exception as e:\n",
    "                    print(f\"Task {name} failed: {e}\")\n",
    "        except concurrent.futures.TimeoutError:\n",
    "            print(f\"\\nTimeout after {timeout} seconds! Killing remaining tasks...\")\n",
    "            for future in futures:\n",
    "                if not future.done():\n",
    "                    name = futures[future]\n",
    "                    pid = pid_dict.get(name)\n",
    "                    if pid:\n",
    "                        print(f\"Terminating Task {name} (PID: {pid})\")\n",
    "                        try:\n",
    "                            os.kill(pid, signal.SIGTERM)  # 发送终止信号\n",
    "                        except ProcessLookupError:\n",
    "                            print(f\"Task {name} already terminated\")\n",
    "                        except Exception as e:\n",
    "                            print(f\"Failed to kill Task {name}: {e}\")\n",
    "\n",
    "run_with_timeout()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Parsed Function Results: {'func1': {'test_sleep_2': True, 'test_sleep_5': True, 'test_sleep_10': False, 'test_sleep_20': False, 'test_sleep_40': False}}\n",
      "Parsed Test Results: {'test_sleep_2': {'func1': True}, 'test_sleep_5': {'func1': True}, 'test_sleep_10': {'func1': False}, 'test_sleep_20': {'func1': False}, 'test_sleep_40': {'func1': False}}\n",
      "Full Output:\n",
      " Function Results: {'func1': {'test_sleep_2': True, 'test_sleep_5': True, 'test_sleep_10': False, 'test_sleep_20': False, 'test_sleep_40': False}}\n",
      "Test Results: {'test_sleep_2': {'func1': True}, 'test_sleep_5': {'func1': True}, 'test_sleep_10': {'func1': False}, 'test_sleep_20': {'func1': False}, 'test_sleep_40': {'func1': False}}\n",
      "Execution Time: 8.129323244094849\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import subprocess\n",
    "\n",
    "def run_code_runner_in_subprocess(functions, test_cases, max_workers=10, timeout=5, script_path=\"code_runner.py\"):\n",
    "    # 将字典转换为 JSON 字符串\n",
    "    functions_json = json.dumps(functions)\n",
    "    test_cases_json = json.dumps(test_cases)\n",
    "    \n",
    "    cmd = [\n",
    "        \"python\", script_path,\n",
    "        \"--functions\", functions_json,\n",
    "        \"--test_cases\", test_cases_json,\n",
    "        \"--max_workers\", str(max_workers),\n",
    "        \"--timeout\", str(timeout)\n",
    "    ]\n",
    "    \n",
    "    result = subprocess.run(cmd, capture_output=True, text=True)\n",
    "    \n",
    "    output_lines = result.stdout.strip().splitlines()\n",
    "    function_results = {}\n",
    "    test_results = {}\n",
    "    \n",
    "    for line in output_lines:\n",
    "        if line.startswith(\"Function Results:\"):\n",
    "            # 取冒号后的部分作为字典字符串\n",
    "            dict_str = line[len(\"Function Results:\"):].strip()\n",
    "            try:\n",
    "                function_results = eval(dict_str)\n",
    "            except Exception as e:\n",
    "                print(\"Error evaluating function results:\", e)\n",
    "        elif line.startswith(\"Test Results:\"):\n",
    "            dict_str = line[len(\"Test Results:\"):].strip()\n",
    "            try:\n",
    "                test_results = eval(dict_str)\n",
    "            except Exception as e:\n",
    "                print(\"Error evaluating test results:\", e)\n",
    "                \n",
    "    return function_results, test_results, result.stdout\n",
    "\n",
    "# 定义示例输入\n",
    "functions = {\n",
    "    \"func1\": {\n",
    "        \"code\": \"def func1():\\n    return 'ok'\",\n",
    "        \"main_function_name\": \"func1\"\n",
    "    }\n",
    "}\n",
    "\n",
    "test_cases = {\n",
    "    \"test_sleep_2\": {\n",
    "        \"test_function\": (\n",
    "            \"def test_sleep(func):\\n\"\n",
    "            \"    import time\\n\"\n",
    "            \"    time.sleep(2)\\n\"\n",
    "            \"    return func() == 'ok'\"\n",
    "        )\n",
    "    },\n",
    "    \"test_sleep_5\": {\n",
    "        \"test_function\": (\n",
    "            \"def test_sleep(func):\\n\"\n",
    "            \"    import time\\n\"\n",
    "            \"    time.sleep(5)\\n\"\n",
    "            \"    return func() == 'ok'\"\n",
    "        )\n",
    "    },\n",
    "    \"test_sleep_10\": {\n",
    "        \"test_function\": (\n",
    "            \"def test_sleep(func):\\n\"\n",
    "            \"    import time\\n\"\n",
    "            \"    time.sleep(10)\\n\"\n",
    "            \"    return func() == 'ok'\"\n",
    "        )\n",
    "    },\n",
    "    \"test_sleep_20\": {\n",
    "        \"test_function\": (\n",
    "            \"def test_sleep(func):\\n\"\n",
    "            \"    import time\\n\"\n",
    "            \"    time.sleep(20)\\n\"\n",
    "            \"    return func() == 'ok'\"\n",
    "        )\n",
    "    },\n",
    "    \"test_sleep_40\": {\n",
    "        \"test_function\": (\n",
    "            \"def test_sleep(func):\\n\"\n",
    "            \"    import time\\n\"\n",
    "            \"    time.sleep(40)\\n\"\n",
    "            \"    return func() == 'ok'\"\n",
    "        )\n",
    "    }\n",
    "}\n",
    "\n",
    "# 通过subprocess调用外部脚本\n",
    "fr, tr, full_out = run_code_runner_in_subprocess(functions, test_cases, max_workers=10, timeout=5.1)\n",
    "print(\"Parsed Function Results:\", fr)\n",
    "print(\"Parsed Test Results:\", tr)\n",
    "print(\"Full Output:\\n\", full_out)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Full Output from subprocess:\n",
      "{\"function_results\": {\"func1\": {\"test_sleep_2\": true, \"test_sleep_5\": true, \"test_sleep_10\": false}}, \"test_results\": {\"test_sleep_2\": {\"func1\": true}, \"test_sleep_5\": {\"func1\": true}, \"test_sleep_10\": {\"func1\": false}}, \"execution_time\": 7.8130621910095215}\n",
      "\n",
      "Parsed Function Results: {'func1': {'test_sleep_2': True, 'test_sleep_5': True, 'test_sleep_10': False}}\n",
      "Parsed Test Results: {'test_sleep_2': {'func1': True}, 'test_sleep_5': {'func1': True}, 'test_sleep_10': {'func1': False}}\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import subprocess\n",
    "\n",
    "class CodeRunner:\n",
    "    def __init__(self, max_workers=5):\n",
    "        self.max_workers = max_workers\n",
    "\n",
    "    def run_code_runner_in_subprocess(self, functions, test_cases, max_workers=None, timeout=30, script_path=\"code_runner_test.py\"):\n",
    "        if max_workers is None:\n",
    "            max_workers = self.max_workers\n",
    "\n",
    "        # 将字典转换为 JSON 字符串\n",
    "        functions_json = json.dumps(functions)\n",
    "        test_cases_json = json.dumps(test_cases)\n",
    "        \n",
    "        cmd = [\n",
    "            \"python\", script_path,\n",
    "            \"--functions\", functions_json,\n",
    "            \"--test_cases\", test_cases_json,\n",
    "            \"--max_workers\", str(max_workers),\n",
    "            \"--timeout\", str(timeout)\n",
    "        ]\n",
    "        \n",
    "        result = subprocess.run(cmd, capture_output=True, text=True)\n",
    "        \n",
    "        try:\n",
    "            data = json.loads(result.stdout)\n",
    "            function_results = data.get(\"function_results\", {})\n",
    "            test_results = data.get(\"test_results\", {})\n",
    "            execution_time = data.get(\"execution_time\", None)\n",
    "        except Exception as e:\n",
    "            print(\"Error parsing JSON output from subprocess:\", e)\n",
    "            print(\"Subprocess stdout:\", result.stdout)\n",
    "            function_results, test_results, execution_time = {}, {}, None\n",
    "                    \n",
    "        return function_results, test_results, result.stdout\n",
    "\n",
    "    def run_all_tests(self, functions, test_cases, max_workers=5, timeout=5):\n",
    "        fr, tr, full_out = self.run_code_runner_in_subprocess(functions, test_cases, max_workers, timeout)\n",
    "        \n",
    "        # debug 打印完整输出\n",
    "        print(\"Full Output from subprocess:\")\n",
    "        print(full_out)\n",
    "        return fr, tr\n",
    "\n",
    "# Sample fixtures: one trivial function plus three tests that sleep for\n",
    "# 2, 5 and 10 seconds before checking the function's return value.\n",
    "functions = {\n",
    "    \"func1\": dict(\n",
    "        code=\"def func1():\\n    return 'ok'\",\n",
    "        main_function_name=\"func1\",\n",
    "    ),\n",
    "}\n",
    "\n",
    "test_cases = {\n",
    "    f\"test_sleep_{seconds}\": {\n",
    "        \"test_function\": (\n",
    "            \"def test_sleep(func):\\n\"\n",
    "            \"    import time\\n\"\n",
    "            f\"    time.sleep({seconds})\\n\"\n",
    "            \"    return func() == 'ok'\"\n",
    "        )\n",
    "    }\n",
    "    for seconds in (2, 5, 10)\n",
    "}\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    # Execute the tests by invoking the external script in a subprocess.\n",
    "    code_runner = CodeRunner(max_workers=10)\n",
    "    fr, tr = code_runner.run_all_tests(functions, test_cases, timeout=5.1)\n",
    "    print(\"Parsed Function Results:\", fr)\n",
    "    print(\"Parsed Test Results:\", tr)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Testing Progress: 100%|##########| 3/3"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "PROGRESS_TASK: 1/3\n",
      "PROGRESS_TASK: 2/3\n",
      "PROGRESS_TASK: 3/3\n",
      "FUNCTION_RESULTS:{\"func1\": {\"test_sleep_2\": true, \"test_sleep_5\": true, \"test_sleep_10\": false}}\n",
      "TEST_RESULTS:{\"test_sleep_2\": {\"func1\": true}, \"test_sleep_5\": {\"func1\": true}, \"test_sleep_10\": {\"func1\": false}}\n",
      "EXECUTION_TIME:6.9023637771606445\n",
      "Parsed Function Results: {'func1': {'test_sleep_2': True, 'test_sleep_5': True, 'test_sleep_10': False}}\n",
      "Parsed Test Results: {'test_sleep_2': {'func1': True}, 'test_sleep_5': {'func1': True}, 'test_sleep_10': {'func1': False}}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import subprocess\n",
    "from tqdm import tqdm\n",
    "\n",
    "class CodeRunner:\n",
    "    \"\"\"Run candidate functions against test cases through a helper script.\n",
    "\n",
    "    The helper's output is scanned for three kinds of lines:\n",
    "    ``PROGRESS_TASK: <done>/<total>`` (drives a tqdm progress bar),\n",
    "    ``FUNCTION_RESULTS:<json>`` and ``TEST_RESULTS:<json>``.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, max_workers=5):\n",
    "        \"\"\"Store the default worker count forwarded to the helper script.\"\"\"\n",
    "        self.max_workers = max_workers\n",
    "\n",
    "    def run_code_runner_in_subprocess(self, functions, test_cases, max_workers=None, timeout=30, script_path=\"code_runner_test.py\"):\n",
    "        \"\"\"Execute ``script_path`` and parse its output while showing progress.\n",
    "\n",
    "        Args:\n",
    "            functions: JSON-serialisable object passed via ``--functions``.\n",
    "            test_cases: JSON-serialisable object passed via ``--test_cases``.\n",
    "            max_workers: Value for ``--max_workers``; a falsy value falls back\n",
    "                to ``self.max_workers``.\n",
    "            timeout: Value for ``--timeout`` (forwarded as a string).\n",
    "            script_path: Script that is run with ``python``.\n",
    "\n",
    "        Returns:\n",
    "            Tuple ``(func_results, test_results, transcript)``: the decoded\n",
    "            ``FUNCTION_RESULTS:`` and ``TEST_RESULTS:`` payloads (``{}`` when\n",
    "            no such line was printed) and all stripped output lines joined\n",
    "            with newlines.\n",
    "\n",
    "        Raises:\n",
    "            json.JSONDecodeError: If a result line does not contain valid JSON.\n",
    "        \"\"\"\n",
    "        max_workers = max_workers or self.max_workers\n",
    "        cmd = [\n",
    "            \"python\", script_path,\n",
    "            \"--functions\", json.dumps(functions),\n",
    "            \"--test_cases\", json.dumps(test_cases),\n",
    "            \"--max_workers\", str(max_workers),\n",
    "            \"--timeout\", str(timeout)\n",
    "        ]\n",
    "\n",
    "        pbar = None\n",
    "        total_tasks = None\n",
    "        output = []\n",
    "        # The context manager closes the pipe and waits for the child even if\n",
    "        # reading or progress handling raises.\n",
    "        with subprocess.Popen(\n",
    "            cmd,\n",
    "            stdout=subprocess.PIPE,\n",
    "            stderr=subprocess.STDOUT,\n",
    "            text=True,\n",
    "            bufsize=1\n",
    "        ) as process:\n",
    "            try:\n",
    "                for raw_line in process.stdout:\n",
    "                    line = raw_line.strip()\n",
    "                    output.append(line)\n",
    "\n",
    "                    if not line.startswith(\"PROGRESS_TASK: \"):\n",
    "                        continue\n",
    "\n",
    "                    # Parse \"<current>/<total>\"; skip malformed progress lines.\n",
    "                    progress_part = line.split(\"PROGRESS_TASK: \")[1]\n",
    "                    try:\n",
    "                        current, total = map(int, progress_part.split('/'))\n",
    "                    except ValueError:\n",
    "                        continue\n",
    "\n",
    "                    # Create the bar on the first valid progress line. Compare\n",
    "                    # against None: a tqdm bar is falsy when its total is 0.\n",
    "                    if pbar is None:\n",
    "                        total_tasks = total\n",
    "                        pbar = tqdm(\n",
    "                            total=total,\n",
    "                            desc=\"Testing Progress\",\n",
    "                            bar_format=\"{l_bar}{bar}| {n_fmt}/{total_fmt}\",\n",
    "                            ascii=True\n",
    "                        )\n",
    "\n",
    "                    # Clamp into [0, total_tasks] before moving the bar.\n",
    "                    current = max(0, min(current, total_tasks))\n",
    "                    if pbar.n != current:\n",
    "                        pbar.n = current\n",
    "                        pbar.refresh()\n",
    "\n",
    "                # Output ended: show the bar as complete.\n",
    "                if pbar is not None:\n",
    "                    pbar.n = total_tasks\n",
    "                    pbar.refresh()\n",
    "            finally:\n",
    "                # Always release the bar, including on errors mid-stream.\n",
    "                if pbar is not None:\n",
    "                    pbar.close()\n",
    "\n",
    "        # Decode the final result lines; a later line overrides an earlier one.\n",
    "        func_results = {}\n",
    "        test_results = {}\n",
    "        for line in output:\n",
    "            if line.startswith(\"FUNCTION_RESULTS:\"):\n",
    "                func_results = json.loads(line[len(\"FUNCTION_RESULTS:\"):])\n",
    "            elif line.startswith(\"TEST_RESULTS:\"):\n",
    "                test_results = json.loads(line[len(\"TEST_RESULTS:\"):])\n",
    "\n",
    "        return func_results, test_results, \"\\n\".join(output)\n",
    "\n",
    "    def run_all_tests(self, functions, test_cases, max_workers=None, timeout=5):\n",
    "        \"\"\"Run the helper script, print its transcript and return the parsed results.\n",
    "\n",
    "        ``max_workers`` defaults to ``None`` so that the worker count given to\n",
    "        the constructor is honoured; pass a number to override it for one call.\n",
    "\n",
    "        Returns:\n",
    "            Tuple ``(func_results, test_results)``.\n",
    "        \"\"\"\n",
    "        func_results, test_results, transcript = self.run_code_runner_in_subprocess(\n",
    "            functions, test_cases, max_workers, timeout\n",
    "        )\n",
    "        print(transcript)\n",
    "        return func_results, test_results\n",
    "\n",
    "# Example usage is unchanged from the previous cell.\n",
    "# Sample fixtures: a trivial function and three sleeping tests (2 s, 5 s, 10 s).\n",
    "functions = {\n",
    "    \"func1\": {\n",
    "        \"code\": \"\\n\".join([\"def func1():\", \"    return 'ok'\"]),\n",
    "        \"main_function_name\": \"func1\",\n",
    "    },\n",
    "}\n",
    "\n",
    "test_cases = {\n",
    "    \"test_sleep_{}\".format(delay): {\n",
    "        \"test_function\": \"\\n\".join([\n",
    "            \"def test_sleep(func):\",\n",
    "            \"    import time\",\n",
    "            \"    time.sleep({})\".format(delay),\n",
    "            \"    return func() == 'ok'\",\n",
    "        ])\n",
    "    }\n",
    "    for delay in (2, 5, 10)\n",
    "}\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    # Execute the tests by invoking the external script in a subprocess.\n",
    "    code_runner = CodeRunner(max_workers=10)\n",
    "    fr, tr = code_runner.run_all_tests(functions, test_cases, timeout=5.1)\n",
    "    print(\"Parsed Function Results:\", fr)\n",
    "    print(\"Parsed Test Results:\", tr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "functions = {'code_0': {'code': 'def find_subsequences(nums):\\n    \"\"\"Generate all non-empty subsequences of the given integer array nums.\"\"\"\\n    subsequences = []\\n    \\n    # Using a recursive helper function to find all combinations\\n    def backtrack(start, current):\\n        # Add non-empty subsequence to the list\\n        if current:\\n            subsequences.append(current[:])  # Add a copy of the current subsequence\\n        \\n        for i in range(start, len(nums)):\\n            current.append(nums[i])  # Include the current number\\n            backtrack(i + 1, current)  # Recurse to include further elements\\n            current.pop()  # Exclude the current number and backtrack\\n    \\n    backtrack(0, [])\\n    return subsequences\\n\\ndef calculate_alternating_sum(subsequence):\\n    \"\"\"Calculate the alternating sum of the given subsequence.\"\"\"\\n    even_sum = 0\\n    odd_sum = 0\\n    \\n    for i in range(len(subsequence)):\\n        if i % 2 == 0:  # Even index\\n            even_sum += subsequence[i]\\n        else:  # Odd index\\n            odd_sum += subsequence[i]\\n    \\n    return even_sum - odd_sum\\n\\ndef maximize_product(valid_subsequences, limit):\\n    \"\"\"Maximize the product of numbers in a valid subsequence without exceeding the limit.\"\"\"\\n    max_product = -1\\n    \\n    for subsequence in valid_subsequences:\\n        product = 1\\n        for num in subsequence:\\n            product *= num\\n\\n        if product <= limit:\\n            max_product = max(max_product, product)\\n    \\n    return max_product\\n\\ndef find_max_product_with_alternating_sum(nums, k, limit):\\n    \"\"\"Find the maximum product of subsequences with a specific alternating sum.\"\"\"\\n    # Generate all non-empty subsequences\\n    subsequences = find_subsequences(nums)\\n    max_product = -1\\n    \\n    for subsequence in subsequences:\\n        # Calculate the alternating sum of the subsequence\\n        if 
calculate_alternating_sum(subsequence) == k:\\n            # If alternating sum matches k, compute possible max product\\n            max_product = max(max_product, maximize_product([subsequence], limit))\\n\\n    return max_product', 'plan': None, 'main_function_name': None}, 'code_1': {'code': 'from itertools import combinations\\nfrom typing import List\\n\\ndef find_subsequences(nums: List[int]) -> List[List[int]]:\\n    \"\"\"Generate all non-empty subsequences of a given integer array nums.\"\"\"\\n    subsequences = []\\n    n = len(nums)\\n    \\n    # Iterate through all possible lengths of subsequences\\n    for length in range(1, n + 1):\\n        # Generate all combinations of the current length\\n        for combo in combinations(nums, length):\\n            subsequences.append(list(combo))\\n    \\n    return subsequences\\n\\ndef calculate_alternating_sum(subsequence: List[int]) -> int:\\n    \"\"\"Calculate the alternating sum of a given subsequence.\"\"\"\\n    even_sum = 0\\n    odd_sum = 0\\n    \\n    # Calculate even and odd sums based on indices\\n    for index, value in enumerate(subsequence):\\n        if index % 2 == 0:\\n            even_sum += value\\n        else:\\n            odd_sum += value\\n            \\n    return even_sum - odd_sum\\n\\ndef maximize_product(subsequence: List[int], limit: int) -> int:\\n    \"\"\"Maximize the product of numbers in a valid subsequence without exceeding the limit.\"\"\"\\n    product = 1\\n    for number in subsequence:\\n        product *= number\\n    \\n    if product <= limit:\\n        return product\\n    return -1\\n\\ndef maximize_product_of_subsequences(nums: List[int], k: int, limit: int) -> int:\\n    \"\"\"Find the maximum product from subsequences with an alternating sum equal to k.\"\"\"\\n    max_product = -1  # Initialize max_product as an invalid state (-1)\\n    subsequences = find_subsequences(nums)\\n\\n    # Evaluate each subsequence\\n    for subsequence in subsequences:\\n    
    alternating_sum = calculate_alternating_sum(subsequence)\\n        # Check if the alternating sum equals k\\n        if alternating_sum == k:\\n            product = maximize_product(subsequence, limit)\\n            if product > max_product:\\n                max_product = product\\n                \\n    return max_product\\n\\n# Main function to integrate all components\\ndef find_max_product(nums: List[int], k: int, limit: int) -> int:\\n    return maximize_product_of_subsequences(nums, k, limit)', 'plan': None, 'main_function_name': None}, 'code_2': {'code': 'from itertools import combinations\\n\\ndef find_subsequences(nums):\\n    # Initialize an empty list to hold all valid subsequences\\n    subsequences = []\\n    \\n    # Explore all combinations of elements in the nums array\\n    for r in range(1, len(nums) + 1):\\n        for combo in combinations(nums, r):\\n            subsequences.append(combo)\\n    \\n    return subsequences\\n\\ndef calculate_alternating_sum(subsequence):\\n    even_sum = 0\\n    odd_sum = 0\\n    for index, value in enumerate(subsequence):\\n        if index % 2 == 0:\\n            even_sum += value\\n        else:\\n            odd_sum += value\\n    return even_sum - odd_sum\\n\\ndef maximize_product(subsequences, limit):\\n    max_product = -1\\n    for subsequence in subsequences:\\n        product = 1\\n        for number in subsequence:\\n            product *= number\\n        \\n        # Check if the product is within the limit and greater than the current max product\\n        if product <= limit and product > max_product:\\n            max_product = product\\n            \\n    return max_product\\n\\ndef maximize_alternating_sum_product(nums, k, limit):\\n    # Step 1: Find all non-empty subsequences of the nums array\\n    subsequences = find_subsequences(nums)\\n    \\n    max_product = -1\\n    \\n    # Step 2: For each subsequence, calculate the alternating sum\\n    for subsequence in subsequences:\\n       
 alternating_sum = calculate_alternating_sum(subsequence)\\n        \\n        # Step 3: Check if the alternating sum matches k\\n        if alternating_sum == k:\\n            # Step 4: Maximize product within the given limit\\n            max_product = max(max_product, maximize_product([subsequence], limit))\\n    \\n    return max_product', 'plan': None, 'main_function_name': None}, 'code_3': {'code': 'def find_subsequences(nums):\\n    # Function to find all non-empty subsequences of the given integer array\\n    from itertools import combinations\\n    \\n    subsequences = []\\n    n = len(nums)\\n    \\n    # Generate all non-empty combinations of indices\\n    for r in range(1, n + 1):\\n        for combo in combinations(nums, r):\\n            subsequences.append(combo)\\n    \\n    return subsequences\\n\\ndef calculate_alternating_sum(subsequence):\\n    # Function to calculate the alternating sum\\n    even_sum = 0\\n    odd_sum = 0\\n    \\n    for index, value in enumerate(subsequence):\\n        if index % 2 == 0:  # Even index\\n            even_sum += value\\n        else:               # Odd index\\n            odd_sum += value\\n            \\n    return even_sum - odd_sum\\n\\ndef maximize_product(valid_subsequences, limit):\\n    # Function to maximize the product of numbers in valid subsequences within a limit\\n    max_product = -1\\n    \\n    for subsequence in valid_subsequences:\\n        product = 1\\n        for number in subsequence:\\n            product *= number\\n        \\n        if product <= limit and product > max_product:\\n            max_product = product\\n            \\n    return max_product\\n\\ndef find_maximized_product(nums, k, limit):\\n    # Overall function to find a non-empty subsequence with alternating sum = k\\n    subsequences = find_subsequences(nums)\\n    max_product = -1\\n    \\n    for subsequence in subsequences:\\n        if calculate_alternating_sum(subsequence) == k:\\n            product = 
maximize_product([subsequence], limit)\\n            if product > max_product:\\n                max_product = product\\n    \\n    return max_product if max_product != -1 else -1', 'plan': None, 'main_function_name': None}, 'code_4': {'code': 'def find_subsequences(nums):\\n    \"\"\"\\n    Generate all non-empty subsequences of the given integer array nums.\\n    \\n    :param nums: List[int]\\n    :return: List[List[int]]\\n    \"\"\"\\n    subsequences = []\\n\\n    def backtrack(start, path):\\n        if path:  # If the current path is not empty\\n            subsequences.append(path)\\n        for i in range(start, len(nums)):\\n            backtrack(i + 1, path + [nums[i]])\\n\\n    backtrack(0, [])\\n    return subsequences\\n\\n\\ndef calculate_alternating_sum(subsequence):\\n    \"\"\"\\n    Calculate the alternating sum of the given subsequence.\\n    \\n    :param subsequence: List[int]\\n    :return: int\\n    \"\"\"\\n    even_sum = 0\\n    odd_sum = 0\\n\\n    for i, num in enumerate(subsequence):\\n        if i % 2 == 0:  # Even index\\n            even_sum += num\\n        else:  # Odd index\\n            odd_sum += num\\n\\n    return even_sum - odd_sum\\n\\n\\ndef maximize_product(subsequence, limit):\\n    \"\"\"\\n    Maximize the product of numbers in the subsequence without exceeding the given limit.\\n    \\n    :param subsequence: List[int]\\n    :param limit: int\\n    :return: int\\n    \"\"\"\\n    product = 1\\n    for num in subsequence:\\n        product *= num\\n    \\n    if product <= limit:\\n        return product\\n    else:\\n        return -1\\n\\n\\ndef find_max_product_with_alternating_sum(nums, k, limit):\\n    \"\"\"\\n    Find the maximum product of any subsequence whose alternating sum equals k, without exceeding the given limit.\\n    \\n    :param nums: List[int]\\n    :param k: int\\n    :param limit: int\\n    :return: int\\n    \"\"\"\\n    max_product = -1\\n    subsequences = find_subsequences(nums)\\n\\n    for 
subsequence in subsequences:\\n        if calculate_alternating_sum(subsequence) == k:\\n            product = maximize_product(subsequence, limit)\\n            if product > max_product:\\n                max_product = product\\n\\n    return max_product\\n\\n\\n# Test function\\ndef test_case(func):\\n    # Case 1: Typical input with a valid subsequence\\n    nums1 = [1, 2, 3]\\n    k1 = 2\\n    limit1 = 10\\n    expected1 = 6  # [1, 2, 3] gives an alternating sum of 2 and product of 6\\n    result1 = func(nums1, k1, limit1)\\n    if result1 != expected1:\\n        return False\\n\\n    # Case 2: Input where no valid subsequence exists\\n    nums2 = [0, 2, 3]\\n    k2 = -5\\n    limit2 = 12\\n    expected2 = -1  # No subsequence can give an alternating sum of -5\\n    result2 = func(nums2, k2, limit2)\\n    if result2 != expected2:\\n        return False\\n\\n    # Case 3: Larger array with repeated values\\n    nums3 = [2, 2, 3, 3]\\n    k3 = 0\\n    limit3 = 9\\n    expected3 = 9  # Subsequence [3, 3] gives an alternating sum of 0 and product of 9\\n    result3 = func(nums3, k3, limit3)\\n    if result3 != expected3:\\n        return False\\n\\n    return True\\n\\n\\n# Overall function to execute\\ndef main(nums, k, limit):\\n    return find_max_product_with_alternating_sum(nums, k, limit)', 'plan': None, 'main_function_name': None}}\n",
    "\n",
    "test_cases = {'test_case_1': {'test_type': 'correctness', 'purpose': \"I will design a test function to validate the correctness of the provided function by using specific inputs and checking the outputs against expected results. The first test case will use a small array to confirm that the function works for typical inputs. The second test case will check for a scenario in which an alternating sum cannot equal the desired value, expecting a return of -1. The third test case will explore the edge case of a larger array with repeated values to see if it correctly finds a subsequence within limits. I will ensure that the expected output matches what is returned for each case, confirming the function's accuracy.\", 'test_function': 'def test_case(func):\\n    # Case 1: Typical input with a valid subsequence\\n    nums1 = [1, 2, 3]\\n    k1 = 2\\n    limit1 = 10\\n    expected1 = 6  # [1, 2, 3] gives an alternating sum of 2 and product of 6\\n    result1 = func(nums1, k1, limit1)\\n    if result1 != expected1:\\n        return False\\n\\n    # Case 2: Input where no valid subsequence exists\\n    nums2 = [0, 2, 3]\\n    k2 = -5\\n    limit2 = 12\\n    expected2 = -1  # No subsequence can give alternating sum of -5\\n    result2 = func(nums2, k2, limit2)\\n    if result2 != expected2:\\n        return False\\n\\n    # Case 3: Larger array with repeated values\\n    nums3 = [2, 2, 3, 3]\\n    k3 = 0\\n    limit3 = 9\\n    expected3 = 9  # Subsequence [3, 3] gives an alternating sum of 0 and product of 9\\n    result3 = func(nums3, k3, limit3)\\n    if result3 != expected3:\\n        return False\\n\\n    return True'}, 'test_case_1_1': {'test_type': 'correctness', 'purpose': 'The purpose of this test function is to validate the output of the code under test by providing a simple case with a small integer array, a valid k, and a limit. The expected result is an easy-to-calculate product of the elements that meet the alternating sum condition. 
This will demonstrate that the function correctly identifies subsequences and computes the product while observing the defined constraints.', 'test_function': 'def test_case(func):\\n    nums = [1, 2, 3]\\n    k = 2\\n    limit = 10\\n    expected_output = 6\\n    result = func(nums, k, limit)\\n    return result == expected_output'}, 'test_case_2': {'test_type': 'correctness', 'purpose': 'Now, I will create a test case where no valid subsequence meets the required alternating sum (k). This ensures that the function returns -1 as expected, validating its output for such edge cases.', 'test_function': 'def test_case(func):\\n    nums = [0, 2, 3]\\n    k = -5\\n    limit = 12\\n    expected_output = -1\\n    result = func(nums, k, limit)\\n    return result == expected_output'}, 'test_case_3': {'test_type': 'correctness', 'purpose': 'Here, I will test a case with an empty list for nums, which should appropriately handle edge cases and return -1 since there are no elements to work with. This will ensure that the function correctly detects invalid input scenarios.', 'test_function': 'def test_case(func):\\n    nums = []\\n    k = 0\\n    limit = 12\\n    expected_output = -1\\n    result = func(nums, k, limit)\\n    return result == expected_output'}, 'test_case_4': {'test_type': 'edge_case', 'purpose': 'This time, I will test a scenario with both negative and zero values in nums, ensuring that the function correctly computes the alternating sum and addresses the presence of positive, negative, and zero values.', 'test_function': 'def test_case(func):\\n    nums = [2, 2, 3, 3]\\n    k = 0\\n    limit = 9\\n    expected_output = 9\\n    result = func(nums, k, limit)\\n    return result == expected_output'}, 'test_case_5': {'test_type': 'edge_case', 'purpose': 'Next, I will create an edge case that includes a large array of numbers and checks if the function can handle them efficiently. 
The test expects a particular product under the specified limit, testing the performance aspect along with correctness.', 'test_function': 'def test_case(func):\\n    nums = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]  \\n    k = 15\\n    limit = 1000\\n    expected_output = 5040  # The maximum product should come from the entire array since the alternating sum can be satisfied.\\n    result = func(nums, k, limit)\\n    return result == expected_output'}, 'test_case_6': {'test_type': 'runtime', 'purpose': 'For performance checks, I will test how the function behaves with input arrays of a larger size, in order to ensure it executes in a reasonable time frame. A standard threshold will be set for acceptable runtime.', 'test_function': 'import time\\n\\ndef test_case(func):\\n    nums = list(range(1, 1000))  # Large array\\n    k = 500\\n    limit = 100000\\n    start_time = time.time()\\n    result = func(nums, k, limit)\\n    end_time = time.time()\\n    runtime = end_time - start_time\\n    return result != -1 and runtime < 1.0  # Expecting some valid output and runtime under 1 second'}, 'test_case_1_2': {'test_type': 'correctness', 'purpose': 'I will create a test case that verifies the correctness of the function by providing a known input that meets the specification of the problem and checking if it produces the expected output accurately. 
This test will involve a simple integer array where an alternating sum equals a specified value, and the product does not exceed the limit.', 'test_function': 'def test_case(func):\\n    nums = [1, 2, 3]\\n    k = 2\\n    limit = 10\\n    expected_output = 6  # The product of the valid subsequence [1, 2, 3]\\n    result = func(nums, k, limit)\\n    return result == expected_output'}, 'test_case_1_3': {'test_type': 'correctness', 'purpose': 'This test case is designed to verify the correctness of the function by using specific input values that have been selected to ensure the expected output matches exactly what is produced by the function. \\nHere, the input array nums contains combinations of integers that will yield a non-empty subsequence whose alternating sum equals k while the product of the numbers does not exceed the specified limit.\\nThe expected output for these inputs is 6, which is the maximum product for the alternating sum of 2.', 'test_function': 'def test_case(func):\\n    # Test input\\n    nums = [1, 2, 3]\\n    k = 2\\n    limit = 10\\n    expected_output = 6  # Expected output\\n\\n    # Invoke the function with the test input\\n    result = func(nums, k, limit)\\n\\n    # Return whether the test passed\\n    return result == expected_output'}, 'test_case_2_1': {'test_type': 'edge_case', 'purpose': 'In this test case, we are testing the scenario where the input array nums contains all zero values. This is an edge case as the alternating sum of any subsequence will always be zero, and thus, will not satisfy the requirement for k if it is not zero as well. 
The expected output is -1, indicating that no valid subsequence exists.', 'test_function': \"def test_case(func):\\n    # Test input\\n    nums = [0, 0, 0]\\n    k = 5\\n    limit = 15\\n    expected_output = -1  # Expected output because there's no valid subsequence\\n\\n    # Invoke the function with the test input\\n    result = func(nums, k, limit)\\n\\n    # Return whether the test passed\\n    return result == expected_output\"}, 'test_case_3_1': {'test_type': 'edge_case', 'purpose': 'This test case checks a situation where the array nums has only one element but is significantly large compared to the limit. Here the input is designed such that it contains negative values and the limit is strictly less than any possible product value. The expected output should be -1 since no valid product can be found.', 'test_function': 'def test_case(func):\\n    # Test input with negative and limited element\\n    nums = [5]\\n    k = 1\\n    limit = 1\\n    expected_output = -1  # Expected output since the product exceeds the limit\\n\\n    # Invoke the function with the test input\\n    result = func(nums, k, limit)\\n\\n    # Return whether the test passed\\n    return result == expected_output'}, 'test_case_4_1': {'test_type': 'correctness', 'purpose': 'This test case is kept to evaluate how well the function performs with larger inputs while ensuring all combinations are checked. The input array will contain unique integers that can give valid subsequences. 
The expected output in this test case is calculated specifically for combinations that yield valid alternating sums and products under the limit.', 'test_function': 'def test_case(func):\\n    # Test input\\n    nums = [2, 2, 3, 3]\\n    k = 0\\n    limit = 9\\n    expected_output = 9  # Expected output which is the product of [3, 3]\\n\\n    # Invoke the function with the test input\\n    result = func(nums, k, limit)\\n\\n    # Return whether the test passed\\n    return result == expected_output'}, 'test_case_5_1': {'test_type': 'error_handling', 'purpose': \"This test will ensure that the function can handle invalid input gracefully. For example, the input array nums is completely empty. This should lead to the function raising a specific exception to indicate invalid input handling. The expected behavior here is to return -1, as there can't be any valid products or subsequences.\", 'test_function': 'def test_case(func):\\n    # Test input with an empty list\\n    nums = []\\n    k = 3\\n    limit = 5\\n    expected_output = -1  # Expected output for invalid input\\n\\n    # Invoke the function with the test input\\n    result = func(nums, k, limit)\\n\\n    # Return whether the test passed\\n    return result == expected_output'}, 'test_case_1_4': {'test_type': 'correctness', 'purpose': \"The purpose of this test function is to validate the output of the main function with a specific input that satisfies the requirements outlined in the problem statement. The case will check if the function can correctly find a subsequence from the given integer array that has an alternating sum equal to 'k' and maximizes the product under the provided limit. \\n\\nFor the test case, I will use a small array of integers that allows us to easily compute expected results and thus ensure that the function behaves correctly. 
The expected output will be the product of the subsequence that meets the criteria or -1 if no valid subsequence exists.\", 'test_function': 'def test_case(func):\\n    nums = [1, 2, 3]  # Input array\\n    k = 2              # Required alternating sum\\n    limit = 10         # Maximum product limit\\n    \\n    # Call the function under test with the input\\n    result = func(nums, k, limit)\\n    \\n    # The valid subsequence that gives the expected alternating sum of 2 is the entire array.\\n    # The product of the subsequence [1, 2, 3] is 1 * 2 * 3 = 6.\\n    expected_result = 6\\n    \\n    # Return True if the result matches the expected output, otherwise False.\\n    return result == expected_result'}, 'test_case_2_2': {'test_type': 'edge_case', 'purpose': \"This test case is designed to check the behavior of the function when no valid subsequence exists. Specifically, I will use an input array and parameters such that it is impossible to achieve the required alternating sum 'k'. The expected output for this scenario is -1. This is crucial to ensure the function correctly identifies and handles cases where no valid combination is present.\", 'test_function': 'def test_case(func):\\n    nums = [0, 2, 3]  # Input array\\n    k = -5             # Required alternating sum\\n    limit = 12         # Maximum product limit\\n\\n    # Call the function under test with the input\\n    result = func(nums, k, limit)\\n\\n    # Since no valid subsequence can achieve the alternating sum of -5, we expect -1 as the output.\\n    expected_result = -1\\n\\n    # Return True if the result matches the expected output, otherwise False.\\n    return result == expected_result'}, 'test_case_3_2': {'test_type': 'correctness', 'purpose': 'This test function aims to cover a normal case with both positive and negative numbers in the input array. 
I want to ensure that the function can handle sequences that include both types of integers and still calculate the expected results based on the given parameters. This will increase the robustness of the function against varied input scenarios.', 'test_function': 'def test_case(func):\\n    nums = [2, 2, 3, 3]  # Input array\\n    k = 0                 # Required alternating sum\\n    limit = 9             # Maximum product limit\\n\\n    # Call the function under test with the input\\n    result = func(nums, k, limit)\\n\\n    # Subsequence [3, 3] and [2, 2] both give an alternating sum of 0\\n    # Their respective products are 9 and 4. The maximum product within the limit is 9.\\n    expected_result = 9\\n\\n    # Return True if the result matches the expected output, otherwise False.\\n    return result == expected_result'}, 'test_case_4_2': {'test_type': 'error_handling', 'purpose': \"This test case is designed to ensure that the function raises appropriate errors or returns a defined output when invalid input is provided. Specifically, I will test with an empty list, where it's expected the function should return -1 due to the absence of any subsequences. This is important for establishing the function's resilience against bad input.\", 'test_function': 'def test_case(func):\\n    nums = []             # Empty input array\\n    k = 0                  # Required alternating sum\\n    limit = 10            # Maximum product limit\\n\\n    # Call the function under test with the input\\n    result = func(nums, k, limit)\\n\\n    # Since there are no elements to form subsequences, we expect -1 as the output.\\n    expected_result = -1\\n\\n    # Return True if the result matches the expected output, otherwise False.\\n    return result == expected_result'}}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "None\n",
      "def find_subsequences(nums):\n",
      "    # Function to find all non-empty subsequences of the given integer array\n",
      "    from itertools import combinations\n",
      "    \n",
      "    subsequences = []\n",
      "    n = len(nums)\n",
      "    \n",
      "    # Generate all non-empty combinations of indices\n",
      "    for r in range(1, n + 1):\n",
      "        for combo in combinations(nums, r):\n",
      "            subsequences.append(combo)\n",
      "    \n",
      "    return subsequences\n",
      "\n",
      "def calculate_alternating_sum(subsequence):\n",
      "    # Function to calculate the alternating sum\n",
      "    even_sum = 0\n",
      "    odd_sum = 0\n",
      "    \n",
      "    for index, value in enumerate(subsequence):\n",
      "        if index % 2 == 0:  # Even index\n",
      "            even_sum += value\n",
      "        else:               # Odd index\n",
      "            odd_sum += value\n",
      "            \n",
      "    return even_sum - odd_sum\n",
      "\n",
      "def maximize_product(valid_subsequences, limit):\n",
      "    # Function to maximize the product of numbers in valid subsequences within a limit\n",
      "    max_product = -1\n",
      "    \n",
      "    for subsequence in valid_subsequences:\n",
      "        product = 1\n",
      "        for number in subsequence:\n",
      "            product *= number\n",
      "        \n",
      "        if product <= limit and product > max_product:\n",
      "            max_product = product\n",
      "            \n",
      "    return max_product\n",
      "\n",
      "def find_maximized_product(nums, k, limit):\n",
      "    # Overall function to find a non-empty subsequence with alternating sum = k\n",
      "    subsequences = find_subsequences(nums)\n",
      "    max_product = -1\n",
      "    \n",
      "    for subsequence in subsequences:\n",
      "        if calculate_alternating_sum(subsequence) == k:\n",
      "            product = maximize_product([subsequence], limit)\n",
      "            if product > max_product:\n",
      "                max_product = product\n",
      "    \n",
      "    return max_product if max_product != -1 else -1\n"
     ]
    }
   ],
   "source": [
    "# Print the main-function name and the code recorded for the chosen entry in `functions`.\n",
    "function_name = \"code_3\"\n",
    "print(\n",
    "    functions[function_name][\"main_function_name\"],\n",
    "    functions[function_name][\"code\"],\n",
    "    sep=\"\\n\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "def test_case(func):\n",
      "    # Case 1: Typical input with a valid subsequence\n",
      "    nums1 = [1, 2, 3]\n",
      "    k1 = 2\n",
      "    limit1 = 10\n",
      "    expected1 = 6  # [1, 2, 3] gives an alternating sum of 2 and product of 6\n",
      "    result1 = func(nums1, k1, limit1)\n",
      "    if result1 != expected1:\n",
      "        return False\n",
      "\n",
      "    # Case 2: Input where no valid subsequence exists\n",
      "    nums2 = [0, 2, 3]\n",
      "    k2 = -5\n",
      "    limit2 = 12\n",
      "    expected2 = -1  # No subsequence can give alternating sum of -5\n",
      "    result2 = func(nums2, k2, limit2)\n",
      "    if result2 != expected2:\n",
      "        return False\n",
      "\n",
      "    # Case 3: Larger array with repeated values\n",
      "    nums3 = [2, 2, 3, 3]\n",
      "    k3 = 0\n",
      "    limit3 = 9\n",
      "    expected3 = 9  # Subsequence [3, 3] gives an alternating sum of 0 and product of 9\n",
      "    result3 = func(nums3, k3, limit3)\n",
      "    if result3 != expected3:\n",
      "        return False\n",
      "\n",
      "    return True\n"
     ]
    }
   ],
   "source": [
    "# Print the test-function source stored under \"test_case_1\" in `test_cases`.\n",
    "print(test_cases[\"test_case_1\"][\"test_function\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Hello! How can I assist you today?\n"
     ]
    }
   ],
   "source": [
    "# Please install OpenAI SDK first: `pip3 install openai`\n",
    "# The API key is read from the DEEPSEEK_API_KEY environment variable so that\n",
    "# no credential is ever stored in the notebook source.\n",
    "\n",
    "import os\n",
    "\n",
    "from openai import OpenAI\n",
    "\n",
    "client = OpenAI(\n",
    "    api_key=os.environ[\"DEEPSEEK_API_KEY\"],  # raises KeyError when the variable is unset\n",
    "    base_url=\"https://api.deepseek.com/v1\",\n",
    ")\n",
    "\n",
    "# Send a single non-streaming chat request as a connectivity check.\n",
    "response = client.chat.completions.create(\n",
    "    model=\"deepseek-reasoner\",  # alternative model name: \"deepseek-chat\"\n",
    "    messages=[\n",
    "        {\"role\": \"system\", \"content\": \"You are a helpful assistant\"},\n",
    "        {\"role\": \"user\", \"content\": \"Hello\"},\n",
    "    ],\n",
    "    stream=False,\n",
    ")\n",
    "\n",
    "print(response.choices[0].message.content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "- Error Type: main_func_compile_error\n",
      "Description: Main function not found\n",
      "\n",
      "- Error Type: test_failed\n",
      "Description: Assertion error on empty input\n",
      "Test Function:\n",
      "def test2():\n",
      " assert main(\"\") is None\n",
      "Test Type: edge_case\n",
      "\n",
      "Please carefully review the above errors and modify the code to address each issue. Ensure that all test cases pass and handle any edge cases or runtime errors appropriately.\n"
     ]
    }
   ],
   "source": [
    "import random\n",
    "\n",
    "def test_case_error_prompt(dict_1, dict_2, n):\n",
    "    # 筛选共同存在且失败的测试用例\n",
    "    filtered_tests = [\n",
    "        test_id for test_id in dict_1\n",
    "        if test_id in dict_2 and not dict_2[test_id].get('success', True)\n",
    "    ]\n",
    "    \n",
    "    if not filtered_tests or n <= 0:\n",
    "        return \"\"\n",
    "    \n",
    "    # 分离test_failed和其他错误类型\n",
    "    others = []\n",
    "    test_failed = []\n",
    "    for test_id in filtered_tests:\n",
    "        reason = dict_2[test_id]['reason']\n",
    "        if reason == 'test_failed':\n",
    "            test_failed.append(test_id)\n",
    "        else:\n",
    "            others.append(test_id)\n",
    "    \n",
    "    # 随机选择其他错误类型（每个类型最多一个）\n",
    "    random.shuffle(others)\n",
    "    seen_reasons = set()\n",
    "    selected_others = []\n",
    "    for test_id in others:\n",
    "        reason = dict_2[test_id]['reason']\n",
    "        if reason not in seen_reasons:\n",
    "            seen_reasons.add(reason)\n",
    "            selected_others.append(test_id)\n",
    "            if len(selected_others) == n:\n",
    "                break\n",
    "    \n",
    "    # 补充test_failed用例\n",
    "    remaining = max(n - len(selected_others), 0)\n",
    "    selected_test_failed = []\n",
    "    if remaining > 0 and test_failed:\n",
    "        random.shuffle(test_failed)\n",
    "        selected_test_failed = test_failed[:min(remaining, len(test_failed))]\n",
    "    \n",
    "    selected = selected_others + selected_test_failed\n",
    "    selected = selected[:n]  # 确保不超过n个\n",
    "    \n",
    "    # 错误类型描述映射\n",
    "    error_descriptions = {\n",
    "        'main_func_compile_error': 'Main function compilation failed',\n",
    "        'test_func_compile_error': 'Test function compilation failed',\n",
    "        'test_failed': 'Test assertion failed',\n",
    "        'exception': 'An exception was raised during test execution',\n",
    "        'timeout': 'Test execution timed out',\n",
    "        'worker_process_error': 'Worker process encountered an error'\n",
    "    }\n",
    "    \n",
    "    # 构建提示内容\n",
    "    prompt_lines = []\n",
    "    for test_id in selected:\n",
    "        test_result = dict_2[test_id]\n",
    "        reason = test_result['reason']\n",
    "        message = test_result.get('message') or error_descriptions.get(reason, 'Unknown error')\n",
    "        \n",
    "        parts = [f\"- Error Type: {reason}\", f\"Description: {message}\"]\n",
    "        \n",
    "        if reason == 'test_failed':\n",
    "            test_info = dict_1[test_id]\n",
    "            parts.append(f\"Test Function:\\n{test_info['test_function']}\")\n",
    "            parts.append(f\"Test Type: {test_info['test_type']}\")\n",
    "        \n",
    "        prompt_lines.append(\"\\n\".join(parts))\n",
    "    \n",
    "    # 拼接最终提示\n",
    "    final_prompt = \"\\n\\n\".join(prompt_lines)\n",
    "    final_prompt += \"\\n\\nPlease carefully review the above errors and modify the code to address each issue. Ensure that all test cases pass and handle any edge cases or runtime errors appropriately.\"\n",
    "    \n",
    "    return final_prompt\n",
    "\n",
    "# Example input: test definitions keyed by test id.\n",
    "dict_1 = {\n",
    "    'test_1': {\n",
    "        'test_type': 'correctness',\n",
    "        'purpose': 'Check basic functionality',\n",
    "        'test_function': 'def test1():\\n assert main() == 5',\n",
    "    },\n",
    "    'test_2': {\n",
    "        'test_type': 'edge_case',\n",
    "        'purpose': 'Test empty input',\n",
    "        'test_function': 'def test2():\\n assert main(\"\") is None',\n",
    "    },\n",
    "}\n",
    "\n",
    "# Example input: execution results for the same test ids; both are failures.\n",
    "dict_2 = {\n",
    "    'test_1': {\n",
    "        'success': False,\n",
    "        'reason': 'main_func_compile_error',\n",
    "        'message': 'Main function not found',\n",
    "    },\n",
    "    'test_2': {\n",
    "        'success': False,\n",
    "        'reason': 'test_failed',\n",
    "        'message': 'Assertion error on empty input',\n",
    "    },\n",
    "}\n",
    "\n",
    "# Request a prompt covering at most two failures.\n",
    "print(test_case_error_prompt(dict_1, dict_2, 2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "llm",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
