{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a4c73249",
   "metadata": {},
   "source": [
    "# Code For Evaluation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "id": "20a8f2de",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import re\n",
    "\n",
    "def normalize_text(s: str) -> str:\n",
    "    \"\"\"Canonicalize an answer string for exact-match comparison.\n",
    "\n",
    "    The text is NFKC-normalized and lower-cased, every run of characters that\n",
    "    is neither a letter nor a digit becomes a single space, and leading,\n",
    "    trailing and repeated whitespace is collapsed.\n",
    "\n",
    "    Args:\n",
    "        s: Text to normalize; non-string values are converted with ``str``.\n",
    "\n",
    "    Returns:\n",
    "        The normalized text (possibly empty).\n",
    "    \"\"\"\n",
    "    import unicodedata  # local import: this cell only imports json and re\n",
    "\n",
    "    if not isinstance(s, str):\n",
    "        s = str(s)\n",
    "    # Fold compatibility forms (e.g. full-width digits) before lower-casing.\n",
    "    s = unicodedata.normalize(\"NFKC\", s).lower()\n",
    "    # [\\W_] is Unicode-aware, so non-ASCII letters and digits are kept and two\n",
    "    # different non-English answers no longer both collapse to an empty string.\n",
    "    s = re.sub(r'[\\W_]+', ' ', s)\n",
    "    return ' '.join(s.split())\n",
    "\n",
    "def judge_correct(pred: str, true: str) -> bool:\n",
    "    \"\"\"Return True when prediction and reference are equal after normalization.\"\"\"\n",
    "    normalized_pred = normalize_text(pred)\n",
    "    normalized_true = normalize_text(true)\n",
    "    return normalized_pred == normalized_true\n",
    "\n",
    "def evaluate_jsonl(file_path):\n",
    "    \"\"\"Evaluate the results stored in a one-object-per-line JSONL file.\n",
    "\n",
    "    Prints the overall accuracy, token usage, a per-level breakdown and the\n",
    "    ten most token-expensive tasks.\n",
    "\n",
    "    Args:\n",
    "        file_path: Path to a UTF-8 JSONL file. Each record is read for the\n",
    "            keys ``prediction``, ``true_answer``, ``token_counts``, ``Level``\n",
    "            and ``task_id``; missing keys fall back to defaults.\n",
    "\n",
    "    Returns:\n",
    "        A list with one dict per record (``task_id``, ``level``,\n",
    "        ``is_correct``, ``prediction``, ``true_answer``, ``tokens``) in file\n",
    "        order; an empty list when the file holds no records.\n",
    "\n",
    "    Raises:\n",
    "        json.JSONDecodeError: If a non-blank line is not valid JSON.\n",
    "    \"\"\"\n",
    "    # Load the JSONL file, skipping blank lines (e.g. a trailing newline).\n",
    "    data = []\n",
    "    with open(file_path, 'r', encoding='utf-8') as f:\n",
    "        for line in f:\n",
    "            line = line.strip()\n",
    "            if line:\n",
    "                data.append(json.loads(line))\n",
    "\n",
    "    print(f\"📊 加载了 {len(data)} 条数据\")\n",
    "\n",
    "    total_count = len(data)\n",
    "    if total_count == 0:\n",
    "        # Nothing to aggregate; the averages below would divide by zero.\n",
    "        return []\n",
    "\n",
    "    # Running totals\n",
    "    total_correct = 0\n",
    "    total_tokens = 0\n",
    "    level_stats = {}\n",
    "\n",
    "    # Per-task rows, used for the ranking below and returned to the caller\n",
    "    task_details = []\n",
    "\n",
    "    for item in data:\n",
    "        prediction = str(item.get('prediction', ''))\n",
    "        true_answer = str(item.get('true_answer', ''))\n",
    "        is_correct = judge_correct(prediction, true_answer)\n",
    "        total_correct += is_correct\n",
    "\n",
    "        # Token usage: prefer the recorded total and fall back to\n",
    "        # input + output only when no total exists, so a record carrying all\n",
    "        # three fields is not counted twice.\n",
    "        token_counts = item.get('token_counts') or {}\n",
    "        token_sum = token_counts.get('total_token_count') or (\n",
    "            (token_counts.get('input') or 0) + (token_counts.get('output') or 0)\n",
    "        )\n",
    "        total_tokens += token_sum\n",
    "\n",
    "        # Per-level statistics\n",
    "        level = item.get('Level', 'unknown')\n",
    "        if level not in level_stats:\n",
    "            level_stats[level] = {'correct': 0, 'total': 0, 'tokens': 0}\n",
    "\n",
    "        level_stats[level]['correct'] += is_correct\n",
    "        level_stats[level]['total'] += 1\n",
    "        level_stats[level]['tokens'] += token_sum\n",
    "\n",
    "        task_details.append({\n",
    "            \"task_id\": item.get('task_id', 'unknown'),\n",
    "            \"level\": level,\n",
    "            \"is_correct\": is_correct,\n",
    "            \"prediction\": prediction,\n",
    "            \"true_answer\": true_answer,\n",
    "            \"tokens\": token_sum\n",
    "        })\n",
    "\n",
    "    # Overall results\n",
    "    print(f\"\\n🎯 整体准确率: {total_correct/total_count:.2%} ({total_correct}/{total_count})\")\n",
    "    print(f\"💰 总Token消耗: {total_tokens:,}\")\n",
    "    print(f\"📈 平均每任务Token: {total_tokens/total_count:.1f}\")\n",
    "\n",
    "    print(f\"\\n📊 各级别准确率:\")\n",
    "    # key=str keeps the sort working when integer levels and the 'unknown'\n",
    "    # placeholder occur in the same file.\n",
    "    for level in sorted(level_stats.keys(), key=str):\n",
    "        stats = level_stats[level]\n",
    "        acc = stats['correct'] / stats['total']\n",
    "        print(f\"   Level {level}: {acc:.2%} ({stats['correct']}/{stats['total']}) - 平均Token: {stats['tokens']/stats['total']:.1f}\")\n",
    "\n",
    "    # Per-task detail listing (disabled)\n",
    "    # print(f\"\\n📋 每个任务的详细信息:\")\n",
    "    # for item in task_details:\n",
    "    #     print(f\"   任务: {item['task_id']} - LEVEL {item['level']} {item['is_correct']} - 预测: {item['prediction']} - 真实: {item['true_answer']}\")\n",
    "\n",
    "    # Ten tasks with the highest token usage\n",
    "    print(f\"\\n🔥 Token消耗排名前十的任务:\")\n",
    "    top10 = sorted(task_details, key=lambda x: x['tokens'], reverse=True)[:10]\n",
    "    for idx, item in enumerate(top10, 1):\n",
    "        print(f\"{idx}. 任务: {item['task_id']} - LEVEL {item['level']} - Token: {item['tokens']}\")\n",
    "        print(f\"   正确性: {item['is_correct']} | 预测: {item['prediction']} | 真实: {item['true_answer']}\")\n",
    "\n",
    "    # Callers assign the result (task_details = evaluate_jsonl(...)).\n",
    "    return task_details\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 172,
   "id": "981ba815",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import re\n",
    "import unicodedata\n",
    "import os\n",
    "import sys\n",
    "from collections import defaultdict\n",
    "from rich.console import Console\n",
    "from rich.table import Table\n",
    "from rich.progress import track\n",
    "\n",
    "# --- 标准化函数 (来自您的代码，保持不变) ---\n",
    "def normalize_text(s: str) -> str:\n",
    "    \"\"\"Canonicalize an answer for exact-match comparison.\n",
    "\n",
    "    The value is converted to ``str`` if needed, NFKC-normalized, lower-cased,\n",
    "    stripped of everything that is neither a letter nor a digit (each such run\n",
    "    becomes one space) and whitespace-collapsed.\n",
    "\n",
    "    Args:\n",
    "        s: Value to normalize.\n",
    "\n",
    "    Returns:\n",
    "        The normalized text (possibly empty).\n",
    "    \"\"\"\n",
    "    if not isinstance(s, str):\n",
    "        s = str(s)\n",
    "    # NFKC has to run before lower(): compatibility characters such as U+3392\n",
    "    # expand to upper-case ASCII (MHz) that a later filter would strip.\n",
    "    s = unicodedata.normalize(\"NFKC\", s).lower()\n",
    "    # [\\W_] is Unicode-aware, so non-ASCII letters and digits survive and two\n",
    "    # different non-English answers no longer both collapse to an empty string.\n",
    "    s = re.sub(r'[\\W_]+', ' ', s)\n",
    "    return ' '.join(s.split())\n",
    "\n",
    "def judge_correct(pred: str, true: str) -> bool:\n",
    "    \"\"\"Compare a prediction with the reference answer after normalizing both.\"\"\"\n",
    "    lhs = normalize_text(pred)\n",
    "    rhs = normalize_text(true)\n",
    "    return lhs == rhs\n",
    "\n",
    "# --- 主逻辑 (使用您验证过的解析器 + 美化输出) ---\n",
    "def _split_json_objects(content: str) -> list:\n",
    "    \"\"\"Return the source text of each top-level ``{...}`` object in content.\n",
    "\n",
    "    Braces inside JSON string literals are ignored, and any text between\n",
    "    objects (whitespace, commas, stray quotes) is skipped.\n",
    "    \"\"\"\n",
    "    object_strings = []\n",
    "    brace_level = 0\n",
    "    start_index = -1\n",
    "    in_string = False\n",
    "    escaped = False  # True when the previous in-string character was a backslash\n",
    "\n",
    "    for i, char in enumerate(content):\n",
    "        if in_string:\n",
    "            # Tracking the escape state (instead of peeking at the previous\n",
    "            # character) handles an escaped backslash right before the closing quote.\n",
    "            if escaped:\n",
    "                escaped = False\n",
    "            elif char == '\\\\':\n",
    "                escaped = True\n",
    "            elif char == '\"':\n",
    "                in_string = False\n",
    "        elif char == '\"':\n",
    "            # A quote outside any object is stray text, not a string delimiter.\n",
    "            in_string = brace_level > 0\n",
    "        elif char == '{':\n",
    "            if brace_level == 0:\n",
    "                start_index = i\n",
    "            brace_level += 1\n",
    "        elif char == '}' and brace_level > 0:\n",
    "            brace_level -= 1\n",
    "            if brace_level == 0:\n",
    "                object_strings.append(content[start_index:i + 1])\n",
    "                start_index = -1\n",
    "    return object_strings\n",
    "\n",
    "\n",
    "def analyze_data_with_rich_output(input_filepath: str):\n",
    "    \"\"\"Analyze a results file and render accuracy/token statistics with rich.\n",
    "\n",
    "    The file is read as text, split into top-level JSON objects (which may\n",
    "    span several lines), and each object is scored with ``judge_correct``.\n",
    "    An overall table and a per-level table are printed to the console.\n",
    "    Objects that fail to parse are reported and skipped; any other error is\n",
    "    reported on the console instead of being raised.\n",
    "\n",
    "    Args:\n",
    "        input_filepath: Path to the UTF-8 results file.\n",
    "\n",
    "    Returns:\n",
    "        None. All results are printed.\n",
    "    \"\"\"\n",
    "    console = Console()\n",
    "\n",
    "    if not os.path.exists(input_filepath):\n",
    "        console.print(f\"❌ [bold red]错误: 文件未找到 -> {input_filepath}[/bold red]\")\n",
    "        return\n",
    "\n",
    "    console.print(f\"🚀 [bold cyan]正在分析文件:[/bold cyan] [underline]{input_filepath}[/underline]...\")\n",
    "\n",
    "    try:\n",
    "        with open(input_filepath, 'r', encoding='utf-8') as infile:\n",
    "            content = infile.read()\n",
    "\n",
    "        console.print(\"🔍 [dim]正在智能分割文件中的JSON对象...[/dim]\")\n",
    "        object_strings = _split_json_objects(content)\n",
    "\n",
    "        if not object_strings:\n",
    "            console.print(\"⚠️ [yellow]警告: 在文件中没有找到有效的JSON对象。[/yellow]\")\n",
    "            return\n",
    "\n",
    "        # Aggregate counters, overall and per level\n",
    "        total = 0\n",
    "        correct = 0\n",
    "        total_tokens = 0\n",
    "        level_stats = defaultdict(lambda: {\"total\": 0, \"correct\": 0, \"tokens\": 0})\n",
    "\n",
    "        for obj_str in track(object_strings, description=\"处理中...\"):\n",
    "            try:\n",
    "                record = json.loads(obj_str)\n",
    "            except json.JSONDecodeError as e:\n",
    "                console.print(f\"⚠️ [yellow]警告: 跳过一个无法解析的JSON对象块。错误: {e}[/yellow]\\n(片段: {obj_str[:150].strip()}...)\")\n",
    "                continue\n",
    "\n",
    "            prediction = record.get(\"prediction\", \"\")\n",
    "            true_answer = record.get(\"true_answer\", \"\")\n",
    "            # An explicit JSON null is treated like a missing value.\n",
    "            tokens = (record.get(\"token_counts\") or {}).get(\"total_token_count\") or 0\n",
    "            level = record.get(\"Level\", \"N/A\")\n",
    "\n",
    "            is_correct = judge_correct(prediction, true_answer)\n",
    "\n",
    "            total += 1\n",
    "            correct += int(is_correct)\n",
    "            total_tokens += tokens\n",
    "\n",
    "            level_stats[level][\"total\"] += 1\n",
    "            level_stats[level][\"correct\"] += int(is_correct)\n",
    "            level_stats[level][\"tokens\"] += tokens\n",
    "\n",
    "        # Render the results as rich tables\n",
    "        console.print(\"\\n📊 [bold green]统计分析完成！[/bold green]\")\n",
    "\n",
    "        summary_table = Table(title=\"[bold]📊 总体统计 (Overall Statistics)[/bold]\", show_header=True, header_style=\"bold magenta\")\n",
    "        summary_table.add_column(\"指标 (Metric)\", style=\"dim\", width=25)\n",
    "        summary_table.add_column(\"数值 (Value)\", justify=\"right\")\n",
    "\n",
    "        overall_acc = (correct / total * 100) if total > 0 else 0\n",
    "        avg_tokens = total_tokens / total if total > 0 else 0\n",
    "\n",
    "        summary_table.add_row(\"总计数量 (Total Samples)\", f\"{total:,}\")\n",
    "        summary_table.add_row(\"正确数量 (Correct Samples)\", f\"[green]{correct:,}[/green]\")\n",
    "        summary_table.add_row(\"总体正确率 (Overall ACC)\", f\"[bold cyan]{overall_acc:.2f}%[/bold cyan]\")\n",
    "        summary_table.add_row(\"总消耗Tokens (Total Tokens)\", f\"{total_tokens:,}\")\n",
    "        summary_table.add_row(\"平均Tokens (Avg. Tokens/Sample)\", f\"{avg_tokens:,.2f}\")\n",
    "\n",
    "        console.print(summary_table)\n",
    "\n",
    "        if level_stats:\n",
    "            level_table = Table(title=\"[bold]📑 按级别统计 (Statistics by Level)[/bold]\", show_header=True, header_style=\"bold blue\")\n",
    "            level_table.add_column(\"级别 (Level)\", style=\"cyan\", no_wrap=True)\n",
    "            level_table.add_column(\"正确率 (ACC)\", justify=\"right\")\n",
    "            level_table.add_column(\"样本数 (Samples)\", justify=\"right\")\n",
    "            level_table.add_column(\"总Tokens\", justify=\"right\")\n",
    "            level_table.add_column(\"平均Tokens\", justify=\"right\")\n",
    "\n",
    "            # key=str keeps the sort working when integer levels and the\n",
    "            # \"N/A\" placeholder occur in the same file.\n",
    "            sorted_levels = sorted(level_stats.keys(), key=str)\n",
    "\n",
    "            for lvl in sorted_levels:\n",
    "                stats = level_stats[lvl]\n",
    "                acc = (stats[\"correct\"] / stats[\"total\"] * 100) if stats[\"total\"] > 0 else 0\n",
    "                avg_lvl_tokens = stats[\"tokens\"] / stats[\"total\"] if stats[\"total\"] > 0 else 0\n",
    "\n",
    "                acc_color = \"green\" if acc >= 90 else \"yellow\" if acc >= 60 else \"red\"\n",
    "\n",
    "                level_table.add_row(\n",
    "                    str(lvl), f\"[{acc_color}]{acc:.2f}%[/]\", f\"{stats['total']:,}\",\n",
    "                    f\"{stats['tokens']:,}\", f\"{avg_lvl_tokens:,.2f}\"\n",
    "                )\n",
    "\n",
    "            console.print(level_table)\n",
    "\n",
    "    except Exception as e:\n",
    "        # Deliberate best-effort: report the failure on the console rather than\n",
    "        # raising out of the notebook cell.\n",
    "        console.print(f\"❌ [bold red]处理过程中发生严重错误: {e}[/bold red]\")\n",
    "\n",
    "# --- 程序主入口 ---\n",
    "# if __name__ == \"__main__\":\n",
    "#     if len(sys.argv) > 1:\n",
    "#         file_path = sys.argv[1]\n",
    "#         analyze_data_with_rich_output(file_path)\n",
    "#     else:\n",
    "#         print(\"请提供需要统计的文件路径。\")\n",
    "#         print(\"用法: python your_script_name.py /path/to/your/file.jsonl\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d6475536",
   "metadata": {},
   "source": [
    "## General Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 205,
   "id": "c452ddf1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">🚀 <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">正在分析文件:</span> \n",
       "<span style=\"color: #800080; text-decoration-color: #800080; text-decoration: underline\">/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/standard/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff; text-decoration: underline\">single_agent.json</span>\n",
       "<span style=\"color: #ff00ff; text-decoration-color: #ff00ff; text-decoration: underline\">l</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">...</span>\n",
       "</pre>\n"
      ],
      "text/plain": [
       "🚀 \u001b[1;36m正在分析文件:\u001b[0m \n",
       "\u001b[4;35m/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/standard/\u001b[0m\u001b[4;95msingle_agent.json\u001b[0m\n",
       "\u001b[4;95ml\u001b[0m\u001b[95m...\u001b[0m\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">🔍 <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">正在智能分割文件中的JSON对象</span><span style=\"color: #bfbf7f; text-decoration-color: #bfbf7f\">...</span>\n",
       "</pre>\n"
      ],
      "text/plain": [
       "🔍 \u001b[2m正在智能分割文件中的JSON对象\u001b[0m\u001b[2;33m...\u001b[0m\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "06f4dff112454e25a1fa3caa0883c816",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Output()"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
      ],
      "text/plain": []
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">\n",
       "📊 <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">统计分析完成！</span>\n",
       "</pre>\n"
      ],
      "text/plain": [
       "\n",
       "📊 \u001b[1;32m统计分析完成！\u001b[0m\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-style: italic\">      </span><span style=\"font-weight: bold; font-style: italic\">📊 总体统计 (Overall Statistics)</span><span style=\"font-style: italic\">      </span>\n",
       "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓\n",
       "┃<span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\"> 指标 (Metric)             </span>┃<span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\"> 数值 (Value) </span>┃\n",
       "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━┩\n",
       "│<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 总计数量 (Total Samples)  </span>│          165 │\n",
       "│<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 正确数量 (Correct         </span>│           <span style=\"color: #008000; text-decoration-color: #008000\">66</span> │\n",
       "│<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> Samples)                  </span>│              │\n",
       "│<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 总体正确率 (Overall ACC)  </span>│       <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">40.00%</span> │\n",
       "│<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 总消耗Tokens (Total       </span>│   19,865,560 │\n",
       "│<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> Tokens)                   </span>│              │\n",
       "│<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 平均Tokens (Avg.          </span>│   120,397.33 │\n",
       "│<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> Tokens/Sample)            </span>│              │\n",
       "└───────────────────────────┴──────────────┘\n",
       "</pre>\n"
      ],
      "text/plain": [
       "\u001b[3m      \u001b[0m\u001b[1;3m📊 总体统计 (Overall Statistics)\u001b[0m\u001b[3m      \u001b[0m\n",
       "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓\n",
       "┃\u001b[1;35m \u001b[0m\u001b[1;35m指标 (Metric)            \u001b[0m\u001b[1;35m \u001b[0m┃\u001b[1;35m \u001b[0m\u001b[1;35m数值 (Value)\u001b[0m\u001b[1;35m \u001b[0m┃\n",
       "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━┩\n",
       "│\u001b[2m \u001b[0m\u001b[2m总计数量 (Total Samples) \u001b[0m\u001b[2m \u001b[0m│          165 │\n",
       "│\u001b[2m \u001b[0m\u001b[2m正确数量 (Correct        \u001b[0m\u001b[2m \u001b[0m│           \u001b[32m66\u001b[0m │\n",
       "│\u001b[2m \u001b[0m\u001b[2mSamples)                 \u001b[0m\u001b[2m \u001b[0m│              │\n",
       "│\u001b[2m \u001b[0m\u001b[2m总体正确率 (Overall ACC) \u001b[0m\u001b[2m \u001b[0m│       \u001b[1;36m40.00%\u001b[0m │\n",
       "│\u001b[2m \u001b[0m\u001b[2m总消耗Tokens (Total      \u001b[0m\u001b[2m \u001b[0m│   19,865,560 │\n",
       "│\u001b[2m \u001b[0m\u001b[2mTokens)                  \u001b[0m\u001b[2m \u001b[0m│              │\n",
       "│\u001b[2m \u001b[0m\u001b[2m平均Tokens (Avg.         \u001b[0m\u001b[2m \u001b[0m│   120,397.33 │\n",
       "│\u001b[2m \u001b[0m\u001b[2mTokens/Sample)           \u001b[0m\u001b[2m \u001b[0m│              │\n",
       "└───────────────────────────┴──────────────┘\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-style: italic\">                    </span><span style=\"font-weight: bold; font-style: italic\">📑 按级别统计 (Statistics by Level)</span><span style=\"font-style: italic\">                     </span>\n",
       "┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┓\n",
       "┃<span style=\"color: #000080; text-decoration-color: #000080; font-weight: bold\"> 级别 (Level) </span>┃<span style=\"color: #000080; text-decoration-color: #000080; font-weight: bold\"> 正确率 (ACC) </span>┃<span style=\"color: #000080; text-decoration-color: #000080; font-weight: bold\"> 样本数 (Samples) </span>┃<span style=\"color: #000080; text-decoration-color: #000080; font-weight: bold\">   总Tokens </span>┃<span style=\"color: #000080; text-decoration-color: #000080; font-weight: bold\"> 平均Tokens </span>┃\n",
       "┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━┩\n",
       "│<span style=\"color: #008080; text-decoration-color: #008080\"> 1            </span>│       <span style=\"color: #800000; text-decoration-color: #800000\">56.60%</span> │               53 │  4,920,560 │  92,840.75 │\n",
       "│<span style=\"color: #008080; text-decoration-color: #008080\"> 2            </span>│       <span style=\"color: #800000; text-decoration-color: #800000\">34.88%</span> │               86 │ 11,342,972 │ 131,895.02 │\n",
       "│<span style=\"color: #008080; text-decoration-color: #008080\"> 3            </span>│       <span style=\"color: #800000; text-decoration-color: #800000\">23.08%</span> │               26 │  3,602,028 │ 138,539.54 │\n",
       "└──────────────┴──────────────┴──────────────────┴────────────┴────────────┘\n",
       "</pre>\n"
      ],
      "text/plain": [
       "\u001b[3m                    \u001b[0m\u001b[1;3m📑 按级别统计 (Statistics by Level)\u001b[0m\u001b[3m                     \u001b[0m\n",
       "┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┓\n",
       "┃\u001b[1;34m \u001b[0m\u001b[1;34m级别 (Level)\u001b[0m\u001b[1;34m \u001b[0m┃\u001b[1;34m \u001b[0m\u001b[1;34m正确率 (ACC)\u001b[0m\u001b[1;34m \u001b[0m┃\u001b[1;34m \u001b[0m\u001b[1;34m样本数 (Samples)\u001b[0m\u001b[1;34m \u001b[0m┃\u001b[1;34m \u001b[0m\u001b[1;34m  总Tokens\u001b[0m\u001b[1;34m \u001b[0m┃\u001b[1;34m \u001b[0m\u001b[1;34m平均Tokens\u001b[0m\u001b[1;34m \u001b[0m┃\n",
       "┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━┩\n",
       "│\u001b[36m \u001b[0m\u001b[36m1           \u001b[0m\u001b[36m \u001b[0m│       \u001b[31m56.60%\u001b[0m │               53 │  4,920,560 │  92,840.75 │\n",
       "│\u001b[36m \u001b[0m\u001b[36m2           \u001b[0m\u001b[36m \u001b[0m│       \u001b[31m34.88%\u001b[0m │               86 │ 11,342,972 │ 131,895.02 │\n",
       "│\u001b[36m \u001b[0m\u001b[36m3           \u001b[0m\u001b[36m \u001b[0m│       \u001b[31m23.08%\u001b[0m │               26 │  3,602,028 │ 138,539.54 │\n",
       "└──────────────┴──────────────┴──────────────────┴────────────┴────────────┘\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Results file to analyze (absolute, machine-specific path).\n",
    "file_path = \"/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/standard/single_agent.jsonl\"\n",
    "# Print the overall and per-level accuracy/token tables for this file.\n",
    "analyze_data_with_rich_output(file_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b2694fa9",
   "metadata": {},
   "source": [
    "## Level 1 Results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 136,
   "id": "ad0cabe0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "📊 加载了 86 条数据\n",
      "\n",
      "🎯 整体准确率: 53.49% (46/86)\n",
      "💰 总Token消耗: 53,284,834\n",
      "📈 平均每任务Token: 619591.1\n",
      "\n",
      "📊 各级别准确率:\n",
      "   Level 2: 53.49% (46/86) - 平均Token: 619591.1\n",
      "\n",
      "🔥 Token消耗排名前十的任务:\n",
      "1. 任务: e2d69698-bc99-4e85-9880-67eaccd66e6c - LEVEL 2 - Token: 3455927\n",
      "   正确性: True | 预测: Michele Fitzgerald | 真实: Michele Fitzgerald\n",
      "2. 任务: ecbc4f94-95a3-4cc7-b255-6741a458a625 - LEVEL 2 - Token: 3076576\n",
      "   正确性: False | 预测: 16 | 真实: 13\n",
      "3. 任务: 56137764-b4e0-45b8-9c52-1866420c3df5 - LEVEL 2 - Token: 1921291\n",
      "   正确性: False | 预测: Unable to determine | 真实: Li Peng\n",
      "4. 任务: d5141ca5-e7a0-469f-bf3e-e773507c86e2 - LEVEL 2 - Token: 1861791\n",
      "   正确性: False | 预测: 29/12/2019 | 真实: 19/02/2009\n",
      "5. 任务: d1af70ea-a9a4-421a-b9cc-94b5e02f1788 - LEVEL 2 - Token: 1819733\n",
      "   正确性: True | 预测: 736455 | 真实: 736455\n",
      "6. 任务: 2dfc4c37-fec1-4518-84a7-10095d30ad75 - LEVEL 2 - Token: 1767631\n",
      "   正确性: False | 预测: 3 | 真实: 6\n",
      "7. 任务: b7f857e4-d8aa-4387-af2a-0e844df5b9d8 - LEVEL 2 - Token: 1762707\n",
      "   正确性: True | 预测: 47 | 真实: 47\n",
      "8. 任务: 624cbf11-6a41-4692-af9c-36b3e5ca3130 - LEVEL 2 - Token: 1659658\n",
      "   正确性: False | 预测: Economic Crunch was laid to rest | 真实: So we had to let it die.\n",
      "9. 任务: c8b7e059-c60d-472e-ad64-3b04ae1166dc - LEVEL 2 - Token: 1544115\n",
      "   正确性: True | 预测: 8 | 真实: 8\n",
      "10. 任务: 0ff53813-3367-4f43-bcbd-3fd725c1bf4b - LEVEL 2 - Token: 1504937\n",
      "   正确性: True | 预测: beta geometric | 真实: beta geometric\n"
     ]
    }
   ],
   "source": [
    "# NOTE(review): the section heading above says Level 1, but this path and the\n",
    "# stored output refer to Level 2 results -- confirm which is intended.\n",
    "file_path = \"/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/standard/standard_level_2.jsonl\"\n",
    "# Keep whatever evaluate_jsonl returns for later inspection.\n",
    "task_details = evaluate_jsonl(file_path)\n",
    "# Token statistics for the first 15 records are unreliable."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9bce7ef6",
   "metadata": {},
   "source": [
    "### Level 1 with SupervisorAgent"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "id": "a6b4629b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import re\n",
    "\n",
    "def normalize_text(s: str) -> str:\n",
    "    \"\"\"Canonicalize an answer string for exact-match comparison.\n",
    "\n",
    "    The text is NFKC-normalized and lower-cased, every run of characters that\n",
    "    is neither a letter nor a digit becomes a single space, and leading,\n",
    "    trailing and repeated whitespace is collapsed.\n",
    "\n",
    "    Args:\n",
    "        s: Text to normalize; non-string values are converted with ``str``.\n",
    "\n",
    "    Returns:\n",
    "        The normalized text (possibly empty).\n",
    "    \"\"\"\n",
    "    import unicodedata  # local import: this cell only imports json and re\n",
    "\n",
    "    if not isinstance(s, str):\n",
    "        s = str(s)\n",
    "    # Fold compatibility forms (e.g. full-width digits) before lower-casing.\n",
    "    s = unicodedata.normalize(\"NFKC\", s).lower()\n",
    "    # [\\W_] is Unicode-aware, so non-ASCII letters and digits are kept and two\n",
    "    # different non-English answers no longer both collapse to an empty string.\n",
    "    s = re.sub(r'[\\W_]+', ' ', s)\n",
    "    return ' '.join(s.split())\n",
    "\n",
    "def judge_correct(pred: str, true: str) -> bool:\n",
    "    \"\"\"Tell whether the prediction matches the true answer once both are normalized.\"\"\"\n",
    "    predicted = normalize_text(pred)\n",
    "    expected = normalize_text(true)\n",
    "    return predicted == expected\n",
    "\n",
    "def read_multiline_jsonl(file_path: str):\n",
    "    \"\"\"Read a file of JSON objects where each object may span several lines.\n",
    "\n",
    "    Lines are accumulated until the braces seen outside of string literals\n",
    "    balance; the accumulated text is then parsed as one JSON value. Blocks\n",
    "    that fail to parse, and an unterminated block at the end of the file, are\n",
    "    reported on stdout and skipped.\n",
    "\n",
    "    Args:\n",
    "        file_path: Path to the UTF-8 encoded file.\n",
    "\n",
    "    Returns:\n",
    "        A list of the parsed objects in file order.\n",
    "    \"\"\"\n",
    "    data = []\n",
    "    buffer = \"\"\n",
    "    open_braces = 0\n",
    "    with open(file_path, 'r', encoding='utf-8') as f:\n",
    "        for line in f:\n",
    "            if not line.strip():\n",
    "                continue\n",
    "            # Count structural braces only: a brace inside a string value must\n",
    "            # not change the nesting depth. JSON strings cannot contain raw\n",
    "            # newlines, so the string state is reset for every line.\n",
    "            in_string = False\n",
    "            escaped = False\n",
    "            for ch in line:\n",
    "                if in_string:\n",
    "                    if escaped:\n",
    "                        escaped = False\n",
    "                    elif ch == '\\\\':\n",
    "                        escaped = True\n",
    "                    elif ch == '\"':\n",
    "                        in_string = False\n",
    "                elif ch == '\"':\n",
    "                    in_string = True\n",
    "                elif ch == '{':\n",
    "                    open_braces += 1\n",
    "                elif ch == '}':\n",
    "                    open_braces -= 1\n",
    "            buffer += line\n",
    "            # <= 0 (not == 0): a stray closing brace must not leave the counter\n",
    "            # negative and swallow every following object.\n",
    "            if open_braces <= 0:\n",
    "                try:\n",
    "                    data.append(json.loads(buffer))\n",
    "                except json.JSONDecodeError as e:\n",
    "                    print(\"⚠️ JSON解析失败:\", e)\n",
    "                    print(\"内容示例:\", buffer[:200])\n",
    "                buffer = \"\"\n",
    "                open_braces = 0\n",
    "    if buffer.strip():\n",
    "        # The file ended in the middle of an object; report it rather than\n",
    "        # dropping it silently.\n",
    "        print(\"⚠️ JSON解析失败:\", \"文件末尾的JSON对象不完整\")\n",
    "        print(\"内容示例:\", buffer[:200])\n",
    "    return data\n",
    "\n",
    "def evaluate_jsonl(file_path: str):\n",
    "    \"\"\"Evaluate a (possibly multi-line) JSONL results file and print a report.\n",
    "\n",
    "    Prints one line per record, the overall accuracy, the total and average\n",
    "    token usage, and the ten records with the highest token count.\n",
    "\n",
    "    Args:\n",
    "        file_path: Path handed to ``read_multiline_jsonl``.\n",
    "\n",
    "    Returns:\n",
    "        None. All results are printed.\n",
    "    \"\"\"\n",
    "    data = read_multiline_jsonl(file_path)\n",
    "    print(f\"📊 加载了 {len(data)} 条数据\")\n",
    "\n",
    "    total_correct = 0\n",
    "    total_count = len(data)\n",
    "    token_list = []\n",
    "\n",
    "    for num, item in enumerate(data, 1):\n",
    "        prediction = str(item.get(\"prediction\", \"\"))\n",
    "        true_answer = str(item.get(\"true_answer\", \"\"))\n",
    "        task_id = item.get(\"task_id\", \"unknown\")\n",
    "        # An explicit JSON null is treated like a missing value.\n",
    "        token_count = (item.get(\"token_counts\") or {}).get(\"total_token_count\") or 0\n",
    "\n",
    "        is_correct = judge_correct(prediction, true_answer)\n",
    "        total_correct += is_correct\n",
    "\n",
    "        token_list.append((token_count, task_id, prediction, true_answer, is_correct))\n",
    "\n",
    "        print(f\"Num:{num} ID: {task_id}\" + f\"   正确性: {is_correct}\" + f\"   Prediction: {prediction}\" + f\"   True Answer: {true_answer}\" + f\"   Total Token Count: {token_count}\\n\")\n",
    "\n",
    "    acc = total_correct / total_count if total_count > 0 else 0\n",
    "    print(f\"\\n🎯 总体准确率: {acc:.2%} ({total_correct}/{total_count})\")\n",
    "\n",
    "    # Token usage summary\n",
    "    token_list.sort(reverse=True, key=lambda x: x[0])\n",
    "    total_tokens = sum(t[0] for t in token_list)\n",
    "    avg_tokens = total_tokens / len(token_list) if token_list else 0\n",
    "    print(f\"\\n💰 总Token消耗: {total_tokens:,}\")\n",
    "    print(f\"📈 平均每任务Token: {avg_tokens:.2f}\")\n",
    "    print(\"\\n🔥 Token 消耗排名前10的题目:\")\n",
    "    # The list is sorted in descending order, so the ten most expensive tasks\n",
    "    # are the first ten entries (the last ten would be the cheapest).\n",
    "    for idx, (tokens, task_id, pred, true, correct) in enumerate(token_list[:10], 1):\n",
    "        print(f\"{idx}. 任务ID: {task_id}, Token: {tokens}, 正确: {correct}, Prediction: {pred}, True Answer: {true}\")\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "1dc5b7f0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "📊 加载了 4 条数据\n",
      "Num:1 ID: e1fc63a2-da7a-432f-be78-7c4a95598703   正确性: False   Prediction: 17000   True Answer: 17   Total Token Count: 251384\n",
      "\n",
      "Num:2 ID: 8e867cd7-cff9-4e6c-867a-ff5ddc2550be   正确性: True   Prediction: 3   True Answer: 3   Total Token Count: 852039\n",
      "\n",
      "Num:3 ID: ec09fa32-d03f-4bf8-84b0-1f16922c3ae4   正确性: False   Prediction: 1   True Answer: 3   Total Token Count: 253970\n",
      "\n",
      "Num:4 ID: 5d0080cb-90d7-4712-bc33-848150e917d3   正确性: True   Prediction: 0.1777   True Answer: 0.1777   Total Token Count: 81167\n",
      "\n",
      "\n",
      "🎯 总体准确率: 50.00% (2/4)\n",
      "\n",
      "💰 总Token消耗: 1,438,560\n",
      "📈 平均每任务Token: 359640.00\n",
      "\n",
      "🔥 Token 消耗排名前10的题目:\n",
      "1. 任务ID: 8e867cd7-cff9-4e6c-867a-ff5ddc2550be, Token: 852039, 正确: True, Prediction: 3, True Answer: 3\n",
      "2. 任务ID: ec09fa32-d03f-4bf8-84b0-1f16922c3ae4, Token: 253970, 正确: False, Prediction: 1, True Answer: 3\n",
      "3. 任务ID: e1fc63a2-da7a-432f-be78-7c4a95598703, Token: 251384, 正确: False, Prediction: 17000, True Answer: 17\n",
      "4. 任务ID: 5d0080cb-90d7-4712-bc33-848150e917d3, Token: 81167, 正确: True, Prediction: 0.1777, True Answer: 0.1777\n"
     ]
    }
   ],
   "source": [
    "# Evaluate the results file standard_level_1_pass_2.jsonl (absolute, machine-specific path).\n",
    "evaluate_jsonl(\"/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/standard/standard_level_1_pass_2.jsonl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 139,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "📊 加载了 13 条数据\n",
      "Num:1 ID: 676e5e31-a554-4acc-9286-b60d90a92d26   正确性: False   Prediction: 100   True Answer: 86   Total Token Count: 271915\n",
      "\n",
      "Num:2 ID: bec74516-02fc-48dc-b202-55e78d0e17cf   正确性: False   Prediction: Unable to determine   True Answer: 26.4   Total Token Count: 172124\n",
      "\n",
      "Num:3 ID: 00d579ea-0889-4fd9-a771-2c8d79835c8d   正确性: False   Prediction: Oliver Selfridge   True Answer: Claude Shannon   Total Token Count: 111776\n",
      "\n",
      "Num:4 ID: 983bba7c-c092-455f-b6c9-7857003d48fc   正确性: False   Prediction: Mouse (Mus musculus)   True Answer: mice   Total Token Count: 932720\n",
      "\n",
      "Num:5 ID: 56db2318-640f-477a-a82f-bc93ad13e882   正确性: True   Prediction: 7, 9   True Answer: 7, 9   Total Token Count: 35729\n",
      "\n",
      "Num:6 ID: 8131e2c0-0083-4265-9ce7-78c2d568425d   正确性: False   Prediction: 67 for Cheater, 56 for Cheater beater   True Answer: 101.376, 84.348   Total Token Count: 119598\n",
      "\n",
      "Num:7 ID: 72c06643-a2fa-4186-aa5c-9ec33ae9b445   正确性: False   Prediction: 156   True Answer: 55   Total Token Count: 1270736\n",
      "\n",
      "Num:8 ID: ebbc1f13-d24d-40df-9068-adcf735b4240   正确性: False   Prediction: no source title   True Answer: The World of the Twenty First Century   Total Token Count: 603539\n",
      "\n",
      "Num:9 ID: c526d8d6-5987-4da9-b24c-83466fa172f3   正确性: False   Prediction: 0.0429   True Answer: 0.0424   Total Token Count: 483136\n",
      "\n",
      "Num:10 ID: 3da89939-209c-4086-8520-7eb734e6b4ef   正确性: False   Prediction: 8,29,14,25,20,21   True Answer: 8, 29, 22, 1, 8, 26   Total Token Count: 162147\n",
      "\n",
      "Num:11 ID: 8d46b8d6-b38a-47ff-ac74-cda14cf2d19b   正确性: False   Prediction: 0.0012   True Answer: 0.00033   Total Token Count: 862171\n",
      "\n",
      "Num:12 ID: e961a717-6b25-4175-8a68-874d28190ee4   正确性: True   Prediction: 12   True Answer: 12   Total Token Count: 853024\n",
      "\n",
      "Num:13 ID: 851e570a-e3de-4d84-bcfa-cc85578baa59   正确性: True   Prediction: briniest   True Answer: Briniest   Total Token Count: 220914\n",
      "\n",
      "\n",
      "🎯 总体准确率: 23.08% (3/13)\n",
      "\n",
      "💰 总Token消耗: 6,099,529\n",
      "📈 平均每任务Token: 469194.54\n",
      "\n",
      "🔥 Token 消耗排名前10的题目:\n",
      "1. 任务ID: e961a717-6b25-4175-8a68-874d28190ee4, Token: 853024, 正确: True, Prediction: 12, True Answer: 12\n",
      "2. 任务ID: ebbc1f13-d24d-40df-9068-adcf735b4240, Token: 603539, 正确: False, Prediction: no source title, True Answer: The World of the Twenty First Century\n",
      "3. 任务ID: c526d8d6-5987-4da9-b24c-83466fa172f3, Token: 483136, 正确: False, Prediction: 0.0429, True Answer: 0.0424\n",
      "4. 任务ID: 676e5e31-a554-4acc-9286-b60d90a92d26, Token: 271915, 正确: False, Prediction: 100, True Answer: 86\n",
      "5. 任务ID: 851e570a-e3de-4d84-bcfa-cc85578baa59, Token: 220914, 正确: True, Prediction: briniest, True Answer: Briniest\n",
      "6. 任务ID: bec74516-02fc-48dc-b202-55e78d0e17cf, Token: 172124, 正确: False, Prediction: Unable to determine, True Answer: 26.4\n",
      "7. 任务ID: 3da89939-209c-4086-8520-7eb734e6b4ef, Token: 162147, 正确: False, Prediction: 8,29,14,25,20,21, True Answer: 8, 29, 22, 1, 8, 26\n",
      "8. 任务ID: 8131e2c0-0083-4265-9ce7-78c2d568425d, Token: 119598, 正确: False, Prediction: 67 for Cheater, 56 for Cheater beater, True Answer: 101.376, 84.348\n",
      "9. 任务ID: 00d579ea-0889-4fd9-a771-2c8d79835c8d, Token: 111776, 正确: False, Prediction: Oliver Selfridge, True Answer: Claude Shannon\n",
      "10. 任务ID: 56db2318-640f-477a-a82f-bc93ad13e882, Token: 35729, 正确: True, Prediction: 7, 9, True Answer: 7, 9\n"
     ]
    }
   ],
   "source": [
    "# Evaluate the results stored in supervisor_level_3.jsonl; the commented-out path below is an alternative validation file.\n",
    "# file_path = \"/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/validation/0907_supervisor.jsonl\"\n",
    "file_path = \"/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/standard/supervisor_level_3.jsonl\"\n",
    "evaluate_jsonl(file_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2b714622",
   "metadata": {},
   "source": [
    "Average token cost of the high-cost Level 1 examples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "f50ddf83",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "💰 高Token任务总Token消耗: 8,720,517\n",
      "📈 高Token任务平均每任务Token: 872051.7\n"
     ]
    }
   ],
   "source": [
    "# Total and mean token usage over the records in high_token_task_ids.jsonl.\n",
    "file_path = \"/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/data/supervisor_list/high_token_task_ids.jsonl\"\n",
    "data = []\n",
    "with open(file_path, 'r', encoding='utf-8') as f:\n",
    "    for line in f:\n",
    "        line = line.strip()\n",
    "        if not line:\n",
    "            # Blank lines (e.g. a trailing newline at EOF) are not records.\n",
    "            continue\n",
    "        data.append(json.loads(line))\n",
    "total_tokens = sum(record[\"total_token_count\"] for record in data)\n",
    "# Guard the mean so an empty file reports 0.0 instead of raising ZeroDivisionError.\n",
    "avg_tokens = total_tokens / len(data) if data else 0\n",
    "print(f\"💰 高Token任务总Token消耗: {total_tokens:,}\")\n",
    "print(f\"📈 高Token任务平均每任务Token: {avg_tokens:.1f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c97a892b",
   "metadata": {},
   "source": [
    "### Comparison"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 175,
   "id": "dfea848f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import pandas as pd\n",
    "import unicodedata\n",
    "\n",
    "# def normalize_text(s: str) -> str:\n",
    "#     \"\"\"统一格式：小写、去掉标点符号、去掉多余空格\"\"\"\n",
    "#     s = s.lower()\n",
    "#     s = re.sub(r'[^a-z0-9]+', ' ', s)\n",
    "#     s = ' '.join(s.split())\n",
    "#     return s\n",
    "\n",
    "def normalize_text(s: str) -> str:\n",
    "    # 转小写\n",
    "    s = s.lower()\n",
    "    # 统一 Unicode 表示（把奇怪的空格、分号之类统一）\n",
    "    s = unicodedata.normalize(\"NFKC\", s)\n",
    "    # 保留字母数字，其他符号一律换空格\n",
    "    s = re.sub(r'[^a-z0-9]+', ' ', s)\n",
    "    # 去掉多余空格\n",
    "    s = ' '.join(s.split())\n",
    "    return s\n",
    "\n",
    "def judge_correct(pred: str, true: str) -> bool:\n",
    "    \"\"\"Return True when prediction and reference are equal after normalize_text.\"\"\"\n",
    "    normalized_pred = normalize_text(pred)\n",
    "    normalized_true = normalize_text(true)\n",
    "    return normalized_pred == normalized_true\n",
    "\n",
    "def load_jsonl(path):\n",
    "    \"\"\"手动读取多行 JSONL 文件\"\"\"\n",
    "    items, buf = [], \"\"\n",
    "    with open(path, \"r\", encoding=\"utf-8\") as f:\n",
    "        for line in f:\n",
    "            line = line.strip()\n",
    "            if not line:\n",
    "                continue\n",
    "            buf += line\n",
    "            try:\n",
    "                obj = json.loads(buf)\n",
    "                items.append(obj)\n",
    "                buf = \"\"\n",
    "            except json.JSONDecodeError:\n",
    "                continue\n",
    "    return pd.DataFrame(items)\n",
    "\n",
    "def compare_jsonl(file1: str, file2: str, export_path: str = None):\n",
    "    \"\"\"Compare two result files task by task and print a summary.\n",
    "\n",
    "    ``file1`` is reported as \"Supervisor\" and ``file2`` as \"Smolagent\".\n",
    "\n",
    "    Args:\n",
    "        file1: Path of the first JSONL result file.\n",
    "        file2: Path of the second JSONL result file.\n",
    "        export_path: Optional destination for the merged per-task table.\n",
    "            A path ending in \".csv\" is written with ``to_csv``; any other\n",
    "            path is written with ``to_excel``.\n",
    "\n",
    "    Returns:\n",
    "        dict with \"summary\" (one row per run: record count, accuracy, total\n",
    "        and mean tokens) and \"comparison_table\" (outer merge of the per-task\n",
    "        rows of both runs on ``task_id``).\n",
    "    \"\"\"\n",
    "    df1, df2 = load_jsonl(file1), load_jsonl(file2)\n",
    "\n",
    "    # 0. Number of records in each run.\n",
    "    n1, n2 = len(df1), len(df2)\n",
    "\n",
    "    def extract_info(df):\n",
    "        \"\"\"Build the per-task rows (correctness, tokens, answers) of one run.\"\"\"\n",
    "        if df.empty:\n",
    "            # An empty file gives a frame without columns; return the expected\n",
    "            # schema so the merge below still works.\n",
    "            return pd.DataFrame(columns=[\"task_id\", \"correct\", \"token_used\", \"prediction\", \"true_answer\"])\n",
    "        return pd.DataFrame({\n",
    "            \"task_id\": df[\"task_id\"],\n",
    "            \"correct\": df.apply(lambda row: judge_correct(str(row[\"prediction\"]), str(row[\"true_answer\"])), axis=1),\n",
    "            \"token_used\": df[\"token_counts\"].apply(lambda x: x[\"total_token_count\"]),\n",
    "            \"prediction\": df[\"prediction\"],\n",
    "            \"true_answer\": df[\"true_answer\"]\n",
    "        })\n",
    "\n",
    "    # 1. Per-task rows, computed once and reused for the summary figures.\n",
    "    compare1 = extract_info(df1)\n",
    "    compare2 = extract_info(df2)\n",
    "\n",
    "    # 2. Accuracy (via judge_correct) and token usage; an empty run reports 0.\n",
    "    acc1 = compare1[\"correct\"].mean() if n1 > 0 else 0\n",
    "    acc2 = compare2[\"correct\"].mean() if n2 > 0 else 0\n",
    "    total_tokens1 = compare1[\"token_used\"].sum() if n1 > 0 else 0\n",
    "    total_tokens2 = compare2[\"token_used\"].sum() if n2 > 0 else 0\n",
    "    avg_tokens1 = total_tokens1 / n1 if n1 > 0 else 0\n",
    "    avg_tokens2 = total_tokens2 / n2 if n2 > 0 else 0\n",
    "\n",
    "    # 3. Side-by-side table; the outer join keeps tasks present in only one run.\n",
    "    merged = compare1.merge(compare2, on=\"task_id\", suffixes=(\"_Supervisor\", \"_Smolagent\"), how=\"outer\")\n",
    "\n",
    "    # Summary table, one row per run.\n",
    "    summary = pd.DataFrame({\n",
    "        \"数据量\": [n1, n2],\n",
    "        \"准确率\": [f\"{acc1:.2%}\", f\"{acc2:.2%}\"],\n",
    "        \"总token\": [total_tokens1, total_tokens2],\n",
    "        \"平均token\": [f\"{avg_tokens1:.1f}\", f\"{avg_tokens2:.1f}\"]\n",
    "    }, index=[\"Supervisor\", \"Smolagent\"])\n",
    "\n",
    "    print(\"📊 摘要信息：\")\n",
    "    print(summary.to_string())\n",
    "    print(\"\\n📋 对比表格（前几行预览）：\")\n",
    "    print(merged.head(10).to_string(index=False))\n",
    "\n",
    "    # Optional export: CSV for \".csv\" paths, Excel for everything else.\n",
    "    if export_path:\n",
    "        if str(export_path).lower().endswith(\".csv\"):\n",
    "            merged.to_csv(export_path, index=False)\n",
    "        else:\n",
    "            merged.to_excel(export_path, index=False)\n",
    "        print(f\"\\n✅ 已导出完整对比表格到 {export_path}\")\n",
    "\n",
    "    return {\"summary\": summary, \"comparison_table\": merged}\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f6a9b06a",
   "metadata": {},
   "source": [
    "#### LEVEL 1 COMPARISON"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 201,
   "id": "d08040d0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "📊 摘要信息：\n",
      "            数据量     准确率    总token   平均token\n",
      "Supervisor   53  75.47%  14672318  276836.2\n",
      "Smolagent    53  71.70%  14953345  282138.6\n",
      "\n",
      "📋 对比表格（前几行预览）：\n",
      "                             task_id  correct_Supervisor  token_used_Supervisor                                  prediction_Supervisor                                 true_answer_Supervisor  correct_Smolagent  token_used_Smolagent                                   prediction_Smolagent                                  true_answer_Smolagent\n",
      "0383a3ee-47a7-41a4-b493-519bdefe0488                True                 236980                                     rockhopper penguin                                     Rockhopper penguin               True                 90870                                     rockhopper penguin                                     Rockhopper penguin\n",
      "11af4e1a-5f45-467d-9aeb-46f4bb0bf034                True                 351429                                                      6                                                      6               True                159288                                                      6                                                      6\n",
      "1f975693-876d-457b-a649-393859e79bf3               False                  20195                               132,133,134,197,245,1197                                132, 133, 134, 197, 245              False                 22454                               132,133,134,197,245,1197                                132, 133, 134, 197, 245\n",
      "23dd907f-1261-4488-b21c-e9185af91d5e                True                 321880                                                      2                                                      2               True                301170                                                      2                                                      2\n",
      "27d5d136-8563-469e-92bf-fd103c28b57c                True                  35817                                    (¬A → B) ↔ (A ∨ ¬B)                                    (¬A → B) ↔ (A ∨ ¬B)               True                 26552                                    (¬A → B) ↔ (A ∨ ¬B)                                    (¬A → B) ↔ (A ∨ ¬B)\n",
      "2d83110e-a098-4ebb-9987-066c06fa42d0                True                  17400                                                  right                                                  Right               True                 17167                                                  right                                                  Right\n",
      "305ac316-eef6-4446-960a-92d80d542f82                True                  90660                                               Wojciech                                               Wojciech               True                129546                                               Wojciech                                               Wojciech\n",
      "389793a7-ca17-4e82-81cb-2b3a2391b4b9                True                  96635                                                      3                                                      3               True                 57678                                                      3                                                      3\n",
      "3cef3a44-215e-4aed-8e3b-b1e3f08063b7                True                 835716 broccoli, celery, fresh basil, lettuce, sweet potatoes broccoli, celery, fresh basil, lettuce, sweet potatoes               True                285211 broccoli, celery, fresh basil, lettuce, sweet potatoes broccoli, celery, fresh basil, lettuce, sweet potatoes\n",
      "3f57289b-8c60-48be-bd80-01f8099ca449                True                 337292                                                    519                                                    519               True                342249                                                    519                                                    519\n",
      "\n",
      "✅ 已导出完整对比表格到 /home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/standard/comparison_level_1_ps3.xlsx\n"
     ]
    }
   ],
   "source": [
    "# Level 1 comparison of the two runs via compare_jsonl; only the pass@3 call is active, the others are kept commented out.\n",
    "# pass@1\n",
    "# NOTE(review): this pass@1 block reads standard_level_1_p2.jsonl, the same second file as the pass@2 block below -- confirm the intended file.\n",
    "# res = compare_jsonl(\n",
    "#     \"/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/standard/supervisor_level_1.jsonl\", \n",
    "#     \"/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/standard/standard_level_1_p2.jsonl\",\n",
    "#     export_path=\"/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/standard/comparison_level_1_ps1.xlsx\")\n",
    "\n",
    "# pass@2\n",
    "# res = compare_jsonl(\n",
    "#     \"/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/standard/supervisor_level_1_p2.jsonl\", \n",
    "#     \"/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/standard/standard_level_1_p2.jsonl\",\n",
    "#     export_path=\"/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/standard/comparison_level_1_ps2.xlsx\")\n",
    "\n",
    "# pass@3\n",
    "res = compare_jsonl(\n",
    "    \"/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/standard/supervisor_level_1_p3.jsonl\", \n",
    "    \"/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/standard/standard_level_1_p3.jsonl\",\n",
    "    export_path=\"/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/standard/comparison_level_1_ps3.xlsx\")\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "43e77aa3",
   "metadata": {},
   "source": [
    "#### LEVEL 2 COMPARISON"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 209,
   "id": "d4211e28",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "📊 摘要信息：\n",
      "            数据量     准确率    总token   平均token\n",
      "Supervisor   86  54.65%  35897321  417410.7\n",
      "Smolagent    86  63.95%  52034102  605047.7\n",
      "\n",
      "📋 对比表格（前几行预览）：\n",
      "                             task_id  correct_Supervisor  token_used_Supervisor  prediction_Supervisor            true_answer_Supervisor  correct_Smolagent  token_used_Smolagent               prediction_Smolagent             true_answer_Smolagent\n",
      "023e9d44-96ae-4eed-b912-244ee8c3b994               False                 233934                   8.00                                 8              False                109342                                  0                                 8\n",
      "04a04a9b-226c-43fd-b319-d5e89743676f               False                 310850                     34                                41               True                171660                                 41                                41\n",
      "05407167-39ec-4d3a-a234-73a9120c325d               False                 455338 No such command exists                   Format Document               True                175427                    Format Document                   Format Document\n",
      "076c8171-9b3b-49b9-a477-244d2a532826                True                  40197                Finance                           Finance               True                 41156                            Finance                           Finance\n",
      "08c0b6e9-1b43-4c2e-ae55-4e3fce2c2715                True                 405607          orange, white                     orange, white               True                474879                      orange, white                     orange, white\n",
      "08cae58d-4084-4616-b6dd-dd6534e4825b               False                 762474    Unable to determine                              2018              False                661702                Unable to determine                              2018\n",
      "08f3a05f-5947-4089-a4c4-d4bcfaa6b7a0                True                  21138                      2                                 2               True                 30308                                  2                                 2\n",
      "0a3cd321-3e76-4622-911b-0fda2e5d6b1a               False                 214357           China, India Brunei, China, Morocco, Singapore              False                145682 Botswana, China, Guinea, Singapore Brunei, China, Morocco, Singapore\n",
      "0a65cb96-cb6e-4a6a-8aae-c1084f613456                True                 483027               Holabird                          Holabird               True                648764                           Holabird                          Holabird\n",
      "0b260a57-3f3a-4405-9f29-6d7a1012dbfb               False                 722723                  0.000                             0.269              False               1927695                              0.000                             0.269\n",
      "\n",
      "✅ 已导出完整对比表格到 /home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/standard/comparison_level_2_ps3.xlsx\n"
     ]
    }
   ],
   "source": [
    "# Level 2 comparison of the two runs via compare_jsonl; only the pass@3 call is active, the others are kept commented out.\n",
    "# pass@1\n",
    "# res = compare_jsonl(\n",
    "#     \"/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/standard/supervisor_level_2.jsonl\",\n",
    "#     \"/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/standard/standard_level_2.jsonl\",\n",
    "#     export_path=\"/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/standard/comparison_level_2_ps1.xlsx\")\n",
    "\n",
    "# pass@2\n",
    "# res = compare_jsonl(\n",
    "#     \"/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/standard/supervisor_level_2_p2.jsonl\",\n",
    "#     \"/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/standard/standard_level_2_p2.jsonl\",\n",
    "#     export_path=\"/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/standard/comparison_level_2_ps2.xlsx\")\n",
    "\n",
    "# pass@3\n",
    "# NOTE(review): the first file is supervisor_level_2_p2.jsonl while the second file and the export name use p3/ps3 -- confirm whether supervisor_level_2_p3.jsonl was intended.\n",
    "res = compare_jsonl(\n",
    "    \"/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/standard/supervisor_level_2_p2.jsonl\",\n",
    "    \"/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/standard/standard_level_2_p3.jsonl\",\n",
    "    export_path=\"/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/standard/comparison_level_2_ps3.xlsx\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e2f6e27e",
   "metadata": {},
   "source": [
    "#### LEVEL 3 COMPARISON"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 202,
   "id": "d4799a01",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "📊 摘要信息：\n",
      "            数据量     准确率    总token   平均token\n",
      "Supervisor   26  38.46%  13759075  529195.2\n",
      "Smolagent    26  38.46%  15908706  611873.3\n",
      "\n",
      "📋 对比表格（前几行预览）：\n",
      "                             task_id  correct_Supervisor  token_used_Supervisor prediction_Supervisor true_answer_Supervisor  correct_Smolagent  token_used_Smolagent prediction_Smolagent true_answer_Smolagent\n",
      "00d579ea-0889-4fd9-a771-2c8d79835c8d                True                 427902        Claude Shannon         Claude Shannon              False                255459     Oliver Selfridge        Claude Shannon\n",
      "0512426f-4d28-49f0-be77-06d05daec096               False                 335853              65000000              100000000              False                120621             66000000             100000000\n",
      "0bdb7c40-671d-4ad1-9ce3-986b159c0ddc               False                 278633            White;5516            White; 5876               True               2144864           White;5876           White; 5876\n",
      "384d0dd8-e8a4-4cfe-963c-d37f256e7662               False                1106412                 12297                   4192              False               1204709  Unable to determine                  4192\n",
      "3da89939-209c-4086-8520-7eb734e6b4ef               False                 349551      8,25,14,13,26,21    8, 29, 22, 1, 8, 26              False                258653      8,25,3,20,26,25   8, 29, 22, 1, 8, 26\n",
      "50f58759-7bd6-406f-9b0d-5692beb2a926               False                 535588   Unable to determine                      3              False                372706  Unable to determine                     3\n",
      "56db2318-640f-477a-a82f-bc93ad13e882                True                  35729                  7, 9                   7, 9               True                 38046                 7, 9                  7, 9\n",
      "5b2a14e8-6e59-479c-80e3-4696e8980152                True                 430557                 bacon                  bacon              False               1442909  Unable to determine                 bacon\n",
      "5f982798-16b9-4051-ab57-cfc7ebdb2a91               False                1264943              0.000585                    0.2              False                512534                  0.8                   0.2\n",
      "676e5e31-a554-4acc-9286-b60d90a92d26               False                3080613                     0                     86              False                315001  Unable to determine                    86\n",
      "\n",
      "✅ 已导出完整对比表格到 /home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/standard/comparison_level_3_ps3.xlsx\n"
     ]
    }
   ],
   "source": [
    "# Level 3 comparison of the two runs via compare_jsonl; only the pass@3 call is active, the others are kept commented out.\n",
    "# pass@1\n",
    "# res = compare_jsonl(\n",
    "#     \"/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/standard/supervisor_level_3.jsonl\",\n",
    "#     \"/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/standard/standard_level_3.jsonl\",\n",
    "#     export_path=\"/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/standard/comparison_level_3_ps1.xlsx\")\n",
    "\n",
    "# pass@2\n",
    "# res = compare_jsonl(\n",
    "#     \"/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/standard/supervisor_level_3_p2.jsonl\",\n",
    "#     \"/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/standard/standard_level_3_p2.jsonl\",\n",
    "#     export_path=\"/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/standard/comparison_level_3_ps2.xlsx\")\n",
    "\n",
    "# pass@3\n",
    "# NOTE(review): the first file is supervisor_level_3_p2.jsonl while the second file and the export name use p3/ps3 -- confirm whether supervisor_level_3_p3.jsonl was intended.\n",
    "res = compare_jsonl(\n",
    "    \"/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/standard/supervisor_level_3_p2.jsonl\",\n",
    "    \"/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/standard/standard_level_3_p3.jsonl\",\n",
    "    export_path=\"/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/standard/comparison_level_3_ps3.xlsx\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c2c40d1d",
   "metadata": {},
   "source": [
    "## Level 2 Results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "78cf86b9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "📊 加载了 86 条数据\n",
      "\n",
      "🎯 整体准确率: 53.49% (46/86)\n",
      "💰 总Token消耗: 53,284,834\n",
      "📈 平均每任务Token: 619591.1\n",
      "\n",
      "📊 各级别准确率:\n",
      "   Level 2: 53.49% (46/86) - 平均Token: 619591.1\n",
      "\n",
      "📋 每个任务的详细信息:\n",
      "   任务: c61d22de-5f6c-4958-a7f6-5e9707bd3466 - LEVEL 2 True - 预测: Egalitarian - 真实: egalitarian\n",
      "   任务: 17b5a6a3-bc87-42e8-b0fb-6ab0781ef2cc - LEVEL 2 True - 预测: 34689 - 真实: 34689\n",
      "   任务: 04a04a9b-226c-43fd-b319-d5e89743676f - LEVEL 2 False - 预测: 42 - 真实: 41\n",
      "   任务: 14569e28-c88c-43e4-8c32-097d35b9a67d - LEVEL 2 False - 预测: i - 真实: backtick\n",
      "   任务: 32102e3e-d12a-4209-9163-7b3a104efe5d - LEVEL 2 True - 预测: Time-Parking 2: Parallel Universe - 真实: Time-Parking 2: Parallel Universe\n",
      "   任务: 3627a8be-a77f-41bb-b807-7e1bd4c0ebdf - LEVEL 2 True - 预测: 142 - 真实: 142\n",
      "   任务: 7619a514-5fa8-43ef-9143-83b66a43d7a4 - LEVEL 2 False - 预测: 07/03/24 - 真实: 04/15/18\n",
      "   任务: 7dd30055-0198-452e-8c25-f73dbe27dcb8 - LEVEL 2 False - 预测: 146 pm - 真实: 1.456\n",
      "   任务: 2a649bb1-795f-4a01-b3be-9a01868dae73 - LEVEL 2 False - 预测: 3.1.3.1 - 真实: 3.1.3.1; 1.11.1.7\n",
      "   任务: 87c610df-bef7-4932-b950-1d83ef4e282b - LEVEL 2 True - 预测: Morarji Desai - 真实: Morarji Desai\n",
      "   任务: 624cbf11-6a41-4692-af9c-36b3e5ca3130 - LEVEL 2 False - 预测: Economic Crunch was laid to rest - 真实: So we had to let it die.\n",
      "   任务: dd3c7503-f62a-4bd0-9f67-1b63b94194cc - LEVEL 2 True - 预测: 6 - 真实: 6\n",
      "   任务: df6561b2-7ee5-4540-baab-5095f742716a - LEVEL 2 False - 预测: 17.339 - 真实: 17.056\n",
      "   任务: f0f46385-fc03-4599-b5d3-f56496c3e69f - LEVEL 2 True - 预测: Indonesia,Myanmar - 真实: Indonesia, Myanmar\n",
      "   任务: e4e91f1c-1dcd-439e-9fdd-cb976f5293fd - LEVEL 2 True - 预测: cloak - 真实: cloak\n",
      "   任务: 56137764-b4e0-45b8-9c52-1866420c3df5 - LEVEL 2 False - 预测: Unable to determine - 真实: Li Peng\n",
      "   任务: 8b3379c0-0981-4f5b-8407-6444610cb212 - LEVEL 2 False - 预测: 13 - 真实: 1.8\n",
      "   任务: 0ff53813-3367-4f43-bcbd-3fd725c1bf4b - LEVEL 2 True - 预测: beta geometric - 真实: beta geometric\n",
      "   任务: a7feb290-76bb-4cb7-8800-7edaf7954f2f - LEVEL 2 False - 预测: 60 - 真实: 31\n",
      "   任务: b4cc024b-3f5e-480e-b96a-6656493255b5 - LEVEL 2 False - 预测: Russo-German Legion - 真实: Russian-German Legion\n",
      "   任务: 33d8ea3b-6c6b-4ff1-803d-7e270dea8a57 - LEVEL 2 True - 预测: 2 - 真实: 2\n",
      "   任务: e8cb5b03-41e0-4086-99e5-f6806cd97211 - LEVEL 2 True - 预测: shrimp - 真实: shrimp\n",
      "   任务: f46b4380-207e-4434-820b-f32ce04ae2a4 - LEVEL 2 True - 预测: Harbinger, Tidal - 真实: Harbinger, Tidal\n",
      "   任务: 05407167-39ec-4d3a-a234-73a9120c325d - LEVEL 2 False - 预测: Unable to determine - 真实: Format Document\n",
      "   任务: b9763138-c053-4832-9f55-86200cb1f99c - LEVEL 2 True - 预测: 3 - 真实: 3\n",
      "   任务: 16d825ff-1623-4176-a5b5-42e0f5c2b0ac - LEVEL 2 False - 预测: 12:52 PM - 真实: 6:41 PM\n",
      "   任务: 2b3ef98c-cc05-450b-a719-711aee40ac65 - LEVEL 2 False - 预测: givemethatmanthatisnotpassionsslaveandiwillwearhiminmyheartscoreayinmyheartofheart - 真实: To be or not to be that is the question whether tis nobler in the mind to suffer the slings and arrows of outrageous fortune\n",
      "   任务: bfcd99e1-0690-4b53-a85c-0174a8629083 - LEVEL 2 True - 预测: 17 - 真实: 17\n",
      "   任务: 544b7f0c-173a-4377-8d56-57b36eb26ddf - LEVEL 2 True - 预测: A Nightmare on Elm Street - 真实: A Nightmare on Elm Street\n",
      "   任务: 6b078778-0b90-464d-83f6-59511c811b01 - LEVEL 2 True - 预测: Alfonso Visconti - 真实: Alfonso Visconti\n",
      "   任务: 076c8171-9b3b-49b9-a477-244d2a532826 - LEVEL 2 True - 预测: Finance - 真实: Finance\n",
      "   任务: 08cae58d-4084-4616-b6dd-dd6534e4825b - LEVEL 2 False - 预测: Unable to determine\n",
      "\n",
      "Explanation: Google Finance displays only split-adjusted stock prices for AAPL and does not offer unadjusted (raw, non-split-adjusted) price history in its public interface as of 2024. Therefore, according to Google Finance, the answer for when Apple stock first exceeded $50 \"without adjusting for stock split\" cannot be determined using Google Finance data, because that data is not presented or available in this required form. - 真实: 2018\n",
      "   任务: 2dfc4c37-fec1-4518-84a7-10095d30ad75 - LEVEL 2 False - 预测: 3 - 真实: 6\n",
      "   任务: 9f41b083-683e-4dcf-9185-ccfeaa88fa45 - LEVEL 2 True - 预测: 0 - 真实: 0\n",
      "   任务: ecbc4f94-95a3-4cc7-b255-6741a458a625 - LEVEL 2 False - 预测: 16 - 真实: 13\n",
      "   任务: e9a2c537-8232-4c3f-85b0-b52de6bcba99 - LEVEL 2 False - 预测: 2 - 真实: 7\n",
      "   任务: 71345b0a-9c7d-4b50-b2bf-937ec5879845 - LEVEL 2 False - 预测: A dragon is a creature that does not exist - 真实: Here be dragons\n",
      "   任务: 7b5377b0-3f38-4103-8ad2-90fe89864c04 - LEVEL 2 True - 预测: 563.9 - 真实: 563.9\n",
      "   任务: 114d5fd0-e2ae-4b6d-a65a-870da2d19c08 - LEVEL 2 True - 预测: 4 - 真实: 4\n",
      "   任务: 8f80e01c-1296-4371-9486-bb3d68651a60 - LEVEL 2 False - 预测: 7 - 真实: 90\n",
      "   任务: ad37a656-079a-49f9-a493-7b739c9167d1 - LEVEL 2 True - 预测: Bravo - 真实: Bravo\n",
      "   任务: 366e2f2b-8632-4ef2-81eb-bc3877489217 - LEVEL 2 True - 预测: Shelley's Place - 真实: Shelley's place\n",
      "   任务: f3917a3d-1d17-4ee2-90c5-683b072218fe - LEVEL 2 False - 预测: 2736 - 真实: 2732\n",
      "   任务: 48eb8242-1099-4c26-95d4-ef22b002457a - LEVEL 2 True - 预测: 6 - 真实: 6\n",
      "   任务: c8b7e059-c60d-472e-ad64-3b04ae1166dc - LEVEL 2 True - 预测: 8 - 真实: 8\n",
      "   任务: d1af70ea-a9a4-421a-b9cc-94b5e02f1788 - LEVEL 2 True - 预测: 736455 - 真实: 736455\n",
      "   任务: 08f3a05f-5947-4089-a4c4-d4bcfaa6b7a0 - LEVEL 2 True - 预测: 2 - 真实: 2\n",
      "   任务: 54612da3-fd56-4941-80f4-5eb82330de25 - LEVEL 2 True - 预测: 60 - 真实: 60\n",
      "   任务: ded28325-3447-4c56-860f-e497d6fb3577 - LEVEL 2 False - 预测: Picnic is in Polybius Plaza - 真实: Picnic is in Ploybius Plaza.\n",
      "   任务: 6359a0b1-8f7b-499b-9336-840f9ab90688 - LEVEL 2 False - 预测: 84.0 - 真实: 39\n",
      "   任务: 7cc4acfa-63fd-4acc-a1a1-e8e529e0a97f - LEVEL 2 True - 预测: Wharvton - 真实: Wharvton\n",
      "   任务: d700d50d-c707-4dca-90dc-4528cddd0c80 - LEVEL 2 True - 预测: Roger Miller - 真实: Roger Miller\n",
      "   任务: 0a3cd321-3e76-4622-911b-0fda2e5d6b1a - LEVEL 2 False - 预测: China, Qatar, Singapore - 真实: Brunei, China, Morocco, Singapore\n",
      "   任务: f2feb6a4-363c-4c09-a804-0db564eafd68 - LEVEL 2 False - 预测: 2017 Komo Mai Drive, 900000 - 真实: 900000\n",
      "   任务: 0b260a57-3f3a-4405-9f29-6d7a1012dbfb - LEVEL 2 False - 预测: 0.577 - 真实: 0.269\n",
      "   任务: ed58682d-bc52-4baa-9eb0-4eb81e1edacc - LEVEL 2 True - 预测: stare - 真实: stare\n",
      "   任务: cca70ce6-1952-45d2-acd4-80c903b0bc49 - LEVEL 2 True - 预测: 85 - 真实: 85\n",
      "   任务: b7f857e4-d8aa-4387-af2a-0e844df5b9d8 - LEVEL 2 True - 预测: 47 - 真实: 47\n",
      "   任务: d8152ad6-e4d5-4c12-8bb7-8d57dc10c6de - LEVEL 2 True - 预测: 0.03 - 真实: 0.03\n",
      "   任务: 67e8878b-5cef-4375-804e-e6291fdbe78a - LEVEL 2 False - 预测: Hotel - 真实: Hotels\n",
      "   任务: 023e9d44-96ae-4eed-b912-244ee8c3b994 - LEVEL 2 False - 预测: 8.25 - 真实: 8\n",
      "   任务: 0e9e85b8-52b9-4de4-b402-5f635ab9631f - LEVEL 2 False - 预测: 1998 - 真实: 1927\n",
      "   任务: 20194330-9976-4043-8632-f8485c6c71b2 - LEVEL 2 False - 预测: 0 - 真实: 4\n",
      "   任务: 4d51c4bf-4b0e-4f3d-897b-3f6687a7d9f2 - LEVEL 2 False - 预测: 4 - 真实: 8\n",
      "   任务: 65638e28-7f37-4fa7-b7b9-8c19bb609879 - LEVEL 2 False - 预测: Robertson Smith - 真实: Kleinpaul\n",
      "   任务: 3ff6b7a9-a5bd-4412-ad92-0cd0d45c0fee - LEVEL 2 True - 预测: 56000 - 真实: 56000\n",
      "   任务: 708b99c5-e4a7-49cb-a5cf-933c8d46470d - LEVEL 2 False - 预测: citation count - 真实: Citations\n",
      "   任务: 0a65cb96-cb6e-4a6a-8aae-c1084f613456 - LEVEL 2 True - 预测: Holabird - 真实: Holabird\n",
      "   任务: 65da0822-a48a-4a68-bbad-8ed1b835a834 - LEVEL 2 True - 预测: Santa Clara,Boston - 真实: Santa Clara, Boston\n",
      "   任务: 0bb3b44a-ede5-4db5-a520-4e844b0079c5 - LEVEL 2 True - 预测: 536 - 真实: 536\n",
      "   任务: 73c1b9fe-ee1d-4cf4-96ca-35c08f97b054 - LEVEL 2 True - 预测: 1954 - 真实: 1954\n",
      "   任务: e2d69698-bc99-4e85-9880-67eaccd66e6c - LEVEL 2 True - 预测: Michele Fitzgerald - 真实: Michele Fitzgerald\n",
      "   任务: a56f1527-3abf-41d6-91f8-7296d6336c3f - LEVEL 2 True - 预测: 185 - 真实: 185\n",
      "   任务: 42d4198c-5895-4f0a-b0c0-424a66465d83 - LEVEL 2 False - 预测: Unable to determine - 真实: 60\n",
      "   任务: edd4d4f2-1a58-45c4-b038-67337af4e029 - LEVEL 2 True - 预测: Berkshire - 真实: Berkshire\n",
      "   任务: a26649c6-1cb2-470a-871e-6910c64c3e53 - LEVEL 2 True - 预测: 116 - 真实: 116\n",
      "   任务: 4d0aa727-86b1-406b-9b33-f870dd14a4a5 - LEVEL 2 True - 预测: 1 in 3 - 真实: 1 in 3\n",
      "   任务: d5141ca5-e7a0-469f-bf3e-e773507c86e2 - LEVEL 2 False - 预测: 29/12/2019 - 真实: 19/02/2009\n",
      "   任务: 1dcc160f-c187-48c2-b68e-319bd4354f3d - LEVEL 2 False - 预测: 0 - 真实: 3\n",
      "   任务: b2c257e0-3ad7-4f05-b8e3-d9da973be36e - LEVEL 2 True - 预测: +4.6 - 真实: +4.6\n",
      "   任务: e0c10771-d627-4fd7-9694-05348e54ee36 - LEVEL 2 True - 预测: 234.9 - 真实: 234.9\n",
      "   任务: e29834fd-413a-455c-a33e-c3915b07401c - LEVEL 2 False - 预测: 19 - 真实: 21\n",
      "   任务: 08c0b6e9-1b43-4c2e-ae55-4e3fce2c2715 - LEVEL 2 False - 预测: gold - 真实: orange, white\n",
      "   任务: db4fd70a-2d37-40ea-873f-9433dc5e301f - LEVEL 2 True - 预测: 10 - 真实: 10\n",
      "   任务: 853c8244-429e-46ca-89f2-addf40dfb2bd - LEVEL 2 False - 预测: 0 - 真实: 11\n",
      "   任务: 7a4a336d-dcfa-45a0-b014-824c7619e8de - LEVEL 2 False - 预测: 1:52.43 - 真实: 1:41.614\n"
     ]
    }
   ],
   "source": [
    "file_path = \"/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/standard/standard_level_2.jsonl\"\n",
    "evaluate_jsonl(file_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "68f3a5b2",
   "metadata": {},
   "source": [
    "## Level 3 Results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "caf8f23e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "📊 加载了 26 条数据\n",
      "\n",
      "🎯 整体准确率: 26.92% (7/26)\n",
      "💰 总Token消耗: 15,742,533\n",
      "📈 平均每任务Token: 605482.0\n",
      "\n",
      "📊 各级别准确率:\n",
      "   Level 3: 26.92% (7/26) - 平均Token: 605482.0\n",
      "\n",
      "📋 每个任务的详细信息:\n",
      "   任务: 676e5e31-a554-4acc-9286-b60d90a92d26 - LEVEL 3 False - 预测: 83 - 真实: 86\n",
      "   任务: bec74516-02fc-48dc-b202-55e78d0e17cf - LEVEL 3 False - 预测: Unable to determine - 真实: 26.4\n",
      "   任务: 00d579ea-0889-4fd9-a771-2c8d79835c8d - LEVEL 3 False - 预测: Jerome Wiesner - 真实: Claude Shannon\n",
      "   任务: 384d0dd8-e8a4-4cfe-963c-d37f256e7662 - LEVEL 3 False - 预测: Unable to determine - 真实: 4192\n",
      "   任务: de9887f5-ead8-4727-876f-5a4078f8598c - LEVEL 3 True - 预测: 22 - 真实: 22\n",
      "   任务: 983bba7c-c092-455f-b6c9-7857003d48fc - LEVEL 3 False - 预测: mammals - 真实: mice\n",
      "   任务: 56db2318-640f-477a-a82f-bc93ad13e882 - LEVEL 3 True - 预测: 7, 9 - 真实: 7, 9\n",
      "   任务: 8131e2c0-0083-4265-9ce7-78c2d568425d - LEVEL 3 False - 预测: 99.9 CFM for Cheater, 92.6 CFM for Cheater beater - 真实: 101.376, 84.348\n",
      "   任务: 72c06643-a2fa-4186-aa5c-9ec33ae9b445 - LEVEL 3 False - 预测: 195 - 真实: 55\n",
      "   任务: ebbc1f13-d24d-40df-9068-adcf735b4240 - LEVEL 3 True - 预测: the world of the twenty first century - 真实: The World of the Twenty First Century\n",
      "   任务: c526d8d6-5987-4da9-b24c-83466fa172f3 - LEVEL 3 False - 预测: 0.0429 - 真实: 0.0424\n",
      "   任务: 3da89939-209c-4086-8520-7eb734e6b4ef - LEVEL 3 False - 预测: 8,25,27,16,26,21 - 真实: 8, 29, 22, 1, 8, 26\n",
      "   任务: 8d46b8d6-b38a-47ff-ac74-cda14cf2d19b - LEVEL 3 False - 预测: 0.00031 - 真实: 0.00033\n",
      "   任务: e961a717-6b25-4175-8a68-874d28190ee4 - LEVEL 3 True - 预测: 12 - 真实: 12\n",
      "   任务: 851e570a-e3de-4d84-bcfa-cc85578baa59 - LEVEL 3 False - 预测: Unable to determine - 真实: Briniest\n",
      "   任务: 50f58759-7bd6-406f-9b0d-5692beb2a926 - LEVEL 3 False - 预测: Unable to determine - 真实: 3\n",
      "   任务: 872bfbb1-9ccf-49f6-8c5f-aa22818ccd66 - LEVEL 3 False - 预测: pears - 真实: pears, bananas\n",
      "   任务: c3a79cfe-8206-451f-aca8-3fec8ebe51d3 - LEVEL 3 True - 预测: 8 - 真实: 8\n",
      "   任务: da52d699-e8d2-4dc5-9191-a2199e0b6a9b - LEVEL 3 True - 预测: Out of the Silent Planet - 真实: Out of the Silent Planet\n",
      "   任务: ad2b4d70-9314-4fe6-bfbe-894a45f6055f - LEVEL 3 False - 预测: serpent - 真实: War is not here this is a land of peace\n",
      "   任务: 5b2a14e8-6e59-479c-80e3-4696e8980152 - LEVEL 3 False - 预测: None - 真实: bacon\n",
      "   任务: 9e1fc53b-46ff-49a1-9d05-9e6faac34cc5 - LEVEL 3 False - 预测: Death Knight,Mage,Paladin,Priest,Warlock - 真实: Death Knight, Hunter, Paladin, Priest, Warlock\n",
      "   任务: 5f982798-16b9-4051-ab57-cfc7ebdb2a91 - LEVEL 3 False - 预测: 0 - 真实: 0.2\n",
      "   任务: 0512426f-4d28-49f0-be77-06d05daec096 - LEVEL 3 False - 预测: 65000000 - 真实: 100000000\n",
      "   任务: 0bdb7c40-671d-4ad1-9ce3-986b159c0ddc - LEVEL 3 True - 预测: White;5876 - 真实: White; 5876\n",
      "   任务: 9b54f9d9-35ee-4a14-b62f-d130ea00317f - LEVEL 3 False - 预测: Unable to determine - 真实: Soups and Stews\n"
     ]
    }
   ],
   "source": [
    "file_path = \"/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/standard/standard_level_3.jsonl\"\n",
    "evaluate_jsonl(file_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "153f8b05",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "📊 加载了 165 条数据\n",
      "\n",
      "🎯 整体准确率: 52.73% (87/165)\n",
      "💰 总Token消耗: 83,163,465\n",
      "📈 平均每任务Token: 504021.0\n",
      "\n",
      "📊 各级别准确率:\n",
      "   Level 1: 64.15% (34/53) - 平均Token: 266718.8\n",
      "   Level 2: 53.49% (46/86) - 平均Token: 619591.1\n",
      "   Level 3: 26.92% (7/26) - 平均Token: 605482.0\n",
      "\n",
      "📋 每个任务的详细信息:\n",
      "   任务: e1fc63a2-da7a-432f-be78-7c4a95598703 - LEVEL 1 True - 预测: 17 - 真实: 17\n",
      "   任务: 8e867cd7-cff9-4e6c-867a-ff5ddc2550be - LEVEL 1 True - 预测: 3 - 真实: 3\n",
      "   任务: ec09fa32-d03f-4bf8-84b0-1f16922c3ae4 - LEVEL 1 True - 预测: 3 - 真实: 3\n",
      "   任务: 5d0080cb-90d7-4712-bc33-848150e917d3 - LEVEL 1 True - 预测: 0.1777 - 真实: 0.1777\n",
      "   任务: a1e91b78-d3d8-4675-bb8d-62741b4b68a6 - LEVEL 1 True - 预测: 3 - 真实: 3\n",
      "   任务: 46719c30-f4c3-4cad-be07-d5cb21eee6bb - LEVEL 1 False - 预测: A New Software Agent 'Learning' Algorithm - 真实: Mapping Human Oriented Information to Software Agents for Online Systems Usage\n",
      "   任务: 4b6bb5f7-f634-410e-815d-e673ab7f8632 - LEVEL 1 False - 预测: INT. THE CASTLE - DAY - 真实: THE CASTLE\n",
      "   任务: cffe0e32-c9a6-4c52-9877-78ceb4aaa9fb - LEVEL 1 True - 预测: Fred - 真实: Fred\n",
      "   任务: 2d83110e-a098-4ebb-9987-066c06fa42d0 - LEVEL 1 True - 预测: right - 真实: Right\n",
      "   任务: 5cfb274c-0207-4aa7-9575-6ac0bd95d9b2 - LEVEL 1 True - 预测: No - 真实: No\n",
      "   任务: 27d5d136-8563-469e-92bf-fd103c28b57c - LEVEL 1 True - 预测: (¬A → B) ↔ (A ∨ ¬B) - 真实: (¬A → B) ↔ (A ∨ ¬B)\n",
      "   任务: dc28cf18-6431-458b-83ef-64b3ce566c10 - LEVEL 1 True - 预测: 2 - 真实: 2\n",
      "   任务: b816bfce-3d80-4913-a07d-69b752ce6377 - LEVEL 1 True - 预测: fluffy - 真实: fluffy\n",
      "   任务: 72e110e7-464c-453c-a309-90a95aed6538 - LEVEL 1 False - 预测: Kazakhstan - 真实: Guatemala\n",
      "   任务: 42576abe-0deb-4869-8c63-225c2d75a95a - LEVEL 1 True - 预测: Maktay Mato Apple - 真实: Maktay mato apple\n",
      "   任务: b415aba4-4b68-4fc6-9b89-2c812e55a3e1 - LEVEL 1 True - 预测: diamond - 真实: diamond\n",
      "   任务: cca530fc-4052-43b2-b130-b30968d8aa44 - LEVEL 1 False - 预测: Qd1+ - 真实: Rd5\n",
      "   任务: 935e2cff-ae78-4218-b3f5-115589b19dae - LEVEL 1 False - 预测: Unable to determine - 真实: research\n",
      "   任务: 4fc2f1ae-8625-45b5-ab34-ad4433bc21f8 - LEVEL 1 True - 预测: FunkMonk - 真实: FunkMonk\n",
      "   任务: 5188369a-3bbe-43d8-8b94-11558f909a08 - LEVEL 1 True - 预测: Annie Levin - 真实: Annie Levin\n",
      "   任务: 6f37996b-2ac7-44b0-8e68-6d28256631b4 - LEVEL 1 True - 预测: b,e - 真实: b, e\n",
      "   任务: 9318445f-fe6a-4e1b-acbf-c68228c9906a - LEVEL 1 False - 预测: 3/4,1/4,3/4,5/35,7/21,30/5,6/8,3/4,4/60,1/15,30/90,1/3,8/18,4/9,9/72,1/8,64/46,32/23,206/340,103/170 - 真实: 3/4,1/4,3/4,3/4,2/4,1/2,5/35,7/21,30/5,30/5,3/4,1/15,1/3,4/9,1/8,32/23,103/170\n",
      "   任务: 389793a7-ca17-4e82-81cb-2b3a2391b4b9 - LEVEL 1 True - 预测: 3 - 真实: 3\n",
      "   任务: 4b650a35-8529-4695-89ed-8dc7a500a498 - LEVEL 1 True - 预测: Guava - 真实: Guava\n",
      "   任务: a3fbeb63-0e8c-4a11-bff6-0e3b484c3e9c - LEVEL 1 True - 预测: 4 - 真实: 4\n",
      "   任务: c714ab3a-da30-4603-bacd-d008800188b9 - LEVEL 1 True - 预测: 100 - 真实: 100\n",
      "   任务: 9d191bce-651d-4746-be2d-7ef8ecadb9c2 - LEVEL 1 True - 预测: Extremely - 真实: Extremely\n",
      "   任务: 65afbc8a-89ca-4ad5-8d62-355bb401f61d - LEVEL 1 False - 预测: fff000 - 真实: F478A7\n",
      "   任务: cabe07ed-9eca-40ea-8ead-410ef5e83f91 - LEVEL 1 False - 预测: None - 真实: Louvrier\n",
      "   任务: 3cef3a44-215e-4aed-8e3b-b1e3f08063b7 - LEVEL 1 False - 预测: celery, lettuce, sweet potatoes - 真实: broccoli, celery, fresh basil, lettuce, sweet potatoes\n",
      "   任务: 99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3 - LEVEL 1 True - 预测: cornstarch, freshly squeezed lemon juice, granulated sugar, pure vanilla extract, ripe strawberries - 真实: cornstarch, freshly squeezed lemon juice, granulated sugar, pure vanilla extract, ripe strawberries\n",
      "   任务: d0633230-7067-47a9-9dbf-ee11e0a2cdd6 - LEVEL 1 False - 预测: transform - 真实: BaseLabelPropagation\n",
      "   任务: 305ac316-eef6-4446-960a-92d80d542f82 - LEVEL 1 True - 预测: Wojciech - 真实: Wojciech\n",
      "   任务: 0383a3ee-47a7-41a4-b493-519bdefe0488 - LEVEL 1 True - 预测: Rockhopper Penguin - 真实: Rockhopper penguin\n",
      "   任务: f918266a-b3e0-4914-865d-4faa564f1aef - LEVEL 1 True - 预测: 0 - 真实: 0\n",
      "   任务: 11af4e1a-5f45-467d-9aeb-46f4bb0bf034 - LEVEL 1 True - 预测: 6 - 真实: 6\n",
      "   任务: e142056d-56ab-4352-b091-b56054bd1359 - LEVEL 1 False - 预测: 20000 - 真实: 16000\n",
      "   任务: 50ad0280-0819-4bd9-b275-5de32d3b5bcb - LEVEL 1 True - 预测: THE SEAGULL GLIDED PEACEFULLY TO MY CHAIR - 真实: The seagull glided peacefully to my chair.\n",
      "   任务: 7673d772-ef80-4f0f-a602-1bf4485c9b43 - LEVEL 1 True - 预测: inference - 真实: inference\n",
      "   任务: c365c1c7-a3db-4d5e-a9a1-66f56eae7865 - LEVEL 1 False - 预测: Honolulu, Plymouth - 真实: Braintree, Honolulu\n",
      "   任务: 7d4a7d1d-cac6-44a8-96e8-ea9584a70825 - LEVEL 1 False - 预测: 27 - 真实: 22\n",
      "   任务: dc22a632-937f-4e6a-b72f-ba0ff3f5ff97 - LEVEL 1 True - 预测: Five Hundred Things to Eat Before It's Too Late and the Very Best Places to Eat Them - 真实: Five Hundred Things To Eat Before It's Too Late: and the Very Best Places to Eat Them\n",
      "   任务: 3f57289b-8c60-48be-bd80-01f8099ca449 - LEVEL 1 True - 预测: 519 - 真实: 519\n",
      "   任务: 23dd907f-1261-4488-b21c-e9185af91d5e - LEVEL 1 False - 预测: 1 - 真实: 2\n",
      "   任务: 1f975693-876d-457b-a649-393859e79bf3 - LEVEL 1 False - 预测: 132,133,134,197,245,1197 - 真实: 132, 133, 134, 197, 245\n",
      "   任务: 840bfca7-4f7b-481a-8794-c560c340185d - LEVEL 1 True - 预测: 80GSFC21M0002 - 真实: 80GSFC21M0002\n",
      "   任务: a0068077-79f4-461a-adfe-75c1a4148545 - LEVEL 1 False - 预测: Unable to determine - 真实: 90\n",
      "   任务: bda648d7-d618-4883-88f4-3466eabd860e - LEVEL 1 False - 预测: St. Petersburg - 真实: Saint Petersburg\n",
      "   任务: 50ec8903-b81f-4257-9450-1085afd2c319 - LEVEL 1 True - 预测: green,white - 真实: green, white\n",
      "   任务: cf106601-ab4f-4af9-b045-5295fe67b37d - LEVEL 1 True - 预测: CUB - 真实: CUB\n",
      "   任务: a0c07678-e491-4bbc-8f0b-07405144218f - LEVEL 1 False - 预测: None, Uwasawa - 真实: Yoshida, Uehara\n",
      "   任务: 7bd855d8-463d-4ed5-93ca-5fe35145f733 - LEVEL 1 False - 预测: 320123.94 - 真实: 89706.00\n",
      "   任务: 5a0c1adf-205e-4841-a666-7c3ef95def9d - LEVEL 1 True - 预测: Claus - 真实: Claus\n",
      "   任务: c61d22de-5f6c-4958-a7f6-5e9707bd3466 - LEVEL 2 True - 预测: Egalitarian - 真实: egalitarian\n",
      "   任务: 17b5a6a3-bc87-42e8-b0fb-6ab0781ef2cc - LEVEL 2 True - 预测: 34689 - 真实: 34689\n",
      "   任务: 04a04a9b-226c-43fd-b319-d5e89743676f - LEVEL 2 False - 预测: 42 - 真实: 41\n",
      "   任务: 14569e28-c88c-43e4-8c32-097d35b9a67d - LEVEL 2 False - 预测: i - 真实: backtick\n",
      "   任务: 32102e3e-d12a-4209-9163-7b3a104efe5d - LEVEL 2 True - 预测: Time-Parking 2: Parallel Universe - 真实: Time-Parking 2: Parallel Universe\n",
      "   任务: 3627a8be-a77f-41bb-b807-7e1bd4c0ebdf - LEVEL 2 True - 预测: 142 - 真实: 142\n",
      "   任务: 7619a514-5fa8-43ef-9143-83b66a43d7a4 - LEVEL 2 False - 预测: 07/03/24 - 真实: 04/15/18\n",
      "   任务: 7dd30055-0198-452e-8c25-f73dbe27dcb8 - LEVEL 2 False - 预测: 146 pm - 真实: 1.456\n",
      "   任务: 2a649bb1-795f-4a01-b3be-9a01868dae73 - LEVEL 2 False - 预测: 3.1.3.1 - 真实: 3.1.3.1; 1.11.1.7\n",
      "   任务: 87c610df-bef7-4932-b950-1d83ef4e282b - LEVEL 2 True - 预测: Morarji Desai - 真实: Morarji Desai\n",
      "   任务: 624cbf11-6a41-4692-af9c-36b3e5ca3130 - LEVEL 2 False - 预测: Economic Crunch was laid to rest - 真实: So we had to let it die.\n",
      "   任务: dd3c7503-f62a-4bd0-9f67-1b63b94194cc - LEVEL 2 True - 预测: 6 - 真实: 6\n",
      "   任务: df6561b2-7ee5-4540-baab-5095f742716a - LEVEL 2 False - 预测: 17.339 - 真实: 17.056\n",
      "   任务: f0f46385-fc03-4599-b5d3-f56496c3e69f - LEVEL 2 True - 预测: Indonesia,Myanmar - 真实: Indonesia, Myanmar\n",
      "   任务: e4e91f1c-1dcd-439e-9fdd-cb976f5293fd - LEVEL 2 True - 预测: cloak - 真实: cloak\n",
      "   任务: 56137764-b4e0-45b8-9c52-1866420c3df5 - LEVEL 2 False - 预测: Unable to determine - 真实: Li Peng\n",
      "   任务: 8b3379c0-0981-4f5b-8407-6444610cb212 - LEVEL 2 False - 预测: 13 - 真实: 1.8\n",
      "   任务: 0ff53813-3367-4f43-bcbd-3fd725c1bf4b - LEVEL 2 True - 预测: beta geometric - 真实: beta geometric\n",
      "   任务: a7feb290-76bb-4cb7-8800-7edaf7954f2f - LEVEL 2 False - 预测: 60 - 真实: 31\n",
      "   任务: b4cc024b-3f5e-480e-b96a-6656493255b5 - LEVEL 2 False - 预测: Russo-German Legion - 真实: Russian-German Legion\n",
      "   任务: 33d8ea3b-6c6b-4ff1-803d-7e270dea8a57 - LEVEL 2 True - 预测: 2 - 真实: 2\n",
      "   任务: e8cb5b03-41e0-4086-99e5-f6806cd97211 - LEVEL 2 True - 预测: shrimp - 真实: shrimp\n",
      "   任务: f46b4380-207e-4434-820b-f32ce04ae2a4 - LEVEL 2 True - 预测: Harbinger, Tidal - 真实: Harbinger, Tidal\n",
      "   任务: 05407167-39ec-4d3a-a234-73a9120c325d - LEVEL 2 False - 预测: Unable to determine - 真实: Format Document\n",
      "   任务: b9763138-c053-4832-9f55-86200cb1f99c - LEVEL 2 True - 预测: 3 - 真实: 3\n",
      "   任务: 16d825ff-1623-4176-a5b5-42e0f5c2b0ac - LEVEL 2 False - 预测: 12:52 PM - 真实: 6:41 PM\n",
      "   任务: 2b3ef98c-cc05-450b-a719-711aee40ac65 - LEVEL 2 False - 预测: givemethatmanthatisnotpassionsslaveandiwillwearhiminmyheartscoreayinmyheartofheart - 真实: To be or not to be that is the question whether tis nobler in the mind to suffer the slings and arrows of outrageous fortune\n",
      "   任务: bfcd99e1-0690-4b53-a85c-0174a8629083 - LEVEL 2 True - 预测: 17 - 真实: 17\n",
      "   任务: 544b7f0c-173a-4377-8d56-57b36eb26ddf - LEVEL 2 True - 预测: A Nightmare on Elm Street - 真实: A Nightmare on Elm Street\n",
      "   任务: 6b078778-0b90-464d-83f6-59511c811b01 - LEVEL 2 True - 预测: Alfonso Visconti - 真实: Alfonso Visconti\n",
      "   任务: 076c8171-9b3b-49b9-a477-244d2a532826 - LEVEL 2 True - 预测: Finance - 真实: Finance\n",
      "   任务: 08cae58d-4084-4616-b6dd-dd6534e4825b - LEVEL 2 False - 预测: Unable to determine\n",
      "\n",
      "Explanation: Google Finance displays only split-adjusted stock prices for AAPL and does not offer unadjusted (raw, non-split-adjusted) price history in its public interface as of 2024. Therefore, according to Google Finance, the answer for when Apple stock first exceeded $50 \"without adjusting for stock split\" cannot be determined using Google Finance data, because that data is not presented or available in this required form. - 真实: 2018\n",
      "   任务: 2dfc4c37-fec1-4518-84a7-10095d30ad75 - LEVEL 2 False - 预测: 3 - 真实: 6\n",
      "   任务: 9f41b083-683e-4dcf-9185-ccfeaa88fa45 - LEVEL 2 True - 预测: 0 - 真实: 0\n",
      "   任务: ecbc4f94-95a3-4cc7-b255-6741a458a625 - LEVEL 2 False - 预测: 16 - 真实: 13\n",
      "   任务: e9a2c537-8232-4c3f-85b0-b52de6bcba99 - LEVEL 2 False - 预测: 2 - 真实: 7\n",
      "   任务: 71345b0a-9c7d-4b50-b2bf-937ec5879845 - LEVEL 2 False - 预测: A dragon is a creature that does not exist - 真实: Here be dragons\n",
      "   任务: 7b5377b0-3f38-4103-8ad2-90fe89864c04 - LEVEL 2 True - 预测: 563.9 - 真实: 563.9\n",
      "   任务: 114d5fd0-e2ae-4b6d-a65a-870da2d19c08 - LEVEL 2 True - 预测: 4 - 真实: 4\n",
      "   任务: 8f80e01c-1296-4371-9486-bb3d68651a60 - LEVEL 2 False - 预测: 7 - 真实: 90\n",
      "   任务: ad37a656-079a-49f9-a493-7b739c9167d1 - LEVEL 2 True - 预测: Bravo - 真实: Bravo\n",
      "   任务: 366e2f2b-8632-4ef2-81eb-bc3877489217 - LEVEL 2 True - 预测: Shelley's Place - 真实: Shelley's place\n",
      "   任务: f3917a3d-1d17-4ee2-90c5-683b072218fe - LEVEL 2 False - 预测: 2736 - 真实: 2732\n",
      "   任务: 48eb8242-1099-4c26-95d4-ef22b002457a - LEVEL 2 True - 预测: 6 - 真实: 6\n",
      "   任务: c8b7e059-c60d-472e-ad64-3b04ae1166dc - LEVEL 2 True - 预测: 8 - 真实: 8\n",
      "   任务: d1af70ea-a9a4-421a-b9cc-94b5e02f1788 - LEVEL 2 True - 预测: 736455 - 真实: 736455\n",
      "   任务: 08f3a05f-5947-4089-a4c4-d4bcfaa6b7a0 - LEVEL 2 True - 预测: 2 - 真实: 2\n",
      "   任务: 54612da3-fd56-4941-80f4-5eb82330de25 - LEVEL 2 True - 预测: 60 - 真实: 60\n",
      "   任务: ded28325-3447-4c56-860f-e497d6fb3577 - LEVEL 2 False - 预测: Picnic is in Polybius Plaza - 真实: Picnic is in Ploybius Plaza.\n",
      "   任务: 6359a0b1-8f7b-499b-9336-840f9ab90688 - LEVEL 2 False - 预测: 84.0 - 真实: 39\n",
      "   任务: 7cc4acfa-63fd-4acc-a1a1-e8e529e0a97f - LEVEL 2 True - 预测: Wharvton - 真实: Wharvton\n",
      "   任务: d700d50d-c707-4dca-90dc-4528cddd0c80 - LEVEL 2 True - 预测: Roger Miller - 真实: Roger Miller\n",
      "   任务: 0a3cd321-3e76-4622-911b-0fda2e5d6b1a - LEVEL 2 False - 预测: China, Qatar, Singapore - 真实: Brunei, China, Morocco, Singapore\n",
      "   任务: f2feb6a4-363c-4c09-a804-0db564eafd68 - LEVEL 2 False - 预测: 2017 Komo Mai Drive, 900000 - 真实: 900000\n",
      "   任务: 0b260a57-3f3a-4405-9f29-6d7a1012dbfb - LEVEL 2 False - 预测: 0.577 - 真实: 0.269\n",
      "   任务: ed58682d-bc52-4baa-9eb0-4eb81e1edacc - LEVEL 2 True - 预测: stare - 真实: stare\n",
      "   任务: cca70ce6-1952-45d2-acd4-80c903b0bc49 - LEVEL 2 True - 预测: 85 - 真实: 85\n",
      "   任务: b7f857e4-d8aa-4387-af2a-0e844df5b9d8 - LEVEL 2 True - 预测: 47 - 真实: 47\n",
      "   任务: d8152ad6-e4d5-4c12-8bb7-8d57dc10c6de - LEVEL 2 True - 预测: 0.03 - 真实: 0.03\n",
      "   任务: 67e8878b-5cef-4375-804e-e6291fdbe78a - LEVEL 2 False - 预测: Hotel - 真实: Hotels\n",
      "   任务: 023e9d44-96ae-4eed-b912-244ee8c3b994 - LEVEL 2 False - 预测: 8.25 - 真实: 8\n",
      "   任务: 0e9e85b8-52b9-4de4-b402-5f635ab9631f - LEVEL 2 False - 预测: 1998 - 真实: 1927\n",
      "   任务: 20194330-9976-4043-8632-f8485c6c71b2 - LEVEL 2 False - 预测: 0 - 真实: 4\n",
      "   任务: 4d51c4bf-4b0e-4f3d-897b-3f6687a7d9f2 - LEVEL 2 False - 预测: 4 - 真实: 8\n",
      "   任务: 65638e28-7f37-4fa7-b7b9-8c19bb609879 - LEVEL 2 False - 预测: Robertson Smith - 真实: Kleinpaul\n",
      "   任务: 3ff6b7a9-a5bd-4412-ad92-0cd0d45c0fee - LEVEL 2 True - 预测: 56000 - 真实: 56000\n",
      "   任务: 708b99c5-e4a7-49cb-a5cf-933c8d46470d - LEVEL 2 False - 预测: citation count - 真实: Citations\n",
      "   任务: 0a65cb96-cb6e-4a6a-8aae-c1084f613456 - LEVEL 2 True - 预测: Holabird - 真实: Holabird\n",
      "   任务: 65da0822-a48a-4a68-bbad-8ed1b835a834 - LEVEL 2 True - 预测: Santa Clara,Boston - 真实: Santa Clara, Boston\n",
      "   任务: 0bb3b44a-ede5-4db5-a520-4e844b0079c5 - LEVEL 2 True - 预测: 536 - 真实: 536\n",
      "   任务: 73c1b9fe-ee1d-4cf4-96ca-35c08f97b054 - LEVEL 2 True - 预测: 1954 - 真实: 1954\n",
      "   任务: e2d69698-bc99-4e85-9880-67eaccd66e6c - LEVEL 2 True - 预测: Michele Fitzgerald - 真实: Michele Fitzgerald\n",
      "   任务: a56f1527-3abf-41d6-91f8-7296d6336c3f - LEVEL 2 True - 预测: 185 - 真实: 185\n",
      "   任务: 42d4198c-5895-4f0a-b0c0-424a66465d83 - LEVEL 2 False - 预测: Unable to determine - 真实: 60\n",
      "   任务: edd4d4f2-1a58-45c4-b038-67337af4e029 - LEVEL 2 True - 预测: Berkshire - 真实: Berkshire\n",
      "   任务: a26649c6-1cb2-470a-871e-6910c64c3e53 - LEVEL 2 True - 预测: 116 - 真实: 116\n",
      "   任务: 4d0aa727-86b1-406b-9b33-f870dd14a4a5 - LEVEL 2 True - 预测: 1 in 3 - 真实: 1 in 3\n",
      "   任务: d5141ca5-e7a0-469f-bf3e-e773507c86e2 - LEVEL 2 False - 预测: 29/12/2019 - 真实: 19/02/2009\n",
      "   任务: 1dcc160f-c187-48c2-b68e-319bd4354f3d - LEVEL 2 False - 预测: 0 - 真实: 3\n",
      "   任务: b2c257e0-3ad7-4f05-b8e3-d9da973be36e - LEVEL 2 True - 预测: +4.6 - 真实: +4.6\n",
      "   任务: e0c10771-d627-4fd7-9694-05348e54ee36 - LEVEL 2 True - 预测: 234.9 - 真实: 234.9\n",
      "   任务: e29834fd-413a-455c-a33e-c3915b07401c - LEVEL 2 False - 预测: 19 - 真实: 21\n",
      "   任务: 08c0b6e9-1b43-4c2e-ae55-4e3fce2c2715 - LEVEL 2 False - 预测: gold - 真实: orange, white\n",
      "   任务: db4fd70a-2d37-40ea-873f-9433dc5e301f - LEVEL 2 True - 预测: 10 - 真实: 10\n",
      "   任务: 853c8244-429e-46ca-89f2-addf40dfb2bd - LEVEL 2 False - 预测: 0 - 真实: 11\n",
      "   任务: 7a4a336d-dcfa-45a0-b014-824c7619e8de - LEVEL 2 False - 预测: 1:52.43 - 真实: 1:41.614\n",
      "   任务: 676e5e31-a554-4acc-9286-b60d90a92d26 - LEVEL 3 False - 预测: 83 - 真实: 86\n",
      "   任务: bec74516-02fc-48dc-b202-55e78d0e17cf - LEVEL 3 False - 预测: Unable to determine - 真实: 26.4\n",
      "   任务: 00d579ea-0889-4fd9-a771-2c8d79835c8d - LEVEL 3 False - 预测: Jerome Wiesner - 真实: Claude Shannon\n",
      "   任务: 384d0dd8-e8a4-4cfe-963c-d37f256e7662 - LEVEL 3 False - 预测: Unable to determine - 真实: 4192\n",
      "   任务: de9887f5-ead8-4727-876f-5a4078f8598c - LEVEL 3 True - 预测: 22 - 真实: 22\n",
      "   任务: 983bba7c-c092-455f-b6c9-7857003d48fc - LEVEL 3 False - 预测: mammals - 真实: mice\n",
      "   任务: 56db2318-640f-477a-a82f-bc93ad13e882 - LEVEL 3 True - 预测: 7, 9 - 真实: 7, 9\n",
      "   任务: 8131e2c0-0083-4265-9ce7-78c2d568425d - LEVEL 3 False - 预测: 99.9 CFM for Cheater, 92.6 CFM for Cheater beater - 真实: 101.376, 84.348\n",
      "   任务: 72c06643-a2fa-4186-aa5c-9ec33ae9b445 - LEVEL 3 False - 预测: 195 - 真实: 55\n",
      "   任务: ebbc1f13-d24d-40df-9068-adcf735b4240 - LEVEL 3 True - 预测: the world of the twenty first century - 真实: The World of the Twenty First Century\n",
      "   任务: c526d8d6-5987-4da9-b24c-83466fa172f3 - LEVEL 3 False - 预测: 0.0429 - 真实: 0.0424\n",
      "   任务: 3da89939-209c-4086-8520-7eb734e6b4ef - LEVEL 3 False - 预测: 8,25,27,16,26,21 - 真实: 8, 29, 22, 1, 8, 26\n",
      "   任务: 8d46b8d6-b38a-47ff-ac74-cda14cf2d19b - LEVEL 3 False - 预测: 0.00031 - 真实: 0.00033\n",
      "   任务: e961a717-6b25-4175-8a68-874d28190ee4 - LEVEL 3 True - 预测: 12 - 真实: 12\n",
      "   任务: 851e570a-e3de-4d84-bcfa-cc85578baa59 - LEVEL 3 False - 预测: Unable to determine - 真实: Briniest\n",
      "   任务: 50f58759-7bd6-406f-9b0d-5692beb2a926 - LEVEL 3 False - 预测: Unable to determine - 真实: 3\n",
      "   任务: 872bfbb1-9ccf-49f6-8c5f-aa22818ccd66 - LEVEL 3 False - 预测: pears - 真实: pears, bananas\n",
      "   任务: c3a79cfe-8206-451f-aca8-3fec8ebe51d3 - LEVEL 3 True - 预测: 8 - 真实: 8\n",
      "   任务: da52d699-e8d2-4dc5-9191-a2199e0b6a9b - LEVEL 3 True - 预测: Out of the Silent Planet - 真实: Out of the Silent Planet\n",
      "   任务: ad2b4d70-9314-4fe6-bfbe-894a45f6055f - LEVEL 3 False - 预测: serpent - 真实: War is not here this is a land of peace\n",
      "   任务: 5b2a14e8-6e59-479c-80e3-4696e8980152 - LEVEL 3 False - 预测: None - 真实: bacon\n",
      "   任务: 9e1fc53b-46ff-49a1-9d05-9e6faac34cc5 - LEVEL 3 False - 预测: Death Knight,Mage,Paladin,Priest,Warlock - 真实: Death Knight, Hunter, Paladin, Priest, Warlock\n",
      "   任务: 5f982798-16b9-4051-ab57-cfc7ebdb2a91 - LEVEL 3 False - 预测: 0 - 真实: 0.2\n",
      "   任务: 0512426f-4d28-49f0-be77-06d05daec096 - LEVEL 3 False - 预测: 65000000 - 真实: 100000000\n",
      "   任务: 0bdb7c40-671d-4ad1-9ce3-986b159c0ddc - LEVEL 3 True - 预测: White;5876 - 真实: White; 5876\n",
      "   任务: 9b54f9d9-35ee-4a14-b62f-d130ea00317f - LEVEL 3 False - 预测: Unable to determine - 真实: Soups and Stews\n"
     ]
    }
   ],
   "source": [
    "file_path = \"/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/standard/standard_total_level.jsonl\"\n",
    "evaluate_jsonl(file_path)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "73a3ccaf",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\n"
     ]
    }
   ],
   "source": [
    "a = ''\n",
    "if a:\n",
    "    print(\"1\")\n",
    "else:\n",
    "    print(\"0\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7c55b8a3",
   "metadata": {},
   "source": [
    "# General"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fc470cac",
   "metadata": {},
   "source": [
    "## 重复任务检测"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 198,
   "id": "76f5823d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import re\n",
    "import unicodedata\n",
    "import os\n",
    "import sys\n",
    "from collections import defaultdict\n",
    "from rich.console import Console\n",
    "from rich.progress import track\n",
    "from rich.table import Table\n",
    "\n",
    "# --- 辅助函数 (来自之前的脚本，保持不变) ---\n",
    "def normalize_text(s: str) -> str:\n",
    "    if not isinstance(s, str): s = str(s)\n",
    "    s = s.lower()\n",
    "    s = unicodedata.normalize(\"NFKC\", s)\n",
    "    s = re.sub(r'[^a-z0-9]+', ' ', s)\n",
    "    s = ' '.join(s.split())\n",
    "    return s\n",
    "\n",
    "def judge_correct(pred: str, true: str) -> bool:\n",
    "    return normalize_text(pred) == normalize_text(true)\n",
    "\n",
    "def parse_json_stream(file_path: str, console: Console) -> list:\n",
    "    \"\"\"使用经过验证的括号匹配法，解析非标准的、拼接的JSON文件。\"\"\"\n",
    "    if not os.path.exists(file_path):\n",
    "        console.print(f\"❌ [bold red]错误: 文件未找到 -> {file_path}[/bold red]\")\n",
    "        return None\n",
    "    try:\n",
    "        with open(file_path, 'r', encoding='utf-8') as infile:\n",
    "            content = infile.read()\n",
    "    except Exception as e:\n",
    "        console.print(f\"❌ [bold red]错误: 读取文件 {file_path} 时失败: {e}[/bold red]\")\n",
    "        return None\n",
    "    \n",
    "    # ... [解析逻辑和之前完全一样] ...\n",
    "    object_strings, brace_level, start_index, in_string = [], 0, -1, False\n",
    "    for i, char in enumerate(content):\n",
    "        if char == '\"' and (i == 0 or content[i-1] != '\\\\'): in_string = not in_string\n",
    "        if not in_string:\n",
    "            if char == '{':\n",
    "                if brace_level == 0: start_index = i\n",
    "                brace_level += 1\n",
    "            elif char == '}':\n",
    "                if brace_level > 0: brace_level -= 1\n",
    "                if brace_level == 0 and start_index != -1:\n",
    "                    object_strings.append(content[start_index : i + 1])\n",
    "                    start_index = -1\n",
    "    records = []\n",
    "    for obj_str in object_strings:\n",
    "        try: records.append(json.loads(obj_str))\n",
    "        except json.JSONDecodeError: console.print(f\"⚠️ [yellow]警告: 在 {os.path.basename(file_path)} 中跳过一个无法解析的JSON对象块。[/yellow]\")\n",
    "    return records\n",
    "\n",
    "# --- 主逻辑：去重与清洗 ---\n",
    "def deduplicate_and_clean_file(file_path: str):\n",
    "    \"\"\"\n",
    "    对文件进行智能去重，并生成一个干净的新文件。\n",
    "    \"\"\"\n",
    "    console = Console()\n",
    "    console.print(f\"🚀 [bold cyan]启动智能去重与清洗任务:[/bold cyan] [underline]{file_path}[/underline]\")\n",
    "\n",
    "    # 步骤 1: 解析文件\n",
    "    records = parse_json_stream(file_path, console)\n",
    "    if not records:\n",
    "        console.print(\"📄 文件中未找到任何记录，任务结束。\")\n",
    "        return\n",
    "\n",
    "    # 步骤 2: 按 task_id 对所有记录进行分组\n",
    "    console.print(f\"🔄 正在按 task_id 对 {len(records):,} 条记录进行分组...\")\n",
    "    grouped_tasks = defaultdict(list)\n",
    "    for record in records:\n",
    "        task_id = record.get(\"task_id\")\n",
    "        if task_id:\n",
    "            grouped_tasks[task_id].append(record)\n",
    "    \n",
    "    # 步骤 3: 智能筛选，生成最终的记录列表\n",
    "    final_records = []\n",
    "    stats = {\"unique\": 0, \"kept_correct\": 0, \"kept_by_tokens\": 0}\n",
    "\n",
    "    for task_id, group in track(grouped_tasks.items(), description=\"清洗中...\"):\n",
    "        if len(group) == 1:\n",
    "            # 没有重复，直接保留\n",
    "            final_records.append(group[0])\n",
    "            stats[\"unique\"] += 1\n",
    "            continue\n",
    "\n",
    "        # 处理重复项\n",
    "        # 规则1: 查找回答正确的记录\n",
    "        correct_records = [rec for rec in group if judge_correct(rec.get(\"prediction\", \"\"), rec.get(\"true_answer\", \"\"))]\n",
    "\n",
    "        if correct_records:\n",
    "            # 如果有正确的，保留第一个正确的记录\n",
    "            final_records.append(correct_records[0])\n",
    "            stats[\"kept_correct\"] += 1\n",
    "        else:\n",
    "            # 规则2: 如果没有正确的，保留 token 消耗最高的记录\n",
    "            # 使用 .get() 避免因缺少键而导致的错误\n",
    "            best_record_by_tokens = max(group, key=lambda r: r.get(\"token_counts\", {}).get(\"total_token_count\", 0))\n",
    "            final_records.append(best_record_by_tokens)\n",
    "            stats[\"kept_by_tokens\"] += 1\n",
    "\n",
    "    # 步骤 4: 将清洗后的数据以原始格式写入新文件\n",
    "    original_name = os.path.splitext(os.path.basename(file_path))[0]\n",
    "    output_filename = f\"{original_name}_cleaned.jsonl\"\n",
    "    \n",
    "    console.print(f\"✍️ 正在以原始美化格式写入新文件: [underline]{output_filename}[/underline]...\")\n",
    "    try:\n",
    "        with open(output_filename, 'w', encoding='utf-8') as outfile:\n",
    "            num_records = len(final_records)\n",
    "            for i, record in enumerate(final_records):\n",
    "                pretty_json_str = json.dumps(record, ensure_ascii=False, indent=2)\n",
    "                outfile.write(pretty_json_str)\n",
    "                if i < num_records - 1:\n",
    "                    outfile.write('\\n')\n",
    "    except Exception as e:\n",
    "        console.print(f\"❌ [bold red]写入文件时发生错误: {e}[/bold red]\")\n",
    "        return\n",
    "\n",
    "    # 步骤 5: 打印总结报告\n",
    "    console.print(\"\\n🎉 [bold green]清洗任务完成！[/bold green]\")\n",
    "    \n",
    "    total_removed = len(records) - len(final_records)\n",
    "    \n",
    "    summary_table = Table(title=\"[bold]📊 清洗结果统计[/bold]\")\n",
    "    summary_table.add_column(\"项目\", style=\"dim\", width=35)\n",
    "    summary_table.add_column(\"数量\", justify=\"right\")\n",
    "\n",
    "    summary_table.add_row(\"原始总记录数\", f\"{len(records):,}\")\n",
    "    summary_table.add_row(\"唯一的任务ID数\", f\"{len(grouped_tasks):,}\")\n",
    "    summary_table.add_row(\"----------------------------------\", \"----------\")\n",
    "    summary_table.add_row(\"保留的无重复记录\", f\"{stats['unique']:,}\")\n",
    "    summary_table.add_row(\"[green]因回答正确而保留的重复记录[/green]\", f\"{stats['kept_correct']:,}\")\n",
    "    summary_table.add_row(\"[cyan]因Token消耗最高而保留的重复记录[/cyan]\", f\"{stats['kept_by_tokens']:,}\")\n",
    "    summary_table.add_row(\"----------------------------------\", \"----------\")\n",
    "    summary_table.add_row(\"清洗后总记录数\", f\"[bold]{len(final_records):,}[/bold]\")\n",
    "    summary_table.add_row(\"被移除的重复记录总数\", f\"[red]{total_removed:,}[/red]\")\n",
    "    \n",
    "    console.print(summary_table)\n",
    "    console.print(f\"✅ 新文件已保存在: [underline]{os.path.abspath(output_filename)}[/underline]\")\n",
    "\n",
    "# # --- 程序主入口 ---\n",
    "# if __name__ == \"__main__\":\n",
    "#     if len(sys.argv) != 2:\n",
    "#         print(\"错误：需要提供一个文件路径作为参数。\")\n",
    "#         print(\"用法: python clean_duplicates.py <文件路径>\")\n",
    "#         print(\"例如: python clean_duplicates.py /path/to/your/file.jsonl\")\n",
    "#     else:\n",
    "#         file_to_clean = sys.argv[1]\n",
    "#         deduplicate_and_clean_file(file_to_clean)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 199,
   "id": "d057c853",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">🚀 <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">启动智能去重与清洗任务:</span> \n",
       "<span style=\"color: #800080; text-decoration-color: #800080; text-decoration: underline\">/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/validation/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff; text-decoration: underline\">0916_single_age</span>\n",
       "<span style=\"color: #ff00ff; text-decoration-color: #ff00ff; text-decoration: underline\">nt.jsonl</span>\n",
       "</pre>\n"
      ],
      "text/plain": [
       "🚀 \u001b[1;36m启动智能去重与清洗任务:\u001b[0m \n",
       "\u001b[4;35m/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/validation/\u001b[0m\u001b[4;95m0916_single_age\u001b[0m\n",
       "\u001b[4;95mnt.jsonl\u001b[0m\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">🔄 正在按 task_id 对 <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">253</span> 条记录进行分组<span style=\"color: #808000; text-decoration-color: #808000\">...</span>\n",
       "</pre>\n"
      ],
      "text/plain": [
       "🔄 正在按 task_id 对 \u001b[1;36m253\u001b[0m 条记录进行分组\u001b[33m...\u001b[0m\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "1e8892398878456cb046c54823633ffe",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Output()"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
      ],
      "text/plain": []
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">✍️ 正在以原始美化格式写入新文件: <span style=\"text-decoration: underline\">0916_single_agent_cleaned.jsonl</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span>\n",
       "</pre>\n"
      ],
      "text/plain": [
       "✍️ 正在以原始美化格式写入新文件: \u001b[4m0916_single_agent_cleaned.jsonl\u001b[0m\u001b[33m...\u001b[0m\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">\n",
       "🎉 <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">清洗任务完成！</span>\n",
       "</pre>\n"
      ],
      "text/plain": [
       "\n",
       "🎉 \u001b[1;32m清洗任务完成！\u001b[0m\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-style: italic\">                  </span><span style=\"font-weight: bold; font-style: italic\">📊 清洗结果统计</span><span style=\"font-style: italic\">                   </span>\n",
       "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┓\n",
       "┃<span style=\"font-weight: bold\"> 项目                                </span>┃<span style=\"font-weight: bold\">       数量 </span>┃\n",
       "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━┩\n",
       "│<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 原始总记录数                        </span>│        253 │\n",
       "│<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 唯一的任务ID数                      </span>│        156 │\n",
       "│<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> ----------------------------------  </span>│ ---------- │\n",
       "│<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 保留的无重复记录                    </span>│         65 │\n",
       "│<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> </span><span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">因回答正确而保留的重复记录</span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">          </span>│         45 │\n",
       "│<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> </span><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">因Token消耗最高而保留的重复记录</span><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">     </span>│         46 │\n",
       "│<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> ----------------------------------  </span>│ ---------- │\n",
       "│<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 清洗后总记录数                      </span>│        <span style=\"font-weight: bold\">156</span> │\n",
       "│<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 被移除的重复记录总数                </span>│         <span style=\"color: #800000; text-decoration-color: #800000\">97</span> │\n",
       "└─────────────────────────────────────┴────────────┘\n",
       "</pre>\n"
      ],
      "text/plain": [
       "\u001b[3m                  \u001b[0m\u001b[1;3m📊 清洗结果统计\u001b[0m\u001b[3m                   \u001b[0m\n",
       "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┓\n",
       "┃\u001b[1m \u001b[0m\u001b[1m项目                               \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m      数量\u001b[0m\u001b[1m \u001b[0m┃\n",
       "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━┩\n",
       "│\u001b[2m \u001b[0m\u001b[2m原始总记录数                       \u001b[0m\u001b[2m \u001b[0m│        253 │\n",
       "│\u001b[2m \u001b[0m\u001b[2m唯一的任务ID数                     \u001b[0m\u001b[2m \u001b[0m│        156 │\n",
       "│\u001b[2m \u001b[0m\u001b[2m---------------------------------- \u001b[0m\u001b[2m \u001b[0m│ ---------- │\n",
       "│\u001b[2m \u001b[0m\u001b[2m保留的无重复记录                   \u001b[0m\u001b[2m \u001b[0m│         65 │\n",
       "│\u001b[2m \u001b[0m\u001b[2;32m因回答正确而保留的重复记录\u001b[0m\u001b[2m         \u001b[0m\u001b[2m \u001b[0m│         45 │\n",
       "│\u001b[2m \u001b[0m\u001b[2;36m因Token消耗最高而保留的重复记录\u001b[0m\u001b[2m    \u001b[0m\u001b[2m \u001b[0m│         46 │\n",
       "│\u001b[2m \u001b[0m\u001b[2m---------------------------------- \u001b[0m\u001b[2m \u001b[0m│ ---------- │\n",
       "│\u001b[2m \u001b[0m\u001b[2m清洗后总记录数                     \u001b[0m\u001b[2m \u001b[0m│        \u001b[1m156\u001b[0m │\n",
       "│\u001b[2m \u001b[0m\u001b[2m被移除的重复记录总数               \u001b[0m\u001b[2m \u001b[0m│         \u001b[31m97\u001b[0m │\n",
       "└─────────────────────────────────────┴────────────┘\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">✅ 新文件已保存在: \n",
       "<span style=\"color: #800080; text-decoration-color: #800080; text-decoration: underline\">/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff; text-decoration: underline\">0916_single_agent_cleaned.jsonl</span>\n",
       "</pre>\n"
      ],
      "text/plain": [
       "✅ 新文件已保存在: \n",
       "\u001b[4;35m/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/\u001b[0m\u001b[4;95m0916_single_agent_cleaned.jsonl\u001b[0m\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "deduplicate_and_clean_file(\"/home/ofo/project_workflow_auto_generation/smolagents/examples/open_deep_research/output/validation/0916_single_agent.jsonl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "46120b6f",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "smolagent",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
