{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os \n",
    "import json\n",
    "from tqdm import tqdm \n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'dict'>\n",
      "110\n"
     ]
    }
   ],
   "source": [
    "test_out_times_path = \"/largespace/tydata/code_optimization/cpp/dataset/benchmark_gem5_testcases3/val_out_times.json\"\n",
    "with open(test_out_times_path, 'r') as f:\n",
    "    datasets = json.load(f)\n",
    "\n",
    "print(type(datasets))\n",
    "print(len(datasets))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def make_pair_cross(sorted_submissions):\n",
    "    length = len(sorted_submissions)\n",
    "    pairs = []\n",
    "    for i in range(length):\n",
    "        for j in range(i+1, length):\n",
    "            slow, fast = i, j\n",
    "            pair = (sorted_submissions[slow], sorted_submissions[fast])\n",
    "            pairs.append(pair)\n",
    "            \n",
    "    return pairs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def relative_improvement(slow:float, fast:float):\n",
    "    return round((slow - fast) / slow, 4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|          | 0/110 [00:00<?, ?it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 110/110 [00:00<00:00, 2883.48it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "pairs length = 10389\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# by user\n",
    "items = []\n",
    "for problem_id, value in tqdm(datasets.items()):\n",
    "    # value is a dict\n",
    "    for user_id, submissions in value.items():\n",
    "        # submissions is a dict: {'s494615146': 0.001053309444, 's487731918': 0.0010449860355}\n",
    "        sorted_submission_id = sorted(submissions, key=submissions.get, reverse=True) # llist\n",
    "        if len(sorted_submission_id) > 1:\n",
    "            pairs = make_pair_cross(sorted_submission_id)\n",
    "            for each_pair in pairs:\n",
    "                slow_submission_id, fast_submission_id = each_pair\n",
    "                slow_time = submissions[slow_submission_id]\n",
    "                fast_time = submissions[fast_submission_id]\n",
    "                improvement = relative_improvement(slow_time, fast_time)\n",
    "                item = {\n",
    "                    \"problem_id\": problem_id,\n",
    "                    \"user_id\": user_id,\n",
    "                    \"slow_submission_id\": slow_submission_id,\n",
    "                    \"fast_submission_id\": fast_submission_id,\n",
    "                    \"slow_time\": slow_time,\n",
    "                    \"fast_time\": fast_time,\n",
    "                    \"improvement\": improvement\n",
    "                }\n",
    "                items.append(item)\n",
    "\n",
    "print(f\"pairs length = {len(items)}\")\n",
    "with open(\"/largespace/tydata/code_optimization/cpp/dataset/benchmark_gem5_testcases3/val_out_pairs.json\", 'w') as g:\n",
    "    json.dump(items, g, indent=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "length of original pairs = 3751\n"
     ]
    }
   ],
   "source": [
    "datas = []\n",
    "with open(\"/home/tongye/code_generation/pie-perf/data/cpp_original/val.jsonl\", 'r') as pp:\n",
    "    for line in pp:\n",
    "        item = json.loads(line)\n",
    "        datas.append(item)\n",
    "\n",
    "original_pairs = []\n",
    "for item in datas:\n",
    "    problem_id = item[\"problem_id\"]\n",
    "    user_id = item[\"user_id\"]\n",
    "    slow_submission_id = item[\"submission_id_v0\"]\n",
    "    fast_submission_id = item[\"submission_id_v1\"]\n",
    "    pair = f\"{problem_id}_{user_id}_{slow_submission_id}_{fast_submission_id}\"\n",
    "    original_pairs.append(pair)\n",
    "\n",
    "print(f\"length of original pairs = {len(original_pairs)}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10389\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 10389/10389 [00:00<00:00, 1214860.72it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2769\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "with open(\"/largespace/tydata/code_optimization/cpp/dataset/benchmark_gem5_testcases3/val_out_pairs.json\", 'r') as g:\n",
    "    pairs = json.load(g)\n",
    "\n",
    "print(len(pairs))\n",
    "count = 0\n",
    "pairs_in_improvement10 = []\n",
    "for pair in tqdm(pairs):\n",
    "    problem_id = pair[\"problem_id\"]\n",
    "    user_id = pair[\"user_id\"]\n",
    "    slow_submission_id = pair[\"slow_submission_id\"]\n",
    "    fast_submission_id = pair[\"fast_submission_id\"]\n",
    "    improvement = pair[\"improvement\"]\n",
    "    identifier = f\"{problem_id}_{user_id}_{slow_submission_id}_{fast_submission_id}\"\n",
    "    if improvement > 0.1:\n",
    "        count += 1\n",
    "        pairs_in_improvement10.append(pair)\n",
    "print(count)\n",
    "\n",
    "with open(\"/largespace/tydata/code_optimization/cpp/dataset/benchmark_gem5_testcases3/val_out_pairs_improvment10.json\", 'w') as p:\n",
    "    json.dump(pairs_in_improvement10, p, indent=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 10389/10389 [00:00<00:00, 1763084.13it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2769\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "with open(\"/largespace/tydata/code_optimization/cpp/dataset/benchmark_gem5_testcases3/val_out_pairs.json\", 'r') as g:\n",
    "    pairs = json.load(g)\n",
    "\n",
    "count = 0\n",
    "for pair in tqdm(pairs):\n",
    "    problem_id = pair[\"problem_id\"]\n",
    "    user_id = pair[\"user_id\"]\n",
    "    slow_submission_id = pair[\"slow_submission_id\"]\n",
    "    fast_submission_id = pair[\"fast_submission_id\"]\n",
    "    improvement = pair[\"improvement\"]\n",
    "    if improvement > 0.1:\n",
    "        \n",
    "        count += 1\n",
    "\n",
    "print(count)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "56086\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 56086/56086 [01:28<00:00, 631.27it/s] \n"
     ]
    }
   ],
   "source": [
    "with open(\"/data3/tydata3/code_optimization/cpp/benchmark_gem5_testcases3/train_out_pairs_improvement10.json\", 'r') as p:\n",
    "    datas = json.load(p)\n",
    "\n",
    "print(len(datas))\n",
    "speedups = []\n",
    "items_new = []\n",
    "for item in tqdm(datas):\n",
    "    problem_id = item[\"problem_id\"]\n",
    "    user_id = item[\"user_id\"]\n",
    "    slow_submission_id = item[\"slow_submission_id\"]\n",
    "    fast_submission_id = item[\"fast_submission_id\"]\n",
    "    slow_time = item[\"slow_time\"]\n",
    "    fast_time = item[\"fast_time\"]\n",
    "    improvement = item[\"improvement\"]\n",
    "\n",
    "    slow_cpp_file_path = os.path.join(\"/data3/tydata3/code_optimization/cpp/cpp_code/train\", f\"{problem_id}_{slow_submission_id}_{user_id}.cpp\")\n",
    "    with open(slow_cpp_file_path, 'r') as f_slow:\n",
    "        slow_cpp_content = f_slow.read()\n",
    "    slow_code = slow_cpp_content\n",
    "\n",
    "    fast_cpp_file_path = os.path.join(\"/data3/tydata3/code_optimization/cpp/cpp_code/train\", f\"{problem_id}_{fast_submission_id}_{user_id}.cpp\")\n",
    "    with open(fast_cpp_file_path, 'r') as f_fast:\n",
    "        fast_cpp_content = f_fast.read()\n",
    "    fast_code = fast_cpp_content\n",
    "\n",
    "    item[\"slow_code\"] = slow_code\n",
    "    item[\"fast_code\"] = fast_code\n",
    "\n",
    "    items_new.append(item)\n",
    "\n",
    "with open(\"/data3/tydata3/code_optimization/cpp/dataset/by_user/train_out_pair_improvement10.json\", 'w') as ff:\n",
    "    json.dump(items_new, ff, indent=4)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "233554\n",
      "4.037934317541367\n",
      "56086\n"
     ]
    }
   ],
   "source": [
    "with open(\"/data3/tydata3/code_optimization/cpp/benchmark_gem5_testcases3/train_out_pairs.json\", 'r') as p:\n",
    "    datas = json.load(p)\n",
    "\n",
    "print(len(datas))\n",
    "speedups = []\n",
    "for item in datas:\n",
    "    if item[\"improvement\"] > 0.1:\n",
    "        speedup = item[\"slow_time\"] / item[\"fast_time\"]\n",
    "        speedups.append(speedup)\n",
    "\n",
    "print(sum(speedups)/len(speedups))\n",
    "print(len(speedups))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "llama",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
