{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import glob\n",
    "import json\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "val_dataset has 3751 items.\n"
     ]
    }
   ],
   "source": [
    "# Load the JSONL file: one JSON object per line.\n",
    "# NOTE(review): the list is named test_dataset but the file is val.jsonl and the\n",
    "# message says val_dataset -- confirm which split this is meant to be.\n",
    "cpp_test_original = \"/home/tongye/code_generation/pie-perf/data/cpp_original/val.jsonl\"\n",
    "# Explicit UTF-8 so decoding does not depend on the machine's locale.\n",
    "with open(cpp_test_original, 'r', encoding='utf-8') as jsonl_file:\n",
    "    # Skip blank lines (e.g. a trailing newline) that json.loads would reject.\n",
    "    test_dataset = [json.loads(line) for line in jsonl_file if line.strip()]\n",
    "\n",
    "print(f\"val_dataset has {len(test_dataset)} items.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'dict'>\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "dict_keys(['user_id', 'problem_id', 'language', 'submission_id_v0', 'submission_id_v1', 'cpu_time_v0', 'cpu_time_v1', 'memory_v0', 'memory_v1', 'status_v0', 'status_v1', 'improvement_frac', 'input', 'target', 'code_v0_loc', 'code_v1_loc', 'code_v0_num_chars', 'code_v1_num_chars', 'code_v0_no_empty_lines', 'code_v1_no_empty_lines', 'code_same', 'relative_loc_diff_percent', 'diff', 'diff_only_import_comment'])"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect the first record: its Python type, then the keys it carries.\n",
    "first_item = test_dataset[0]\n",
    "print(type(first_item))\n",
    "first_item.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_unique_problem_id(dataset):\n",
    "    unique_problem_ids = set()\n",
    "    for data_item in dataset:\n",
    "        problem_id = data_item['problem_id']\n",
    "        unique_problem_ids.add(problem_id)\n",
    "    return unique_problem_ids\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "test set has 118 unique problem ids.\n"
     ]
    }
   ],
   "source": [
    "# Distinct problems covered by the loaded records.\n",
    "unique_problem_ids = get_unique_problem_id(dataset=test_dataset)\n",
    "print(f\"test set has {len(unique_problem_ids)} unique problem ids.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_unique_submission(dataset):\n",
    "    unique_submission = set()\n",
    "    for data_item in dataset:\n",
    "        problem_id = data_item['problem_id']\n",
    "        user_id = data_item['user_id']\n",
    "        submission_id_v0 = data_item['submission_id_v0']\n",
    "        submission_id_v1 = data_item['submission_id_v1']\n",
    "        unique_identifier_0 = f\"{problem_id}_{user_id}_{submission_id_v0}\"\n",
    "        unique_identifier_1 = f\"{problem_id}_{user_id}_{submission_id_v1}\"\n",
    "\n",
    "        unique_submission.add(unique_identifier_0)\n",
    "        unique_submission.add(unique_identifier_1)\n",
    "    \n",
    "    return unique_submission"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "test set has 7230 unique submission.\n"
     ]
    }
   ],
   "source": [
    "# One identifier per (problem, user, submission) seen in either version slot.\n",
    "unique_submission = get_unique_submission(dataset=test_dataset)\n",
    "print(f\"test set has {len(unique_submission)} unique submission.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "there are 7230 in /largespace/tydata/code_optimization/cpp/dataset/cpp_code/val/\n"
     ]
    }
   ],
   "source": [
    "# Count the *.cpp files directly inside the store directory.\n",
    "test_store_path = \"/largespace/tydata/code_optimization/cpp/dataset/cpp_code/val/\"\n",
    "# os.path.join keeps the pattern valid whether or not the directory ends with '/'.\n",
    "cpp_files = glob.glob(os.path.join(test_store_path, '*.cpp'))\n",
    "print(f\"there are {len(cpp_files)} in {test_store_path}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "there are 7115 in /largespace/tydata/code_optimization/cpp/dataset/cpp_binary/val_out/\n"
     ]
    }
   ],
   "source": [
    "# Count the *.out files directly inside the output directory.\n",
    "test_out_store_path = \"/largespace/tydata/code_optimization/cpp/dataset/cpp_binary/val_out/\"\n",
    "# os.path.join keeps the pattern valid whether or not the directory ends with '/'.\n",
    "cpp_out_files = glob.glob(os.path.join(test_out_store_path, '*.out'))\n",
    "print(f\"there are {len(cpp_out_files)} in {test_out_store_path}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "llama",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
