{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "92c1a5d8-90d0-4d9e-8ea2-9131dea01d0d",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-04-06T17:39:16.171045427Z",
     "start_time": "2024-04-06T17:39:16.151714313Z"
    }
   },
   "source": [
    "# Data Analysis for Bug Localization "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "b3b5a75a112dce89",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-04-09T08:39:24.989014148Z",
     "start_time": "2024-04-09T08:39:24.985771729Z"
    },
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "from src.utils.jsonl_utils import get_jsonl_data, get_repos"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "5e821c0e-0aaf-4549-bc9b-624c8e823db5",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-04-09T22:46:26.681600559Z",
     "start_time": "2024-04-09T22:46:26.633120285Z"
    }
   },
   "outputs": [],
   "source": [
    "from omegaconf import OmegaConf\n",
    "\n",
    "config = OmegaConf.load('/home/ANON/bug-localization/configs/data/server.yaml')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "2c5bc7eb-1f69-4855-a197-e8d9149b3475",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-04-09T08:39:25.066937792Z",
     "start_time": "2024-04-09T08:39:25.024914175Z"
    }
   },
   "outputs": [],
   "source": [
    "def count_jsonl_data(jsonls_path: str, repo_owner: str, repo_name: str) -> int:\n",
    "    jsonl_data = get_jsonl_data(jsonls_path, repo['owner'], repo['name'])\n",
    "    if jsonl_data is None:\n",
    "        return 0\n",
    "    return len(jsonl_data)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "00278131-d14f-4b06-8698-3b3005cf47e9",
   "metadata": {},
   "source": [
    "# Statistics about GitHub data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "9a5e0a60896f4e27",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-04-09T08:39:42.205151162Z",
     "start_time": "2024-04-09T08:39:25.066843073Z"
    },
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Path /mnt/data/shared-data/benchname/pulls_comments_updated/jformdesigner__flatlaf.jsonl does not exists\n",
      "Path /mnt/data/shared-data/benchname/pulls_comments_updated/cms-sw__cmssw.jsonl does not exists\n",
      "Path /mnt/data/shared-data/benchname/pulls_comments_updated/yelp__paasta.jsonl does not exists\n",
      "Path /mnt/data/shared-data/benchname/pulls_comments_updated/zephyrproject-rtos__zephyr.jsonl does not exists\n",
      "Path /mnt/data/shared-data/benchname/pulls_comments_updated/shipshapecode__tether.jsonl does not exists\n",
      "Path /mnt/data/shared-data/benchname/pulls_comments_updated/lightninglabs__loop.jsonl does not exists\n",
      "Path /mnt/data/shared-data/benchname/pulls_comments_updated/odyseeteam__odysee-api.jsonl does not exists\n",
      "Path /mnt/data/shared-data/benchname/issues_prs_updated_dedup/draios__agent-libs.jsonl does not exists\n",
      "Path /mnt/data/shared-data/benchname/comments_updated_dedup/draios__agent-libs.jsonl does not exists\n",
      "Path /mnt/data/shared-data/benchname/pulls_updated_dedup/draios__agent-libs.jsonl does not exists\n",
      "Path /mnt/data/shared-data/benchname/pulls_comments_updated/draios__agent-libs.jsonl does not exists\n",
      "Path /mnt/data/shared-data/benchname/issues_prs_updated_dedup/mintlayer__mintlayer-core.jsonl does not exists\n",
      "Path /mnt/data/shared-data/benchname/comments_updated_dedup/mintlayer__mintlayer-core.jsonl does not exists\n",
      "Path /mnt/data/shared-data/benchname/pulls_comments_updated/mintlayer__mintlayer-core.jsonl does not exists\n",
      "Path /mnt/data/shared-data/benchname/pulls_comments_updated/haikuports__haikuporter.jsonl does not exists\n",
      "Path /mnt/data/shared-data/benchname/pulls_comments_updated/gjsify__ts-for-gir.jsonl does not exists\n",
      "Path /mnt/data/shared-data/benchname/pulls_comments_updated/fprime-community__fpp.jsonl does not exists\n"
     ]
    }
   ],
   "source": [
    "issues_count = 0\n",
    "issue_comments_count = 0\n",
    "prs_count = 0\n",
    "prs_comments_count = 0\n",
    "\n",
    "for repo in get_repos(config.repos_list_path):\n",
    "    issues_count += count_jsonl_data(config.issues_path, repo['owner'], repo['name'])\n",
    "    issue_comments_count += count_jsonl_data(config.issues_comments_path, repo['owner'], repo['name'])\n",
    "    prs_count += count_jsonl_data(config.pulls_path, repo['owner'], repo['name'])\n",
    "    prs_comments_count += count_jsonl_data(config.pull_requests_comments_path, repo['owner'], repo['name'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "287d7880-c5cb-47ef-a573-ae89c041e0ca",
   "metadata": {
    "ExecuteTime": {
     "start_time": "2024-04-09T08:39:42.202050302Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "    Issues count: 15580465\n",
      "    Issues comments count: 34437308\n",
      "    Pulls count: 7027484\n",
      "    Pulls comments count: 17251762\n"
     ]
    }
   ],
   "source": [
    "print(f\"\"\"\n",
    "    Issues count: {issues_count}\n",
    "    Issues comments count: {issue_comments_count}\n",
    "    Pulls count: {prs_count}\n",
    "    Pulls comments count: {prs_comments_count}\n",
    "\"\"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "83d9ce71-e291-43df-886f-11b45ff70e53",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-04-09T08:39:42.257492132Z",
     "start_time": "2024-04-09T08:39:42.243754902Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "    Repos count: 7755\n"
     ]
    }
   ],
   "source": [
    "print(f\"\"\"\n",
    "    Repos count: {len(get_repos(config.repos_list_path))}\n",
    "\"\"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "90618b9e-3871-4ede-b367-cf6a85ba3a82",
   "metadata": {
    "ExecuteTime": {
     "start_time": "2024-04-09T08:39:42.244013190Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Path /mnt/data/shared-data/benchname/issues_links_filtered_updated/draios__agent-libs.jsonl does not exists\n",
      "Path /mnt/data/shared-data/benchname/issues_links_filtered_updated/mintlayer__mintlayer-core.jsonl does not exists\n"
     ]
    }
   ],
   "source": [
    "from collections import defaultdict\n",
    "\n",
    "parsed_links_count = 0\n",
    "links_with_status_count = 0\n",
    "links_by_status_count = defaultdict(int)\n",
    "\n",
    "for repo in get_repos(config.repos_list_path):\n",
    "    issue_links = get_jsonl_data(config.issues_links_filtered_path, repo['owner'], repo['name'])\n",
    "    if issue_links is None:\n",
    "        continue\n",
    "    for issue_link in issue_links:\n",
    "        links_by_status_count[issue_link['status']] += 1\n",
    "    parsed_links_count += count_jsonl_data(config.issues_links_path, repo['owner'], repo['name'])\n",
    "    links_with_status_count += len(issue_links)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "d1ef8c94-2769-4c2d-9f2f-49b9536cd2da",
   "metadata": {
    "ExecuteTime": {
     "start_time": "2024-04-09T08:39:42.244262658Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "    Issues links count: 25544771\n",
      "    Issues links with status count: 25544771\n"
     ]
    }
   ],
   "source": [
    "print(f\"\"\"\n",
    "    Issues links count: {parsed_links_count}\n",
    "    Issues links with status count: {links_with_status_count}\n",
    "\"\"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "0be6d41f-bea2-43f6-87bc-c6f615d7cdf3",
   "metadata": {
    "ExecuteTime": {
     "start_time": "2024-04-09T08:39:42.244410097Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "not_enough_info: 21193707\n",
      "issue_not_a_bug: 3472057\n",
      "ok: 10971\n",
      "no_fix_keyword: 10406\n",
      "pr_to_multi_issues: 7376\n",
      "issue_to_multi_prs: 1934\n",
      "diff_has_new_files: 30572\n",
      "diff_can_not_extract: 475447\n",
      "diff_can_not_extract_changed_files: 6198\n",
      "issue_not_english: 35942\n",
      "issue_has_media: 145225\n",
      "diff_non_code_files: 138653\n",
      "issue_empty: 16265\n",
      "diff_non_utf8: 18\n"
     ]
    }
   ],
   "source": [
    "for status, status_count in links_by_status_count.items():\n",
    "    print(f\"{status}: {status_count}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "9e8cdd12-20d9-4454-a208-e129946b900e",
   "metadata": {
    "ExecuteTime": {
     "start_time": "2024-04-09T08:39:42.244558286Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "25544771\n",
      "not_enough_info 21193707 82.97% 487.09%\n",
      "4351064\n",
      "issue_not_a_bug 3472057 79.80% 79.80%\n",
      "879007\n",
      "issue_empty 16265 1.85% 0.37%\n",
      "862742\n",
      "issue_has_media 145225 16.83% 3.34%\n",
      "717517\n",
      "issue_not_english 35942 5.01% 0.83%\n",
      "681575\n",
      "diff_can_not_extract 475447 69.76% 10.93%\n",
      "206128\n",
      "diff_has_new_files 30572 14.83% 0.70%\n",
      "175556\n",
      "diff_non_code_files 138653 78.98% 3.19%\n",
      "36903\n",
      "diff_non_utf8 18 0.05% 0.00%\n",
      "36885\n",
      "diff_can_not_extract_changed_files 6198 16.80% 0.14%\n",
      "30687\n",
      "pr_to_multi_issues 7376 24.04% 0.17%\n",
      "23311\n",
      "issue_to_multi_prs 1934 8.30% 0.04%\n",
      "21377\n",
      "no_fix_keyword 10406 48.68% 0.24%\n",
      "10971\n"
     ]
    }
   ],
   "source": [
    "filters_list = ['not_enough_info', \n",
    "               'issue_not_a_bug', 'issue_empty', 'issue_has_media', 'issue_not_english',\n",
    "               'diff_can_not_extract', 'diff_has_new_files', 'diff_non_code_files', 'diff_non_utf8', \n",
    "               'diff_can_not_extract_changed_files', \n",
    "               'pr_to_multi_issues', 'issue_to_multi_prs', 'no_fix_keyword'\n",
    "              ]\n",
    "initial_count = links_with_status_count - links_by_status_count['not_enough_info']\n",
    "cur_count = parsed_links_count\n",
    "print(cur_count)\n",
    "for f in filters_list:\n",
    "    print(f, links_by_status_count[f], '{:.2f}% {:.2f}%'.format(links_by_status_count[f] / cur_count * 100, links_by_status_count[f] / initial_count * 100))\n",
    "    cur_count -= links_by_status_count[f]\n",
    "    print(cur_count)\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3018b3a9-e10f-42b9-bc2f-6a3115a95372",
   "metadata": {
    "ExecuteTime": {
     "start_time": "2024-04-09T08:39:42.244694755Z"
    }
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
