{
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "# Statistics for Commit Message Generation dataset from BenchName"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "8994451ee581764c"
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "initial_id",
   "metadata": {
    "collapsed": true,
    "ExecuteTime": {
     "end_time": "2024-06-05T16:56:27.080564Z",
     "start_time": "2024-06-05T16:56:21.289757Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": "                                       hash       repo                 date  \\\n0  c27d31c06520c3df4c820ea10d5d16316f4d88cb  cupy/cupy  19.07.2017 16:24:41   \n1  6683a9aa7bae67e855cd9d1f17fdc49eb3f6dea0  cupy/cupy  17.06.2020 22:41:09   \n2  dad51485282b6e05c4993b0733bd54aa3c0bacef  cupy/cupy  12.01.2021 16:21:46   \n3  76eb888612183768d9e1b0c818fcf5416c5f28c7  cupy/cupy  20.01.2021 18:25:20   \n4  994ce07595026d5de54f52ef5748b578f9fae1bc  cupy/cupy  09.07.2021 13:57:44   \n\n       license                                            message  \\\n0  MIT License  Support CUDA stream on memory pool\\n\\nNow, mem...   \n1  MIT License  Complete overhaul of filter testing.\\n\\nThese ...   \n2  MIT License  Use \"import numpy as np\" in the array_api subm...   \n3  MIT License  Use _implementation on all functions that have...   \n4  MIT License  Use better type signatures in the array API mo...   \n\n                                                mods  \n0  [{'change_type': 'MODIFY', 'old_path': 'cupy/c...  \n1  [{'change_type': 'MODIFY', 'old_path': 'tests/...  \n2  [{'change_type': 'MODIFY', 'old_path': 'numpy/...  \n3  [{'change_type': 'MODIFY', 'old_path': 'numpy/...  \n4  [{'change_type': 'MODIFY', 'old_path': 'numpy/...  ",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>hash</th>\n      <th>repo</th>\n      <th>date</th>\n      <th>license</th>\n      <th>message</th>\n      <th>mods</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>c27d31c06520c3df4c820ea10d5d16316f4d88cb</td>\n      <td>cupy/cupy</td>\n      <td>19.07.2017 16:24:41</td>\n      <td>MIT License</td>\n      <td>Support CUDA stream on memory pool\\n\\nNow, mem...</td>\n      <td>[{'change_type': 'MODIFY', 'old_path': 'cupy/c...</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>6683a9aa7bae67e855cd9d1f17fdc49eb3f6dea0</td>\n      <td>cupy/cupy</td>\n      <td>17.06.2020 22:41:09</td>\n      <td>MIT License</td>\n      <td>Complete overhaul of filter testing.\\n\\nThese ...</td>\n      <td>[{'change_type': 'MODIFY', 'old_path': 'tests/...</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>dad51485282b6e05c4993b0733bd54aa3c0bacef</td>\n      <td>cupy/cupy</td>\n      <td>12.01.2021 16:21:46</td>\n      <td>MIT License</td>\n      <td>Use \"import numpy as np\" in the array_api subm...</td>\n      <td>[{'change_type': 'MODIFY', 'old_path': 'numpy/...</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>76eb888612183768d9e1b0c818fcf5416c5f28c7</td>\n      <td>cupy/cupy</td>\n      <td>20.01.2021 18:25:20</td>\n      <td>MIT License</td>\n      <td>Use _implementation on all functions that have...</td>\n      <td>[{'change_type': 'MODIFY', 'old_path': 'numpy/...</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>994ce07595026d5de54f52ef5748b578f9fae1bc</td>\n      <td>cupy/cupy</td>\n      <td>09.07.2021 13:57:44</td>\n      <td>MIT License</td>\n   
   <td>Use better type signatures in the array API mo...</td>\n      <td>[{'change_type': 'MODIFY', 'old_path': 'numpy/...</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "df = load_dataset(\n",
    "    \"anon-iclr-submission/benchname-commit-message-generation\", \"commitchronicle-py-long\", split=\"test\"\n",
    ").to_pandas()\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "data": {
      "text/plain": "163"
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(df)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-05T16:56:27.084801Z",
     "start_time": "2024-06-05T16:56:27.079822Z"
    }
   },
   "id": "4274e640f2525848",
   "execution_count": 3
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "data": {
      "text/plain": "34"
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.repo.nunique()"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-05T16:56:27.090971Z",
     "start_time": "2024-06-05T16:56:27.086150Z"
    }
   },
   "id": "e8ee057016c2cc54",
   "execution_count": 4
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Commit Messages"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "31ce0f2e7f999208"
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "data": {
      "text/plain": "                    count        mean        std   min    1%     5%    10%  \\\nnum_characters_msg  163.0  199.957055  63.854614  86.0  96.1  114.0  123.2   \nnum_words_msg       163.0   28.797546   9.930101   9.0  11.0   15.0   16.0   \nnum_lines_msg       163.0    4.638037   1.400274   2.0   2.0    3.0    3.0   \n\n                      25%    50%    75%    90%    95%     99%    max  \nnum_characters_msg  148.0  188.0  242.0  290.0  315.9  355.80  367.0  \nnum_words_msg        21.5   28.0   36.0   43.0   45.9   51.38   58.0  \nnum_lines_msg         4.0    4.0    5.0    7.0    7.0    8.38    9.0  ",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>count</th>\n      <th>mean</th>\n      <th>std</th>\n      <th>min</th>\n      <th>1%</th>\n      <th>5%</th>\n      <th>10%</th>\n      <th>25%</th>\n      <th>50%</th>\n      <th>75%</th>\n      <th>90%</th>\n      <th>95%</th>\n      <th>99%</th>\n      <th>max</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>num_characters_msg</th>\n      <td>163.0</td>\n      <td>199.957055</td>\n      <td>63.854614</td>\n      <td>86.0</td>\n      <td>96.1</td>\n      <td>114.0</td>\n      <td>123.2</td>\n      <td>148.0</td>\n      <td>188.0</td>\n      <td>242.0</td>\n      <td>290.0</td>\n      <td>315.9</td>\n      <td>355.80</td>\n      <td>367.0</td>\n    </tr>\n    <tr>\n      <th>num_words_msg</th>\n      <td>163.0</td>\n      <td>28.797546</td>\n      <td>9.930101</td>\n      <td>9.0</td>\n      <td>11.0</td>\n      <td>15.0</td>\n      <td>16.0</td>\n      <td>21.5</td>\n      <td>28.0</td>\n      <td>36.0</td>\n      <td>43.0</td>\n      <td>45.9</td>\n      <td>51.38</td>\n      <td>58.0</td>\n    </tr>\n    <tr>\n      <th>num_lines_msg</th>\n      <td>163.0</td>\n      <td>4.638037</td>\n      <td>1.400274</td>\n      <td>2.0</td>\n      <td>2.0</td>\n      <td>3.0</td>\n      <td>3.0</td>\n      <td>4.0</td>\n      <td>4.0</td>\n      <td>5.0</td>\n      <td>7.0</td>\n      <td>7.0</td>\n      <td>8.38</td>\n      <td>9.0</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[\"num_characters_msg\"] = df.message.str.len()\n",
    "df[\"num_words_msg\"] = df.message.str.split(\" \").str.len()\n",
    "df[\"num_lines_msg\"] = df.message.str.split(\"\\n\").str.len()\n",
    "\n",
    "df[[\"num_characters_msg\", \"num_words_msg\", \"num_lines_msg\"]].describe(\n",
    "    percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]\n",
    ").T"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-05T16:56:27.134214Z",
     "start_time": "2024-06-05T16:56:27.104327Z"
    }
   },
   "id": "cf07b2d3708d61f4",
   "execution_count": 5
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Diffs"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "cb83a819ac6fa374"
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "data": {
      "text/plain": "                     count         mean          std     min       1%      5%  \\\nnum_modified_files   163.0     3.417178     2.617170     1.0     1.00     1.0   \nnum_characters_diff  163.0  8697.766871  6055.710918  3346.0  3363.20  3470.5   \nnum_words_diff       163.0  2086.349693  1553.492646   388.0   608.46   720.7   \nnum_lines_diff       163.0   212.453988   146.325791    67.0    72.48    81.1   \n\n                        10%     25%     50%      75%      90%      95%  \\\nnum_modified_files      1.0     2.0     3.0      4.0      7.0      8.9   \nnum_characters_diff  3825.2  4863.5  6639.0  10020.0  17010.4  21030.3   \nnum_words_diff        823.6  1039.0  1608.0   2414.0   4077.6   4842.8   \nnum_lines_diff         92.8   113.5   159.0    248.0    413.8    499.3   \n\n                          99%      max  \nnum_modified_files      12.14     15.0  \nnum_characters_diff  27513.18  41714.0  \nnum_words_diff        7756.82   9645.0  \nnum_lines_diff         710.58    864.0  ",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>count</th>\n      <th>mean</th>\n      <th>std</th>\n      <th>min</th>\n      <th>1%</th>\n      <th>5%</th>\n      <th>10%</th>\n      <th>25%</th>\n      <th>50%</th>\n      <th>75%</th>\n      <th>90%</th>\n      <th>95%</th>\n      <th>99%</th>\n      <th>max</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>num_modified_files</th>\n      <td>163.0</td>\n      <td>3.417178</td>\n      <td>2.617170</td>\n      <td>1.0</td>\n      <td>1.00</td>\n      <td>1.0</td>\n      <td>1.0</td>\n      <td>2.0</td>\n      <td>3.0</td>\n      <td>4.0</td>\n      <td>7.0</td>\n      <td>8.9</td>\n      <td>12.14</td>\n      <td>15.0</td>\n    </tr>\n    <tr>\n      <th>num_characters_diff</th>\n      <td>163.0</td>\n      <td>8697.766871</td>\n      <td>6055.710918</td>\n      <td>3346.0</td>\n      <td>3363.20</td>\n      <td>3470.5</td>\n      <td>3825.2</td>\n      <td>4863.5</td>\n      <td>6639.0</td>\n      <td>10020.0</td>\n      <td>17010.4</td>\n      <td>21030.3</td>\n      <td>27513.18</td>\n      <td>41714.0</td>\n    </tr>\n    <tr>\n      <th>num_words_diff</th>\n      <td>163.0</td>\n      <td>2086.349693</td>\n      <td>1553.492646</td>\n      <td>388.0</td>\n      <td>608.46</td>\n      <td>720.7</td>\n      <td>823.6</td>\n      <td>1039.0</td>\n      <td>1608.0</td>\n      <td>2414.0</td>\n      <td>4077.6</td>\n      <td>4842.8</td>\n      <td>7756.82</td>\n      <td>9645.0</td>\n    </tr>\n    <tr>\n      <th>num_lines_diff</th>\n      <td>163.0</td>\n      <td>212.453988</td>\n      <td>146.325791</td>\n      <td>67.0</td>\n      <td>72.48</td>\n      
<td>81.1</td>\n      <td>92.8</td>\n      <td>113.5</td>\n      <td>159.0</td>\n      <td>248.0</td>\n      <td>413.8</td>\n      <td>499.3</td>\n      <td>710.58</td>\n      <td>864.0</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[\"num_modified_files\"] = df.mods.str.len()\n",
    "df[\"num_characters_diff\"] = [sum(len(mod[\"diff\"]) for mod in mods) for mods in df.mods]\n",
    "df[\"num_words_diff\"] = [sum(len(mod[\"diff\"].split(\" \")) for mod in mods) for mods in df.mods]\n",
    "df[\"num_lines_diff\"] = [sum(len(mod[\"diff\"].split(\"\\n\")) for mod in mods) for mods in df.mods]\n",
    "\n",
    "df[[\"num_modified_files\", \"num_characters_diff\", \"num_words_diff\", \"num_lines_diff\"]].describe(\n",
    "    percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]\n",
    ").T"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-05T16:56:27.176036Z",
     "start_time": "2024-06-05T16:56:27.124321Z"
    }
   },
   "id": "f920b14ca39629e9",
   "execution_count": 6
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Files"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "2d6f8dc4d4028dad"
  },
  {
   "cell_type": "markdown",
   "source": [
    "### Utils"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "4d41c63b9cb88b79"
  },
  {
   "cell_type": "markdown",
   "source": [
    "#### Downloading repositories"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "77378ace39916d7"
  },
  {
   "cell_type": "code",
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "\n",
    "data_dir = \"../data\""
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-05T16:58:33.079344Z",
     "start_time": "2024-06-05T16:58:33.076410Z"
    }
   },
   "id": "b0d34485b5c78d79",
   "execution_count": 15
  },
  {
   "cell_type": "code",
   "outputs": [],
   "source": [
    "from huggingface_hub import list_repo_tree, hf_hub_download\n",
    "import tarfile\n",
    "\n",
    "\n",
    "for repo_file in list_repo_tree(\"anon-iclr-submission/benchname-commit-message-generation\", \"repos\", repo_type=\"dataset\"):\n",
    "    file_path = hf_hub_download(\n",
    "        repo_id=\"anon-iclr-submission/benchname-commit-message-generation\",\n",
    "        filename=repo_file.path,\n",
    "        repo_type=\"dataset\",\n",
    "        local_dir=data_dir,\n",
    "    )\n",
    "\n",
    "    with tarfile.open(file_path, \"r:gz\") as tar:\n",
    "        tar.extractall(path=os.path.join(data_dir, \"extracted_repos\"))"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "963581ac977ac3fc",
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "source": [
    "### Helper function"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "a3bd3246d1f16d53"
  },
  {
   "cell_type": "code",
   "outputs": [],
   "source": [
    "import git\n",
    "from collections import defaultdict\n",
    "from typing import Dict\n",
    "\n",
    "\n",
    "def get_changed_files_before_commit_stats(repo_path: str, commit_hash: str) -> Dict[str, int]:\n",
    "    repo = git.Repo(repo_path)\n",
    "    repo.git.checkout(\"HEAD\", \".\")\n",
    "    repo.git.clean(\"-fd\")\n",
    "    commit = repo.commit(commit_hash)\n",
    "\n",
    "    if len(commit.parents) > 1:\n",
    "        raise ValueError(\"More than one parent\")\n",
    "\n",
    "    changed_files = list(commit.stats.files.keys())\n",
    "    try:\n",
    "        repo.git.checkout(commit.parents[0].hexsha)\n",
    "    except git.GitCommandError as e:\n",
    "        print(os.path.basename(repo_path), commit_hash, e)\n",
    "        if repo.is_dirty(untracked_files=True):\n",
    "            repo.git.stash(\"save\", \"--include-untracked\")\n",
    "            repo.git.clean(\"-fd\")\n",
    "            repo.git.checkout(commit.parents[0].hexsha)\n",
    "\n",
    "    stats = defaultdict(int)\n",
    "    for file_path in changed_files:\n",
    "        try:\n",
    "            with open(os.path.join(repo_path, file_path), \"r\") as file:\n",
    "                content = file.read()\n",
    "                stats[\"num_chars\"] += len(content)\n",
    "                stats[\"num_words\"] += len(content.split(\" \"))\n",
    "                stats[\"num_lines\"] += len(content.split(\"\\n\"))\n",
    "        except FileNotFoundError:\n",
    "            print(\n",
    "                f\"File {file_path} before commit not found for commit {commit_hash} in repo {os.path.basename(repo_path)}\"\n",
    "            )\n",
    "\n",
    "    repo.git.checkout(\"HEAD\", \".\")\n",
    "    return stats\n",
    "\n",
    "\n",
    "def get_changed_files_after_commit_stats(repo_path: str, commit_hash: str) -> Dict[str, int]:\n",
    "    repo = git.Repo(repo_path)\n",
    "    repo.git.checkout(\"HEAD\", \".\")\n",
    "    repo.git.clean(\"-fd\")\n",
    "    commit = repo.commit(commit_hash)\n",
    "    changed_files = list(commit.stats.files.keys())\n",
    "    try:\n",
    "        repo.git.checkout(commit_hash)\n",
    "    except git.GitCommandError as e:\n",
    "        print(os.path.basename(repo_path), commit_hash, e)\n",
    "        repo.git.stash(\"save\", \"--include-untracked\")\n",
    "        repo.git.clean(\"-fd\")\n",
    "        repo.git.checkout(commit_hash)\n",
    "\n",
    "    stats = defaultdict(int)\n",
    "    for file_path in changed_files:\n",
    "        try:\n",
    "            with open(os.path.join(repo_path, file_path), \"r\") as file:\n",
    "                content = file.read()\n",
    "                stats[\"num_chars\"] += len(content)\n",
    "                stats[\"num_words\"] += len(content.split(\" \"))\n",
    "                stats[\"num_lines\"] += len(content.split(\"\\n\"))\n",
    "        except FileNotFoundError:\n",
    "            print(\n",
    "                f\"File {file_path} after commit not found for commit {commit_hash} in repo {os.path.basename(repo_path)}\"\n",
    "            )\n",
    "\n",
    "    repo.git.checkout(\"HEAD\", \".\")\n",
    "    return stats\n",
    "\n",
    "\n",
    "def get_changed_files_full_stats(repo_path: str, commit_hash: str) -> Dict[str, int]:\n",
    "    stats_before = get_changed_files_before_commit_stats(repo_path, commit_hash)\n",
    "    stats_after = get_changed_files_after_commit_stats(repo_path, commit_hash)\n",
    "    for key in stats_before:\n",
    "        stats_before[key] += stats_after[key]\n",
    "    return stats_before\n",
    "\n",
    "\n",
    "def get_all_files_stats(repo_path: str, commit_hash: str) -> Dict[str, int]:\n",
    "    repo = git.Repo(repo_path)\n",
    "    repo.git.checkout(\"HEAD\", \".\")\n",
    "    repo.git.clean(\"-fd\")\n",
    "    commit = repo.commit(commit_hash)\n",
    "    try:\n",
    "        repo.git.checkout(commit_hash)\n",
    "    except git.GitCommandError as e:\n",
    "        print(os.path.basename(repo_path), commit_hash, e)\n",
    "        if repo.is_dirty(untracked_files=True):\n",
    "            repo.git.stash(\"save\", \"--include-untracked\")\n",
    "            repo.git.clean(\"-fd\")\n",
    "            repo.git.checkout(commit_hash)\n",
    "\n",
    "    stats = defaultdict(int)\n",
    "\n",
    "    for blob in commit.tree.traverse():\n",
    "        if blob.type == \"blob\":\n",
    "            try:\n",
    "                with open(os.path.join(repo_path, str(blob.path)), \"r\") as file:\n",
    "                    content = file.read()\n",
    "                    stats[\"num_chars\"] += len(content)\n",
    "                    stats[\"num_words\"] += len(content.split(\" \"))\n",
    "                    stats[\"num_lines\"] += len(content.split(\"\\n\"))\n",
    "            except:\n",
    "                continue\n",
    "\n",
    "    repo.git.checkout(\"HEAD\", \".\")\n",
    "    return stats"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-05T17:09:43.374599Z",
     "start_time": "2024-06-05T17:09:43.368601Z"
    }
   },
   "id": "79ef9015a3976da1",
   "execution_count": 24
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 21%|██▏       | 35/163 [00:11<00:56,  2.27it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mesonbuild__meson f21685a83330a4bbe1e59c3641a0d24f1efe8825 Cmd('git') failed due to: exit code(1)\n",
      "  cmdline: git checkout f21685a83330a4bbe1e59c3641a0d24f1efe8825\n",
      "  stderr: 'error: The following untracked working tree files would be overwritten by checkout:\n",
      "\ttest cases/fortran/2 modules/mymod.F90\n",
      "Please move or remove them before you switch branches.\n",
      "Aborting'\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 79%|███████▊  | 128/163 [01:13<00:04,  7.74it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "altair-viz__altair 846a842a6dbd6c7f989bff5232c697be94ffb7b1 Cmd('git') failed due to: exit code(1)\n",
      "  cmdline: git checkout 846a842a6dbd6c7f989bff5232c697be94ffb7b1\n",
      "  stderr: 'error: The following untracked working tree files would be overwritten by checkout:\n",
      "\tdoc/user_guide/API.rst\n",
      "Please move or remove them before you switch branches.\n",
      "Aborting'\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 163/163 [01:30<00:00,  1.80it/s]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "\n",
    "all_file_stats = []\n",
    "\n",
    "for repo, hash in tqdm(zip(df.repo, df.hash), total=len(df)):\n",
    "    all_file_stats.append(get_all_files_stats(os.path.join(data_dir, \"extracted_repos\", repo.replace(\"/\", \"__\")), hash))"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-05T17:11:14.525387Z",
     "start_time": "2024-06-05T17:09:43.939936Z"
    }
   },
   "id": "a93dd5e8b0428fbe",
   "execution_count": 25
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 22%|██▏       | 36/163 [00:13<00:45,  2.80it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mesonbuild__meson f21685a83330a4bbe1e59c3641a0d24f1efe8825 Cmd('git') failed due to: exit code(1)\n",
      "  cmdline: git checkout 6f3f43bb2d31797b0f3128e1664652571fe314e6\n",
      "  stderr: 'error: The following untracked working tree files would be overwritten by checkout:\n",
      "\ttest cases/fortran/2 modules/mymod.F90\n",
      "Please move or remove them before you switch branches.\n",
      "Aborting'\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 79%|███████▊  | 128/163 [00:46<00:05,  6.99it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "altair-viz__altair 846a842a6dbd6c7f989bff5232c697be94ffb7b1 Cmd('git') failed due to: exit code(1)\n",
      "  cmdline: git checkout ea9ae53a60a7fbb0516ea020c5c0846f479d2546\n",
      "  stderr: 'error: The following untracked working tree files would be overwritten by checkout:\n",
      "\tdoc/user_guide/API.rst\n",
      "Please move or remove them before you switch branches.\n",
      "Aborting'\n",
      "File altair/vegalite/v5/tests/test_api.py before commit not found for commit 846a842a6dbd6c7f989bff5232c697be94ffb7b1 in repo altair-viz__altair\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 163/163 [00:56<00:00,  2.88it/s]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "\n",
    "changed_files_before_commit_stats = []\n",
    "\n",
    "for repo, hash in tqdm(zip(df.repo, df.hash), total=len(df)):\n",
    "    changed_files_before_commit_stats.append(\n",
    "        get_changed_files_before_commit_stats(os.path.join(data_dir, \"extracted_repos\", repo.replace(\"/\", \"__\")), hash)\n",
    "    )"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-05T17:12:11.219659Z",
     "start_time": "2024-06-05T17:11:14.525230Z"
    }
   },
   "id": "327efc5957e44232",
   "execution_count": 26
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 22%|██▏       | 36/163 [00:10<00:41,  3.04it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mesonbuild__meson f21685a83330a4bbe1e59c3641a0d24f1efe8825 Cmd('git') failed due to: exit code(1)\n",
      "  cmdline: git checkout f21685a83330a4bbe1e59c3641a0d24f1efe8825\n",
      "  stderr: 'error: The following untracked working tree files would be overwritten by checkout:\n",
      "\ttest cases/fortran/2 modules/mymod.F90\n",
      "Please move or remove them before you switch branches.\n",
      "Aborting'\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 79%|███████▊  | 128/163 [00:39<00:04,  7.37it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "altair-viz__altair 846a842a6dbd6c7f989bff5232c697be94ffb7b1 Cmd('git') failed due to: exit code(1)\n",
      "  cmdline: git checkout 846a842a6dbd6c7f989bff5232c697be94ffb7b1\n",
      "  stderr: 'error: The following untracked working tree files would be overwritten by checkout:\n",
      "\tdoc/user_guide/API.rst\n",
      "Please move or remove them before you switch branches.\n",
      "Aborting'\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 163/163 [00:49<00:00,  3.30it/s]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "\n",
    "changed_files_after_commit_stats = []\n",
    "\n",
    "for repo, hash in tqdm(zip(df.repo, df.hash), total=len(df)):\n",
    "    try:\n",
    "        changed_files_after_commit_stats.append(\n",
    "            get_changed_files_after_commit_stats(\n",
    "                os.path.join(data_dir, \"extracted_repos\", repo.replace(\"/\", \"__\")), hash\n",
    "            )\n",
    "        )\n",
    "    except git.GitCommandError:  # TODO: idk what's happening here\n",
    "        continue"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-05T17:13:47.031936Z",
     "start_time": "2024-06-05T17:12:57.693215Z"
    }
   },
   "id": "c366d5eaf2de1965",
   "execution_count": 29
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 21%|██▏       | 35/163 [00:15<01:05,  1.94it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mesonbuild__meson f21685a83330a4bbe1e59c3641a0d24f1efe8825 Cmd('git') failed due to: exit code(1)\n",
      "  cmdline: git checkout 6f3f43bb2d31797b0f3128e1664652571fe314e6\n",
      "  stderr: 'error: The following untracked working tree files would be overwritten by checkout:\n",
      "\ttest cases/fortran/2 modules/mymod.F90\n",
      "Please move or remove them before you switch branches.\n",
      "Aborting'\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 22%|██▏       | 36/163 [00:16<01:08,  1.86it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mesonbuild__meson f21685a83330a4bbe1e59c3641a0d24f1efe8825 Cmd('git') failed due to: exit code(1)\n",
      "  cmdline: git checkout f21685a83330a4bbe1e59c3641a0d24f1efe8825\n",
      "  stderr: 'error: The following untracked working tree files would be overwritten by checkout:\n",
      "\ttest cases/fortran/2 modules/mymod.F90\n",
      "Please move or remove them before you switch branches.\n",
      "Aborting'\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 79%|███████▊  | 128/163 [01:04<00:08,  3.93it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "altair-viz__altair 846a842a6dbd6c7f989bff5232c697be94ffb7b1 Cmd('git') failed due to: exit code(1)\n",
      "  cmdline: git checkout ea9ae53a60a7fbb0516ea020c5c0846f479d2546\n",
      "  stderr: 'error: The following untracked working tree files would be overwritten by checkout:\n",
      "\tdoc/user_guide/API.rst\n",
      "Please move or remove them before you switch branches.\n",
      "Aborting'\n",
      "File altair/vegalite/v5/tests/test_api.py before commit not found for commit 846a842a6dbd6c7f989bff5232c697be94ffb7b1 in repo altair-viz__altair\n",
      "altair-viz__altair 846a842a6dbd6c7f989bff5232c697be94ffb7b1 Cmd('git') failed due to: exit code(1)\n",
      "  cmdline: git checkout 846a842a6dbd6c7f989bff5232c697be94ffb7b1\n",
      "  stderr: 'error: The following untracked working tree files would be overwritten by checkout:\n",
      "\tdoc/user_guide/API.rst\n",
      "Please move or remove them before you switch branches.\n",
      "Aborting'\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 163/163 [01:20<00:00,  2.02it/s]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "\n",
    "changed_files_full_stats = []\n",
    "\n",
    "for repo, hash in tqdm(zip(df.repo, df.hash), total=len(df)):\n",
    "    try:\n",
    "        changed_files_full_stats.append(\n",
    "            get_changed_files_full_stats(os.path.join(data_dir, \"extracted_repos\", repo.replace(\"/\", \"__\")), hash)\n",
    "        )\n",
    "    except git.GitCommandError:\n",
    "        continue"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-05T17:15:07.782471Z",
     "start_time": "2024-06-05T17:13:47.028707Z"
    }
   },
   "id": "e12916ded944fc5b",
   "execution_count": 30
  },
  {
   "cell_type": "markdown",
   "source": [
    "### Statistics"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "6482116f449481d1"
  },
  {
   "cell_type": "markdown",
   "source": [
    "#### Full repositories"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "aebd74b38a751ec9"
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "data": {
      "text/plain": "           count        mean         std       min        1%        5%  \\\nnum_chars 163.00 27551314.64 45993503.29 143958.00 150366.42 433326.40   \nnum_words 163.00  5298042.83  8981031.25  30055.00  31357.42 101402.40   \nnum_lines 163.00   737517.10  1348961.81   4116.00   4347.10  13112.60   \n\n                10%        25%        50%         75%          90%  \\\nnum_chars 735432.20 1575603.50 3621059.00 16665826.00 123637940.40   \nnum_words 155376.20  380582.50  714079.00  4428093.50  19158339.40   \nnum_lines  21951.80   45627.50  102059.00   339901.50   3947990.20   \n\n                   95%          99%          max  \nnum_chars 125353336.00 139253525.60 156086294.00  \nnum_words  22839020.90  35143839.36  35146112.00  \nnum_lines   3965309.40   4011263.44   5036935.00  ",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>count</th>\n      <th>mean</th>\n      <th>std</th>\n      <th>min</th>\n      <th>1%</th>\n      <th>5%</th>\n      <th>10%</th>\n      <th>25%</th>\n      <th>50%</th>\n      <th>75%</th>\n      <th>90%</th>\n      <th>95%</th>\n      <th>99%</th>\n      <th>max</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>num_chars</th>\n      <td>163.00</td>\n      <td>27551314.64</td>\n      <td>45993503.29</td>\n      <td>143958.00</td>\n      <td>150366.42</td>\n      <td>433326.40</td>\n      <td>735432.20</td>\n      <td>1575603.50</td>\n      <td>3621059.00</td>\n      <td>16665826.00</td>\n      <td>123637940.40</td>\n      <td>125353336.00</td>\n      <td>139253525.60</td>\n      <td>156086294.00</td>\n    </tr>\n    <tr>\n      <th>num_words</th>\n      <td>163.00</td>\n      <td>5298042.83</td>\n      <td>8981031.25</td>\n      <td>30055.00</td>\n      <td>31357.42</td>\n      <td>101402.40</td>\n      <td>155376.20</td>\n      <td>380582.50</td>\n      <td>714079.00</td>\n      <td>4428093.50</td>\n      <td>19158339.40</td>\n      <td>22839020.90</td>\n      <td>35143839.36</td>\n      <td>35146112.00</td>\n    </tr>\n    <tr>\n      <th>num_lines</th>\n      <td>163.00</td>\n      <td>737517.10</td>\n      <td>1348961.81</td>\n      <td>4116.00</td>\n      <td>4347.10</td>\n      <td>13112.60</td>\n      <td>21951.80</td>\n      <td>45627.50</td>\n      <td>102059.00</td>\n      <td>339901.50</td>\n      <td>3947990.20</td>\n      <td>3965309.40</td>\n      <td>4011263.44</td>\n      <td>5036935.00</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Show every float with exactly two decimals; this display option stays active\n",
    "# for the rest of the kernel session.\n",
    "pd.set_option(\"display.float_format\", lambda value: \"%.2f\" % value)\n",
    "\n",
    "# Summary statistics with extra tail percentiles, transposed so that each\n",
    "# metric is a row and each statistic is a column.\n",
    "pd.DataFrame(all_file_stats).describe(\n",
    "    percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]\n",
    ").transpose()"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-05T17:15:07.795184Z",
     "start_time": "2024-06-05T17:15:07.781235Z"
    }
   },
   "id": "fcfc3e36e4e6d096",
   "execution_count": 31
  },
  {
   "cell_type": "markdown",
   "source": [
    "#### Changed files (before commit)"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "d1318d7d72e0223c"
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "data": {
      "text/plain": "           count      mean       std     min      1%      5%      10%  \\\nnum_chars 163.00 110043.72 128096.72 3694.00 4877.34 8137.50 12402.40   \nnum_words 163.00  29335.04  38953.63  731.00  888.28 2031.20  3129.80   \nnum_lines 163.00   2754.34   2979.61   80.00  157.66  253.80   320.60   \n\n               25%      50%       75%       90%       95%       99%       max  \nnum_chars 26225.50 58855.00 150216.50 267828.20 410150.10 546357.64 690367.00  \nnum_words  6275.50 13917.00  37355.50  73427.40 105687.10 173032.20 259090.00  \nnum_lines   718.50  1495.00   3986.50   6895.80   9269.60  13248.38  15599.00  ",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>count</th>\n      <th>mean</th>\n      <th>std</th>\n      <th>min</th>\n      <th>1%</th>\n      <th>5%</th>\n      <th>10%</th>\n      <th>25%</th>\n      <th>50%</th>\n      <th>75%</th>\n      <th>90%</th>\n      <th>95%</th>\n      <th>99%</th>\n      <th>max</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>num_chars</th>\n      <td>163.00</td>\n      <td>110043.72</td>\n      <td>128096.72</td>\n      <td>3694.00</td>\n      <td>4877.34</td>\n      <td>8137.50</td>\n      <td>12402.40</td>\n      <td>26225.50</td>\n      <td>58855.00</td>\n      <td>150216.50</td>\n      <td>267828.20</td>\n      <td>410150.10</td>\n      <td>546357.64</td>\n      <td>690367.00</td>\n    </tr>\n    <tr>\n      <th>num_words</th>\n      <td>163.00</td>\n      <td>29335.04</td>\n      <td>38953.63</td>\n      <td>731.00</td>\n      <td>888.28</td>\n      <td>2031.20</td>\n      <td>3129.80</td>\n      <td>6275.50</td>\n      <td>13917.00</td>\n      <td>37355.50</td>\n      <td>73427.40</td>\n      <td>105687.10</td>\n      <td>173032.20</td>\n      <td>259090.00</td>\n    </tr>\n    <tr>\n      <th>num_lines</th>\n      <td>163.00</td>\n      <td>2754.34</td>\n      <td>2979.61</td>\n      <td>80.00</td>\n      <td>157.66</td>\n      <td>253.80</td>\n      <td>320.60</td>\n      <td>718.50</td>\n      <td>1495.00</td>\n      <td>3986.50</td>\n      <td>6895.80</td>\n      <td>9269.60</td>\n      <td>13248.38</td>\n      <td>15599.00</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# `pd` and the two-decimal float display format are already set up by an\n",
    "# earlier cell of this notebook and persist for the kernel session, so they\n",
    "# are not repeated here.\n",
    "# Summary statistics with extra tail percentiles, transposed so that each\n",
    "# metric is a row and each statistic is a column.\n",
    "pd.DataFrame(changed_files_before_commit_stats).describe(\n",
    "    percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]\n",
    ").T"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-05T17:15:07.806156Z",
     "start_time": "2024-06-05T17:15:07.797095Z"
    }
   },
   "id": "222d4a1729d4fa7f",
   "execution_count": 32
  },
  {
   "cell_type": "markdown",
   "source": [
    "#### Changed files (after commit)"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "5fbe7074f84bef9"
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "data": {
      "text/plain": "           count      mean       std     min      1%      5%      10%  \\\nnum_chars 161.00 109222.15 127191.31 4161.00 5434.00 9549.00 12661.00   \nnum_words 161.00  29055.65  38812.42  808.00 1023.80 1896.00  2957.00   \nnum_lines 161.00   2741.02   2964.80   95.00  179.60  270.00   349.00   \n\n               25%      50%       75%       90%       95%       99%       max  \nnum_chars 26524.00 57502.00 146896.00 261596.00 411628.00 552013.60 691296.00  \nnum_words  6293.00 13875.00  36675.00  67751.00 109455.00 175884.80 259183.00  \nnum_lines   738.00  1520.00   3960.00   6726.00   9287.00  13337.40  15620.00  ",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>count</th>\n      <th>mean</th>\n      <th>std</th>\n      <th>min</th>\n      <th>1%</th>\n      <th>5%</th>\n      <th>10%</th>\n      <th>25%</th>\n      <th>50%</th>\n      <th>75%</th>\n      <th>90%</th>\n      <th>95%</th>\n      <th>99%</th>\n      <th>max</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>num_chars</th>\n      <td>161.00</td>\n      <td>109222.15</td>\n      <td>127191.31</td>\n      <td>4161.00</td>\n      <td>5434.00</td>\n      <td>9549.00</td>\n      <td>12661.00</td>\n      <td>26524.00</td>\n      <td>57502.00</td>\n      <td>146896.00</td>\n      <td>261596.00</td>\n      <td>411628.00</td>\n      <td>552013.60</td>\n      <td>691296.00</td>\n    </tr>\n    <tr>\n      <th>num_words</th>\n      <td>161.00</td>\n      <td>29055.65</td>\n      <td>38812.42</td>\n      <td>808.00</td>\n      <td>1023.80</td>\n      <td>1896.00</td>\n      <td>2957.00</td>\n      <td>6293.00</td>\n      <td>13875.00</td>\n      <td>36675.00</td>\n      <td>67751.00</td>\n      <td>109455.00</td>\n      <td>175884.80</td>\n      <td>259183.00</td>\n    </tr>\n    <tr>\n      <th>num_lines</th>\n      <td>161.00</td>\n      <td>2741.02</td>\n      <td>2964.80</td>\n      <td>95.00</td>\n      <td>179.60</td>\n      <td>270.00</td>\n      <td>349.00</td>\n      <td>738.00</td>\n      <td>1520.00</td>\n      <td>3960.00</td>\n      <td>6726.00</td>\n      <td>9287.00</td>\n      <td>13337.40</td>\n      <td>15620.00</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# `pd` and the two-decimal float display format are already set up by an\n",
    "# earlier cell of this notebook and persist for the kernel session, so they\n",
    "# are not repeated here.\n",
    "# Summary statistics with extra tail percentiles, transposed so that each\n",
    "# metric is a row and each statistic is a column.\n",
    "pd.DataFrame(changed_files_after_commit_stats).describe(\n",
    "    percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]\n",
    ").T"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-05T17:15:07.815198Z",
     "start_time": "2024-06-05T17:15:07.803601Z"
    }
   },
   "id": "141feb987029fabb",
   "execution_count": 33
  },
  {
   "cell_type": "markdown",
   "source": [
    "#### Changed files (full)"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "6fd7857313b0150b"
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "data": {
      "text/plain": "           count      mean       std     min       1%       5%      10%  \\\nnum_chars 161.00 217227.40 254058.25 8093.00 11028.80 19504.00 25150.00   \nnum_words 161.00  57859.11  77569.59 1718.00  2021.80  4220.00  5555.00   \nnum_lines 161.00   5450.88   5924.83  175.00   359.20   568.00   654.00   \n\n               25%       50%       75%       90%       95%        99%  \\\nnum_chars 50868.00 116308.00 293780.00 524357.00 821099.00 1102607.80   \nnum_words 12572.00  27646.00  73037.00 135488.00 215250.00  351514.80   \nnum_lines  1449.00   2994.00   7918.00  13493.00  18568.00   26650.80   \n\n                 max  \nnum_chars 1381663.00  \nnum_words  518273.00  \nnum_lines   31219.00  ",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>count</th>\n      <th>mean</th>\n      <th>std</th>\n      <th>min</th>\n      <th>1%</th>\n      <th>5%</th>\n      <th>10%</th>\n      <th>25%</th>\n      <th>50%</th>\n      <th>75%</th>\n      <th>90%</th>\n      <th>95%</th>\n      <th>99%</th>\n      <th>max</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>num_chars</th>\n      <td>161.00</td>\n      <td>217227.40</td>\n      <td>254058.25</td>\n      <td>8093.00</td>\n      <td>11028.80</td>\n      <td>19504.00</td>\n      <td>25150.00</td>\n      <td>50868.00</td>\n      <td>116308.00</td>\n      <td>293780.00</td>\n      <td>524357.00</td>\n      <td>821099.00</td>\n      <td>1102607.80</td>\n      <td>1381663.00</td>\n    </tr>\n    <tr>\n      <th>num_words</th>\n      <td>161.00</td>\n      <td>57859.11</td>\n      <td>77569.59</td>\n      <td>1718.00</td>\n      <td>2021.80</td>\n      <td>4220.00</td>\n      <td>5555.00</td>\n      <td>12572.00</td>\n      <td>27646.00</td>\n      <td>73037.00</td>\n      <td>135488.00</td>\n      <td>215250.00</td>\n      <td>351514.80</td>\n      <td>518273.00</td>\n    </tr>\n    <tr>\n      <th>num_lines</th>\n      <td>161.00</td>\n      <td>5450.88</td>\n      <td>5924.83</td>\n      <td>175.00</td>\n      <td>359.20</td>\n      <td>568.00</td>\n      <td>654.00</td>\n      <td>1449.00</td>\n      <td>2994.00</td>\n      <td>7918.00</td>\n      <td>13493.00</td>\n      <td>18568.00</td>\n      <td>26650.80</td>\n      <td>31219.00</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# `pd` and the two-decimal float display format are already set up by an\n",
    "# earlier cell of this notebook and persist for the kernel session, so they\n",
    "# are not repeated here.\n",
    "# Summary statistics with extra tail percentiles, transposed so that each\n",
    "# metric is a row and each statistic is a column.\n",
    "pd.DataFrame(changed_files_full_stats).describe(\n",
    "    percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]\n",
    ").T"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-05T17:15:07.832617Z",
     "start_time": "2024-06-05T17:15:07.811990Z"
    }
   },
   "id": "2d2c1c18b9040f1",
   "execution_count": 34
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
