{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import os.path as osp\n",
    "from pathlib import Path\n",
    "\n",
    "# Project root: the working directory at the moment this cell runs.\n",
    "BASE_MODULO_PATH = Path.cwd()\n",
    "\n",
    "# Parent folder expected to contain a \"graphs\" sub-folder with the real-world edge lists.\n",
    "# NOTE(review): the value below looks like a placeholder - point it at the local data copy.\n",
    "DATA_PATH = Path(\"/path/to/a/folder/containing/another/folder/named/graphs/with/all/real/world/graphs/\")\n",
    "REAL_DATA_PATH = DATA_PATH.joinpath(\"graphs\")\n",
    "\n",
    "# Destination for the figures saved by the plotting cells; created if missing.\n",
    "SAVE_PATH = BASE_MODULO_PATH.joinpath(\n",
    "    \"excluded\", \"hephaestus_storage_unit\", \"plots\", \"stats_real_dataset\"\n",
    ")\n",
    "SAVE_PATH.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "# BASE_MODULO_PATH was just read from the current directory, so this leaves it unchanged.\n",
    "os.chdir(BASE_MODULO_PATH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import networkx as nx\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "from matplotlib import pyplot as plt\n",
    "\n",
    "# Silence pandas' chained-assignment warning for the whole session (pandas default: \"warn\").\n",
    "pd.set_option(\"mode.chained_assignment\", None)\n",
    "\n",
    "# Baseline plot appearance; the plotting cells call set_context again with a larger font_scale.\n",
    "sns.set_context(\"paper\", font_scale=2)\n",
    "sns.set_style(\"whitegrid\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Colour palette as hex strings. The boxplot cells pass the slice p[1:3] as their palette.\n",
    "# Eight-digit entries carry a trailing alpha byte (\"FF\").\n",
    "p = [\n",
    "    \"#000000\",  # index 0\n",
    "    \"#E69F00\", \"#56B4E9\",  # indices 1-2: the two colours used for the hue levels\n",
    "    \"#009E73\", \"#FB6467FF\", \"#808282\", \"#F0E442\",\n",
    "    \"#440154FF\", \"#0072B2\", \"#D55E00\", \"#CC79A7\",\n",
    "    \"#C2CD23\", \"#918BC3\",\n",
    "    \"#FFFFFF\",  # index 13\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parallel per-network records: position k of every list describes the same file.\n",
    "edge_stats = []\n",
    "node_stats = []\n",
    "cluster_coef_stats = []  # stays empty while the clustering computation below is disabled\n",
    "\n",
    "data_size = []\n",
    "category = []\n",
    "network_names = []\n",
    "skipped_entries = []\n",
    "\n",
    "# File names are parsed as \"{sreal|mlreal}{category}@{network name}.csv\".\n",
    "# os.listdir returns entries in arbitrary order, so sort to make the row order reproducible.\n",
    "for file_name in sorted(os.listdir(REAL_DATA_PATH)):\n",
    "    name_parts = file_name.split(\"@\")\n",
    "    prefix = name_parts[0]\n",
    "\n",
    "    if \"mlreal\" in prefix:\n",
    "        scale_label, scale_marker = \"Medium-Large\", \"mlreal\"\n",
    "    elif \"sreal\" in prefix:\n",
    "        scale_label, scale_marker = \"Small\", \"sreal\"\n",
    "    else:\n",
    "        scale_label, scale_marker = None, None\n",
    "\n",
    "    # Validate the name before appending anything, so all lists keep the same length\n",
    "    # and a stray entry (e.g. a hidden file) does not abort the whole scan.\n",
    "    if scale_marker is None or len(name_parts) == 1:\n",
    "        skipped_entries.append(file_name)\n",
    "        continue\n",
    "\n",
    "    # Space-separated edge list: first two columns are the endpoints, \"%\" starts a comment.\n",
    "    edge_list = pd.read_csv(\n",
    "        osp.join(REAL_DATA_PATH, file_name),\n",
    "        sep=\" \",\n",
    "        header=None,\n",
    "        usecols=[0, 1],\n",
    "        names=[\"source\", \"target\"],\n",
    "        comment=\"%\",\n",
    "    )\n",
    "    graph = nx.from_pandas_edgelist(edge_list)\n",
    "\n",
    "    edge_stats.append(graph.number_of_edges())\n",
    "    node_stats.append(graph.number_of_nodes())\n",
    "    # Disabled: cluster_coef_stats.append(nx.average_clustering(graph))\n",
    "\n",
    "    data_size.append(scale_label)\n",
    "    category.append(prefix.split(scale_marker)[1])\n",
    "    network_names.append(name_parts[1].split(\".csv\")[0])\n",
    "\n",
    "if skipped_entries:\n",
    "    print(f\"Skipped entries with an unexpected name: {skipped_entries}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One row per network. The \"node_edge_ration\" column holds edges divided by nodes;\n",
    "# the key is kept as-is because the LaTeX cell renames it to \"Edge/Node\".\n",
    "df = pd.DataFrame(\n",
    "    {\n",
    "        \"nodes\": node_stats,\n",
    "        \"edges\": edge_stats,\n",
    "        # \"cluster_coef\": cluster_coef_stats,\n",
    "        \"node_edge_ration\": [\n",
    "            n_edges / n_nodes for n_edges, n_nodes in zip(edge_stats, node_stats)\n",
    "        ],\n",
    "        \"category\": category,\n",
    "        \"network_names\": network_names,\n",
    "        \"Scale Category\": data_size,\n",
    "    }\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Breakdown of nodes and edges for real dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.set_context(\"paper\", font_scale=4)\n",
    "\n",
    "# Node counts marked on the plot with red dash-dotted vertical lines.\n",
    "# NOTE(review): the origin of these three values is not shown in this notebook - confirm and document.\n",
    "NODE_COUNT_MARKERS = (641, 1577, 2993)\n",
    "\n",
    "# Explicit figure/axes handles instead of the implicit pyplot \"current figure\".\n",
    "fig, ax = plt.subplots(figsize=(29, 21))\n",
    "sns.boxplot(\n",
    "    data=df,\n",
    "    x=\"nodes\",\n",
    "    y=\"category\",\n",
    "    hue=\"Scale Category\",\n",
    "    orient=\"y\",\n",
    "    palette=p[1:3],\n",
    "    saturation=0.95,\n",
    "    width=0.75,\n",
    "    linewidth=2,\n",
    "    whiskerprops={\"color\": \"black\", \"alpha\": 1.0, \"linewidth\": 3},\n",
    "    flierprops={\n",
    "        \"marker\": \"o\",\n",
    "        \"markerfacecolor\": \"black\",\n",
    "        \"markeredgecolor\": \"black\",\n",
    "        \"linewidth\": 1,\n",
    "        \"alpha\": 0.35,\n",
    "    },\n",
    "    fliersize=10,\n",
    "    ax=ax,\n",
    ")\n",
    "ax.set_xscale(\"log\")\n",
    "ax.set_xlabel(\"Node Count\")\n",
    "ax.set_ylabel(\"\")\n",
    "ax.legend(title=\"Scale Category\")\n",
    "\n",
    "# zorder=0 keeps the marker lines behind the boxes.\n",
    "for marker in NODE_COUNT_MARKERS:\n",
    "    ax.axvline(x=marker, c=\"red\", linestyle=\"-.\", zorder=0, alpha=0.7, lw=3)\n",
    "\n",
    "fig.tight_layout()\n",
    "fig.savefig(SAVE_PATH / \"nodes_real_set.pdf\", dpi=1200, bbox_inches=\"tight\")\n",
    "# The figure is only written to disk; close it so it is not rendered inline or kept in memory.\n",
    "plt.close(fig)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.set_context(\"paper\", font_scale=4)\n",
    "\n",
    "# Edge counts marked on the plot with red dash-dotted vertical lines.\n",
    "# NOTE(review): the origin of these three values is not shown in this notebook - confirm and document.\n",
    "EDGE_COUNT_MARKERS = (1526, 4066, 10820)\n",
    "\n",
    "# Explicit figure/axes handles instead of the implicit pyplot \"current figure\".\n",
    "fig, ax = plt.subplots(figsize=(29, 21))\n",
    "sns.boxplot(\n",
    "    data=df,\n",
    "    x=\"edges\",\n",
    "    y=\"category\",\n",
    "    hue=\"Scale Category\",\n",
    "    orient=\"y\",\n",
    "    palette=p[1:3],\n",
    "    saturation=0.95,\n",
    "    width=0.75,\n",
    "    linewidth=2,\n",
    "    whiskerprops={\"color\": \"black\", \"alpha\": 1.0, \"linewidth\": 3},\n",
    "    flierprops={\n",
    "        \"marker\": \"o\",\n",
    "        \"markerfacecolor\": \"black\",\n",
    "        \"markeredgecolor\": \"black\",\n",
    "        \"linewidth\": 1,\n",
    "        \"alpha\": 0.35,\n",
    "    },\n",
    "    fliersize=10,\n",
    "    ax=ax,\n",
    ")\n",
    "ax.set_xscale(\"log\")\n",
    "ax.set_xlabel(\"Edge Count\")\n",
    "ax.set_ylabel(\"\")\n",
    "ax.legend(title=\"Scale Category\")\n",
    "\n",
    "# zorder=0 keeps the marker lines behind the boxes.\n",
    "for marker in EDGE_COUNT_MARKERS:\n",
    "    ax.axvline(x=marker, c=\"red\", linestyle=\"-.\", zorder=0, alpha=0.7, lw=3)\n",
    "\n",
    "fig.tight_layout()\n",
    "fig.savefig(SAVE_PATH / \"edges_real_set.pdf\", dpi=1200, bbox_inches=\"tight\")\n",
    "# The figure is only written to disk; close it so it is not rendered inline or kept in memory.\n",
    "plt.close(fig)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---\n",
    "\n",
    "## Generate unformatted LaTeX table of real networks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Presentation copy with human-readable headers; `df` itself is left untouched.\n",
    "# Keys that are not columns of `df` (e.g. \"cluster_coef\") are simply ignored by rename.\n",
    "df_l = df.copy(deep=True).rename(\n",
    "    columns={\n",
    "        \"nodes\": \"Num Nodes\",\n",
    "        \"edges\": \"Num Edges\",\n",
    "        \"cluster_coef\": \"Cluster Coefficient\",\n",
    "        \"node_edge_ration\": \"Edge/Node\",\n",
    "        \"category\": \"Type\",\n",
    "        \"network_names\": \"Network Name\",\n",
    "    }\n",
    ")\n",
    "\n",
    "# The ratio is rounded in df_l but left out of the printed table below.\n",
    "df_l[\"Edge/Node\"] = df_l[\"Edge/Node\"].round(3)\n",
    "\n",
    "latex_table = (\n",
    "    df_l.drop(columns=[\"Edge/Node\"])\n",
    "    .sort_values(by=[\"Scale Category\", \"Type\", \"Num Nodes\", \"Num Edges\"])\n",
    "    .to_latex(index=False, float_format=\"{:.3f}\".format)\n",
    ")\n",
    "# print (rather than rich display) so the raw LaTeX source can be copied verbatim.\n",
    "print(latex_table)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "env_of_truth",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
