{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ba1887c2-65ab-4dd2-abc9-201868f9e7fe",
   "metadata": {},
   "source": [
    "### Python packages used in this code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "90da4964-9e85-4919-8bdd-d9c9c95dbc27",
   "metadata": {},
   "outputs": [],
   "source": [
    "import platform\n",
    "import sys\n",
    "import os\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import pickle\n",
    "import matplotlib.pyplot as plt\n",
    "from pylab import rcParams\n",
    "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
    "import seaborn as sns\n",
    "\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "12680a1d-9c0e-425b-b7e2-62ac94b40b4c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Environment values recorded in this notebook (kept for reference;\n",
    "# presumably the setup the notebook was originally run on):\n",
    "#\n",
    "#   --Platform--\n",
    "#   OS : Windows-10-10.0.19044-SP0\n",
    "#   --Version--\n",
    "#   python :  3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]\n",
    "#   numpy : 1.23.1\n",
    "#   pandas : 1.4.3\n",
    "\n",
    "# Print the same report for the environment currently running the notebook.\n",
    "print('--Platform--')\n",
    "print('OS :', platform.platform())\n",
    "\n",
    "print('--Version--')\n",
    "for label, version in (\n",
    "    ('python : ', sys.version),\n",
    "    ('numpy :', np.__version__),\n",
    "    ('pandas :', pd.__version__),\n",
    "):\n",
    "    print(label, version)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "72cfd490-d68d-41b8-8a2f-f2eacac61715",
   "metadata": {
    "tags": []
   },
   "source": [
    "# Preparation"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "08de4ca2-2199-4bf8-be88-fb16d77877ee",
   "metadata": {
    "tags": []
   },
   "source": [
    "## Create output directories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2a380489-70f8-48da-9798-9457c2b70210",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Output folders for the plots, CSV files and pickles of this notebook.\n",
    "# exist_ok=True keeps the cell safe to re-run and removes the gap between\n",
    "# checking for a directory and creating it.\n",
    "for out_dir in (\n",
    "    '../30_Output/20_Plot/100_CheckData',\n",
    "    '../30_Output/30_csv/100_CheckData',\n",
    "    '../30_Output/40_pkl/100_CheckData',\n",
    "):\n",
    "    os.makedirs(out_dir, exist_ok=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f929cec7-c34c-4b37-ad7d-091a615c586d",
   "metadata": {},
   "source": [
    "# Main code"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fe7e7849-b441-4e75-968b-3d333ac6184f",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4b9e952b-b83d-479a-86dc-af2f21bb789d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# x: input table; the column at position 2 is used as the row index.\n",
    "x_all = pd.read_csv('../10_Data/PI_lib_FFKM.csv', index_col=2)\n",
    "\n",
    "# y: Cp (PolyInfo) table. The masked file is the active one; the unmasked\n",
    "# variant is kept as a commented-out alternative.\n",
    "# y_all = pd.read_csv('../10_Data/PI_Cp.csv', index_col=0)\n",
    "y_all = pd.read_csv('../10_Data/PI_Cp_masked.csv', index_col=0)\n",
    "\n",
    "# ys: Cp (MD) table, indexed by its first column.\n",
    "ys_all = pd.read_csv('../10_Data/MD_PI1070.csv', index_col=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bcbd7c13-c26f-4317-8e22-e9804381f3ba",
   "metadata": {},
   "source": [
    "## Check data size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c3d4f87d-307d-498e-ab07-437cc83a8a0e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Presence flags: one Series of 1s per table, indexed by that table's row labels.\n",
    "idx_x, idx_y, idx_ys = [\n",
    "    pd.Series(np.repeat(1, len(table.index.values)), index=table.index.values).rename(flag)\n",
    "    for flag, table in (('x', x_all), ('y', y_all), ('ys', ys_all))\n",
    "]\n",
    "\n",
    "# Outer-join on the labels; a label missing from a table gets 0 (0: not exist, 1: exist).\n",
    "df_idx = pd.concat([idx_x, idx_y, idx_ys], axis=1, join='outer').fillna(0)\n",
    "\n",
    "# Count how many labels fall into each of the 8 presence combinations.\n",
    "df_count = pd.DataFrame(columns=['x : Input','y : Cp(PolyInfo)','ys : Cp(MD)','Count'])\n",
    "flag_combinations = [(ix, iy, iys) for ix in [0,1] for iy in [0,1] for iys in [0,1]]\n",
    "for ix, iy, iys in flag_combinations:\n",
    "    in_combination = (df_idx['x']==ix)&(df_idx['y']==iy)&(df_idx['ys']==iys)\n",
    "    df_count.loc[f'{ix}{iy}{iys}'] = [ix, iy, iys, sum(in_combination)]\n",
    "\n",
    "print(df_count)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "791bf4c3-da52-4c11-8638-6aba1a4e2905",
   "metadata": {},
   "source": [
    "## Save data to be used"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "493e95f1-011b-4397-ba47-9a3d37fa0c61",
   "metadata": {},
   "outputs": [],
   "source": [
    "# PIDs of the data: labels whose x, y and ys flags are all 1\n",
    "in_all_tables = df_idx[['x', 'y', 'ys']].eq(1).all(axis=1)\n",
    "PID = df_idx.loc[in_all_tables].index\n",
    "print(PID)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2980418a-9006-42b5-94c1-b276cb326cbd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inputs, outputs and source features, restricted to the common PIDs\n",
    "x = x_all.loc[PID].iloc[:,2:]  # drops the first two columns of x_all\n",
    "y = y_all.loc[PID, 'Cp mean']\n",
    "ys = ys_all.loc[PID, 'Cp']\n",
    "\n",
    "# Bundle the full tables, the presence counts and the selected subsets\n",
    "data_dict = {\n",
    "    'x_all': x_all,\n",
    "    'y_all' : y_all,\n",
    "    'ys_all' : ys_all,\n",
    "    'idx_count' : df_count,\n",
    "    'PID' : PID,\n",
    "    'x' : x,\n",
    "    'y' : y,\n",
    "    'ys' : ys\n",
    "}\n",
    "\n",
    "# Save. The context manager closes the file even if pickling fails.\n",
    "# Alternative target for the unmasked data:\n",
    "# '../30_Output/40_pkl/100_CheckData/100_Data.pkl'\n",
    "with open('../30_Output/40_pkl/100_CheckData/100_Data_masked.pkl','wb') as f:\n",
    "    pickle.dump(data_dict, f)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c4de4b68-ca4d-4d5d-abca-14181c96e1f6",
   "metadata": {},
   "source": [
    "## Check data distribution"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2cc392c8-d79e-4fac-8f7a-0a8feeee97c9",
   "metadata": {},
   "source": [
    "### Output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ea3f1fe2-d309-4e8e-9bde-76b3f7e212ad",
   "metadata": {},
   "outputs": [],
   "source": [
    "# For paper: joint scatter plot with marginal histograms of the two Cp series\n",
    "df_plt = pd.DataFrame([y,ys], index=['CP_PI','CP_MD']).T\n",
    "\n",
    "cp_range = (500,5000)\n",
    "marginal_style = {'bins':20, 'color':'steelblue'}\n",
    "joint_style = {'fit_reg':False, 'scatter_kws':{'s':50, 'edgecolor':'white'}}\n",
    "# Styling shared by the background grid and the diagonal reference line\n",
    "guide_style = {'color':'gray', 'linestyle':'dotted', 'linewidth':1.1, 'alpha':0.4}\n",
    "\n",
    "sns.set_style(style='ticks')\n",
    "g = sns.jointplot(\n",
    "    data=df_plt,\n",
    "    x='CP_PI',\n",
    "    y='CP_MD',\n",
    "    kind=\"reg\",\n",
    "    xlim=cp_range,\n",
    "    ylim=cp_range,\n",
    "    marginal_kws=marginal_style,\n",
    "    joint_kws=joint_style\n",
    ")\n",
    "\n",
    "ax_main = g.ax_joint\n",
    "ax_main.grid(zorder=1, **guide_style)\n",
    "# Diagonal through (0, 0) and (.1, .1), i.e. the line y = x\n",
    "ax_main.axline((0, 0), (.1, .1), zorder=2, **guide_style)\n",
    "ax_main.set_xlabel(r'Experimentally-observed heat capacity (log J/kg$\\cdot$K)', size=13, labelpad=8)\n",
    "ax_main.set_ylabel(r'MD-calculated heat capacity (log J/kg$\\cdot$K)', size=13, labelpad=8)\n",
    "# NOTE(review): the correlation value below is a hard-coded literal, not computed\n",
    "# from df_plt in this cell; confirm it matches the plotted data.\n",
    "ax_main.text(2590,700,'Correlation : 0.7788', size=18, backgroundcolor='white')\n",
    "\n",
    "plt.savefig('../30_Output/20_Plot/100_CheckData/100_CheckData_y_paper_joint.pdf')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
