{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "4f68be2e",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\oriel\\anaconda3\\lib\\site-packages\\scipy\\__init__.py:155: UserWarning: A NumPy version >=1.18.5 and <1.25.0 is required for this version of SciPy (detected version 1.25.2\n",
      "  warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion}\"\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn import preprocessing\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.linear_model import LogisticRegression, LinearRegression\n",
    "from sklearn.metrics import accuracy_score, mean_squared_error, r2_score, precision_score, recall_score, mean_absolute_error\n",
    "import scipy.stats as stats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "60c93771",
   "metadata": {},
   "outputs": [],
   "source": [
    "def cv_shuffle(df, n_splits=5, random_state=None):\n",
    "    \"\"\"Shuffle ``df`` and split it into ``n_splits`` consecutive partitions.\n",
    "\n",
    "    The first ``n_splits - 1`` partitions each hold ``len(df) // n_splits``\n",
    "    rows; the last partition additionally receives the remainder rows.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas.DataFrame\n",
    "        Frame to shuffle and split. It is not modified.\n",
    "    n_splits : int, default 5\n",
    "        Number of partitions to return.\n",
    "    random_state : int, numpy.random.RandomState or None, default None\n",
    "        Forwarded to ``DataFrame.sample``. Pass an int to make the shuffle\n",
    "        reproducible across kernel restarts.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list of pandas.DataFrame\n",
    "        ``n_splits`` frames, each with a fresh 0-based index.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If ``n_splits`` is smaller than 1.\n",
    "    \"\"\"\n",
    "    if n_splits < 1:\n",
    "        raise ValueError('n_splits must be at least 1')\n",
    "\n",
    "    # Shuffle the rows and reset the index\n",
    "    shuffled_df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)\n",
    "\n",
    "    # Row offsets of the partition boundaries; the last boundary is pinned to\n",
    "    # the total length so the final partition absorbs the remainder rows.\n",
    "    total_length = len(shuffled_df)\n",
    "    base_length = total_length // n_splits\n",
    "    bounds = [i * base_length for i in range(n_splits)] + [total_length]\n",
    "\n",
    "    # Slice out each partition and give it its own 0-based index\n",
    "    return [\n",
    "        shuffled_df.iloc[start:stop].reset_index(drop=True)\n",
    "        for start, stop in zip(bounds[:-1], bounds[1:])\n",
    "    ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "803a9281",
   "metadata": {},
   "outputs": [],
   "source": [
    "# CI\n",
    "def calc_ci(data, confidence=0.95):\n",
    "    \"\"\"Compute the mean and a confidence interval of the mean for each metric.\n",
    "\n",
    "    The interval is a Student-t interval built from the sample standard\n",
    "    error (``ddof=1``). A normal interval based on the population standard\n",
    "    deviation is too narrow when only a handful of scores is available.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    data : dict\n",
    "        Maps a metric name to a sequence of numeric scores.\n",
    "    confidence : float, default 0.95\n",
    "        Confidence level of the interval.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    dict\n",
    "        ``{name: {'mean': value, 'confidence_interval': (low, high)}}``.\n",
    "        With fewer than two scores the interval is ``(nan, nan)`` because the\n",
    "        spread cannot be estimated; with identical scores it collapses to\n",
    "        ``(mean, mean)``.\n",
    "\n",
    "    Notes\n",
    "    -----\n",
    "    The result is also printed before it is returned.\n",
    "    \"\"\"\n",
    "    result = {}\n",
    "\n",
    "    for key, values in data.items():\n",
    "        scores = np.asarray(values, dtype=float)\n",
    "        n_scores = scores.size\n",
    "        mean_value = np.mean(scores) if n_scores else np.nan\n",
    "\n",
    "        if n_scores < 2:\n",
    "            # A single score (or none) carries no information about the spread\n",
    "            confidence_interval = (np.nan, np.nan)\n",
    "        else:\n",
    "            # Standard error of the mean using the unbiased sample std\n",
    "            std_err = np.std(scores, ddof=1) / np.sqrt(n_scores)\n",
    "            if std_err == 0:\n",
    "                # scipy rejects scale=0; the interval degenerates to a point\n",
    "                confidence_interval = (mean_value, mean_value)\n",
    "            else:\n",
    "                confidence_interval = stats.t.interval(\n",
    "                    confidence, n_scores - 1, loc=mean_value, scale=std_err\n",
    "                )\n",
    "\n",
    "        result[key] = {'mean': mean_value, 'confidence_interval': confidence_interval}\n",
    "\n",
    "    print(result)\n",
    "    return result"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fd0c4a75",
   "metadata": {},
   "source": [
    "## Classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "d9e018d2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the Adult dataset from the 'clean' datasets folder.\n",
    "# Per the file name this is presumably the already-scaled version -- TODO confirm.\n",
    "adult = pd.read_csv('datasets/clean/adult_scaled.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "25e4170e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Recreate the model's train/validation split: a seeded 70% sample is the\n",
    "# training part, every row that was not sampled is held out for validation.\n",
    "train_set = adult.sample(frac=0.7, random_state=42)\n",
    "val_set = adult[~adult.index.isin(train_set.index)]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ae74f433",
   "metadata": {},
   "source": [
    "#### No Feedback"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "d37cf341",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the synthetic sample and discard its 'Unnamed: 0' column.\n",
    "syn_data_nof = pd.read_csv('sampled/adult_nof_100e_500b.csv').drop(columns='Unnamed: 0')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "0e00a752",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "precision:0.572289156626506, recall:0.4498493327593629\n",
      "precision:0.5700619020821609, recall:0.4360740421868274\n",
      "precision:0.5719844357976653, recall:0.4429616874730951\n",
      "precision:0.5740011254924029, recall:0.4390873869995695\n",
      "precision:0.5795257374204743, recall:0.4313387860525183\n"
     ]
    }
   ],
   "source": [
    "# Fit one logistic regression per synthetic partition and score each on val_set.\n",
    "# Assumes the last column is the label in both the synthetic and validation frames -- TODO confirm.\n",
    "dfs = cv_shuffle(syn_data_nof)\n",
    "X_val, y_val = val_set.iloc[:, :-1], val_set.iloc[:, -1]\n",
    "res_dict = {'precision':[], 'recall':[]}\n",
    "for part in dfs:\n",
    "    logreg = LogisticRegression()\n",
    "    logreg.fit(part.iloc[:, :-1], part.iloc[:, -1])\n",
    "    # Predict once and reuse the predictions for both metrics\n",
    "    y_pred = logreg.predict(X_val)\n",
    "    precision = precision_score(y_val, y_pred)\n",
    "    recall = recall_score(y_val, y_pred)\n",
    "    res_dict['precision'].append(precision)\n",
    "    res_dict['recall'].append(recall)\n",
    "    print(f'precision:{precision}, recall:{recall}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "daca56ca",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'precision': {'mean': 0.5735724714838419, 'confidence_interval': (0.570742566600792, 0.5764023763668918)}, 'recall': {'mean': 0.4398622470942747, 'confidence_interval': (0.4343610823662003, 0.44536341182234906)}}\n"
     ]
    }
   ],
   "source": [
    "# Summarise the per-partition precision/recall lists with a mean and confidence interval\n",
    "# (calc_ci also prints the summary).\n",
    "res = calc_ci(res_dict)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "eb4f2e8e",
   "metadata": {},
   "source": [
    "#### Feedback"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "61b8e9e6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the synthetic sample and discard its 'Unnamed: 0' column.\n",
    "syn_data_feedback = pd.read_csv('sampled/adult_feedback_100e_500b.csv').drop(columns='Unnamed: 0')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "51d79a25",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Shuffle the synthetic rows and split them into five partitions.\n",
    "# No seed is passed, so the partitions differ from run to run.\n",
    "dfs = cv_shuffle(syn_data_feedback)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "64d36ad2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit one logistic regression per synthetic partition and score each on val_set.\n",
    "# Assumes the last column is the label in both the synthetic and validation frames -- TODO confirm.\n",
    "X_val, y_val = val_set.iloc[:, :-1], val_set.iloc[:, -1]\n",
    "res_dict = {'precision':[], 'recall':[]}\n",
    "for part in dfs:\n",
    "    logreg = LogisticRegression()\n",
    "    logreg.fit(part.iloc[:, :-1], part.iloc[:, -1])\n",
    "    # Predict once and reuse the predictions for both metrics\n",
    "    y_pred = logreg.predict(X_val)\n",
    "    precision = precision_score(y_val, y_pred)\n",
    "    recall = recall_score(y_val, y_pred)\n",
    "    res_dict['precision'].append(precision)\n",
    "    res_dict['recall'].append(recall)\n",
    "    print(f'precision:{precision}, recall:{recall}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "40cc5072",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Summarise the per-partition precision/recall lists with a mean and confidence interval\n",
    "# (calc_ci also prints the summary).\n",
    "res = calc_ci(res_dict)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "eaeddf2e",
   "metadata": {},
   "source": [
    "## Regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "c0e1cf31",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the house-price dataset from the 'clean' datasets folder.\n",
    "house = pd.read_csv('datasets/clean/house_price.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "42d6016c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Recreate the model's train/validation split: a seeded 70% sample is the\n",
    "# training part, every row that was not sampled is held out for validation.\n",
    "train_set = house.sample(frac=0.7, random_state=42)\n",
    "val_set = house[~house.index.isin(train_set.index)]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dde0f456",
   "metadata": {},
   "source": [
    "#### No Feedback"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "50883812",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the synthetic sample and discard its 'Unnamed: 0' column.\n",
    "syn_data_nof = pd.read_csv('sampled/house_nof_500e_200b.csv').drop(columns='Unnamed: 0')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "0b426030",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the house-price CSV from the 'raw' datasets folder.\n",
    "# NOTE(review): `data` is not referenced by any other cell in this notebook --\n",
    "# confirm whether this load is still needed.\n",
    "data = pd.read_csv('datasets/raw/house_price.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "df421df9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Shuffle the synthetic rows and split them into five partitions.\n",
    "# No seed is passed, so the partitions differ from run to run.\n",
    "dfs = cv_shuffle(syn_data_nof)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "7d6df5e3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "rmse:0.011593275383325295, r2:0.3806921606739512\n",
      "rmse:0.011790829085683758, r2:0.3594058601632941\n",
      "rmse:0.012123981225061274, r2:0.32269421726899383\n",
      "rmse:0.011613713248734208, r2:0.37850667181156883\n",
      "rmse:0.01176469542093801, r2:0.36224239007582903\n"
     ]
    }
   ],
   "source": [
    "# Fit one linear regression per synthetic partition and score each on val_set.\n",
    "# Assumes the last column is the target in both the synthetic and validation frames -- TODO confirm.\n",
    "X_val, y_val = val_set.iloc[:, :-1], val_set.iloc[:, -1]\n",
    "res_dict = {'rmse':[], 'r2':[]}\n",
    "for part in dfs:\n",
    "    linreg = LinearRegression()\n",
    "    linreg.fit(part.iloc[:, :-1], part.iloc[:, -1])\n",
    "    # Predict once and reuse the predictions for both metrics\n",
    "    y_pred = linreg.predict(X_val)\n",
    "    rmse = np.sqrt(mean_squared_error(y_val, y_pred))\n",
    "    r2 = r2_score(y_val, y_pred)\n",
    "    res_dict['rmse'].append(rmse)\n",
    "    res_dict['r2'].append(r2)\n",
    "    print(f'rmse:{rmse}, r2:{r2}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "1bf1282b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'rmse': {'mean': 0.01177729887274851, 'confidence_interval': (0.011610457478498537, 0.011944140266998481)}, 'r2': {'mean': 0.3607082599987274, 'confidence_interval': (0.3424679249360426, 0.3789485950614122)}}\n"
     ]
    }
   ],
   "source": [
    "# Summarise the per-partition RMSE/R2 lists with a mean and confidence interval\n",
    "# (calc_ci also prints the summary).\n",
    "res = calc_ci(res_dict)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3a960149",
   "metadata": {},
   "source": [
    "#### Feedback"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "124bdab3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the synthetic sample and discard its 'Unnamed: 0' column.\n",
    "syn_data_feedback = pd.read_csv('sampled/house_feedback_500e_200b_new.csv').drop(columns='Unnamed: 0')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "8eaa0fe8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Shuffle the synthetic rows and split them into five partitions.\n",
    "# No seed is passed, so the partitions differ from run to run.\n",
    "dfs = cv_shuffle(syn_data_feedback)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "82cafbde",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "rmse:0.011470106099226779, r2:0.39378155866730014\n",
      "rmse:0.011456215973495672, r2:0.39524891246133453\n",
      "rmse:0.011457711581260717, r2:0.3950910017753292\n",
      "rmse:0.01163397269274467, r2:0.37633646298299817\n",
      "rmse:0.011495882484004313, r2:0.39105382839258496\n"
     ]
    }
   ],
   "source": [
    "# Fit one linear regression per synthetic partition and score each on val_set.\n",
    "# Assumes the last column is the target in both the synthetic and validation frames -- TODO confirm.\n",
    "X_val, y_val = val_set.iloc[:, :-1], val_set.iloc[:, -1]\n",
    "res_dict = {'rmse':[], 'r2':[]}\n",
    "for part in dfs:\n",
    "    linreg = LinearRegression()\n",
    "    linreg.fit(part.iloc[:, :-1], part.iloc[:, -1])\n",
    "    # Predict once and reuse the predictions for both metrics\n",
    "    y_pred = linreg.predict(X_val)\n",
    "    rmse = np.sqrt(mean_squared_error(y_val, y_pred))\n",
    "    r2 = r2_score(y_val, y_pred)\n",
    "    res_dict['rmse'].append(rmse)\n",
    "    res_dict['r2'].append(r2)\n",
    "    print(f'rmse:{rmse}, r2:{r2}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "696767f4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'rmse': {'mean': 0.011502777766146432, 'confidence_interval': (0.011443944814425643, 0.01156161071786722)}, 'r2': {'mean': 0.39030235285590936, 'confidence_interval': (0.3840413151194081, 0.39656339059241064)}}\n"
     ]
    }
   ],
   "source": [
    "# Summarise the per-partition RMSE/R2 lists with a mean and confidence interval\n",
    "# (calc_ci also prints the summary).\n",
    "res = calc_ci(res_dict)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
