{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "414921f9-152f-46a5-a5ab-6784a6cdd2ab",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from scipy.spatial.distance import cdist\n",
    "import random\n",
    "from sklearn.cluster import KMeans\n",
    "import sys\n",
    "sys.path.append('../')\n",
    "from icfesl import *\n",
    "from utility_functions import *\n",
    "from xgboost import XGBClassifier\n",
    "from pytorch_tabnet.tab_model import TabNetClassifier\n",
    "import time\n",
    "from sklearn.model_selection import train_test_split\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "33964f39-a68f-440c-bef4-8bc77944b0de",
   "metadata": {},
   "outputs": [],
   "source": [
    "np.random.seed(42)\n",
    "n_samples = 10000000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "0d8ee403-e40d-491d-8ec2-69e7e3e172d2",
   "metadata": {},
   "outputs": [],
   "source": [
    "def simulate_correlated_categorical(n_samples, n_variables, categories_per_variable, correlation_matrix):\n",
    "    \"\"\"\n",
    "    Simulates correlated categorical data.\n",
    "\n",
    "    Args:\n",
    "        n_samples (int): The number of samples to generate.\n",
    "        n_variables (int): The number of categorical variables.\n",
    "        categories_per_variable (list): A list where each element is the number of categories\n",
    "                                         for the corresponding variable.\n",
    "        correlation_matrix (np.ndarray): The desired correlation matrix for the\n",
    "                                         underlying continuous variables.\n",
    "\n",
    "    Returns:\n",
    "        pd.DataFrame: A DataFrame containing the simulated correlated categorical data.\n",
    "    \"\"\"\n",
    "\n",
    "\n",
    "    # 1. Generate independent standard normal variables\n",
    "    independent_normals = np.random.randn(n_samples, n_variables)\n",
    "\n",
    "    # 2. Apply Cholesky decomposition to introduce correlation\n",
    "    L = np.linalg.cholesky(correlation_matrix)\n",
    "    correlated_normals = independent_normals @ L.T\n",
    "\n",
    "    # 3. Discretize into categories\n",
    "    categorical_data = np.zeros_like(correlated_normals, dtype=int)\n",
    "    \n",
    "    for i in range(n_variables):\n",
    "        \n",
    "        num_categories = categories_per_variable[i]\n",
    "        \n",
    "        thresholds = np.linspace(correlated_normals[:, i].min(),\n",
    "                                 correlated_normals[:, i].max(),\n",
    "                                 num_categories + 1)[1:-1] # Exclude min/max\n",
    "        categorical_data[:, i] = np.digitize(correlated_normals[:, i], bins=thresholds)\n",
    "\n",
    "    column_names = [f'Var_{i+1}' for i in range(n_variables)]\n",
    "    return pd.DataFrame(categorical_data, columns=column_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "9cce156a-d111-47c6-85e2-dcf966393b67",
   "metadata": {},
   "outputs": [],
   "source": [
    "n_variables = 2\n",
    "categories_per_variable = [100000, 30000]\n",
    "\n",
    "correlation_matrix = np.array([\n",
    "    [1.0, 0.2],\n",
    "    [0.2, 1.0]\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "e6f668dd-03a4-446b-a2fc-667e908b491b",
   "metadata": {},
   "outputs": [],
   "source": [
    "simulated_categorical_data = simulate_correlated_categorical(n_samples, n_variables, categories_per_variable, correlation_matrix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "29408c4e-e47b-4ae5-b24d-742b10503c95",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Size of simulated data: 152.5880470275879 MB\n"
     ]
    }
   ],
   "source": [
    "simulated_data_size = sys.getsizeof(simulated_categorical_data)\n",
    "print(f\"Size of simulated data: {simulated_data_size/1024**2} MB\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "319c9f53-8032-4dc3-9adc-aa068f7d867b",
   "metadata": {},
   "source": [
    "## Simulated study for regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0da206cc-6c09-4bec-af40-a80e7cbd7781",
   "metadata": {},
   "outputs": [],
   "source": [
    "beta_var1 = 5\n",
    "beta_var2 = 10\n",
    "intercept = 20\n",
    "\n",
    "error = np.random.normal(loc=0, scale=3, size=n_samples)\n",
    "\n",
    "y = (intercept +\n",
    "     beta_group_B * data['Group_B'] +\n",
    "     beta_group_C * data['Group_C'] +\n",
    "     error)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4c1de903-a90f-484a-9d1f-9afdef0924cf",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "60a7d3c0-3520-4e83-92e3-d8846d5d10c4",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "\n",
    "my_list = [1, 2, 3, \"hello\"]\n",
    "list_size = sys.getsizeof(my_list)\n",
    "print(f\"Size of my_list: {list_size} bytes\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "06b3e33a-4830-4b8f-aed0-f6a288178fde",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0761abe7-a4fd-40e1-9fbb-114b49826adc",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "971e64fd-9765-45b4-b108-fd2a3a9a2d81",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bf7ef45e-fce8-4033-9c8d-ec060fe758d0",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cc8b696b-ab2b-44c0-aecc-94743a85f809",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "db54fa70-1033-436c-9217-30dc54769ec9",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ea5c3823-95fa-4aeb-8a8f-41949c3604cd",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0764978a-b7a0-49f8-a631-6c6c2350993d",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e4f02afe-fa7d-45f1-8d9e-a1d40eee3cf6",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b5cae22f-6102-4496-b230-aaf4fbeccc95",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aeb69729-a3b1-486a-be05-5f1ddc95f547",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.datasets import make_classification\n",
    "\n",
    "# 1. Generate numerical features and a target variable using make_classification\n",
    "n_samples = 1000\n",
    "n_features_numeric = 5\n",
    "n_classes = 2\n",
    "\n",
    "X_numeric, y = make_classification(\n",
    "    n_samples=n_samples,\n",
    "    n_features=n_features_numeric,\n",
    "    n_informative=n_features_numeric,\n",
    "    n_redundant=0,\n",
    "    n_classes=n_classes,\n",
    "    random_state=42\n",
    ")\n",
    "\n",
    "# 2. Create categorical variables\n",
    "# Define categories for each categorical feature\n",
    "categories_gender = ['Male', 'Female', 'Other']\n",
    "categories_city = ['New York', 'London', 'Paris', 'Tokyo']\n",
    "categories_education = ['High School', 'Bachelors', 'Masters', 'PhD']\n",
    "\n",
    "# Generate random categorical assignments\n",
    "gender = np.random.choice(categories_gender, size=n_samples)\n",
    "city = np.random.choice(categories_city, size=n_samples)\n",
    "education = np.random.choice(categories_education, size=n_samples)\n",
    "\n",
    "# 3. Combine numerical and categorical features into a DataFrame\n",
    "df_numeric = pd.DataFrame(X_numeric, columns=[f'numeric_feature_{i+1}' for i in range(n_features_numeric)])\n",
    "df_categorical = pd.DataFrame({'gender': gender, 'city': city, 'education': education})\n",
    "df_target = pd.DataFrame({'target': y})\n",
    "\n",
    "# Concatenate all parts\n",
    "simulated_data = pd.concat([df_numeric, df_categorical, df_target], axis=1)\n",
    "\n",
    "# Display the first few rows of the simulated data\n",
    "print(simulated_data.head())\n",
    "\n",
    "# Display data types to confirm categorical columns\n",
    "print(\"\\nData types of simulated data:\")\n",
    "print(simulated_data.dtypes)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
