{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def save_dataset(X, y, filename, test_ratio=0.5, val_ratio=0.5):\n",
    "    \"\"\"Split (X, y) into train/val/test frames and write each one to CSV.\n",
    "\n",
    "    Columns are named feature_0 .. feature_{p-1} plus target. The frames are\n",
    "    written to ../data/synthetic_cls/<filename>/ as train.csv, val.csv and\n",
    "    test.csv; the directory is created when it does not exist yet.\n",
    "\n",
    "    Args:\n",
    "        X: 2-D array with one row per sample (X.shape[1] gives the feature count).\n",
    "        y: Target values, one per row of X.\n",
    "        filename: Sub-directory name under ../data/synthetic_cls.\n",
    "        test_ratio: Share of all rows held out of the training set.\n",
    "        val_ratio: Share of the held-out rows assigned to the validation set;\n",
    "            the remaining held-out rows form the test set.\n",
    "\n",
    "    Returns:\n",
    "        Tuple (train_df, val_df, test_df) of pandas DataFrames.\n",
    "    \"\"\"\n",
    "    feature_names = [f'feature_{i}' for i in range(X.shape[1])]\n",
    "    X_df = pd.DataFrame(X, columns=feature_names)\n",
    "    X_df['target'] = y\n",
    "\n",
    "    # First split: training rows vs. held-out rows.\n",
    "    train_df, holdout_df = train_test_split(X_df, test_size=test_ratio, random_state=42)\n",
    "    # Second split: train_size is used so that val_ratio sizes val_df (the first\n",
    "    # frame returned); test_df receives the remaining held-out rows.\n",
    "    val_df, test_df = train_test_split(holdout_df, train_size=val_ratio, random_state=42)\n",
    "\n",
    "    print(train_df.shape, val_df.shape, test_df.shape)\n",
    "\n",
    "    # to_csv does not create missing parent directories, so make sure it exists.\n",
    "    out_dir = f'../data/synthetic_cls/{filename}'\n",
    "    os.makedirs(out_dir, exist_ok=True)\n",
    "\n",
    "    train_df.to_csv(f'{out_dir}/train.csv', index=False)\n",
    "    val_df.to_csv(f'{out_dir}/val.csv', index=False)\n",
    "    test_df.to_csv(f'{out_dir}/test.csv', index=False)\n",
    "\n",
    "    return train_df, val_df, test_df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Setting 1\n",
    "\n",
    "p = 3\n",
    "\n",
    "\\begin{equation}\n",
    "    \\begin{aligned}\n",
    "    y = & +2 \\cdot x_0 \\cdot [1 + \\tanh(x_1)] \\\\\n",
    "    &-2 \\cdot x_1 \\cdot [1 + 3\\sin(2x_0) + \\tanh(2x_0)] \\\\\n",
    "    &+1\n",
    "    \\end{aligned}\n",
    "\\end{equation}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(200, 4) (100, 4) (100, 4)\n"
     ]
    }
   ],
   "source": [
    "# Setting 1: 400 samples, 3 features drawn uniformly from [-2, 2).\n",
    "n = 400\n",
    "p = 3\n",
    "\n",
    "np.random.seed(42)\n",
    "X = np.random.uniform(low=-2, high=2, size=(n, p))\n",
    "x0, x1 = X[:, 0], X[:, 1]\n",
    "\n",
    "# Main-effect weights; the third feature keeps a zero weight.\n",
    "beta = np.zeros(p)\n",
    "beta[:2] = [2, -2]\n",
    "\n",
    "# Logit: linear part plus interaction terms scaled by the same weights.\n",
    "y = X @ beta\n",
    "y += x0 * beta[0] * np.tanh(x1)\n",
    "y += x1 * beta[1] * 3 * np.sin(2 * x0)\n",
    "y += x1 * beta[1] * np.tanh(2 * x0)\n",
    "\n",
    "# Gaussian noise (sd 0.1) is added before the intercept.\n",
    "y += np.random.normal(loc=0, scale=0.1, size=n)\n",
    "y += 1\n",
    "\n",
    "# Bernoulli labels with success probability sigmoid(y).\n",
    "pi = 1 / (1 + np.exp(-y))\n",
    "label = np.random.binomial(1, pi)\n",
    "\n",
    "train_df, val_df, test_df = save_dataset(\n",
    "    X, label, 'setting1', test_ratio=0.5, val_ratio=0.5\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Setting 2\n",
    "\n",
    "p = 10\n",
    "\n",
    "\\begin{equation}\n",
    "    \\begin{aligned}\n",
    "    y = & +10 \\cdot x_0 \\cdot [1 + 2 \\cdot \\tanh(2x_1) + 2 \\cdot \\sin(x_3)] \\\\\n",
    "    &+ 20 \\cdot x_1 \\cdot [1 + 2 \\cdot \\cos(2x_0)] \\\\\n",
    "    &-20 \\cdot x_2 \\cdot [1 + 2 \\cdot \\arctan(x_1 x_3)] \\\\\n",
    "    & + 10 \\cdot x_3 \\\\\n",
    "    & - 10\n",
    "    \\end{aligned}\n",
    "\\end{equation}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Setting 2: 400 samples, 10 features drawn uniformly from [-2, 2).\n",
    "n = 400\n",
    "p = 10\n",
    "\n",
    "np.random.seed(42)\n",
    "X = np.random.uniform(low=-2, high=2, size=(n, p))\n",
    "x0, x1, x2, x3 = X[:, 0], X[:, 1], X[:, 2], X[:, 3]\n",
    "\n",
    "# Only the first four features get a non-zero main-effect weight.\n",
    "beta = np.zeros(p)\n",
    "beta[:4] = [10, 20, -20, 10]\n",
    "\n",
    "# Logit: linear part plus interaction terms scaled by the same weights.\n",
    "y = X @ beta\n",
    "y += x0 * beta[0] * 2 * (np.tanh(2 * x1) + np.sin(x3))\n",
    "y += x1 * beta[1] * 2 * np.cos(2 * x0)\n",
    "y += x2 * beta[2] * 2 * np.arctan(x1 * x3)\n",
    "\n",
    "# Gaussian noise (sd 0.1) is added before the intercept.\n",
    "y += np.random.normal(loc=0, scale=0.1, size=n)\n",
    "y -= 10\n",
    "\n",
    "# Bernoulli labels with success probability sigmoid(y).\n",
    "pi = 1 / (1 + np.exp(-y))\n",
    "label = np.random.binomial(1, pi)\n",
    "\n",
    "train_df, val_df, test_df = save_dataset(\n",
    "    X, label, 'setting2', test_ratio=0.5, val_ratio=0.5\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Setting 3\n",
    "\n",
    "p = 50\n",
    "\n",
    "\\begin{equation}\n",
    "\\begin{aligned}\n",
    "y =\\;& -20 \\cdot x_0 \\cdot [1 + \\tanh(x_{1} x_{3})] \\\\\n",
    "&+ 20 \\cdot x_1 \\cdot \\left[1 + \\frac{2}{\\pi} \\arctan(x_{3} - x_{4})\\right] \\\\\n",
    "&+ 30 \\cdot x_3 \\cdot \\left[1 + \\tanh(x_{1} + \\sin(x_{4}))\\right] \\\\\n",
    "&-10 \\cdot x_4 \\cdot \\left[1 + (2 \\sigma(x_{0} x_{3})-1)\\right]\n",
    "\\end{aligned}\n",
    "\\end{equation}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Setting 3: 400 samples, 50 features drawn uniformly from [-2, 2).\n",
    "n = 400\n",
    "p = 50\n",
    "\n",
    "np.random.seed(7)\n",
    "X = np.random.uniform(low=-2, high=2, size=(n, p))\n",
    "x0, x1, x3, x4 = X[:, 0], X[:, 1], X[:, 3], X[:, 4]\n",
    "\n",
    "def sigmoid(z):\n",
    "    \"\"\"Logistic function 1 / (1 + exp(-z)), applied element-wise.\"\"\"\n",
    "    return 1 / (1 + np.exp(-z))\n",
    "\n",
    "# Features 0, 1, 3 and 4 get a main-effect weight; feature 2 and all\n",
    "# features from index 5 on keep a zero weight.\n",
    "beta = np.zeros(p)\n",
    "beta[[0, 1, 3, 4]] = [-20, 20, 30, -10]\n",
    "\n",
    "# Logit: linear part plus interaction terms scaled by the same weights.\n",
    "y = X @ beta\n",
    "y += x0 * beta[0] * np.tanh(x1 * x3)\n",
    "y += x1 * beta[1] * 2 * np.arctan(x3 - x4) / np.pi\n",
    "y += x3 * beta[3] * np.tanh(x1 + np.sin(x4))\n",
    "y += x4 * beta[4] * (2 * sigmoid(x0 * x3) - 1)\n",
    "\n",
    "# Gaussian noise (sd 0.1); this setting has no intercept.\n",
    "y += np.random.normal(loc=0, scale=0.1, size=n)\n",
    "\n",
    "# Bernoulli labels with success probability sigmoid(y).\n",
    "pi = sigmoid(y)\n",
    "label = np.random.binomial(1, pi)\n",
    "\n",
    "train_df, val_df, test_df = save_dataset(\n",
    "    X, label, 'setting3', test_ratio=0.5, val_ratio=0.5\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "pytorch",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
