{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def save_dataset(X, y, filename, test_ratio=0.5, val_ratio=0.5):\n",
    "    n_features = X.shape[1]\n",
    "    feature_names = [f'feature_{i}' for i in range(n_features)]\n",
    "    X_df = pd.DataFrame(X, columns=feature_names)\n",
    "    X_df['target'] = y\n",
    "\n",
    "    # split into train and test\n",
    "    train_df, test_df = train_test_split(X_df, test_size=test_ratio, random_state=42)\n",
    "    val_df, test_df = train_test_split(test_df, test_size=val_ratio, random_state=42)\n",
    "\n",
    "    print(train_df.shape, val_df.shape, test_df.shape)\n",
    "    \n",
    "    train_df.to_csv(f'../data/synthetic_reg/{filename}/train.csv', index=False)\n",
    "    val_df.to_csv(f'../data/synthetic_reg/{filename}/val.csv', index=False)\n",
    "    test_df.to_csv(f'../data/synthetic_reg/{filename}/test.csv', index=False)\n",
    "\n",
    "    return train_df, val_df, test_df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Setting 0, toy example\n",
    "\n",
    "p = 3\n",
    "\n",
    "\\begin{equation}\n",
    "    y = 3 \\cdot x_0 \\cdot [1+ \\tanh(10 x_1)] + (-3) \\cdot x_1 \\cdot \\left[1 + \\sin(-2 x_0) \\right]\n",
    "\\end{equation}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "n, p = 400, 3\n",
    "X = np.random.uniform(-2, 2, (n, p))\n",
    "\n",
    "beta = np.zeros(p)\n",
    "beta[0] = 3\n",
    "beta[1] = -3\n",
    "\n",
    "y = X @ beta\n",
    "\n",
    "y += X[:, 0] * beta[0] * np.tanh(10 * X[:, 1])\n",
    "y += X[:, 1] * beta[1] * np.sin(-2* X[:, 0])\n",
    "\n",
    "y += np.random.normal(0, 0.1, n)\n",
    "\n",
    "train_df, val_df, test_df = save_dataset(X, y, 'setting0', test_ratio=0.5, val_ratio=0.5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Setting 1\n",
    "p = 5\n",
    "\n",
    "\\begin{equation}\n",
    "    \\begin{aligned}\n",
    "    y = & +3 \\cdot x_0 \\cdot \\left[1 + (2 \\sigma(x_1 x_2)-1)\\right] \\\\\n",
    "    &- 2 \\cdot x_1 \\\\\n",
    "    &+ 2 \\cdot x_2 \n",
    "    \\end{aligned}\n",
    "\\end{equation}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(200, 6) (100, 6) (100, 6)\n"
     ]
    }
   ],
   "source": [
    "n, p = 400, 5\n",
    "\n",
    "np.random.seed(42)\n",
    "X = np.random.uniform(-2, 2, (n, p))\n",
    "\n",
    "def sigmoid(z):\n",
    "    return 1 / (1 + np.exp(-z))\n",
    "\n",
    "\n",
    "beta = np.zeros(p)\n",
    "beta[0] = 3\n",
    "beta[1] = -2\n",
    "beta[2] = 2\n",
    "\n",
    "y = X@beta\n",
    "\n",
    "y += X[:, 0] * beta[0] * (2*sigmoid(X[:, 1]*X[:, 2]) - 1)\n",
    "\n",
    "y += np.random.normal(0, 0.1, n)\n",
    "\n",
    "train_df, val_df, test_df = save_dataset(X, y, filename='setting1', test_ratio=0.5, val_ratio=0.5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Setting 2\n",
    "\n",
    "p = 10\n",
    "\n",
    "\\begin{equation}\n",
    "    \\begin{aligned}\n",
    "    y = & +1 \\cdot x_0 \\cdot [1 + \\tanh(x_1 x_2 + \\sin(x_3))] \\\\\n",
    "    &+ 2 \\cdot x_1 \\cdot [1 + \\sin(2x_0)] \\\\\n",
    "    &-1 \\cdot x_2 \\cdot [1 + \\frac{2}{\\pi}\\arctan(x_1 x_3)]\n",
    "    \\end{aligned}\n",
    "\\end{equation}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(200, 11) (100, 11) (100, 11)\n"
     ]
    }
   ],
   "source": [
    "n, p = 400, 10\n",
    "\n",
    "np.random.seed(42)\n",
    "X = np.random.uniform(-2, 2, (n, p))\n",
    "\n",
    "beta = np.zeros(p)\n",
    "beta[0] = 1\n",
    "beta[1] = 2\n",
    "beta[2] = -1\n",
    "\n",
    "y = X @ beta\n",
    "\n",
    "y += X[:, 0] * beta[0] * np.tanh(X[:, 1]*X[:, 2] + np.sin(X[:, 3]))\n",
    "y += X[:, 1] * beta[1] * np.sin(2*X[:, 0])\n",
    "y += X[:, 2] * beta[2] * 2*np.arctan(X[:, 1]*X[:, 3])/np.pi\n",
    "\n",
    "y += np.random.normal(0, 0.1, n)\n",
    "train_df, val_df, test_df = save_dataset(X, y, 'setting2', test_ratio=0.5, val_ratio=0.5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Setting 3\n",
    "p = 50\n",
    "\\begin{equation}\n",
    "\\begin{aligned}\n",
    "y =\\;& -2 \\cdot x_0 \\cdot [1 + \\tanh(x_{1} x_{3})] \\\\\n",
    "&+ 2 \\cdot x_1 \\cdot \\left[1 + \\frac{2}{\\pi} \\arctan(x_{3} - x_{4})\\right] \\\\\n",
    "%&+ (-3) \\cdot x_2 \\cdot \\left[1 + \\cos(x_{0} x_{1})\\right] \\\\\n",
    "&+ 3 \\cdot x_3 \\cdot \\left[1 + \\tanh(x_{1} + \\sin(x_{4}))\\right] \\\\\n",
    "&-1 \\cdot x_4 \\cdot \\left[1 + (2 \\sigma(x_{0} x_{3})-1)\\right]\n",
    "\\end{aligned}\n",
    "\\end{equation}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(200, 51) (100, 51) (100, 51)\n"
     ]
    }
   ],
   "source": [
    "n, p = 400, 50\n",
    "\n",
    "np.random.seed(42)\n",
    "X = np.random.uniform(-2, 2, (n, p))\n",
    "\n",
    "def sigmoid(z):\n",
    "    return 1 / (1 + np.exp(-z))\n",
    "\n",
    "beta = np.zeros(p)\n",
    "beta[0] = -2\n",
    "beta[1] = 2\n",
    "# beta[2] = 0\n",
    "beta[3] = 3\n",
    "beta[4] = -1\n",
    "\n",
    "y = X @ beta\n",
    "\n",
    "y += X[:, 0] * beta[0] * np.tanh(X[:, 1]*X[:, 3])\n",
    "y += X[:, 1] * beta[1] * 2*np.arctan(X[:, 3] - X[:, 4])/np.pi\n",
    "# y += X[:, 2] * beta[2] * np.cos(X[:, 0] * X[:, 1])\n",
    "y += X[:, 3] * beta[3] * np.tanh(X[:, 1] + np.sin(X[:, 4]))\n",
    "y += X[:, 4] * beta[4] * (2*sigmoid(X[:, 0]*X[:, 3]) - 1)\n",
    "\n",
    "y += np.random.normal(0, 0.1, n)\n",
    "train_df, val_df, test_df = save_dataset(X, y, 'setting3', test_ratio=0.5, val_ratio=0.5)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "nimo",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.23"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
