{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Diabetes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train (576, 9) val (57, 9) test (135, 9)\n"
     ]
    }
   ],
   "source": [
    "import openml\n",
    "import pandas as pd\n",
    "import datetime\n",
    "import os\n",
    "import json\n",
    "\n",
    "# Load the OpenML 'diabetes' dataset. The download flags are passed explicitly because\n",
    "# openml warns that their defaults change from True to False in version 0.15; passing\n",
    "# bools keeps the current eager-download behaviour and silences that FutureWarning.\n",
    "dataset = openml.datasets.get_dataset(\n",
    "    'diabetes',\n",
    "    download_data=True,\n",
    "    download_qualities=True,\n",
    "    download_features_meta_data=True,\n",
    ")\n",
    "df, _, _, _ = dataset.get_data(dataset_format=\"dataframe\")\n",
    "\n",
    "# ords / nums / labs partition the renamed columns; the asserts below check the partition.\n",
    "cols = ['pregnancies', 'glucose-plasma', 'blood-pressure', 'skin-thickness', 'insulin', 'BMI', 'pedigree', 'age', 'diagnosis']\n",
    "ords = []\n",
    "labs = ['diagnosis']\n",
    "nums = ['pregnancies', 'glucose-plasma', 'blood-pressure', 'skin-thickness', 'insulin', 'BMI', 'pedigree', 'age']\n",
    "\n",
    "# positional rename: cols must be listed in the same order as the downloaded frame\n",
    "df.columns = cols\n",
    "# collapse the raw label to 'positive' / 'negative'\n",
    "df['diagnosis'] = df['diagnosis'].map(lambda x: 'positive' if x=='tested_positive' else 'negative')\n",
    "\n",
    "config = {\n",
    "    'dataset_name': 'diabetes-new',\n",
    "    'task': 'classification',\n",
    "    'raw_path': \"openml.datasets.get_dataset('diabetes')\",\n",
    "    'random_state': 42,\n",
    "    'train_frac': 0.75,\n",
    "    'val_frac': 0.075,\n",
    "    'creation_time': str(datetime.datetime.now()),\n",
    "    'max_col_length': 20,\n",
    "    'cols': cols,\n",
    "    'ords': ords,\n",
    "    'nums': nums,\n",
    "    'labs': labs,\n",
    "}\n",
    "# every column must appear in exactly one of ords / nums / labs\n",
    "assert set(config['ords']+config['nums']+config['labs'])==set(config['cols'])\n",
    "assert len(config['ords'])+len(config['nums'])+len(config['labs']) == len(config['cols'])\n",
    "\n",
    "# shuffle data\n",
    "df = df.sample(frac=1, random_state=config['random_state'], ignore_index=True)\n",
    "\n",
    "# split into train/val/test sets; test receives whatever remains after train and val\n",
    "n = len(df)\n",
    "train_size = int(config['train_frac'] * n)\n",
    "val_size = int(config['val_frac'] * n)\n",
    "train = df.iloc[:train_size, :]\n",
    "val = df.iloc[train_size:train_size+val_size, :]\n",
    "test = df.iloc[train_size+val_size:, :]\n",
    "print('train', train.shape, 'val', val.shape, 'test', test.shape)\n",
    "\n",
    "# write everything out, once under a timestamped directory and once under 'latest'\n",
    "datedirname = '.'.join(config['creation_time'].split())\n",
    "outpath_date   = os.path.join('./data/', config['dataset_name'], datedirname)\n",
    "outpath_latest = os.path.join('./data/', config['dataset_name'], 'latest')\n",
    "\n",
    "for path in [outpath_date, outpath_latest]:\n",
    "    os.makedirs(path, exist_ok=True)\n",
    "    train.to_csv(os.path.join(path, 'train.csv'), index=False)\n",
    "    val.to_csv(os.path.join(path, 'val.csv'), index=False)\n",
    "    test.to_csv(os.path.join(path, 'test.csv'), index=False)\n",
    "    df.to_csv(os.path.join(path, 'all.csv'), index=False)\n",
    "    with open(os.path.join(path, 'config.json'), 'w') as f:\n",
    "        json.dump(config, f)\n",
    "\n",
    "# NOTE(review): `tabddpm` is not defined or imported anywhere in this notebook, so this call\n",
    "# raises NameError on a fresh kernel unless the helper is brought into scope first.\n",
    "ddpmout = tabddpm(config, train, val, test, df, outpath_date, outpath_latest)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# CA-Housing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train (15480, 9) val (1548, 9) test (3612, 9)\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import datetime\n",
    "import os\n",
    "import json\n",
    "from sklearn.datasets import fetch_california_housing\n",
    "\n",
    "df = fetch_california_housing(as_frame=True).frame\n",
    "\n",
    "#rename cols so none start with same token\n",
    "# positional rename: cols must be listed in the same order as the fetched frame\n",
    "cols = ['income_median', 'age_median', 'rooms', 'bedrooms', 'population', \n",
    "        'occupancy', 'latitude', 'longitude', 'value_median_house']\n",
    "df.columns = cols\n",
    "\n",
    "# Make the no-missing-values assumption explicit instead of filling gaps with a '?' string,\n",
    "# which would silently turn the numeric columns listed in config['nums'] into text.\n",
    "assert not df.isna().any().any(), 'unexpected missing values in the California housing frame'\n",
    "\n",
    "config = {\n",
    "    'dataset_name': 'house-new',\n",
    "    'raw_path': 'fetch_california_housing(as_frame=True).frame',\n",
    "    'random_state': 42,\n",
    "    'train_frac': 0.75,\n",
    "    'val_frac': 0.075,\n",
    "    'creation_time': str(datetime.datetime.now()),\n",
    "    'max_col_length': 20,\n",
    "    'task': 'regression',\n",
    "}\n",
    "\n",
    "config['cols'] = list(df.columns)\n",
    "config[\"ords\"] = []\n",
    "config[\"nums\"] = ['income_median', 'age_median', 'rooms', 'bedrooms', 'population', \n",
    "        'occupancy', 'latitude', 'longitude',]\n",
    "config[\"labs\"] = [\"value_median_house\"]\n",
    "# every column must appear in exactly one of ords / nums / labs\n",
    "assert set(config['ords']+config['nums']+config['labs'])==set(config['cols'])\n",
    "assert len(config['ords'])+len(config['nums'])+len(config['labs']) == len(config['cols'])\n",
    "\n",
    "# shuffle data\n",
    "df = df.sample(frac=1, random_state=config['random_state'], ignore_index=True)\n",
    "\n",
    "# split into train/val/test sets; test receives whatever remains after train and val\n",
    "n = len(df)\n",
    "train_size = int(config['train_frac'] * n)\n",
    "val_size = int(config['val_frac'] * n)\n",
    "train = df.iloc[:train_size, :]\n",
    "val = df.iloc[train_size:train_size+val_size, :]\n",
    "test = df.iloc[train_size+val_size:, :]\n",
    "print('train', train.shape, 'val', val.shape, 'test', test.shape)\n",
    "\n",
    "# write everything out, once under a timestamped directory and once under 'latest'\n",
    "datedirname = '.'.join(config['creation_time'].split())\n",
    "outpath_date   = os.path.join('./data/', config['dataset_name'], datedirname)\n",
    "outpath_latest = os.path.join('./data/', config['dataset_name'], 'latest')\n",
    "\n",
    "for path in [outpath_date, outpath_latest]:\n",
    "    os.makedirs(path, exist_ok=True)\n",
    "    train.to_csv(os.path.join(path, 'train.csv'), index=False)\n",
    "    val.to_csv(os.path.join(path, 'val.csv'), index=False)\n",
    "    test.to_csv(os.path.join(path, 'test.csv'), index=False)\n",
    "    df.to_csv(os.path.join(path, 'all.csv'), index=False)\n",
    "    with open(os.path.join(path, 'config.json'), 'w') as f:\n",
    "        json.dump(config, f)\n",
    "\n",
    "# NOTE(review): `tabddpm` is not defined or imported anywhere in this notebook, so this call\n",
    "# raises NameError on a fresh kernel unless the helper is brought into scope first.\n",
    "ddpmout = tabddpm(config, train, val, test, df, outpath_date, outpath_latest)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Rain"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train (12566, 4) val (1256, 4) test (2933, 4)\n"
     ]
    }
   ],
   "source": [
    "import openml\n",
    "import pandas as pd\n",
    "import datetime\n",
    "import os\n",
    "import json\n",
    "\n",
    "# Load the OpenML 'rainfall_bangladesh' dataset. The download flags are passed explicitly\n",
    "# because openml warns that their defaults change from True to False in version 0.15;\n",
    "# passing bools keeps the current eager-download behaviour and silences that FutureWarning.\n",
    "dataset = openml.datasets.get_dataset(\n",
    "    'rainfall_bangladesh',\n",
    "    download_data=True,\n",
    "    download_qualities=True,\n",
    "    download_features_meta_data=True,\n",
    ")\n",
    "df, _, _, _ = dataset.get_data(dataset_format=\"dataframe\")\n",
    "\n",
    "config = {\n",
    "    'dataset_name': 'rain',\n",
    "    'task': 'regression',\n",
    "    'raw_path': \"openml.datasets.get_dataset('rainfall_bangladesh')\",\n",
    "    'random_state': 42,\n",
    "    'train_frac': 0.75,\n",
    "    'val_frac': 0.075,\n",
    "    'creation_time': str(datetime.datetime.now()),\n",
    "    'max_col_length': 20,\n",
    "    'cols': list(df.columns),\n",
    "}\n",
    "config['ords'] = ['Station', 'Month']\n",
    "config['nums'] = ['Year']\n",
    "config['labs'] = ['Rainfall']\n",
    "\n",
    "# every column must appear in exactly one of ords / nums / labs\n",
    "assert set(config['ords']+config['nums']+config['labs'])==set(config['cols'])\n",
    "assert len(config['ords'])+len(config['nums'])+len(config['labs']) == len(config['cols'])\n",
    "\n",
    "# shuffle data\n",
    "df = df.sample(frac=1, random_state=config['random_state'], ignore_index=True)\n",
    "\n",
    "# split into train/val/test sets; test receives whatever remains after train and val\n",
    "n = len(df)\n",
    "train_size = int(config['train_frac'] * n)\n",
    "val_size = int(config['val_frac'] * n)\n",
    "train = df.iloc[:train_size, :]\n",
    "val = df.iloc[train_size:train_size+val_size, :]\n",
    "test = df.iloc[train_size+val_size:, :]\n",
    "print('train', train.shape, 'val', val.shape, 'test', test.shape)\n",
    "\n",
    "# write everything out, once under a timestamped directory and once under 'latest'\n",
    "datedirname = '.'.join(config['creation_time'].split())\n",
    "outpath_date   = os.path.join('./data/', config['dataset_name'], datedirname)\n",
    "outpath_latest = os.path.join('./data/', config['dataset_name'], 'latest')\n",
    "\n",
    "for path in [outpath_date, outpath_latest]:\n",
    "    os.makedirs(path, exist_ok=True)\n",
    "    train.to_csv(os.path.join(path, 'train.csv'), index=False)\n",
    "    val.to_csv(os.path.join(path, 'val.csv'), index=False)\n",
    "    test.to_csv(os.path.join(path, 'test.csv'), index=False)\n",
    "    df.to_csv(os.path.join(path, 'all.csv'), index=False)\n",
    "    with open(os.path.join(path, 'config.json'), 'w') as f:\n",
    "        json.dump(config, f)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Abalone"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train (3132, 9) val (313, 9) test (732, 9)\n"
     ]
    }
   ],
   "source": [
    "import openml\n",
    "import pandas as pd\n",
    "import datetime\n",
    "import os\n",
    "import json\n",
    "\n",
    "# Load the OpenML 'abalone' dataset. The download flags are passed explicitly because\n",
    "# openml warns that their defaults change from True to False in version 0.15; passing\n",
    "# bools keeps the current eager-download behaviour and silences that FutureWarning.\n",
    "dataset = openml.datasets.get_dataset(\n",
    "    'abalone',\n",
    "    download_data=True,\n",
    "    download_qualities=True,\n",
    "    download_features_meta_data=True,\n",
    ")\n",
    "df, _, _, _ = dataset.get_data(dataset_format=\"dataframe\")\n",
    "\n",
    "config = {\n",
    "    'dataset_name': 'abalone',\n",
    "    'task': 'regression',\n",
    "    'raw_path': \"openml.datasets.get_dataset('abalone')\",\n",
    "    'random_state': 42,\n",
    "    'train_frac': 0.75,\n",
    "    'val_frac': 0.075,\n",
    "    'creation_time': str(datetime.datetime.now()),\n",
    "    'max_col_length': 20,\n",
    "    'cols': list(df.columns),\n",
    "}\n",
    "config['ords'] = ['Sex']\n",
    "config['nums'] = ['Length', 'Diameter', 'Height', 'Whole_weight',\n",
    "                  'Shucked_weight', 'Viscera_weight', 'Shell_weight']\n",
    "config['labs'] = ['Class_number_of_rings']\n",
    "\n",
    "# every column must appear in exactly one of ords / nums / labs\n",
    "assert set(config['ords']+config['nums']+config['labs'])==set(config['cols'])\n",
    "assert len(config['ords'])+len(config['nums'])+len(config['labs']) == len(config['cols'])\n",
    "\n",
    "# shuffle data\n",
    "df = df.sample(frac=1, random_state=config['random_state'], ignore_index=True)\n",
    "\n",
    "# split into train/val/test sets; test receives whatever remains after train and val\n",
    "n = len(df)\n",
    "train_size = int(config['train_frac'] * n)\n",
    "val_size = int(config['val_frac'] * n)\n",
    "train = df.iloc[:train_size, :]\n",
    "val = df.iloc[train_size:train_size+val_size, :]\n",
    "test = df.iloc[train_size+val_size:, :]\n",
    "print('train', train.shape, 'val', val.shape, 'test', test.shape)\n",
    "\n",
    "# write everything out, once under a timestamped directory and once under 'latest'\n",
    "datedirname = '.'.join(config['creation_time'].split())\n",
    "outpath_date   = os.path.join('./data/', config['dataset_name'], datedirname)\n",
    "outpath_latest = os.path.join('./data/', config['dataset_name'], 'latest')\n",
    "\n",
    "for path in [outpath_date, outpath_latest]:\n",
    "    os.makedirs(path, exist_ok=True)\n",
    "    train.to_csv(os.path.join(path, 'train.csv'), index=False)\n",
    "    val.to_csv(os.path.join(path, 'val.csv'), index=False)\n",
    "    test.to_csv(os.path.join(path, 'test.csv'), index=False)\n",
    "    df.to_csv(os.path.join(path, 'all.csv'), index=False)\n",
    "    with open(os.path.join(path, 'config.json'), 'w') as f:\n",
    "        json.dump(config, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "great",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
