{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "27d824d5-9e26-409d-9d2f-a6af0d5669de",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Paths shown reflect the default Jupyter Docker Stacks user directory (/home/jovyan).\n",
    "code_path = '/home/jovyan/code/'\n",
    "\n",
    "# Run each helper script inside this notebook's namespace, in the order listed, so that\n",
    "# whatever the scripts define stays available to the cells below (presumably including\n",
    "# enforce_dtypes and build_all_synthetics_miav_noisy, which this notebook never defines).\n",
    "for utility_script in (\n",
    "    'utility_functions_implementing_tabpfn_generators_iclr.py',             # utility functions\n",
    "    'additional_utility_functions_for_tabpfn_generators_iclr.py',           # additional utility functions\n",
    "):\n",
    "    file_path = os.path.join(code_path, utility_script)\n",
    "    with open(os.path.expanduser(file_path)) as file:\n",
    "        exec(file.read())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "35b49987-5d71-48de-9bda-032bffdbafc9",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "from sklearn.model_selection import train_test_split\n",
    "import openml\n",
    "import pyarrow.feather as feather"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "168051d6-e77d-4158-ba1d-b9ab33198d55",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One entry per benchmark dataset:\n",
    "#   (ds_name, label printed in the progress message, OpenML dataset id,\n",
    "#    numeric column positions, categorical column positions)\n",
    "# An id of None marks Abalone, which is fetched by name through scikit-learn instead.\n",
    "# Positions refer to the frame after the 'target' column has been added. For the OpenML\n",
    "# datasets the single categorical position is presumably that target column -- TODO confirm.\n",
    "_NOISY_MIAV_DATASETS = (\n",
    "    ('abalone', 'Abalone', None, range(1, 9), [0]),\n",
    "    ('bank', 'Bank marketing', 44126, range(0, 7), [7]),\n",
    "    ('credit', 'Credit', 44089, range(0, 10), [10]),\n",
    "    ('eye', 'Eye movements', 44130, range(0, 20), [20]),\n",
    "    ('house16h', 'House16H', 44123, range(0, 16), [16]),\n",
    "    ('magic', 'MagicTelescope', 44125, range(0, 10), [10]),\n",
    "    ('pol', 'Pol', 44122, range(0, 26), [26]),\n",
    ")\n",
    "\n",
    "\n",
    "def _load_frame_with_target(openml_id):\n",
    "    \"\"\"Load one benchmark dataset and return its features with the target added as column 'target'.\"\"\"\n",
    "    if openml_id is None:\n",
    "        from sklearn.datasets import fetch_openml\n",
    "        abalone = fetch_openml(name='abalone', version=1, as_frame=True)\n",
    "        # Copy so that adding the column does not modify the frame held by the fetched Bunch.\n",
    "        X = abalone.data.copy()\n",
    "        y = abalone.target  # Rings\n",
    "    else:\n",
    "        dataset = openml.datasets.get_dataset(openml_id)\n",
    "        X, y, _, _ = dataset.get_data(target=dataset.default_target_attribute)\n",
    "    X['target'] = y\n",
    "    return X\n",
    "\n",
    "\n",
    "def run_noisy_miav_over_baselines(split_seeds, percent: float = 0.0, show_progress: bool = False,\n",
    "                                  output_dir=None):\n",
    "    \"\"\"Build noisy-MIAV synthetic data for seven benchmark datasets and save each as Feather.\n",
    "\n",
    "    For every dataset (Abalone, Bank marketing, Credit, Eye movements, House16H,\n",
    "    MagicTelescope, Pol) the frame is loaded, the target is added as a 'target'\n",
    "    column, dtypes are set with `enforce_dtypes`, and\n",
    "    `build_all_synthetics_miav_noisy` is run. Its first return value is written to\n",
    "    `<output_dir>/<ds_name>_syn_noisy_miav_<percent>.feather`.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    split_seeds\n",
    "        Forwarded unchanged to `build_all_synthetics_miav_noisy`.\n",
    "    percent : float\n",
    "        Forwarded unchanged to `build_all_synthetics_miav_noisy` (presumably the\n",
    "        noise level -- confirm against the helper); also embedded in the file names.\n",
    "    show_progress : bool\n",
    "        Forwarded unchanged to `build_all_synthetics_miav_noisy`.\n",
    "    output_dir : str or path-like, optional\n",
    "        Directory that receives the Feather files; it is created when missing.\n",
    "        When None, the notebook-level variable `output_path` is used, which keeps\n",
    "        existing calls working.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    NameError\n",
    "        If `output_dir` is None and no global `output_path` is defined.\n",
    "    \"\"\"\n",
    "    if output_dir is None:\n",
    "        # Resolve the legacy global before any expensive work, so a missing\n",
    "        # definition is reported immediately rather than at the first write.\n",
    "        try:\n",
    "            output_dir = output_path\n",
    "        except NameError:\n",
    "            raise NameError(\n",
    "                'No output directory given: pass output_dir=... or define the '\n",
    "                'notebook-level variable output_path.'\n",
    "            ) from None\n",
    "    # Make sure the destination exists before the first Feather file is written.\n",
    "    os.makedirs(output_dir, exist_ok=True)\n",
    "\n",
    "    for ds_name, label, openml_id, num_idx, cat_idx in _NOISY_MIAV_DATASETS:\n",
    "        X = _load_frame_with_target(openml_id)\n",
    "        # Hand over fresh lists on every call so the shared table is never modified.\n",
    "        X = enforce_dtypes(dat=X,\n",
    "                           num_variables=list(num_idx),\n",
    "                           cat_variables=list(cat_idx))\n",
    "        print(f'running noisy MIAV on {label}')\n",
    "        # task_id is 0 for every dataset; ds_name is what tells the runs apart.\n",
    "        # NOTE(review): the second return value (failures) is discarded here -- confirm\n",
    "        # whether failed runs should be reported or saved.\n",
    "        syn_long_miav, _failures = build_all_synthetics_miav_noisy(\n",
    "            X=X,\n",
    "            split_seeds=split_seeds,\n",
    "            task_id=0,\n",
    "            ds_name=ds_name,\n",
    "            percent=percent,\n",
    "            show_progress=show_progress\n",
    "        )\n",
    "        # Save the data\n",
    "        fname = os.path.join(output_dir, f'{ds_name}_syn_noisy_miav_{percent}.feather')\n",
    "        feather.write_feather(syn_long_miav, fname)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "52722dee-88fb-4fb7-af79-3044d4a8b8d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "split_seeds = list(range(1, 11))  # 10 splits"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6ec7e794-ab8b-4ab7-9f3a-d2f107a49d7d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Value passed as `percent` to this run; it also names the output sub-directory and the Feather files.\n",
    "percent = 0.05\n",
    "output_path = \"/home/jovyan/baseline_comparisons/outputs_noisy_miav/\" + f\"noise_percent_{percent}/\"\n",
    "# Create the directory up front: the run writes its Feather files here, and nothing else\n",
    "# in this notebook creates it.\n",
    "os.makedirs(output_path, exist_ok=True)\n",
    "\n",
    "# run_noisy_miav_over_baselines picks up the global `output_path` set above.\n",
    "run_noisy_miav_over_baselines(split_seeds=split_seeds, percent=percent)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "78c3f73a-47b2-470b-aaed-56186b6be218",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Value passed as `percent` to this run; it also names the output sub-directory and the Feather files.\n",
    "# The f-string renders 0.10 as 0.1, so the directory is noise_percent_0.1/.\n",
    "percent = 0.10\n",
    "output_path = \"/home/jovyan/baseline_comparisons/outputs_noisy_miav/\" + f\"noise_percent_{percent}/\"\n",
    "# Create the directory up front: the run writes its Feather files here, and nothing else\n",
    "# in this notebook creates it.\n",
    "os.makedirs(output_path, exist_ok=True)\n",
    "\n",
    "# run_noisy_miav_over_baselines picks up the global `output_path` set above.\n",
    "run_noisy_miav_over_baselines(split_seeds=split_seeds, percent=percent)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "055b89f4-357c-494c-b990-26c2981d2aaa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Value passed as `percent` to this run; it also names the output sub-directory and the Feather files.\n",
    "percent = 0.15\n",
    "output_path = \"/home/jovyan/baseline_comparisons/outputs_noisy_miav/\" + f\"noise_percent_{percent}/\"\n",
    "# Create the directory up front: the run writes its Feather files here, and nothing else\n",
    "# in this notebook creates it.\n",
    "os.makedirs(output_path, exist_ok=True)\n",
    "\n",
    "# run_noisy_miav_over_baselines picks up the global `output_path` set above.\n",
    "run_noisy_miav_over_baselines(split_seeds=split_seeds, percent=percent)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8201b7a1-30a9-49db-8311-561369b956c4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Value passed as `percent` to this run; it also names the output sub-directory and the Feather files.\n",
    "percent = 0.2\n",
    "output_path = \"/home/jovyan/baseline_comparisons/outputs_noisy_miav/\" + f\"noise_percent_{percent}/\"\n",
    "# Create the directory up front: the run writes its Feather files here, and nothing else\n",
    "# in this notebook creates it.\n",
    "os.makedirs(output_path, exist_ok=True)\n",
    "\n",
    "# run_noisy_miav_over_baselines picks up the global `output_path` set above.\n",
    "run_noisy_miav_over_baselines(split_seeds=split_seeds, percent=percent)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6032783f-d40b-49c1-825d-ac1703eb618e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Value passed as `percent` to this run; it also names the output sub-directory and the Feather files.\n",
    "percent = 0.25\n",
    "output_path = \"/home/jovyan/baseline_comparisons/outputs_noisy_miav/\" + f\"noise_percent_{percent}/\"\n",
    "# Create the directory up front: the run writes its Feather files here, and nothing else\n",
    "# in this notebook creates it.\n",
    "os.makedirs(output_path, exist_ok=True)\n",
    "\n",
    "# run_noisy_miav_over_baselines picks up the global `output_path` set above.\n",
    "run_noisy_miav_over_baselines(split_seeds=split_seeds, percent=percent)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0e46456b-d055-4bda-a954-c70f9c4e6a17",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Value passed as `percent` to this run; it also names the output sub-directory and the Feather files.\n",
    "# The f-string renders 0.30 as 0.3, so the directory is noise_percent_0.3/.\n",
    "percent = 0.30\n",
    "output_path = \"/home/jovyan/baseline_comparisons/outputs_noisy_miav/\" + f\"noise_percent_{percent}/\"\n",
    "# Create the directory up front: the run writes its Feather files here, and nothing else\n",
    "# in this notebook creates it.\n",
    "os.makedirs(output_path, exist_ok=True)\n",
    "\n",
    "# run_noisy_miav_over_baselines picks up the global `output_path` set above.\n",
    "run_noisy_miav_over_baselines(split_seeds=split_seeds, percent=percent)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
