{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "KOuI2yDR4aqS"
   },
   "outputs": [],
   "source": [
    "import copy\n",
    "import math\n",
    "import pickle\n",
    "\n",
    "import yaml\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "\n",
    "# from sklearn.svm import SVC, LinearSVC\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.linear_model import Perceptron\n",
    "from sklearn.metrics import classification_report, mean_squared_error\n",
    "from sklearn.linear_model import LassoCV, LassoLarsCV\n",
    "\n",
    "from joblib import Parallel, delayed\n",
    "\n",
    "np.random.seed(336)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "xBJIz5mw7Amp"
   },
   "source": [
    "# Functions for Preparing Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "NkoJcBUc4dfp"
   },
   "outputs": [],
   "source": [
    "def compute_margin(x, w):\n",
    "  return np.dot(x, w.T)\n",
    "\n",
    "def generate_nd_sphere_grid(dim, resolution, round_off=10):\n",
    "    \"\"\"\n",
    "    Generate a grid of points on an N-dimensional sphere.\n",
    "\n",
    "    Parameters:\n",
    "        dim (int): The dimensionality of the sphere.\n",
    "        resolution (int): The number of points per dimension.\n",
    "    Returns:\n",
    "        ndarray: Array of shape (num_points, dim) representing the grid points.\n",
    "    \"\"\"\n",
    "    # Create arrays for each angle in spherical coordinates\n",
    "    angles = [np.linspace(0, np.pi, resolution, endpoint=False) for _ in range(dim - 1)]\n",
    "    angles[-1] = np.linspace(0, 2 * np.pi, resolution, endpoint=False)  # Full range for azimuthal angle\n",
    "    grids = np.meshgrid(*angles, indexing='ij')  # Generate a meshgrid for all angles\n",
    "    coords = []  # Convert spherical coordinates to Cartesian coordinates\n",
    "    for i in range(dim):\n",
    "        coord = np.ones_like(grids[0])  # Start with ones\n",
    "        for j in range(i):\n",
    "            coord *= np.sin(grids[j])  # Multiply by sin of previous angles\n",
    "        if i < dim - 1:\n",
    "            coord *= np.cos(grids[i])  # Multiply by cos of the current angle\n",
    "        coords.append(coord)    # Stack Cartesian coordinates and reshape into a list of points\n",
    "    cartesian_coords = np.stack(coords, axis=-1).reshape(-1, dim)    # round the points to 10 decimal places\n",
    "    cartesian_coords = np.round(cartesian_coords, round_off)\n",
    "\n",
    "    return cartesian_coords\n",
    "\n",
    "\n",
    "def prepare_training_set(X, y, n):\n",
    "\n",
    "    if n < 2:\n",
    "        raise ValueError(\"Need at least two examples to ensure both classes (+1 and -1) are present in the data.\")\n",
    "\n",
    "    indices_pos = np.where(y == 1)[0]\n",
    "    indices_neg = np.where(y == -1)[0]\n",
    "\n",
    "    if len(indices_pos) == 0 or len(indices_neg) == 0:\n",
    "        raise ValueError(\"Both classes (+1 and -1) must be present in the data.\")\n",
    "\n",
    "    indices_pos = np.random.choice(indices_pos, size=n//2,\n",
    "                                          replace=False)\n",
    "    indices_neg = np.random.choice(indices_neg,\n",
    "                                          size=n - (n//2),\n",
    "                                          replace=False)\n",
    "    indices_train = np.concatenate([indices_pos, indices_neg])\n",
    "\n",
    "    return X[indices_train], y[indices_train], indices_train\n",
    "\n",
    "\n",
    "def train_linear_classifier(X, y, classifier_type=\"perceptron\"):\n",
    "\n",
    "    if classifier_type == \"perceptron\":\n",
    "        error_tol = 1e-6\n",
    "        max_steps = 1000000\n",
    "        # model = Perceptron(fit_intercept=False, tol=error_tol,\n",
    "        #                    max_iter=max_steps).fit(X, y)\n",
    "        model = Perceptron(fit_intercept=False).fit(X, y)\n",
    "        return model\n",
    "\n",
    "\n",
    "def generate_subset_small(X, y, P_universe):\n",
    "    \"\"\"\n",
    "    Generate a single subset based on the given parameters.\n",
    "\n",
    "    Parameters:\n",
    "        X: Features of the universe.\n",
    "        y: Labels of the universe.\n",
    "        P_universe: Probability distribution over the universe.\n",
    "\n",
    "    Returns:\n",
    "        A tuple containing the subset (X, y) and the subset indices.\n",
    "    \"\"\"\n",
    "    subset_size = 10\n",
    "    subset_indices = np.random.choice(len(X), subset_size, replace=False,\n",
    "                                      p=P_universe)\n",
    "    subset_X, subset_y = X[subset_indices], y[subset_indices]\n",
    "    return (subset_X, subset_y), subset_indices\n",
    "\n",
    "def generate_subset(X, y, P_universe, subsampling_frac):\n",
    "    \"\"\"\n",
    "    Generate a single subset based on the given parameters.\n",
    "\n",
    "    Parameters:\n",
    "        X: Features of the universe.\n",
    "        y: Labels of the universe.\n",
    "        P_universe: Probability distribution over the universe.\n",
    "        subsampling_frac: Fraction of the universe used for the subset.\n",
    "\n",
    "    Returns:\n",
    "        A tuple containing the subset (X, y) and the subset indices.\n",
    "    \"\"\"\n",
    "\n",
    "    subset_size = int(len(X) * subsampling_frac)\n",
    "\n",
    "    if subset_size < 2:\n",
    "        raise ValueError(\"Subset size must be at least 2 to include both classes.\")\n",
    "\n",
    "    indices_pos = np.where(y == 1)[0]\n",
    "    indices_neg = np.where(y == -1)[0]\n",
    "\n",
    "    if len(indices_pos) == 0 or len(indices_neg) == 0:\n",
    "        raise ValueError(\"Both classes (+1 and -1) must be present in the data.\")\n",
    "\n",
    "    subset_indices_pos = np.random.choice(indices_pos, size=subset_size//2,\n",
    "                                          replace=False)\n",
    "    subset_indices_neg = np.random.choice(indices_neg,\n",
    "                                          size=subset_size - (subset_size//2),\n",
    "                                          replace=False)\n",
    "    subset_indices = np.concatenate([subset_indices_pos, subset_indices_neg])\n",
    "    subset_X, subset_y = X[subset_indices], y[subset_indices]\n",
    "    return (subset_X, subset_y), subset_indices\n",
    "\n",
    "\n",
    "\n",
    "def prepare_data_parallel(universe, P_universe, subsampling_frac,\n",
    "                          num_subsets, n_jobs=8):\n",
    "    \"\"\"\n",
    "    Sample and store subsets for training datamodels using parallel processing.\n",
    "\n",
    "    Parameters:\n",
    "          universe: Tuple (X, y), the dataset to sample subsets from.\n",
    "          P_universe: Probability distribution over the universe.\n",
    "          subsampling_frac: Fraction of the universe used for each subset.\n",
    "          num_subsets: Number of subsets to generate.\n",
    "          n_jobs: Number of parallel jobs (-1 uses all available cores).\n",
    "\n",
    "    Returns:\n",
    "          subsets: List of # num_subsets subsets of size |universe|*subsampling_frac.\n",
    "    \"\"\"\n",
    "    X, y = universe\n",
    "\n",
    "    results = Parallel(n_jobs=n_jobs)(\n",
    "        delayed(generate_subset)(X, y, P_universe, subsampling_frac) for _ in range(num_subsets)\n",
    "    )\n",
    "    datamodel_subsets, subset_indices_list = zip(*results)\n",
    "\n",
    "    return list(datamodel_subsets), list(subset_indices_list)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "TwXSD8Gn7F10"
   },
   "source": [
    "# Functions for estimating datamodels, computing teaching sets, evaluating NtN/Nature performance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "Od7xYQfk6lt0"
   },
   "outputs": [],
   "source": [
    "# Procedure 1: EstimateDataModel\n",
    "def estimate_data_model(universe, P_universe, datamodel_subsets, subset_indices,\n",
    "                        num_subsets, test_set_size, num_cross_val):\n",
    "    \"\"\"\n",
    "    Estimate the datamodel parameters using subsets of the universe.\n",
    "\n",
    "    Parameters:\n",
    "        universe: Tuple (X, y), the dataset to sample subsets from.\n",
    "        P_universe: Probability distribution over the universe.\n",
    "        datamodel_subsets:  List of subsets sampled from the universe.\n",
    "        subset_indices: Indices of the subsets of universe.\n",
    "        num_subsets: Number of subsets to sample from the universe.\n",
    "        test_set_size: Size of the test set.\n",
    "\n",
    "    Returns:\n",
    "        weight_vector: Estimated weight vector of the datamodel.\n",
    "    \"\"\"\n",
    "    X, y = universe\n",
    "    d = X.shape[0]\n",
    "    datamodel_training_set = []\n",
    "\n",
    "    max_steps = 1000000\n",
    "    error_tol = 1e-6\n",
    "\n",
    "    X_test, y_test = universe[0], universe[1]\n",
    "\n",
    "    for i in range(num_subsets):\n",
    "\n",
    "        subset_X, subset_y = datamodel_subsets[i][0], datamodel_subsets[i][1]\n",
    "\n",
    "        # Train a linear model on the subset\n",
    "        # model = Perceptron(fit_intercept=False).fit(subset_X, subset_y)\n",
    "        # model = Perceptron(fit_intercept=False, tol=error_tol,\n",
    "        #                    max_iter=max_steps).fit(subset_X, subset_y)\n",
    "        model = train_linear_classifier(subset_X,\n",
    "                                        subset_y,\n",
    "                                        classifier_type=\"perceptron\")\n",
    "\n",
    "        # Sample a test set\n",
    "        # test_indices = np.random.choice(len(X), test_set_size, replace=False,\n",
    "        #                                 p=P_universe)\n",
    "        # X_test, y_test = X[test_indices], y[test_indices]\n",
    "\n",
    "        # Compute empirical risk on the test set\n",
    "        # Use 0-1 loss (preferably) or perceptron loss as risk\n",
    "        margin_test = compute_margin(X_test, model.coef_).reshape(-1,)\n",
    "        y_test_pred = 2*(margin_test >= 0) - 1\n",
    "        risk_test = y_test_pred != y_test\n",
    "        # risk_test = np.maximum(0, - y_test * margin_test)\n",
    "        risk_test_avg = np.average(risk_test)\n",
    "\n",
    "        # Update datamodel training set\n",
    "        subset_indicator = np.zeros(len(X))\n",
    "        subset_indicator[subset_indices[i]] = 1\n",
    "        datamodel_training_set.append((subset_indicator, risk_test_avg))\n",
    "\n",
    "    # Run regression to predict weight vector\n",
    "    X_regression = np.array([item[0] for item in datamodel_training_set])\n",
    "    y_regression = np.array([item[1] for item in datamodel_training_set])\n",
    "\n",
    "    X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(X_regression,\n",
    "                                                                      y_regression,\n",
    "                                                                      test_size=0.2,\n",
    "                                                                      random_state=42)\n",
    "    print(\"\\n==================\\n\")\n",
    "    print(\"Printing the training set for datamodel training\\n\")\n",
    "    print(X_regression.shape, y_regression.shape)\n",
    "    print(X_regression, y_regression)\n",
    "    print(\"\\n==================\\n\")\n",
    "\n",
    "    # L1 regression\n",
    "    # regression_model = LassoCV(cv=5, random_state=0, n_jobs=8).fit(X_reg_train, y_reg_train)\n",
    "    regression_model = LassoLarsCV(fit_intercept=False, cv=num_cross_val, n_jobs=8).fit(X_reg_train, y_reg_train)\n",
    "    weight_vector = regression_model.coef_\n",
    "\n",
    "    # Compute MSE on held-out test-set and during Cross-Validation\n",
    "    y_reg_test_pred = regression_model.predict(X_reg_test)\n",
    "    mse_datamodels = mean_squared_error(y_reg_test, y_reg_test_pred)\n",
    "    for i in range(len(regression_model.alphas_)):\n",
    "        if regression_model.alpha_ == regression_model.alphas_[i]:\n",
    "            index_alpha = copy.deepcopy(i)\n",
    "            break\n",
    "    mse_datamodels_CV = np.average(regression_model.mse_path_[index_alpha,:])\n",
    "\n",
    "    print(f\"Datamodels: MSE = {mse_datamodels}, MSE_CV = {mse_datamodels_CV}\")\n",
    "\n",
    "    return weight_vector\n",
    "\n",
    "# Procedure 2: ComputeTeachingSet\n",
    "def compute_teaching_set(teaching_budget, nature_budget, weight_vector,\n",
    "                         universe, P_universe):\n",
    "    \"\"\"\n",
    "    Select examples with the smallest weight values to construct teaching set.\n",
    "\n",
    "    Parameters:\n",
    "        teaching_budget: Number of examples to select.\n",
    "        nature_budget: # i.i.d. examples sampled as per P_universe for training model.\n",
    "        weight_vector: Estimated weight vector.\n",
    "        universe: Tuple (X, y), the dataset to sample from.\n",
    "        P_universe: Probability distribution over the universe.\n",
    "\n",
    "    Returns:\n",
    "        teaching_set: subset to be used for limited-budget teaching.\n",
    "    \"\"\"\n",
    "    X, y = universe\n",
    "    weights = weight_vector * (1 - np.power(P_universe, nature_budget))\n",
    "    sorted_indices = np.argsort(weights)\n",
    "    selected_indices = sorted_indices[:teaching_budget]\n",
    "    X_teach = X[selected_indices]\n",
    "    y_teach = y[selected_indices]\n",
    "\n",
    "    return X_teach, y_teach, selected_indices\n",
    "\n",
    "\n",
    "\n",
    "# Procedure 3: EvaluateTeachingSet\n",
    "def evaluate_teaching_set(universe, test_set_size,\n",
    "                          trained_models, P_universe):\n",
    "    \"\"\"\n",
    "    Evaluate the risk of the models trained with limited-budget teaching.\n",
    "\n",
    "    Parameters:\n",
    "        universe: Tuple (X, y), the dataset to sample from.\n",
    "        test_set_size: Size of the test set.\n",
    "        trained_models: List of trained models.\n",
    "        P_universe: Probability distribution over the universe.\n",
    "\n",
    "    Returns:\n",
    "        risk: Average risk of trained models\n",
    "    \"\"\"\n",
    "    X_test, y_test = universe[0], universe[1]\n",
    "    risks = []\n",
    "\n",
    "    for i in range(len(trained_models)):\n",
    "        # Evaluate on the test set\n",
    "        # Use 0-1 loss (preferably) or perceptron loss as risk\n",
    "        margin_test = compute_margin(X_test, trained_models[i].coef_).reshape(-1, )\n",
    "        y_test_pred = 2*(margin_test >= 0) - 1\n",
    "        risk = y_test != y_test_pred\n",
    "        # risk =  np.maximum(0, - y_test * margin_test)\n",
    "        risk_avg = np.average(risk, weights=P_universe)\n",
    "        risks.append(risk_avg)\n",
    "    NtN_risk = np.mean(risks)\n",
    "    return NtN_risk, risks\n",
    "\n",
    "\n",
    "# Procedure 4: TrackNtNPerformance\n",
    "def track_ntn_performance(universe, teaching_budget, max_nature_budget,\n",
    "                          datamodel_subsets, subset_indices_list,\n",
    "                          num_models, P_universe, test_set_size,\n",
    "                          num_cross_val):\n",
    "    \"\"\"\n",
    "    Track and plot NtN performance as a function of the nature budget.\n",
    "\n",
    "    Parameters:\n",
    "        universe: Tuple (X, y), the dataset for training and evaluation.\n",
    "        teaching_budget: Number of examples to select for teaching.\n",
    "        max_nature_budget: Maximum nature budget to evaluate.\n",
    "        num_models: Number of models to train for evaluation.\n",
    "        P_universe: Probability distribution over the universe.\n",
    "        test_set_size: Size of the test set.\n",
    "\n",
    "    Returns:\n",
    "        None\n",
    "    \"\"\"\n",
    "    X, y = universe\n",
    "    risks = []\n",
    "\n",
    "    # Estimate the datamodel\n",
    "    print(\"Estimating the datamodel currently...\\n\")\n",
    "    weight_vector = estimate_data_model(universe, P_universe, datamodel_subsets,\n",
    "                                        subset_indices_list, num_subsets = num_subsets,\n",
    "                                        test_set_size=test_set_size,\n",
    "                                        num_cross_val=num_cross_val)\n",
    "    print(f\"weight_vector: {weight_vector}\",\"\\n\")\n",
    "    print(f\"weight_vector > 0.: {np.sum(np.abs(weight_vector) > 0.)}, \\\n",
    "          {weight_vector[np.abs(weight_vector) > 0.]}\\n\")\n",
    "    print(\"The datamodel has been estimated...\\n\")\n",
    "\n",
    "    risk_train_list = []\n",
    "\n",
    "    risks_test_all = np.zeros((max_nature_budget-1, num_models))\n",
    "\n",
    "    for nature_budget in range(2, max_nature_budget + 1):\n",
    "        # Compute the teaching set\n",
    "        X_teach, y_teach, indices_teach = compute_teaching_set(teaching_budget,\n",
    "                                            max_nature_budget,\n",
    "                                            weight_vector,\n",
    "                                            universe,\n",
    "                                            P_universe)\n",
    "\n",
    "        trained_models = []\n",
    "        risk_train_list = []\n",
    "        for k in range(1, num_models + 1):\n",
    "\n",
    "            # Train a model on the train set\n",
    "            _, _, indices_train = prepare_training_set(X, y, nature_budget)\n",
    "            indices_combined = np.union1d(indices_teach, indices_train)\n",
    "\n",
    "            X_train, y_train = X[indices_combined], y[indices_combined]\n",
    "\n",
    "            model = train_linear_classifier(X_train,\n",
    "                                            y_train,\n",
    "                                            classifier_type=\"perceptron\")\n",
    "            trained_models.append(model)\n",
    "\n",
    "            margin_train = compute_margin(X_train, model.coef_).reshape(-1, )\n",
    "            y_train_pred = 2*(margin_train >= 0) - 1\n",
    "            risk_train = y_train != y_train_pred\n",
    "            # risk =  np.maximum(0, - y_train * margin_train)\n",
    "            risk_train_avg = np.average(risk_train)\n",
    "            risk_train_list.append(risk_train_avg)\n",
    "\n",
    "        # Evaluate the teaching set\n",
    "\n",
    "        risk, risks_list = evaluate_teaching_set(universe, test_set_size, trained_models,\n",
    "                                     P_universe)\n",
    "        risks.append(risk)\n",
    "        risks_test_all[nature_budget-2, :] = np.array(risks_list)\n",
    "\n",
    "    return weight_vector, risks, risks_test_all\n",
    "\n",
    "# Procedure 5: TrackNaturePerformance\n",
    "def track_nature_performance(universe, max_nature_budget,\n",
    "                             num_models, P_universe,\n",
    "                             test_set_size, num_cross_val):\n",
    "    \"\"\"\n",
    "    Track and plot Nature performance as a function of the nature budget.\n",
    "\n",
    "    Parameters:\n",
    "        universe: Tuple (X, y), the dataset for training and evaluation.\n",
    "        max_nature_budget: Maximum nature budget to evaluate.\n",
    "        num_models: Number of models to train for evaluation.\n",
    "        P_universe: Probability distribution over the universe.\n",
    "        test_set_size: Size of the test set.\n",
    "\n",
    "    Returns:\n",
    "        None\n",
    "    \"\"\"\n",
    "    X, y = universe\n",
    "    risks = []\n",
    "\n",
    "    risks_test_all = np.zeros((max_nature_budget-1, num_models))\n",
    "\n",
    "    for nature_budget in range(2, max_nature_budget + 1):\n",
    "        trained_models = []\n",
    "        risk_train_list = []\n",
    "        for k in range(1, num_models + 1):\n",
    "\n",
    "            # Train a model on the train set\n",
    "            X_train, y_train, indices_train = prepare_training_set(X,\n",
    "                                                                   y,\n",
    "                                                                   nature_budget\n",
    "                                                                   )\n",
    "            X_train, y_train = X[indices_train], y[indices_train]\n",
    "            model = train_linear_classifier(X_train,\n",
    "                                            y_train,\n",
    "                                            classifier_type=\"perceptron\")\n",
    "            trained_models.append(model)\n",
    "\n",
    "            margin_train = compute_margin(X_train, model.coef_).reshape(-1, )\n",
    "            y_train_pred = 2*(margin_train >= 0) - 1\n",
    "            risk_train = y_train != y_train_pred\n",
    "            # risk =  np.maximum(0, - y_train * margin_train)\n",
    "            risk_train_avg = np.average(risk_train,\n",
    "                                  weights=P_universe[indices_train]/np.sum(P_universe[indices_train]))\n",
    "            risk_train_list.append(risk_train_avg)\n",
    "\n",
    "        # Evaluate the performance on nature set\n",
    "\n",
    "        risk, risks_list = evaluate_teaching_set(universe, test_set_size, trained_models,\n",
    "                                     P_universe)\n",
    "        risks.append(risk)\n",
    "        risks_test_all[nature_budget-2, :] = np.array(risks_list)\n",
    "\n",
    "    return risks, risks_test_all\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "q0k_v4Gq7Oes"
   },
   "source": [
    "# Config for our 2D Dataset - X = 16 pts. over Unit Circle, Evaluate NtN perf."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "M1wl3A9o6p00"
   },
   "outputs": [],
   "source": [
    "classifier_type = \"sklearn.linear_model.Perceptron\"\n",
    "datamodel_type = \"sklearn.linear_model.LassoLarsCV\"\n",
    "\n",
    "path_nurture_nature_data = \"/nobackup/nurture-nature/data/lin_classif_datamodels_NtN_data.pickle\"\n",
    "path_nurture_nature_plots = \"/nobackup/nurture-nature/plots/\"\n",
    "path_datamodel_data = \"/nobackup/nurture-nature/data/lin_classif_datamodels_subset_data.pickle\"\n",
    "\n",
    "data_dimension = 2\n",
    "universe_size = 4**data_dimension # keep it in multiples of 4\n",
    "subsampling_frac = 0.25\n",
    "num_subsets = 1000\n",
    "teaching_budget = 2\n",
    "max_nature_budget = 4**data_dimension\n",
    "test_set_size = copy.deepcopy(universe_size) # 10\n",
    "num_models = 20\n",
    "num_cross_val = 4 # For L1 regression (estimating datamodels)\n",
    "\n",
    "n_jobs = 8\n",
    "\n",
    "# Define our universe\n",
    "X = generate_nd_sphere_grid(data_dimension, universe_size, round_off=10)\n",
    "\n",
    "w_true = copy.deepcopy(X[3,:]) # np.random.randn(data_dimension)\n",
    "y = 2*(compute_margin(X, w_true) >= 0) - 1\n",
    "universe = (X, y)\n",
    "P_universe = np.array(len(X)*[1/len(X)])\n",
    "\n",
    "# Save YAML configuration file\n",
    "config = {\n",
    "\"classifier\": {\n",
    "    \"type\": classifier_type},\n",
    "\"datamodel\": {\n",
    "    \"type\": datamodel_type},\n",
    "\"data\": {\n",
    "    \"universe_size\": universe_size,\n",
    "    \"P_universe\": \"uniform(X)\",\n",
    "    \"data_dimension\": data_dimension,\n",
    "    \"subsampling_frac\": subsampling_frac,\n",
    "    \"num_subsets\": num_subsets,\n",
    "    \"teaching_budget\": teaching_budget,\n",
    "    \"max_nature_budget\": max_nature_budget,\n",
    "    \"test_set_size\": test_set_size,\n",
    "    \"num_models\": num_models,\n",
    "    \"num_cross_val\": num_cross_val,\n",
    "    \"random_state\": 42},\n",
    "\"paths\": {\n",
    "    \"path_datamodel_data\": path_datamodel_data,\n",
    "    \"path_nurture_nature_data\": path_nurture_nature_data,\n",
    "    \"path_nurture_nature_plots\": path_nurture_nature_plots},\n",
    "}\n",
    "\n",
    "with open(\"config.yaml\", \"w\") as f:\n",
    "    config = yaml.dump(config, stream=f, default_flow_style=False,\n",
    "                        sort_keys=False)\n",
    "f.close()\n",
    "\n",
    "# Generate subsets of universe to train the datamodel (parallelized)\n",
    "datamodel_subsets, subset_indices_list = prepare_data_parallel(universe,\n",
    "                                                      P_universe,\n",
    "                                                      subsampling_frac,\n",
    "                                                      num_subsets,\n",
    "                                                      n_jobs)\n",
    "print(\"Subsets for training the datamodel have been prepared...\\n\")\n",
    "\n",
    "# Track and plot NtN performance\n",
    "datamodel_weight_vector, NtN_risks, risks_test_all_NtN = track_ntn_performance(universe,\n",
    "                                                            teaching_budget,\n",
    "                                                            max_nature_budget,\n",
    "                                                            datamodel_subsets,\n",
    "                                                            subset_indices_list,\n",
    "                                                            num_models,\n",
    "                                                            P_universe,\n",
    "                                                            test_set_size,\n",
    "                                                            num_cross_val\n",
    "                                                            )\n",
    "nature_risks, risks_test_all_nature = track_nature_performance(universe, max_nature_budget,\n",
    "                                        num_models, P_universe,\n",
    "                                        test_set_size, num_cross_val\n",
    "                                        )\n",
    "\n",
    "print(f\"Points in universe corresponding to bottom-10 datamodel weights: \\\n",
    "      X = {X[np.argsort(datamodel_weight_vector)[:10]]}, \\\n",
    "      y = {y[np.argsort(datamodel_weight_vector)[:10]]}\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "nIwL6Nm-7bsh"
   },
   "source": [
    "# Scatterplots for comparing opt-VC and opt-DM teaching sets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "RzM7DWRQ6yTj"
   },
   "outputs": [],
   "source": [
    "# Plot w* and visually inspect where do the teaching points lie\n",
    "\n",
    "points = X[np.argsort(datamodel_weight_vector)[:10]]\n",
    "rem_points = X[np.argsort(datamodel_weight_vector)[teaching_budget:]]\n",
    "rot_90 = np.array([[0,-1],[1,0]])\n",
    "rot_270 = np.array([[0,1],[-1,0]])\n",
    "proposed_teaching_set = np.zeros((data_dimension+1, data_dimension))\n",
    "proposed_teaching_set[0,:] = rot_90 @ w_true\n",
    "proposed_teaching_set[1,:] = rot_270 @ w_true\n",
    "proposed_teaching_set[2,:] = - copy.deepcopy(w_true)\n",
    "\n",
    "w_star = copy.deepcopy(w_true)# plot w_star as a arrow line\n",
    "\n",
    "# Plot config\n",
    "font_size = 20\n",
    "marker_size = 100\n",
    "plt.rcParams.update({'font.size': font_size})\n",
    "plt.rcParams[\"text.usetex\"] = True  # # Enable LaTeX rendering\n",
    "fig, ax = plt.subplots(1, 2, figsize=(10, 5), dpi=1200)\n",
    "\n",
    "ax[0].scatter(X[:, 0], X[:, 1], s=marker_size, marker='o', c='gray')\n",
    "ax[0].scatter(proposed_teaching_set[:, 0], proposed_teaching_set[:, 1],\n",
    "              s=marker_size, label='teaching set (opt-VC)', marker='s', c='g')\n",
    "ax[0].arrow(0, 0, w_star[0], w_star[1], head_width=0.1, head_length=0.1,fc='b',ec='b')\n",
    "ax[0].text(w_star[0] + 0.1, w_star[1] + 0.1, r'\\Huge $w^*$', fontsize=12, color='blue')\n",
    "ax[0].legend(loc='upper right', bbox_to_anchor=(1, 1.2))\n",
    "ax[0].grid(True)\n",
    "ax[0].set_xticklabels([])  # Remove x-axis tick labels\n",
    "ax[0].set_yticklabels([])  # Remove y-axis tick labels\n",
    "ax[0].set_xlabel('x', fontsize=30)\n",
    "ax[0].set_ylabel('y', fontsize=30)\n",
    "ax[0].set_xlim(-1.5, 1.5)\n",
    "ax[0].set_ylim(-1.5, 1.5)\n",
    "\n",
    "\n",
    "ax[1].scatter(X[:, 0], X[:, 1], s=marker_size, marker='o', c='gray')\n",
    "ax[1].scatter(X[np.argsort(datamodel_weight_vector)[:teaching_budget+1], 0],\n",
    "            X[np.argsort(datamodel_weight_vector)[:teaching_budget+1], 1],\n",
    "            s=marker_size, label='teaching set (opt-DM)', marker='x', c='r')\n",
    "ax[1].arrow(0, 0, w_star[0], w_star[1], head_width=0.1, head_length=0.1,fc='b',ec='b')\n",
    "ax[1].text(w_star[0] + 0.1, w_star[1] + 0.1, r'\\Huge $w^*$', fontsize=12, color='blue')\n",
    "for i, (x_i, y_i) in enumerate(X[np.argsort(datamodel_weight_vector)[:data_dimension+1]]):\n",
    "    plt.text(x_i + 0.08, y_i, str(i+1), fontsize=25, color='black')\n",
    "ax[1].grid(True)\n",
    "ax[1].set_xticklabels([])  # Remove x-axis tick labels\n",
    "ax[1].set_yticklabels([])  # Remove y-axis tick labels\n",
    "ax[1].set_xlabel('x', fontsize=30)\n",
    "ax[1].set_ylabel('y', fontsize=30)\n",
    "ax[1].legend(loc='upper right', bbox_to_anchor=(1, 1.2))\n",
    "ax[1].set_xlim(-1.5, 1.5)\n",
    "ax[1].set_ylim(-1.5, 1.5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "dGdUF_ji7hph"
   },
   "source": [
    "# Plotting average risk curve as a function of $n_{iid}$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "hZhFlD6w61pm"
   },
   "outputs": [],
   "source": [
    "# Plot avg. risk curve as a function of nature budget\n",
    "plt.figure(figsize=(10, 6))\n",
    "plt.plot(range(2+teaching_budget, teaching_budget + max_nature_budget + 1), NtN_risks, marker='o', c='r', label='teach+i.i.d.')\n",
    "plt.plot(range(2, max_nature_budget + 1), nature_risks, marker='x', c='c', label='i.i.d.')\n",
    "plt.xlabel(\"Nature+Teaching Budget\")\n",
    "plt.ylabel(\"Risk\")\n",
    "plt.title(f\"NtN and Nature Performance vs Nature Budget (universe:{universe_size}, B={teaching_budget})\")\n",
    "plt.grid(True)\n",
    "plt.legend()\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
