{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "def load_data(path, filter_thre=0.05, methods=None):\n",
    "    \"\"\"Load one results CSV and tag every row with its experiment's winning methods.\n",
    "\n",
    "    Rows are grouped by the dataset descriptors ('n', 'sc', 'ci', 'ai',\n",
    "    'var_causal', 'd_feat') and the following columns are added:\n",
    "\n",
    "    * 'rank': rank of 'wga_te_err' inside the group (ties broken by row order),\n",
    "    * 'wga_te_err_var': variance of 'wga_te_err' inside the group,\n",
    "    * 'winners': '|'-joined names of the methods whose error is within\n",
    "      `filter_thre` of the group's smallest error,\n",
    "    * 'multi_hot': list of 0/1 flags marking which entries of `methods` won.\n",
    "\n",
    "    Args:\n",
    "        path: location of the CSV file, handed to `pd.read_csv`.\n",
    "        filter_thre: tolerance added to the smallest error when deciding ties.\n",
    "        methods: method names fixing the order of the 'multi_hot' vector;\n",
    "            defaults to the five methods used in this notebook.\n",
    "\n",
    "    Returns:\n",
    "        The annotated DataFrame ('n' is converted to str).\n",
    "    \"\"\"\n",
    "    group_keys = ['n', 'sc', 'ci', 'ai', 'var_causal', 'd_feat']\n",
    "    if methods is None:\n",
    "        methods = [\"ERM\", \"GroupDRO\", \"remax-margin\", \"oversample\", \"undersample\"]\n",
    "\n",
    "    all_df = pd.read_csv(path)\n",
    "    all_df['n'] = all_df['n'].astype('str')\n",
    "\n",
    "    # Compute both per-group statistics before adding any column to the frame.\n",
    "    err_by_group = all_df.groupby(group_keys)['wga_te_err']\n",
    "    err_rank = err_by_group.rank(method=\"first\")\n",
    "    err_var = err_by_group.transform('var')\n",
    "    all_df['rank'] = err_rank\n",
    "    all_df['wga_te_err_var'] = err_var\n",
    "\n",
    "    def get_gt_rank(x):\n",
    "        # Every method within `filter_thre` of the best error counts as a winner.\n",
    "        min_err = x['wga_te_err'].min()\n",
    "        winners = x[x['wga_te_err'] <= min_err + filter_thre][\"method\"].to_list()\n",
    "        return '|'.join(winners)\n",
    "\n",
    "    # One winner string per group, then broadcast back onto every row of that group.\n",
    "    winners_series = (\n",
    "        all_df.groupby(group_keys)[['method', 'wga_te_err']]\n",
    "        .apply(get_gt_rank)\n",
    "        .reset_index(name='winners')\n",
    "    )\n",
    "    all_df = all_df.merge(winners_series, on=group_keys)\n",
    "\n",
    "    all_df[\"multi_hot\"] = all_df[\"winners\"].map(lambda x: [1 if m in x.split(\"|\") else 0 for m in methods])\n",
    "\n",
    "    return all_df\n",
    "\n",
    "# Directory holding the per-shift result tables (presumably machine-specific; adjust as needed).\n",
    "RESULTS_DIR = \"//exps/div_explore/toy_v2\"\n",
    "\n",
    "def _results_path(tag):\n",
    "    \"\"\"Build the CSV path of the seed-0 results table named by `tag`.\"\"\"\n",
    "    return f\"{RESULTS_DIR}/results_seed0_{tag}.csv\"\n",
    "\n",
    "sc_df = load_data(_results_path(\"sc\"))\n",
    "ci_df = load_data(_results_path(\"ci\"))\n",
    "ai_df = load_data(_results_path(\"ai\"))\n",
    "mix_df = load_data(_results_path(\"3shifts\"))\n",
    "\n",
    "all_df = pd.concat([sc_df, ci_df, ai_df, mix_df], ignore_index=True)\n",
    "# Keep the row ranked first by worst-group test error within each group of every table.\n",
    "all_df_rank = all_df[all_df[\"rank\"]==1.0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview only the first rows instead of rendering the whole frame.\n",
    "all_df_rank.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scratch arithmetic: 80% of 9576 -- presumably the size of the training split; TODO confirm where 9576 comes from.\n",
    "9576*0.8"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def eval_acc(y_test, y_pred, mode, verbose=True):\n",
    "    \"\"\"Score multi-hot predictions against multi-hot targets.\n",
    "\n",
    "    Args:\n",
    "        y_test: 2-D array of 0/1 targets, one row per sample.\n",
    "        y_pred: 2-D array of 0/1 predictions with the same shape.\n",
    "        mode: \"0-1\" counts a row as correct only when every entry matches;\n",
    "            \"soft 0-1\" counts a row as correct when every predicted 1 is also\n",
    "            a 1 in the target (a row with no predicted 1 counts as correct).\n",
    "        verbose: when True, print the resulting accuracy.\n",
    "\n",
    "    Returns:\n",
    "        Fraction of rows counted as correct.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if `mode` is neither \"0-1\" nor \"soft 0-1\".\n",
    "    \"\"\"\n",
    "    y_test = np.asarray(y_test)\n",
    "    y_pred = np.asarray(y_pred)\n",
    "    if mode == \"0-1\":\n",
    "        acc = (y_pred == y_test).all(axis=1).mean()\n",
    "    elif mode == \"soft 0-1\":\n",
    "        # A row fails only if some position is predicted 1 but is not 1 in the target.\n",
    "        false_positive = (y_pred == 1) & (y_test != 1)\n",
    "        acc = (~false_positive.any(axis=1)).mean()\n",
    "    else:\n",
    "        # Previously an unknown mode fell through to `return acc` with acc unbound.\n",
    "        raise ValueError(f\"unknown mode {mode}\")\n",
    "    if verbose:\n",
    "        print(f\"Eval {mode} accuracy: {acc}\")\n",
    "    return acc\n",
    "\n",
    "def eval_wga_err(test_df, y_pred, df_full, identifier):\n",
    "    \"\"\"Average worst-group test error obtained by following the predicted winners.\n",
    "\n",
    "    For every test experiment one method is drawn uniformly at random from the\n",
    "    predicted winner set (from all of the module-level `methods` when nothing was\n",
    "    predicted) and that method's 'wga_te_err' is looked up in `df_full`.\n",
    "\n",
    "    Args:\n",
    "        test_df: DataFrame with one row per test experiment, containing the\n",
    "            `identifier` columns. It is copied, not modified.\n",
    "        y_pred: multi-hot predictions (list or numpy array), one row per row of\n",
    "            `test_df`, ordered like the module-level `methods` list.\n",
    "        df_full: DataFrame with the `identifier` columns plus 'method' and\n",
    "            'wga_te_err' for every (experiment, method) pair.\n",
    "        identifier: list of column names that identify an experiment.\n",
    "\n",
    "    Returns:\n",
    "        Mean 'wga_te_err' of the selected methods over the test experiments.\n",
    "    \"\"\"\n",
    "    if not isinstance(y_pred, list):\n",
    "        y_pred = y_pred.tolist()\n",
    "\n",
    "    method_names = np.array(methods)\n",
    "    # Work on a copy so the caller's frame (often a slice of a larger one) stays untouched.\n",
    "    test_df = test_df.copy()\n",
    "    test_df['pred'] = y_pred\n",
    "    # Translate each multi-hot vector into the names of the predicted winners.\n",
    "    test_df['winners_pred'] = [method_names[np.where(np.array(y_) == 1)[0]] for y_ in y_pred]\n",
    "\n",
    "    fine_test_df = df_full.merge(test_df, on=identifier)\n",
    "\n",
    "    def get_pred_wga_te_err(x):\n",
    "        pred_winners = x['winners_pred'].iloc[0]\n",
    "        # With no predicted winner, fall back to a random pick among all methods.\n",
    "        candidates = methods if len(pred_winners) == 0 else pred_winners\n",
    "        # NOTE: uses numpy's global RNG, so the result varies between runs unless it is seeded.\n",
    "        pred_winner = np.random.choice(candidates)\n",
    "        return x[x['method'] == pred_winner]['wga_te_err'].iloc[0]\n",
    "\n",
    "    # Column subsets of a groupby must be a list; a bare tuple of names is rejected by pandas >= 2.0.\n",
    "    pred_err = fine_test_df.groupby(identifier)[['wga_te_err', 'method', 'winners_pred']].apply(get_pred_wga_te_err)\n",
    "    mean_err = pred_err.mean()\n",
    "    print(f\"Eval wg err: {mean_err}\")\n",
    "    return mean_err\n",
    "\n",
    "\n",
    "import copy\n",
    "methods = [\"ERM\", \"GroupDRO\", \"remax-margin\", \"oversample\", \"undersample\"]\n",
    "def train_classifier(all_df_rank, tr_size=-1, remove_col=-1):\n",
    "    \"\"\"Train an MLP that predicts the winning-method set from the dataset descriptors.\n",
    "\n",
    "    Reads the module-level `methods` list and `all_df` frame in addition to its\n",
    "    arguments.\n",
    "\n",
    "    Args:\n",
    "        all_df_rank: DataFrame with one row per experiment holding the descriptor\n",
    "            columns ('n', 'sc', 'ci', 'ai', 'var_causal', 'd_feat') and 'multi_hot'.\n",
    "        tr_size: number of experiments to keep from the 80% training split; a\n",
    "            non-positive value, or one at least as large as the split, keeps all.\n",
    "        remove_col: index of a descriptor column to drop before training; a\n",
    "            negative value keeps every column.\n",
    "\n",
    "    Returns:\n",
    "        Tuple (te_acc, jac_acc, soft_te_acc, wge): exact-match test accuracy, mean\n",
    "        Jaccard similarity between true and predicted winner sets, soft 0-1\n",
    "        accuracy from `eval_acc`, and the mean error from `eval_wga_err`.\n",
    "    \"\"\"\n",
    "    from sklearn.preprocessing import StandardScaler\n",
    "    from sklearn.neural_network import MLPClassifier\n",
    "    from sklearn.model_selection import train_test_split\n",
    "\n",
    "    feature_cols = ['n', 'sc', 'ci', 'ai', 'var_causal', 'd_feat']\n",
    "    all_df_rank_data = all_df_rank[feature_cols + ['multi_hot']]\n",
    "\n",
    "    all_X = all_df_rank_data[feature_cols].to_numpy().astype('float')\n",
    "    all_y = np.array(all_df_rank_data['multi_hot'].to_list())\n",
    "\n",
    "    X_train, X_test, y_train, y_test, tr_idx, te_idx = train_test_split(all_X, all_y, range(len(all_X)), test_size=0.2, random_state=0)\n",
    "    if 0 < tr_size < X_train.shape[0]:\n",
    "        # An integer train_size keeps exactly tr_size samples; a fractional test_size\n",
    "        # can be off by one after rounding and is invalid when tr_size >= split size.\n",
    "        X_train, _, y_train, _ = train_test_split(X_train, y_train, train_size=tr_size, random_state=0)\n",
    "    print(\"train size:\", X_train.shape[0])\n",
    "\n",
    "    scaler = StandardScaler()\n",
    "    X_train = scaler.fit_transform(X_train)\n",
    "    X_test = scaler.transform(X_test)\n",
    "\n",
    "    # Optional leave-one-descriptor-out ablation (column order follows feature_cols).\n",
    "    cols = list(range(X_train.shape[1]))\n",
    "    if remove_col >= 0:\n",
    "        cols.remove(remove_col)\n",
    "    X_train, X_test = X_train[:,cols], X_test[:,cols]\n",
    "\n",
    "    clf = MLPClassifier(random_state=0, max_iter=200, verbose=False, tol=1e-3, n_iter_no_change=2000, alpha=0.1, hidden_layer_sizes=(100, 100, 50, 50)).fit(X_train, y_train)\n",
    "\n",
    "    te_acc = clf.score(X_test, y_test)\n",
    "    print(\"test 0-1 accuracy: \", te_acc)\n",
    "\n",
    "    # Predict once and reuse the result for every metric below.\n",
    "    y_pred = clf.predict(X_test)\n",
    "\n",
    "    def jac_sim(list1, list2):\n",
    "        set1, set2 = set(list1), set(list2)\n",
    "        union = set1 | set2\n",
    "        # Two empty sets are identical; this also avoids a division by zero.\n",
    "        return len(set1 & set2) / len(union) if union else 1.0\n",
    "\n",
    "    method_names = np.array(methods)\n",
    "    winners_gt = [method_names[np.where(y_ == 1)[0]] for y_ in y_test]\n",
    "    winners_pred = [method_names[np.where(y_ == 1)[0]] for y_ in y_pred]\n",
    "    jac_acc = np.mean([jac_sim(gt, pred) for gt, pred in zip(winners_gt, winners_pred)])\n",
    "    print(\"jacc accuracy: \", jac_acc)\n",
    "    print()\n",
    "\n",
    "    soft_te_acc = eval_acc(y_test, y_pred, mode=\"soft 0-1\", verbose=True)\n",
    "\n",
    "    # Hand over a copy so the helper can add columns without touching a slice of all_df_rank.\n",
    "    wge = eval_wga_err(all_df_rank_data.iloc[te_idx].copy(), y_pred, all_df, identifier=feature_cols)\n",
    "\n",
    "    return te_acc, jac_acc, soft_te_acc, wge"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "# Sweep the number of training experiments; -1 keeps the whole training split.\n",
    "size_list = [10, 20, 50, 100, 250, 500, 1000, 2000, 4000, -1]\n",
    "sweep_results = [train_classifier(all_df_rank, s) for s in size_list]\n",
    "# Transpose the per-size result tuples into one list per metric.\n",
    "te_accs, jac_accs, soft_te_accs, wges = (list(metric) for metric in zip(*sweep_results))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Exact-match and Jaccard test accuracy against the training-set size.\n",
    "sns.set_style(\"whitegrid\")\n",
    "fig, ax = plt.subplots(figsize=(4, 3))\n",
    "# The last sweep entry (-1) means \"whole training split\". Plot it at its real size\n",
    "# (train_classifier holds out test_size=0.2) without overwriting size_list itself.\n",
    "n_experiments = len(all_df_rank)\n",
    "full_train_size = n_experiments - int(np.ceil(0.2 * n_experiments))\n",
    "plot_sizes = list(size_list[:-1]) + [full_train_size]\n",
    "df1 = pd.DataFrame({\"Training size\": plot_sizes, \"Test accuracy\": te_accs, \"Type\": \"0-1 acc.\"})\n",
    "df2 = pd.DataFrame({\"Training size\": plot_sizes, \"Test accuracy\": jac_accs, \"Type\": \"Jaccard acc.\"})\n",
    "df = pd.concat([df1, df2], ignore_index=True)\n",
    "sns.lineplot(data=df, x=\"Training size\", y=\"Test accuracy\", hue=\"Type\", markers=True, style=\"Type\", ax=ax)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Worst-group errors returned by train_classifier, one per entry of size_list.\n",
    "wges"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scaling plot: soft 0-1 accuracy and worst-group error against the meta-dataset size.\n",
    "sns.set(font_scale=1.6)\n",
    "sns.set_style(\"whitegrid\")\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "\n",
    "# The last sweep entry (-1) means \"whole training split\". Plot it at its real size\n",
    "# (train_classifier holds out test_size=0.2) without overwriting size_list itself.\n",
    "n_experiments = len(all_df_rank)\n",
    "full_train_size = n_experiments - int(np.ceil(0.2 * n_experiments))\n",
    "t = list(size_list[:-1]) + [full_train_size]\n",
    "# Results of the training-size sweep, converted to percentages.\n",
    "data1 = np.array(soft_te_accs) * 100\n",
    "data2 = np.array(wges) * 100\n",
    "\n",
    "# Create the figure once with the intended size; a separate plt.figure() call\n",
    "# would only leave an empty extra figure behind and lose the figsize.\n",
    "fig, ax1 = plt.subplots(figsize=(6, 5))\n",
    "\n",
    "color = 'tab:red'\n",
    "ax1.set_xlabel(r'Size of meta-dataset $\\mathbb{D}$')\n",
    "ax1.set_ylabel('Test 0-1 ACC.', color=color)\n",
    "ax1.plot(t, data1, color=color, marker='o', markersize=7)\n",
    "ax1.tick_params(axis='y', labelcolor=color)\n",
    "\n",
    "ax2 = ax1.twinx()  # instantiate a second Axes that shares the same x-axis\n",
    "\n",
    "color = 'tab:blue'\n",
    "ax2.set_ylabel('Test worst-group error', color=color)  # we already handled the x-label with ax1\n",
    "ax2.plot(t, data2, color=color, marker='v', markersize=7)\n",
    "ax2.tick_params(axis='y', labelcolor=color)\n",
    "\n",
    "ax1.grid(True)\n",
    "\n",
    "fig.tight_layout()  # otherwise the right y-label is slightly clipped\n",
    "\n",
    "# Horizontal reference levels -- presumably baseline results; TODO document where 72.5 and 22.7 come from.\n",
    "ax1.axhline(y=72.5, color='tab:red', linestyle='--')\n",
    "ax2.axhline(y=22.7, color='tab:blue', linestyle='--')\n",
    "\n",
    "# save with high resolution\n",
    "fig.savefig(\"scaling_law1.pdf\", dpi=300)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "# Leave-one-descriptor-out sweep; -1 removes nothing.\n",
    "# NOTE: this reuses (and overwrites) the result names filled by the training-size sweep.\n",
    "col_list = [-1, 0, 1, 2, 3, 4, 5]\n",
    "ablation_results = [train_classifier(all_df_rank, remove_col=col) for col in col_list]\n",
    "# Transpose the per-column result tuples into one list per metric.\n",
    "te_accs, jac_accs, soft_te_accs, wges = (list(metric) for metric in zip(*ablation_results))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Leave-one-descriptor-out bar chart of the soft 0-1 test accuracy.\n",
    "plt.figure(figsize=(6.8, 5))\n",
    "col_name = ['N/A', r'$n$', r'$d_{sc}$', r'$d_{ls}$', r'$d_{cs}$', r'$r$', r'$d$']\n",
    "# col_list starts at -1 (nothing removed), so shifting by one indexes into col_name.\n",
    "removed_labels = [col_name[int(c) + 1] for c in col_list]\n",
    "df1 = pd.DataFrame({\"Removed column\": removed_labels, \"Test accuracy\": soft_te_accs, \"Type\": \"0-1 acc.\"})\n",
    "sns.set_style(\"whitegrid\")\n",
    "sns.barplot(data=df1, x=\"Removed column\", y=\"Test accuracy\", hue=\"Removed column\", palette='viridis')\n",
    "\n",
    "plt.ylabel(\"Test 0-1 ACC.\")\n",
    "plt.xlabel(\"Removed dataset descriptor\")\n",
    "plt.legend().remove()\n",
    "plt.ylim(0.7, 0.87)\n",
    "plt.tight_layout()\n",
    "\n",
    "# Hand-placed text box that serves as the key for the x tick labels.\n",
    "key_lines = ['$n$ (data size)', r'$d_{sc}$', r'$d_{ls}$', r'$d_{cs}$', '$r$ (availability)', '$d$ (input dim.)']\n",
    "key_box = dict(facecolor='white', alpha=0.7, edgecolor='gray', boxstyle='round,pad=0.3')\n",
    "plt.text(4.6, 0.79, '\\n'.join(key_lines), fontsize=15, bbox=key_box)\n",
    "plt.text(5.05, 0.82, '(degree of\\nshifts)', fontsize=15)\n",
    "# save with high resolution\n",
    "plt.savefig(\"lofo3_4.pdf\", dpi=300)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Leave-one-descriptor-out bar chart of the soft 0-1 test accuracy (RdYlBu variant).\n",
    "plt.figure(figsize=(6, 5))\n",
    "col_name = ['N/A', r'$n$', r'$d_{sc}$', r'$d_{ci}$', r'$d_{ai}$', r'$r$', r'$d$']\n",
    "# col_list starts at -1 (nothing removed), so shifting by one indexes into col_name.\n",
    "removed_labels = [col_name[int(c) + 1] for c in col_list]\n",
    "df1 = pd.DataFrame({\"Removed column\": removed_labels, \"Test accuracy\": soft_te_accs, \"Type\": \"0-1 acc.\"})\n",
    "sns.set_style(\"whitegrid\")\n",
    "sns.barplot(data=df1, x=\"Removed column\", y=\"Test accuracy\", hue=\"Removed column\", palette='RdYlBu')\n",
    "plt.ylabel(\"Test 0-1 ACC.\")\n",
    "plt.xlabel(\"Removed dataset descriptor\")\n",
    "plt.legend().remove()\n",
    "plt.ylim(0.7, 0.88)\n",
    "plt.tight_layout()\n",
    "# save with high resolution\n",
    "plt.savefig(\"lofo2.pdf\", dpi=300)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "def load_data(path, filter_thre=0.05, methods=None):\n",
    "    \"\"\"Load one results CSV and tag every row with its experiment's winning methods.\n",
    "\n",
    "    Rows are grouped by the dataset descriptors ('n', 'sc', 'ci', 'ai',\n",
    "    'var_causal', 'd_feat') and the following columns are added:\n",
    "\n",
    "    * 'rank': rank of 'wga_te_err' inside the group (ties broken by row order),\n",
    "    * 'wga_te_err_var': variance of 'wga_te_err' inside the group,\n",
    "    * 'winners': '|'-joined names of the methods whose error is within\n",
    "      `filter_thre` of the group's smallest error,\n",
    "    * 'multi_hot': list of 0/1 flags marking which entries of `methods` won.\n",
    "\n",
    "    Args:\n",
    "        path: location of the CSV file, handed to `pd.read_csv`.\n",
    "        filter_thre: tolerance added to the smallest error when deciding ties.\n",
    "        methods: method names fixing the order of the 'multi_hot' vector;\n",
    "            defaults to the five methods used in this notebook.\n",
    "\n",
    "    Returns:\n",
    "        The annotated DataFrame ('n' is converted to str).\n",
    "    \"\"\"\n",
    "    group_keys = ['n', 'sc', 'ci', 'ai', 'var_causal', 'd_feat']\n",
    "    if methods is None:\n",
    "        methods = [\"ERM\", \"GroupDRO\", \"remax-margin\", \"oversample\", \"undersample\"]\n",
    "\n",
    "    all_df = pd.read_csv(path)\n",
    "    all_df['n'] = all_df['n'].astype('str')\n",
    "\n",
    "    # Compute both per-group statistics before adding any column to the frame.\n",
    "    err_by_group = all_df.groupby(group_keys)['wga_te_err']\n",
    "    err_rank = err_by_group.rank(method=\"first\")\n",
    "    err_var = err_by_group.transform('var')\n",
    "    all_df['rank'] = err_rank\n",
    "    all_df['wga_te_err_var'] = err_var\n",
    "\n",
    "    def get_gt_rank(x):\n",
    "        # Every method within `filter_thre` of the best error counts as a winner.\n",
    "        min_err = x['wga_te_err'].min()\n",
    "        winners = x[x['wga_te_err'] <= min_err + filter_thre][\"method\"].to_list()\n",
    "        return '|'.join(winners)\n",
    "\n",
    "    # One winner string per group, then broadcast back onto every row of that group.\n",
    "    winners_series = (\n",
    "        all_df.groupby(group_keys)[['method', 'wga_te_err']]\n",
    "        .apply(get_gt_rank)\n",
    "        .reset_index(name='winners')\n",
    "    )\n",
    "    all_df = all_df.merge(winners_series, on=group_keys)\n",
    "\n",
    "    all_df[\"multi_hot\"] = all_df[\"winners\"].map(lambda x: [1 if m in x.split(\"|\") else 0 for m in methods])\n",
    "\n",
    "    return all_df\n",
    "\n",
    "# Directory holding the per-shift result tables (presumably machine-specific; adjust as needed).\n",
    "RESULTS_DIR = \"//exps/div_explore/toy_v2\"\n",
    "\n",
    "def _results_path(tag):\n",
    "    \"\"\"Build the CSV path of the seed-0 results table named by `tag`.\"\"\"\n",
    "    return f\"{RESULTS_DIR}/results_seed0_{tag}.csv\"\n",
    "\n",
    "sc_df = load_data(_results_path(\"sc\"))\n",
    "ci_df = load_data(_results_path(\"ci\"))\n",
    "ai_df = load_data(_results_path(\"ai\"))\n",
    "mix_df = load_data(_results_path(\"3shifts\"))\n",
    "\n",
    "all_df = pd.concat([sc_df, ci_df, ai_df, mix_df], ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# convert to multiple binary classification\n",
    "def prepare_data(n, tr_size):\n",
    "    \"\"\"Split the experiment positions 0..n-1 into training and test index lists.\n",
    "\n",
    "    Args:\n",
    "        n: number of experiments.\n",
    "        tr_size: number of training positions to keep from the 80% training\n",
    "            split; a non-positive value, or one at least as large as the split,\n",
    "            keeps the whole split.\n",
    "\n",
    "    Returns:\n",
    "        Tuple (tr_idx, te_idx) of position lists; te_idx always holds the 20% test split.\n",
    "    \"\"\"\n",
    "    from sklearn.model_selection import train_test_split\n",
    "    tr_idx, te_idx = train_test_split(range(n), test_size=0.2, random_state=0)\n",
    "\n",
    "    if tr_size > 0:\n",
    "        if tr_size < len(tr_idx):\n",
    "            # An integer train_size keeps exactly tr_size positions; a fractional\n",
    "            # test_size can be off by one after rounding.\n",
    "            tr_idx, _ = train_test_split(tr_idx, train_size=tr_size, random_state=0)\n",
    "        print(\"train size:\", len(tr_idx))\n",
    "\n",
    "    return tr_idx, te_idx\n",
    "\n",
    "def train_binary_classifier(all_df, tr_idx, te_idx, method1, method2, filter_thre=0.05, mode=\"filter\"):\n",
    "    \"\"\"Train an MLP that predicts whether `method1` beats `method2` on an experiment.\n",
    "\n",
    "    The label is derived from loss_diff = wga_te_err(method1) - wga_te_err(method2):\n",
    "\n",
    "    * mode=\"filter\": label 1 when loss_diff < 0, else 0; experiments with\n",
    "      |loss_diff| <= filter_thre are dropped from both the train and test sets.\n",
    "    * mode=\"tie\": label 1 when loss_diff < -filter_thre, 0 when\n",
    "      loss_diff > filter_thre, and 2 (tie) otherwise; nothing is dropped.\n",
    "\n",
    "    Args:\n",
    "        all_df: DataFrame with the descriptor columns, 'method' and 'wga_te_err'.\n",
    "        tr_idx: positions (within one method's rows) used for training.\n",
    "        te_idx: positions (within one method's rows) used for testing.\n",
    "        method1: name of the first method of the pair.\n",
    "        method2: name of the second method of the pair.\n",
    "        filter_thre: error difference below which two methods are considered tied.\n",
    "        mode: \"filter\" or \"tie\" (see above).\n",
    "\n",
    "    Returns:\n",
    "        Tuple (clf, X_test): the fitted classifier and the standardized test\n",
    "        features it was scored on.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if `mode` is neither \"filter\" nor \"tie\".\n",
    "    \"\"\"\n",
    "    import warnings\n",
    "    from sklearn.preprocessing import StandardScaler\n",
    "    from sklearn.neural_network import MLPClassifier\n",
    "\n",
    "    if mode not in (\"filter\", \"tie\"):\n",
    "        raise ValueError(f\"unknown mode {mode}\")\n",
    "\n",
    "    feature_cols = ['n', 'sc', 'ci', 'ai', 'var_causal', 'd_feat']\n",
    "\n",
    "    all_df_m1 = all_df[all_df[\"method\"]==method1]\n",
    "    all_df_m2 = all_df[all_df[\"method\"]==method2]\n",
    "    # Copy before adding columns so no assignment lands on a filtered slice.\n",
    "    pair_df = all_df_m1.copy()\n",
    "    # Rows of the two methods are paired by position -- assumes both methods list\n",
    "    # the experiments in the same order; TODO confirm against the CSV layout.\n",
    "    pair_df[\"loss_diff\"] = all_df_m1['wga_te_err'].to_numpy() - all_df_m2['wga_te_err'].to_numpy()\n",
    "    loss_diff = pair_df[\"loss_diff\"]\n",
    "\n",
    "    if mode == \"filter\":\n",
    "        pair_df['over_rank'] = (loss_diff < 0).astype('int')\n",
    "    else:\n",
    "        pair_df['over_rank'] = np.select([loss_diff < -filter_thre, loss_diff > filter_thre], [1, 0], default=2)\n",
    "\n",
    "    pair_tr = pair_df.iloc[tr_idx]\n",
    "    pair_te = pair_df.iloc[te_idx]\n",
    "\n",
    "    if mode == \"filter\":\n",
    "        # Drop near-ties so the binary label is unambiguous.\n",
    "        pair_tr = pair_tr[pair_tr['loss_diff'].abs() > filter_thre]\n",
    "        pair_te = pair_te[pair_te['loss_diff'].abs() > filter_thre]\n",
    "\n",
    "    X_train = pair_tr[feature_cols].to_numpy().astype('float')\n",
    "    y_train = pair_tr['over_rank'].to_numpy().astype('int')\n",
    "    X_test = pair_te[feature_cols].to_numpy().astype('float')\n",
    "    y_test = pair_te['over_rank'].to_numpy().astype('int')\n",
    "\n",
    "    scaler = StandardScaler()\n",
    "    X_train = scaler.fit_transform(X_train)\n",
    "    X_test = scaler.transform(X_test)\n",
    "\n",
    "    # Silence warnings only around fitting and scoring instead of for the whole process.\n",
    "    with warnings.catch_warnings():\n",
    "        warnings.simplefilter(\"ignore\")\n",
    "        clf = MLPClassifier(random_state=0, max_iter=1000000, verbose=False, tol=1e-4, n_iter_no_change=2000, alpha=1.0, hidden_layer_sizes=(50,)).fit(X_train, y_train)\n",
    "        train_acc = clf.score(X_train, y_train)\n",
    "        test_acc = clf.score(X_test, y_test)\n",
    "    print(method1, method2, \"train size: \", len(pair_tr), \"train acc: \", train_acc, \"test acc: \", test_acc)\n",
    "\n",
    "    # Per-class breakdown; labels are 0, 1 and (in tie mode) 2.\n",
    "    for label in range(3):\n",
    "        te_mask = y_test == label\n",
    "        tr_mask = y_train == label\n",
    "        print(\"      class\", label, \"train size: \", len(y_train[tr_mask]))\n",
    "        if len(y_test[te_mask]) == 0:\n",
    "            continue\n",
    "        print(\"      class\", label, \"test acc: \", clf.score(X_test[te_mask], y_test[te_mask]))\n",
    "\n",
    "    return clf, X_test\n",
    "\n",
    "def extract_pairwise_results(zoo, X_test):\n",
    "    \"\"\"Query every pairwise classifier for one experiment and list the outcomes.\n",
    "\n",
    "    Args:\n",
    "        zoo: dict mapping (m1, m2) to a fitted classifier whose prediction is 1\n",
    "            when m1 wins, 0 when m2 wins, and any other value for a tie.\n",
    "        X_test: features of a single experiment, either 1-D or of shape (1, d).\n",
    "\n",
    "    Returns:\n",
    "        List of (m1, m2, winner) tuples. A tie contributes two entries, one\n",
    "        with each method as the winner.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if `X_test` does not hold exactly one experiment.\n",
    "    \"\"\"\n",
    "    X_test = np.asarray(X_test)\n",
    "    if X_test.ndim == 1:\n",
    "        X_test = X_test.reshape(1, -1)\n",
    "    if X_test.shape[0] != 1:\n",
    "        raise ValueError(\"extract_pairwise_results expects exactly one sample\")\n",
    "\n",
    "    res = []\n",
    "    for (m1, m2), clf in zoo.items():\n",
    "        # Take the scalar label explicitly rather than relying on the truthiness of a 1-element array.\n",
    "        label = clf.predict(X_test)[0]\n",
    "        if label == 1:\n",
    "            res.append((m1, m2, m1))\n",
    "        elif label == 0:\n",
    "            res.append((m1, m2, m2))\n",
    "        else:\n",
    "            res.append((m1, m2, m1))\n",
    "            res.append((m1, m2, m2))\n",
    "    return res\n",
    "\n",
    "def copeland_method(candidate_names, pairwise_results, verbose=False):\n",
    "    \"\"\"Rank candidates by Copeland score (pairwise wins minus pairwise losses).\n",
    "\n",
    "    Args:\n",
    "        candidate_names: iterable of candidate names.\n",
    "        pairwise_results: iterable of (A, B, winner) tuples; entries whose\n",
    "            winner is neither A nor B are ignored.\n",
    "        verbose: when True, print the score dictionary (off by default because\n",
    "            this function is called once per test sample).\n",
    "\n",
    "    Returns:\n",
    "        Tuple (winners, scores): every candidate sharing the highest score, in\n",
    "        the order of `candidate_names`, and the dict mapping name to score.\n",
    "        Both are empty when there are no candidates.\n",
    "    \"\"\"\n",
    "    scores = {name: 0 for name in candidate_names}\n",
    "\n",
    "    for first, second, winner in pairwise_results:\n",
    "        if winner == first:\n",
    "            scores[first] += 1\n",
    "            scores[second] -= 1\n",
    "        elif winner == second:\n",
    "            scores[second] += 1\n",
    "            scores[first] -= 1\n",
    "    if verbose:\n",
    "        print(scores)\n",
    "\n",
    "    if not scores:\n",
    "        return [], scores\n",
    "\n",
    "    # Compute the top score once instead of once per candidate.\n",
    "    best_score = max(scores.values())\n",
    "    winners = [name for name in candidate_names if scores[name] == best_score]\n",
    "\n",
    "    return winners, scores\n",
    "\n",
    "def bradley_terry_method(candidate_names, pairwise_results):\n",
    "    \"\"\"Return the top candidate under a maximum-likelihood Bradley-Terry fit.\n",
    "\n",
    "    Each candidate gets an ability a; the model sets\n",
    "    P(A beats B) = 1 / (1 + exp(a_B - a_A)) and the abilities are fitted with BFGS.\n",
    "\n",
    "    Args:\n",
    "        candidate_names: sequence of candidate names.\n",
    "        pairwise_results: iterable of (A, B, winner) tuples; entries whose\n",
    "            winner is neither A nor B are ignored.\n",
    "\n",
    "    Returns:\n",
    "        The name of the candidate with the highest fitted ability.\n",
    "    \"\"\"\n",
    "    from scipy.optimize import minimize\n",
    "\n",
    "    candidate_index = {name: i for i, name in enumerate(candidate_names)}\n",
    "\n",
    "    # Resolve every decided comparison to (winner index, loser index) once, outside the optimizer loop.\n",
    "    decided = []\n",
    "    for A, B, winner in pairwise_results:\n",
    "        if winner == A:\n",
    "            decided.append((candidate_index[A], candidate_index[B]))\n",
    "        elif winner == B:\n",
    "            decided.append((candidate_index[B], candidate_index[A]))\n",
    "    win_idx = np.array([w for w, _ in decided], dtype=int)\n",
    "    lose_idx = np.array([l for _, l in decided], dtype=int)\n",
    "\n",
    "    def neg_log_likelihood(abilities):\n",
    "        # -log P(winner beats loser) = log(1 + exp(a_loser - a_winner)).\n",
    "        # logaddexp evaluates this without overflow when abilities grow large.\n",
    "        return np.logaddexp(0.0, abilities[lose_idx] - abilities[win_idx]).sum()\n",
    "\n",
    "    result = minimize(neg_log_likelihood, np.zeros(len(candidate_names)), method='BFGS')\n",
    "    abilities = result.x\n",
    "\n",
    "    ranking = sorted(candidate_names, key=lambda x: abilities[candidate_index[x]], reverse=True)\n",
    "\n",
    "    return ranking[0]\n",
    "\n",
    "def binary_classifiers(all_df, tr_size=-1, mode=\"filter\"):\n",
    "    \"\"\"Train one pairwise classifier for every unordered pair of methods.\n",
    "\n",
    "    Args:\n",
    "        all_df: DataFrame handed on to `train_binary_classifier`.\n",
    "        tr_size: number of training experiments, handed on to `prepare_data`.\n",
    "        mode: labelling mode handed on to `train_binary_classifier`.\n",
    "\n",
    "    Returns:\n",
    "        Tuple (zoo, te_idx, X_test, methods): dict mapping (m1, m2) to the fitted\n",
    "        classifier, the test positions, the test features returned for the LAST\n",
    "        pair, and the array of method names. In \"filter\" mode each pair drops\n",
    "        its own near-tie rows, so that X_test need not line up with te_idx.\n",
    "    \"\"\"\n",
    "    import itertools\n",
    "    methods = np.array([\"ERM\", \"GroupDRO\", \"oversample\", \"undersample\", \"remax-margin\"])\n",
    "    method_pairs = np.array(list(itertools.combinations(methods, 2)))\n",
    "\n",
    "    # Every method is expected to have one row per experiment; count them via the first method.\n",
    "    first_method_rows = all_df[all_df['method']==methods[0]]\n",
    "    tr_idx, te_idx = prepare_data(first_method_rows.shape[0], tr_size)\n",
    "\n",
    "    zoo = {}\n",
    "    for pair in method_pairs:\n",
    "        m1, m2 = pair\n",
    "        fitted, X_test = train_binary_classifier(all_df, tr_idx, te_idx, m1, m2, filter_thre=0.05, mode=mode)\n",
    "        zoo[(m1, m2)] = fitted\n",
    "    return zoo, te_idx, X_test, methods\n",
    "\n",
    "def ranking_acc(methods, zoo, X_test, y_test, mode=\"copeland\", acc_mode=\"jac\"):\n",
    "    \"\"\"Aggregate the pairwise classifiers into winner sets and score them.\n",
    "\n",
    "    Args:\n",
    "        methods: candidate method names.\n",
    "        zoo: dict mapping (m1, m2) to a fitted pairwise classifier.\n",
    "        X_test: 2-D array of test features, one row per experiment.\n",
    "        y_test: sequence of true winner-name collections, one per experiment.\n",
    "        mode: aggregation rule, \"copeland\" or \"bradley_terry\".\n",
    "        acc_mode: \"jac\" (mean Jaccard similarity), \"accuracy\" (exact set match)\n",
    "            or \"all\" (print both).\n",
    "\n",
    "    Returns:\n",
    "        Tuple (y_pred, scores): predicted winner lists and the per-experiment\n",
    "        score dicts (None entries when the aggregation rule returns no scores).\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if `mode` or `acc_mode` is unknown.\n",
    "    \"\"\"\n",
    "\n",
    "    def jac_sim(list1, list2):\n",
    "        set1, set2 = set(list1), set(list2)\n",
    "        union = set1 | set2\n",
    "        # Two empty sets are identical; this also avoids a division by zero.\n",
    "        return len(set1 & set2) / len(union) if union else 1.0\n",
    "\n",
    "    if mode == \"copeland\":\n",
    "        eval_fn = copeland_method\n",
    "    elif mode == \"bradley_terry\":\n",
    "        eval_fn = bradley_terry_method\n",
    "    else:\n",
    "        raise ValueError(f\"unknown mode {mode}\")\n",
    "    # Validate up front so a typo is reported before the expensive prediction loop.\n",
    "    if acc_mode not in (\"jac\", \"accuracy\", \"all\"):\n",
    "        raise ValueError(f\"unknown acc_mode {acc_mode}\")\n",
    "\n",
    "    y_pred, scores = [], []\n",
    "    for X in X_test:\n",
    "        pairwise_results = extract_pairwise_results(zoo, X)\n",
    "        outcome = eval_fn(methods, pairwise_results)\n",
    "        if isinstance(outcome, tuple):\n",
    "            winners, s = outcome\n",
    "        else:\n",
    "            # bradley_terry_method returns only the top-ranked name.\n",
    "            winners, s = [outcome], None\n",
    "        y_pred.append(winners)\n",
    "        scores.append(s)\n",
    "\n",
    "    def jaccard_accuracy():\n",
    "        return np.mean([jac_sim(y_test[i], y_pred[i]) for i in range(len(y_pred))])\n",
    "\n",
    "    def exact_accuracy():\n",
    "        correct = sum(1 for i in range(len(y_pred)) if set(y_test[i]) == set(y_pred[i]))\n",
    "        return correct / len(y_test)\n",
    "\n",
    "    if acc_mode == \"jac\":\n",
    "        print(f\"{mode} method accuracy: \", jaccard_accuracy())\n",
    "    elif acc_mode == \"accuracy\":\n",
    "        print(f\"{mode} method accuracy: \", exact_accuracy())\n",
    "    else:\n",
    "        print(f\"{mode} method jaccard accuracy: \", jaccard_accuracy())\n",
    "        print(f\"{mode} method 0-1 accuracy: \", exact_accuracy())\n",
    "\n",
    "    return y_pred, scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train one classifier per method pair in \"tie\" mode, using 200 training experiments.\n",
    "zoo, te_idx, X_test, methods = binary_classifiers(all_df, tr_size=200, mode=\"tie\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ground-truth winner sets for the test experiments.\n",
    "all_df_rank = all_df[all_df[\"rank\"]==1.0]\n",
    "# 'winners' is identical on every row of an experiment, so read it from the same\n",
    "# per-method rows that binary_classifiers counted when building te_idx.\n",
    "winners = all_df[all_df[\"method\"]==methods[0]][\"winners\"].to_numpy()[te_idx]\n",
    "# Winner sets differ in length, so keep a plain list of lists\n",
    "# (np.array over ragged lists raises on NumPy >= 1.24).\n",
    "y_test = [w.split(\"|\") for w in winners]\n",
    "\n",
    "y_pred, scores = ranking_acc(methods, zoo, X_test, y_test, mode=\"copeland\", acc_mode=\"all\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Attach each method's aggregated score to its test rows. Despite the name,\n",
    "# 'pred_rank' holds the score returned by ranking_acc, not a rank position.\n",
    "scores_df = pd.DataFrame(scores)\n",
    "# .assign returns a new frame, so nothing is written onto an iloc slice of all_df.\n",
    "test_raw_samples = [\n",
    "    all_df[all_df[\"method\"]==m].iloc[te_idx].assign(pred_rank=scores_df[m].to_numpy())\n",
    "    for m in methods\n",
    "]\n",
    "test_df = pd.concat(test_raw_samples, ignore_index=True)\n",
    "\n",
    "group_keys = ['n', 'sc', 'ci', 'ai', 'var_causal', 'd_feat']\n",
    "\n",
    "min_err = test_df.groupby(group_keys)['wga_te_err'].min().reset_index(name='min_err')\n",
    "test_df = test_df.merge(min_err, on=group_keys)\n",
    "\n",
    "max_err = test_df.groupby(group_keys)['wga_te_err'].max().reset_index(name='max_err')\n",
    "test_df = test_df.merge(max_err, on=group_keys)\n",
    "\n",
    "# Error rescaled to [0, 1] within each experiment (0 = best method, 1 = worst).\n",
    "test_df['rel_err'] = (test_df['wga_te_err'] - test_df['min_err']) / (test_df['max_err'] - test_df['min_err'])\n",
    "\n",
    "# When all methods tie, max_err == min_err and the ratio is 0/0; treat that as 0.\n",
    "test_df['rel_err'] = test_df['rel_err'].fillna(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview only the first rows instead of rendering the whole frame.\n",
    "test_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def corr(x):\n",
    "    \"\"\"Pearson correlation between 'wga_te_err' and 'pred_rank' within one group.\n",
    "\n",
    "    Args:\n",
    "        x: DataFrame holding the columns 'wga_te_err' and 'pred_rank'.\n",
    "\n",
    "    Returns:\n",
    "        The correlation coefficient, or NaN when it is undefined (fewer than\n",
    "        two rows, a constant column, or non-numeric data).\n",
    "    \"\"\"\n",
    "    try:\n",
    "        err = x['wga_te_err'].to_numpy(dtype=float)\n",
    "        score = x['pred_rank'].to_numpy(dtype=float)\n",
    "    except (TypeError, ValueError):\n",
    "        return np.nan\n",
    "    # A constant column has zero variance, so the correlation is undefined;\n",
    "    # return NaN directly instead of letting np.corrcoef warn and produce it.\n",
    "    if len(err) < 2 or err.max() == err.min() or score.max() == score.min():\n",
    "        return np.nan\n",
    "    return np.corrcoef(err, score)[0, 1]\n",
    "\n",
    "# Per-experiment correlation between true error and predicted score.\n",
    "# Column subsets of a groupby must be a list; a bare tuple of names is rejected by pandas >= 2.0.\n",
    "corr_df = test_df.groupby(['n', 'sc', 'ci', 'ai', 'var_causal', 'd_feat'])[['wga_te_err', 'pred_rank']].apply(corr).reset_index(name='corr')\n",
    "\n",
    "# Drop experiments whose correlation is undefined (NaN) before averaging.\n",
    "corr_df = corr_df.dropna()\n",
    "\n",
    "corr_df['corr'].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scatter of each test row's 'pred_rank' score against its relative error 'rel_err', coloured by method.\n",
    "sns.scatterplot(data=test_df, x='pred_rank', y='rel_err', hue='method')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "div_backup",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
