{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from minio_obj_storage import get_numpy_from_cloud\n",
    "import pandas as pd\n",
    "import logging\n",
    "import json\n",
    "from sklearn.metrics import roc_auc_score\n",
    "import numpy as np\n",
    "\n",
    "# Notebook-wide logger handle plus the settings parsed from the local JSON config file.\n",
    "logger = logging.getLogger('Default')\n",
    "with open(\"./config.json\", 'r') as config_file:\n",
    "    config = json.load(config_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flat result columns, one entry per recorded AUROC; the later scoring cells append to these same lists.\n",
    "results_seeds = []\n",
    "results_dataset = []\n",
    "results_metric = []\n",
    "results_auroc = []\n",
    "\n",
    "# (file-name prefix, file-name suffix) of each per-epoch score file, listed in fetch order.\n",
    "score_files = [\n",
    "    ('loss', \"\"),\n",
    "    ('loss_curvature', \"_h_0.001_n_10\"),\n",
    "    ('correct', \"\"),\n",
    "]\n",
    "\n",
    "for seed in [1,2,3]:\n",
    "    for dataset_name in ['cifar10_duplicate', 'cifar100_duplicate']:\n",
    "        indices_map = get_numpy_from_cloud('learning-dynamics-models', dataset_name, 'duplicate_index_map.npy').item()\n",
    "\n",
    "        # One stacked array per score; axis 0 follows the epoch loop (0..199).\n",
    "        stacked = {}\n",
    "        for metric, suffix in score_files:\n",
    "            stacked[metric] = np.array([\n",
    "                get_numpy_from_cloud(\n",
    "                    'learning-dynamics-scores',\n",
    "                    dataset_name,\n",
    "                    f\"{metric}_{dataset_name}_resnet18_seed_{seed}_epoch_{epoch}{suffix}.npy\")\n",
    "                for epoch in range(200)\n",
    "            ])\n",
    "        losses = stacked['loss']\n",
    "        loss_curvature = stacked['loss_curvature']\n",
    "        lt = stacked['correct']\n",
    "\n",
    "        # Every map entry is key -> (partner index, <ignored>); both ends are flagged as duplicates.\n",
    "        duplicate_idxs = []\n",
    "        for first_idx, (second_idx, _) in indices_map.items():\n",
    "            duplicate_idxs.extend((first_idx, second_idx))\n",
    "\n",
    "        # Axis 1 of the stacked scores is used as the sample count - presumably one column per sample; confirm.\n",
    "        is_duplicate = np.zeros(losses.shape[1])\n",
    "        is_duplicate[duplicate_idxs] = 1\n",
    "\n",
    "        # Per-sample summaries over the epoch axis.\n",
    "        # NOTE(review): np.argmax returns 0 for a column that never changes, so a sample that is\n",
    "        # never marked correct gets lt == 0 - confirm this is the intended convention.\n",
    "        results_df = pd.DataFrame({\n",
    "            'loss': losses.mean(0),\n",
    "            'loss_curvature': loss_curvature.mean(0),\n",
    "            'clt': 1 - lt.mean(0),\n",
    "            'lt': np.argmax(lt, 0),\n",
    "            'is_duplicate': is_duplicate,\n",
    "        })\n",
    "\n",
    "        # AUROC of each summary used as a duplicate detector (higher score = more likely duplicate).\n",
    "        for metric in ['loss', 'loss_curvature', 'clt', 'lt']:\n",
    "            auroc = roc_auc_score(results_df['is_duplicate'], results_df[metric])\n",
    "            results_seeds.append(seed)\n",
    "            results_dataset.append(dataset_name)\n",
    "            results_metric.append(metric)\n",
    "            results_auroc.append(auroc)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from cleanlab.filter import find_label_issues\n",
    "# Confident-learning baselines: every sample is scored by its position in cleanlab's ranked\n",
    "# label issues ('cl') and by one minus its top predicted probability ('in conf.'); the AUROC of\n",
    "# each score against the duplicate mask is appended to the shared result lists.\n",
    "container_name = 'learning-dynamics-scores'\n",
    "for seed in [1,2,3]:\n",
    "    for dataset_name in ['cifar10_duplicate', 'cifar100_duplicate']:\n",
    "        indices_map = get_numpy_from_cloud('learning-dynamics-models', dataset_name, 'duplicate_index_map.npy').item()\n",
    "        # Every map entry is key -> (partner index, <ignored>); both ends are flagged as duplicates.\n",
    "        duplicate_idxs = []\n",
    "        for idx1, (idx2, _) in indices_map.items():\n",
    "            duplicate_idxs.extend((idx1, idx2))\n",
    "\n",
    "        conf_learning_labels = get_numpy_from_cloud(\n",
    "            container_name,\n",
    "            f\"{dataset_name}_noisy\",\n",
    "            f\"duplicate_conf_learning_labels_noise_idx_{seed}_noise_0.0.pt\"\n",
    "        ).astype(np.int32)\n",
    "\n",
    "        is_duplicate = np.zeros(len(conf_learning_labels))\n",
    "        is_duplicate[duplicate_idxs] = 1\n",
    "\n",
    "        prob_file_name = f\"duplicate_conf_learning_prob_noise_idx_{seed}_noise_0.0.pt\"\n",
    "        prob_4_eph = get_numpy_from_cloud(container_name, f\"{dataset_name}_noisy\", prob_file_name)\n",
    "\n",
    "        conf_learning = find_label_issues(\n",
    "            conf_learning_labels,\n",
    "            prob_4_eph,\n",
    "            return_indices_ranked_by=\"self_confidence\",\n",
    "            filter_by=\"confident_learning\",\n",
    "        )\n",
    "\n",
    "        # Flagged samples receive a score in (0, 1] that is largest for the first index returned by\n",
    "        # cleanlab and smallest for the last; samples that were not flagged keep a score of 0.\n",
    "        conf_learning_soft = np.zeros(len(conf_learning_labels))\n",
    "        n_flagged = len(conf_learning)\n",
    "        if n_flagged:\n",
    "            conf_learning_soft[conf_learning[::-1]] = np.arange(1, n_flagged + 1) / n_flagged\n",
    "\n",
    "        # Highest predicted probability per sample, reused for the 'in conf.' score.\n",
    "        conf = prob_4_eph.max(axis=1)\n",
    "\n",
    "        for metric, y_scores in [\n",
    "            ('cl', conf_learning_soft.astype(np.float32)),\n",
    "            ('in conf.', 1 - conf),\n",
    "        ]:\n",
    "            auroc = roc_auc_score(is_duplicate, y_scores)\n",
    "            results_seeds.append(seed)\n",
    "            results_dataset.append(dataset_name)\n",
    "            results_metric.append(metric)\n",
    "            results_auroc.append(auroc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from minio_obj_storage import get_numpy_from_cloud\n",
    "import torch\n",
    "\n",
    "# SSFT baseline: assemble per-sample predictions from the two data parts, average them over the\n",
    "# kept epochs, and record the AUROC of (1 - average) against the duplicate mask.\n",
    "bucket_name = 'learning-dynamics-scores'\n",
    "# Only the length of the saved index is used; samples are addressed by position 0..N-1.\n",
    "# NOTE(review): the CIFAR-100 index file also sizes the CIFAR-10 arrays - this assumes both\n",
    "# datasets hold the same number of samples; confirm.\n",
    "index = np.arange(len(torch.load(\"./index/data_index_cifar100.pt\")))\n",
    "half = len(index) // 2\n",
    "label_noise = 0.0\n",
    "epochs_total = 200\n",
    "for dataset in ['cifar100', 'cifar10']:\n",
    "    container_dir = dataset + '_duplicate'\n",
    "\n",
    "    indices_map = get_numpy_from_cloud('learning-dynamics-models', container_dir, 'duplicate_index_map.npy').item()\n",
    "    # Every map entry is key -> (partner index, <ignored>); both ends are flagged as duplicates.\n",
    "    duplicate_idxs = []\n",
    "    for idx1, (idx2, _) in indices_map.items():\n",
    "        duplicate_idxs.extend((idx1, idx2))\n",
    "\n",
    "    is_duplicate = np.zeros(len(index))\n",
    "    is_duplicate[duplicate_idxs] = 1\n",
    "    for seed in [1,2,3]:\n",
    "        preds = np.zeros((epochs_total, len(index)))\n",
    "        for part in [0, 1]:\n",
    "            # Part 0 supplies the columns of the first half of the samples, part 1 the second half.\n",
    "            index1 = index[:half] if part == 0 else index[half:]\n",
    "            pred = get_numpy_from_cloud(bucket_name, container_dir, f\"ssft_pred_resnet18_part_{part}_noisy_idx_{seed}_noise_{label_noise}.npy\")\n",
    "            # Only rows from `epochs_total` onward are kept.\n",
    "            # NOTE(review): presumably these are the second training phase; the file must hold\n",
    "            # exactly 2 * epochs_total rows for the shapes to match - confirm.\n",
    "            preds[:, index1] = pred[epochs_total:, index1]\n",
    "\n",
    "        # Per-sample average of the kept predictions over the epoch axis.\n",
    "        ft = preds.mean(axis=0)\n",
    "\n",
    "        auroc = roc_auc_score(is_duplicate, 1-ft)\n",
    "        results_seeds.append(seed)\n",
    "        results_auroc.append(auroc)\n",
    "        results_metric.append('ssft')\n",
    "        results_dataset.append(container_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Long-format table of every recorded AUROC: one row per appended (seed, metric, dataset) entry.\n",
    "result_columns = {\n",
    "    'seed': results_seeds,\n",
    "    'metric': results_metric,\n",
    "    'dataset': results_dataset,\n",
    "    'auroc': results_auroc,\n",
    "}\n",
    "all_results = pd.DataFrame(result_columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mean AUROC across seeds for each (dataset, metric) pair; `seed` is a run identifier, so it is\n",
    "# left out of the aggregate instead of being averaged alongside the scores.\n",
    "all_results.groupby(['dataset', 'metric'])[['auroc']].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Standard deviation of AUROC across seeds for each (dataset, metric) pair; `seed` is a run\n",
    "# identifier, so it is left out of the aggregate.\n",
    "all_results.groupby(['dataset', 'metric'])[['auroc']].std()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "py3.11_tf",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.16"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
