{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pwd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load the correct data from an ensemble-model folder:\n",
    "dataset = \"utkface\"\n",
    "path = f\"./votes/{dataset}/\"\n",
    "raw_votes = np.load(path + f\"model(1)-raw-votes-(mode-random)-dataset-{dataset}.npy\").astype(float)\n",
    "targets = np.load(path + f\"model(1)-targets-(mode-random)-dataset-{dataset}.npy\").astype(float)\n",
    "sensitives = np.load(path + f\"model(1)-sensitives-(mode-random)-dataset-{dataset}.npy\").astype(float)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1500,)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sensitives.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "calibration_data = pd.DataFrame(\n",
    "    np.c_[raw_votes.argmax(axis=1), targets, sensitives], columns=[\"prediction\", \"truth\", \"sensitive\"]) # note the prediction is not noised yet\n",
    "calibration_data = calibration_data.astype({\"prediction\": int, \"truth\": int, \"sensitive\": int})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Loss Disparity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[[0.82132565, 0.75      , 0.78666667, 0.875     , 0.85714286],\n",
       "        [0.19298246, 0.06666667, 0.2761194 , 0.128     , 0.20338983]],\n",
       "\n",
       "       [[0.17867435, 0.25      , 0.21333333, 0.125     , 0.14285714],\n",
       "        [0.80701754, 0.93333333, 0.7238806 , 0.872     , 0.79661017]]])"
      ]
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def probability_of_k_given_kprime_and_z(calibration_data):\n",
    "    \"\"\"Producing a |Y| x |Y'| x |Z| matrix of probabilities, where Y is ground truth label and Y' is the predicated label and Z is sensitive attribute.\"\"\"\n",
    "\n",
    "    num_classes = max(calibration_data[\"truth\"]) + 1\n",
    "    num_sensitives = max(calibration_data[\"sensitive\"]) + 1\n",
    "\n",
    "    probabilities = np.zeros((num_classes, num_classes, num_sensitives))\n",
    "\n",
    "    for z in range(num_sensitives):\n",
    "        for k in range(num_classes):\n",
    "            for k_ in range(num_classes):\n",
    "                numerator = calibration_data.query(f'truth == {k} and prediction == {k_} and sensitive == {z}')\n",
    "                denominator = calibration_data.query(f'prediction == {k_} and sensitive == {z}')\n",
    "                probabilities[k, k_, z] = len(numerator) / len(denominator)\n",
    "\n",
    "    # sum of probabilities (over 'Y') should some to 1\n",
    "    assert np.allclose(probabilities.sum(axis=0), 1)\n",
    "    return probabilities\n",
    "\n",
    "probability_of_k_given_kprime_and_z(calibration_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.514, 0.486])"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def probability_of_k(calibration_data):\n",
    "    \"\"\"Producing a |Y| vector of probabilities, where Y is ground truth label.\"\"\"\n",
    "    num_classes = max(calibration_data[\"truth\"]) + 1\n",
    "    \n",
    "    probabilities = np.zeros(num_classes)\n",
    "    for k in range(num_classes):\n",
    "        numerator = calibration_data.query(f'truth == {k}')\n",
    "        denominator = calibration_data\n",
    "        probabilities[k] = len(numerator) / len(denominator)\n",
    "    \n",
    "    # sum of probabilities (over 'Y') should some to 1\n",
    "    assert np.allclose(probabilities.sum(axis=0), 1)\n",
    "    return probabilities\n",
    "probability_of_k(calibration_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "Y_given_Yprime_and_Z = probability_of_k_given_kprime_and_z(calibration_data)\n",
    "Y = probability_of_k(calibration_data)\n",
    "indicator = np.ones((Y.shape[0], Y.shape[0])) - np.eye(Y.shape[0])\n",
    "\n",
    "calibration_constants = Y_given_Yprime_and_Z.transpose((0, 2, 1)) @ indicator @ Y\n",
    "\n",
    "np.savez(\"utkface_calibration_constants\", calibration_constants=calibration_constants)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.49835725, 0.39876667, 0.52424537, 0.491042  , 0.5211138 ],\n",
       "       [0.50164275, 0.60123333, 0.47575463, 0.508958  , 0.4788862 ]])"
      ]
     },
     "execution_count": 67,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "calibration_constants"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Equality of Odds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def probability_of_not_kprime_given_not_k_and_z(calibration_data):\n",
    "    num_classes = max(calibration_data[\"truth\"]) + 1\n",
    "    num_sensitives = max(calibration_data[\"sensitive\"]) + 1\n",
    "\n",
    "    probabilities = {}\n",
    "    for z in range(num_sensitives):\n",
    "        for k in range(num_classes):\n",
    "            for k_ in range(num_classes):\n",
    "                numerator = calibration_data.query(f'prediction != {k_} and truth != {k} and sensitive == {z}')\n",
    "                denominator = calibration_data.query(f'truth != {k} and sensitive == {z}')\n",
    "                probabilities[(k, k_, z)] = len(numerator) / len(denominator)\n",
    "    return probabilities"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.15"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
