{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "0032b0c5-e766-4091-b60a-5a9f92a966ad",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import jsonlines\n",
    "from collections import defaultdict\n",
    "import pickle\n",
    "import numpy as np\n",
    "import scipy\n",
    "from scipy import stats\n",
    "from typing import *\n",
    "from matplotlib import pyplot as plt\n",
    "import seaborn as sns\n",
    "from scipy import stats"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8fbc64f4-3472-4f15-ae35-6994e9e39a87",
   "metadata": {},
   "source": [
    "#### Portfolio Computation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "efd9572f",
   "metadata": {},
   "outputs": [],
   "source": [
    "def metric_zscore_normalizer(list_of_scores):\n",
    "    # list of scores contains concatenated scores of model_a, model_b, model_c \n",
    "    lengths= [ l.size for l in list_of_scores]\n",
    "    scores = np.concatenate(list_of_scores, axis=None)\n",
    "    scores = stats.zscore(scores)\n",
    "    norm_scores = np.split(scores, np.cumsum(lengths))\n",
    "    # print(len(norm_scores))\n",
    "    return norm_scores[:-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "5f5591cd-2c22-4188-9afd-9ff834de6550",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "def metric_cdf_normalizer(list_of_scores):\n",
    "    # list of scores contains concatenated scores of model_a, model_b, model_c \n",
    "    lengths= [ l.size for l in list_of_scores]\n",
    "    scores = np.concatenate(list_of_scores, axis=None)\n",
    "    scores_sorted = np.sort(scores)\n",
    "    cdf = [np.searchsorted(scores_sorted, x, side='right') for x in scores]\n",
    "    cdf= np.array(cdf)/scores.size\n",
    "    norm_scores = np.split(cdf, np.cumsum(lengths))\n",
    "    # print(len(norm_scores))\n",
    "    return norm_scores[:-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "57e7a2a3-a874-4ecf-89f0-cd0d295d29e1",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "def get_portfolio(w, norm_list_metrics_model):\n",
    "        # list to array samples times metrics\n",
    "        sample_all_metrics = np.stack(norm_list_metrics_model, axis=-1) # nsamples x nmetrics\n",
    "        # print(sample_all_metrics.shape)\n",
    "        # print(w.shape)\n",
    "        # geometric mean \n",
    "        portfolio = stats.mstats.gmean(sample_all_metrics, axis=1, dtype=None, weights=w) # nsample , archimedian copula\n",
    "        return portfolio "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "96f31597-86dc-435f-bdbc-c5957e258577",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "def build_portfolio(data):\n",
    "    metric_wise_dict = {}\n",
    "    model_names = list(data.keys())\n",
    "    metric_names = list(data[model_names[0]].keys())\n",
    "    for m in metric_names:\n",
    "        scores_list = []\n",
    "        for mname in model_names:\n",
    "            scores_list.append(data[mname][m])\n",
    "        metric_wise_dict[m] = scores_list\n",
    "    normalized_metric_dict = {}\n",
    "    for key,value in metric_wise_dict.items():\n",
    "        normalized_metric_dict[key] = metric_cdf_normalizer(np.array(value))\n",
    "    model_normalized_scores = defaultdict(list)\n",
    "    model_normalized_scores_per_metric = defaultdict(dict)\n",
    "    for i in range(0,len(model_names)):\n",
    "        for metric in normalized_metric_dict.keys():\n",
    "            model_normalized_scores[model_names[i]].append(normalized_metric_dict[metric][i])\n",
    "            model_normalized_scores_per_metric[model_names[i]][metric] = normalized_metric_dict[metric][i]\n",
    "    portfolio = {}\n",
    "    for k, v in model_normalized_scores.items():\n",
    "        portfolio[k] = {\"portfolio\":get_portfolio(None,v)}\n",
    "    return portfolio\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "3bccd76a",
   "metadata": {},
   "outputs": [],
   "source": [
    "def cdf_normalization(data,log=False):\n",
    "    metric_wise_dict = {}\n",
    "    model_names = list(data.keys())\n",
    "    metric_names = list(data[model_names[0]].keys())\n",
    "    for m in metric_names:\n",
    "        scores_list = []\n",
    "        for mname in model_names:\n",
    "            scores_list.append(data[mname][m])\n",
    "        metric_wise_dict[m] = scores_list\n",
    "    normalized_metric_dict = {}\n",
    "    for key,value in metric_wise_dict.items():\n",
    "        if log:\n",
    "            print(\"Computing log CDF\")\n",
    "            normalized_metric_dict[key] = np.log(metric_cdf_normalizer(np.array(value)))\n",
    "        else:\n",
    "            normalized_metric_dict[key] = metric_cdf_normalizer(np.array(value))\n",
    "    model_normalized_scores = defaultdict(list)\n",
    "    model_normalized_scores_per_metric = defaultdict(dict)\n",
    "    for i in range(0,len(model_names)):\n",
    "        for metric in normalized_metric_dict.keys():\n",
    "            model_normalized_scores[model_names[i]].append(normalized_metric_dict[metric][i])\n",
    "            model_normalized_scores_per_metric[model_names[i]][metric] = normalized_metric_dict[metric][i]\n",
    "    return model_normalized_scores_per_metric\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4e274bab-7174-4167-a9d9-23cfa2913d64",
   "metadata": {},
   "source": [
    "#### Test Data Loading "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "7f05070d-9b1c-4bd7-bcf3-3154e79a05d4",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "model_based_dict_test = {}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "3eaf7d18-8b9a-4cb0-8256-15b149d8060f",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "with jsonlines.open('test_data_prepared.jsonl') as reader:\n",
    "    for obj in reader:\n",
    "        for response in obj['candidates']:\n",
    "            if model_based_dict_test.get(response['model']):\n",
    "                # print(response['scores'])\n",
    "                # print(model_based_dict_test.get(response['model']))\n",
    "                for k in model_based_dict_test[response['model']].keys():\n",
    "                    # print(k, model_based_dict_test[response['model']][k])\n",
    "                    model_based_dict_test[response['model']][k].append(response['scores'][k])\n",
    "            else:\n",
    "                # print(f\"canttt findd in dictionary {response['model']}\")\n",
    "                model_based_dict_test[response['model']] = defaultdict(list) \n",
    "                for k in response['scores'].keys():\n",
    "                    # print(response['scores'][k])\n",
    "                    model_based_dict_test[response['model']][k].append(response['scores'][k])\n",
    "                    # print(model_based_dict_test.get(response['model']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "30b0e1d2-90a5-4e52-af59-4f53354c273a",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"mix_instruct_test.pickle\",\"wb\") as handle:\n",
    "    pickle.dump(model_based_dict_test, handle, protocol=pickle.HIGHEST_PROTOCOL)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cafd00ee",
   "metadata": {},
   "source": [
    "### Log CDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "9668e65d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Computing log CDF\n",
      "Computing log CDF\n",
      "Computing log CDF\n",
      "Computing log CDF\n",
      "Computing log CDF\n",
      "Computing log CDF\n",
      "Computing log CDF\n",
      "Computing log CDF\n"
     ]
    }
   ],
   "source": [
    "log_cdf_normalized_data = cdf_normalization(model_based_dict_test,log=True)\n",
    "\n",
    "multivariate_format_dict_logcdfNorm = {}\n",
    "for k, v in log_cdf_normalized_data.items():\n",
    "    all_metrics_vals = np.array(list(v.values()))\n",
    "    multivariate_format_dict_logcdfNorm[k] = all_metrics_vals.T\n",
    "    \n",
    "\n",
    "\n",
    "with open(\"mv_logcdfNorm_mix_instruct_test.pickle\",\"wb\") as handle:\n",
    "    pickle.dump(multivariate_format_dict_logcdfNorm, handle, protocol=pickle.HIGHEST_PROTOCOL)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3899f6a6-9785-429b-9796-05d22819fa0b",
   "metadata": {},
   "source": [
    "#### Mix Instruct Test Portfolio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "82f660f6-a6ab-46f5-a41e-e0213ad69e72",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "mix_instruct_test_portfolio = build_portfolio(model_based_dict_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "340857b4-cac0-4f7c-b7c8-24f51df72c9c",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "with open(\"MixInstructTest_Portfolio.pickle\",\"wb\") as handle:\n",
    "    pickle.dump(mix_instruct_test_portfolio, handle, protocol=pickle.HIGHEST_PROTOCOL)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "41536f8f",
   "metadata": {},
   "outputs": [],
   "source": [
    "mix_instruct_test_portfolio_zscore = zscore_normalization(model_based_dict_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "446a1ff8",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
