{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5c9da1c1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import math\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "class Importance_Sampling(object):\n",
    "    def __init__(self, raw_data, theta, gamma, policy, step_offset):\n",
    "        self.raw_data = raw_data\n",
    "        self.theta = theta\n",
    "        self.gamma = gamma\n",
    "        self.traces = []\n",
    "        self.n_action = 0\n",
    "        self.n_user = 0\n",
    "        self.random_prob = 0\n",
    "        self.policy = policy\n",
    "        self.alpha = 0.5\n",
    "        self.step_offset = step_offset # from a step h to calculate ips\n",
    "\n",
    "    def readData(self):\n",
    "        raw_data = self.raw_data\n",
    "\n",
    "        Q_list = ['ps', 'fwe', 'we']\n",
    "        beh_prob_list = ['prob_ps', 'prob_fwe', 'prob_we']\n",
    "        user_list = list(raw_data['userID'].unique())\n",
    "        self.n_action = len(Q_list)\n",
    "        self.n_user = len(user_list)\n",
    "        self.random_prob = 1.0 / self.n_action\n",
    "        \n",
    "\n",
    "        for user in user_list:\n",
    "            user_sequence = []\n",
    "            user_data = raw_data.loc[raw_data['userID'] == user,]\n",
    "            row_index = user_data.index.tolist()\n",
    "            \n",
    "            expert_count = 0\n",
    "            for i in range(0, len(row_index)):\n",
    "                action = user_data.loc[row_index[i], 'real_action']\n",
    "                \n",
    "                reward = user_data.loc[row_index[i], 'inferred_rew']\n",
    "                # critical = user_data.loc[row_index[i], 'critical']\n",
    "                Qs = user_data.loc[row_index[i], Q_list].tolist()\n",
    "                beh_probs = user_data.loc[row_index[i], beh_prob_list].tolist()\n",
    "                \n",
    "                eva_probs = []\n",
    "                if self.policy == 'CHRL':\n",
    "                    if user_data.loc[row_index[i], 'critical'] == 1:\n",
    "                        eva_action = Qs.index(max(Qs))\n",
    "                        eva_probs = [0.8 if x == eva_action else 1e-1 for x in range(self.n_action)]\n",
    "                    else:\n",
    "                        eva_probs = [1/self.n_action for x in range(self.n_action)]\n",
    "                elif self.policy == 'SOCHRL':\n",
    "                    if user_data.loc[row_index[i], 'critical'] == 1:\n",
    "                        eva_action = Qs.index(min(Qs))\n",
    "                        eva_probs = [0.8 if x == eva_action else 1e-1 for x in range(self.n_action)]\n",
    "                    else:\n",
    "                        eva_probs = [1/self.n_action for x in range(self.n_action)]\n",
    "                elif self.policy == 'FHRL':\n",
    "                    eva_action = Qs.index(max(Qs))\n",
    "                    eva_probs = [0.8 if x == eva_action else 1e-1 for x in range(self.n_action)]\n",
    "                \n",
    "                elif self.policy == 'expert':\n",
    "                    eva_action = expert_count\n",
    "                    expert_count += 1\n",
    "                    if expert_count > 2:\n",
    "                        expert_count = 0\n",
    "                    eva_probs = [0.8 if x == eva_action else 1e-1 for x in range(self.n_action)]\n",
    "                elif self.policy == 'RAND':\n",
    "                    eva_probs = [1/self.n_action for x in range(self.n_action)]\n",
    "              \n",
    "                  \n",
    "\n",
    "                user_sequence.append((action, reward, Qs, beh_probs, eva_probs))\n",
    "\n",
    "            self.traces.append(user_sequence)\n",
    "\n",
    "\n",
    "    def IS(self):\n",
    "        IS = 0\n",
    "\n",
    "        for each_student_data in self.traces:\n",
    "            cumul_policy_prob = 1\n",
    "            cumul_random_prob = 1\n",
    "            cumulative_reward = 0\n",
    "\n",
    "            for i, (action, reward, Qs, beh_probs, eva_probs) in enumerate(each_student_data):   \n",
    "\n",
    "#                 print(i)\n",
    "#                 print((action, reward, Qs, beh_probs, eva_probs))\n",
    "                cumul_policy_prob *= eva_probs[action]\n",
    "                cumul_random_prob *= beh_probs[action]\n",
    "                cumulative_reward += math.pow(self.gamma, i+self.step_offset) * reward\n",
    "\n",
    "            weight = cumul_policy_prob / cumul_random_prob\n",
    "            \n",
    "            IS_reward = cumulative_reward * weight\n",
    "\n",
    "            IS += IS_reward\n",
    "\n",
    "        IS = float(IS) / self.n_user\n",
    "        return IS\n",
    "\n",
    "\n",
    "    def WIS(self):\n",
    "        WIS = 0\n",
    "        total_weight = 0\n",
    "\n",
    "        for each_student_data in self.traces:\n",
    "            cumul_policy_prob = 1\n",
    "            cumul_random_prob = 1\n",
    "            cumulative_reward = 0\n",
    "\n",
    "            for i, (action, reward, Qs, beh_probs, eva_probs) in enumerate(each_student_data):\n",
    "\n",
    "                \n",
    "                cumul_policy_prob *= eva_probs[action]\n",
    "                cumul_random_prob *= beh_probs[action]\n",
    "                cumulative_reward += math.pow(self.gamma, i+self.step_offset) * reward\n",
    "\n",
    "            weight = cumul_policy_prob / cumul_random_prob\n",
    "            \n",
    "            total_weight += weight\n",
    "            IS_reward = cumulative_reward * weight\n",
    "\n",
    "            WIS += IS_reward\n",
    "\n",
    "        WIS = float(WIS) / total_weight\n",
    "        return WIS\n",
    "\n",
    "\n",
    "    def PDIS(self):\n",
    "        PDIS = 0\n",
    "\n",
    "        for each_student_data in self.traces:\n",
    "            cumul_policy_prob = 1\n",
    "            cumul_random_prob = 1\n",
    "            PDIS_each_student = 0\n",
    "\n",
    "            for i, (action, reward, Qs, beh_probs, eva_probs) in enumerate(each_student_data):\n",
    "\n",
    "                cumul_policy_prob *= eva_probs[action]\n",
    "                cumul_random_prob *= beh_probs[action]\n",
    "                weight = cumul_policy_prob / cumul_random_prob\n",
    "                \n",
    "                PDIS_each_student += math.pow(self.gamma, i) * reward * weight\n",
    "\n",
    "            PDIS += PDIS_each_student\n",
    "\n",
    "        PDIS = float(PDIS) / self.n_user\n",
    "        return PDIS\n",
    "\n",
    "    # PHWIS-Behvaior\n",
    "    def PHWIS_beh(self):\n",
    "        PHWIS_beh = {}\n",
    "        total_weight = {}\n",
    "        len_traj = {}\n",
    "        count_traj = 0\n",
    "\n",
    "        for each_student_data in self.traces:\n",
    "            tau = len(each_student_data)\n",
    "            if len(each_student_data) in len_traj:\n",
    "                len_traj[tau] += 1\n",
    "            else:\n",
    "                len_traj[tau] = 1\n",
    "                total_weight[tau] = 0\n",
    "                PHWIS_beh[tau] = 0\n",
    "            \n",
    "            count_traj += 1 # total number of trajectories\n",
    "            cumul_policy_prob = 1\n",
    "            cumul_random_prob = 1\n",
    "            cumulative_reward = 0\n",
    "\n",
    "            for i, (action, reward, Qs, beh_probs, eva_probs) in enumerate(each_student_data):\n",
    "\n",
    "                cumul_policy_prob *= eva_probs[action]\n",
    "                cumul_random_prob *= beh_probs[action]\n",
    "                cumulative_reward += math.pow(self.gamma, i) * reward\n",
    "\n",
    "            weight = cumul_policy_prob / cumul_random_prob\n",
    "            \n",
    "            total_weight[tau] += weight\n",
    "            IS_reward = cumulative_reward * weight\n",
    "\n",
    "            PHWIS_beh[tau] += IS_reward\n",
    "\n",
    "        PHWIS_beh = {tau: float(PHWIS_beh[tau]) / total_weight[tau] for tau in PHWIS_beh}\n",
    "        PHWIS_beh_total = sum((len_traj[tau] / count_traj) * PHWIS_beh[tau] for tau in PHWIS_beh)\n",
    "        return PHWIS_beh_total\n",
    "\n",
    "\n",
    "    # PHWIS-Estimated\n",
    "    def PHWIS_est(self):\n",
    "        PHWIS_beh = {}\n",
    "        total_weight = {}\n",
    "        len_traj = {}\n",
    "        count_traj = 0\n",
    "\n",
    "        for each_student_data in self.traces:\n",
    "            tau = len(each_student_data)\n",
    "            if len(each_student_data) in len_traj:\n",
    "                len_traj[tau] += 1\n",
    "            else:\n",
    "                len_traj[tau] = 1\n",
    "                total_weight[tau] = 0\n",
    "                PHWIS_beh[tau] = 0\n",
    "            \n",
    "            count_traj += 1 # total number of trajectories\n",
    "            cumul_policy_prob = 1\n",
    "            cumul_random_prob = 1\n",
    "            cumulative_reward = 0\n",
    "\n",
    "            for i, (action, reward, Qs, beh_probs, eva_probs) in enumerate(each_student_data):\n",
    "\n",
    "                cumul_policy_prob *= eva_probs[action]\n",
    "                cumul_random_prob *= beh_probs[action]\n",
    "                cumulative_reward += math.pow(self.gamma, i) * reward\n",
    "\n",
    "            weight = cumul_policy_prob / cumul_random_prob\n",
    "            \n",
    "            total_weight[tau] += weight\n",
    "            IS_reward = cumulative_reward * weight\n",
    "\n",
    "            PHWIS_beh[tau] += IS_reward\n",
    "\n",
    "        PHWIS_beh = {tau: float(PHWIS_beh[tau]) / total_weight[tau] for tau in PHWIS_beh}\n",
    "        PHWIS_beh_total = sum((len_traj[tau] / count_traj) * PHWIS_beh[tau] for tau in PHWIS_beh)\n",
    "        return PHWIS_beh_total\n",
    "\n",
    "    def DR(self):\n",
    "        DR = 0\n",
    "\n",
    "        for each_student_data in self.traces:\n",
    "            cumul_policy_prob = 1\n",
    "            cumul_random_prob = 1\n",
    "            DR_each_student = 0\n",
    "            previous_weight = 1\n",
    "\n",
    "            for i, (action, reward, Qs, beh_probs, eva_probs) in enumerate(each_student_data):\n",
    "\n",
    "                Q_act = Qs[action]\n",
    "                V = max(Qs)\n",
    "                prob_logP = eva_probs[action]\n",
    "\n",
    "                cumul_policy_prob *= prob_logP\n",
    "                cumul_random_prob *= beh_probs[action]\n",
    "                weight = cumul_policy_prob / cumul_random_prob\n",
    "                \n",
    "                DR_each_student += math.pow(self.gamma, i) * (reward * weight - Q_act * weight + V * previous_weight)\n",
    "\n",
    "                previous_weight = weight\n",
    "\n",
    "            DR += DR_each_student\n",
    "\n",
    "        DR = float(DR) / self.n_user\n",
    "        return DR\n",
    "\n",
    "    def WDR(self):\n",
    "        WDR = 0\n",
    "        DR = 0\n",
    "        total_weight = 0\n",
    "\n",
    "        for each_student_data in self.traces:\n",
    "            cumul_policy_prob = 1\n",
    "            cumul_random_prob = 1\n",
    "            DR_each_student = 0\n",
    "            previous_weight = 1\n",
    "\n",
    "            for i, (action, reward, Qs, beh_probs, eva_probs) in enumerate(each_student_data):\n",
    "\n",
    "                Q_act = Qs[action]\n",
    "                V = max(Qs)\n",
    "                prob_logP = eva_probs[action]\n",
    "\n",
    "                cumul_policy_prob *= prob_logP\n",
    "                cumul_random_prob *= beh_probs[action]\n",
    "                weight = cumul_policy_prob / cumul_random_prob\n",
    "                \n",
    "                DR_each_student += math.pow(self.gamma, i) * (reward * weight - Q_act * weight + V * previous_weight)\n",
    "\n",
    "                previous_weight = weight\n",
    "\n",
    "            each_weight = cumul_policy_prob / cumul_random_prob\n",
    "            total_weight += each_weight\n",
    "\n",
    "            DR += DR_each_student\n",
    "\n",
    "        WDR = float(DR) / total_weight\n",
    "        return WDR\n",
    "   \n",
    "    def FQE(self):\n",
    "        FQE = 0\n",
    "\n",
    "        for each_student_data in self.traces:\n",
    "            (action, reward, Qs, beh_probs, eva_probs) = each_student_data[0]\n",
    "            Q_act = Qs[action]\n",
    "            prob_logP = eva_probs[action]\n",
    "            FQE += prob_logP * Q_act\n",
    "\n",
    "        FQE = float(FQE) / self.n_user\n",
    "        return FQE"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
