{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f5c5f64c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import random\n",
    "import time\n",
    "import matplotlib.pyplot as plt\n",
    "import pickle\n",
    "\n",
    "def sarsa(episodes=100000, alpha=0.01, epsilon_min=0.0, max_steps=500, decay_episodes = 25000):\n",
    "    Q = np.zeros((GRID_HEIGHT, GRID_WIDTH, NUM_ACTIONS)) + 10\n",
    "    rewards = []\n",
    "    epsilon = epsilon_min\n",
    "    for episode in range(episodes):\n",
    "        if episode < decay_episodes:\n",
    "            epsilon = 1.0 - (episode / decay_episodes)\n",
    "        else:\n",
    "            epsilon = 0.0\n",
    "\n",
    "        state = START\n",
    "        action = choose_action(Q, state, epsilon)\n",
    "        done = False\n",
    "        total_reward = 0\n",
    "        \n",
    "        for t in range(max_steps):\n",
    "            next_state, reward, done = step(state, action)\n",
    "            next_action = choose_action(Q, next_state, epsilon)\n",
    "\n",
    "            x, y = state\n",
    "            nx, ny = next_state\n",
    "\n",
    "            Q[x, y, action] += alpha * (\n",
    "                reward + Q[nx, ny, next_action] - Q[x, y, action]\n",
    "            )\n",
    "            total_reward += reward\n",
    "            state = next_state\n",
    "            action = next_action\n",
    "            \n",
    "            rewards.append(total_reward)\n",
    "            \n",
    "            if done:\n",
    "                break\n",
    "        rewards.append(total_reward)\n",
    "\n",
    "    return Q, rewards\n",
    "\n",
    "\n",
    "def PCVaR_Q_Pre_train(Q, num_simul=10000):\n",
    "    Q_cvar_sum = np.zeros((GRID_HEIGHT, GRID_WIDTH, len(H), NUM_ACTIONS)) \n",
    "    M_sum = np.zeros((GRID_HEIGHT, GRID_WIDTH, len(H), NUM_ACTIONS))      \n",
    "    Count = np.zeros((GRID_HEIGHT, GRID_WIDTH, len(H), NUM_ACTIONS))       \n",
    "    rewards = []\n",
    "    for episode in range(num_simul):\n",
    "        if episode % 10000 == 0:\n",
    "            print(f\"Episode {episode}\")\n",
    "        while True:\n",
    "            state = (random.randint(0, 7), random.randint(0, 9))\n",
    "            if state not in OBSTACLES and state != GOAL:\n",
    "                break\n",
    "        total_reward = 0\n",
    "        done = False\n",
    "        trajectory = []\n",
    "\n",
    "        time_step = 0\n",
    "        action = choose_action(Q, state, 0.6)\n",
    "        while not done:\n",
    "            next_state, reward, done = step(state, action)\n",
    "            trajectory.append((total_reward, state, action, reward))\n",
    "            state = next_state\n",
    "            total_reward += reward\n",
    "            time_step +=1\n",
    "            if time_step > 1000:\n",
    "                break\n",
    "            action = choose_action(Q, state, 0.0)\n",
    "        rewards.append(total_reward)\n",
    "        \n",
    "        G = 0\n",
    "        T = len(trajectory) - 1\n",
    "        Remain_sum = []\n",
    "        for _, _, _, r in reversed(trajectory):\n",
    "            G = r + G\n",
    "            Remain_sum.insert(0, G)\n",
    "            T -= 1\n",
    "\n",
    "        for t, ((sum_r, s, a, r), Gt) in enumerate(zip(trajectory, Remain_sum)):\n",
    "            s_x, s_y = s\n",
    "            for i, h in enumerate(H):\n",
    "                idx = np.clip(i - int(round(sum_r)),0,len(H)-1)\n",
    "                indicator = 1.0 if Gt <= H[idx] else 0.0\n",
    "                Count[s_x, s_y, idx, a] += 1\n",
    "                M_sum[s_x, s_y, idx, a] += indicator\n",
    "                Q_cvar_sum[s_x, s_y, idx, a] += Gt * indicator\n",
    "                if ((i - int(round(sum_r))) < 0) or ((i - int(round(sum_r))) > (len(H) - 1)):\n",
    "                    break\n",
    "\n",
    "    M = np.zeros_like(M_sum) \n",
    "    Q_cvar = np.zeros_like(Q_cvar_sum) \n",
    "\n",
    "    valid = Count > 0\n",
    "    M[valid] = M_sum[valid] / Count[valid]\n",
    "    Q_cvar[valid] = Q_cvar_sum[valid] / Count[valid]\n",
    "    Q_cvar[7,9, :, :] = 0\n",
    "    Q_cvar[5,2, :, :] = 0\n",
    "    Q_cvar[5,5, :, :] = 0\n",
    "    Q_cvar[2,4, :, :] = 0\n",
    "    Q_cvar[6,7, :, :] = 0\n",
    "    M[7,9, 151:, :] = 1\n",
    "    M[5,2, 151:, :] = 1\n",
    "    M[5,5, 151:, :] = 1\n",
    "    M[2,4, 151:, :] = 1\n",
    "    M[6,7, 151:, :] = 1\n",
    "    \n",
    "    return Q_cvar, M, rewards\n",
    "\n",
    "def CVaR_Q_Pre_train(Q, num_simul=10000):\n",
    "    Q_cvar_sum = np.zeros((GRID_HEIGHT, GRID_WIDTH, len(H), NUM_ACTIONS))\n",
    "    Count = np.zeros((GRID_HEIGHT, GRID_WIDTH, len(H), NUM_ACTIONS))       \n",
    "    rewards = []\n",
    "    for episode in range(num_simul):\n",
    "        if (episode+1) % 10000 == 0:\n",
    "            print(f\"Episode {(episode+1)}\")\n",
    "        while True:\n",
    "            state = (random.randint(0, 7), random.randint(0, 9))\n",
    "            if state not in OBSTACLES and state != GOAL:\n",
    "                break\n",
    "        total_reward = 0\n",
    "        done = False\n",
    "        trajectory = []\n",
    "\n",
    "        time_step = 0\n",
    "        action = choose_action(Q, state, 0.6)\n",
    "        while not done:\n",
    "            next_state, reward, done = step(state, action)\n",
    "            trajectory.append((total_reward, state, action, reward))\n",
    "            state = next_state\n",
    "            total_reward += reward\n",
    "            time_step +=1\n",
    "            if time_step > 1000:\n",
    "                break\n",
    "            action = choose_action(Q, state, 0.0)\n",
    "        rewards.append(total_reward)\n",
    "\n",
    "        G = 0\n",
    "        T = len(trajectory) - 1\n",
    "        Remain_sum = []\n",
    "        for _, _, _, r in reversed(trajectory):\n",
    "            G = r + G\n",
    "            Remain_sum.insert(0, G)\n",
    "            T -= 1\n",
    "\n",
    "        for t, ((sum_r, s, a, r), Gt) in enumerate(zip(trajectory, Remain_sum)):\n",
    "            s_x, s_y = s\n",
    "            for i, h in enumerate(H):\n",
    "                idx = np.clip(i - int(round(sum_r)),0,len(H)-1)\n",
    "                indicator = 1.0 if Gt <= H[idx] else 0.0\n",
    "                Count[s_x, s_y, idx, a] += 1\n",
    "                Q_cvar_sum[s_x, s_y, idx, a] += (H[idx] - Gt) * indicator\n",
    "                if ((i - int(round(sum_r))) < 0) or ((i - int(round(sum_r))) > (len(H) - 1)):\n",
    "                    break\n",
    "    valid = Count > 0\n",
    "    Q_cvar = np.zeros_like(Q_cvar_sum)\n",
    "    Q_cvar[valid] = Q_cvar_sum[valid] / Count[valid]\n",
    "    for i,h in enumerate(H[150:]):\n",
    "        Q_cvar[7,9, i+150, :] = h\n",
    "        Q_cvar[5,2, i+150, :] = h\n",
    "        Q_cvar[5,5, i+150, :] = h\n",
    "        Q_cvar[2,4, i+150, :] = h\n",
    "        Q_cvar[6,7, i+150, :] = h\n",
    "    return Q_cvar, rewards\n",
    "\n",
    "def update_CVaR(Q_cvar, H, lr1, trajectory):\n",
    "    Q_est = np.zeros_like(Q_cvar)\n",
    "    count = np.zeros_like(Q_cvar)\n",
    "    G = 0\n",
    "    Remain_sum = []\n",
    "    for _, _, _, r, _ in reversed(trajectory):\n",
    "        G = r + G\n",
    "        Remain_sum.insert(0, G)\n",
    "    for t, ((sum_r, s, a, r, s_next), Gt) in enumerate(zip(trajectory, Remain_sum)):\n",
    "        for i in range(len(H)):\n",
    "            idx = i - int(round(sum_r))\n",
    "            if ((idx < 0) or (idx > (len(H) - 1))):\n",
    "                break\n",
    "            count[s[0],s[1],idx, a] += 1\n",
    "            next_idx = np.clip(idx - int(round(r)), 0, len(H) -1)\n",
    "            q_values = Q_cvar[s_next[0], s_next[1], next_idx, :]    \n",
    "            min_q = np.min(q_values)\n",
    "            min_actions = np.where(q_values == min_q)[0]\n",
    "            a_next =  np.random.choice(min_actions)\n",
    "            Q_est[s[0],s[1],idx,a] += (Q_cvar[s_next[0], s_next[1],next_idx, a_next])\n",
    "    valid = count > 0\n",
    "    Q_cvar[valid] += lr1*(Q_est[valid]/count[valid] - Q_cvar[valid])  \n",
    "    return Q_cvar\n",
    "\n",
    "def CVaR_Q_learning(Q_cvar, decay_episodes=2000):\n",
    "    alpha_theta = 0.01\n",
    "    eta_set = H\n",
    "    rewards = []\n",
    "    eta_index= int(eta_RN) + 150\n",
    "    eta = eta_RN\n",
    "    cvar_hist = []\n",
    "    sigma = 45\n",
    "    for episode in range(num_episodes):\n",
    "        epsilon_t = max(1.0 - (episode / decay_episodes), 0.0)\n",
    "        Trajectory = []\n",
    "        state = START\n",
    "        done = False\n",
    "        total_reward = 0\n",
    "        eta_t_idx = eta_index\n",
    "        t = 0\n",
    "        while not done:\n",
    "            action = choose_action_CVaR(Q_cvar, state, eta_t_idx, epsilon_t)\n",
    "            next_state, reward, done = step(state, action)\n",
    "            Trajectory.append([total_reward, state, action, reward, next_state])\n",
    "            total_reward += reward\n",
    "            eta_t_idx =  np.clip(eta_t_idx - int(round(reward)), 0, len(H) -1) \n",
    "            state = next_state\n",
    "            t += 1\n",
    "            if t > 1000:\n",
    "                done = True\n",
    "        rewards.append(total_reward)\n",
    "        Q_cvar = update_CVaR(Q_cvar, H, alpha_theta, Trajectory)\n",
    "        if((episode + 1)%1000 == 0):\n",
    "            var_est = -1000000000\n",
    "            eta = 0\n",
    "            Q_start = Q_cvar[START[0], START[1]]\n",
    "            for i, h in enumerate(H):\n",
    "                val_all = h - Q_start[i]/q\n",
    "                max_val = np.max(val_all)\n",
    "                if max_val > var_est:\n",
    "                    var_est = max_val\n",
    "                    eta = h\n",
    "                    eta_index = i\n",
    "            sigma = max(45*(3 - ((episode + 1) // 2000)),0)\n",
    "        sample_eta = np.random.normal(loc=eta, scale=sigma)\n",
    "        sample_eta = np.clip(sample_eta, eta - 2*sigma, eta + 2*sigma)\n",
    "        eta_index = np.clip(int(round(sample_eta) + 150), 0,250)\n",
    "        if((episode + 1)%1000 == 0):\n",
    "            print(eta_index - 150, eta)\n",
    "            rewards_test = []\n",
    "            for iter in range(10000):\n",
    "                state = START\n",
    "                done = False\n",
    "                eta_t_idx = eta_index\n",
    "                total_reward = 0 \n",
    "                t = 0\n",
    "                while not done:\n",
    "                    action = choose_action_CVaR(Q_cvar, state, eta_t_idx, 0.0)\n",
    "                    next_state, reward, done = step(state, action)\n",
    "                    total_reward += reward\n",
    "                    eta_t_idx =  np.clip(eta_t_idx - int(round(reward)), 0, len(H) -1) \n",
    "                    state = next_state\n",
    "                    t += 1\n",
    "                    if t > 1000:\n",
    "                        done = True\n",
    "                rewards_test.append(total_reward)\n",
    "            rewards_test = np.array(rewards_test)\n",
    "            var_test = np.percentile(rewards_test, q * 100 )\n",
    "            cvar_test = np.mean(rewards_test[rewards_test <= var_test])\n",
    "            cvar_hist. append(cvar_test)\n",
    "    return Q_cvar, cvar_hist\n",
    "\n",
    "def update_PCVaR(Q_cvar, M, H, lr1, lr2, trajectory):\n",
    "    Q_est = np.zeros_like(Q_cvar)\n",
    "    M_est = np.zeros_like(M)\n",
    "    count = np.zeros_like(Q_cvar)\n",
    "\n",
    "    for t, ((sum_r, s, a, r, s_next)) in enumerate(trajectory):\n",
    "        for i in range(len(H)):\n",
    "            idx = i - int(round(sum_r))\n",
    "            if ((idx < 0) or (idx > (len(H) - 1))):\n",
    "                break\n",
    "            count[s[0],s[1],idx, a] += 1\n",
    "            next_idx = np.clip(idx - int(round(r)), 0, len(H) -1)\n",
    "            q_values = Q_cvar[s_next[0], s_next[1], next_idx, :] - H[next_idx]*M[s_next[0], s_next[1], next_idx, :]     \n",
    "            max_q = np.max(q_values)\n",
    "            max_actions = np.where(q_values == max_q)[0]\n",
    "            a_next =  np.random.choice(max_actions)\n",
    "            Q_est[s[0],s[1],idx,a] += (Q_cvar[s_next[0], s_next[1],next_idx, a_next] + M[s_next[0], s_next[1],next_idx, a_next]*r)\n",
    "            M_est[s[0],s[1],idx, a] += M[s_next[0], s_next[1],next_idx, a_next]\n",
    "    valid = count > 0\n",
    "    Q_cvar[valid] += lr1*(Q_est[valid]/count[valid] - Q_cvar[valid])  \n",
    "    M[valid] += lr2*(M_est[valid]/count[valid] - M[valid]) \n",
    "    return Q_cvar, M    \n",
    "\n",
    "def PCVaR_Q_learning(Q_cvar, M, decay_episodes=2000):\n",
    "    start_time = time.time() \n",
    "    alpha_theta = 0.01\n",
    "    alpha_phi = 0.01\n",
    "    eta_set = H\n",
    "    rewards = []\n",
    "    eta_index= int(eta_RN) + 150\n",
    "    eta = eta_RN\n",
    "    sigma = 45\n",
    "    cvar_hist = []\n",
    "    for episode in range(num_episodes):\n",
    "        epsilon_t = max(1.0 - (episode / decay_episodes), 0.0)\n",
    "        Trajectory = []\n",
    "        state = START\n",
    "        done = False\n",
    "        total_reward = 0\n",
    "        eta_t_idx = eta_index\n",
    "        t = 0\n",
    "        while not done:\n",
    "            action = choose_action_PCVaR(Q_cvar, M, state, eta_t_idx, epsilon_t)\n",
    "            next_state, reward, done = step(state, action)\n",
    "            Trajectory.append([total_reward, state, action, reward, next_state])\n",
    "            total_reward += reward\n",
    "            eta_t_idx =  np.clip(eta_t_idx - int(round(reward)), 0, len(H) -1) \n",
    "            state = next_state\n",
    "            t += 1\n",
    "            if t > 1000:\n",
    "                done = True\n",
    "        rewards.append(total_reward)\n",
    "        Q_cvar, M = update_PCVaR(Q_cvar, M, H, alpha_theta, alpha_phi, Trajectory)\n",
    "        if((episode + 1)%1000 == 0):\n",
    "            var_est = -10000000000\n",
    "            eta = 0\n",
    "            Q_start = Q_cvar[START[0], START[1]]\n",
    "            M_start = M[START[0], START[1]]\n",
    "            for i, h in enumerate(H):\n",
    "                val_all = h * (q - M_start[i]) + Q_start[i]\n",
    "                max_val = np.max(val_all)\n",
    "                if max_val > var_est:\n",
    "                    var_est = max_val\n",
    "                    eta = h\n",
    "            sigma = max(45*(3 - ((episode + 1) // 2000)),0)\n",
    "        sample_eta = np.random.normal(loc=eta, scale=sigma)\n",
    "        sample_eta = np.clip(sample_eta, eta - 2*sigma, eta + 2*sigma)\n",
    "        eta_index = np.clip(int(round(sample_eta) + 150), 0,250)\n",
    "        \n",
    "        if((episode + 1)%1000 == 0):\n",
    "            print(eta_index - 150, eta)\n",
    "            rewards_test = []\n",
    "            for iter in range(10000):\n",
    "                state = START\n",
    "                done = False\n",
    "                eta_t_idx = int(eta) + 150\n",
    "                total_reward = 0 \n",
    "                t = 0\n",
    "                while not done:\n",
    "                    action = choose_action_PCVaR(Q_cvar, M, state, eta_t_idx, 0.0)\n",
    "                    next_state, reward, done = step(state, action)\n",
    "                    total_reward += reward\n",
    "                    eta_t_idx =  np.clip(eta_t_idx - int(round(reward)), 0, len(H) -1) \n",
    "                    state = next_state\n",
    "                    t += 1\n",
    "                    if t > 1000:\n",
    "                        done = True\n",
    "                rewards_test.append(total_reward)\n",
    "            rewards_test = np.array(rewards_test)\n",
    "            var_test = np.percentile(rewards_test, q * 100 )\n",
    "            cvar_test = np.mean(rewards_test[rewards_test <= var_test])\n",
    "            cvar_hist. append(cvar_test)\n",
    "    return Q_cvar, M, cvar_hist"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
