#!/usr/bin/env python
# coding: utf-8

# In[1]:


import numpy as np
import random
import matplotlib.pyplot as plt
from collections import deque
import pickle


# In[2]:


def next_state(s, a):
    """Return the successor of state ``s`` under action ``a``.

    The chain is deterministic: state 0 only supports action 0 (leading to
    state 1), while states 1 and 2 branch on actions 0 and 1.  Any other
    ``(s, a)`` pair has no entry in the table and yields ``None``.
    """
    transitions = (
        ((0, 0), 1),
        ((1, 0), 2),
        ((1, 1), 3),
        ((2, 0), 4),
        ((2, 1), 5),
    )
    current = (s, a)
    for pair, successor in transitions:
        if current == pair:
            return successor
    return None


# In[3]:


def epsilon_greedy(state, idx,  epsilon):
    """Choose action 0 or 1 for ``(state, idx)`` from the global ``Q_cvar`` table.

    With probability ``epsilon`` the action is drawn uniformly at random.
    Otherwise the action with the larger ``Q_cvar[state, idx, :]`` entry is
    returned (as the result of ``np.argmax``); exact ties are broken
    uniformly at random.
    """
    # The exploration draw always happens first, before Q_cvar is consulted.
    explore = random.uniform(0, 1) < epsilon
    if explore:
        return random.choice([0, 1])

    q_first = Q_cvar[state, idx, 0]
    q_second = Q_cvar[state, idx, 1]
    if q_first != q_second:
        return np.argmax(Q_cvar[state, idx, :])
    # Both actions look equally good: pick one at random.
    return random.choice([0, 1])
    
def update_Q_cvar_adam(lr, trajectory_batch):
    """Apply one Adam step to the global ``Q_cvar`` table from a batch of rollouts.

    Every recorded step ``[sum_r, s, a, r]`` is replayed once per threshold
    index ``i`` of ``H``.  The index is shifted by the reward collected before
    the step (``sum_r``) to locate the entry being updated, and further by the
    step reward ``r`` to locate the successor entry.  The target for an entry
    is the average, over all replays that hit it, of the successor's greedy
    ``Q_cvar`` value; ``target - Q_cvar`` is fed to Adam as the update
    direction.

    Args:
        lr: Step size applied to the bias-corrected Adam direction.
        trajectory_batch: Iterable of trajectories, each a sequence of
            ``[sum_r, s, a, r]`` entries with ``s`` in ``{0, 1, 2}``.

    Side effects:
        Mutates the globals ``Q_cvar``, ``m_Q`` and ``v_Q`` in place and
        advances the global Adam step counter ``t_adam`` by one.  Greedy
        tie-breaking inside ``epsilon_greedy`` may consume the ``random``
        stream.
    """
    global t_adam
    n_levels = len(H)
    last_level = n_levels - 1
    Q_est = np.zeros((3, n_levels, 2))
    count = np.zeros((3, n_levels, 2))

    # Accumulate bootstrap targets; the loop order (threshold, trajectory,
    # step) fixes the order in which random tie-breaks are drawn.
    for i in range(n_levels):
        for trajectory in trajectory_batch:
            for sum_r, s, a, r in trajectory:
                # Rewards move the threshold index in units of 0.1.
                idx = np.clip(i - int(10 * round(sum_r, 1)), 0, last_level)
                count[s, idx, a] += 1
                next_idx = np.clip(idx - int(10 * round(r, 1)), 0, last_level)
                s_next = next_state(s, a)
                a_next = epsilon_greedy(s_next, next_idx, 0.0)
                Q_est[s, idx, a] += Q_cvar[s_next, next_idx, a_next]

    # Bias-correction denominators are identical for every entry in this step.
    bias1 = 1 - beta1 ** t_adam
    bias2 = 1 - beta2 ** t_adam

    for s in range(3):
        for i in range(n_levels):
            for a in range(2):
                if count[s, i, a] == 0:
                    # Entry not visited by any replay: leave it and its moments.
                    continue
                grad = (Q_est[s, i, a] / count[s, i, a]) - Q_cvar[s, i, a]
                m_Q[s, i, a] = beta1 * m_Q[s, i, a] + (1 - beta1) * grad
                v_Q[s, i, a] = beta2 * v_Q[s, i, a] + (1 - beta2) * (grad ** 2)
                m_hat = m_Q[s, i, a] / bias1
                v_hat = v_Q[s, i, a] / bias2
                Q_cvar[s, i, a] += lr * m_hat / (np.sqrt(v_hat) + epsilon_adam)

    # Advance the Adam time step.  Previously it was never incremented, so the
    # bias correction stayed frozen at its t == 1 value for the whole run.
    t_adam += 1


# In[4]:


### Shared experiment parameters
# Reward parameters: mu[0], mu[1] and mu[3] are the means of unit-variance
# normal rewards; mu[2] and mu[4] are paid out as fixed rewards.
mu = np.array([1.0, 2.0, 3.0, 2.0, 1.5])
# NOTE(review): y1_opt is not referenced anywhere in the visible code.
y1_opt = 0.25
# Index into H at which the objective H*q + Q_cvar[0, :, 0] is logged.
eta_opt_index = 131
# Threshold grid: 251 evenly spaced points on [-10, 15], i.e. a spacing of 0.1.
H = np.linspace(-10.0,15.0,251)
# Tail level: used as the percentile (q * 100) in the evaluation and as the
# weight on H in the objective H*q + Q_cvar[0, :, 0].
q = 0.1
# Exploration rate; multiplied by decay_rate once every 100 episodes.
epsilon_initial = 0.1 
decay_rate = 0.9  
num_episodes = 5001  
# Number of rollouts collected per episode before each update.
batch = 8
# The two values below are reassigned by every run cell before they are used.
alpha_theta_initial = 0.05 
seed_value = 42


# # M

# In[5]:


# Adam hyper-parameters read by update_Q_cvar_adam.
beta1, beta2, epsilon_adam = 0.9, 0.999, 1e-8

# First/second-moment buffers, indexed like Q_cvar: (state, threshold index, action).
m_Q = np.zeros(( 6, len(H), 2))
v_Q = np.zeros(( 6, len(H), 2))
# NOTE(review): m_g / v_g are allocated but not referenced in the visible code.
m_g = np.zeros(( 6, len(H), 2))
v_g = np.zeros(( 6, len(H), 2))

t_adam = 1  # time step used in Adam's bias-correction terms
alpha_theta_initial = 0.01 # Learning rate
epsilon_initial = 0.1 
seed_value = 42
# Index into H of the threshold used when rolling out episodes.
eta_index = 100
# Seed both generators: rewards come from np.random, action choices from random.
np.random.seed(seed_value)
random.seed(seed_value)
# States 0-2 (the ones update_Q_cvar_adam modifies) start at 5.0 everywhere.
Q_cvar = np.zeros(( 6, len(H), 2))
Q_cvar[:3,:,:] = 5.0
# States 3-5 have no outgoing transition in next_state; their value is -h for
# the thresholds H[100:] and stays 0 for the lower indices.
for j, h in enumerate(H[100:]):
    i = j + 100
    Q_cvar[3:,i,:] = -h           
Q_cvar_old = Q_cvar.copy()
Q_cvar_norm_hist = []
eta = H[eta_index] 
# Per-run logs, emptied at the start of every run.
Q_cvar_hist = []
Q_cvar_hist2 = []
Q_cvar_hist3 = []
eta_hist = []
prop = []
M0_hist = []
cvar_hist = []
# Training loop: every episode collects `batch` epsilon-greedy rollouts and then
# applies one update to Q_cvar.  Every 500 episodes the greedy policy is
# evaluated first and the rollout threshold index eta_index is refreshed after.
for episode in range(num_episodes):
    # This run keeps the learning rate constant.
    alpha_theta = alpha_theta_initial
    # The exploration rate shrinks by a factor decay_rate every 100 episodes.
    epsilon = epsilon_initial * (decay_rate ** (episode//100))
    trajectory_batch = [] 
    for b in range(batch):
        trajectory = []
        total_reward = 0
        # State 0: single action 0 with a N(mu[0], 1) reward.
        state = 0 
        action = 0
        reward = np.random.normal(mu[0], 1)
        # Steps are stored as [reward accumulated before the step, state, action, reward].
        trajectory.append([total_reward, state, action, reward])
        total_reward += reward
        state = 1
        # Shift the threshold index down by the reward, in grid units of 0.1.
        eta_t_idx =  np.clip(eta_index - int(10 * round(reward,1)), 0, len(H) - 1) 
        action = epsilon_greedy(state, eta_t_idx, epsilon)
        # State 1: action 0 -> noisy reward and state 2; action 1 -> fixed reward mu[2] and state 3.
        if action == 0:
            reward = np.random.normal(mu[1], 1)
            state = 2
        else:
            reward = mu[2]
            state = 3
        trajectory.append([total_reward, 1, action, reward])
        total_reward += reward
        # Action 1 ends the rollout after two steps.
        if action == 1:
            trajectory_batch.append(trajectory)
            continue
        eta_t_idx =  np.clip(eta_t_idx - int(10 * round(reward,1)), 0, len(H) -1) 
        action = epsilon_greedy(state, eta_t_idx, epsilon)
        # State 2: action 0 -> noisy reward and state 4; action 1 -> fixed reward mu[4] and state 5.
        if action == 0:
            reward = np.random.normal(mu[3], 1)
            state = 4
        else:
            reward = mu[4]
            state = 5
        trajectory.append([total_reward, 2, action, reward])
        trajectory_batch.append(trajectory)
    # Periodic evaluation: roll out the greedy policy (epsilon = 0) test_num times.
    if(episode%500 == 0):
        test_num = 100000
        reward_cvar = np.zeros(test_num)
        r1_cvar = np.zeros(test_num)
        r2_cvar = np.zeros(test_num)
        r3_cvar = np.zeros(test_num)
        # 5 is outside the action set {0, 1}; presumably a "not reached" marker.
        act2_cvar = np.zeros(test_num) + 5
        act3_cvar = np.zeros(test_num) + 5
        for c in range(test_num):
            total_reward = 0
            state = 0 
            reward = np.random.normal(mu[state], 1)
            r1_cvar[c] = reward
            total_reward += r1_cvar[c]
            state = 1
            eta_t_idx =  np.clip(eta_index - int(10 * round(reward,1)), 0, len(H) -1) 
            action = epsilon_greedy(state, eta_t_idx, 0.0)
            if action == 0:
                reward = np.random.normal(mu[1], 1)
                state = 2
            else:
                reward = mu[2]
                state = 3
            act2_cvar[c] = action
            r2_cvar[c] = reward
            total_reward += reward
            if action == 1:
                reward_cvar[c] = total_reward
                continue
            eta_t_idx =  np.clip(eta_t_idx - int(10 * round(reward,1)), 0, len(H) -1) 
            action = epsilon_greedy(state, eta_t_idx, 0.0)
            if action == 0:
                reward = np.random.normal(mu[3], 1)
                state = 4
            else:
                reward = mu[4]
                state = 5
                # NOTE(review): state is not read again before it is reset for
                # the next rollout, so this assignment has no effect - confirm
                # whether it is a leftover.
                state = 1
            act3_cvar[c] = action
            r3_cvar[c] = reward
            total_reward += reward
            reward_cvar[c] = total_reward
        # Empirical q-quantile of the returns and the mean of the returns at or below it.
        var_q = np.percentile(reward_cvar, q * 100)
        cvar_q = np.mean(reward_cvar[reward_cvar<= var_q])
        cvar_hist.append(cvar_q)   
    update_Q_cvar_adam(alpha_theta, trajectory_batch)
    # Log the objective H*q + Q_cvar[0, :, 0] at its maximiser and at eta_opt_index.
    eta_idx = np.argmax(H[:]*q + Q_cvar[0, :, 0])
    Q_cvar_hist3.append(H[eta_idx]*q + Q_cvar[0 ,eta_idx,0])
    Q_cvar_hist.append(H[eta_opt_index]*q + Q_cvar[0 ,eta_opt_index,0])
    # Every 500 episodes, move the rollout threshold to the current maximiser.
    if(episode%500 == 0):
        eta_index = np.argmax(H[:]*q + Q_cvar[0, :, 0])
        eta = H[eta_index]
        eta_hist.append(eta_index)
        # NOTE(review): at episode 0 the new index is discarded in favour of
        # 100, eta_hist receives both values, and eta keeps the discarded
        # index's threshold - confirm this is intended.
        if episode == 0:
            eta_index = 100
            eta_hist.append(eta_index)


# In[8]:


# Adam hyper-parameters read by update_Q_cvar_adam.
beta1, beta2, epsilon_adam = 0.9, 0.999, 1e-8

# First/second-moment buffers, indexed like Q_cvar: (state, threshold index, action).
m_Q = np.zeros(( 6, len(H), 2))
v_Q = np.zeros(( 6, len(H), 2))
# NOTE(review): m_g / v_g are allocated but not referenced in the visible code.
m_g = np.zeros(( 6, len(H), 2))
v_g = np.zeros(( 6, len(H), 2))

t_adam = 1  # time step used in Adam's bias-correction terms
alpha_theta_initial = 0.01 # Learning rate
epsilon_initial = 0.1 
seed_value = 0
# Index into H of the threshold used when rolling out episodes.
eta_index = 100
# Seed both generators: rewards come from np.random, action choices from random.
np.random.seed(seed_value)
random.seed(seed_value)
# States 0-2 (the ones update_Q_cvar_adam modifies) start at 5.0 everywhere.
Q_cvar = np.zeros(( 6, len(H), 2))
Q_cvar[:3,:,:] = 5.0
# States 3-5 have no outgoing transition in next_state; their value is -h for
# the thresholds H[100:] and stays 0 for the lower indices.
for j, h in enumerate(H[100:]):
    i = j + 100
    Q_cvar[3:,i,:] = -h           
Q_cvar_old = Q_cvar.copy()
Q_cvar_norm_hist = []
eta = H[eta_index] 
# Per-run logs, emptied at the start of every run.
Q_cvar_hist = []
Q_cvar_hist2 = []
Q_cvar_hist3 = []
eta_hist = []
prop = []
M0_hist = []
cvar_hist = []
# Training loop: every episode collects `batch` epsilon-greedy rollouts and then
# applies one update to Q_cvar.  Every 500 episodes the greedy policy is
# evaluated first and the rollout threshold index eta_index is refreshed after.
for episode in range(num_episodes):
    # Learning rate and exploration rate both shrink by decay_rate every 100 episodes.
    alpha_theta = alpha_theta_initial * (decay_rate ** (episode//100))
    epsilon = epsilon_initial * (decay_rate ** (episode//100))
    trajectory_batch = [] 
    for b in range(batch):
        trajectory = []
        total_reward = 0
        # State 0: single action 0 with a N(mu[0], 1) reward.
        state = 0 
        action = 0
        reward = np.random.normal(mu[0], 1)
        # Steps are stored as [reward accumulated before the step, state, action, reward].
        trajectory.append([total_reward, state, action, reward])
        total_reward += reward
        state = 1
        # Shift the threshold index down by the reward, in grid units of 0.1.
        eta_t_idx =  np.clip(eta_index - int(10 * round(reward,1)), 0, len(H) - 1) 
        action = epsilon_greedy(state, eta_t_idx, epsilon)
        # State 1: action 0 -> noisy reward and state 2; action 1 -> fixed reward mu[2] and state 3.
        if action == 0:
            reward = np.random.normal(mu[1], 1)
            state = 2
        else:
            reward = mu[2]
            state = 3
        trajectory.append([total_reward, 1, action, reward])
        total_reward += reward
        # Action 1 ends the rollout after two steps.
        if action == 1:
            trajectory_batch.append(trajectory)
            continue
        eta_t_idx =  np.clip(eta_t_idx - int(10 * round(reward,1)), 0, len(H) -1) 
        action = epsilon_greedy(state, eta_t_idx, epsilon)
        # State 2: action 0 -> noisy reward and state 4; action 1 -> fixed reward mu[4] and state 5.
        if action == 0:
            reward = np.random.normal(mu[3], 1)
            state = 4
        else:
            reward = mu[4]
            state = 5
        trajectory.append([total_reward, 2, action, reward])
        trajectory_batch.append(trajectory)
    # Periodic evaluation: roll out the greedy policy (epsilon = 0) test_num times.
    if(episode%500 == 0):
        test_num = 100000
        reward_cvar = np.zeros(test_num)
        r1_cvar = np.zeros(test_num)
        r2_cvar = np.zeros(test_num)
        r3_cvar = np.zeros(test_num)
        # 5 is outside the action set {0, 1}; presumably a "not reached" marker.
        act2_cvar = np.zeros(test_num) + 5
        act3_cvar = np.zeros(test_num) + 5
        for c in range(test_num):
            total_reward = 0
            state = 0 
            reward = np.random.normal(mu[state], 1)
            r1_cvar[c] = reward
            total_reward += r1_cvar[c]
            state = 1
            eta_t_idx =  np.clip(eta_index - int(10 * round(reward,1)), 0, len(H) -1) 
            action = epsilon_greedy(state, eta_t_idx, 0.0)
            if action == 0:
                reward = np.random.normal(mu[1], 1)
                state = 2
            else:
                reward = mu[2]
                state = 3
            act2_cvar[c] = action
            r2_cvar[c] = reward
            total_reward += reward
            if action == 1:
                reward_cvar[c] = total_reward
                continue
            eta_t_idx =  np.clip(eta_t_idx - int(10 * round(reward,1)), 0, len(H) -1) 
            action = epsilon_greedy(state, eta_t_idx, 0.0)
            if action == 0:
                reward = np.random.normal(mu[3], 1)
                state = 4
            else:
                reward = mu[4]
                state = 5
                # NOTE(review): state is not read again before it is reset for
                # the next rollout, so this assignment has no effect - confirm
                # whether it is a leftover.
                state = 1
            act3_cvar[c] = action
            r3_cvar[c] = reward
            total_reward += reward
            reward_cvar[c] = total_reward
        # Empirical q-quantile of the returns and the mean of the returns at or below it.
        var_q = np.percentile(reward_cvar, q * 100)
        cvar_q = np.mean(reward_cvar[reward_cvar<= var_q])
        cvar_hist.append(cvar_q)   
    update_Q_cvar_adam(alpha_theta, trajectory_batch)
    # Log the objective H*q + Q_cvar[0, :, 0] at its maximiser and at eta_opt_index.
    eta_idx = np.argmax(H[:]*q + Q_cvar[0, :, 0])
    Q_cvar_hist3.append(H[eta_idx]*q + Q_cvar[0 ,eta_idx,0])
    Q_cvar_hist.append(H[eta_opt_index]*q + Q_cvar[0 ,eta_opt_index,0])
    # Every 500 episodes, move the rollout threshold to the current maximiser.
    if(episode%500 == 0):
        eta_index = np.argmax(H[:]*q + Q_cvar[0, :, 0])
        eta = H[eta_index]
        eta_hist.append(eta_index)
        # NOTE(review): at episode 0 the new index is discarded in favour of
        # 100, eta_hist receives both values, and eta keeps the discarded
        # index's threshold - confirm this is intended.
        if episode == 0:
            eta_index = 100
            eta_hist.append(eta_index)


# In[11]:


# Adam hyper-parameters read by update_Q_cvar_adam.
beta1, beta2, epsilon_adam = 0.9, 0.999, 1e-8

# First/second-moment buffers, indexed like Q_cvar: (state, threshold index, action).
m_Q = np.zeros(( 6, len(H), 2))
v_Q = np.zeros(( 6, len(H), 2))
# NOTE(review): m_g / v_g are allocated but not referenced in the visible code.
m_g = np.zeros(( 6, len(H), 2))
v_g = np.zeros(( 6, len(H), 2))

t_adam = 1  # time step used in Adam's bias-correction terms
alpha_theta_initial = 0.01 # Learning rate
epsilon_initial = 0.1 
seed_value = 1
# Index into H of the threshold used when rolling out episodes.
eta_index = 100
# Seed both generators: rewards come from np.random, action choices from random.
np.random.seed(seed_value)
random.seed(seed_value)
# States 0-2 (the ones update_Q_cvar_adam modifies) start at 5.0 everywhere.
Q_cvar = np.zeros(( 6, len(H), 2))
Q_cvar[:3,:,:] = 5.0
# States 3-5 have no outgoing transition in next_state; their value is -h for
# the thresholds H[100:] and stays 0 for the lower indices.
for j, h in enumerate(H[100:]):
    i = j + 100
    Q_cvar[3:,i,:] = -h           
Q_cvar_old = Q_cvar.copy()
Q_cvar_norm_hist = []
eta = H[eta_index] 
# Per-run logs, emptied at the start of every run.
Q_cvar_hist = []
Q_cvar_hist2 = []
Q_cvar_hist3 = []
eta_hist = []
prop = []
M0_hist = []
cvar_hist = []
# Training loop: every episode collects `batch` epsilon-greedy rollouts and then
# applies one update to Q_cvar.  Every 500 episodes the greedy policy is
# evaluated first and the rollout threshold index eta_index is refreshed after.
for episode in range(num_episodes):
    # Learning rate and exploration rate both shrink by decay_rate every 100 episodes.
    alpha_theta = alpha_theta_initial * (decay_rate ** (episode//100))
    epsilon = epsilon_initial * (decay_rate ** (episode//100))
    trajectory_batch = [] 
    for b in range(batch):
        trajectory = []
        total_reward = 0
        # State 0: single action 0 with a N(mu[0], 1) reward.
        state = 0 
        action = 0
        reward = np.random.normal(mu[0], 1)
        # Steps are stored as [reward accumulated before the step, state, action, reward].
        trajectory.append([total_reward, state, action, reward])
        total_reward += reward
        state = 1
        # Shift the threshold index down by the reward, in grid units of 0.1.
        eta_t_idx =  np.clip(eta_index - int(10 * round(reward,1)), 0, len(H) - 1) 
        action = epsilon_greedy(state, eta_t_idx, epsilon)
        # State 1: action 0 -> noisy reward and state 2; action 1 -> fixed reward mu[2] and state 3.
        if action == 0:
            reward = np.random.normal(mu[1], 1)
            state = 2
        else:
            reward = mu[2]
            state = 3
        trajectory.append([total_reward, 1, action, reward])
        total_reward += reward
        # Action 1 ends the rollout after two steps.
        if action == 1:
            trajectory_batch.append(trajectory)
            continue
        eta_t_idx =  np.clip(eta_t_idx - int(10 * round(reward,1)), 0, len(H) -1) 
        action = epsilon_greedy(state, eta_t_idx, epsilon)
        # State 2: action 0 -> noisy reward and state 4; action 1 -> fixed reward mu[4] and state 5.
        if action == 0:
            reward = np.random.normal(mu[3], 1)
            state = 4
        else:
            reward = mu[4]
            state = 5
        trajectory.append([total_reward, 2, action, reward])
        trajectory_batch.append(trajectory)
    # Periodic evaluation: roll out the greedy policy (epsilon = 0) test_num times.
    if(episode%500 == 0):
        test_num = 100000
        reward_cvar = np.zeros(test_num)
        r1_cvar = np.zeros(test_num)
        r2_cvar = np.zeros(test_num)
        r3_cvar = np.zeros(test_num)
        # 5 is outside the action set {0, 1}; presumably a "not reached" marker.
        act2_cvar = np.zeros(test_num) + 5
        act3_cvar = np.zeros(test_num) + 5
        for c in range(test_num):
            total_reward = 0
            state = 0 
            reward = np.random.normal(mu[state], 1)
            r1_cvar[c] = reward
            total_reward += r1_cvar[c]
            state = 1
            eta_t_idx =  np.clip(eta_index - int(10 * round(reward,1)), 0, len(H) -1) 
            action = epsilon_greedy(state, eta_t_idx, 0.0)
            if action == 0:
                reward = np.random.normal(mu[1], 1)
                state = 2
            else:
                reward = mu[2]
                state = 3
            act2_cvar[c] = action
            r2_cvar[c] = reward
            total_reward += reward
            if action == 1:
                reward_cvar[c] = total_reward
                continue
            eta_t_idx =  np.clip(eta_t_idx - int(10 * round(reward,1)), 0, len(H) -1) 
            action = epsilon_greedy(state, eta_t_idx, 0.0)
            if action == 0:
                reward = np.random.normal(mu[3], 1)
                state = 4
            else:
                reward = mu[4]
                state = 5
                # NOTE(review): state is not read again before it is reset for
                # the next rollout, so this assignment has no effect - confirm
                # whether it is a leftover.
                state = 1
            act3_cvar[c] = action
            r3_cvar[c] = reward
            total_reward += reward
            reward_cvar[c] = total_reward
        # Empirical q-quantile of the returns and the mean of the returns at or below it.
        var_q = np.percentile(reward_cvar, q * 100)
        cvar_q = np.mean(reward_cvar[reward_cvar<= var_q])
        cvar_hist.append(cvar_q)   
    update_Q_cvar_adam(alpha_theta, trajectory_batch)
    # Log the objective H*q + Q_cvar[0, :, 0] at its maximiser and at eta_opt_index.
    eta_idx = np.argmax(H[:]*q + Q_cvar[0, :, 0])
    Q_cvar_hist3.append(H[eta_idx]*q + Q_cvar[0 ,eta_idx,0])
    Q_cvar_hist.append(H[eta_opt_index]*q + Q_cvar[0 ,eta_opt_index,0])
    # Every 500 episodes, move the rollout threshold to the current maximiser.
    if(episode%500 == 0):
        eta_index = np.argmax(H[:]*q + Q_cvar[0, :, 0])
        eta = H[eta_index]
        eta_hist.append(eta_index)
        # NOTE(review): at episode 0 the new index is discarded in favour of
        # 100, eta_hist receives both values, and eta keeps the discarded
        # index's threshold - confirm this is intended.
        if episode == 0:
            eta_index = 100
            eta_hist.append(eta_index)


# In[14]:


# Adam hyper-parameters read by update_Q_cvar_adam.
beta1, beta2, epsilon_adam = 0.9, 0.999, 1e-8

# First/second-moment buffers, indexed like Q_cvar: (state, threshold index, action).
m_Q = np.zeros(( 6, len(H), 2))
v_Q = np.zeros(( 6, len(H), 2))
# NOTE(review): m_g / v_g are allocated but not referenced in the visible code.
m_g = np.zeros(( 6, len(H), 2))
v_g = np.zeros(( 6, len(H), 2))

t_adam = 1  # time step used in Adam's bias-correction terms
alpha_theta_initial = 0.01 # Learning rate
epsilon_initial = 0.1 
seed_value = 2
# Index into H of the threshold used when rolling out episodes.
eta_index = 100
# Seed both generators: rewards come from np.random, action choices from random.
np.random.seed(seed_value)
random.seed(seed_value)
# States 0-2 (the ones update_Q_cvar_adam modifies) start at 5.0 everywhere.
Q_cvar = np.zeros(( 6, len(H), 2))
Q_cvar[:3,:,:] = 5.0
# States 3-5 have no outgoing transition in next_state; their value is -h for
# the thresholds H[100:] and stays 0 for the lower indices.
for j, h in enumerate(H[100:]):
    i = j + 100
    Q_cvar[3:,i,:] = -h           
Q_cvar_old = Q_cvar.copy()
Q_cvar_norm_hist = []
eta = H[eta_index] 
# Per-run logs, emptied at the start of every run.
Q_cvar_hist = []
Q_cvar_hist2 = []
Q_cvar_hist3 = []
eta_hist = []
prop = []
M0_hist = []
cvar_hist = []
# Training loop: every episode collects `batch` epsilon-greedy rollouts and then
# applies one update to Q_cvar.  Every 500 episodes the greedy policy is
# evaluated first and the rollout threshold index eta_index is refreshed after.
for episode in range(num_episodes):
    # Learning rate and exploration rate both shrink by decay_rate every 100 episodes.
    alpha_theta = alpha_theta_initial * (decay_rate ** (episode//100))
    epsilon = epsilon_initial * (decay_rate ** (episode//100))
    trajectory_batch = [] 
    for b in range(batch):
        trajectory = []
        total_reward = 0
        # State 0: single action 0 with a N(mu[0], 1) reward.
        state = 0 
        action = 0
        reward = np.random.normal(mu[0], 1)
        # Steps are stored as [reward accumulated before the step, state, action, reward].
        trajectory.append([total_reward, state, action, reward])
        total_reward += reward
        state = 1
        # Shift the threshold index down by the reward, in grid units of 0.1.
        eta_t_idx =  np.clip(eta_index - int(10 * round(reward,1)), 0, len(H) - 1) 
        action = epsilon_greedy(state, eta_t_idx, epsilon)
        # State 1: action 0 -> noisy reward and state 2; action 1 -> fixed reward mu[2] and state 3.
        if action == 0:
            reward = np.random.normal(mu[1], 1)
            state = 2
        else:
            reward = mu[2]
            state = 3
        trajectory.append([total_reward, 1, action, reward])
        total_reward += reward
        # Action 1 ends the rollout after two steps.
        if action == 1:
            trajectory_batch.append(trajectory)
            continue
        eta_t_idx =  np.clip(eta_t_idx - int(10 * round(reward,1)), 0, len(H) -1) 
        action = epsilon_greedy(state, eta_t_idx, epsilon)
        # State 2: action 0 -> noisy reward and state 4; action 1 -> fixed reward mu[4] and state 5.
        if action == 0:
            reward = np.random.normal(mu[3], 1)
            state = 4
        else:
            reward = mu[4]
            state = 5
        trajectory.append([total_reward, 2, action, reward])
        trajectory_batch.append(trajectory)
    # Periodic evaluation: roll out the greedy policy (epsilon = 0) test_num times.
    if(episode%500 == 0):
        test_num = 100000
        reward_cvar = np.zeros(test_num)
        r1_cvar = np.zeros(test_num)
        r2_cvar = np.zeros(test_num)
        r3_cvar = np.zeros(test_num)
        # 5 is outside the action set {0, 1}; presumably a "not reached" marker.
        act2_cvar = np.zeros(test_num) + 5
        act3_cvar = np.zeros(test_num) + 5
        for c in range(test_num):
            total_reward = 0
            state = 0 
            reward = np.random.normal(mu[state], 1)
            r1_cvar[c] = reward
            total_reward += r1_cvar[c]
            state = 1
            eta_t_idx =  np.clip(eta_index - int(10 * round(reward,1)), 0, len(H) -1) 
            action = epsilon_greedy(state, eta_t_idx, 0.0)
            if action == 0:
                reward = np.random.normal(mu[1], 1)
                state = 2
            else:
                reward = mu[2]
                state = 3
            act2_cvar[c] = action
            r2_cvar[c] = reward
            total_reward += reward
            if action == 1:
                reward_cvar[c] = total_reward
                continue
            eta_t_idx =  np.clip(eta_t_idx - int(10 * round(reward,1)), 0, len(H) -1) 
            action = epsilon_greedy(state, eta_t_idx, 0.0)
            if action == 0:
                reward = np.random.normal(mu[3], 1)
                state = 4
            else:
                reward = mu[4]
                state = 5
                # NOTE(review): state is not read again before it is reset for
                # the next rollout, so this assignment has no effect - confirm
                # whether it is a leftover.
                state = 1
            act3_cvar[c] = action
            r3_cvar[c] = reward
            total_reward += reward
            reward_cvar[c] = total_reward
        # Empirical q-quantile of the returns and the mean of the returns at or below it.
        var_q = np.percentile(reward_cvar, q * 100)
        cvar_q = np.mean(reward_cvar[reward_cvar<= var_q])
        cvar_hist.append(cvar_q)   
    update_Q_cvar_adam(alpha_theta, trajectory_batch)
    # Log the objective H*q + Q_cvar[0, :, 0] at its maximiser and at eta_opt_index.
    eta_idx = np.argmax(H[:]*q + Q_cvar[0, :, 0])
    Q_cvar_hist3.append(H[eta_idx]*q + Q_cvar[0 ,eta_idx,0])
    Q_cvar_hist.append(H[eta_opt_index]*q + Q_cvar[0 ,eta_opt_index,0])
    # Every 500 episodes, move the rollout threshold to the current maximiser.
    if(episode%500 == 0):
        eta_index = np.argmax(H[:]*q + Q_cvar[0, :, 0])
        eta = H[eta_index]
        eta_hist.append(eta_index)
        # NOTE(review): at episode 0 the new index is discarded in favour of
        # 100, eta_hist receives both values, and eta keeps the discarded
        # index's threshold - confirm this is intended.
        if episode == 0:
            eta_index = 100
            eta_hist.append(eta_index)


# In[17]:


# Adam hyper-parameters read by update_Q_cvar_adam.
beta1, beta2, epsilon_adam = 0.9, 0.999, 1e-8

# First/second-moment buffers, indexed like Q_cvar: (state, threshold index, action).
m_Q = np.zeros(( 6, len(H), 2))
v_Q = np.zeros(( 6, len(H), 2))
# NOTE(review): m_g / v_g are allocated but not referenced in the visible code.
m_g = np.zeros(( 6, len(H), 2))
v_g = np.zeros(( 6, len(H), 2))

t_adam = 1  # time step used in Adam's bias-correction terms
alpha_theta_initial = 0.01 # Learning rate
epsilon_initial = 0.1 
seed_value = 3
# Index into H of the threshold used when rolling out episodes.
eta_index = 100
# Seed both generators: rewards come from np.random, action choices from random.
np.random.seed(seed_value)
random.seed(seed_value)
# States 0-2 (the ones update_Q_cvar_adam modifies) start at 5.0 everywhere.
Q_cvar = np.zeros(( 6, len(H), 2))
Q_cvar[:3,:,:] = 5.0
# States 3-5 have no outgoing transition in next_state; their value is -h for
# the thresholds H[100:] and stays 0 for the lower indices.
for j, h in enumerate(H[100:]):
    i = j + 100
    Q_cvar[3:,i,:] = -h           
Q_cvar_old = Q_cvar.copy()
Q_cvar_norm_hist = []
eta = H[eta_index] 
# Per-run logs, emptied at the start of every run.
Q_cvar_hist = []
Q_cvar_hist2 = []
Q_cvar_hist3 = []
eta_hist = []
prop = []
M0_hist = []
cvar_hist = []
# Training loop: every episode collects `batch` epsilon-greedy rollouts and then
# applies one update to Q_cvar.  Every 500 episodes the greedy policy is
# evaluated first and the rollout threshold index eta_index is refreshed after.
for episode in range(num_episodes):
    # Learning rate and exploration rate both shrink by decay_rate every 100 episodes.
    alpha_theta = alpha_theta_initial * (decay_rate ** (episode//100))
    epsilon = epsilon_initial * (decay_rate ** (episode//100))
    trajectory_batch = [] 
    for b in range(batch):
        trajectory = []
        total_reward = 0
        # State 0: single action 0 with a N(mu[0], 1) reward.
        state = 0 
        action = 0
        reward = np.random.normal(mu[0], 1)
        # Steps are stored as [reward accumulated before the step, state, action, reward].
        trajectory.append([total_reward, state, action, reward])
        total_reward += reward
        state = 1
        # Shift the threshold index down by the reward, in grid units of 0.1.
        eta_t_idx =  np.clip(eta_index - int(10 * round(reward,1)), 0, len(H) - 1) 
        action = epsilon_greedy(state, eta_t_idx, epsilon)
        # State 1: action 0 -> noisy reward and state 2; action 1 -> fixed reward mu[2] and state 3.
        if action == 0:
            reward = np.random.normal(mu[1], 1)
            state = 2
        else:
            reward = mu[2]
            state = 3
        trajectory.append([total_reward, 1, action, reward])
        total_reward += reward
        # Action 1 ends the rollout after two steps.
        if action == 1:
            trajectory_batch.append(trajectory)
            continue
        eta_t_idx =  np.clip(eta_t_idx - int(10 * round(reward,1)), 0, len(H) -1) 
        action = epsilon_greedy(state, eta_t_idx, epsilon)
        # State 2: action 0 -> noisy reward and state 4; action 1 -> fixed reward mu[4] and state 5.
        if action == 0:
            reward = np.random.normal(mu[3], 1)
            state = 4
        else:
            reward = mu[4]
            state = 5
        trajectory.append([total_reward, 2, action, reward])
        trajectory_batch.append(trajectory)
    # Periodic evaluation: roll out the greedy policy (epsilon = 0) test_num times.
    if(episode%500 == 0):
        test_num = 100000
        reward_cvar = np.zeros(test_num)
        r1_cvar = np.zeros(test_num)
        r2_cvar = np.zeros(test_num)
        r3_cvar = np.zeros(test_num)
        # 5 is outside the action set {0, 1}; presumably a "not reached" marker.
        act2_cvar = np.zeros(test_num) + 5
        act3_cvar = np.zeros(test_num) + 5
        for c in range(test_num):
            total_reward = 0
            state = 0 
            reward = np.random.normal(mu[state], 1)
            r1_cvar[c] = reward
            total_reward += r1_cvar[c]
            state = 1
            eta_t_idx =  np.clip(eta_index - int(10 * round(reward,1)), 0, len(H) -1) 
            action = epsilon_greedy(state, eta_t_idx, 0.0)
            if action == 0:
                reward = np.random.normal(mu[1], 1)
                state = 2
            else:
                reward = mu[2]
                state = 3
            act2_cvar[c] = action
            r2_cvar[c] = reward
            total_reward += reward
            if action == 1:
                reward_cvar[c] = total_reward
                continue
            eta_t_idx =  np.clip(eta_t_idx - int(10 * round(reward,1)), 0, len(H) -1) 
            action = epsilon_greedy(state, eta_t_idx, 0.0)
            if action == 0:
                reward = np.random.normal(mu[3], 1)
                state = 4
            else:
                reward = mu[4]
                state = 5
                # NOTE(review): state is not read again before it is reset for
                # the next rollout, so this assignment has no effect - confirm
                # whether it is a leftover.
                state = 1
            act3_cvar[c] = action
            r3_cvar[c] = reward
            total_reward += reward
            reward_cvar[c] = total_reward
        # Empirical q-quantile of the returns and the mean of the returns at or below it.
        var_q = np.percentile(reward_cvar, q * 100)
        cvar_q = np.mean(reward_cvar[reward_cvar<= var_q])
        cvar_hist.append(cvar_q)   
    update_Q_cvar_adam(alpha_theta, trajectory_batch)
    # Log the objective H*q + Q_cvar[0, :, 0] at its maximiser and at eta_opt_index.
    eta_idx = np.argmax(H[:]*q + Q_cvar[0, :, 0])
    Q_cvar_hist3.append(H[eta_idx]*q + Q_cvar[0 ,eta_idx,0])
    Q_cvar_hist.append(H[eta_opt_index]*q + Q_cvar[0 ,eta_opt_index,0])
    # Every 500 episodes, move the rollout threshold to the current maximiser.
    if(episode%500 == 0):
        eta_index = np.argmax(H[:]*q + Q_cvar[0, :, 0])
        eta = H[eta_index]
        eta_hist.append(eta_index)
        # NOTE(review): at episode 0 the new index is discarded in favour of
        # 100, eta_hist receives both values, and eta keeps the discarded
        # index's threshold - confirm this is intended.
        if episode == 0:
            eta_index = 100
            eta_hist.append(eta_index)


# In[20]:


# Adam hyper-parameters read by update_Q_cvar_adam.
beta1, beta2, epsilon_adam = 0.9, 0.999, 1e-8

# First/second-moment buffers, indexed like Q_cvar: (state, threshold index, action).
m_Q = np.zeros(( 6, len(H), 2))
v_Q = np.zeros(( 6, len(H), 2))
# NOTE(review): m_g / v_g are allocated but not referenced in the visible code.
m_g = np.zeros(( 6, len(H), 2))
v_g = np.zeros(( 6, len(H), 2))

t_adam = 1  # time step used in Adam's bias-correction terms
alpha_theta_initial = 0.01 # Learning rate
epsilon_initial = 0.1 
seed_value = 4
# Index into H of the threshold used when rolling out episodes.
eta_index = 100
# Seed both generators: rewards come from np.random, action choices from random.
np.random.seed(seed_value)
random.seed(seed_value)
# States 0-2 (the ones update_Q_cvar_adam modifies) start at 5.0 everywhere.
Q_cvar = np.zeros(( 6, len(H), 2))
Q_cvar[:3,:,:] = 5.0
# States 3-5 have no outgoing transition in next_state; their value is -h for
# the thresholds H[100:] and stays 0 for the lower indices.
for j, h in enumerate(H[100:]):
    i = j + 100
    Q_cvar[3:,i,:] = -h           
Q_cvar_old = Q_cvar.copy()
Q_cvar_norm_hist = []
eta = H[eta_index] 
# Per-run logs, emptied at the start of every run.
Q_cvar_hist = []
Q_cvar_hist2 = []
Q_cvar_hist3 = []
eta_hist = []
prop = []
M0_hist = []
cvar_hist = []
# Training loop: every episode collects `batch` epsilon-greedy rollouts and then
# applies one update to Q_cvar.  Every 500 episodes the greedy policy is
# evaluated first and the rollout threshold index eta_index is refreshed after.
for episode in range(num_episodes):
    # Learning rate and exploration rate both shrink by decay_rate every 100 episodes.
    alpha_theta = alpha_theta_initial * (decay_rate ** (episode//100))
    epsilon = epsilon_initial * (decay_rate ** (episode//100))
    trajectory_batch = [] 
    for b in range(batch):
        trajectory = []
        total_reward = 0
        # State 0: single action 0 with a N(mu[0], 1) reward.
        state = 0 
        action = 0
        reward = np.random.normal(mu[0], 1)
        # Steps are stored as [reward accumulated before the step, state, action, reward].
        trajectory.append([total_reward, state, action, reward])
        total_reward += reward
        state = 1
        # Shift the threshold index down by the reward, in grid units of 0.1.
        eta_t_idx =  np.clip(eta_index - int(10 * round(reward,1)), 0, len(H) - 1) 
        action = epsilon_greedy(state, eta_t_idx, epsilon)
        # State 1: action 0 -> noisy reward and state 2; action 1 -> fixed reward mu[2] and state 3.
        if action == 0:
            reward = np.random.normal(mu[1], 1)
            state = 2
        else:
            reward = mu[2]
            state = 3
        trajectory.append([total_reward, 1, action, reward])
        total_reward += reward
        # Action 1 ends the rollout after two steps.
        if action == 1:
            trajectory_batch.append(trajectory)
            continue
        eta_t_idx =  np.clip(eta_t_idx - int(10 * round(reward,1)), 0, len(H) -1) 
        action = epsilon_greedy(state, eta_t_idx, epsilon)
        # State 2: action 0 -> noisy reward and state 4; action 1 -> fixed reward mu[4] and state 5.
        if action == 0:
            reward = np.random.normal(mu[3], 1)
            state = 4
        else:
            reward = mu[4]
            state = 5
        trajectory.append([total_reward, 2, action, reward])
        trajectory_batch.append(trajectory)
    # Periodic evaluation: roll out the greedy policy (epsilon = 0) test_num times.
    if(episode%500 == 0):
        test_num = 100000
        reward_cvar = np.zeros(test_num)
        r1_cvar = np.zeros(test_num)
        r2_cvar = np.zeros(test_num)
        r3_cvar = np.zeros(test_num)
        # 5 is outside the action set {0, 1}; presumably a "not reached" marker.
        act2_cvar = np.zeros(test_num) + 5
        act3_cvar = np.zeros(test_num) + 5
        for c in range(test_num):
            total_reward = 0
            state = 0 
            reward = np.random.normal(mu[state], 1)
            r1_cvar[c] = reward
            total_reward += r1_cvar[c]
            state = 1
            eta_t_idx =  np.clip(eta_index - int(10 * round(reward,1)), 0, len(H) -1) 
            action = epsilon_greedy(state, eta_t_idx, 0.0)
            if action == 0:
                reward = np.random.normal(mu[1], 1)
                state = 2
            else:
                reward = mu[2]
                state = 3
            act2_cvar[c] = action
            r2_cvar[c] = reward
            total_reward += reward
            if action == 1:
                reward_cvar[c] = total_reward
                continue
            eta_t_idx =  np.clip(eta_t_idx - int(10 * round(reward,1)), 0, len(H) -1) 
            action = epsilon_greedy(state, eta_t_idx, 0.0)
            if action == 0:
                reward = np.random.normal(mu[3], 1)
                state = 4
            else:
                reward = mu[4]
                state = 5
                # NOTE(review): state is not read again before it is reset for
                # the next rollout, so this assignment has no effect - confirm
                # whether it is a leftover.
                state = 1
            act3_cvar[c] = action
            r3_cvar[c] = reward
            total_reward += reward
            reward_cvar[c] = total_reward
        # Empirical q-quantile of the returns and the mean of the returns at or below it.
        var_q = np.percentile(reward_cvar, q * 100)
        cvar_q = np.mean(reward_cvar[reward_cvar<= var_q])
        cvar_hist.append(cvar_q)   
    update_Q_cvar_adam(alpha_theta, trajectory_batch)
    # Log the objective H*q + Q_cvar[0, :, 0] at its maximiser and at eta_opt_index.
    eta_idx = np.argmax(H[:]*q + Q_cvar[0, :, 0])
    Q_cvar_hist3.append(H[eta_idx]*q + Q_cvar[0 ,eta_idx,0])
    Q_cvar_hist.append(H[eta_opt_index]*q + Q_cvar[0 ,eta_opt_index,0])
    # Every 500 episodes, move the rollout threshold to the current maximiser.
    if(episode%500 == 0):
        eta_index = np.argmax(H[:]*q + Q_cvar[0, :, 0])
        eta = H[eta_index]
        eta_hist.append(eta_index)
        # NOTE(review): at episode 0 the new index is discarded in favour of
        # 100, eta_hist receives both values, and eta keeps the discarded
        # index's threshold - confirm this is intended.
        if episode == 0:
            eta_index = 100
            eta_hist.append(eta_index)


# In[23]:


# Adam hyper-parameters read by update_Q_cvar_adam.
beta1, beta2, epsilon_adam = 0.9, 0.999, 1e-8

# First/second-moment buffers, indexed like Q_cvar: (state, threshold index, action).
m_Q = np.zeros(( 6, len(H), 2))
v_Q = np.zeros(( 6, len(H), 2))
# NOTE(review): m_g / v_g are allocated but not referenced in the visible code.
m_g = np.zeros(( 6, len(H), 2))
v_g = np.zeros(( 6, len(H), 2))

t_adam = 1  # time step used in Adam's bias-correction terms
alpha_theta_initial = 0.01 # Learning rate
epsilon_initial = 0.1 
seed_value = 5
# Index into H of the threshold used when rolling out episodes.
eta_index = 100
# Seed both generators: rewards come from np.random, action choices from random.
np.random.seed(seed_value)
random.seed(seed_value)
# States 0-2 (the ones update_Q_cvar_adam modifies) start at 5.0 everywhere.
Q_cvar = np.zeros(( 6, len(H), 2))
Q_cvar[:3,:,:] = 5.0
# States 3-5 have no outgoing transition in next_state; their value is -h for
# the thresholds H[100:] and stays 0 for the lower indices.
for j, h in enumerate(H[100:]):
    i = j + 100
    Q_cvar[3:,i,:] = -h           
Q_cvar_old = Q_cvar.copy()
Q_cvar_norm_hist = []
eta = H[eta_index] 
# Per-run logs, emptied at the start of every run.
Q_cvar_hist = []
Q_cvar_hist2 = []
Q_cvar_hist3 = []
eta_hist = []
prop = []
M0_hist = []
cvar_hist = []
# Training loop: every episode collects `batch` epsilon-greedy rollouts and then
# applies one update to Q_cvar.  Every 500 episodes the greedy policy is
# evaluated first and the rollout threshold index eta_index is refreshed after.
for episode in range(num_episodes):
    # Learning rate and exploration rate both shrink by decay_rate every 100 episodes.
    alpha_theta = alpha_theta_initial * (decay_rate ** (episode//100))
    epsilon = epsilon_initial * (decay_rate ** (episode//100))
    trajectory_batch = [] 
    for b in range(batch):
        trajectory = []
        total_reward = 0
        # State 0: single action 0 with a N(mu[0], 1) reward.
        state = 0 
        action = 0
        reward = np.random.normal(mu[0], 1)
        # Steps are stored as [reward accumulated before the step, state, action, reward].
        trajectory.append([total_reward, state, action, reward])
        total_reward += reward
        state = 1
        # Shift the threshold index down by the reward, in grid units of 0.1.
        eta_t_idx =  np.clip(eta_index - int(10 * round(reward,1)), 0, len(H) - 1) 
        action = epsilon_greedy(state, eta_t_idx, epsilon)
        # State 1: action 0 -> noisy reward and state 2; action 1 -> fixed reward mu[2] and state 3.
        if action == 0:
            reward = np.random.normal(mu[1], 1)
            state = 2
        else:
            reward = mu[2]
            state = 3
        trajectory.append([total_reward, 1, action, reward])
        total_reward += reward
        # Action 1 ends the rollout after two steps.
        if action == 1:
            trajectory_batch.append(trajectory)
            continue
        eta_t_idx =  np.clip(eta_t_idx - int(10 * round(reward,1)), 0, len(H) -1) 
        action = epsilon_greedy(state, eta_t_idx, epsilon)
        # State 2: action 0 -> noisy reward and state 4; action 1 -> fixed reward mu[4] and state 5.
        if action == 0:
            reward = np.random.normal(mu[3], 1)
            state = 4
        else:
            reward = mu[4]
            state = 5
        trajectory.append([total_reward, 2, action, reward])
        trajectory_batch.append(trajectory)
    # Periodic evaluation: roll out the greedy policy (epsilon = 0) test_num times.
    if(episode%500 == 0):
        test_num = 100000
        reward_cvar = np.zeros(test_num)
        r1_cvar = np.zeros(test_num)
        r2_cvar = np.zeros(test_num)
        r3_cvar = np.zeros(test_num)
        # 5 is outside the action set {0, 1}; presumably a "not reached" marker.
        act2_cvar = np.zeros(test_num) + 5
        act3_cvar = np.zeros(test_num) + 5
        for c in range(test_num):
            total_reward = 0
            state = 0 
            reward = np.random.normal(mu[state], 1)
            r1_cvar[c] = reward
            total_reward += r1_cvar[c]
            state = 1
            eta_t_idx =  np.clip(eta_index - int(10 * round(reward,1)), 0, len(H) -1) 
            action = epsilon_greedy(state, eta_t_idx, 0.0)
            if action == 0:
                reward = np.random.normal(mu[1], 1)
                state = 2
            else:
                reward = mu[2]
                state = 3
            act2_cvar[c] = action
            r2_cvar[c] = reward
            total_reward += reward
            if action == 1:
                reward_cvar[c] = total_reward
                continue
            eta_t_idx =  np.clip(eta_t_idx - int(10 * round(reward,1)), 0, len(H) -1) 
            action = epsilon_greedy(state, eta_t_idx, 0.0)
            if action == 0:
                reward = np.random.normal(mu[3], 1)
                state = 4
            else:
                reward = mu[4]
                state = 5
                # NOTE(review): state is not read again before it is reset for
                # the next rollout, so this assignment has no effect - confirm
                # whether it is a leftover.
                state = 1
            act3_cvar[c] = action
            r3_cvar[c] = reward
            total_reward += reward
            reward_cvar[c] = total_reward
        # Empirical q-quantile of the returns and the mean of the returns at or below it.
        var_q = np.percentile(reward_cvar, q * 100)
        cvar_q = np.mean(reward_cvar[reward_cvar<= var_q])
        cvar_hist.append(cvar_q)   
    update_Q_cvar_adam(alpha_theta, trajectory_batch)
    # Log the objective H*q + Q_cvar[0, :, 0] at its maximiser and at eta_opt_index.
    eta_idx = np.argmax(H[:]*q + Q_cvar[0, :, 0])
    Q_cvar_hist3.append(H[eta_idx]*q + Q_cvar[0 ,eta_idx,0])
    Q_cvar_hist.append(H[eta_opt_index]*q + Q_cvar[0 ,eta_opt_index,0])
    # Every 500 episodes, move the rollout threshold to the current maximiser.
    if(episode%500 == 0):
        eta_index = np.argmax(H[:]*q + Q_cvar[0, :, 0])
        eta = H[eta_index]
        eta_hist.append(eta_index)
        # NOTE(review): at episode 0 the new index is discarded in favour of
        # 100, eta_hist receives both values, and eta keeps the discarded
        # index's threshold - confirm this is intended.
        if episode == 0:
            eta_index = 100
            eta_hist.append(eta_index)


# In[26]:


# Adam hyper-parameters read by update_Q_cvar_adam.
beta1, beta2, epsilon_adam = 0.9, 0.999, 1e-8

# First/second-moment buffers, indexed like Q_cvar: (state, threshold index, action).
m_Q = np.zeros(( 6, len(H), 2))
v_Q = np.zeros(( 6, len(H), 2))
# NOTE(review): m_g / v_g are allocated but not referenced in the visible code.
m_g = np.zeros(( 6, len(H), 2))
v_g = np.zeros(( 6, len(H), 2))

t_adam = 1  # time step used in Adam's bias-correction terms
alpha_theta_initial = 0.01 # Learning rate
epsilon_initial = 0.1 
seed_value = 6
# Index into H of the threshold used when rolling out episodes.
eta_index = 100
# Seed both generators: rewards come from np.random, action choices from random.
np.random.seed(seed_value)
random.seed(seed_value)
# States 0-2 (the ones update_Q_cvar_adam modifies) start at 5.0 everywhere.
Q_cvar = np.zeros(( 6, len(H), 2))
Q_cvar[:3,:,:] = 5.0
# States 3-5 have no outgoing transition in next_state; their value is -h for
# the thresholds H[100:] and stays 0 for the lower indices.
for j, h in enumerate(H[100:]):
    i = j + 100
    Q_cvar[3:,i,:] = -h           
Q_cvar_old = Q_cvar.copy()
Q_cvar_norm_hist = []
eta = H[eta_index] 
# Per-run logs, emptied at the start of every run.
Q_cvar_hist = []
Q_cvar_hist2 = []
Q_cvar_hist3 = []
eta_hist = []
prop = []
M0_hist = []
cvar_hist = []
for episode in range(num_episodes):
    alpha_theta = alpha_theta_initial * (decay_rate ** (episode//100))
    epsilon = epsilon_initial * (decay_rate ** (episode//100))
    trajectory_batch = [] 
    for b in range(batch):
        trajectory = []
        total_reward = 0
        state = 0 
        action = 0
        reward = np.random.normal(mu[0], 1)
        trajectory.append([total_reward, state, action, reward])
        total_reward += reward
        state = 1
        eta_t_idx =  np.clip(eta_index - int(10 * round(reward,1)), 0, len(H) - 1) 
        action = epsilon_greedy(state, eta_t_idx, epsilon)
        if action == 0:
            reward = np.random.normal(mu[1], 1)
            state = 2
        else:
            reward = mu[2]
            state = 3
        trajectory.append([total_reward, 1, action, reward])
        total_reward += reward
        if action == 1:
            trajectory_batch.append(trajectory)
            continue
        eta_t_idx =  np.clip(eta_t_idx - int(10 * round(reward,1)), 0, len(H) -1) 
        action = epsilon_greedy(state, eta_t_idx, epsilon)
        if action == 0:
            reward = np.random.normal(mu[3], 1)
            state = 4
        else:
            reward = mu[4]
            state = 5
        trajectory.append([total_reward, 2, action, reward])
        trajectory_batch.append(trajectory)
    if(episode%500 == 0):
        test_num = 100000
        reward_cvar = np.zeros(test_num)
        r1_cvar = np.zeros(test_num)
        r2_cvar = np.zeros(test_num)
        r3_cvar = np.zeros(test_num)
        act2_cvar = np.zeros(test_num) + 5
        act3_cvar = np.zeros(test_num) + 5
        for c in range(test_num):
            total_reward = 0
            state = 0 
            reward = np.random.normal(mu[state], 1)
            r1_cvar[c] = reward
            total_reward += r1_cvar[c]
            state = 1
            eta_t_idx =  np.clip(eta_index - int(10 * round(reward,1)), 0, len(H) -1) 
            action = epsilon_greedy(state, eta_t_idx, 0.0)
            if action == 0:
                reward = np.random.normal(mu[1], 1)
                state = 2
            else:
                reward = mu[2]
                state = 3
            act2_cvar[c] = action
            r2_cvar[c] = reward
            total_reward += reward
            if action == 1:
                reward_cvar[c] = total_reward
                continue
            eta_t_idx =  np.clip(eta_t_idx - int(10 * round(reward,1)), 0, len(H) -1) 
            action = epsilon_greedy(state, eta_t_idx, 0.0)
            if action == 0:
                reward = np.random.normal(mu[3], 1)
                state = 4
            else:
                reward = mu[4]
                state = 5
                state = 1
            act3_cvar[c] = action
            r3_cvar[c] = reward
            total_reward += reward
            reward_cvar[c] = total_reward
        var_q = np.percentile(reward_cvar, q * 100)
        cvar_q = np.mean(reward_cvar[reward_cvar<= var_q])
        cvar_hist.append(cvar_q)   
    update_Q_cvar_adam(alpha_theta, trajectory_batch)
    eta_idx = np.argmax(H[:]*q + Q_cvar[0, :, 0])
    Q_cvar_hist3.append(H[eta_idx]*q + Q_cvar[0 ,eta_idx,0])
    Q_cvar_hist.append(H[eta_opt_index]*q + Q_cvar[0 ,eta_opt_index,0])
    if(episode%500 == 0):
        eta_index = np.argmax(H[:]*q + Q_cvar[0, :, 0])
        eta = H[eta_index]
        eta_hist.append(eta_index)
        if episode == 0:
            eta_index = 100
            eta_hist.append(eta_index)


# In[29]:


# Training run for seed 7. Resets the Adam buffers and the Q table, then for
# each episode samples `batch` epsilon-greedy rollouts, applies one
# update_Q_cvar_adam step and logs the objective H*q + Q_cvar[0, :, 0].
# Every 500 episodes the greedy policy is also evaluated by Monte Carlo.
beta1, beta2, epsilon_adam = 0.9, 0.999, 1e-8

# Adam moment buffers; m_g / v_g are allocated here but not used in this cell.
m_Q = np.zeros((6, len(H), 2))
v_Q = np.zeros((6, len(H), 2))
m_g = np.zeros((6, len(H), 2))
v_g = np.zeros((6, len(H), 2))

t_adam = 1  # time step
# NOTE(review): nothing in this cell advances t_adam. If update_Q_cvar_adam does
# not increment it either, the bias-correction terms stay at their first-step
# values for the whole run -- confirm this is intended.
alpha_theta_initial = 0.01  # Learning rate
epsilon_initial = 0.1
seed_value = 7
eta_index = 100
np.random.seed(seed_value)
random.seed(seed_value)
last_idx = len(H) - 1  # largest valid index into H; shifted indices are clipped to it

# Q table indexed as [state, index into H, action].
Q_cvar = np.zeros((6, len(H), 2))
Q_cvar[:3, :, :] = 5.0
# States 3-5 (the states a rollout ends in) get -H[i] for every index from 100 up.
for i, h in enumerate(H[100:], start=100):
    Q_cvar[3:, i, :] = -h
Q_cvar_old = Q_cvar.copy()
Q_cvar_norm_hist = []
eta = H[eta_index]
Q_cvar_hist = []
Q_cvar_hist2 = []
Q_cvar_hist3 = []
eta_hist = []
prop = []
M0_hist = []
cvar_hist = []
for episode in range(num_episodes):
    # Step decay: both rates are multiplied by decay_rate once per 100 episodes.
    alpha_theta = alpha_theta_initial * (decay_rate ** (episode // 100))
    epsilon = epsilon_initial * (decay_rate ** (episode // 100))

    # ---- collect a batch of epsilon-greedy rollouts ----
    trajectory_batch = []
    for b in range(batch):
        # Each entry is [return accumulated before the step, state, action, reward].
        trajectory = []
        total_reward = 0
        state = 0
        action = 0  # state 0 always takes action 0 here
        reward = np.random.normal(mu[0], 1)
        trajectory.append([total_reward, state, action, reward])
        total_reward += reward
        state = 1
        # Shift the H index down by the reward expressed in tenths
        # (assumes H is spaced 0.1 apart -- TODO confirm against H's definition).
        eta_t_idx = np.clip(eta_index - int(10 * round(reward, 1)), 0, last_idx)
        action = epsilon_greedy(state, eta_t_idx, epsilon)
        if action == 0:
            reward = np.random.normal(mu[1], 1)
            state = 2
        else:
            reward = mu[2]
            state = 3
        trajectory.append([total_reward, 1, action, reward])
        total_reward += reward
        if action == 1:
            # Action 1 in state 1 ends the rollout after two steps.
            trajectory_batch.append(trajectory)
            continue
        eta_t_idx = np.clip(eta_t_idx - int(10 * round(reward, 1)), 0, last_idx)
        action = epsilon_greedy(state, eta_t_idx, epsilon)
        if action == 0:
            reward = np.random.normal(mu[3], 1)
            state = 4
        else:
            reward = mu[4]
            state = 5
        trajectory.append([total_reward, 2, action, reward])
        trajectory_batch.append(trajectory)

    # ---- periodic greedy evaluation (epsilon = 0.0), run before the update ----
    if episode % 500 == 0:
        test_num = 100000
        reward_cvar = np.zeros(test_num)
        r1_cvar = np.zeros(test_num)
        r2_cvar = np.zeros(test_num)
        r3_cvar = np.zeros(test_num)
        # 5 is a placeholder; act3_cvar keeps it for rollouts that end after step two.
        act2_cvar = np.zeros(test_num) + 5
        act3_cvar = np.zeros(test_num) + 5
        for c in range(test_num):
            total_reward = 0
            state = 0
            reward = np.random.normal(mu[state], 1)
            r1_cvar[c] = reward
            total_reward += r1_cvar[c]
            state = 1
            eta_t_idx = np.clip(eta_index - int(10 * round(reward, 1)), 0, last_idx)
            action = epsilon_greedy(state, eta_t_idx, 0.0)
            if action == 0:
                reward = np.random.normal(mu[1], 1)
                state = 2
            else:
                reward = mu[2]
                state = 3
            act2_cvar[c] = action
            r2_cvar[c] = reward
            total_reward += reward
            if action == 1:
                reward_cvar[c] = total_reward
                continue
            eta_t_idx = np.clip(eta_t_idx - int(10 * round(reward, 1)), 0, last_idx)
            action = epsilon_greedy(state, eta_t_idx, 0.0)
            if action == 0:
                reward = np.random.normal(mu[3], 1)
                state = 4
            else:
                reward = mu[4]
                # Matches the training rollout above: action 1 in state 2 leads to state 5.
                state = 5
            act3_cvar[c] = action
            r3_cvar[c] = reward
            total_reward += reward
            reward_cvar[c] = total_reward
        # Empirical q-quantile of the returns and the mean of the returns at or below it.
        var_q = np.percentile(reward_cvar, q * 100)
        cvar_q = np.mean(reward_cvar[reward_cvar <= var_q])
        cvar_hist.append(cvar_q)

    # ---- one Q update on the batch, then logging ----
    update_Q_cvar_adam(alpha_theta, trajectory_batch)
    eta_idx = np.argmax(H[:] * q + Q_cvar[0, :, 0])
    Q_cvar_hist3.append(H[eta_idx] * q + Q_cvar[0, eta_idx, 0])
    Q_cvar_hist.append(H[eta_opt_index] * q + Q_cvar[0, eta_opt_index, 0])
    if episode % 500 == 0:
        # Refresh the starting H index used by the rollouts from the current Q table.
        eta_index = np.argmax(H[:] * q + Q_cvar[0, :, 0])
        eta = H[eta_index]
        eta_hist.append(eta_index)
        if episode == 0:
            # After the very first update the index is forced back to 100 and logged
            # a second time, so eta_hist gets two entries for episode 0.
            eta_index = 100
            eta_hist.append(eta_index)


# In[32]:


# Training run for seed 8. Resets the Adam buffers and the Q table, then for
# each episode samples `batch` epsilon-greedy rollouts, applies one
# update_Q_cvar_adam step and logs the objective H*q + Q_cvar[0, :, 0].
# Every 500 episodes the greedy policy is also evaluated by Monte Carlo.
beta1, beta2, epsilon_adam = 0.9, 0.999, 1e-8

# Adam moment buffers; m_g / v_g are allocated here but not used in this cell.
m_Q = np.zeros((6, len(H), 2))
v_Q = np.zeros((6, len(H), 2))
m_g = np.zeros((6, len(H), 2))
v_g = np.zeros((6, len(H), 2))

t_adam = 1  # time step
# NOTE(review): nothing in this cell advances t_adam. If update_Q_cvar_adam does
# not increment it either, the bias-correction terms stay at their first-step
# values for the whole run -- confirm this is intended.
alpha_theta_initial = 0.01  # Learning rate
epsilon_initial = 0.1
seed_value = 8
eta_index = 100
np.random.seed(seed_value)
random.seed(seed_value)
last_idx = len(H) - 1  # largest valid index into H; shifted indices are clipped to it

# Q table indexed as [state, index into H, action].
Q_cvar = np.zeros((6, len(H), 2))
Q_cvar[:3, :, :] = 5.0
# States 3-5 (the states a rollout ends in) get -H[i] for every index from 100 up.
for i, h in enumerate(H[100:], start=100):
    Q_cvar[3:, i, :] = -h
Q_cvar_old = Q_cvar.copy()
Q_cvar_norm_hist = []
eta = H[eta_index]
Q_cvar_hist = []
Q_cvar_hist2 = []
Q_cvar_hist3 = []
eta_hist = []
prop = []
M0_hist = []
cvar_hist = []
for episode in range(num_episodes):
    # Step decay: both rates are multiplied by decay_rate once per 100 episodes.
    alpha_theta = alpha_theta_initial * (decay_rate ** (episode // 100))
    epsilon = epsilon_initial * (decay_rate ** (episode // 100))

    # ---- collect a batch of epsilon-greedy rollouts ----
    trajectory_batch = []
    for b in range(batch):
        # Each entry is [return accumulated before the step, state, action, reward].
        trajectory = []
        total_reward = 0
        state = 0
        action = 0  # state 0 always takes action 0 here
        reward = np.random.normal(mu[0], 1)
        trajectory.append([total_reward, state, action, reward])
        total_reward += reward
        state = 1
        # Shift the H index down by the reward expressed in tenths
        # (assumes H is spaced 0.1 apart -- TODO confirm against H's definition).
        eta_t_idx = np.clip(eta_index - int(10 * round(reward, 1)), 0, last_idx)
        action = epsilon_greedy(state, eta_t_idx, epsilon)
        if action == 0:
            reward = np.random.normal(mu[1], 1)
            state = 2
        else:
            reward = mu[2]
            state = 3
        trajectory.append([total_reward, 1, action, reward])
        total_reward += reward
        if action == 1:
            # Action 1 in state 1 ends the rollout after two steps.
            trajectory_batch.append(trajectory)
            continue
        eta_t_idx = np.clip(eta_t_idx - int(10 * round(reward, 1)), 0, last_idx)
        action = epsilon_greedy(state, eta_t_idx, epsilon)
        if action == 0:
            reward = np.random.normal(mu[3], 1)
            state = 4
        else:
            reward = mu[4]
            state = 5
        trajectory.append([total_reward, 2, action, reward])
        trajectory_batch.append(trajectory)

    # ---- periodic greedy evaluation (epsilon = 0.0), run before the update ----
    if episode % 500 == 0:
        test_num = 100000
        reward_cvar = np.zeros(test_num)
        r1_cvar = np.zeros(test_num)
        r2_cvar = np.zeros(test_num)
        r3_cvar = np.zeros(test_num)
        # 5 is a placeholder; act3_cvar keeps it for rollouts that end after step two.
        act2_cvar = np.zeros(test_num) + 5
        act3_cvar = np.zeros(test_num) + 5
        for c in range(test_num):
            total_reward = 0
            state = 0
            reward = np.random.normal(mu[state], 1)
            r1_cvar[c] = reward
            total_reward += r1_cvar[c]
            state = 1
            eta_t_idx = np.clip(eta_index - int(10 * round(reward, 1)), 0, last_idx)
            action = epsilon_greedy(state, eta_t_idx, 0.0)
            if action == 0:
                reward = np.random.normal(mu[1], 1)
                state = 2
            else:
                reward = mu[2]
                state = 3
            act2_cvar[c] = action
            r2_cvar[c] = reward
            total_reward += reward
            if action == 1:
                reward_cvar[c] = total_reward
                continue
            eta_t_idx = np.clip(eta_t_idx - int(10 * round(reward, 1)), 0, last_idx)
            action = epsilon_greedy(state, eta_t_idx, 0.0)
            if action == 0:
                reward = np.random.normal(mu[3], 1)
                state = 4
            else:
                reward = mu[4]
                # Matches the training rollout above: action 1 in state 2 leads to state 5.
                state = 5
            act3_cvar[c] = action
            r3_cvar[c] = reward
            total_reward += reward
            reward_cvar[c] = total_reward
        # Empirical q-quantile of the returns and the mean of the returns at or below it.
        var_q = np.percentile(reward_cvar, q * 100)
        cvar_q = np.mean(reward_cvar[reward_cvar <= var_q])
        cvar_hist.append(cvar_q)

    # ---- one Q update on the batch, then logging ----
    update_Q_cvar_adam(alpha_theta, trajectory_batch)
    eta_idx = np.argmax(H[:] * q + Q_cvar[0, :, 0])
    Q_cvar_hist3.append(H[eta_idx] * q + Q_cvar[0, eta_idx, 0])
    Q_cvar_hist.append(H[eta_opt_index] * q + Q_cvar[0, eta_opt_index, 0])
    if episode % 500 == 0:
        # Refresh the starting H index used by the rollouts from the current Q table.
        eta_index = np.argmax(H[:] * q + Q_cvar[0, :, 0])
        eta = H[eta_index]
        eta_hist.append(eta_index)
        if episode == 0:
            # After the very first update the index is forced back to 100 and logged
            # a second time, so eta_hist gets two entries for episode 0.
            eta_index = 100
            eta_hist.append(eta_index)


# Persist results as pickle files in the current working directory.
# NOTE(review): `cvar_hists` (plural) is not assigned in the nearby training
# cells, which only build the per-seed list `cvar_hist`. Confirm it is defined
# elsewhere in the file; otherwise this line raises NameError after the file
# has already been opened for writing.
with open('CVaR-Q_cvar.pickle', 'wb') as f:
    pickle.dump(cvar_hists, f)
# Saves whatever Q_cvar currently holds, i.e. the table left by the most
# recently executed training cell.
with open('CVaR-Q_Pre-trained model_u.pickle', 'wb') as f:
    pickle.dump(Q_cvar, f)

