import numpy as np
import matplotlib.pyplot as plt
import cvxpy as cp
# ---- Hyper-parameters -------------------------------------------------------
# Weight of the worst-case term: the robust TD targets mix the sampled
# next-state value (weight 1 - r) with the minimum state value (weight r).
r = 0.4
q = 0  # not referenced anywhere else in this file
max_steps = 500  # transitions simulated per TD rollout
learning_rate = 0.3  # step size of the TD updates
learning_rate_lam = 0.1  # step size of the multiplier (lam) update
learning_rate_PG = 0.01  # step size of the policy update
gamma = 0.9  # discount factor
PG_steps = 400  # number of outer policy-gradient iterations
TD_step = 300  # rollouts per TD evaluation run

# ---- State / action sets and the initial policy table -----------------------
All_states = list(range(64))
All_actions = [0, 1, 2, 3]
S_n = len(All_states)
A_n = len(All_actions)
theta = np.random.random((S_n, A_n))

# Uniform start-state distribution.  The previous element-by-element loop
# divided each entry by a sum that shrank as earlier entries were overwritten,
# which produced a skewed vector that did not sum to one.
rho = np.full(S_n, 1.0 / S_n)

# All-ones vector used to express "entries sum to one" in normal().
one_var = np.ones(A_n)




def normal_vec(vectr):
    """Return a copy of ``vectr`` rescaled so that its entries sum to one.

    Args:
        vectr: Array-like of numbers.

    Returns:
        A new float ``numpy.ndarray`` equal to ``vectr / sum(vectr)``.

    The input is never modified.  The earlier implementation overwrote the
    caller's array element by element (which also truncated the result when
    an integer array was passed) and shadowed the builtin ``sum``.
    """
    values = np.array(vectr, dtype=float)  # always a private float copy
    return values / np.sum(values)

def normal(theta):
    """Project every row of ``theta`` onto the probability simplex.

    For each state in ``All_states`` a small quadratic program is solved:
    minimise ``||x - theta[state, :]||^2`` subject to ``x >= 0`` and
    ``sum(x) == 1`` (written as the pair ``<= 1`` / ``>= 1``).  Entries of the
    solution below ``1e-3`` are then set to zero and the row is renormalised
    with ``normal_vec``.

    Args:
        theta: Array indexable as ``theta[state, :]`` with ``A_n`` entries
            per row.

    Returns:
        A new ``(S_n, A_n)`` array whose rows are non-negative and sum to one.

    Raises:
        RuntimeError: If the solver returns no solution for some row (for
            example because the row contains NaN), instead of failing later
            with an opaque ``TypeError``.
    """
    theta_solve = np.zeros((S_n, A_n))
    for state in All_states:
        # Construct the problem.
        x = cp.Variable(A_n)
        objective = cp.Minimize(cp.sum_squares(x - theta[state, :]))
        constraints = [0 <= x, one_var @ x <= 1, one_var @ x >= 1]
        prob = cp.Problem(objective, constraints)
        prob.solve()
        # The optimal value for x is stored in `x.value`; it is None when no
        # solution was found.
        solv = x.value
        if solv is None:
            raise RuntimeError(
                "simplex projection failed for state %r (solver status: %s)"
                % (state, prob.status)
            )
        # Remove solver noise: tiny (possibly negative) entries become exact
        # zeros so the row can be used as a probability vector.
        solv[solv < 1e-3] = 0
        theta_solve[state, :] = normal_vec(solv)
    return theta_solve


# Replace the raw random table by its row-wise projection onto the simplex so
# that every row can be used as an action distribution.
theta = normal(theta)

# Cost table: one independent draw from np.random.random() per
# (state, action) pair, generated state by state, action by action.
All_c = np.array(
    [[np.random.random() for _ in All_actions] for _ in All_states]
)

print(All_c)


def choose_action(state, theta):
    """Sample an action for ``state`` from row ``theta[state, :]``.

    Args:
        state: Row index into ``theta``.
        theta: Policy table; the selected row is passed to
            ``np.random.choice`` as the probability vector over
            ``All_actions``.

    Returns:
        One element of ``All_actions``.
    """
    action_probs = np.array(theta[state, :]).ravel()
    return np.random.choice(All_actions, p=action_probs)


# Smoke check: draw one action for state 4 under the initial policy.
print(choose_action(4, theta))

def step(obs, act):
    """Apply action ``act`` in state ``obs`` and return the transition.

    States are integers 0..63; the moves and edge checks treat them as the
    cells of an 8-wide grid stored row by row.  Actions shift the state by
    -8 (0), +8 (1), -1 (2) and +1 (3).

    Outcomes, checked in this order:
      * the move would leave the grid: reward -5, the state is unchanged;
      * the destination is one of the penalised cells: reward -10;
      * the destination is the bonus cell of this action: reward 20 and the
        next state is 0;
      * otherwise: reward 0.

    Args:
        obs: Current state index.
        act: Action, one of 0, 1, 2, 3.

    Returns:
        Tuple ``(reward, cost, next_state)`` where ``cost`` is
        ``All_c[obs, act]``.

    Raises:
        ValueError: If ``act`` is not one of the four known actions (the
            previous code silently returned ``None`` in that case).
    """
    width = 8
    last_state = 63
    penalised = (19, 29, 35, 41, 42, 46, 49, 52, 54, 59)

    if act == 0:
        st2 = obs - width
        blocked = st2 < 0
        bonus_cell = None
    elif act == 1:
        st2 = obs + width
        blocked = st2 > last_state
        # NOTE(review): the bonus cell for action 1 is 15 while action 3 uses
        # 63; kept exactly as in the original -- confirm this is intended.
        bonus_cell = 15
    elif act == 2:
        st2 = obs - 1
        blocked = obs % width == 0  # left column: 0, 8, ..., 56
        bonus_cell = None
    elif act == 3:
        st2 = obs + 1
        blocked = obs % width == width - 1  # right column: 7, 15, ..., 63
        bonus_cell = 63
    else:
        raise ValueError(f"unknown action {act!r}; expected 0, 1, 2 or 3")

    # Every outcome carries the tabulated cost.  The plain-move branch of
    # action 0 used to return a hard-coded cost of 1, unlike all other
    # branches; it now uses the table as well.
    cost = All_c[obs, act]

    if blocked:
        return -5, cost, obs
    if st2 in penalised:
        return -10, cost, st2
    if bonus_cell is not None and st2 == bonus_cell:
        return 20, cost, 0
    return 0, cost, st2





def reset():
    """Draw a start state from ``All_states``.

    The sampling weights are ``rho`` passed through ``normal_vec``.

    Returns:
        One element of ``All_states``.
    """
    start_probs = normal_vec(rho).ravel()
    return np.random.choice(All_states, p=start_probs)


# Reward action-value table updated in place by robust_TDr().  The function
# looks the name up at call time, so rebinding ``Qr`` at module level changes
# which table is updated.
Qr = np.ones((S_n, A_n))


def robust_TDr(observation, observation2, reward, action, action2, theta):
    """Perform one robust TD update of ``Qr[observation, action]``.

    The target is
    ``reward + gamma * (1 - r) * Qr[observation2, action2] + gamma * r * min_s V(s)``
    with ``V(s) = sum_a theta[s, a] * Qr[s, a]``.  The entry is moved towards
    the target with step size ``learning_rate``.

    Args:
        observation: State in which ``action`` was taken.
        observation2: Successor state.
        reward: Observed reward.
        action: Action taken in ``observation``.
        action2: Action selected in ``observation2``.
        theta: ``(S_n, A_n)`` policy array used to form the state values.

    Returns:
        None; ``Qr`` is modified in place.
    """
    # State values for all states in one vectorised pass (this replaces a
    # pure-Python double loop that also shadowed the builtin ``sum``).
    worst_value = np.min(np.sum(theta * Qr, axis=1))
    target = (
        reward
        + gamma * (1 - r) * Qr[observation2, action2]
        + gamma * r * worst_value
    )
    Qr[observation, action] = (
        (1 - learning_rate) * Qr[observation, action] + learning_rate * target
    )

# Cost action-value table updated in place by robust_TDg().  The function
# looks the name up at call time, so rebinding ``Qg`` at module level changes
# which table is updated.
Qg = np.ones((S_n, A_n))


def robust_TDg(observation, observation2, cost, action, action2, theta):
    """Perform one robust TD update of ``Qg[observation, action]``.

    The target is
    ``cost + gamma * (1 - r) * Qg[observation2, action2] + gamma * r * min_s V(s)``
    with ``V(s) = sum_a theta[s, a] * Qg[s, a]``.  The entry is moved towards
    the target with step size ``learning_rate``.

    Args:
        observation: State in which ``action`` was taken.
        observation2: Successor state.
        cost: Observed cost signal.
        action: Action taken in ``observation``.
        action2: Action selected in ``observation2``.
        theta: ``(S_n, A_n)`` policy array used to form the state values.

    Returns:
        None; ``Qg`` is modified in place.
    """
    # State values for all states in one vectorised pass (this replaces a
    # pure-Python double loop that also shadowed the builtin ``sum``).
    worst_value = np.min(np.sum(theta * Qg, axis=1))
    target = (
        cost
        + gamma * (1 - r) * Qg[observation2, action2]
        + gamma * r * worst_value
    )
    Qg[observation, action] = (
        (1 - learning_rate) * Qg[observation, action] + learning_rate * target
    )



# Reward action-value table updated in place by test_TDr().  The function
# looks the name up at call time, so rebinding ``Qr_test`` at module level
# changes which table is updated.
Qr_test = np.ones((S_n, A_n))


def test_TDr(observation, observation2, reward, action, action2, theta):
    """Perform one robust TD update of ``Qr_test[observation, action]``.

    Same update rule as ``robust_TDr`` but applied to the separate table
    ``Qr_test``: the target is
    ``reward + gamma * (1 - r) * Qr_test[observation2, action2] + gamma * r * min_s V(s)``
    with ``V(s) = sum_a theta[s, a] * Qr_test[s, a]``.

    Args:
        observation: State in which ``action`` was taken.
        observation2: Successor state.
        reward: Observed reward.
        action: Action taken in ``observation``.
        action2: Action selected in ``observation2``.
        theta: ``(S_n, A_n)`` policy array used to form the state values.

    Returns:
        None; ``Qr_test`` is modified in place.
    """
    # State values for all states in one vectorised pass (this replaces a
    # pure-Python double loop that also shadowed the builtin ``sum``).
    worst_value = np.min(np.sum(theta * Qr_test, axis=1))
    target = (
        reward
        + gamma * (1 - r) * Qr_test[observation2, action2]
        + gamma * r * worst_value
    )
    Qr_test[observation, action] = (
        (1 - learning_rate) * Qr_test[observation, action]
        + learning_rate * target
    )

# Cost action-value table updated in place by test_TDg().  The function
# looks the name up at call time, so rebinding ``Qg_test`` at module level
# changes which table is updated.
Qg_test = np.ones((S_n, A_n))


def test_TDg(observation, observation2, cost, action, action2, theta):
    """Perform one robust TD update of ``Qg_test[observation, action]``.

    Same update rule as ``robust_TDg`` but applied to the separate table
    ``Qg_test``: the target is
    ``cost + gamma * (1 - r) * Qg_test[observation2, action2] + gamma * r * min_s V(s)``
    with ``V(s) = sum_a theta[s, a] * Qg_test[s, a]``.

    Args:
        observation: State in which ``action`` was taken.
        observation2: Successor state.
        cost: Observed cost signal.
        action: Action taken in ``observation``.
        action2: Action selected in ``observation2``.
        theta: ``(S_n, A_n)`` policy array used to form the state values.

    Returns:
        None; ``Qg_test`` is modified in place.
    """
    # State values for all states in one vectorised pass (this replaces a
    # pure-Python double loop that also shadowed the builtin ``sum``).
    worst_value = np.min(np.sum(theta * Qg_test, axis=1))
    target = (
        cost
        + gamma * (1 - r) * Qg_test[observation2, action2]
        + gamma * r * worst_value
    )
    Qg_test[observation, action] = (
        (1 - learning_rate) * Qg_test[observation, action]
        + learning_rate * target
    )





# Action-value tables of the plain (non-robust) TD updates: Qnr is written by
# TDr(), Qng by TDg().
Qnr = np.zeros(shape=(S_n, A_n))
Qng = np.zeros(shape=(S_n, A_n))


def TDr(observation, observation2, reward, action, action2, theta):
    """Perform one plain TD update of ``Qnr[observation, action]``.

    The entry is moved, with step size ``learning_rate``, towards
    ``reward + gamma * Qnr[observation2, action2]``.

    Args:
        observation: State in which ``action`` was taken.
        observation2: Successor state.
        reward: Observed reward.
        action: Action taken in ``observation``.
        action2: Action selected in ``observation2``.
        theta: Accepted for signature parity with the robust variants; not
            used by this update.

    Returns:
        None; ``Qnr`` is modified in place.
    """
    previous = Qnr[observation, action]
    bootstrap = reward + gamma * Qnr[observation2, action2]
    Qnr[observation, action] = (
        (1 - learning_rate) * previous + learning_rate * bootstrap
    )

def TDg(observation, observation2, cost, action, action2, theta):
    """Perform one plain TD update of ``Qng[observation, action]``.

    The entry is moved, with step size ``learning_rate``, towards
    ``cost + gamma * Qng[observation2, action2]``.

    Args:
        observation: State in which ``action`` was taken.
        observation2: Successor state.
        cost: Observed cost signal.
        action: Action taken in ``observation``.
        action2: Action selected in ``observation2``.
        theta: Accepted for signature parity with the robust variants; not
            used by this update.

    Returns:
        None; ``Qng`` is modified in place.
    """
    previous = Qng[observation, action]
    bootstrap = cost + gamma * Qng[observation2, action2]
    Qng[observation, action] = (
        (1 - learning_rate) * previous + learning_rate * bootstrap
    )


# Rebind Qr to a random table; it is printed by the smoke check below and
# reset to zeros before the policy-gradient loop starts.
Qr = np.random.random(size=(S_n, A_n))
def Vrr(Qqqr, theta):  # compute V given a known Q
    """Return the state values induced by an action-value table and a policy.

    ``V[s] = sum_a theta[s, a] * Qqqr[s, a]``.

    Args:
        Qqqr: Array-like of shape ``(n_states, n_actions)`` with action values.
        theta: Array-like of the same shape with the policy probabilities.

    Returns:
        One-dimensional float array of length ``n_states``.

    Raises:
        ValueError: If the two tables do not have the same shape.

    The shape is taken from the arguments rather than from module-level
    sizes, and the computation is a single vectorised expression instead of
    a double loop that shadowed the builtin ``sum``.
    """
    q_table = np.asarray(Qqqr, dtype=float)
    policy = np.asarray(theta, dtype=float)
    if q_table.shape != policy.shape:
        raise ValueError(
            "Qqqr and theta must have the same shape, got %s and %s"
            % (q_table.shape, policy.shape)
        )
    return np.sum(policy * q_table, axis=1)
# Smoke check: print the state values of the current Qr under the current
# policy table.
print('test')
print(Vrr(Qqqr=Qr, theta=theta))




def Vgg(Qqqr, theta):  # compute V given a known Q
    """Return the state values induced by an action-value table and a policy.

    ``V[s] = sum_a theta[s, a] * Qqqr[s, a]``.

    Args:
        Qqqr: Array-like of shape ``(n_states, n_actions)`` with action values.
        theta: Array-like of the same shape with the policy probabilities.

    Returns:
        One-dimensional float array of length ``n_states``.

    Raises:
        ValueError: If the two tables do not have the same shape.

    The shape is taken from the arguments rather than from module-level
    sizes, and the computation is a single vectorised expression instead of
    a double loop that shadowed the builtin ``sum``.
    """
    q_table = np.asarray(Qqqr, dtype=float)
    policy = np.asarray(theta, dtype=float)
    if q_table.shape != policy.shape:
        raise ValueError(
            "Qqqr and theta must have the same shape, got %s and %s"
            % (q_table.shape, policy.shape)
        )
    return np.sum(policy * q_table, axis=1)






####   C robust PG
# Success probability of the geometric rollout length used when sampling
# states for the robust gradient estimate.
p_geo = 1 - gamma + gamma * r

# Both policies start from the all-ones table, which normal() maps onto the
# simplex row by row.  (A random initialisation that used to precede this was
# overwritten before it was ever read and has been removed.)
theta = normal(np.ones((S_n, A_n)))
# Independent copy so the two policies can never share storage.
theta_n = theta.copy()


Qr = np.zeros((S_n, A_n))
Qnr = np.zeros((S_n, A_n))
print(theta)
print(theta_n)

# Per-iteration histories.  theta is tracked in CRPG_r / CRPG_g / CRPG_l,
# theta_n in CRPG_nr / CRPG_ng / CRPG_nl (value of the reward table, value of
# the cost table, multiplier).
CRPG_r = []
CRPG_g = []
CRPG_l = []
CRPG_nr = []
CRPG_ng = []
CRPG_nl = []

# Initial multipliers of the cost term for theta and theta_n.
lam = 5
lam_n = 5
def _rollout_end_state(start_state, policy, n_steps):
    """Follow ``policy`` for ``n_steps`` transitions and return the last state."""
    obs = start_state
    act = choose_action(obs, policy)
    for _ in range(n_steps):
        _, _, obs = step(obs, act)
        act = choose_action(obs, policy)
    return obs


def _visitation_gradient(q_values, policy, geo_p, scale, start_state=None,
                         n_samples=100):
    """Average ``scale * q_values[s, :]`` over sampled end states ``s``.

    Each sample draws a rollout length from ``Geometric(geo_p)``, starts from
    ``reset()`` (or from ``start_state`` when given), follows ``policy`` for
    that many steps and adds ``scale * q_values[s, :] / n_samples`` to row
    ``s`` of the result, where ``s`` is the state reached.
    """
    grad = np.zeros((S_n, A_n))
    for _ in range(n_samples):
        horizon = np.random.geometric(p=geo_p)
        start = reset() if start_state is None else start_state
        end_state = _rollout_end_state(start, policy, horizon)
        grad[end_state, :] += scale * q_values[end_state, :] / n_samples
    return grad


for jj in range(PG_steps):
    print("Qr=")
    print(Qr)
    print("Qnr=")
    print(Qnr)
    # Map both policy tables back onto the simplex before they are sampled.
    theta = normal(theta)
    print('theta=')
    print(theta)
    theta_n = normal(theta_n)
    CRPG_l.append(lam)
    CRPG_nl.append(lam_n)
    print(jj)
    #####evaluation
    Qr_ave = np.zeros((S_n, A_n))
    Qg_ave = np.zeros((S_n, A_n))
    Qnr_ave = np.zeros((S_n, A_n))
    Qng_ave = np.zeros((S_n, A_n))
    Qtestr_ave = np.zeros((S_n, A_n))
    Qtestg_ave = np.zeros((S_n, A_n))
    ###robust TD and non-robust TD
    for i in range(5):
        # Rebind the module-level tables that the TD helpers update in place,
        # so that every run starts from zeros.
        Qr = np.zeros((S_n, A_n))
        Qg = np.zeros((S_n, A_n))
        Qnr = np.zeros((S_n, A_n))
        Qng = np.zeros((S_n, A_n))
        Qr_test = np.zeros((S_n, A_n))
        Qg_test = np.zeros((S_n, A_n))
        for j in range(TD_step):
            # Three chains share the start state: robust TD under theta,
            # plain TD under theta_n and robust TD under theta_n ("test").
            obs = reset()
            obs_n = obs
            obs_test = obs
            action = choose_action(obs, theta)
            action_n = choose_action(obs_n, theta_n)
            action_test = choose_action(obs_test, theta_n)
            for _ in range(max_steps):
                reward, cost, obs2 = step(obs, action)
                act2 = choose_action(obs2, theta)
                robust_TDr(obs, obs2, reward, action, act2, theta)
                robust_TDg(obs, obs2, cost, action, act2, theta)
                obs = obs2
                action = act2
                reward_n, cost_n, obs_n2 = step(obs_n, action_n)
                act_n2 = choose_action(obs_n2, theta_n)
                TDr(obs_n, obs_n2, reward_n, action_n, act_n2, theta_n)
                TDg(obs_n, obs_n2, cost_n, action_n, act_n2, theta_n)
                obs_n = obs_n2
                action_n = act_n2
                reward_test, cost_test, obs_test2 = step(obs_test, action_test)
                act_test2 = choose_action(obs_test2, theta_n)
                test_TDr(obs_test, obs_test2, reward_test, action_test, act_test2, theta_n)
                test_TDg(obs_test, obs_test2, cost_test, action_test, act_test2, theta_n)
                obs_test = obs_test2
                action_test = act_test2
        Qr_ave = Qr_ave + Qr / 5
        Qg_ave = Qg_ave + Qg / 5
        # NOTE(review): Qnr_ave / Qng_ave are accumulated but never read
        # afterwards -- confirm whether the non-robust gradient below was
        # meant to use them.
        Qnr_ave = Qnr_ave + Qnr / 5
        Qng_ave = Qng_ave + Qng / 5
        Qtestr_ave = Qtestr_ave + Qr_test / 5
        Qtestg_ave = Qtestg_ave + Qg_test / 5
    #######append numbers
    Qr = Qr_ave
    Qg = Qg_ave
    Qnr = Qtestr_ave
    Qng = Qtestg_ave
    Vr = Vrr(Qr_ave, theta)
    Vg = Vgg(Qg_ave, theta)
    Vnr = Vrr(Qtestr_ave, theta_n)
    Vng = Vgg(Qtestg_ave, theta_n)
    Vg_rho = np.mean(Vg)
    Vr_rho = np.mean(Vr)
    vr_test_rho = np.mean(Vnr)
    vg_test_rho = np.mean(Vng)
    Vng_rho = vg_test_rho
    CRPG_g.append(Vg_rho)
    CRPG_r.append(Vr_rho)
    CRPG_ng.append(vg_test_rho)
    CRPG_nr.append(vr_test_rho)
    ####computed gradient V_r+lam(v_g-b)
    # Each robust gradient has two parts: rollouts started from reset(), and
    # rollouts started from the state with the smallest value.  Every rollout
    # now draws its own geometric length; the worst-state rollouts used to
    # reuse the last length drawn by the preceding loop.
    worst_scale = gamma * r / (p_geo * (1 - gamma))
    sr_min = np.argmin(Vr)
    grad_Vr_ave = (
        _visitation_gradient(Qr, theta, p_geo, 1 / p_geo)
        + _visitation_gradient(Qr, theta, p_geo, worst_scale, start_state=sr_min)
    )
    #####gradient of robust V_g
    # The worst-state part used to add the stale matrix from the preceding
    # loop instead of the one it had just computed; both parts are now
    # accumulated from their own samples.
    sg_min = np.argmin(Vg)
    grad_Vg_ave = (
        _visitation_gradient(Qg, theta, p_geo, 1 / p_geo)
        + _visitation_gradient(Qg, theta, p_geo, worst_scale, start_state=sg_min)
    )
    grad_V = grad_Vr_ave + lam * grad_Vg_ave
    theta = theta + learning_rate_PG * (grad_V)
    ####gradient of Vr_norobust
    # NOTE(review): the tables used here are those of the LAST evaluation run
    # (Qr_test / Qg_test), not the averages computed above; kept as in the
    # original -- confirm.
    Qnr = Qr_test
    Qng = Qg_test
    grad_Vnr_ave = _visitation_gradient(Qnr, theta_n, 1 - gamma, 1 / (1 - gamma))
    ###gradient of Vg_norbost
    grad_Vng_ave = _visitation_gradient(Qng, theta_n, 1 - gamma, 1 / (1 - gamma))
    theta_n = theta_n + learning_rate_PG * (grad_Vnr_ave + lam_n * grad_Vng_ave)
    ########update lambda
    # Step each multiplier against (value of the cost table - 2) and clip at
    # zero; 2 is presumably the bound "b" of the comment above -- confirm.
    lam = lam - learning_rate_lam * (Vg_rho - 2)
    lam_n = lam_n - learning_rate_lam * (Vng_rho - 2)
    if lam < 0:
        lam = 0
    if lam_n < 0:
        lam_n = 0





# Dump every recorded series as the text of its Python list, one file each.
# The context manager closes the file even if the write fails.
for path, series in (
    ('fl_r_1.txt', CRPG_r),
    ('fl_g_1.txt', CRPG_g),
    ('fl_l_1.txt', CRPG_l),
    ('fl_nr_1.txt', CRPG_nr),
    ('fl_ng_1.txt', CRPG_ng),
    ('fl_nl_1.txt', CRPG_nl),
):
    with open(path, 'w') as file:
        file.write(str(series))

# Plot the four value histories on one figure, then print both final policy
# tables.
for series, tag in (
    (CRPG_r, 'r'),
    (CRPG_g, 'g'),
    (CRPG_nr, 'nr'),
    (CRPG_ng, 'ng'),
):
    plt.plot(series, label=tag)
plt.legend()
plt.show()

for policy_table in (theta, theta_n):
    print(policy_table)

