import numpy as np
import torch
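# The previous step's output is used as the next step's input.
# NOTE(review): the function directly below scores every logged step
# independently (nothing is fed forward), so this remark may be misplaced.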
def evaluate_episode_rtg(
        state_dim,
        act_dim,
        model,
        paths,
        max_ep_len=1000,
        scale=1000.,
        state_mean=0.,
        state_std=1.,
        device='cuda',
        target_return=None,
        mode='normal',
    ):
    """Query the model once per logged step and collect its predictions.

    Every step of every trajectory in ``paths`` is evaluated in isolation:
    the model sees a history of length one (the logged observation, a zero
    action, a zero reward, the constant ``target_return`` and timestep 0).
    Nothing predicted at one step is fed into the next.

    NOTE(review): this module defines ``evaluate_episode_rtg`` a second time
    further down (with an ``env`` first argument). At import time that later
    definition replaces this one, so this body is only reachable if one of
    the two is renamed.

    Args:
        state_dim: Size of one observation vector.
        act_dim: Size of one action vector.
        model: Object exposing ``eval()``, ``to(device=...)`` and
            ``get_action(states, actions, rewards, returns_to_go, timesteps)``
            returning ``(state_pred, action_pred, return_pred)``.
        paths: Sequence of dicts with ``'observations'``, ``'rewards'``,
            ``'actions'`` and ``'dieds'`` entries, indexable per step.
        max_ep_len: Unused; each trajectory is walked over its full length.
        scale: Unused.
        state_mean: Array, tensor or scalar subtracted from each observation.
        state_std: Array, tensor or scalar each observation is divided by.
        device: Torch device the tensors are placed on.
        target_return: Scalar return-to-go given to the model at every step.
            Must be provided (the ``None`` default cannot be tensorised).
        mode: Unused.

    Returns:
        Tuple ``(reward_agent, reward_phy, state_agent, state_phy,
        action_agent, action_phy, die)`` of per-step lists. ``*_agent`` hold
        model outputs (``reward_agent`` stores the absolute value of the
        predicted return), ``*_phy`` and ``die`` hold the logged values.
    """
    model.eval()
    model.to(device=device)

    # as_tensor behaves like from_numpy for ndarrays and additionally accepts
    # the plain-float defaults, which from_numpy rejects with a TypeError.
    state_mean = torch.as_tensor(state_mean).to(device=device)
    state_std = torch.as_tensor(state_std).to(device=device)

    action_agent, action_phy = [], []
    state_agent, state_phy = [], []
    reward_agent, reward_phy = [], []
    die = []

    for path in paths:
        states_phy = path['observations']
        rewards_phy = path['rewards']
        actions_phy = path['actions']

        for t in range(len(states_phy)):
            # Length-one history: current observation plus zero padding.
            states = torch.from_numpy(states_phy[t]).reshape(1, state_dim).to(
                device=device, dtype=torch.float32)
            actions = torch.zeros((1, act_dim), device=device, dtype=torch.float32)
            rewards = torch.zeros(1, device=device, dtype=torch.float32)
            # Built from the untouched scalar each step; the parameter itself
            # is never rebound to a tensor.
            returns_to_go = torch.tensor(
                target_return, device=device, dtype=torch.float32).reshape(1, 1)
            timesteps = torch.zeros((1, 1), device=device, dtype=torch.long)

            state_pred, action_pred, return_pred = model.get_action(
                (states - state_mean) / state_std,
                actions,
                rewards,
                returns_to_go,
                timesteps,
            )

            reward_agent.append(np.abs(return_pred.cpu().detach().numpy()))
            reward_phy.append(rewards_phy[t])

            action_phy.append(actions_phy[t])
            action_agent.append(action_pred)
            die.append(path['dieds'][t])
            state_agent.append(state_pred.detach().cpu().numpy())
            state_phy.append(states_phy[t])

    return reward_agent, reward_phy, state_agent, state_phy, action_agent, action_phy, die

def evaluate_episode(
        env,
        state_dim,
        act_dim,
        model,
        max_ep_len=1000,
        device='cuda',
        target_return=None,
        mode='normal',
        state_mean=0.,
        state_std=1.,
):
    """Roll the model out in ``env`` for at most ``max_ep_len`` steps.

    The full history of observations, actions and (all-zero) rewards is kept
    on ``device`` and handed to ``model.get_action`` at every step.

    Args:
        env: Environment with ``reset()`` returning an ndarray observation and
            ``step(action)`` returning a 4-tuple whose first element is the
            next observation and whose third element is the done flag.
        state_dim: Size of one observation vector.
        act_dim: Size of one action vector.
        model: Object exposing ``eval()``, ``to(device=...)`` and
            ``get_action(states, actions, rewards, target_return=...)``
            returning the next action as a tensor.
        max_ep_len: Maximum number of environment steps.
        device: Torch device the tensors are placed on.
        target_return: Value forwarded to the model as a float32 tensor.
            Must be provided (the ``None`` default cannot be tensorised).
        mode: Unused.
        state_mean: Array, tensor or scalar subtracted from the observations.
        state_std: Array, tensor or scalar the observations are divided by.

    Returns:
        ``(episode_return, episode_length)``. ``episode_return`` is always 0:
        reward accumulation is disabled in this variant because the
        environment reward is ignored, so the reward history stays zero.
    """
    model.eval()
    model.to(device=device)

    # as_tensor behaves like from_numpy for ndarrays and additionally accepts
    # the plain-float defaults, which from_numpy rejects with a TypeError.
    state_mean = torch.as_tensor(state_mean).to(device=device)
    state_std = torch.as_tensor(state_std).to(device=device)

    state = env.reset()

    # All histories live on the device; the most recent action/reward entry
    # is zero padding until the model has produced the action.
    states = torch.from_numpy(state).reshape(1, state_dim).to(device=device, dtype=torch.float32)
    actions = torch.zeros((0, act_dim), device=device, dtype=torch.float32)
    rewards = torch.zeros(0, device=device, dtype=torch.float32)
    target_return = torch.tensor(target_return, device=device, dtype=torch.float32)

    episode_return, episode_length = 0, 0
    for _ in range(max_ep_len):
        # Add padding for the step that is about to be predicted.
        actions = torch.cat([actions, torch.zeros((1, act_dim), device=device)], dim=0)
        rewards = torch.cat([rewards, torch.zeros(1, device=device)])

        action = model.get_action(
            (states.to(dtype=torch.float32) - state_mean) / state_std,
            actions.to(dtype=torch.float32),
            rewards.to(dtype=torch.float32),
            target_return=target_return,
        )
        actions[-1] = action
        action = action.detach().cpu().numpy()

        # The environment reward is deliberately not recorded.
        state, _, done, _ = env.step(action)

        # Cast on append so the history stays float32 even if the environment
        # returns float64 observations.
        cur_state = torch.from_numpy(state).to(
            device=device, dtype=torch.float32).reshape(1, state_dim)
        states = torch.cat([states, cur_state], dim=0)

        episode_length += 1

        if done:
            break

    return episode_return, episode_length

def evaluate_episode_bc(
        state_dim,
        act_dim,
        model,
        paths,
        max_ep_len=1000,
        device='cuda',
        target_return=None,
        mode='normal',
        state_mean=0.,
        state_std=1.,
):
    """Mean squared error between model actions and logged actions.

    Only the first trajectory (``paths[0]``) is evaluated. Each step is
    scored in isolation: the model receives the single normalised
    observation together with one zero action and one zero reward.

    Args:
        state_dim: Size of one observation vector.
        act_dim: Size of one action vector.
        model: Object exposing ``eval()``, ``to(device=...)`` and
            ``get_action(states, actions, rewards)`` returning a tensor.
        paths: Sequence whose first item is a dict with ``'observations'``
            and ``'actions'`` entries, indexable per step.
        max_ep_len: Unused.
        device: Torch device the tensors are placed on.
        target_return: Unused; the model is not return-conditioned here, so
            the default ``None`` is accepted.
        mode: Unused.
        state_mean: Array, tensor or scalar subtracted from each observation.
        state_std: Array, tensor or scalar each observation is divided by.

    Returns:
        The mean over steps of the per-step mean squared action error
        (``nan`` if the trajectory has no steps).
    """
    model.eval()
    model.to(device=device)

    # as_tensor behaves like from_numpy for ndarrays and additionally accepts
    # the plain-float defaults, which from_numpy rejects with a TypeError.
    state_mean = torch.as_tensor(state_mean).to(device=device)
    state_std = torch.as_tensor(state_std).to(device=device)

    observations = paths[0]['observations']
    logged_actions = paths[0]['actions']

    squared_errors = []
    for i, observation in enumerate(observations):
        action_ph = logged_actions[i]

        # Length-one history: current observation plus zero padding.
        states = torch.from_numpy(observation).reshape(1, state_dim).to(
            device=device, dtype=torch.float32)
        actions = torch.zeros((1, act_dim), device=device, dtype=torch.float32)
        rewards = torch.zeros(1, device=device, dtype=torch.float32)

        action = model.get_action(
            (states - state_mean) / state_std,
            actions,
            rewards,
        )
        action = action.detach().cpu().numpy()

        squared_errors.append(np.mean((action - action_ph) ** 2))

    return np.mean(squared_errors)

def evaluate_episode_rtg(
        env,
        state_dim,
        act_dim,
        model,
        max_ep_len=1000,
        scale=1000.,
        state_mean=0.,
        state_std=1.,
        device='cuda',
        target_return=None,
        mode='normal',
    ):
    """Roll out a return-conditioned model in ``env`` using its own return predictions.

    At every step the model receives the full history (observations,
    actions, rewards, returns-to-go, timesteps). The reward history and the
    accumulated episode return are filled with the model's *predicted*
    return; the reward reported by the environment is ignored.

    NOTE(review): an earlier function in this module has the same name (with
    a ``paths`` argument instead of ``env``); this definition replaces it at
    import time.

    Args:
        env: Environment with ``reset()`` returning an ndarray observation and
            ``step(action)`` returning a 4-tuple whose first element is the
            next observation and whose third element is the done flag.
        state_dim: Size of one observation vector.
        act_dim: Size of one action vector.
        model: Object exposing ``eval()``, ``to(device=...)`` and
            ``get_action(states, actions, rewards, returns_to_go, timesteps)``
            returning ``(_, action, return_pred)``; ``return_pred`` must be a
            single-element tensor.
        max_ep_len: Maximum number of environment steps.
        scale: Divisor applied to the predicted return before it is
            subtracted from the running return-to-go.
        state_mean: Array, tensor or scalar subtracted from the observations.
        state_std: Array, tensor or scalar the observations are divided by.
        device: Torch device the tensors are placed on.
        target_return: Initial return-to-go. Must be provided (the ``None``
            default cannot be tensorised).
        mode: ``'noise'`` perturbs the initial observation with Gaussian
            noise (std 0.1); ``'delayed'`` keeps the return-to-go constant;
            any other value decrements it by ``return_pred / scale`` per step.

    Returns:
        ``(episode_return, episode_length)`` where ``episode_return`` is the
        sum of the predicted returns (a tensor once at least one step ran).
    """
    model.eval()
    model.to(device=device)

    # as_tensor behaves like from_numpy for ndarrays and additionally accepts
    # the plain-float defaults, which from_numpy rejects with a TypeError.
    state_mean = torch.as_tensor(state_mean).to(device=device)
    state_std = torch.as_tensor(state_std).to(device=device)

    state = env.reset()
    if mode == 'noise':
        state = state + np.random.normal(0, 0.1, size=state.shape)

    # All histories live on the device; the most recent action/reward entry
    # is zero padding until the model has produced its prediction.
    states = torch.from_numpy(state).reshape(1, state_dim).to(device=device, dtype=torch.float32)
    actions = torch.zeros((0, act_dim), device=device, dtype=torch.float32)
    rewards = torch.zeros(0, device=device, dtype=torch.float32)

    returns_to_go = torch.tensor(
        target_return, device=device, dtype=torch.float32).reshape(1, 1)
    timesteps = torch.tensor(0, device=device, dtype=torch.long).reshape(1, 1)

    episode_return, episode_length = 0, 0
    for t in range(max_ep_len):
        # Add padding for the step that is about to be predicted.
        actions = torch.cat([actions, torch.zeros((1, act_dim), device=device)], dim=0)
        rewards = torch.cat([rewards, torch.zeros(1, device=device)])

        _, action, return_preds = model.get_action(
            (states.to(dtype=torch.float32) - state_mean) / state_std,
            actions.to(dtype=torch.float32),
            rewards.to(dtype=torch.float32),
            returns_to_go.to(dtype=torch.float32),
            timesteps.to(dtype=torch.long),
        )
        actions[-1] = action
        action = action.detach().cpu().numpy()

        # Only the next observation and the done flag are used; the
        # environment reward and the fourth return value are discarded.
        state, _, done, _ = env.step(action)

        # Cast on append so the history stays float32 even if the environment
        # returns float64 observations.
        cur_state = torch.from_numpy(state).to(
            device=device, dtype=torch.float32).reshape(1, state_dim)
        states = torch.cat([states, cur_state], dim=0)
        rewards[-1] = return_preds

        if mode != 'delayed':
            pred_return = returns_to_go[0, -1] - (return_preds / scale)
        else:
            pred_return = returns_to_go[0, -1]
        returns_to_go = torch.cat(
            [returns_to_go, pred_return.reshape(1, 1)], dim=1)
        timesteps = torch.cat(
            [timesteps,
             torch.ones((1, 1), device=device, dtype=torch.long) * (t + 1)], dim=1)

        episode_return += return_preds
        episode_length += 1

        if done:
            break

    return episode_return, episode_length

def evaluate_episode_rtg_action(
        state_dim,
        act_dim,
        model,
        paths,
        max_ep_len=1000,
        scale=1000.,
        state_mean=0.,
        state_std=1.,
        device='cuda',
        target_return=None,
        mode='normal',
    ):
    """Mean squared error between return-conditioned model actions and logged actions.

    Only the first trajectory (``paths[0]``) is evaluated. Each step is
    scored in isolation: the model receives the single normalised
    observation, one zero action, one zero reward, the constant
    ``target_return`` and timestep 0.

    Args:
        state_dim: Size of one observation vector.
        act_dim: Size of one action vector.
        model: Object exposing ``eval()``, ``to(device=...)`` and
            ``get_action(states, actions, rewards, returns_to_go, timesteps)``
            returning a 3-tuple whose second element is the action tensor.
        paths: Sequence whose first item is a dict with ``'observations'``
            and ``'actions'`` entries, indexable per step.
        max_ep_len: Unused.
        scale: Unused.
        state_mean: Array, tensor or scalar subtracted from each observation.
        state_std: Array, tensor or scalar each observation is divided by.
        device: Torch device the tensors are placed on.
        target_return: Scalar return-to-go given to the model at every step.
            Must be provided (the ``None`` default cannot be tensorised).
        mode: Unused.

    Returns:
        The mean over steps of the per-step mean squared action error
        (``nan`` if the trajectory has no steps).
    """
    model.eval()
    model.to(device=device)

    # as_tensor behaves like from_numpy for ndarrays and additionally accepts
    # the plain-float defaults, which from_numpy rejects with a TypeError.
    state_mean = torch.as_tensor(state_mean).to(device=device)
    state_std = torch.as_tensor(state_std).to(device=device)

    observations = paths[0]['observations']
    logged_actions = paths[0]['actions']

    squared_errors = []
    for i, observation in enumerate(observations):
        action_ph = logged_actions[i]

        # Length-one history: current observation plus zero padding.
        states = torch.from_numpy(observation).reshape(1, state_dim).to(
            device=device, dtype=torch.float32)
        actions = torch.zeros((1, act_dim), device=device, dtype=torch.float32)
        rewards = torch.zeros(1, device=device, dtype=torch.float32)
        # Built from the untouched scalar each step; the parameter itself is
        # never rebound to a tensor.
        returns_to_go = torch.tensor(
            target_return, device=device, dtype=torch.float32).reshape(1, 1)
        timesteps = torch.zeros((1, 1), device=device, dtype=torch.long)

        _, action, _ = model.get_action(
            (states - state_mean) / state_std,
            actions,
            rewards,
            returns_to_go,
            timesteps,
        )
        action = action.detach().cpu().numpy()

        squared_errors.append(np.mean((action - action_ph) ** 2))

    return np.mean(squared_errors)
# Earlier and later steps are unrelated: every step is evaluated on its own.
def evaluate_episode_rtg_2(
        state_dim,
        act_dim,
        model,
        paths,
        max_ep_len=1000,
        scale=1000.,
        state_mean=0.,
        state_std=1.,
        device='cuda',
        target_return=None,
        mode='normal',
    ):
    """Query the model once per logged step and collect its predictions.

    Every step of every trajectory in ``paths`` is evaluated in isolation:
    the model sees a history of length one (the logged observation, a zero
    action, a zero reward, the constant ``target_return`` and timestep 0).
    Nothing predicted at one step is fed into the next.

    Args:
        state_dim: Size of one observation vector.
        act_dim: Size of one action vector.
        model: Object exposing ``eval()``, ``to(device=...)`` and
            ``get_action(states, actions, rewards, returns_to_go, timesteps)``
            returning ``(state_pred, action_pred, return_pred)``.
        paths: Sequence of dicts with ``'observations'``, ``'rewards'``,
            ``'actions'`` and ``'dieds'`` entries, indexable per step.
        max_ep_len: Unused; each trajectory is walked over its full length.
        scale: Unused.
        state_mean: Array, tensor or scalar subtracted from each observation.
        state_std: Array, tensor or scalar each observation is divided by.
        device: Torch device the tensors are placed on.
        target_return: Scalar return-to-go given to the model at every step.
            Must be provided (the ``None`` default cannot be tensorised).
        mode: Unused.

    Returns:
        Tuple ``(reward_agent, reward_phy, state_agent, state_phy,
        action_agent, action_phy, die)`` of per-step lists. ``*_agent`` hold
        model outputs (``reward_agent`` stores the absolute value of the
        predicted return), ``*_phy`` and ``die`` hold the logged values.
    """
    model.eval()
    model.to(device=device)

    # as_tensor behaves like from_numpy for ndarrays and additionally accepts
    # the plain-float defaults, which from_numpy rejects with a TypeError.
    state_mean = torch.as_tensor(state_mean).to(device=device)
    state_std = torch.as_tensor(state_std).to(device=device)

    action_agent, action_phy = [], []
    state_agent, state_phy = [], []
    reward_agent, reward_phy = [], []
    die = []

    for path in paths:
        states_phy = path['observations']
        rewards_phy = path['rewards']
        actions_phy = path['actions']

        for t in range(len(states_phy)):
            # Length-one history: current observation plus zero padding.
            states = torch.from_numpy(states_phy[t]).reshape(1, state_dim).to(
                device=device, dtype=torch.float32)
            actions = torch.zeros((1, act_dim), device=device, dtype=torch.float32)
            rewards = torch.zeros(1, device=device, dtype=torch.float32)
            # Built from the untouched scalar each step; the parameter itself
            # is never rebound to a tensor.
            returns_to_go = torch.tensor(
                target_return, device=device, dtype=torch.float32).reshape(1, 1)
            timesteps = torch.zeros((1, 1), device=device, dtype=torch.long)

            state_pred, action_pred, return_pred = model.get_action(
                (states - state_mean) / state_std,
                actions,
                rewards,
                returns_to_go,
                timesteps,
            )

            reward_agent.append(np.abs(return_pred.cpu().detach().numpy()))
            reward_phy.append(rewards_phy[t])

            action_phy.append(actions_phy[t])
            action_agent.append(action_pred)
            die.append(path['dieds'][t])
            state_agent.append(state_pred.detach().cpu().numpy())
            state_phy.append(states_phy[t])

    return reward_agent, reward_phy, state_agent, state_phy, action_agent, action_phy, die

def my_evaluate_episode_rtg(state_dim,act_dim,model,paths,max_ep_len=1000,scale=1000.,state_mean=0.,state_std=1.,
                        device='cuda',target_return=None,mode='delayed',):
    """Autoregressively roll the model forward along each logged trajectory.

    For every trajectory the history is seeded with the first logged
    observation, action and reward. The model is then called repeatedly and
    its own predicted action, return and next state are appended to the
    history used for the following call.

    Besides the per-step predictions, dosage statistics are collected when
    the logged actions have two components. NOTE(review): index 0 is named
    "iv" and index 1 "vaso" in this code, presumably IV-fluid and
    vasopressor doses, and "phy" presumably refers to the clinician's logged
    behaviour; confirm against the dataset.

    Args:
        state_dim: Size of one observation vector.
        act_dim: Size of one action vector.
        model: Object exposing ``eval()``, ``to(device=...)`` and
            ``get_action(states, actions, rewards, returns_to_go, timesteps)``
            returning ``(state_preds, action_preds, return_preds, act_loss)``
            where the first three are indexed as ``[0, -1]``, ``[0, -1]`` and
            ``[0, -1, 0]`` respectively.
        paths: Sequence of dicts with ``'observations'``, ``'actions'`` and
            ``'rewards'`` entries, indexable per step.
        max_ep_len: Upper bound on the number of observations considered per
            trajectory; ``min(len(observations), max_ep_len) - 1`` model
            calls are made.
        scale: Divisor applied to the predicted return when the
            return-to-go is decremented (non-``'delayed'`` modes).
        state_mean: Array, tensor or scalar subtracted from the observations.
        state_std: Array, tensor or scalar the observations are divided by.
        device: Torch device the tensors are placed on.
        target_return: Initial return-to-go for every trajectory. Must be
            provided when ``paths`` is non-empty.
        mode: ``'delayed'`` keeps the return-to-go constant; any other value
            decrements it by ``0.99 * return_pred / scale`` per step.

    Returns:
        ``(reward_agent, reward_phy, state_agent, state_phy, action_agent,
        action_phy, die, act_loss, a, b, c)``:

        * the six ``*_agent`` / ``*_phy`` lists hold per-step predictions and
          logged values; the final model call of each trajectory is not
          recorded in them (it still counts towards the dosage statistics);
        * ``die`` is always an empty list;
        * ``act_loss`` is the value from the last model call, or ``None`` if
          the model was never called;
        * ``a`` / ``b`` are the mean per-trajectory summed agent dose minus
          the logged one for action index 0 / 1, and ``c`` is the same
          difference for the largest step-to-step increase of index 1.
          All three are 0 when ``paths`` is empty or the last trajectory's
          actions are not two-dimensional.
    """
    model.eval()
    model.to(device=device)

    # as_tensor behaves like from_numpy for ndarrays and additionally accepts
    # the plain-float defaults, which from_numpy rejects with a TypeError.
    state_mean = torch.as_tensor(state_mean).to(device=device)
    state_std = torch.as_tensor(state_std).to(device=device)

    action_agent, action_phy = [], []
    state_agent, state_phy = [], []
    reward_agent, reward_phy = [], []
    die = []  # never populated; kept so the return tuple keeps its layout

    sum_agent_ivs, sum_agent_vasos = [], []
    sum_phy_ivs, sum_phy_vasos = [], []
    delta_phy_vasos, delta_agent_vasos = [], []

    # Defined up front so the return statement is valid even when the model
    # is never called (empty ``paths`` or single-observation trajectories).
    act_loss = None
    two_component_actions = False

    for path in paths:
        states = path['observations']
        actions = path['actions']
        rewards = path['rewards']

        # Matches the original behaviour: the decision is taken from the
        # first action of the last trajectory processed.
        two_component_actions = len(actions[0]) == 2

        # Seed every history with the first logged transition.
        actions_dt = torch.from_numpy(actions[0]).reshape(1, act_dim).to(device=device, dtype=torch.float32)
        states_dt = torch.from_numpy(states[0]).reshape(1, state_dim).to(device=device, dtype=torch.float32)
        rewards_dt = torch.tensor([rewards[0]], device=device, dtype=torch.float32)

        returns_to_go = torch.tensor(target_return, device=device, dtype=torch.float32).reshape(1, 1)
        # Timesteps start at 1 here, not 0.
        timesteps = torch.tensor(1, device=device, dtype=torch.long).reshape(1, 1)

        num = min(len(states), max_ep_len) - 1

        sum_agent_iv = 0
        sum_agent_vaso = 0
        sum_phy_iv = 0
        sum_phy_vaso = 0

        # Running maxima of the step-to-step increase of action index 1.
        delta_phy_vaso = -99999
        prev_phy_vaso = 0
        delta_agent_vaso = -99999
        prev_agent_vaso = 0

        for j in range(num):
            state_preds, action_preds, return_preds, act_loss = model.get_action(
                (states_dt.to(dtype=torch.float32) - state_mean) / state_std,
                actions_dt.to(dtype=torch.float32),
                rewards_dt.to(dtype=torch.float32),
                returns_to_go.to(dtype=torch.float32),
                timesteps.to(dtype=torch.long),
            )

            actionp = action_preds[0, -1]
            returnp = return_preds[0, -1, 0]
            statep = state_preds[0, -1]

            # Extend the histories with the model's own predictions.
            actions_dt = torch.cat([actions_dt, torch.zeros((1, act_dim), device=device)], dim=0)
            rewards_dt = torch.cat([rewards_dt, torch.zeros(1, device=device)])
            actions_dt[-1] = actionp
            rewards_dt[-1] = returnp

            if len(actions[j]) == 2:
                agent_action = actionp.detach().cpu().numpy()
                phy_iv, phy_vaso = actions[j][0], actions[j][1]
                agent_iv, agent_vaso = agent_action[0], agent_action[1]

                sum_phy_iv += phy_iv
                sum_phy_vaso += phy_vaso
                sum_agent_iv += agent_iv
                sum_agent_vaso += agent_vaso

                delta_phy_vaso = max(phy_vaso - prev_phy_vaso, delta_phy_vaso)
                prev_phy_vaso = phy_vaso
                # Compare against the agent's own running maximum (the
                # original compared against the logged one by mistake).
                delta_agent_vaso = max(agent_vaso - prev_agent_vaso, delta_agent_vaso)
                prev_agent_vaso = agent_vaso

            if mode != 'delayed':
                treturnp = returns_to_go[0, -1] - 0.99 * (returnp / scale)
            else:
                treturnp = returns_to_go[0, -1]

            returns_to_go = torch.cat([returns_to_go, treturnp.view(1, 1)], dim=1)
            timesteps = torch.cat(
                [timesteps, torch.ones((1, 1), device=device, dtype=torch.long) * (j + 2)], dim=1)

            if j == num - 1:
                # The final call only contributes to the dosage statistics.
                break

            states_dt = torch.cat([states_dt, statep.view(1, state_dim)], dim=0)

            reward_phy.append(torch.tensor(rewards[j]))  # r_j
            reward_agent.append(returnp)

            action_phy.append(actions[j])  # a_j
            action_agent.append(actionp.detach().cpu().numpy())

            state_phy.append(states[j + 1])  # s_{j+1}
            state_agent.append(statep.detach().cpu().numpy())

        sum_agent_ivs.append(sum_agent_iv)
        sum_agent_vasos.append(sum_agent_vaso)
        sum_phy_ivs.append(sum_phy_iv)
        sum_phy_vasos.append(sum_phy_vaso)

        delta_phy_vasos.append(delta_phy_vaso)
        delta_agent_vasos.append(delta_agent_vaso)

    if two_component_actions:
        a = np.mean(sum_agent_ivs) - np.mean(sum_phy_ivs)
        b = np.mean(sum_agent_vasos) - np.mean(sum_phy_vasos)
        c = np.mean(delta_agent_vasos) - np.mean(delta_phy_vasos)
    else:
        a = b = c = 0

    return reward_agent,reward_phy,state_agent,state_phy,action_agent,action_phy,die,act_loss,a,b,c
