# docs and experiment results can be found at https://docs.cleanrl.dev/rl-algorithms/dqn/#dqnpy
import argparse
import os
import random
import time
from distutils.util import strtobool

import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from stable_baselines3.common.buffers import ReplayBuffer
from torch.utils.tensorboard import SummaryWriter

from games.grid_shooting import GridShootingVSRandom,GridShootingVSGiven
import numpy as np
import json

from cleanrl.models import QNetwork, LatentMLP, Encoder, ClassMLP, NPMLP, NPMLPLORA

# Size of the latent code z. It is passed to the NPMLPLORA actors as
# `latent_dim`, used to build one-hot z vectors via `torch.eye(latent_dim)`,
# and evaluation runs one rollout pass per latent index.
# NOTE(review): the script also hard-codes a width of 2 in places
# (`data.next_observations[:,1:3]`, `eval_gt_z`, `actions0`/`actions1`) —
# confirm those before changing this value.
latent_dim=2

def _str_to_bool(value):
    """Convert a command-line truth string into a ``bool``.

    Accepts the same case-insensitive spellings as the removed
    ``distutils.util.strtobool``: ``y/yes/t/true/on/1`` map to ``True`` and
    ``n/no/f/false/off/0`` map to ``False``.

    Raises:
        argparse.ArgumentTypeError: if *value* is not one of those spellings,
            which argparse reports as a normal usage error.
    """
    lowered = value.lower()
    if lowered in ("y", "yes", "t", "true", "on", "1"):
        return True
    if lowered in ("n", "no", "f", "false", "off", "0"):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def parse_args():
    """Parse the command-line options of the experiment.

    Boolean options accept an optional explicit value (``--flag``,
    ``--flag true`` or ``--flag false``).

    Returns:
        argparse.Namespace: the parsed options.

    Raises:
        AssertionError: if ``--num-envs`` is not 1 (vectorized environments
            are not supported).
    """
    # fmt: off
    parser = argparse.ArgumentParser()
    # splitext removes only the extension; str.rstrip(".py") would also eat
    # trailing 'p', 'y' and '.' characters of the script name itself.
    parser.add_argument("--exp-name", type=str, default=os.path.splitext(os.path.basename(__file__))[0],
        help="the name of this experiment")
    parser.add_argument("--seed", type=int, default=1,
        help="seed of the experiment")
    parser.add_argument("--torch-deterministic", type=_str_to_bool, default=True, nargs="?", const=True,
        help="if toggled, `torch.backends.cudnn.deterministic=False`")
    parser.add_argument("--cuda", type=_str_to_bool, default=True, nargs="?", const=True,
        help="if toggled, cuda will be enabled by default")
    parser.add_argument("--track", type=_str_to_bool, default=False, nargs="?", const=True,
        help="if toggled, this experiment will be tracked with Weights and Biases")
    parser.add_argument("--wandb-project-name", type=str, default="cleanRL",
        help="the wandb's project name")
    parser.add_argument("--wandb-entity", type=str, default=None,
        help="the entity (team) of wandb's project")
    parser.add_argument("--capture-video", type=_str_to_bool, default=False, nargs="?", const=True,
        help="whether to capture videos of the agent performances (check out `videos` folder)")
    parser.add_argument("--save-model", type=_str_to_bool, default=False, nargs="?", const=True,
        help="whether to save model into the `runs/{run_name}` folder")
    parser.add_argument("--upload-model", type=_str_to_bool, default=False, nargs="?", const=True,
        help="whether to upload the saved model to huggingface")
    parser.add_argument("--hf-entity", type=str, default="",
        help="the user or org name of the model repository from the Hugging Face Hub")

    # Algorithm specific arguments
    parser.add_argument("--env-id", type=str, default="CartPole-v1",
        help="the id of the environment")
    parser.add_argument("--total-timesteps", type=int, default=1000010,
        help="total timesteps of the experiments")
    parser.add_argument("--learning-rate", type=float, default=2.5e-4,
        help="the learning rate of the optimizer")
    parser.add_argument("--num-envs", type=int, default=1,
        help="the number of parallel game environments")
    parser.add_argument("--buffer-size", type=int, default=100000,
        help="the replay memory buffer size")
    parser.add_argument("--gamma", type=float, default=0.99,
        help="the discount factor gamma")
    parser.add_argument("--tau", type=float, default=1.,
        help="the target network update rate")
    parser.add_argument("--target-network-frequency", type=int, default=500,
        help="the timesteps it takes to update the target network")
    parser.add_argument("--batch-size", type=int, default=128,
        help="the batch size of sample from the reply memory")
    parser.add_argument("--start-e", type=float, default=1,
        help="the starting epsilon for exploration")
    parser.add_argument("--end-e", type=float, default=0.05,
        help="the ending epsilon for exploration")
    parser.add_argument("--exploration-fraction", type=float, default=0.5,
        help="the fraction of `total-timesteps` it takes from start-e to go end-e")
    parser.add_argument("--learning-starts", type=int, default=10000,
        help="timestep to start learning")
    parser.add_argument("--train-frequency", type=int, default=10,
        help="the frequency of training")
    parser.add_argument("--save-model-frequency", type=int, default=200000,
        help="how often (in timesteps) to save the actor model")
    parser.add_argument("--eval-frequency", type=int, default=10000,
        help="how often (in timesteps) to run evaluation")
    parser.add_argument("--offline_dir1", type=str, default='/home/wjh/myh/cleanrl-master/useful_models/buffer_star',
        help="directory of the first offline dataset (observations.npy, actions.npy, dones.npy)")
    parser.add_argument("--offline_dir2", type=str, default='/home/wjh/myh/cleanrl-master/useful_models/buffer_onlyshoot',
        help="directory of the optional second offline dataset; pass '' to disable it")
    # type=bool would turn every non-empty string (including "False") into
    # True, so this flag uses the same boolean parser as the other toggles.
    parser.add_argument("--gt-z", type=_str_to_bool, default=False, nargs="?", const=True,
        help="if toggled, use the dataset label stored in the buffer as z instead of the inferred z")
    parser.add_argument("--load_q", type=str, default='/home/wjh/myh/cleanrl-master/useful_models/load_q/dqnpg_env_500000.cleanrl_model',
        help="path of the pretrained Q-network state dict to load")
    args = parser.parse_args()
    # fmt: on
    assert args.num_envs == 1, "vectorized envs are not supported at the moment"

    return args


def make_env(env_id, seed, idx, capture_video, run_name):
    """Return a zero-argument factory that builds one gymnasium environment.

    Args:
        env_id: id passed to ``gym.make``.
        seed: seed applied to the environment's action space.
        idx: index of this environment; only index 0 records video.
        capture_video: whether video recording is requested.
        run_name: name of the run, used for the ``videos/{run_name}`` folder.

    Returns:
        A callable that creates the environment when invoked.
    """

    def thunk():
        record_video = capture_video and idx == 0
        if not record_video:
            base_env = gym.make(env_id)
        else:
            # Rendering to rgb arrays is required for RecordVideo to capture frames.
            base_env = gym.wrappers.RecordVideo(
                gym.make(env_id, render_mode="rgb_array"),
                f"videos/{run_name}",
            )
        wrapped_env = gym.wrappers.RecordEpisodeStatistics(base_env)
        wrapped_env.action_space.seed(seed)
        return wrapped_env

    return thunk


# ALGO LOGIC: initialize agent here:

def linear_schedule(start_e: float, end_e: float, duration: int, t: int):
    """Linearly interpolate from ``start_e`` towards ``end_e`` over ``duration`` steps.

    The value at step ``t`` follows the straight line through ``start_e`` at
    ``t == 0`` and ``end_e`` at ``t == duration``, and is never allowed to
    drop below ``end_e``.
    """
    per_step_change = (end_e - start_e) / duration
    annealed = per_step_change * t + start_e
    # Floor at end_e once the line has passed it.
    return end_e if end_e > annealed else annealed


if __name__ == "__main__":
    import re

    import stable_baselines3 as sb3

    # Compare the numeric (major, minor) version. A plain string comparison is
    # lexicographic, so a version such as "10.0" would sort before "2.0" and be
    # rejected by mistake.
    sb3_version_match = re.match(r"(\d+)\.(\d+)", sb3.__version__)
    if sb3_version_match is not None and tuple(map(int, sb3_version_match.groups())) < (2, 0):
        raise ValueError(
            """Ongoing migration: run the following command to install the new dependencies:

poetry run pip install "stable_baselines3==2.0.0a1"
"""
        )
    args = parse_args()
    # The run name identifies the output folder under `runs/`.
    run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}"
    if args.track:
        import wandb

        wandb.init(
            project=args.wandb_project_name,
            entity=args.wandb_entity,
            sync_tensorboard=True,
            config=vars(args),
            name=run_name,
            monitor_gym=True,
            save_code=True,
        )
    writer = SummaryWriter(f"runs/{run_name}")
    # Record every hyperparameter as a markdown table in TensorBoard.
    writer.add_text(
        "hyperparameters",
        "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
    )

    # TRY NOT TO MODIFY: seeding
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.backends.cudnn.deterministic = args.torch_deterministic

    # Use CUDA only when it is both available and requested via --cuda.
    device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")

    # env setup
    envs = GridShootingVSRandom()
    eval_env = GridShootingVSRandom()
    # NOTE(review): env_list is not referenced anywhere else in this file; it is
    # kept because constructing the environments may have side effects.
    env_list = [GridShootingVSRandom() for _ in range(args.batch_size)]
    assert isinstance(envs.single_action_space, gym.spaces.Discrete), "only discrete action space is supported"

    state_dim=np.array(envs.single_observation_space.shape).prod()
    action_size=envs.single_action_space.n

    # The Q-network is not trained in this script: its weights are loaded from
    # a checkpoint and no optimizer is created for it.
    q_network = QNetwork(envs,latent_dim=0).to(device)
    assert args.load_q!=''
    # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only
    # machine (and vice versa) instead of failing during deserialization.
    q_network.load_state_dict(torch.load(args.load_q, map_location=device))

    actor_network = NPMLPLORA(envs,latent_dim=latent_dim).to(device)
    actor_optimizer = optim.Adam(actor_network.parameters(), lr=args.learning_rate)

    # Disabled alternative: share the Q-network layers with the actor.
    # actor_network.linear1.weight=q_network.network[0].weight
    # actor_network.linear2.weight=q_network.network[2].weight
    # actor_network.linear3.weight=q_network.network[4].weight
    # actor_network.linear1.bias=q_network.network[0].bias
    # actor_network.linear2.bias=q_network.network[2].bias
    # actor_network.linear3.bias=q_network.network[4].bias

    actor_network_cls = NPMLPLORA(envs,latent_dim=latent_dim).to(device)
    actor_optimizer_cls = optim.Adam(actor_network_cls.parameters(), lr=args.learning_rate)

    # Disabled alternative: share the Q-network layers with the second actor.
    # actor_network_cls.linear1.weight=q_network.network[0].weight
    # actor_network_cls.linear2.weight=q_network.network[2].weight
    # actor_network_cls.linear3.weight=q_network.network[4].weight
    # actor_network_cls.linear1.bias=q_network.network[0].bias
    # actor_network_cls.linear2.bias=q_network.network[2].bias
    # actor_network_cls.linear3.bias=q_network.network[4].bias

    # Evaluation uses the very same module object as actor_network_cls (an
    # alias, not a copy).
    actor_network_eval=actor_network_cls

    # Buffer holding the offline transitions.
    rb_offline = ReplayBuffer(
        args.buffer_size,
        envs.single_observation_space,
        envs.single_action_space,
        device,
        handle_timeout_termination=False,
    )
    
    # Load every offline dataset, rebuild its episodes as zero-padded
    # trajectories, push the leading part of each dataset into `rb_offline`
    # and keep the remaining steps as a held-out evaluation set.
    assert args.offline_dir1!=''
    offline_dir_list=[args.offline_dir1]
    if args.offline_dir2!='':
        offline_dir_list.append(args.offline_dir2)
    
    # Per-dataset pieces; they are concatenated after the loop.
    o_traj_list=[]
    a_traj_list=[]
    mask_list=[]
    eval_o_list=[]
    eval_a_list=[]
    eval_o_traj_list=[]
    eval_a_traj_list=[]
    eval_mask_list=[]
    eval_gt_z_list=[]
    # Cumulative boundaries of each dataset inside the concatenated eval arrays.
    eval_split=[0]
    # Number of trajectories contributed by the datasets processed so far, so
    # that trajectory indices stay valid after concatenation.
    offset=0
    for i_dir,offline_dir in enumerate(offline_dir_list):
        # Padded trajectory length; an episode longer than this would not fit
        # into the slices assigned below.
        episode_limit=100
        o,a,done=np.load(offline_dir+'/observations.npy'),np.load(offline_dir+'/actions.npy'),np.load(offline_dir+'/dones.npy')
        #o,a,done=o[:1000],a[:1000],done[:1000]#debug
        # Indices of the steps where done==1, i.e. the last step of each episode.
        split=np.where(done==1)[0]
        # Drop whatever follows the last finished episode.
        o,a=o[:split[-1]+1],a[:split[-1]+1]
        #eye=np.eye(action_size)
        # One row per episode; `mask` is 1 on real steps and 0 on padding.
        o_traj=np.zeros((len(split),episode_limit,o.shape[-1]))
        a_traj=np.zeros((len(split),episode_limit))
        mask=np.zeros((len(split),episode_limit))
        for i in range(len(split)):
            if i==0:
                # The first episode starts at step 0.
                o_traj[i,:split[0]+1]=o[:split[0]+1]
                a_traj[i,:split[0]+1]=a[:split[0]+1]
                mask[i,:split[0]+1]=1
            else:
                # Later episodes start right after the previous episode's end.
                o_traj[i,:split[i]-split[i-1]]=o[split[i-1]+1:split[i]+1]
                a_traj[i,:split[i]-split[i-1]]=a[split[i-1]+1:split[i]+1]
                mask[i,:split[i]-split[i-1]]=1
    
        # Fraction of episodes used for training; the rest is held out.
        ratio=0.9
        # Index of the episode that contains the step currently being visited.
        now=0
        split_traj_n=int(ratio*len(split))
        # First step index that belongs to the held-out part.
        split_i=split[split_traj_n]+1
        # One padded trajectory per held-out *step* (not per episode).
        eval_o_traj=np.zeros((split[-1]+1-split_i,episode_limit,state_dim))
        eval_a_traj=np.zeros((split[-1]+1-split_i,episode_limit))
        eval_mask=np.zeros((split[-1]+1-split_i,episode_limit))
        for i in range(len(o)):
            # The "next observation" slot of the buffer does not hold a real
            # observation: element 0 carries the global trajectory index and
            # element 1+i_dir flags the source dataset. The training loop
            # reads these back through `data.next_observations`.
            next_o=np.zeros_like(o[i])
            next_o[0]=now+offset
            next_o[1+i_dir]=1
            #if random.random()<0.5:
            #    next_o[2]=1
            #else:
            #    next_o[1]=1
            if i<split_i:
                # Training step: stored with a reward of 0 and done=False.
                rb_offline.add(o[i], next_o, a[i], [0], False, {})
            else:
                # Held-out step: attach the full trajectory of its episode.
                eval_o_traj[i-split_i,:split[now]-split[now-1],:]=o[split[now-1]+1:split[now]+1]
                eval_a_traj[i-split_i,:split[now]-split[now-1]]=a[split[now-1]+1:split[now]+1]
                eval_mask[i-split_i,:split[now]-split[now-1]]=1
            if i==split[now]:
                # The current episode just ended; move to the next one.
                now+=1
        eval_o_traj=torch.tensor(eval_o_traj).to(device).float()
        eval_a_traj=torch.tensor(eval_a_traj).to(device).float()
        eval_mask=torch.tensor(eval_mask).to(device).float()
        eval_o=np.array(o[split_i:])
        eval_a=np.array(a[split_i:])
        # One-hot label of the source dataset; the width is hard-coded to 2.
        eval_gt_z=torch.zeros((eval_o.shape[0],2)).to(device).float()
        eval_gt_z[:,i_dir]=1
        
        o_traj_list.append(o_traj)
        a_traj_list.append(a_traj)
        mask_list.append(mask)
        eval_o_traj_list.append(eval_o_traj)
        eval_a_traj_list.append(eval_a_traj)
        eval_mask_list.append(eval_mask)
        eval_o_list.append(eval_o)
        eval_a_list.append(eval_a)
        eval_gt_z_list.append(eval_gt_z)
        eval_split.append(eval_split[-1]+len(eval_o))
        
        offset+=len(o_traj)
    # Merge the per-dataset pieces; row order follows `offline_dir_list`.
    o_traj=np.concatenate(o_traj_list,axis=0)
    a_traj=np.concatenate(a_traj_list,axis=0)
    mask=np.concatenate(mask_list,axis=0)
    eval_o=torch.tensor(np.concatenate(eval_o_list,axis=0)).to(device).float()
    eval_a=np.concatenate(eval_a_list,axis=0)
    eval_a_torch=torch.tensor(eval_a).to(device).float()
    eval_o_traj=torch.cat(eval_o_traj_list,dim=0)
    eval_a_traj=torch.cat(eval_a_traj_list,dim=0)
    eval_mask=torch.cat(eval_mask_list,dim=0)
    eval_gt_z=torch.cat(eval_gt_z_list,axis=0)
        
    start_time = time.time()

    # TRY NOT TO MODIFY: start the game
    obs = envs.reset()
    all_ret,all_win=[],[]
    start_t=time.time()
    # No encoder is trained here; the value is only logged as a constant.
    encoder_loss=0

    for global_step in range(args.total_timesteps):
        # NOTE: training is purely offline. No environment step or
        # epsilon-greedy rollout happens inside this loop; every batch comes
        # from `rb_offline`.

        # ALGO LOGIC: training.
        # `True or` keeps training enabled from step 0 regardless of
        # --learning-starts.
        if True or global_step > args.learning_starts:
            if global_step % args.train_frequency == 0:
                data = rb_offline.sample(args.batch_size)
                # Element 0 of the stored "next observation" is the index of
                # the trajectory the sampled step belongs to.
                traj_idx=data.next_observations[:,0].cpu().int().numpy()
                batch_os=torch.tensor(o_traj[traj_idx]).to(device).float()
                batch_as=torch.tensor(a_traj[traj_idx]).to(device).float()
                src_mask=torch.tensor(mask[traj_idx]).to(device).float()
                if args.gt_z:
                    # Dataset flags written into the buffer at load time.
                    z=data.next_observations[:,1:3]
                else:
                    with torch.no_grad():
                        z = actor_network_cls.select(batch_os,batch_as,src_mask)
                        #z = actor_network_cls.selectv2(batch_os,batch_as,src_mask)
                pred = actor_network_cls.forward_all_head(data.observations)#actor_network_cls(data.observations,z)

                # One-hot version of z. The identity matrix is created on
                # `device` because indexing a CPU tensor with indices that live
                # on another device raises a RuntimeError.
                z_oh=torch.eye(latent_dim, device=device)[z.argmax(-1)].float()

                # The critic is frozen (it has no optimizer), so no autograd
                # graph is needed; this also stops stale gradients from piling
                # up on its parameters.
                with torch.no_grad():
                    q = q_network(data.observations)
                #q=q-q.mean(-1,keepdim=True)
                q_data=q.gather(1, data.actions)#(bs,1)
                # Softmax over the batch dimension: the weights sum to 1.
                adv=F.softmax(q_data*0.1,dim=0)#(bs,1)

                # Keep only the head selected by z_oh, then maximise the
                # log-probability of the dataset action under that head.
                #prob=(F.softmax(pred,-1).clamp(1e-4,1)*z.transpose(1,0).unsqueeze(2)).sum(dim=0)
                prob=(F.softmax(pred,-1).clamp(1e-4,1)*z_oh.transpose(1,0).unsqueeze(2)).sum(dim=0)
                actor_loss_cls = (-prob.log().gather(1,data.actions)).mean()
                # Alternative formulations that were tried:
                # #softmax.log.gather*z
                # prob=F.softmax(pred,-1).clamp(1e-4,1)
                # gathered_prob=prob.log().gather(2,data.actions.unsqueeze(0).repeat(latent_dim,1,1))
                # gathered_prob=gathered_prob.squeeze(2)
                # actor_loss_cls = (-gathered_prob*z.transpose(1,0)).mean()
                # #logit
                # score=(pred*z.transpose(1,0).unsqueeze(2)).sum(dim=0)
                # actor_loss_cls = (-F.softmax(score,-1).clamp(1e-4,1).log().gather(1,data.actions)).mean()

                actor_optimizer_cls.zero_grad()
                actor_loss_cls.backward()
                actor_optimizer_cls.step()

                #style kl
                # NOTE(review): this loss is computed through
                # `actor_network_eval` (the same object as `actor_network_cls`)
                # but stepped with `actor_optimizer`, which owns the parameters
                # of `actor_network`. As written the step updates nothing and
                # the gradients are cleared by the next
                # `actor_optimizer_cls.zero_grad()`. Confirm which network this
                # loss is meant to train.
                actor_loss=0
                # Flatten to (bs,) so the weighting below is element-wise. With
                # the (bs,1) shape the product broadcast to (bs,bs), and since
                # the weights sum to 1 the advantage term cancelled out.
                adv_flat=adv.squeeze(1)
                for i in range(latent_dim):
                    z_i=torch.zeros_like(z)
                    z_i[:,i]=1
                    pred_i=actor_network_eval(data.observations,z_i)
                    log_prob_i=F.softmax(pred_i,-1).clamp(1e-4,1).log().gather(1,data.actions).squeeze(1)
                    actor_loss+=(-log_prob_i*adv_flat*z[:,i]).sum()

                actor_optimizer.zero_grad()
                actor_loss.backward()
                actor_optimizer.step()

                if global_step % 100 == 0:
                    writer.add_scalar("losses/actor_loss", actor_loss_cls, global_step)
                    writer.add_scalar("losses/encoder_loss", encoder_loss, global_step)
                    writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step)

            # Checkpointing is switched off by the leading `False and`.
            if False and global_step % args.save_model_frequency==0:
                model_path = f"runs/{run_name}/actor_{args.exp_name}_{global_step}.cleanrl_model"
                torch.save(actor_network.state_dict(), model_path)

            if global_step % args.eval_frequency==0:
                if global_step==100000:
                    # One-off debug dump of the inferred z for the last batch.
                    print('z',actor_network_eval.select(batch_os,batch_as,src_mask)[:20])
                return_mean=[]
                win_mean=[]
                star_mean,shoot_mean=[],[]
                n_eval=1000
                # Roll out n_eval greedy episodes for every one-hot latent code.
                for z_i in range(latent_dim):
                    return_list,win_list=[],[]
                    star_list,shoot_list=[],[]
                    eval_z=torch.eye(latent_dim, device=device)[z_i]
                    for _ in range(n_eval):
                        obs=eval_env.reset()
                        terminated=False
                        while not terminated:
                            with torch.no_grad():
                                scores = actor_network_eval(torch.Tensor(obs).to(device),eval_z)
                                actions = torch.argmax(scores, dim=-1).cpu().numpy()
                            next_obs, rewards, terminated, infos = eval_env.step(actions)
                            obs=next_obs
                        episode_info=infos['final_info'][0]['episode']
                        return_list.append(episode_info['r'])
                        win_list.append(episode_info['win'])
                        star_list.append(episode_info['star'])
                        shoot_list.append(episode_info['shoot'])
                    return_mean.append(np.mean(return_list))
                    win_mean.append(np.mean(win_list))
                    star_mean.append(np.mean(star_list))
                    shoot_mean.append(np.mean(shoot_list))

                # Action prediction on the held-out steps: once per fixed
                # latent code (0 and 1) and once with the inferred z.
                with torch.no_grad():
                    z=torch.eye(latent_dim, device=device)[0]
                    scores = actor_network_eval(eval_o,z)
                    actions0 = torch.argmax(scores, dim=-1).cpu().numpy()
                    z=torch.eye(latent_dim, device=device)[1]
                    scores = actor_network_eval(eval_o,z)
                    actions1 = torch.argmax(scores, dim=-1).cpu().numpy()
                    z=actor_network_eval.select(eval_o_traj,eval_a_traj,eval_mask)
                    scores = actor_network_eval(eval_o,z)
                    actions_all = torch.argmax(scores, dim=-1).cpu().numpy()

                # Accuracy of each fixed latent code, reported per dataset.
                acc_cat0,acc_cat1=[],[]
                for seg_start,seg_end in zip(eval_split[:-1],eval_split[1:]):
                    acc_cat0.append((actions0[seg_start:seg_end]==eval_a[seg_start:seg_end]).mean())
                    acc_cat1.append((actions1[seg_start:seg_end]==eval_a[seg_start:seg_end]).mean())
                acc_all=(actions_all==eval_a).mean()

                print('test acc',acc_all)

                print(global_step,'evaluation in',n_eval,'times',return_mean,win_mean,star_mean,shoot_mean,acc_cat0,acc_cat1,'time',time.time()-start_t)
                start_t=time.time()

                all_ret.append(return_mean)
                all_win.append(win_mean)
                # Rewrite the full history on every evaluation; the context
                # managers make sure each file is flushed and closed.
                with open(f"runs/{run_name}/ret.json",'w') as ret_file:
                    json.dump(all_ret,ret_file)
                with open(f"runs/{run_name}/win.json",'w') as win_file:
                    json.dump(all_win,win_file)

    envs.close()
    writer.close()
