import torch
import os
from datetime import datetime
import numpy as np
from mpi4py import MPI
from mpi_utils.mpi_utils import sync_networks, sync_grads
from rl_modules.replay_buffer import replay_buffer
from rl_modules.models import actor, SAEncoder, GEncoder
from mpi_utils.normalizer import normalizer
from her_modules.her1cons import her_sampler
from her_modules.herAnt import her_sampler as HERANT

class ddpg_ConsHER:
    def __init__(self, args, alg, env, env_name, env_params):
        """Build networks, optimizers, HER sampler, replay buffer and normalizers.

        Args:
            args: Hyperparameter namespace. CRL options that are missing
                (``lambda_contrast``, ``repr_dim``, ``target_entropy``,
                ``logsumexp_penalty_coeff``) are filled with defaults and
                written back onto this object.
            alg: Algorithm label, used in the checkpoint/sample directory names.
            env: Environment; its ``compute_reward`` is handed to the HER sampler.
            env_name: Environment name. Names starting with ``'Ant'`` select
                the Ant-specific HER sampler.
            env_params: Dict read here for the ``'obs'``, ``'goal'`` and
                ``'action'`` sizes; also passed to the network constructors,
                HER sampler and replay buffer.
        """
        print('ddpg_ConsHER')
        self.args = args
        self.env = env
        self.alg = alg
        self.env_name = env_name
        self.env_params = env_params

        # CRL-specific hyperparameters (defaults applied only when absent on args)
        self.args.lambda_contrast = getattr(args, 'lambda_contrast', 0.1)  # Weight for contrastive loss
        self.args.repr_dim = getattr(args, 'repr_dim', 64)  # Representation dimension
        self.args.target_entropy = getattr(args, 'target_entropy', -env_params['action'])  # Target entropy for exploration
        self.args.logsumexp_penalty_coeff = getattr(args, 'logsumexp_penalty_coeff', 0.1)  # Logsumexp regularization

        # Choose the device once, honouring args.cuda. Previously the device was
        # always 'cuda:<gpuid>', which broke CPU-only runs (log_alpha and every
        # `.to(self.device)` batch ended up on CUDA while networks stayed on CPU).
        if self.args.cuda:
            self.device = torch.device('cuda:%d' % self.args.gpuid)
        else:
            self.device = torch.device('cpu')

        # Create networks
        self.actor_network = actor(env_params)
        self.actor_network.share_memory()
        self.sa_encoder = SAEncoder(env_params)
        self.g_encoder = GEncoder(env_params)

        # Learnable log_alpha for entropy regularization. It is created directly
        # on the target device so it stays a leaf tensor the optimizer accepts.
        self.log_alpha = torch.tensor(0.0, requires_grad=True, device=self.device)

        # Sync networks across MPI workers
        sync_networks(self.actor_network)
        sync_networks(self.sa_encoder)
        sync_networks(self.g_encoder)

        # Build target network for actor (no critic); starts as a copy of the
        # already-synced actor.
        self.actor_target_network = actor(env_params)
        self.actor_target_network.load_state_dict(self.actor_network.state_dict())

        if self.args.cuda:
            self.actor_network.to(self.device)
            self.actor_target_network.to(self.device)
            self.sa_encoder.to(self.device)
            self.g_encoder.to(self.device)

        # Create optimizers; both encoders share one optimizer.
        self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
        encoder_params = list(self.sa_encoder.parameters()) + list(self.g_encoder.parameters())
        self.sa_encoder_optim = torch.optim.Adam(encoder_params, lr=self.args.lr_critic)
        self.alpha_optim = torch.optim.Adam([self.log_alpha], lr=self.args.lr_critic)

        # HER sampler
        if env_name[:3] == 'Ant':
            self.her_module = HERANT(self.args.replay_strategy, self.args.replay_k, env_params, self.env_name,
                                     self.env.compute_reward)
        else:
            self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, env_params, self.env_name,
                                          self.env.compute_reward)

        # Create replay buffer
        self.buffer = replay_buffer(self.env_params, self.args.buffer_size, self.her_module.sample_her_transitions)

        # Create normalizers
        self.o_norm = normalizer(size=env_params['obs'], default_clip_range=self.args.clip_range)
        self.ag_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)
        self.ag_next_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)
        self.g_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)
        self.dg_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)

        # Setup model saving (rank 0 only: model_path/data_path exist only there)
        if MPI.COMM_WORLD.Get_rank() == 0:
            run_name = alg + '_' + str(self.args.weight1) + '_' + str(self.args.weight2)
            self.model_path = os.path.join(self.args.save_dir, 'weights', self.args.env_name,
                                           run_name, str(self.args.seed))
            self.data_path = os.path.join(self.args.save_dir, 'samples', self.args.env_name,
                                          run_name, str(self.args.seed))
            # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
            os.makedirs(self.args.save_dir, exist_ok=True)
            os.makedirs(self.model_path, exist_ok=True)
            os.makedirs(self.data_path, exist_ok=True)

        self.total_step = 0

    def train(self):
        ep_obs, ep_ag, ep_g, ep_actions = [], [], [], []
        observation, _ = self.env.reset()
        obs = observation['observation']
        ag = observation['achieved_goal']
        g = observation['desired_goal']
        for t in range(self.env_params['max_timesteps']):
            with torch.no_grad():
                input_tensor = self._preproc_inputs(obs, g)
                pi = self.actor_network(input_tensor)
                action = self._select_actions(pi)
            observation_new, _, info = self.env.step(action)
            obs_new = observation_new['observation']
            ag_new = observation_new['achieved_goal']
            ep_obs.append(obs.copy())
            ep_ag.append(ag.copy())
            ep_g.append(g.copy())
            ep_actions.append(action.copy())
            obs = obs_new
            ag = ag_new

    def learn(self, writer, logpath, tag):
        """Run the full training loop: rollouts, updates, checkpoints, evaluation.

        Per epoch, for each of ``args.n_cycles`` cycles: collect
        ``args.num_rollouts_per_mpi`` episodes, store them in the replay
        buffer, update the normalizers, run ``args.n_batches`` network updates
        and soft-update the actor target. Every fifth epoch rank 0 writes a
        checkpoint, then the policy is evaluated.

        Args:
            writer: Scalar logger exposing ``add_scalar(tag, value, global_step=...)``.
            logpath: Unused by this method.
            tag: Prefix of the checkpoint file name (``<tag>_<epoch>.pt``).
        """
        is_root = MPI.COMM_WORLD.Get_rank() == 0
        # Last `info` dict seen during rollouts; the Sawyer branch reads its keys.
        info = {}
        for epoch in range(self.args.n_epochs):
            for _ in range(self.args.n_cycles):
                mb_obs, mb_ag, mb_g, mb_actions = [], [], [], []
                for _ in range(self.args.num_rollouts_per_mpi):
                    ep_obs, ep_ag, ep_g, ep_actions = [], [], [], []
                    observation = self.env.reset()
                    obs = observation['observation']
                    ag = observation['achieved_goal']
                    g = observation['desired_goal']
                    for _ in range(self.env_params['max_timesteps']):
                        with torch.no_grad():
                            input_tensor = self._preproc_inputs(obs, g)
                            pi = self.actor_network(input_tensor)
                            action = self._select_actions(pi)
                        observation_new, _, _, info = self.env.step(action)
                        ep_obs.append(obs.copy())
                        ep_ag.append(ag.copy())
                        ep_g.append(g.copy())
                        ep_actions.append(action.copy())
                        obs = observation_new['observation']
                        ag = observation_new['achieved_goal']
                    # Observations/achieved goals carry one extra (final) entry.
                    ep_obs.append(obs.copy())
                    ep_ag.append(ag.copy())
                    mb_obs.append(ep_obs)
                    mb_ag.append(ep_ag)
                    mb_g.append(ep_g)
                    mb_actions.append(ep_actions)
                mb_obs = np.array(mb_obs)
                mb_ag = np.array(mb_ag)
                mb_g = np.array(mb_g)
                mb_actions = np.array(mb_actions)
                self.buffer.store_episode([mb_obs, mb_ag, mb_g, mb_actions])
                self._update_normalizer([mb_obs, mb_ag, mb_g, mb_actions])
                for _ in range(self.args.n_batches):
                    self._update_network(writer, epoch + 1)
                    self.total_step += 1
                self._soft_update_target_network(self.actor_target_network, self.actor_network)

            # Checkpoint on rank 0 only: self.model_path is created only there,
            # so saving from other ranks raised AttributeError.
            if is_root and epoch % 5 == 0:
                torch.save([self.o_norm.mean, self.o_norm.std, self.g_norm.mean, self.g_norm.std,
                            self.actor_network.state_dict(), self.sa_encoder.state_dict(),
                            self.g_encoder.state_dict()],
                           os.path.join(self.model_path, tag + '_' + str(epoch) + '.pt'))

            # _eval_agent/_eval_Ant/_eval_Point end in MPI allreduce, a collective
            # call: every rank must enter it or rank 0 blocks forever. Evaluate on
            # all ranks and restrict only the logging to rank 0.
            if self.env_name.startswith(('Fetch', 'Hand')):
                success_rate = self._eval_agent()
                if is_root:
                    print('[{}] epoch is: {}, eval success rate is: {:.3f}'.format(datetime.now(), epoch, success_rate))
                    writer.add_scalar('reward_eval', success_rate, global_step=epoch)
            elif self.env_name.startswith('Ant'):
                success_rate1, success_rate2 = self._eval_Ant()
                if is_root:
                    print('[{}] epoch is: {}, eval success rate is: {:.3f}: '.format(datetime.now(),
                                                                                   epoch, success_rate2))
                    writer.add_scalar('reward_eval1', success_rate1, global_step=epoch)
                    writer.add_scalar('reward_eval2', success_rate2, global_step=epoch)
            elif self.env_name.startswith(('Point', 'MultiGoal')):
                success_rate = self._eval_Point()
                if is_root:
                    print('[{}] epoch is: {}, eval success rate is: {:.3f}'.format(datetime.now(), epoch, success_rate))
                    writer.add_scalar('reward_eval', success_rate, global_step=epoch)
            elif is_root and epoch % 5 == 0:
                # _eval_agent_sawyer does no MPI communication, so it stays on rank 0.
                success_rate = self._eval_agent_sawyer()
                keys = list(info.keys())
                res = None
                for i, key in enumerate(keys):
                    # The eval array repeats each step len(keys) times along axis 1;
                    # stride by len(keys) and take the final step for key i.
                    res = success_rate[:, i::len(keys), i, 0][:, -1]
                    writer.add_scalar('reward_eval/' + str(key), res.mean(), global_step=epoch)
                if res is not None:
                    # As before, the printed value is the mean for the last key.
                    print('[{}] epoch is: {}, eval success rate is: {:.3f}'.format(datetime.now(), epoch, res.mean()))

    def _preproc_inputs(self, obs, g):
        obs_norm = self.o_norm.normalize(obs)
        g_norm = self.g_norm.normalize(g)
        inputs = np.concatenate([obs_norm, g_norm])
        inputs = torch.tensor(inputs, dtype=torch.float32).unsqueeze(0)
        if self.args.cuda:
            inputs = inputs.to(self.device)
        return inputs

    def _select_actions(self, pi):
        action = pi.cpu().numpy().squeeze()
        action += self.args.noise_eps * self.env_params['action_max'] * np.random.randn(*action.shape)
        action = np.clip(action, -self.env_params['action_max'], self.env_params['action_max'])
        random_actions = np.random.uniform(low=-self.env_params['action_max'],
                                           high=self.env_params['action_max'],
                                           size=self.env_params['action'])
        action += np.random.binomial(1, self.args.random_eps, 1)[0] * (random_actions - action)
        return action

    def _update_normalizer(self, episode_batch):
        mb_obs, mb_ag, mb_g, mb_actions = episode_batch
        mb_obs_next = mb_obs[:, 1:, :]
        mb_ag_next = mb_ag[:, 1:, :]
        num_transitions = mb_actions.shape[1]
        buffer_temp = {'obs': mb_obs, 'ag': mb_ag, 'g': mb_g, 'actions': mb_actions,
                       'obs_next': mb_obs_next, 'ag_next': mb_ag_next}
        transitions, _, _, _, _, _ = self.her_module.sample_her_transitions(buffer_temp, num_transitions)
        obs, g = transitions['obs'], transitions['g']
        transitions['obs'], transitions['g'] = self._preproc_og(obs, g)
        self.o_norm.update(transitions['obs'])
        self.g_norm.update(transitions['g'])
        self.o_norm.recompute_stats()
        self.g_norm.recompute_stats()

    def _preproc_og(self, o, g):
        o = np.clip(o, -self.args.clip_obs, self.args.clip_obs)
        g = np.clip(g, -self.args.clip_obs, self.args.clip_obs)
        return o, g

    def _soft_update_target_network(self, target, source):
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_((1 - self.args.polyak) * param.data + self.args.polyak * target_param.data)

    def _update_network(self, writer, epoch):
        """Run one gradient step on the encoders (contrastive) and on the actor.

        Args:
            writer: Unused by this method (logging is currently disabled).
            epoch: Unused by this method.

        Returns:
            The sampled transitions dict, with ``'obs'``, ``'sg'`` and
            ``'obs_next'`` clipped and a clipped ``'g_next'`` entry added.
        """
        # Sample transitions from replay buffer
        transitions, _, _, _, _, _ = self.buffer.sample(self.args.batch_size)

        # Clip observations and goals; the clipped values are written back so
        # the returned dict matches what callers received before.
        o, o_next, g = transitions['obs'], transitions['obs_next'], transitions['sg']
        transitions['obs'], transitions['sg'] = self._preproc_og(o, g)
        transitions['obs_next'], transitions['g_next'] = self._preproc_og(o_next, g)

        obs_norm = self.o_norm.normalize(transitions['obs'])
        g_norm = self.g_norm.normalize(transitions['sg'])

        # Build each tensor once and reuse it (the observation tensor used to be
        # created and transferred twice; the next-state inputs were never used).
        obs_tensor = torch.tensor(obs_norm, dtype=torch.float32).to(self.device)
        g_tensor = torch.tensor(g_norm, dtype=torch.float32).to(self.device)
        actions_tensor = torch.tensor(transitions['actions'], dtype=torch.float32).to(self.device)
        inputs_norm_tensor = torch.cat([obs_tensor, g_tensor], dim=1)

        # 1. Contrastive loss for encoders: matching (state-action, goal) pairs
        #    sit on the diagonal of the logits matrix, other rows' goals act as
        #    negatives.
        sa_repr = self.sa_encoder(torch.cat([obs_tensor, actions_tensor], dim=1))
        g_repr = self.g_encoder(g_tensor)
        logits = torch.matmul(sa_repr, g_repr.T)
        pos_logits = torch.diag(logits)
        neg_logits = torch.logsumexp(logits, dim=1)
        contrast_loss = -torch.mean(pos_logits - neg_logits)
        # Squared-logsumexp penalty keeps the logits from growing unboundedly.
        logsumexp = torch.logsumexp(logits + 1e-6, dim=1)
        contrast_loss = contrast_loss + self.args.logsumexp_penalty_coeff * torch.mean(logsumexp ** 2)

        self.sa_encoder_optim.zero_grad()
        contrast_loss.backward()
        # The encoders are synced across workers at construction; their
        # gradients must be synced as well (as the actor's are below),
        # otherwise each worker's encoders drift apart.
        sync_grads(self.sa_encoder)
        sync_grads(self.g_encoder)
        self.sa_encoder_optim.step()

        # 2. Actor loss: maximise the inner product between the state-action
        #    representation of the actor's action and the (detached) goal
        #    representation.
        actions_real = self.actor_network(inputs_norm_tensor)
        sa_repr_actor = self.sa_encoder(torch.cat([obs_tensor, actions_real], dim=1))
        qf_pi = torch.sum(sa_repr_actor * g_repr.detach(), dim=-1)
        actor_loss = torch.mean(-qf_pi)

        # Update actor. The gradients this backward pass leaves on sa_encoder
        # are cleared by the zero_grad() at the start of the next encoder step.
        self.actor_optim.zero_grad()
        actor_loss.backward()
        sync_grads(self.actor_network)
        self.actor_optim.step()

        # Return transitions for compatibility
        return transitions

    def _eval_agent(self):
        """Evaluate the deterministic policy and return the MPI-averaged success rate.

        Runs ``args.n_test_rollouts`` episodes without exploration noise,
        records ``info['is_success']`` at every step, and scores each episode
        by its final step. The local mean is summed over all MPI ranks with
        ``allreduce`` and divided by the world size, so every rank must call
        this method.
        """
        episode_flags = []
        for _ in range(self.args.n_test_rollouts):
            start = self.env.reset()
            obs, g = start['observation'], start['desired_goal']
            flags = []
            for _ in range(self.env_params['max_timesteps']):
                with torch.no_grad():
                    pi = self.actor_network(self._preproc_inputs(obs, g))
                    # convert the actions
                    actions = pi.detach().cpu().numpy().squeeze()
                step_obs, _, _, info = self.env.step(actions)
                obs, g = step_obs['observation'], step_obs['desired_goal']
                flags.append(info['is_success'])
            episode_flags.append(flags)
        final_step = np.array(episode_flags)[:, -1]
        comm = MPI.COMM_WORLD
        summed = comm.allreduce(np.mean(final_step), op=MPI.SUM)
        return summed / comm.Get_size()


    def _eval_Point(self):
        """Evaluate on Point/MultiGoal-style environments; return the MPI-averaged success rate.

        A step counts as successful when the norm of ``observation -
        desired_goal`` is below ``self.env.distance``. An episode succeeds if
        any of its last three steps is successful. The local mean is summed
        over MPI ranks with ``allreduce`` and divided by the world size, so
        every rank must call this method.

        This method used to be defined twice in the class with identical
        bodies; only this single definition is kept.
        """
        episode_flags = []
        for _ in range(self.args.n_test_rollouts):
            flags = []
            observation = self.env.reset()
            obs = observation['observation']
            g = observation['desired_goal']
            for _ in range(self.env_params['max_timesteps']):
                with torch.no_grad():
                    pi = self.actor_network(self._preproc_inputs(obs, g))
                    # convert the actions
                    actions = pi.detach().cpu().numpy().squeeze()
                observation_new, _, _, _ = self.env.step(actions)
                obs = observation_new['observation']
                g = observation_new['desired_goal']
                flags.append(np.linalg.norm(obs - g) < self.env.distance)
            episode_flags.append(flags)
        episode_flags = np.array(episode_flags)
        local_success_rate = np.mean(np.max(episode_flags[:, -3:], axis=-1))
        comm = MPI.COMM_WORLD
        global_success_rate = comm.allreduce(local_success_rate, op=MPI.SUM)
        return global_success_rate / comm.Get_size()


    def _eval_Ant(self):
        """Evaluate on Ant environments; return two MPI-averaged success rates.

        A step counts as successful when the norm of ``achieved_goal -
        desired_goal`` is below ``self.env.distance_threshold``.

        Returns:
            ``(rate_last3, rate_last1)``: the fraction of episodes with a
            success in any of the last three steps, and the fraction with a
            success at the final step, each averaged over MPI ranks. Both use
            ``allreduce``, so every rank must call this method.
        """
        episode_flags = []
        for _ in range(self.args.n_test_rollouts):
            flags = []
            observation = self.env.reset()
            obs = observation['observation']
            g = observation['desired_goal']
            for _ in range(self.env_params['max_timesteps']):
                with torch.no_grad():
                    pi = self.actor_network(self._preproc_inputs(obs, g))
                    # convert the actions
                    actions = pi.detach().cpu().numpy().squeeze()
                observation_new, _, _, _ = self.env.step(actions)
                obs = observation_new['observation']
                g = observation_new['desired_goal']
                ag = observation_new['achieved_goal']
                flags.append(np.linalg.norm(ag - g) < self.env.distance_threshold)
            episode_flags.append(flags)
        episode_flags = np.array(episode_flags)
        local_success_rate1 = np.mean(np.max(episode_flags[:, -3:], axis=-1))
        local_success_rate2 = np.mean(np.max(episode_flags[:, -1:], axis=-1))

        comm = MPI.COMM_WORLD
        global_success_rate1 = comm.allreduce(local_success_rate1, op=MPI.SUM)
        global_success_rate2 = comm.allreduce(local_success_rate2, op=MPI.SUM)
        return global_success_rate1 / comm.Get_size(), global_success_rate2 / comm.Get_size()


    def _eval_agent_sawyer(self):
        total_success_rate = []
        for _ in range(self.args.n_test_rollouts):
            per_success_rate = []
            observation = self.env.reset()
            obs = observation['observation']
            g = observation['desired_goal']
            for _ in range(self.env_params['max_timesteps']):
                with torch.no_grad():
                    input_tensor = self._preproc_inputs(obs, g)
                    pi = self.actor_network(input_tensor)
                    # convert the actions
                    actions = pi.detach().cpu().numpy().squeeze()
                observation_new, _, _, info = self.env.step(actions)
                obs = observation_new['observation']
                g = observation_new['desired_goal']
                keys = list(info.keys())
                tmp = []
                for key in keys:
                    tmp.append([info[key]])
                    per_success_rate.append(tmp)
            total_success_rate.append(per_success_rate)
        total_success_rate = np.array(total_success_rate)

        return total_success_rate
