import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn
import os

from meta_test_algo.network import es_policy2
from meta_test_algo.base import base

class EvolutionStrategies_FULL(base):
    """Evolution-strategies adapter that perturbs a random subset of policy weights.

    The policy is an ``es_policy2`` network exposing ``first_layer`` ("head")
    and ``last_layer`` ("tail") sub-modules. Each adaptation step draws
    antithetic perturbations (theta + sigma*eps, theta - sigma*eps) on a
    randomly chosen subset of head, tail and remaining ("backbone") weights,
    scores them with ``self.evaluate_head(env)`` and moves the chosen weights
    along the reward-weighted perturbation directions of the best pairs.

    ``evaluate_head`` is not defined in this class; it is expected to be
    provided by ``base`` and to return ``(reward, env_steps)``.
    """

    def __init__(self,
                 obs_dim,
                 action_dim,
                 net_size,
                 latent_action_dim,
                 device,
                 es_params,
                 **kwargs):
        """Build the policy and read the ES hyper-parameters.

        Args:
            obs_dim: Observation dimension, forwarded to ``es_policy2``.
            action_dim: Action dimension, forwarded to ``es_policy2``.
            net_size: Network size argument, forwarded to ``es_policy2``.
            latent_action_dim: Latent action dimension, forwarded to
                ``es_policy2``.
            device: Torch device the policy is moved to.
            es_params: Mapping with the keys ``'n_rollouts'``,
                ``'noise_sigma'``, ``'lr'`` and ``'elite_frac'``.
            **kwargs: Forwarded to ``base.__init__``; must contain
                ``'max_path_length'`` and ``'reward_scale'``.

        Raises:
            KeyError: If a required key is missing from ``es_params`` or
                ``kwargs``.
        """
        super().__init__(obs_dim,
                         action_dim,
                         net_size,
                         latent_action_dim,
                         device,
                         **kwargs)

        self.max_path_length = kwargs['max_path_length']
        self.reward_scale = kwargs['reward_scale']

        self.obs_dim = obs_dim
        self.action_dim = action_dim
        self.device = device

        self.n_rollouts = es_params['n_rollouts']
        self.noise_sigma = es_params['noise_sigma']
        self.lr = es_params['lr']
        self.elite_frac = es_params['elite_frac']

        self.policy = es_policy2(obs_dim, action_dim, net_size, latent_action_dim).to(self.device)

        self.first_layer_params = list(self.policy.first_layer.parameters())
        self.last_layer_params = list(self.policy.last_layer.parameters())
        self.all_params = list(self.policy.parameters())

        # Flattened copy of the head + tail weights at construction time;
        # its length is the combined number of head and tail weights.
        self.head_tail_param = nn.utils.parameters_to_vector(
            self.first_layer_params + self.last_layer_params
        ).detach().cpu().numpy()
        self.param_shape = self.head_tail_param.shape[0]

    def _es_set_flat_params(self, flat_params):
        """Copy a flat NumPy weight vector into ``self.policy`` (as float32)."""
        torch.nn.utils.vector_to_parameters(
            torch.tensor(flat_params, dtype=torch.float32).to(self.device),
            self.policy.parameters()
        )

    def _es_layer_indices(self):
        """Return ``(head, tail, backbone)`` index arrays into the flat weight vector.

        Indices are derived by parameter identity while walking
        ``self.policy.parameters()``, so they stay correct regardless of the
        order in which the sub-modules were registered on the policy.
        """
        head_ids = {id(p) for p in self.policy.first_layer.parameters()}
        tail_ids = {id(p) for p in self.policy.last_layer.parameters()}

        head, tail, backbone = [], [], []
        offset = 0
        for param in self.policy.parameters():
            count = param.numel()
            span = np.arange(offset, offset + count)
            offset += count
            if id(param) in head_ids:
                head.append(span)
            elif id(param) in tail_ids:
                tail.append(span)
            else:
                backbone.append(span)

        # Integer-typed empty fallback so a missing group never turns the
        # concatenated index array into floats.
        empty = np.empty(0, dtype=np.intp)
        return tuple(
            np.concatenate(chunks) if chunks else empty
            for chunks in (head, tail, backbone)
        )

    def es_adapt_head_mixed_sampling(self, env, head_tail_ratio=0.6):
        """Run one antithetic ES update on a mixed subset of policy weights.

        A fraction ``head_tail_ratio`` of the head weights and of the tail
        weights is selected at random; the head/tail weights that were left
        out are replaced by the same number of randomly chosen backbone
        weights (capped at the number of backbone weights available). Only
        the selected weights are perturbed and updated.

        Args:
            env: Environment handed to ``self.evaluate_head``.
            head_tail_ratio: Fraction in ``[0, 1]`` of head and of tail
                weights to include in the perturbed subset.

        Returns:
            Total number of environment steps reported by
            ``self.evaluate_head`` over all rollouts.

        Raises:
            ValueError: If ``head_tail_ratio`` is outside ``[0, 1]``.
        """
        if not 0.0 <= head_tail_ratio <= 1.0:
            raise ValueError(
                "head_tail_ratio must be within [0, 1], got {!r}".format(head_tail_ratio)
            )

        full_param_vec = torch.nn.utils.parameters_to_vector(
            self.policy.parameters()
        ).detach().cpu().numpy()

        head_indices, tail_indices, backbone_indices = self._es_layer_indices()
        head_size = head_indices.size
        tail_size = tail_indices.size

        # head-tail params to be updated
        num_head = int(head_size * head_tail_ratio)
        num_tail = int(tail_size * head_tail_ratio)

        # Head/tail params not to be updated are traded for backbone params;
        # never request more backbone weights than actually exist.
        num_other = (head_size - num_head) + (tail_size - num_tail)
        num_other = min(num_other, backbone_indices.size)

        sampled_head_idx = np.random.choice(head_indices, size=num_head, replace=False)
        sampled_tail_idx = np.random.choice(tail_indices, size=num_tail, replace=False)
        sampled_other_idx = np.random.choice(backbone_indices, size=num_other, replace=False)
        sampled_idx = np.concatenate([sampled_head_idx, sampled_tail_idx, sampled_other_idx])

        pair_list = []
        total_steps = 0

        try:
            for _ in range(self.n_rollouts // 2):
                # Noise lives only on the sampled coordinates; its length is
                # tied to the index set rather than to a value cached in
                # __init__.
                epsilon = np.zeros_like(full_param_vec)
                epsilon[sampled_idx] = np.random.randn(sampled_idx.size)
                offset = self.noise_sigma * epsilon

                # theta + sigma * eps
                self._es_set_flat_params(full_param_vec + offset)
                r_pos, env_steps = self.evaluate_head(env)
                total_steps += env_steps

                # theta - sigma * eps
                self._es_set_flat_params(full_param_vec - offset)
                r_neg, env_steps = self.evaluate_head(env)
                total_steps += env_steps

                r_diff = r_pos - r_neg
                r_mean = (r_pos + r_neg) / 2.0
                pair_list.append((epsilon, r_diff, r_mean))
        finally:
            # Never leave the policy sitting on a perturbed weight vector,
            # even if a rollout raised or was interrupted.
            self._es_set_flat_params(full_param_vec)

        if not pair_list:
            # n_rollouts < 2: nothing was evaluated, so there is no update
            # (and no mean/std of an empty array to compute).
            return total_steps

        # Keep the pairs with the highest mean reward.
        num_elites = min(len(pair_list), max(1, int(len(pair_list) * self.elite_frac)))
        elite_pairs = sorted(pair_list, key=lambda x: x[2])[-num_elites:]

        # NOTE(review): z-scoring the reward differences yields all zeros
        # when only one elite pair is selected, so that configuration
        # performs no update. Confirm whether mean-centering is intended.
        r_diffs = np.array([r_diff for _, r_diff, _ in elite_pairs])
        r_diffs = (r_diffs - r_diffs.mean()) / (r_diffs.std() + 1e-6)

        grad_estimate = np.zeros_like(full_param_vec)
        for (eps, _, _), r in zip(elite_pairs, r_diffs):
            grad_estimate += r * eps
        grad_estimate /= num_elites

        full_param_vec[sampled_idx] += self.lr * grad_estimate[sampled_idx]
        self._es_set_flat_params(full_param_vec)

        return total_steps

    def collet_data_and_train_filter(self, env):
        """Run one ES adaptation step with ``head_tail_ratio=0.5``.

        NOTE(review): 'collet' looks like a typo for 'collect'; the name is
        left unchanged because it is this class's public entry point.

        Args:
            env: Environment handed to ``es_adapt_head_mixed_sampling``.

        Returns:
            Number of environment steps consumed by the adaptation step.
        """
        env_steps = self.es_adapt_head_mixed_sampling(env, head_tail_ratio=0.5)
        return env_steps
