import copy
import random
from abc import ABC
from dataclasses import dataclass
from typing import List, Optional, Union

import torch
import torch.nn.functional as F

from thinker_task.ppo.utils import save_debug_data

def to(tensor: Union[torch.Tensor, list[torch.Tensor]], device):
    """Apply ``.to(device)`` to a tensor, or to every element of a (nested) list.

    Args:
        tensor: A single tensor, or a list whose elements are tensors or
            further lists of tensors.
        device: Forwarded unchanged to ``Tensor.to``.

    Returns:
        ``tensor.to(device)`` for a non-list input; otherwise a new list with
        the same nesting that holds the converted elements.
    """
    # Leaf case first: anything that is not a list is converted directly.
    if not isinstance(tensor, list):
        return tensor.to(device)

    moved = []
    for element in tensor:
        moved.append(to(element, device))
    return moved


def pin_memory(tensor: Union[torch.Tensor, list[torch.Tensor]]):
    """Call ``.pin_memory()`` on a tensor, or on every element of a (nested) list.

    Args:
        tensor: A single tensor, or a list whose elements are tensors or
            further lists of tensors.

    Returns:
        ``tensor.pin_memory()`` for a non-list input; otherwise a new list with
        the same nesting that holds the pinned elements.
    """
    # Leaf case first: anything that is not a list is pinned directly.
    if not isinstance(tensor, list):
        return tensor.pin_memory()

    pinned = []
    for element in tensor:
        pinned.append(pin_memory(element))
    return pinned


@dataclass
class Experience:
    """Experience is a batch of data.

    These data should share the same sequence length and number of actions.
    Left padding for sequences is applied.

    Shapes of each tensor:
    sequences: (B, S)
    action_log_probs: (B, A)
    base_action_log_probs: (B, A)
    values: (B, A)
    returns: (B, A)
    advantages: (B, A)
    attention_mask: (B, S)
    action_mask: (B, A)
    kl: (B, A)
    sys_mask: (B, A)
    actor_target_values: (B, A, *)

    "A" is the number of actions.

    A field may also hold a list of tensors instead of one batched tensor
    (``make_experience_batch`` produces lists when ``packing_samples`` is set);
    ``to_device`` and ``pin_memory`` accept both forms.
    """

    sequences: torch.Tensor
    action_log_probs: Optional[torch.Tensor]
    base_action_log_probs: Optional[torch.Tensor]
    values: Optional[torch.Tensor]
    returns: Optional[torch.Tensor]
    advantages: Optional[torch.Tensor]
    attention_mask: Optional[torch.LongTensor]
    action_mask: Optional[torch.BoolTensor]
    num_actions: Optional[torch.Tensor]
    packed_seq_lens: Optional[torch.Tensor]
    info: Optional[dict]
    kl: Optional[torch.Tensor] = None
    sys_mask: Optional[torch.Tensor] = None
    actor_target_values: Optional[torch.Tensor] = None

    # Names of the tensor-valued fields that `to_device` / `pin_memory` process.
    # Deliberately un-annotated so that @dataclass does not turn it into a field.
    # `info` is not listed: it is left untouched by both methods.
    _TENSOR_FIELDS = (
        "sequences",
        "action_log_probs",
        "base_action_log_probs",
        "values",
        "returns",
        "advantages",
        "attention_mask",
        "action_mask",
        "num_actions",
        "packed_seq_lens",
        "kl",
        "sys_mask",
        "actor_target_values",
    )

    @torch.no_grad()
    def to_device(self, device: torch.device) -> None:
        """Move every non-None tensor field to ``device`` in place.

        Uses the module-level ``to`` helper, so list-valued fields are moved
        element by element. ``kl`` is moved together with the other fields so
        the whole batch ends up on one device. ``info`` is not touched.

        Args:
            device: Target passed on to ``Tensor.to``.
        """
        for name in self._TENSOR_FIELDS:
            value = getattr(self, name)
            if value is not None:
                setattr(self, name, to(value, device))

    def pin_memory(self):
        """Pin every non-None tensor field in place and return ``self``.

        Every field goes through the module-level ``pin_memory`` helper, so
        list-valued fields are handled the same way ``to_device`` handles them
        instead of failing on a missing ``list.pin_memory`` attribute.
        ``info`` is not touched.

        Returns:
            This instance, to allow call chaining.
        """
        for name in self._TENSOR_FIELDS:
            value = getattr(self, name)
            if value is not None:
                setattr(self, name, pin_memory(value))
        return self

    def save_data(self, **kwargs):
        """Forward the tensor fields plus ``kwargs`` to ``save_debug_data``.

        ``info`` is not forwarded. Extra keyword arguments are passed through
        unchanged.
        """
        save_debug_data(
            sequences=self.sequences,
            action_log_probs=self.action_log_probs,
            base_action_log_probs=self.base_action_log_probs,
            values=self.values,
            returns=self.returns,
            advantages=self.advantages,
            attention_mask=self.attention_mask,
            action_mask=self.action_mask,
            num_actions=self.num_actions,
            packed_seq_lens=self.packed_seq_lens,
            kl=self.kl,
            sys_mask=self.sys_mask,
            actor_target_values=self.actor_target_values,
            **kwargs,
        )


@dataclass
class BufferItem:
    """BufferItem is a single item (one sample) of experience data.

    Shapes of each tensor:
    sequences: (S)
    action_log_probs: (A)
    base_action_log_probs: (A)
    values: (1)
    returns: (1)
    advantages: (1)
    attention_mask: (S)
    action_mask: (A)
    sys_mask: (A)

    "A" is the number of actions.
    """

    sequences: torch.Tensor
    action_log_probs: Optional[torch.Tensor]
    base_action_log_probs: Optional[torch.Tensor]
    values: Optional[torch.Tensor]
    returns: Optional[torch.Tensor]
    advantages: Optional[torch.Tensor]
    attention_mask: Optional[torch.LongTensor]
    action_mask: Optional[torch.BoolTensor]
    num_actions: Optional[torch.Tensor]
    packed_seq_lens: Optional[torch.Tensor]
    info: Optional[dict]
    sys_mask: Optional[torch.BoolTensor]
    actor_target_values: Optional[torch.Tensor]

    def to_json(self) -> dict:
        """Return the instance attributes as a dict of plain Python values.

        Tensors are converted with ``.cpu().tolist()``. Dicts, lists and tuples
        are converted recursively (tuples come back as lists). Any other value
        is passed through unchanged.
        """

        def _convert(value):
            # Guard clauses, most specific type first.
            if isinstance(value, torch.Tensor):
                return value.cpu().tolist()
            if isinstance(value, dict):
                return {name: _convert(inner) for name, inner in value.items()}
            if isinstance(value, (list, tuple)):
                return [_convert(inner) for inner in value]
            return value

        converted = {}
        for field_name, field_value in vars(self).items():
            converted[field_name] = _convert(field_value)
        return converted


def split_experience_batch(experience: Experience) -> List[BufferItem]:
    """Split a batched ``Experience`` into one ``BufferItem`` per sample.

    The batch size is ``len(experience.sequences)``. Each listed field is either
    ``None`` (copied as ``None`` to every item), a tensor (unbound along dim 0),
    or an already per-sample sequence (used as is). Tensor slices are cloned so
    the items do not share storage with the batch.

    ``experience.info`` values are unbound along dim 0 and each per-sample entry
    is converted to a Python scalar. A ``None`` ``info`` is treated as an empty
    dict. ``kl`` is not carried over, because ``BufferItem`` has no such field.

    Args:
        experience: The batch to split.

    Returns:
        A list of ``BufferItem`` whose length equals the batch size.

    Raises:
        AssertionError: If a field's leading dimension differs from the batch
            size, or an ``info`` entry is not one scalar per sample.
    """
    batch_size = len(experience.sequences)
    # Every item starts with its own (initially empty) info dict.
    batch_kwargs = [{"info": {}} for _ in range(batch_size)]
    keys = (
        "sequences",
        "action_log_probs",
        "base_action_log_probs",
        "values",
        "returns",
        "advantages",
        "attention_mask",
        "action_mask",
        "num_actions",
        "packed_seq_lens",
        "sys_mask",
        "actor_target_values",
    )
    for key in keys:
        value = getattr(experience, key)
        if value is None:
            for item_kwargs in batch_kwargs:
                item_kwargs[key] = None
            continue
        vals = torch.unbind(value) if isinstance(value, torch.Tensor) else value
        assert batch_size == len(vals), f"{key} has {len(vals)} entries, expected {batch_size}"
        for item_kwargs, v in zip(batch_kwargs, vals):
            # Clone so an item never aliases the batch tensor's storage.
            item_kwargs[key] = v.clone() if isinstance(v, torch.Tensor) else v

    # `info` is Optional on Experience; treat a missing dict as "no metadata".
    info = experience.info if experience.info is not None else {}
    for k, v in info.items():
        if v is None:
            for item_kwargs in batch_kwargs:
                item_kwargs["info"][k] = None
            continue
        vals = torch.unbind(v)
        assert batch_size == len(vals), f"info[{k}] has {len(vals)} entries, expected {batch_size}"
        for item_kwargs, vv in zip(batch_kwargs, vals):
            if isinstance(vv, torch.Tensor):
                assert vv.numel() == 1, f"info[{k}] must be a scalar tensor, but got {vv.shape}"
                vv = vv.item()
            item_kwargs["info"][k] = vv

    return [BufferItem(**item_kwargs) for item_kwargs in batch_kwargs]


def zero_pad_sequences(sequences: List[torch.Tensor], side: str = "left") -> torch.Tensor:
    """Zero-pad tensors along their first dimension and stack them into a batch.

    Each tensor is padded up to the longest ``size(0)`` found in ``sequences``.
    Tensors may have trailing dimensions (for example ``(A, D)``); only dim 0 is
    padded, so the trailing dimensions must already match for the final stack.

    Args:
        sequences: Non-empty list of tensors with at least one dimension.
        side: ``"left"`` to insert zeros before the data, ``"right"`` to append.

    Returns:
        A tensor of shape ``(len(sequences), max_len, *trailing_dims)``.

    Raises:
        AssertionError: If ``side`` is neither ``"left"`` nor ``"right"``.
    """
    assert side in ("left", "right")
    max_len = max(seq.size(0) for seq in sequences)
    padded_sequences = []
    for seq in sequences:
        pad_len = max_len - seq.size(0)
        dim0_padding = (pad_len, 0) if side == "left" else (0, pad_len)
        # F.pad takes (before, after) pairs starting from the LAST dimension, so
        # every trailing dimension needs an explicit (0, 0) ahead of the dim-0
        # pair. For 1-D input this reduces to the plain (before, after) pair.
        padding = (0, 0) * (seq.dim() - 1) + dim0_padding
        padded_sequences.append(F.pad(seq, padding))
    return torch.stack(padded_sequences, dim=0)


def make_experience_batch(items: List[BufferItem], packing_samples=False) -> Experience:
    """Collate ``BufferItem`` objects into a single ``Experience``.

    For each listed field the first item decides whether the field is present:
    if it is ``None`` there, the batched field is ``None``. Otherwise the
    per-item values are left-padded and stacked with ``zero_pad_sequences``, or
    kept as a plain Python list when ``packing_samples`` is true.

    ``info`` is rebuilt from the keys of the first item's ``info`` dict: a key
    whose first value is ``None`` stays ``None``, every other key becomes a
    tensor built from the per-item values. A ``None`` ``info`` on the first item
    yields an empty dict.

    Args:
        items: Non-empty list of buffer items.
        packing_samples: Keep per-sample lists instead of padding and stacking.

    Returns:
        The collated ``Experience``.
    """
    keys = (
        "sequences",
        "action_log_probs",
        "base_action_log_probs",
        "values",
        "returns",
        "advantages",
        "attention_mask",
        "action_mask",
        "num_actions",
        "packed_seq_lens",
        "sys_mask",
        "actor_target_values",
    )
    kwargs = {}
    for key in keys:
        vals = [getattr(item, key) for item in items]
        if vals[0] is None:
            kwargs[key] = None
        elif packing_samples:
            # Packed samples carry no padding; hand the list through untouched.
            kwargs[key] = vals
        else:
            kwargs[key] = zero_pad_sequences(vals, "left")

    # `info` is Optional on BufferItem; treat a missing dict as "no metadata".
    first_info = items[0].info if items[0].info is not None else {}
    batch_info = {}
    for key, first_value in first_info.items():
        if first_value is None:
            batch_info[key] = None
            continue
        batch_info[key] = torch.tensor([item.info[key] for item in items])
    kwargs["info"] = batch_info
    return Experience(**kwargs)


def remove_padding_in_sequences(items):
    """Trim padding from every item in place and return the same list.

    The number of positions cut from the right is the count of zero entries in
    ``action_mask``. The left cut is the index of the first maximal entry of
    ``attention_mask`` (``argmax``). ``sequences`` and ``attention_mask`` are
    trimmed on both sides; all per-action fields are trimmed on the right only.

    A missing mask means "nothing to trim on that side": ``action_mask is None``
    disables the right cut and ``attention_mask is None`` disables the left cut.

    NOTE(review): the right cut counts *all* zeros in ``action_mask`` and so
    assumes they are all trailing padding; confirm with the producer of the
    mask.

    Args:
        items: Iterable of objects exposing the ``BufferItem`` tensor attributes.

    Returns:
        ``items`` itself, after in-place modification.
    """
    # Fields indexed by action position: only trailing padding is removed.
    per_action_fields = (
        "action_log_probs",
        "base_action_log_probs",
        "values",
        "returns",
        "advantages",
        "action_mask",
        "sys_mask",
        "actor_target_values",
    )
    for item in items:
        att_mask = item.attention_mask
        act_mask = item.action_mask

        if act_mask is None:
            right_pad = None
        else:
            num_right_pad = int((1 - act_mask.long()).sum())
            # A slice end of -0 would select nothing, so "no padding" must be None.
            right_pad = -num_right_pad if num_right_pad > 0 else None

        # left_pad for seq and att_mask
        left_pad = int(att_mask.long().argmax()) if att_mask is not None else 0

        item.sequences = item.sequences[left_pad:right_pad]
        if att_mask is not None:
            item.attention_mask = att_mask[left_pad:right_pad]
        for name in per_action_fields:
            value = getattr(item, name)
            if value is not None:
                setattr(item, name, value[:right_pad])
    return items


class NaiveReplayBuffer(ABC):
    """Naive replay buffer class. It stores experience.

    Args:
        sample_batch_size (int): Batch size when sampling.
        limit (int, optional): Limit of number of experience samples. A number <= 0 means unlimited. Defaults to 0.
        cpu_offload (bool, optional): Whether to move experience to cpu when it is appended and back to the
            target device when it is sampled. Defaults to True.
        packing_samples (bool, optional): When True, appended samples are stored without padding removal and
            batches are built as per-sample lists instead of padded tensors. Defaults to False.
    """

    def __init__(
        self, sample_batch_size: int, limit: int = 0, cpu_offload: bool = True, packing_samples: bool = False
    ) -> None:
        super().__init__()
        self.sample_batch_size = sample_batch_size
        # limit <= 0 means unlimited
        self.limit = limit
        self.cpu_offload = cpu_offload
        self.packing_samples = packing_samples
        # Fall back to the CPU on hosts without CUDA instead of failing in the
        # constructor; with CUDA present the current device is used as before.
        if torch.cuda.is_available():
            self.target_device = torch.device(f"cuda:{torch.cuda.current_device()}")
        else:
            self.target_device = torch.device("cpu")
        self.items: List["BufferItem"] = []

    @torch.no_grad()
    def split_to_n_batches(self, n_batches: int, drop_last: bool = False) -> List["NaiveReplayBuffer"]:
        """Split the stored items into ``n_batches`` equally sized, shuffled buffers.

        Without ``drop_last`` the per-buffer size is rounded up and the shortfall
        is filled with randomly chosen duplicates of existing items. With
        ``drop_last`` the size is rounded down and the surplus items (after the
        shuffle) are left out.

        NOTE: the padding duplicates are appended to ``self.items`` as well, so
        this buffer can grow as a side effect of the call.

        Every returned item is an independent deep copy, including padding
        duplicates, so no object is shared between the returned buffers or with
        this one.

        Args:
            n_batches: Number of buffers to produce; must be positive.
            drop_last: Round the per-buffer size down instead of padding.

        Returns:
            A list of ``n_batches`` new ``NaiveReplayBuffer`` instances that
            inherit this buffer's configuration.
        """
        assert n_batches > 0
        if not drop_last:
            batch_size = (len(self.items) + n_batches - 1) // n_batches
        else:
            batch_size = len(self.items) // n_batches
        if len(self.items) < batch_size * n_batches:
            # padding
            padding_size = batch_size * n_batches - len(self.items)
            padding_items = random.choices(self.items, k=padding_size)
            self.items.extend(padding_items)
        # Copy item by item: one deepcopy over the whole list would memoise
        # repeated references, leaving a padding duplicate and its original as
        # the very same object in the output.
        items = [copy.deepcopy(item) for item in self.items]
        random.shuffle(items)
        bfs = []
        for i in range(n_batches):
            bf = NaiveReplayBuffer(
                sample_batch_size=self.sample_batch_size,
                limit=self.limit,
                cpu_offload=self.cpu_offload,
                packing_samples=self.packing_samples,
            )
            bf.items = items[i * batch_size : (i + 1) * batch_size]
            bfs.append(bf)
        return bfs

    @torch.no_grad()
    def append(self, experience: "Experience") -> None:
        """Split ``experience`` into items and add them to the buffer.

        With ``cpu_offload`` the experience is first moved to the CPU (in place).
        Padding is stripped unless ``packing_samples`` is set. When ``limit`` is
        positive, the oldest items are dropped so at most ``limit`` remain.
        """
        if self.cpu_offload:
            experience.to_device(torch.device("cpu"))
        items = split_experience_batch(experience)
        # the packed samples comes with no padding
        if not self.packing_samples:
            items = remove_padding_in_sequences(items)
        self.items.extend(items)
        if self.limit > 0:
            samples_to_remove = len(self.items) - self.limit
            if samples_to_remove > 0:
                self.items = self.items[samples_to_remove:]

    def clear(self) -> None:
        """Remove all stored items."""
        self.items.clear()

    @torch.no_grad()
    def sample(self) -> "Experience":
        """Draw ``sample_batch_size`` distinct items at random and collate them.

        With ``cpu_offload`` the collated batch is moved to ``target_device``.
        ``random.sample`` raises ``ValueError`` when the buffer holds fewer
        items than ``sample_batch_size``.
        """
        items = random.sample(self.items, self.sample_batch_size)
        experience = make_experience_batch(items, self.packing_samples)
        if self.cpu_offload:
            experience.to_device(self.target_device)
        return experience

    def __len__(self) -> int:
        return len(self.items)

    def __getitem__(self, idx: int) -> "BufferItem":
        return self.items[idx]

    def collate_fn(self, batch) -> "Experience":
        """Collate a list of items into an ``Experience`` (no device move)."""
        experience = make_experience_batch(batch, self.packing_samples)
        return experience

    def normalize(self, attribute: str, strategy) -> None:
        """Standardise ``advantages`` over all stored items, in place.

        Mean and variance are computed over the positions selected by each
        item's ``action_mask`` (all positions when the masks are ``None``).
        When ``strategy.args.multi_attempt`` or ``strategy.args.summary`` is
        truthy, positions flagged in ``sys_mask`` are excluded as well. The sum,
        the count and the squared-deviation sum are each passed through
        ``strategy.all_reduce(..., "sum")``.

        Every item's attribute is then replaced by ``(value - mean) * rstd``,
        including positions that were excluded from the statistics.

        Args:
            attribute: Must be ``"advantages"``.
            strategy: Object exposing ``args.multi_attempt``, ``args.summary``
                and ``all_reduce(tensor, "sum")``.
        """
        assert attribute == "advantages"
        items = []
        action_masks = []

        for item in self:
            items.append(getattr(item, attribute))
            action_masks.append(item.action_mask)

        exclude_sys = strategy.args.multi_attempt or strategy.args.summary
        if exclude_sys:
            sys_masks = torch.cat([item.sys_mask for item in self]).float().flatten()

        items_vector = torch.cat(items).float().flatten()

        if action_masks[0] is None:
            # packing samples has no action mask
            action_masks_vector = torch.ones(items_vector.shape, device=items_vector.device, dtype=torch.bool)
        else:
            action_masks_vector = torch.cat(action_masks).flatten()
        if exclude_sys:
            action_masks_vector = torch.logical_and(action_masks_vector, torch.logical_not(sys_masks))
        num_actions = action_masks_vector.sum()

        # for DP
        # mean
        # TODO: check do we need to use all_reduce here
        items_vector = items_vector * action_masks_vector
        sum_and_count = torch.tensor([items_vector.sum(), num_actions], device=items_vector.device)
        all_sum, all_count = strategy.all_reduce(sum_and_count, "sum")
        mean = all_sum / all_count
        # std
        std = ((items_vector - mean).pow(2) * action_masks_vector).sum()
        all_std = strategy.all_reduce(std, "sum")
        # Clamp the variance so a near-constant batch cannot blow up the scale.
        rstd = (all_std / all_count).clamp(min=1e-8).rsqrt()

        for i, item in enumerate(self):
            setattr(item, attribute, (items[i] - mean) * rstd)
