# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import hashlib
import json
import logging
import typing as tp
from concurrent.futures import ProcessPoolExecutor
from contextlib import contextmanager
from functools import lru_cache, wraps
from pathlib import Path

import flashy
import flashy.distrib
import omegaconf
import torch
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

logger = logging.getLogger(__name__)


def model_hash(model: torch.nn.Module) -> str:
    """Compute a SHA-1 fingerprint of all parameters of a model.

    The raw bytes of every parameter, visited in the order yielded by
    ``model.parameters()``, are fed into a single SHA-1 digest. Logging this
    value makes it possible to spot regressions in model initialization by
    comparing against the logs of past experiments.

    Args:
        model (torch.nn.Module): Model whose parameters are hashed.
    Returns:
        str: Hexadecimal SHA-1 digest of the concatenated parameter bytes.
    """
    digest = hashlib.sha1()
    for param in model.parameters():
        # Go through numpy on CPU to obtain the raw byte representation.
        raw_bytes = param.data.cpu().numpy().tobytes()
        digest.update(raw_bytes)
    return digest.hexdigest()


def dict_from_config(cfg: omegaconf.DictConfig) -> dict:
    """Turn an omegaconf configuration into a plain Python dictionary.

    The conversion is done with ``resolve=True``, so interpolations are
    resolved in the returned container.

    Args:
        cfg (omegaconf.DictConfig): Configuration to convert.
    Returns:
        dict: The configuration as a dictionary.
    """
    container = omegaconf.OmegaConf.to_container(cfg, resolve=True)
    # to_container can return other container types; we only expect a mapping here.
    assert isinstance(container, dict)
    return container


def random_subset(dataset, max_samples: int, seed: int = 42) -> torch.utils.data.Subset:
    """Pick a reproducible random subset of at most ``max_samples`` items.

    Args:
        dataset: Dataset supporting ``len()``.
        max_samples (int): Maximum number of samples to keep.
        seed (int): Seed of the generator used to shuffle the indices.
    Returns:
        The dataset itself (unchanged, not wrapped) when it holds no more than
        ``max_samples`` items, otherwise a ``torch.utils.data.Subset`` over
        ``max_samples`` randomly chosen indices.
    """
    total = len(dataset)
    if max_samples >= total:
        return dataset

    rng = torch.Generator()
    rng.manual_seed(seed)
    shuffled = torch.randperm(total, generator=rng)
    kept_indices = shuffled[:max_samples].tolist()
    return torch.utils.data.Subset(dataset, kept_indices)


def get_loader(
    dataset,
    num_samples: tp.Optional[int],
    batch_size: int,
    num_workers: int,
    seed: int,
    disable_batch_sampler: bool = False,
    **kwargs,
) -> torch.utils.data.DataLoader:
    """Build a dataloader for a dataset, optionally restricted to a random subset.

    Args:
        dataset: Dataset to load.
        num_samples (Optional[int]): When not None, the dataset is first reduced
            with `random_subset` to at most this many samples.
        batch_size (int): Batch size.
        num_workers (int): Number of workers for data loading.
        seed (int): Random seed used for the subset selection.
        disable_batch_sampler (bool): When exactly ``True``, a plain
            ``torch.utils.data.DataLoader`` is created; in every other case the
            loader is created through ``flashy.distrib.loader``.
        **kwargs: Extra keyword arguments forwarded to the loader constructor.
    Returns:
        torch.utils.data.DataLoader: The created dataloader.
    """
    if num_samples is not None:
        dataset = random_subset(dataset, num_samples, seed)

    # Only the selected branch of the conditional is evaluated.
    loader_factory = (
        DataLoader if disable_batch_sampler is True else flashy.distrib.loader
    )
    return loader_factory(
        dataset, batch_size=batch_size, num_workers=num_workers, **kwargs
    )


def get_dataset_from_loader(dataloader):
    """Return the dataset behind a dataloader, unwrapping one ``Subset`` level.

    Args:
        dataloader: Object exposing a ``dataset`` attribute.
    Returns:
        The wrapped dataset when ``dataloader.dataset`` is a
        ``torch.utils.data.Subset``, otherwise ``dataloader.dataset`` itself.
    """
    source = dataloader.dataset
    return source.dataset if isinstance(source, torch.utils.data.Subset) else source


def check_finish(
    gen_sequence: torch.Tensor, eos_token_id: int
) -> tp.Optional[torch.Tensor]:
    """Check whether every batch item already produced EOS in its last layer.

    Args:
        gen_sequence (torch.Tensor): 3-dimensional tensor of generated tokens,
            unpacked as [B, K, S].
        eos_token_id (int): Token id marking the end of a sequence.
    Returns:
        Optional[torch.Tensor]: A [B, K, 1] tensor filled with `eos_token_id`
            when all batch items are finished, otherwise None.
    """
    batch_size, num_layers, _ = gen_sequence.shape
    # A batch item is finished once EOS shows up anywhere in its last layer.
    finished = (gen_sequence[:, -1] == eos_token_id).any(dim=-1)
    if not bool(finished.all()):
        return None
    # Everything is finished: return EOS only, to avoid wasted computation.
    eos = torch.tensor([eos_token_id], device=gen_sequence.device)
    return eos.repeat(batch_size, num_layers, 1)


def multinomial(
    input: torch.Tensor, num_samples: int, replacement=False, *, generator=None
):
    """Run ``torch.multinomial`` on a tensor with any number of leading dimensions.

    The candidates are expected on the last dimension; all leading dimensions
    are flattened for sampling and restored afterwards.

    Args:
        input (torch.Tensor): The input tensor containing probabilities.
        num_samples (int): Number of samples to draw.
        replacement (bool): Whether to draw with replacement or not.
    Keywords args:
        generator (torch.Generator): A pseudorandom number generator for sampling.
    Returns:
        torch.Tensor: Tensor with the leading dimensions of `input` and a last
            dimension holding the `num_samples` sampled indices.
    """
    leading_shape = list(input.shape[:-1])
    num_candidates = input.shape[-1]
    flat_probs = input.reshape(-1, num_candidates)
    flat_samples = torch.multinomial(
        flat_probs,
        num_samples=num_samples,
        replacement=replacement,
        generator=generator,
    )
    return flat_samples.reshape(*leading_shape, -1)


def repetitioin_penalty_process(
    penalty: float,
    input_ids: torch.Tensor,
    logits: torch.Tensor,
    out_of_index_ids: tp.Optional[tp.Sequence[int]] = None,
) -> torch.Tensor:
    """Apply a repetition penalty to the logits of tokens that already appeared.

    For every id found in `input_ids`, the matching logit is divided by
    `penalty` when it is non-negative and multiplied by `penalty` when it is
    negative. The input `logits` tensor is not modified.

    Args:
        penalty (float): Strictly positive penalty factor.
        input_ids (torch.Tensor): Integer ids seen so far; indexed along the
            last dimension, leading dimensions must match those of `logits`.
        logits (torch.Tensor): Logits with the candidates on the last dimension.
        out_of_index_ids (Optional[Sequence[int]]): Ids to ignore (e.g. special
            or placeholder tokens). Ids outside ``[0, card)`` are always ignored,
            whether listed here or not.
    Returns:
        torch.Tensor: Penalized logits, same shape as `logits`.
    """
    assert penalty > 0, "Repetition penalty must be strictly positive."
    card = logits.size(-1)
    # Never index outside of the vocabulary: anything out of range is ignored,
    # which also protects against ids the caller forgot to list.
    valid_indices = (input_ids >= 0) & (input_ids < card)
    for out_of_index_id in out_of_index_ids or ():
        valid_indices &= input_ids != out_of_index_id
    # Ignored ids are redirected to one extra padding slot at index `card`.
    safe_input_ids = torch.where(
        valid_indices, input_ids, torch.zeros_like(input_ids) + card
    )
    padded_logits = F.pad(logits, (0, 1), value=0.0)  # extra slot for ignored ids
    # Fetch the current logits of the ids that appeared so far.
    seen_logits = torch.gather(padded_logits, -1, safe_input_ids)
    # Penalize them: push negative logits further down, shrink the others.
    seen_logits = torch.where(
        seen_logits < 0, seen_logits * penalty, seen_logits / penalty
    )
    # Write the penalized values back, then drop the padding slot.
    padded_logits.scatter_(-1, safe_input_ids, seen_logits)
    return padded_logits[..., :-1]


def top_k_process(probs: torch.Tensor, k: int) -> torch.Tensor:
    """Keep only the top K probabilities along the last dimension and renormalize.

    Every entry smaller than the k-th largest value of its row is set to zero
    (entries tied with the k-th value are kept), then each row is rescaled to
    sum to one. The operation is done in place on `probs`.

    Args:
        probs (torch.Tensor): Input probabilities with token candidates on the last dimension.
        k (int): The k in “top-k”.
    Returns:
        torch.Tensor: The same `probs` tensor, filtered and renormalized.
    """
    kth_largest = torch.topk(probs, k, dim=-1).values[..., -1:]
    keep = (probs >= kth_largest).float()
    probs.mul_(keep)
    probs.div_(probs.sum(dim=-1, keepdim=True))
    return probs


def top_p_process(probs: torch.Tensor, p: float) -> torch.Tensor:
    """Keep only the top P probability mass along the last dimension and renormalize.

    Candidates are sorted by decreasing probability; a candidate is dropped
    when the cumulative mass of the candidates ranked before it already
    exceeds `p`. The remaining probabilities are rescaled to sum to one and
    put back in their original positions. `probs` itself is left untouched.

    Args:
        probs (torch.Tensor): Input probabilities with token candidates on the last dimension.
        p (float): The p in “top-p”, must be below 1.
    Returns:
        torch.Tensor: New tensor with the filtered and renormalized probabilities.
    """
    assert p < 1.0, "Use p value less than 1 for eos process"
    sorted_probs, sorted_idx = torch.sort(probs, dim=-1, descending=True)
    cumulative = torch.cumsum(sorted_probs, dim=-1)
    # Mass accumulated strictly before each candidate decides whether it is dropped.
    drop = (cumulative - sorted_probs) > p
    sorted_probs.mul_((~drop).float())
    sorted_probs.div_(sorted_probs.sum(dim=-1, keepdim=True))
    # Undo the sort so values line up with the original candidate order.
    filtered = torch.zeros_like(sorted_probs)
    filtered.scatter_(dim=-1, index=sorted_idx, src=sorted_probs)
    return filtered


def eos_process(
    input_ids: torch.Tensor,
    probs: torch.Tensor,
    eos_id: int,
) -> torch.Tensor:
    """Force or forbid EOS per layer depending on the tokens generated so far.

    Requirements:
    - Layer K = 0: once EOS appeared in layer 0, only EOS can be emitted.
    - Layer K > 0: EOS is emitted (with probability 1) when it already
      appeared in layer K - 1, and is forbidden (probability 0) otherwise.

    `probs` is modified in place.

    Args:
        input_ids (torch.Tensor): Tokens so far, indexed as [batch, layer, ...].
        probs (torch.Tensor): Probabilities, indexed as [batch, layer, token].
        eos_id (int): Token id marking the end of a sequence.
    Returns:
        torch.Tensor: The same `probs` tensor after the update.
    """
    for b in range(input_ids.size(0)):
        for k in range(probs.size(1)):
            # Layer 0 looks at its own history, layer k > 0 at the layer below.
            watched_layer = k - 1 if k > 0 else 0
            eos_seen = bool((input_ids[b, watched_layer] == eos_id).any())
            if eos_seen:
                # EOS already appeared in the watched layer: emit EOS only.
                probs[b, k] = 0.0
                probs[b, k, eos_id] = 1.0
            elif k > 0:
                # No EOS in layer k - 1 yet: layer k must not emit it.
                probs[b, k, eos_id] = 0.0
    return probs


def check_probs(
    probs: torch.Tensor,
    ng_tokens: tp.List[int],
) -> torch.Tensor:
    """Repair rows of `probs` that contain no strictly positive entry.

    For each [batch, layer] row without any positive value, a warning is
    logged, the whole row is set to 1.0 and the entries listed in `ng_tokens`
    are then set back to 0.0. `probs` is modified in place.

    Args:
        probs (torch.Tensor): Probabilities, indexed as [batch, layer, token].
        ng_tokens (List[int]): Token ids that must keep a zero probability.
    Returns:
        torch.Tensor: The same `probs` tensor after the repair.
    """
    for b in range(probs.size(0)):
        for k in range(probs.size(1)):
            if bool((probs[b, k] > 0).any()):
                # At least one candidate can be sampled, nothing to repair.
                continue
            logger.warning(f"layer {k}'s probs is all zeros. output token randomly")
            # Fall back to equal weights over everything but the forbidden tokens.
            probs[b, k] = 1.0
            for _id in ng_tokens:
                probs[b, k, _id] = 0.0
    return probs


class DummyPoolExecutor:
    """In-process stand-in for a pool executor, for when only 1 worker is used
    (e.g. instead of ProcessPoolExecutor).

    Submitted work is not run at submission time: it runs in the calling
    process each time `result()` is called on the returned object.
    """

    class DummyResult:
        """Future-like handle storing a call and executing it on demand."""

        def __init__(self, func, *args, **kwargs):
            self.func = func
            self.args = args
            self.kwargs = kwargs

        def result(self):
            """Execute the stored call and return its value."""
            return self.func(*self.args, **self.kwargs)

    def __init__(self, workers, mp_context=None):
        """Accept the pool-executor arguments; both are ignored."""

    def submit(self, func, *args, **kwargs):
        """Wrap the call in a `DummyResult` without executing it."""
        return DummyPoolExecutor.DummyResult(func, *args, **kwargs)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, exc_tb):
        # Nothing to release; returning None lets exceptions propagate.
        return None


def get_pool_executor(num_workers: int, mp_context=None):
    """Create the executor matching the requested number of workers.

    Args:
        num_workers (int): Number of workers.
        mp_context: Multiprocessing context handed to `ProcessPoolExecutor`.
    Returns:
        A `ProcessPoolExecutor` when more than one worker is requested,
        otherwise a single-worker `DummyPoolExecutor`.
    """
    if num_workers > 1:
        return ProcessPoolExecutor(num_workers, mp_context)
    return DummyPoolExecutor(1)


def length_to_mask(
    lengths: torch.Tensor, max_len: tp.Optional[int] = None
) -> torch.Tensor:
    """Utility function to convert a tensor of sequence lengths to a mask (useful when working on padded sequences).
    For example: [3, 5] => [[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]]

    Args:
        lengths (torch.Tensor): 1-dimensional tensor with lengths. May be empty.
        max_len (int): can set the max length manually. Defaults to None, in
            which case (as for 0) the largest value of `lengths` is used.
    Returns:
        torch.Tensor: boolean mask of shape [len(lengths), final_length], False
            where there are pad tokens and True elsewhere. `final_length` is at
            least 1.
    """
    assert len(lengths.shape) == 1, "Length shape should be 1 dimensional."
    if max_len:
        final_length = max_len
    elif lengths.numel() > 0:
        final_length = lengths.max().item()
    else:
        # `max()` is undefined on an empty tensor; fall back to the minimum width.
        final_length = 0
    final_length = max(
        final_length, 1
    )  # if all seqs are of len zero we don't want a zero-size tensor
    # Build the positions directly on the target device instead of copying from CPU.
    positions = torch.arange(final_length, device=lengths.device)
    return positions[None, :] < lengths[:, None]


def hash_trick(word: str, vocab_size: int) -> int:
    """Map a word to an index with the hashing trick.

    The SHA-256 digest of the UTF-8 encoded word is read as one big integer
    and reduced modulo `vocab_size`.

    Args:
        word (str): word we wish to convert to an index
        vocab_size (int): size of the vocabulary
    Returns:
        int: index of the word in the embedding LUT
    """
    digest = hashlib.sha256(word.encode("utf-8")).digest()
    # Big-endian interpretation of the digest equals parsing its hex form.
    return int.from_bytes(digest, "big") % vocab_size


def with_rank_rng(base_seed: int = 1234):
    """Decorator factory running a function under a rank-dependent torch seed.

    While the decorated function runs, torch is seeded with
    ``base_seed ^ flashy.distrib.rank()``. The state captured beforehand with
    ``torch.get_rng_state()`` is put back once the function returns or raises.

    NOTE(review): only the state returned by ``torch.get_rng_state()`` is
    saved and restored here; whether other device generators affected by
    ``torch.manual_seed`` need the same treatment should be confirmed.

    Args:
        base_seed (int): Random seed.
    """

    def _decorator(fun: tp.Callable):
        @wraps(fun)
        def _wrapper(*args, **kwargs):
            saved_state = torch.get_rng_state()
            rank_seed = base_seed ^ flashy.distrib.rank()
            torch.manual_seed(rank_seed)
            logger.debug("Rank dependent seed set to %d", rank_seed)
            try:
                result = fun(*args, **kwargs)
            finally:
                # Runs on both success and failure of the wrapped call.
                torch.set_rng_state(saved_state)
                logger.debug("RNG state restored.")
            return result

        return _wrapper

    return _decorator


def collate(
    tensors: tp.List[torch.Tensor], dim: int = 0
) -> tp.Tuple[torch.Tensor, torch.Tensor]:
    """Get a list of tensors and collate them to a single tensor. according to the following logic:
    - `dim` specifies the time dimension which will be stacked and padded.
    - The output will contain 1 new dimension (dimension index 0) which will be the size of
    of the original list.

    The output layout is the same whether or not padding was needed: the time
    dimension ends up at index `dim + 1`.

    Args:
        tensors (tp.List[torch.Tensor]): List of tensors to collate.
        dim (int): Dimension which will be stacked and padded. Negative values
            count from the last dimension of the input tensors.
    Returns:
        tp.Tuple[torch.Tensor, torch.Tensor]:
            torch.Tensor: Stacked and padded tensor. The output will contain 1 new dimension
                (dimension index 0) which will be the size of the original list.
            torch.Tensor: Tensor containing length of original tensor sizes (without padding).
    """
    if dim < 0 and tensors:
        # Normalize so that `dim + 1` below addresses the right output dimension.
        dim += tensors[0].dim()
    # Bring the time dimension to the front so it can be measured and padded.
    tensors = [x.transpose(0, dim) for x in tensors]
    lens = torch.LongTensor([len(x) for x in tensors])
    if torch.all(lens == lens[0]).item():
        # Same length everywhere: no padding needed, stacking gives [B, T, ...].
        padded_tensors = torch.stack(tensors)
    else:
        # pad_sequence returns [T, B, ...]; move the batch dimension first.
        padded_tensors = pad_sequence(tensors).transpose(0, 1)
    # Move the time dimension back to where it was, for both branches.
    padded_tensors = padded_tensors.transpose(1, dim + 1)
    return padded_tensors, lens


# TODO: Move to flashy?
def copy_state(
    state: tp.Any,
    device: tp.Union[torch.device, str] = "cpu",
    dtype: tp.Optional[torch.dtype] = None,
) -> tp.Any:
    """Recursively copy a state (tensor, or dicts / lists of them) to a device.

    Args:
        state (tp.Any): Tensor, dict, list, or any other value.
        device (torch.device or str): Device the tensor copies are placed on.
        dtype (Optional[torch.dtype]): Target dtype, applied to floating point
            tensors only; other tensors keep their dtype.
    Returns:
        tp.Any: Detached copies for tensors, new containers for dicts and
            lists. Any other value (numbers, strings, None, tuples, ...) is
            returned as is, without copying, instead of being dropped.
    """
    if isinstance(state, torch.Tensor):
        if dtype is None or not state.is_floating_point():
            # Never cast integer / bool tensors to the requested float dtype.
            dtype = state.dtype
        return state.detach().to(device=device, dtype=dtype, copy=True)
    if isinstance(state, dict):
        return {k: copy_state(v, device, dtype) for k, v in state.items()}
    if isinstance(state, list):
        return [copy_state(v, device, dtype) for v in state]
    # Non-tensor leaves are kept rather than silently replaced by None.
    return state


# TODO: Move to flashy?
@contextmanager
def swap_state(model, state, **kwargs):
    """Context manager temporarily loading another state into a model.

    The current state dict is copied first, `state` is loaded for the
    duration of the `with` block, and the copy is loaded back on exit. The
    previous state is also put back when loading `state` itself fails, so a
    failed load does not leave the model partially overwritten.

    Args:
        model: Object exposing `state_dict()` and `load_state_dict()`.
        state: State to load while inside the context.
        **kwargs: Extra keyword arguments for the first `load_state_dict` call.
    """
    old_state = copy_state(model.state_dict())
    try:
        # Inside the `try` on purpose: a load that raises is rolled back too.
        model.load_state_dict(state, **kwargs)
        yield
    finally:
        model.load_state_dict(old_state)


@lru_cache(maxsize=None)
def warn_once(logger, msg):
    """Emit `msg` as a warning on `logger`, once per (logger, msg) pair.

    Repeated calls with the same arguments are answered from the cache and
    do not log again. Both arguments therefore have to be hashable.
    """
    logger.warning(msg)


def is_jsonable(x: tp.Any):
    """Tell whether `json.dumps` accepts an object.

    Returns:
        bool: False when serialization raises TypeError or OverflowError,
            True when it succeeds. Other exceptions are not caught.
    """
    try:
        json.dumps(x)
    except (TypeError, OverflowError):
        return False
    return True


def load_clap_state_dict(clap_model, path: tp.Union[str, Path]):
    """Load a CLAP checkpoint into `clap_model.model`.

    Wrapper around the CLAP state dict loading that works around a
    compatibility issue between CLAP and the HuggingFace transformer version
    used by AudioCraft: the ``text_branch.embeddings.position_ids`` entry is
    removed from the loaded state (if present) before it is applied.
    See: https://github.com/LAION-AI/CLAP/issues/118

    Args:
        clap_model: Object whose `model` attribute receives the state dict.
        path (str or Path): Location of the checkpoint.
    """
    # Imported lazily so the module does not depend on CLAP at import time.
    from clap_module.factory import load_state_dict  # type: ignore

    state_dict = load_state_dict(path)
    state_dict.pop("text_branch.embeddings.position_ids", None)
    clap_model.model.load_state_dict(state_dict)


if __name__ == "__main__":
    # Manual smoke test of the sampling helpers; it prints intermediate
    # tensors and is not an automated test.
    card = 4
    n_q = 4
    # Every row puts all of its weight on the last candidate.
    dummy_logit = torch.zeros(2, n_q, card)
    dummy_logit[:, :, card - 1] = 1

    print("Dummy logits:")
    print(dummy_logit)

    print("Sample top 3:")
    print(top_k_process(dummy_logit, 3))

    print("Sample top p 0.8:")
    print(top_p_process(dummy_logit, 0.8))

    # for repetition penalty
    from audiocraft.modules.codebooks_patterns import DelayedPatternProvider

    pattern_provider = DelayedPatternProvider(n_q=n_q)

    prompt = torch.zeros((2, n_q, 1), dtype=torch.long) + 3
    B, K, T = prompt.shape
    start_offset = T
    max_gen_len = 10
    pattern = pattern_provider.get_pattern(max_gen_len)
    unknown_token = -1
    special_token_id = card
    eos_token_id = card - 1

    # Positions that are not generated yet hold the `unknown_token` placeholder.
    gen_codes = torch.full((B, K, max_gen_len), unknown_token, dtype=torch.long)
    gen_codes[..., :start_offset] = prompt
    gen_sequence, indexes, mask = pattern.build_pattern_sequence(
        gen_codes, special_token_id
    )
    start_offset_sequence = pattern.get_first_step_with_timesteps(start_offset)

    gen_sequence_len = gen_sequence.shape[-1]  # gen_sequence shape is [B, K, S]
    # Only a single generation step is exercised, and only if one exists.
    if start_offset_sequence < gen_sequence_len:
        dummy_logit = repetitioin_penalty_process(
            5.0,
            gen_sequence,
            dummy_logit,
            [unknown_token, special_token_id],
        )
        print("repetitioin_penalty_process:")
        print(dummy_logit)

        dummy_logit = eos_process(
            gen_sequence,
            dummy_logit,
            eos_token_id,
        )
        print("eos_process")
        print(dummy_logit)
