# Refer to the code in https://github.com/thunlp/InfLLM/blob/main/inf_llm/attention/context_manager.py

from copy import deepcopy
from typing import Optional, Tuple

import torch
from transformers.modeling_outputs import CausalLMOutput

try:
    from flash_attn import flash_attn_func as dense_decoding_func
except ImportError:
    from ..ops.flash_attn_triton import (
        _flash_attn_triton_decoding as dense_decoding_func,
    )

from ..ops.streaming_kernel import TritonMultiStageDotProductionAttention


class CudaCache:
    """Fixed pool of equally sized buffers addressed by slot index.

    ``data`` is one ``(num_units, unit_size)`` tensor created on the ``"cuda"``
    device; every row is one slot. ``idle_set`` holds the indices of the rows
    that are currently free.
    """

    def __init__(self, num_units, unit_size, dtype):
        self.num_units = num_units
        self.unit_size = unit_size
        self.dtype = dtype
        self.data = torch.empty((num_units, unit_size), device="cuda", dtype=dtype)
        self.idle_set = set(range(num_units))

    def alloc(self):
        """Reserve a free slot and return ``(row_view, slot_index)``.

        Fails an assertion when no slot is free.
        """
        assert self.idle_set
        slot = self.idle_set.pop()
        return self.data[slot], slot

    def delete(self, idx):
        """Return slot ``idx`` to the free set; it must currently be reserved."""
        assert idx not in self.idle_set
        self.idle_set.add(idx)


class MemoryUnit:
    """A (K, V) pair kept on the host, with an optional copy in a cache slot.

    The host copy (``cpu_data``) always exists. The device copy (``gpu_data``)
    lives in a slot borrowed from ``cache`` and is present only between a
    ``load`` (or construction with ``load_to_cache=True``) and the next
    ``offload``. ``event`` is the CUDA event recorded after the most recent
    copy into the slot.
    """

    def __init__(
        self,
        kv: Tuple[torch.Tensor, torch.Tensor],
        cache: CudaCache,
        load_to_cache: bool = False,
        pin_memory: bool = False,
    ):
        """Store ``kv`` on the host and optionally stage it in ``cache``.

        Args:
            kv: ``(key, value)`` tensors; both are made contiguous.
            cache: slot pool used for the device copy.
            load_to_cache: when true, also copy ``kv`` into a freshly
                allocated cache slot right away.
            pin_memory: when true, pin the host copies.
        """
        self.cache = cache

        if kv[0].is_cuda:
            cpu_data = tuple(_t.contiguous().to("cpu", non_blocking=True) for _t in kv)
        else:
            cpu_data = tuple(_t.contiguous() for _t in kv)

        if pin_memory:
            cpu_data = tuple(_t.pin_memory() for _t in cpu_data)

        if load_to_cache:
            gpu_data, gpu_data_id = cache.alloc()
            # A slot is a flat row; reinterpret it as stacked (K, V).
            gpu_data = gpu_data.view((2,) + kv[0].shape)
            gpu_data[0].copy_(kv[0], non_blocking=True)
            gpu_data[1].copy_(kv[1], non_blocking=True)
            event = torch.cuda.Event()
            event.record(torch.cuda.current_stream())
        else:
            gpu_data, gpu_data_id = None, None
            event = None

        self.cpu_data = cpu_data
        self.gpu_data = gpu_data
        self.gpu_data_id = gpu_data_id
        self.event = event

    def load(
        self, target: Optional[Tuple[torch.Tensor, torch.Tensor]] = None
    ) -> Tuple[bool, Optional["torch.cuda.Event"]]:
        """Make sure a device copy exists and optionally fill ``target``.

        Args:
            target: optional ``(key_out, value_out)`` tensors that receive a
                copy of this unit's data.

        Returns:
            ``(loaded, target_event)``. ``loaded`` is ``True`` when a new cache
            slot had to be allocated and ``False`` when the unit was already
            cached. ``target_event`` is an event recorded on the current
            stream right after the copies into ``target`` were issued, or
            ``None`` when no ``target`` was given.
        """
        if self.gpu_data is not None:
            target_event = None
            if target is not None:
                target[0].copy_(self.gpu_data[0], non_blocking=True)
                target[1].copy_(self.gpu_data[1], non_blocking=True)
                target_event = torch.cuda.Event()
                target_event.record(torch.cuda.current_stream())

            return False, target_event

        gpu_data, gpu_data_id = self.cache.alloc()
        gpu_data = gpu_data.view((2,) + self.cpu_data[0].shape)
        if target is not None:
            # Fill the caller's buffer from the host first, then populate the
            # cache slot from that buffer.
            target[0].copy_(self.cpu_data[0], non_blocking=True)
            target[1].copy_(self.cpu_data[1], non_blocking=True)
            target_event = torch.cuda.Event()
            target_event.record(torch.cuda.current_stream())
            gpu_data[0].copy_(target[0], non_blocking=True)
            gpu_data[1].copy_(target[1], non_blocking=True)
        else:
            # No caller buffer: there is nothing to signal. Without this
            # binding the return below raised UnboundLocalError.
            target_event = None
            gpu_data[0].copy_(self.cpu_data[0], non_blocking=True)
            gpu_data[1].copy_(self.cpu_data[1], non_blocking=True)

        event = torch.cuda.Event()
        event.record(torch.cuda.current_stream())
        self.event = event
        self.gpu_data = gpu_data
        self.gpu_data_id = gpu_data_id

        return True, target_event

    def get(self):
        """Return the cached ``(2, ...)`` device tensor.

        The unit must be cached. ``event.wait()`` is called first so the
        current stream is ordered after the copy that filled the slot.
        """
        assert self.gpu_data is not None
        self.event.wait()
        return self.gpu_data

    def offload(self):
        """Drop the device copy and hand its slot back to the cache."""
        assert self.gpu_data is not None
        self.event.wait()
        self.gpu_data = None
        self.cache.delete(self.gpu_data_id)
        self.gpu_data_id = None


class VectorTensor:
    """Growable 2-D tensor of row vectors with inner-product top-k lookup.

    ``data`` is a ``(cache_size, hidden_size)`` buffer of which only the first
    ``length`` rows are valid. The buffer doubles in capacity whenever an
    append would overflow it.
    """

    def __init__(self, hidden_size, element_dtype):
        init_cached_size = 16
        self.data = torch.empty(
            (init_cached_size, hidden_size), dtype=element_dtype, device="cuda"
        )
        self.length = 0
        self.cache_size = init_cached_size
        self.hidden_size = hidden_size

    def append_cache(self):
        """Double the buffer capacity, keeping the existing contents.

        The new storage is allocated on the device the buffer currently lives
        on. ``append`` may have moved the buffer away from the default CUDA
        device, so allocating on ``"cuda"`` unconditionally would force a
        cross-device copy on every growth.
        """
        new_cache_size = self.cache_size * 2
        new_data = torch.empty(
            (new_cache_size,) + self.data.shape[1:],
            device=self.data.device,
            dtype=self.data.dtype,
        )
        new_data[: self.cache_size, ...].copy_(self.data)
        self.data = new_data
        self.cache_size = new_cache_size

    def append(self, tensor: torch.Tensor):
        """Append the rows of ``tensor`` (shape ``(n, hidden_size)``).

        ``tensor`` must be contiguous and have the buffer's dtype. After the
        call the buffer lives on ``tensor.device``.
        """
        assert tensor.dtype == self.data.dtype
        assert tensor.size(1) == self.hidden_size
        assert tensor.is_contiguous()

        append_l = tensor.size(0)

        # Follow the incoming tensor's device before growing or copying so
        # that any reallocation happens where the data will end up.
        if self.data.device != tensor.device:
            self.data = self.data.to(tensor.device)

        while self.length + append_l > self.cache_size:
            self.append_cache()

        self.data[self.length : self.length + append_l, ...].copy_(tensor)
        self.length += append_l

    def get_data(self):
        """Return a view of the valid rows, shape ``(length, hidden_size)``."""
        return self.data[: self.length, ...]

    def get_topk(self, tensor: torch.Tensor, topk):  # inner product
        """Return the indices of the ``topk`` rows with the largest inner product.

        ``tensor`` is a 1-D query of size ``hidden_size``. The result is a
        Python list ordered from best to worst match. ``topk`` must not
        exceed ``len(self)``.
        """
        assert tensor.dim() == 1 and tensor.size(0) == self.hidden_size
        logits = torch.matmul(self.data[: self.length], tensor[:, None]).squeeze(dim=-1)
        assert logits.dim() == 1 and logits.size(0) == self.length
        return logits.topk(topk, dim=0).indices.cpu().tolist()

    def __len__(self):
        return self.length


class Faiss:
    """Inner-product vector index backed by a CPU ``faiss.IndexFlatIP``.

    Exposes the same ``append`` / ``get_topk`` / ``__len__`` surface as
    ``VectorTensor``; ``get_data`` is not supported. ``element_dtype`` is
    accepted but not used: vectors are converted to float32 before they are
    handed to faiss.
    """

    def __init__(self, hidden_size, element_dtype):
        import faiss

        self.hidden_size = hidden_size
        # We use the CPU index here because the GPU index requires a long initialization time
        self.index = faiss.IndexFlatIP(hidden_size)

    @staticmethod
    def _to_float32_numpy(tensor: torch.Tensor):
        """Convert ``tensor`` to a float32 NumPy array on the host."""
        return tensor.cpu().float().numpy().astype("float32")

    def append(self, tensor: torch.Tensor):
        """Add the rows of a ``(n, hidden_size)`` tensor to the index."""
        assert tensor.dim() == 2 and tensor.size(1) == self.hidden_size
        self.index.add(self._to_float32_numpy(tensor))

    def get_data(self):
        """Not available for the faiss-backed index; always raises ``ValueError``."""
        raise ValueError

    def get_topk(self, tensor: torch.Tensor, topk):
        """Return the ids of the ``topk`` best matches for a 1-D query, as a list."""
        assert tensor.dim() == 1 and tensor.size(0) == self.hidden_size
        query = self._to_float32_numpy(tensor[None, :])
        _, neighbor_ids = self.index.search(query, topk)
        return neighbor_ids[0].tolist()

    def __len__(self):
        return self.index.ntotal


# Module-wide CUDA stream shared by all ContextManager instances. It stays
# None until a ContextManager is constructed with async_global_stream=True,
# at which point a single torch.cuda.Stream is created and reused.
GLOBAL_STREAM = None


class ContextManager:
    def __init__(
        self,
        position_embedding,
        n_init,
        n_local,
        block_size,
        max_cached_block,
        topk,
        exc_block_size,
        score_decay: Optional[float] = None,
        repr_topk: int = 1,
        cache_strategy="lru",
        chunk_topk_calc: Optional[int] = None,
        async_global_stream: bool = False,
        pin_memory: bool = False,
        faiss: bool = False,
        perhead: bool = False,
        dense_decoding: bool = False,
    ):
        """Record the configuration; tensors are allocated later by ``init``.

        Args:
            position_embedding: rotary embedding object applied to local q/k.
            n_init: maximum number of initial tokens kept in ``init_k``/``init_v``.
            n_local: sliding-window size used for local attention.
            block_size: number of tokens per global memory block.
            max_cached_block: per-unit cap on blocks kept in the GPU cache.
            topk: number of global blocks selected per step.
            exc_block_size: chunk length used when stepping through the input;
                must not exceed ``n_local``.
            score_decay: factor applied to cached block scores before new
                scores are added (used by the "lru-s" strategy).
            repr_topk: number of representative keys picked per block.
            cache_strategy: ``"lru"`` or ``"lru-s"``.
            chunk_topk_calc: when set, top-k selection is computed in chunks
                of this many tokens for multi-token inputs.
            async_global_stream: run global-memory work on a shared side stream.
            pin_memory: forwarded to ``MemoryUnit`` for host copies.
            faiss: use the faiss-backed index for block representatives.
            perhead: treat every attention head as its own unit.
            dense_decoding: additionally keep the full dense K/V.
        """
        assert exc_block_size <= n_local  # no global token in input

        # Static configuration.
        self.position_embedding = position_embedding
        self.n_init = n_init
        self.n_local = n_local
        self.block_size = block_size
        self.max_cached_block = max_cached_block
        self.exc_block_size = exc_block_size
        self.topk = topk
        self.score_decay = score_decay
        self.repr_topk = repr_topk
        self.cache_strategy = cache_strategy
        self.chunk_topk_calc = chunk_topk_calc
        self.async_global_stream = async_global_stream
        self.pin_memory = pin_memory
        self.faiss = faiss
        self.perhead = perhead
        self.dense_decoding = dense_decoding
        self.Attn = TritonMultiStageDotProductionAttention

        # Mutable bookkeeping.
        self.length = 0
        self.load_count = 0
        self.initialized = False

        # Lazily create the side stream shared by every manager.
        global GLOBAL_STREAM
        if self.async_global_stream and GLOBAL_STREAM is None:
            GLOBAL_STREAM = torch.cuda.Stream()

        assert cache_strategy in ["lru", "lru-s"]

        # Only the score-based strategy needs attention scores per block.
        self.calc_block_score = cache_strategy == "lru-s"

    def remove_lru_blocks(
        self, u, num_remove: Optional[int] = None, ignore_blocks=None
    ):
        """Evict the lowest-valued cached blocks of unit ``u``.

        Args:
            u: unit index.
            num_remove: how many blocks to evict; defaults to the number by
                which the unit exceeds ``max_cached_block``.
            ignore_blocks: block ids that must not be evicted.

        Blocks are visited in ascending order of their value in
        ``cached_blocks[u]``; each evicted block is offloaded and removed from
        the mapping. Fewer than ``num_remove`` blocks are evicted when not
        enough candidates exist.
        """
        cached = self.cached_blocks[u]
        if num_remove is None:
            num_remove = len(cached) - self.max_cached_block

        if num_remove <= 0:
            return

        evicted = 0
        # sorted() snapshots the items, so popping while looping is safe.
        for block_id, _ in sorted(cached.items(), key=lambda item: item[1]):
            if ignore_blocks is None or block_id not in ignore_blocks:
                self.global_blocks[u][block_id].offload()
                cached.pop(block_id)
                evicted += 1

            if evicted >= num_remove:
                return

    def get_block_k(self, k, score):
        assert isinstance(score, torch.Tensor)
        assert k.dim() >= 2
        k = self.from_group_kv(k)
        assert k.shape[:-1] == score.shape
        assert k.shape[-2] == self.block_size
        score_topk = score.topk(self.repr_topk, dim=-1).indices
        assert score_topk.shape == (self.num_units, self.unit_size, self.repr_topk)
        ret = torch.gather(
            k,
            -2,
            score_topk[:, :, :, None].expand(
                self.num_units, self.unit_size, self.repr_topk, self.dim_head
            ),
        )
        return ret

    def from_group_kv(self, tensor):
        assert tensor.dim() == 4
        assert tensor.size(1) == self.num_heads_kv
        if self.num_heads == self.num_heads_kv:
            return tensor
        _, _, length, dim_head = tensor.shape
        num_group = self.num_heads // self.num_heads_kv
        tensor = tensor.view((self.num_units, self.unit_size_kv, 1, length, dim_head))
        tensor = tensor.expand(
            (self.num_units, self.unit_size_kv, num_group, length, dim_head)
        ).reshape((self.num_units, self.num_heads, length, dim_head))
        return tensor

    def init(self, local_q, local_k, local_v, global_q, global_k, global_v):
        """Allocate all per-sequence state from the first batch of inputs.

        All six tensors must be 4-D CUDA tensors sharing batch size, sequence
        length and head dimension; dim 1 is either the query-head count
        (taken from ``local_q``) or the KV-head count (taken from ``local_k``).
        Sets ``initialized`` to ``True`` once everything is in place.
        """
        assert local_q.dim() == 4
        batch_size, num_heads, len_q, dim_head = local_q.shape
        num_heads_kv = local_k.size(1)

        for _t in (local_q, local_k, local_v, global_q, global_k, global_v):
            assert _t.size(0) == batch_size
            assert _t.size(1) in (num_heads, num_heads_kv)
            assert _t.size(2) == len_q
            assert _t.size(3) == dim_head
            assert _t.is_cuda

        # Geometry: one "unit" per batch entry, one slot per head.
        self.batch_size = batch_size
        self.num_heads = num_heads
        self.num_heads_kv = num_heads_kv
        self.dim_head = dim_head
        self.num_units = batch_size
        self.unit_size = num_heads
        self.unit_size_kv = num_heads_kv

        units = range(self.num_units)
        empty_kv_shape = (self.num_units, self.unit_size_kv, 0, dim_head)

        def empty_kv(like):
            # Zero-length K/V placeholder matching dtype/device of `like`.
            return torch.empty(empty_kv_shape, dtype=like.dtype, device=like.device)

        self.global_blocks = [[] for _ in units]  # [[memory_unit]]
        self.cached_blocks = [{} for _ in units]  # [{block_id: block_score}]
        self.num_global_block = 0

        # Index of block representatives, one per unit.
        index_cls = Faiss if self.faiss else VectorTensor
        self.block_k = [
            index_cls(dim_head * self.unit_size, global_k.dtype) for _ in units
        ]

        self.local_k = empty_kv(local_k)
        self.local_v = empty_kv(local_v)

        if self.dense_decoding:
            self.dense_k = empty_kv(local_k)
            self.dense_v = empty_kv(local_v)

        self.global_remainder = (empty_kv(global_k), empty_kv(global_v))

        self.global_remainder_local_score = torch.empty(
            (self.num_units, self.unit_size, 0),
            dtype=global_k.dtype,
            device=global_k.device,
        )

        self.init_k = empty_kv(global_k)
        # init_v takes its dtype/device from global_k as well.
        self.init_v = empty_kv(global_k)
        self.init_exc = False
        self.dtype = local_q.dtype
        self.position_embedding._update_cos_sin_tables_len(
            self.n_local + self.exc_block_size + 1, local_k.device, local_k.dim()
        )

        # Staging buffer: top-k blocks + one query chunk + one block + init tokens.
        buffer_len = (
            self.topk * self.block_size
            + self.exc_block_size
            + self.block_size
            + self.n_init
        )
        self.global_buffer = torch.zeros(
            (2, self.num_units, self.unit_size_kv, buffer_len, dim_head),
            dtype=global_k.dtype,
            device=global_k.device,
        )
        # -1 marks a buffer slot that holds no block yet.
        self.global_buffer_block_id_list = [[-1] * self.topk for _ in units]
        self.global_buffer_init_st = 0
        self.global_buffer_init_ed = 0
        self.cuda_cache = CudaCache(
            self.max_cached_block * self.num_units,
            self.unit_size_kv * self.block_size * dim_head * 2,
            local_k.dtype,
        )

        self.initialized = True

    def calc_block_topk(self, global_h_q):
        if not self._use_chunk_topk:
            if self.num_global_block <= self.topk:
                return [
                    list(range(len(self.global_blocks[0])))
                    for _ in range(self.num_units)
                ]

            global_h_q = global_h_q.mean(dim=2, keepdim=False)
            assert global_h_q.shape == (self.num_units, self.unit_size, self.dim_head)
            global_h_q = global_h_q.reshape(
                self.num_units, self.dim_head * self.unit_size
            )
            ret = []
            for u in range(self.num_units):
                ret.append(self.block_k[u].get_topk(global_h_q[u], self.topk))

        else:
            return self._cached_topk[self._topk_cur]

        return ret

    def get_global_hidden_and_mask(self, len_q, block_topk):
        """Assemble the global K/V staging buffer for one attention step.

        The buffer (``self.global_buffer``; K in index 0, V in index 1) is
        filled along its sequence axis in this order:

        1. the selected blocks, one ``block_size`` slot each;
        2. the init tokens (``init_k`` / ``init_v``);
        3. a leading slice of the global remainder.

        Args:
            len_q: number of query tokens in the current chunk.
            block_topk: per-unit lists of selected block ids; every list must
                have the same length. The lists are sorted in place.

        Returns:
            ``(global_h_k, global_h_v, sliding_window, global_block_map,
            block_num)`` where the K/V tensors are views of the buffer cut at
            the end of the remainder slice, ``sliding_window`` is a 2-tuple
            passed on to the attention kernel, ``global_block_map[u]`` lists
            the block id held by each of the first ``block_num`` slots, and
            ``block_num`` is the number of selected blocks per unit.
        """
        assert len(block_topk) == self.num_units
        global_block_map = [[] for _ in range(self.num_units)]
        # Number of remainder tokens exposed to global attention (never negative).
        global_remainder_len = max(
            self._global_remainder_ed
            - self._global_remainder_st
            + len_q
            - self.n_local,
            0,
        )
        init_len = self.init_k.size(-2)
        sliding_window = None

        global_h_k = self.global_buffer[0]
        global_h_v = self.global_buffer[1]

        block_num = len(block_topk[0])
        for u in range(self.num_units):
            assert len(block_topk[u]) == block_num

            block_topk[u].sort()
            # Start from the slot assignment left by the previous call so that
            # blocks already sitting in the buffer are not copied again.
            global_block_map[u] = deepcopy(self.global_buffer_block_id_list[u])
            for b_idx in block_topk[u]:
                if b_idx in global_block_map[u]:
                    continue

                st = -1
                ed = -1
                # Take the first slot that is empty (-1) or holds a block that
                # is not part of the current selection.
                for j in range(self.topk):
                    if (
                        global_block_map[u][j] == -1
                        or global_block_map[u][j] not in block_topk[u]
                    ):
                        st = j * self.block_size
                        ed = st + self.block_size
                        global_block_map[u][j] = b_idx
                        break

                assert b_idx in self.cached_blocks[u]
                # Copy the block's K/V into its slot of the staging buffer.
                self.global_blocks[u][b_idx].load(
                    (global_h_k[u, :, st:ed, :], global_h_v[u, :, st:ed, :])
                )

        # Init tokens go right after the block slots; they are copied only
        # when their position in the buffer differs from the previous call.
        init_st = block_num * self.block_size
        init_ed = init_st + init_len
        if (
            self.global_buffer_init_st != init_st
            or self.global_buffer_init_ed != init_ed
        ):
            global_h_k[:, :, init_st:init_ed, :].copy_(self.init_k, non_blocking=True)
            global_h_v[:, :, init_st:init_ed, :].copy_(self.init_v, non_blocking=True)

        ed = init_ed

        # The remainder slice follows the init tokens.
        rmd_st = init_ed
        rmd_ed = rmd_st + global_remainder_len
        ed = rmd_ed
        global_h_k[:, :, rmd_st:rmd_ed, :].copy_(
            self.global_remainder[0][
                :,
                :,
                self._global_remainder_st : self._global_remainder_st
                + global_remainder_len,
                :,
            ],
            non_blocking=True,
        )
        global_h_v[:, :, rmd_st:rmd_ed, :].copy_(
            self.global_remainder[1][
                :,
                :,
                self._global_remainder_st : self._global_remainder_st
                + global_remainder_len,
                :,
            ],
            non_blocking=True,
        )

        sliding_window = (self.global_remainder[0].size(-2) + rmd_st, self.n_local)

        # Remember the layout for the next call.
        self.global_buffer_block_id_list = deepcopy(global_block_map)
        self.global_buffer_init_st = init_st
        self.global_buffer_init_ed = init_ed

        for u in range(self.num_units):
            # Slots past block_num must be unused and the first block_num
            # slots must all hold a real block id.
            assert max(global_block_map[u][block_num:] + [-1]) == -1
            assert min(global_block_map[u][:block_num] + [0]) > -1
            global_block_map[u] = list(global_block_map[u][:block_num])

        global_h_k = global_h_k[:, :, :ed, :]
        global_h_v = global_h_v[:, :, :ed, :]
        return global_h_k, global_h_v, sliding_window, global_block_map, block_num

    def update_block_score(
        self, global_score: torch.FloatTensor, global_block_map, global_block_num
    ):
        if global_score is not None:
            global_score = global_score[:, :, : global_block_num * self.block_size]
            assert global_score.shape == (
                self.num_units,
                self.unit_size,
                global_block_num * self.block_size,
            )
            global_score = global_score.view(
                self.num_units, self.unit_size, global_block_num, self.block_size
            )
            global_score = global_score.sum(dim=-1).sum(dim=1)
            assert global_score.shape == (self.num_units, global_block_num)
            global_score = global_score.to(
                device="cpu", non_blocking=False
            )  # (num_units, global_block_num)
            for u in range(self.num_units):
                for k, v in self.cached_blocks[u].items():
                    self.cached_blocks[u][k] = v * self.score_decay
                score = global_score[u].tolist()
                assert len(score) >= len(global_block_map[u])
                for s, i in zip(score, global_block_map[u]):
                    self.cached_blocks[u][i] += s

    def _append(self, local_q, local_k, local_v, global_q):
        """Run one attention step: local window first, then global memory.

        Args:
            local_q, local_k, local_v: query/key/value for the local part;
                the position embedding is applied to q and k here.
            global_q: query used for block selection and global attention.

        Returns:
            ``(o, loc_score)``: the attention output viewed as
            ``(batch_size, num_heads, -1, dim_head)`` and the score tensor
            produced by the local pass.
        """
        # get local_h_q, local_h_k, local_h_v
        local_h_q, local_h_k = self.position_embedding(local_q, local_k)
        local_h_v = local_v

        # calc local result first to overlap host-device communication
        attn = self.Attn(local_h_q.shape, local_h_q.dtype, local_h_q.device)
        with torch.cuda.device(local_h_k.device):
            attn.append(
                local_h_q,
                local_h_k,
                local_h_v,
                get_score=True,
                sliding_window=self.n_local,
            )

        # calc topk global repr k and load cache
        # NOTE(review): GLOBAL_STREAM is None unless async_global_stream was
        # requested; torch.cuda.stream(None) is expected to be a no-op - confirm.
        with torch.cuda.stream(GLOBAL_STREAM):
            block_topk = self.calc_block_topk(global_q)

            for u in range(self.num_units):
                # Evict enough blocks to stay within max_cached_block once the
                # newly selected, not-yet-cached blocks are added.
                num_remove = len(self.cached_blocks[u]) - self.max_cached_block
                for bidx in block_topk[u]:
                    if bidx not in self.cached_blocks[u]:
                        num_remove += 1

                # update cache
                self.remove_lru_blocks(u, num_remove, block_topk[u])

            if self.cache_strategy == "lru":
                # Stamp every selected block with a fresh, increasing counter.
                self.load_count += 1
                for u in range(self.num_units):
                    for bidx in block_topk[u]:
                        self.cached_blocks[u][bidx] = self.load_count

            elif self.cache_strategy == "lru-s":
                # Selected blocks restart at score 0; update_block_score adds
                # their attention score afterwards.
                for u in range(self.num_units):
                    for bidx in block_topk[u]:
                        self.cached_blocks[u][bidx] = 0
            else:
                raise ValueError

            # get global_h_k, global_h_v, global_mask
            #    Because exc_block_size <= n_local, no global_k, global_v used in global part
            global_h_q = global_q
            (
                global_h_k,
                global_h_v,
                global_sliding_window,
                global_block_map,
                global_block_num,
            ) = self.get_global_hidden_and_mask(local_h_q.size(-2), block_topk)

        # The compute stream must not read the staging buffer before the side
        # stream has finished filling it.
        if self.async_global_stream:
            torch.cuda.current_stream().wait_stream(GLOBAL_STREAM)

        # calc global result
        with torch.cuda.device(global_h_q.device):
            attn.append(
                global_h_q,
                global_h_k,
                global_h_v,
                end=True,
                get_score=self.calc_block_score,
                sliding_window=global_sliding_window,
                complement_sliding_window=True,
            )

        o, score_list = attn.get_result()
        loc_score = score_list[0]
        glb_score = score_list[1]

        # The side stream must wait for the attention result before it reads
        # the global scores.
        if self.async_global_stream:
            GLOBAL_STREAM.wait_stream(torch.cuda.current_stream())

        # update global score
        with torch.cuda.stream(GLOBAL_STREAM):
            self.update_block_score(glb_score, global_block_map, global_block_num)

        return o.view((self.batch_size, self.num_heads, -1, self.dim_head)), loc_score

    def get_batched_topk(self, global_q):
        length = global_q.shape[2]
        exc_num = (length + self.exc_block_size - 1) // self.exc_block_size
        exc_block_num = length // self.exc_block_size
        ret = []
        if self.num_global_block <= self.topk:
            for _ in range(exc_num):
                ret.append(
                    [
                        list(range(len(self.global_blocks[0])))
                        for _ in range(self.num_units)
                    ]
                )
            return ret

        global_h_q = global_q
        assert global_h_q.dim() == 4
        assert global_h_q.shape[:2] == (self.num_units, self.unit_size)
        assert global_h_q.shape[3] == self.dim_head

        block_k = torch.cat(
            [self.block_k[u].get_data()[None, :, :] for u in range(self.num_units)],
            dim=0,
        )
        assert block_k.shape == (
            self.num_units,
            self.num_global_block,
            self.dim_head * self.unit_size,
        )
        block_k = (
            block_k.reshape(
                self.num_units, self.num_global_block, self.unit_size, self.dim_head
            )
            .permute(0, 2, 1, 3)
            .contiguous()
        )

        if exc_block_num > 0:
            tmp_global_h_q = (
                global_h_q[:, :, : exc_block_num * self.exc_block_size, :]
                .reshape(
                    self.num_units,
                    self.unit_size,
                    exc_block_num,
                    self.exc_block_size,
                    self.dim_head,
                )
                .mean(dim=-2)
            )
            assert tmp_global_h_q.shape == (
                self.num_units,
                self.unit_size,
                exc_block_num,
                self.dim_head,
            )
            block_score = torch.matmul(tmp_global_h_q, block_k.transpose(-1, -2)).mean(
                dim=1
            )  # (num_units, exc_block_num, num_global_block)
            assert block_score.shape == (
                self.num_units,
                exc_block_num,
                self.num_global_block,
            )

            indices = block_score.topk(self.topk, dim=-1).indices.cpu()
            for b in range(exc_block_num):
                tmp = []
                for u in range(self.num_units):
                    tmp.append(indices[u, b].tolist())
                    assert len(tmp[-1]) == self.topk

                ret.append(tmp)

        if exc_block_num != exc_num:
            tmp_global_h_q = (
                global_h_q[:, :, exc_block_num * self.exc_block_size :, :]
                .reshape(
                    self.num_units,
                    self.unit_size,
                    length - exc_block_num * self.exc_block_size,
                    self.dim_head,
                )
                .mean(dim=-2, keepdim=True)
            )
            assert tmp_global_h_q.shape == (
                self.num_units,
                self.unit_size,
                1,
                self.dim_head,
            )
            block_score = torch.matmul(tmp_global_h_q, block_k.transpose(-1, -2))
            assert block_score.shape == (
                self.num_units,
                self.unit_size,
                1,
                self.num_global_block,
            )
            block_score = block_score.squeeze(dim=2).mean(dim=1)
            assert block_score.shape == (self.num_units, self.num_global_block)
            indices = block_score.topk(self.topk, dim=-1).indices.cpu()
            tmp = []
            for u in range(self.num_units):
                tmp.append(indices[u].tolist())
                assert len(tmp[-1]) == self.topk

            ret.append(tmp)

        return ret

    def append_global(self, exc_length, kv_length, local_score):
        global_remainder_ed = self._global_remainder_ed + exc_length
        global_remainder_st = self._global_remainder_st

        global_remainder_len = global_remainder_ed - global_remainder_st

        assert local_score.shape[:3] == (self.num_units, self.unit_size, kv_length)
        local_score = local_score[:, :, -exc_length - self.n_local :]
        self.global_remainder_local_score[
            :, :, global_remainder_ed - local_score.size(-1) : global_remainder_ed
        ].add_(local_score)

        if not self.init_exc and global_remainder_len > self.n_local:
            global_k = self.global_remainder[0]
            global_v = self.global_remainder[1]

            append_init_len = min(
                self.n_init - self.init_k.size(-2), global_remainder_len - self.n_local
            )
            self.init_k = torch.cat(
                (
                    self.init_k,
                    global_k[
                        :,
                        :,
                        global_remainder_st : global_remainder_st + append_init_len,
                        :,
                    ],
                ),
                dim=-2,
            )
            self.init_v = torch.cat(
                (
                    self.init_v,
                    global_v[
                        :,
                        :,
                        global_remainder_st : global_remainder_st + append_init_len,
                        :,
                    ],
                ),
                dim=-2,
            )
            global_remainder_st += append_init_len
            global_remainder_len -= append_init_len

            if self.init_k.size(-2) == self.n_init:
                self.init_exc = True

        while global_remainder_len - self.block_size >= self.n_local:
            global_remainder_len -= self.block_size
            for u in range(self.num_units):
                self.global_blocks[u].append(
                    (
                        MemoryUnit(
                            (
                                self.global_remainder[0][
                                    u,
                                    :,
                                    global_remainder_st : global_remainder_st
                                    + self.block_size,
                                    :,
                                ],
                                self.global_remainder[1][
                                    u,
                                    :,
                                    global_remainder_st : global_remainder_st
                                    + self.block_size,
                                    :,
                                ],
                            ),
                            self.cuda_cache,
                            False,
                            self.pin_memory,
                        )
                    )
                )

            global_block_k = self.get_block_k(
                self.global_remainder[0][
                    :, :, global_remainder_st : global_remainder_st + self.block_size, :
                ],
                self.global_remainder_local_score[
                    :, :, global_remainder_st : global_remainder_st + self.block_size
                ],
            )
            assert global_block_k.shape == (
                self.num_units,
                self.unit_size,
                self.repr_topk,
                self.dim_head,
            )
            global_block_k = global_block_k.mean(dim=-2, keepdim=False)
            global_block_k = global_block_k.reshape(
                self.num_units, self.unit_size * self.dim_head
            )
            global_block_k = global_block_k[:, None, :]

            self.num_global_block += 1
            for u in range(self.num_units):
                self.block_k[u].append(global_block_k[u])
            global_remainder_st += self.block_size

        self._global_remainder_ed = global_remainder_ed
        self._global_remainder_st = global_remainder_st

    def append(
        self,
        local_q,
        local_k,
        local_v,
        global_q,
        global_k,
        global_v,
    ):
        """Append new tokens to the cache and return their attention output.

        The new keys/values are concatenated onto the local window and the
        global remainder, then the queries are processed in blocks of
        ``self.exc_block_size`` positions. Each block is handed to
        ``self._append`` together with its slice of the local window, after
        which ``self.append_global`` is called for that block.

        Args:
            local_q, local_k, local_v: rank-4 tensors sliced below as
                ``[:, :, positions, :]``; dim 0 is read as the batch size,
                dim 1 as the head count and dim -2 as the number of new
                positions.
            global_q, global_k, global_v: tensors with the same layout, used
                for the global (remainder / top-k) side.

        Returns:
            The per-block outputs of ``self._append`` concatenated along
            dim -2. When ``self.perhead`` is set the result is viewed back to
            ``(batch_size, num_heads, input_length, -1)``.
        """
        batch_size = local_q.size(0)
        input_length = local_q.size(-2)

        if self.perhead:
            # Per-head mode: fold the head dimension into the batch dimension
            # so that every head is handled as an independent batch entry.
            num_heads = local_q.size(1)
            num_heads_kv = local_v.size(1)

            def repeat_kv(t):
                # Replicate each KV head num_heads // num_heads_kv times so
                # K/V line up one-to-one with the query heads, then fold
                # heads into the batch dimension.
                t = t.view(batch_size, num_heads_kv, 1, input_length, -1)
                t = t.expand(
                    batch_size,
                    num_heads_kv,
                    num_heads // num_heads_kv,
                    input_length,
                    -1,
                )
                t = t.reshape(batch_size * num_heads, 1, input_length, -1)
                return t

            local_q = local_q.view(batch_size * num_heads, 1, input_length, -1)
            local_k = repeat_kv(local_k)
            local_v = repeat_kv(local_v)
            global_q = global_q.view(batch_size * num_heads, 1, input_length, -1)
            global_k = repeat_kv(global_k)
            global_v = repeat_kv(global_v)

        # Lazy initialisation on the first call, using the (possibly
        # reshaped) inputs.
        if not self.initialized:
            self.init(local_q, local_k, local_v, global_q, global_k, global_v)

        input_length = local_q.size(-2)

        # NOTE(review): GLOBAL_STREAM is defined outside this method,
        # presumably a module-level CUDA side stream -- confirm. Here it is
        # made to wait for the work already queued on the current stream.
        if self.async_global_stream:
            GLOBAL_STREAM.wait_stream(torch.cuda.current_stream())

        # append local and global tensor
        self.local_k = torch.cat((self.local_k, local_k), dim=-2)
        self.local_v = torch.cat((self.local_v, local_v), dim=-2)
        kv_length = self.local_k.size(-2)

        # With dense decoding enabled, also keep every key/value in
        # dense_k / dense_v.
        if self.dense_decoding:
            self.dense_k = torch.cat((self.dense_k, local_k), dim=-2)
            self.dense_v = torch.cat((self.dense_v, local_v), dim=-2)

        # append global remainder
        with torch.cuda.stream(GLOBAL_STREAM):
            # Reset the remainder window to cover exactly what was stored
            # before this call; the new tokens are concatenated after it.
            self._global_remainder_st = 0
            self._global_remainder_ed = self.global_remainder[0].size(-2)

            self.global_remainder = (
                torch.cat((self.global_remainder[0], global_k), dim=-2),
                torch.cat((self.global_remainder[1], global_v), dim=-2),
            )

            # Extend the per-position local-score buffer with zeros for the
            # newly appended positions.
            self.global_remainder_local_score = torch.cat(
                (
                    self.global_remainder_local_score,
                    torch.zeros(
                        (self.num_units, self.unit_size, global_k.size(-2)),
                        dtype=global_k.dtype,
                        device=global_k.device,
                    ),
                ),
                dim=-1,
            )

        # Rotary embedding for the global queries, parameterised by
        # self.n_local.
        with torch.cuda.stream(GLOBAL_STREAM):
            global_q = self.position_embedding.apply_rotary_pos_emb_one_angle(
                global_q, self.n_local
            )

        # Chunked top-k is only used for multi-token inputs when
        # chunk_topk_calc is configured.
        use_chunk_topk = self.chunk_topk_calc is not None and input_length > 1
        self._use_chunk_topk = use_chunk_topk
        if use_chunk_topk:
            exc_block_num = input_length // self.exc_block_size
            exc_block_per_topk_chunk = self.chunk_topk_calc // self.exc_block_size
            # Query offsets at which a new top-k chunk starts: every
            # exc_block_per_topk_chunk execution blocks, with input_length
            # added as the closing boundary when it is not already reached.
            calc_cur_list = [
                i * self.exc_block_size
                for i in range(0, exc_block_num + 1, exc_block_per_topk_chunk)
            ]
            if calc_cur_list[-1] < input_length:
                calc_cur_list.append(input_length)
            # _topk_cur counts blocks consumed inside the current chunk;
            # _topk_calc_cur indexes the current chunk (-1 = none yet).
            self._topk_cur = 0
            self._topk_calc_cur = -1

        o_list = []

        # Process the new queries one execution block [st, ed) at a time.
        for st in range(0, input_length, self.exc_block_size):
            ed = min(st + self.exc_block_size, input_length)
            if use_chunk_topk and calc_cur_list[self._topk_calc_cur + 1] < ed:
                # calculate topk and sync with host here
                assert ed <= calc_cur_list[self._topk_calc_cur + 2]
                self._topk_calc_cur += 1
                # Compute top-k once for the whole chunk of queries and cache
                # the result in self._cached_topk.
                with torch.cuda.stream(GLOBAL_STREAM):
                    self._cached_topk = self.get_batched_topk(
                        global_q[
                            :,
                            :,
                            calc_cur_list[self._topk_calc_cur] : calc_cur_list[
                                self._topk_calc_cur + 1
                            ],
                            :,
                        ]
                    )
                self._topk_cur = 0

            # Local key/value window for this block: it ends at the key
            # aligned with query position ed and starts n_local positions
            # before the block's first query (clamped at 0).
            kv_st = max(kv_length + st - input_length - self.n_local, 0)
            kv_ed = kv_length + ed - input_length
            chunk_o, local_score = self._append(
                local_q[:, :, st:ed, :],
                self.local_k[:, :, kv_st:kv_ed, :],
                self.local_v[:, :, kv_st:kv_ed, :],
                global_q[:, :, st:ed, :],
            )
            o_list.append(chunk_o)

            # append global
            with torch.cuda.stream(GLOBAL_STREAM):
                self.append_global(ed - st, kv_ed - kv_st, local_score)

            # Make the current stream wait for the global-stream work of this
            # block before starting the next one.
            if self.async_global_stream:
                torch.cuda.current_stream().wait_stream(GLOBAL_STREAM)

            if use_chunk_topk:
                self._topk_cur += 1

        self.length += input_length

        # update local and global tensor
        # Keep only the last n_local positions of the local window.
        if self.local_k.size(-2) >= self.n_local:
            self.local_k = self.local_k[:, :, -self.n_local :, :]
            self.local_v = self.local_v[:, :, -self.n_local :, :]

        # Drop the part of the remainder before _global_remainder_st, as
        # left by the append_global calls above.
        assert self._global_remainder_ed == self.global_remainder[0].size(-2)
        with torch.cuda.stream(GLOBAL_STREAM):
            self.global_remainder = (
                self.global_remainder[0][:, :, self._global_remainder_st :, :],
                self.global_remainder[1][:, :, self._global_remainder_st :, :],
            )
            self.global_remainder_local_score = self.global_remainder_local_score[
                :, :, self._global_remainder_st :
            ]

        ret = torch.cat(o_list, dim=-2)

        # Undo the per-head batch folding.
        if self.perhead:
            ret = ret.view(batch_size, num_heads, input_length, -1)

        return ret

    def size(self, *args, **kwargs):
        """Return ``self.length``; any arguments are accepted and ignored."""
        current_length = self.length
        return current_length


def inf_llm_forward(
    n_local,
    n_init,
    topk,
    block_size,
    max_cached_block,
    exc_block_size,
    repr_topk: int = 1,
    cache_strategy="lru",
    score_decay=None,
    chunk_topk_calc=None,
    async_global_stream=True,
    pin_memory=False,
    faiss=False,
    perhead=False,
    dense_decoding=False,
    *args,
    **kwargs
):
    """Build an attention ``forward`` function bound to the given settings.

    All named parameters are captured by the returned closure and forwarded
    to ``ContextManager`` when a cache has to be created; ``dense_decoding``
    additionally enables a dense single-token path. Extra positional and
    keyword arguments are accepted and ignored.

    Returns:
        A ``forward(self, query, key_value, ...)`` callable.
    """

    def forward(
        self,
        query: torch.Tensor,
        key_value: torch.Tensor,
        position_bias: Optional[torch.Tensor],
        use_cache: bool,
        past_key_value,
        project_q,
        project_k,
        project_v,
        attention_out,
        dim_head,
        num_heads,
        num_heads_kv,
    ):
        """Project the inputs, attend through the context cache, project out.

        Args:
            query: ``(batch, len_q, hidden)`` input for the query projection.
            key_value: ``(batch, len_k, hidden)`` input for the key and value
                projections.
            position_bias: positional-embedding object; called as
                ``position_bias(h_q, h_k)`` on the dense path and handed to
                ``ContextManager`` otherwise.
            use_cache: when true, return ``(output, past_key_value)``;
                otherwise return only the output.
            past_key_value: existing cache object, or ``None`` to create one.
            project_q, project_k, project_v, attention_out: projection
                callables.
            dim_head, num_heads, num_heads_kv: head geometry.

        Returns:
            ``(batch, len_q, num_heads * dim_head)`` after ``attention_out``,
            optionally paired with the cache.
        """
        batch_size = query.size(0)
        len_q = query.size(1)
        len_k = key_value.size(1)

        def split_heads(t, length, n_heads):
            # (batch, length, n_heads * dim_head) -> (batch, n_heads, length, dim_head)
            return (
                t.view(batch_size, length, n_heads, dim_head)
                .permute(0, 2, 1, 3)
                .contiguous()
            )

        h_q = split_heads(project_q(query), len_q, num_heads)
        h_k = split_heads(project_k(key_value), len_k, num_heads_kv)
        h_v = split_heads(project_v(key_value), len_k, num_heads_kv)

        # Dense single-token decoding reads the dense KV kept on the cache,
        # so it is only possible once a cache exists. Without one (e.g. a
        # one-token prompt) fall through and let the regular path create it
        # instead of dereferencing None.
        if len_q == 1 and dense_decoding and past_key_value is not None:
            h_k = torch.cat((past_key_value.dense_k, h_k), dim=-2)
            h_v = torch.cat((past_key_value.dense_v, h_v), dim=-2)

            # Store the un-rotated keys/values back before applying the
            # positional embedding.
            past_key_value.dense_k = h_k
            past_key_value.dense_v = h_v

            h_q, h_k = position_bias(h_q, h_k)

            # (batch_size, seqlen, nheads, headdim)
            h_q = h_q.transpose(1, 2)
            h_k = h_k.transpose(1, 2)
            h_v = h_v.transpose(1, 2)

            # (batch_size, seqlen, nheads, headdim)
            o = dense_decoding_func(h_q, h_k, h_v)

            o = o.reshape(batch_size, len_q, dim_head * num_heads)
            o = attention_out(o)

            if use_cache:
                return o, past_key_value
            return o

        if past_key_value is None:
            past_key_value = ContextManager(
                position_bias,
                n_init,
                n_local,
                block_size,
                max_cached_block,
                topk,
                exc_block_size,
                score_decay,
                repr_topk,
                cache_strategy,
                chunk_topk_calc,
                async_global_stream,
                pin_memory,
                faiss,
                perhead,
                dense_decoding=dense_decoding,
            )

        # The same projections serve as both the local and the global inputs.
        o = past_key_value.append(h_q, h_k, h_v, h_q, h_k, h_v)

        # (batch, num_heads, len_q, dim_head) -> (batch, len_q, num_heads * dim_head)
        o = o.view(batch_size, num_heads, len_q, dim_head).permute(0, 2, 1, 3)
        o = o.reshape(batch_size, len_q, dim_head * num_heads)
        o = attention_out(o)

        if use_cache:
            return o, past_key_value
        return o

    return forward


class GreedySearch:
    """Greedy (argmax) token-by-token decoder around a causal LM.

    The model is put in eval mode on construction. Inputs are moved to
    ``model.device``. ``past_kv`` holds the key/value cache between
    ``_decode`` calls and is reset by ``clear`` (``generate`` clears it after
    every call).
    """

    def __init__(self, model, tokenizer):
        model.eval()
        self.device = model.device
        self.model = model
        self.tokenizer = tokenizer
        self.past_kv = None

    def clear(self):
        """Drop the stored key/value cache."""
        self.past_kv = None

    def _process_texts(self, input_text):
        """Tokenize ``input_text`` into batched int tensors on ``self.device``.

        Returns:
            Dict with ``input_ids`` and an all-ones ``attention_mask``, each
            of shape ``(1, num_tokens)``.
        """
        token_ids = self.tokenizer.encode(input_text)
        raw_inputs = {
            "input_ids": token_ids,
            "attention_mask": [1] * len(token_ids),
        }
        return {
            key: torch.tensor(value).int().unsqueeze(0).to(self.device)
            for key, value in raw_inputs.items()
        }

    def generate(self, text=None, input_ids=None, **kwargs):
        """Greedy-decode from ``input_ids`` (or from ``text`` if ids are absent).

        Extra keyword arguments are forwarded to ``_decode``. The stored
        cache is cleared before returning.

        Returns:
            ``(1, prompt_len + generated_len)`` tensor of token ids.
        """
        if input_ids is None:
            input_ids = self._process_texts(text)["input_ids"]

        with torch.inference_mode():
            result = self._decode(input_ids, **kwargs)

        self.clear()
        return result

    def _decode(
        self,
        input_ids,
        max_length=100,
        extra_end_token_ids=None,
        chunk_size: int = 4096,
        output=False,
    ):
        """Prefill the prompt in chunks, then generate greedily.

        Args:
            input_ids: 1-D or ``(1, n)`` tensor of prompt token ids.
            max_length: maximum number of tokens to generate.
            extra_end_token_ids: token ids that stop generation in addition
                to the tokenizer's EOS id; ``None`` means no extra ids.
            chunk_size: prefill chunk length; ``None`` uses the whole prompt.
            output: when true, stream newly decoded text to stdout.

        Returns:
            ``(1, n + generated)`` tensor with the prompt followed by the
            generated ids (the stop token is not appended).
        """
        # Imported unconditionally at function scope: the trailing newline
        # below is written whenever ``output`` is set, even if nothing was
        # streamed inside the loop.
        import sys

        if input_ids.dim() == 1:
            input_ids = input_ids[None, :]
        input_ids = input_ids.to(self.device)
        attention_mask = torch.ones_like(input_ids)
        assert input_ids.size(0) == 1
        length = input_ids.size(1)
        end_token_ids = list(extra_end_token_ids or []) + [
            self.tokenizer.eos_token_id
        ]
        past_key_values = self.past_kv
        output_text = ""

        for i in range(max_length + 1):
            if i == 0:
                # Prefill: feed everything except the last prompt token in
                # chunks; the last token goes through the shared call below.
                if chunk_size is None:
                    chunk_size = input_ids.size(1)
                for st in range(0, input_ids.size(1) - 1, chunk_size):
                    ed = min(input_ids.size(1) - 1, st + chunk_size)
                    out = self.model(
                        input_ids=input_ids[:, st:ed],
                        attention_mask=attention_mask[:, :ed],
                        use_cache=True,
                        return_dict=True,
                        past_key_values=past_key_values,
                    )
                    past_key_values = out.past_key_values

            out = self.model(
                input_ids=input_ids[:, -1:],
                attention_mask=attention_mask,
                use_cache=True,
                return_dict=True,
                past_key_values=past_key_values,
            )
            past_key_values = out.past_key_values

            word = out.logits[:, -1, :].argmax(dim=-1)
            if word.item() in end_token_ids or i == max_length:
                break

            input_ids = torch.cat(
                (input_ids, word.view(1, 1).to(input_ids.device)), dim=-1
            )
            attention_mask = torch.cat(
                (
                    attention_mask,
                    torch.ones(
                        (attention_mask.size(0), 1),
                        dtype=torch.int,
                        device=attention_mask.device,
                    ),
                ),
                dim=-1,
            )
            if output:
                # Re-decode the whole continuation and print only the new
                # suffix.
                tmp = self.tokenizer.decode(input_ids.squeeze(0)[length:])
                if len(tmp) > len(output_text):
                    sys.stdout.write(tmp[len(output_text) :])
                    sys.stdout.flush()
                    output_text = tmp

        self.past_kv = past_key_values

        if output:
            sys.stdout.write("\n")
            sys.stdout.flush()

        return input_ids


class InfLLMGenerator(GreedySearch):
    """``GreedySearch`` variant with a ``generate`` / ``__call__`` interface
    that takes ``generation_config``, ``pad_token_id`` and ``max_new_tokens``.
    """

    # Number of prompt tokens fed to the model per forward pass.
    CHUNK_SIZE = 8192

    def generate(
        self,
        input_ids=None,
        generation_config=None,
        pad_token_id=None,
        max_new_tokens=None,
    ):
        """Greedy-decode up to ``max_new_tokens`` tokens after ``input_ids``.

        Args:
            input_ids: prompt token ids.
            generation_config: object whose ``max_new_tokens`` is used when
                the ``max_new_tokens`` argument is ``None``.
            pad_token_id: if given, treated as an additional stop token.
            max_new_tokens: generation budget; takes precedence over
                ``generation_config``.

        Returns:
            Tensor of prompt plus generated token ids.

        Raises:
            ValueError: if no generation budget can be determined.
        """
        if max_new_tokens is None and generation_config is not None:
            max_new_tokens = generation_config.max_new_tokens
        if max_new_tokens is None:
            raise ValueError(
                "max_new_tokens must be given directly or via generation_config"
            )
        return super().generate(
            text=None,
            input_ids=input_ids,
            max_length=max_new_tokens,
            chunk_size=self.CHUNK_SIZE,
            extra_end_token_ids=[pad_token_id] if pad_token_id is not None else [],
        )

    @torch.no_grad()
    def __call__(self, input_ids=None, *args, **kwargs):
        """Run the model over ``input_ids`` in chunks and return all logits.

        Only ``input_ids`` is passed to the model; other positional and
        keyword arguments are ignored. Logits are cast to bfloat16 and
        concatenated along the sequence dimension.
        """
        # chunked forward
        total_length = input_ids.size(1)
        logit_chunks = []
        for st in range(0, total_length, self.CHUNK_SIZE):
            torch.cuda.empty_cache()
            ed = min(total_length, st + self.CHUNK_SIZE)
            out = self.model(
                input_ids=input_ids[:, st:ed],
            )
            logit_chunks.append(out.logits.to(torch.bfloat16))

        if not logit_chunks:
            # Zero-length input: return the empty tensor the loop would have
            # started from.
            empty_logits = torch.empty(
                0, dtype=torch.bfloat16, device=input_ids.device
            )
            return CausalLMOutput(logits=empty_logits)

        # One concatenation at the end instead of re-copying the growing
        # result after every chunk.
        return CausalLMOutput(logits=torch.cat(logit_chunks, dim=1))
