/*
 * Copyright (c) 2022-2023, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "src/turbomind/kernels/ban_bad_words.h"
#include <cfloat>
// #include "src/turbomind/kernels/reduce_kernel_utils.cuh"
// #include "src/turbomind/utils/cuda_utils.h"
#include <cuda_bf16.h>
#include <cuda_fp16.h>

namespace turbomind {

template<typename T>
__device__ inline T getMaxValue();

template<>
__device__ inline float getMaxValue<float>()
{
    return FLT_MAX;
}

template<>
__device__ inline half getMaxValue<half>()
{
    return __ushort_as_half((unsigned short)0x7BFFU);
}

#ifdef ENABLE_BF16
template<>
__device__ inline __nv_bfloat16 getMaxValue<__nv_bfloat16>()
{
#if __CUDA_ARCH__ >= 800
    return __ushort_as_bfloat16((unsigned short)0x7F7FU);
#endif
    return {};
}
#endif

template<typename T>
__global__ void ban_bad_words(T*         logits,
                              const int* output_ids_buf,
                              const int* parent_ids_buf,
                              int        batch_size,
                              int        beam_width,
                              const int* bad_words,
                              size_t     bad_words_len,
                              bool       share_words,
                              int        id_offset,
                              int        vocab_size_padded,
                              size_t     step)
{
    const int id        = blockIdx.x * blockDim.x + threadIdx.x;
    const int batch_idx = blockIdx.y / beam_width;
    const int beam_idx  = blockIdx.y % beam_width;

    const int* base_bad_words         = share_words ? bad_words : bad_words + batch_idx * 2 * bad_words_len;
    const int* base_bad_words_offsets = base_bad_words + bad_words_len;

    if (id >= bad_words_len || base_bad_words_offsets[id] < 0) {
        return;
    }

    const int item_end   = base_bad_words_offsets[id];
    const int item_start = (id > 0) ? base_bad_words_offsets[id - 1] : 0;
    const int item_size  = item_end - item_start;

    /* The single-token case unconditionally bans the token */
    bool should_ban = item_size == 1;

    /* Multi-token case and enough previously generated tokens to look for a match */
    if (item_size > 1 && step >= item_size - 1) {
        should_ban             = true;
        int        parent_id   = beam_idx;
        const bool gather_beam = beam_width > 1;

        for (int token_idx = item_size - 2; token_idx >= 0; token_idx--) {
            const int previous_token = output_ids_buf[(step - (item_size - 1) + token_idx) * batch_size * beam_width
                                                      + id_offset + batch_idx * beam_width + parent_id];

            if (previous_token != base_bad_words[item_start + token_idx]) {
                should_ban = false;
                break;
            }
            if (gather_beam) {
                parent_id = parent_ids_buf[(step - (item_size - 1) + token_idx) * beam_width * batch_size + id_offset
                                           + batch_idx * beam_width + parent_id];

                if (parent_id < 0 || parent_id >= beam_width) {
                    should_ban = false;
                    break;
                }
            }
        }
    }

    if (should_ban) {
        int banned_token = base_bad_words[item_end - 1];
        if (0 < banned_token && banned_token < vocab_size_padded) {
            logits[batch_idx * beam_width * vocab_size_padded + beam_idx * vocab_size_padded + banned_token] =
                -getMaxValue<T>();
        }
    }
}

template<typename T>
void invokeBanBadWords(T*           logits,
                       const int*   output_ids_buf,
                       const int*   parent_ids_buf,
                       int          batch_size,
                       int          local_batch_size,
                       int          beam_width,
                       const int*   bad_words,
                       bool         share_words,
                       size_t       bad_words_len,
                       int          id_offset,
                       int          vocab_size_padded,
                       size_t       step,
                       cudaStream_t stream)
{
    dim3 block, grid;
    block.x = min((unsigned long)((bad_words_len + 32 - 1) / 32) * 32, 256UL);
    grid.x  = (bad_words_len + block.x - 1) / block.x;
    grid.y  = local_batch_size * beam_width;

    ban_bad_words<<<grid, block, 0, stream>>>(logits,
                                              output_ids_buf,
                                              parent_ids_buf,
                                              batch_size,
                                              beam_width,
                                              bad_words,
                                              bad_words_len,
                                              share_words,
                                              id_offset,
                                              vocab_size_padded,
                                              step);
}

#define INSTANTIATE_INVOKE_BAN_BAD_WORDS(T)                                                                            \
    template void invokeBanBadWords<T>(T * logits,                                                                     \
                                       const int*   output_ids_buf,                                                    \
                                       const int*   parent_ids_buf,                                                    \
                                       int          batch_size,                                                        \
                                       int          local_batch_size,                                                  \
                                       int          beam_width,                                                        \
                                       const int*   bad_words,                                                         \
                                       bool         share_words,                                                       \
                                       size_t       bad_words_len,                                                     \
                                       int          id_offset,                                                         \
                                       int          vocab_size_padded,                                                 \
                                       size_t       step,                                                              \
                                       cudaStream_t stream);

INSTANTIATE_INVOKE_BAN_BAD_WORDS(float);

}  // namespace turbomind
