#!/usr/bin/env python
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging

import numpy as np
from vllm import LLM, SamplingParams

from sal.config import Config
from sal.models.reward_models import PRM
from sal.utils.score import aggregate_scores


def t_best_of_n(x, config: Config, llm: LLM, prm: PRM):
    """Best-of-N sampling with one forced "rethink" continuation per sample.

    For every problem in the batch, ``config.n`` completions are sampled.
    Each completion is then extended with a fixed reflection sentence and the
    model is asked to continue from that point. The candidate that is scored
    is ``first completion + reflection sentence + continuation``; the
    candidate with the highest aggregated PRM score becomes the prediction.

    Args:
        x: Batch mapping. ``x["problem"]`` is read as a sequence of problem
            texts (assumed to be strings). The mapping is mutated in place.
        config: Run configuration. Reads ``system_prompt``,
            ``custom_chat_template``, ``n``, ``temperature``, ``max_tokens``,
            ``top_p`` and ``agg_strategy``.
        llm: vLLM engine used for both generation rounds.
        prm: Reward model; ``prm.score(problems, completions)`` is called
            once with all candidates.

    Returns:
        The same ``x`` with these keys added:
        ``"completions"`` (``config.n`` candidate texts per problem),
        ``"scores"`` (raw output of ``prm.score``),
        ``"pred"`` (best candidate per problem) and
        ``"completion_tokens"`` (token count per candidate).

    Raises:
        ValueError: If either generation round returns an unexpected number
            of responses/outputs, or a problem does not end up with exactly
            ``config.n`` candidates.
    """
    logger = logging.getLogger(__name__)

    problems = x["problem"]
    num_problems = len(problems)
    n = config.n
    expected = num_problems * n

    convs = [
        [
            {"role": "system", "content": config.system_prompt},
            {"role": "user", "content": prompt},
        ]
        for prompt in problems
    ]
    tokenizer = llm.get_tokenizer()
    # TODO: set the augmented template from a file
    if config.custom_chat_template is not None:
        tokenizer.chat_template = config.custom_chat_template
    templated_convs = tokenizer.apply_chat_template(
        convs, tokenize=False, add_generation_prompt=True
    )

    # Duplicate each prompt n times so we can do continuous batching with one
    # sample per request: [p1, p2] becomes [p1, p1, p2, p2] for n=2.
    prompts = [conv for conv in templated_convs for _ in range(n)]

    sampling_params = SamplingParams(
        temperature=config.temperature,
        max_tokens=config.max_tokens,
        top_p=config.top_p,
        n=1,  # Prompts are already duplicated, so one completion per request
    )

    # ---- Round 1: initial completions -------------------------------------
    responses = llm.generate(
        prompts,
        sampling_params=sampling_params,
        use_tqdm=False,
    )
    if len(responses) != expected:
        raise ValueError(
            f"Generated {len(responses)} responses instead of {expected}"
        )
    logger.debug(
        "round 1: %d problems, n=%d, %d responses",
        num_problems,
        n,
        len(responses),
    )

    # Flat, prompt-aligned view of round 1: index k belongs to problem k // n.
    first_texts = [out.text for r in responses for out in r.outputs]
    first_token_counts = [
        len(out.token_ids) for r in responses for out in r.outputs
    ]
    if len(first_texts) != expected:
        raise ValueError(
            f"Generated {len(first_texts)} completions instead of {expected}"
        )

    # ---- Round 2: continue after the reflection sentence -------------------
    reflection_prompt = (
        "Wait! Maybe I made some mistakes! I need to rethink from scratch."
    )
    # Count only the sentence's own tokens: the default encode() may add
    # special tokens (e.g. BOS), which would inflate every candidate's token
    # count. The count is still approximate, since tokenization of the
    # sentence in context can differ slightly at the boundaries.
    reflection_token_count = len(
        tokenizer.encode(reflection_prompt, add_special_tokens=False)
    )

    # The continuation prompt is the templated prompt followed directly by the
    # first completion and the reflection sentence; it is tokenized here and
    # handed to vLLM as token ids.
    second_round_inputs = [
        {"prompt_token_ids": tokenizer.encode(prompt + text + reflection_prompt)}
        for prompt, text in zip(prompts, first_texts)
    ]

    new_responses = llm.generate(
        second_round_inputs,
        sampling_params=sampling_params,
        use_tqdm=False,
    )
    if len(new_responses) != expected:
        raise ValueError(
            f"Generated {len(new_responses)} continuation responses "
            f"instead of {expected}"
        )
    logger.debug(
        "round 2: %d responses for %d prompts, n=%d",
        len(new_responses),
        len(second_round_inputs),
        n,
    )

    # ---- Stitch both rounds together, grouped per problem ------------------
    completions = []
    completion_tokens = []
    for i in range(num_problems):
        texts = []
        token_counts = []
        for k in range(i * n, (i + 1) * n):
            for output in new_responses[k].outputs:
                texts.append(first_texts[k] + reflection_prompt + output.text)
                token_counts.append(
                    first_token_counts[k]
                    + reflection_token_count
                    + len(output.token_ids)
                )
        completions.append(texts)
        completion_tokens.append(token_counts)

    # Check we generated the correct number of completions for each prompt
    for c in completions:
        if len(c) != n:
            raise ValueError(f"Generated {len(c)} completions instead of {n}")

    scores = prm.score(problems, completions)
    agg_scores = [
        [aggregate_scores(s, config.agg_strategy) for s in score] for score in scores
    ]

    # Select the completion with the highest score
    pred = [completion[np.argmax(s)] for completion, s in zip(completions, agg_scores)]

    x["completions"] = completions
    x["scores"] = scores
    x["pred"] = pred
    x["completion_tokens"] = completion_tokens

    return x
