# Copyright 2025 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import json
from typing import Optional

import fire
import torch
import pandas as pd
from tqdm import tqdm
from transformers import Seq2SeqTrainingArguments

from llamafactory.data import get_dataset, get_template_and_fix_tokenizer
from llamafactory.extras.constants import IGNORE_INDEX
from llamafactory.extras.misc import get_device_count
from llamafactory.extras.packages import is_vllm_available
from llamafactory.hparams import get_infer_args
from llamafactory.model import load_tokenizer


if is_vllm_available():
    from vllm import LLM, SamplingParams
    from vllm.lora.request import LoRARequest


def tokenize(x: str, tokenizer, add_generation_prompt: bool = False) -> str:
    """Render *x* as a single user turn using the tokenizer's chat template.

    Despite the name, no token ids are produced: the chat template is applied
    with ``tokenize=False``, so the result is the formatted prompt text.

    Args:
        x: Content of the user message.
        tokenizer: Object exposing ``apply_chat_template(messages, **kwargs)``
            (e.g. a Hugging Face tokenizer).
        add_generation_prompt: Forwarded to ``apply_chat_template``. The
            default ``False`` keeps the historical behaviour of this helper;
            pass ``True`` to have the template append the tokens that open
            the assistant turn, which is what chat models usually expect
            when the rendered text is used for generation.

    Returns:
        Whatever ``apply_chat_template`` returns for the single-turn
        conversation (the rendered prompt string for Hugging Face tokenizers).
    """
    messages = [{"role": "user", "content": x}]
    return tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=add_generation_prompt,
    )


def vllm_infer(
    model_name_or_path: str,
    adapter_name_or_path: Optional[str] = None,
    dataset: str = "alpaca_en_demo",
    dataset_dir: str = "data",
    template: str = "default",
    cutoff_len: int = 2048,
    max_samples: Optional[int] = None,
    vllm_config: str = "{}",
    save_name: str = "generated_predictions.jsonl",
    temperature: float = 0.95,
    top_p: float = 0.7,
    top_k: int = 50,
    max_new_tokens: int = 1024,
    repetition_penalty: float = 1.0,
    skip_special_tokens: bool = True,
    default_system: Optional[str] = None,
    enable_thinking: bool = True,
    seed: Optional[int] = None,
    pipeline_parallel_size: int = 1,
    image_max_pixels: int = 768 * 768,
    image_min_pixels: int = 32 * 32,
    video_fps: float = 2.0,
    video_maxlen: int = 128,
    batch_size: int = 1024,
    quantized: bool = False,
    pred_multi: bool = False,
    num_modi: int = -1,
    output_dir: str = "dummy_dir",
) -> None:
    r"""Generate predictions for the prompts of a CSV file with the vLLM engine.

    The CSV at ``dataset_dir`` is loaded with pandas, every prompt column is
    rendered through ``tokenize`` (the tokenizer's chat template), the rendered
    prompts are sent to vLLM, and the data frame, with the rendered prompts
    and the new prediction column(s), is written as CSV to ``output_dir``.

    Usage: python vllm_infer.py --model_name_or_path meta-llama/Llama-2-7b-hf --template llama --dataset_dir prompts.csv --output_dir predictions.csv

    Args:
        model_name_or_path: Model passed to vLLM and used to load the tokenizer.
        adapter_name_or_path: Optional LoRA adapter; when set, LoRA is enabled
            in the engine and the first resolved adapter is used per request.
        dataset: Forwarded to ``get_infer_args`` only; the prompts themselves
            come from the CSV file.
        dataset_dir: Path of the input CSV file (despite the name, it is read
            with ``pandas.read_csv``).
        template: LlamaFactory template name; used here for the stop token ids
            and to detect a multimodal plugin.
        cutoff_len: Prompt budget; ``cutoff_len + max_new_tokens`` becomes the
            engine's ``max_model_len``.
        max_samples: Forwarded to ``get_infer_args``.
        vllm_config: Extra engine arguments forwarded to ``get_infer_args``;
            if the parsed value is a dict it overrides the defaults built here.
        save_name: Accepted for CLI compatibility; not used by this function.
        temperature: Sampling temperature.
        top_p: Nucleus sampling threshold (a falsy value falls back to 1.0).
        top_k: Top-k sampling size (a falsy value falls back to -1).
        max_new_tokens: Maximum number of generated tokens per prompt.
        repetition_penalty: Repetition penalty (a falsy value falls back to 1.0).
        skip_special_tokens: Passed to ``SamplingParams``.
        default_system: Forwarded to ``get_infer_args``.
        enable_thinking: Forwarded to ``get_infer_args``.
        seed: Sampling seed passed to ``SamplingParams``.
        pipeline_parallel_size: Number of pipeline stages; the tensor parallel
            size is the device count divided by this value.
        image_max_pixels: Accepted for CLI compatibility; not used here.
        image_min_pixels: Accepted for CLI compatibility; not used here.
        video_fps: Accepted for CLI compatibility; not used here.
        video_maxlen: Accepted for CLI compatibility; not used here.
        batch_size: Accepted for CLI compatibility; not used here.
        quantized: Load the model with bitsandbytes quantization in bfloat16.
        pred_multi: Read ``prompt_0 .. prompt_{num_modi - 1}`` and write
            ``pred_0 .. pred_{num_modi - 1}`` instead of ``prompt`` / ``pred``.
        num_modi: Number of prompt columns when ``pred_multi`` is set.
        output_dir: Path of the output CSV file (written with ``to_csv``).

    Raises:
        ValueError: If ``pipeline_parallel_size`` is not positive or exceeds
            the device count, or if ``pred_multi`` is set without a positive
            ``num_modi``.
        KeyError: If an expected prompt column is missing from the CSV.
    """
    # Validate the cheap things first so that bad arguments fail before the
    # (expensive) model load instead of after it.
    if pipeline_parallel_size < 1:
        raise ValueError("Pipeline parallel size should be a positive integer.")

    if pipeline_parallel_size > get_device_count():
        raise ValueError(
            "Pipeline parallel size should be smaller than the number of gpus."
        )

    if pred_multi and num_modi <= 0:
        raise ValueError("num_modi must be greater than 0 for multi-prediction.")

    # (prompt column, prediction column) pairs to process, in order.
    if pred_multi:
        column_pairs = [(f"prompt_{i}", f"pred_{i}") for i in range(num_modi)]
    else:
        column_pairs = [("prompt", "pred")]

    df = pd.read_csv(dataset_dir)
    missing_columns = [
        prompt_col for prompt_col, _ in column_pairs if prompt_col not in df.columns
    ]
    if missing_columns:
        raise KeyError(f"Missing prompt column(s) in {dataset_dir}: {missing_columns}")

    model_args, data_args, _, generating_args = get_infer_args(
        dict(
            model_name_or_path=model_name_or_path,
            adapter_name_or_path=adapter_name_or_path,
            dataset=dataset,
            dataset_dir=dataset_dir,
            template=template,
            cutoff_len=cutoff_len,
            max_samples=max_samples,
            preprocessing_num_workers=16,
            default_system=default_system,
            enable_thinking=enable_thinking,
            vllm_config=vllm_config,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            max_new_tokens=max_new_tokens,
            repetition_penalty=repetition_penalty,
        )
    )

    tokenizer_module = load_tokenizer(model_args)
    tokenizer = tokenizer_module["tokenizer"]
    template_obj = get_template_and_fix_tokenizer(tokenizer, data_args)
    template_obj.mm_plugin.expand_mm_tokens = False  # for vllm generate

    use_lora = model_args.adapter_name_or_path is not None
    engine_args = {
        "model": model_args.model_name_or_path,
        "trust_remote_code": True,
        # The bitsandbytes path is pinned to bfloat16; otherwise use the
        # dtype resolved by LlamaFactory.
        "dtype": torch.bfloat16 if quantized else model_args.infer_dtype,
        "max_model_len": cutoff_len + max_new_tokens,
        "tensor_parallel_size": (get_device_count() // pipeline_parallel_size) or 1,
        # Without this the engine keeps its default pipeline size and the
        # devices set aside by the division above stay idle.
        "pipeline_parallel_size": pipeline_parallel_size,
        "disable_log_stats": True,
        "max_lora_rank": 32,
        "enable_lora": use_lora,
    }
    if quantized:
        engine_args["quantization"] = "bitsandbytes"

    if template_obj.mm_plugin.__class__.__name__ != "BasePlugin":
        engine_args["limit_mm_per_prompt"] = {"image": 4, "video": 2, "audio": 2}

    # User-supplied engine options take precedence over the defaults above.
    if isinstance(model_args.vllm_config, dict):
        engine_args.update(model_args.vllm_config)

    llm = LLM(**engine_args)

    sampling_params = SamplingParams(
        # A falsy penalty (0 / None) falls back to the neutral value 1.0.
        repetition_penalty=generating_args.repetition_penalty or 1.0,
        temperature=generating_args.temperature,
        top_p=generating_args.top_p or 1.0,  # falsy top_p falls back to 1.0
        top_k=generating_args.top_k or -1,  # falsy top_k falls back to -1
        stop_token_ids=template_obj.get_stop_token_ids(tokenizer),
        max_tokens=generating_args.max_new_tokens,
        skip_special_tokens=skip_special_tokens,
        seed=seed,
    )

    lora_request = (
        LoRARequest("default", 1, model_args.adapter_name_or_path[0])
        if use_lora
        else None
    )

    for prompt_col, pred_col in column_pairs:
        # The rendered prompt replaces the raw one, so the output CSV records
        # exactly what was sent to the model.
        # NOTE(review): tokenize() renders the chat template without asking
        # for a generation prompt; confirm that is intended for this model.
        df[prompt_col] = df[prompt_col].apply(
            lambda text: tokenize(text, tokenizer=tokenizer)
        )
        # Prompts are handed to vLLM as plain strings for every column.
        results = llm.generate(
            df[prompt_col].tolist(), sampling_params, lora_request=lora_request
        )
        df[pred_col] = [result.outputs[0].text for result in results]

    df.to_csv(output_dir, index=False)

    # Report only once the file has actually been written.
    total_results = len(df) * len(column_pairs)
    print("*" * 70)
    print(f"{total_results} total generated results have been saved at {output_dir}.")
    print("*" * 70)

    gc.collect()


if __name__ == "__main__":
    # Hand ``vllm_infer`` to python-fire so it can be driven from the command line.
    fire.Fire(vllm_infer)
