# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# The file is temporarily reverted by Reasoning360 to use `dataframe` rather than `Dataset`, to support heterogeneous keys of multi-domain data

import copy
import os
from collections import defaultdict
from typing import List, Optional, Union

import numpy as np
import pandas as pd
import torch
from omegaconf import ListConfig
from torch.utils.data import Dataset
from transformers import PreTrainedTokenizer, ProcessorMixin

import verl.utils.torch_functional as verl_F
from verl.utils.model import compute_position_id_with_mask


def collate_fn(data_list: list[dict]) -> dict:
    tensors = defaultdict(list)
    non_tensors = defaultdict(list)

    for data in data_list:
        for key, val in data.items():
            if isinstance(val, torch.Tensor):
                tensors[key].append(val)
            else:
                non_tensors[key].append(val)

    for key, val in tensors.items():
        tensors[key] = torch.stack(val, dim=0)

    for key, val in non_tensors.items():
        non_tensors[key] = np.array(val, dtype=object)

    return {**tensors, **non_tensors}


def process_image(image: dict, max_pixels: int = 2048 * 2048, min_pixels: int = 512 * 512):
    import math
    from io import BytesIO

    from PIL import Image

    if isinstance(image, dict):
        image = Image.open(BytesIO(image["bytes"]))

    if (image.width * image.height) > max_pixels:
        resize_factor = math.sqrt(max_pixels / (image.width * image.height))
        width, height = int(image.width * resize_factor), int(image.height * resize_factor)
        image = image.resize((width, height))

    if (image.width * image.height) < min_pixels:
        resize_factor = math.sqrt(min_pixels / (image.width * image.height))
        width, height = int(image.width * resize_factor), int(image.height * resize_factor)
        image = image.resize((width, height))

    if image.mode != "RGB":
        image = image.convert("RGB")

    return image


class RLHFDataset(Dataset):
    """
    We assume the dataset contains a column that contains prompts and other information
    """

    def __init__(
        self,
        data_files: Union[str, List[str]],
        tokenizer: PreTrainedTokenizer,
        processor: Optional[ProcessorMixin] = None,
        prompt_key="prompt",
        image_key="images",
        max_prompt_length=1024,
        filter_prompts=True,
        cache_dir="~/.cache/verl/rlhf",
        chat_template_func=None,
        return_raw_chat=False,
        truncation="error",
        filter_overlong_prompts=False,
        config=None,
    ):
        self.config = config
        # Prioritize config if provided
        if config is not None:
            self.cache_dir = os.path.expanduser(config.get("cache_dir", cache_dir))
            self.prompt_key = config.get("prompt_key", prompt_key)
            self.image_key = config.get("image_key", image_key)
            self.video_key = config.get("video_key", None)
            self.max_prompt_length = config.get("max_prompt_length", max_prompt_length)
            self.return_raw_chat = config.get("return_raw_chat", return_raw_chat)
            self.return_full_prompt = config.get("return_full_prompt", False)
            self.truncation = config.get("truncation", truncation)
            self.filter_overlong_prompts = config.get("filter_overlong_prompts", filter_overlong_prompts)
            self.num_workers = config.get("filter_overlong_prompts_workers", max(1, os.cpu_count() // 4))
            self.num_workers = min(self.num_workers, os.cpu_count())
            self.use_shm = config.get("use_shm", False)
            self.chat_template_func = config.get("chat_template_func", chat_template_func)
            self.need_tools_kwargs = config.get("need_tools_kwargs", False)
            self.filter_prompts = config.get("filter_prompts", filter_prompts)
        else:
            self.cache_dir = os.path.expanduser(cache_dir)
            self.prompt_key = prompt_key
            self.image_key = image_key
            self.video_key = None
            self.max_prompt_length = max_prompt_length
            self.return_raw_chat = return_raw_chat
            self.return_full_prompt = False
            self.truncation = truncation
            self.filter_overlong_prompts = filter_overlong_prompts
            self.num_workers = max(1, os.cpu_count() // 4)
            self.num_workers = min(self.num_workers, os.cpu_count())
            self.use_shm = False
            self.chat_template_func = chat_template_func
            self.need_tools_kwargs = False
            self.filter_prompts = filter_prompts

        parquet_files = data_files
        if not isinstance(parquet_files, (List, ListConfig)):
            parquet_files = [parquet_files]
        self.parquet_files = copy.deepcopy(parquet_files)
        self.original_parquet_files = copy.deepcopy(parquet_files)  # use for resume
        self.tokenizer = tokenizer
        self.processor = processor
        # whether to store the dataset in state_dict()
        # default not store
        self.serialize_dataset = False
        self._download()
        self._read_files_and_tokenize()

    def _download(self, use_origin_parquet=False):
        from verl.utils.fs import copy_to_local

        parquet_files = self.parquet_files if not use_origin_parquet else self.original_parquet_files
        for i, parquet_file in enumerate(parquet_files):
            self.parquet_files[i] = copy_to_local(src=parquet_file, cache_dir=self.cache_dir)

    def _read_files_and_tokenize(self):
        dataframes = []
        for parquet_file in self.parquet_files:
            # read parquet files and cache
            try:
                dataframe = pd.read_parquet(parquet_file)
            except Exception:
                # if pandas fails (most likely due to nested columns), use polars
                # NOTE: added by Reasoning360
                import polars as pl

                dataframe = pl.read_parquet(parquet_file).to_pandas()
            dataframes.append(dataframe)
        self.dataframe = pd.concat(dataframes)

        print(f"dataset len: {len(self.dataframe)}")

        print(self.dataframe.head())

        # Safely check if apply_chat_template exists in dataframe
        # NOTE: added by Reasoning360
        if "apply_chat_template" not in self.dataframe:
            print("Warning: apply_chat_template column not found in dataframe. Defaulting to True.")
            self.dataframe["apply_chat_template"] = [True] * len(self.dataframe)

        # filter out too long prompts
        if self.filter_overlong_prompts:
            tokenizer = self.tokenizer
            prompt_key = self.prompt_key
            self.dataframe = self.dataframe[
                self.dataframe.apply(
                    lambda doc: len(tokenizer.apply_chat_template(doc[prompt_key], add_generation_prompt=True) if doc["apply_chat_template"] else tokenizer.encode(doc["raw_prompt"])) <= self.max_prompt_length,
                    axis=1,
                )
            ]

            print(f"filter dataset len: {len(self.dataframe)}")

    def resume_dataset_state(self):
        self.serialize_dataset = False if hasattr(self, "original_parquet_files") else True
        # resume dataframe if not it's serialized in data.pt
        if not self.serialize_dataset:
            self._download(use_origin_parquet=True)  # download and resume from original parquet files
            self._read_files_and_tokenize()
        else:
            print(r"old dataloader ckpt file is used, please train from scratch for better ckpt performance")

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, item):
        """
        Note that we also return the raw_input_ids so that it can be combined with other chat template
        """
        row_dict: dict = self.dataframe.iloc[item].to_dict()

        chat = row_dict.pop(self.prompt_key)

        prompt_with_chat_template = self.tokenizer.apply_chat_template(chat, add_generation_prompt=True, tokenize=False) if row_dict["apply_chat_template"] else row_dict["raw_prompt"]

        is_multi_modal = self.image_key in row_dict
        if is_multi_modal:  # expand image token
            raw_prompt = prompt_with_chat_template.replace("<image>", "<|vision_start|><|image_pad|><|vision_end|>")
            row_dict["multi_modal_data"] = {"image": [process_image(image) for image in row_dict.pop(self.image_key)]}
            image_inputs = self.processor.image_processor(row_dict["multi_modal_data"]["image"], return_tensors="pt")
            image_grid_thw = image_inputs["image_grid_thw"]
            row_dict["multi_modal_inputs"] = {key: val for key, val in image_inputs.items()}

            if image_grid_thw is not None:
                merge_length = self.processor.image_processor.merge_size**2
                index = 0
                while "<image>" in prompt_with_chat_template:
                    prompt_with_chat_template = prompt_with_chat_template.replace(
                        "<image>",
                        "<|vision_start|>" + "<|placeholder|>" * (image_grid_thw[index].prod() // merge_length) + "<|vision_end|>",
                        1,
                    )
                    index += 1

                prompt_with_chat_template = prompt_with_chat_template.replace("<|placeholder|>", self.processor.image_token)
        else:
            raw_prompt = prompt_with_chat_template

        input_ids, attention_mask = verl_F.tokenize_and_postprocess_data(
            prompt=prompt_with_chat_template,
            tokenizer=self.tokenizer,
            max_length=self.max_prompt_length,
            pad_token_id=self.tokenizer.pad_token_id,
            left_pad=True,
            truncation=self.truncation,
        )

        if is_multi_modal:
            from verl.models.transformers.qwen2_vl import get_rope_index

            position_ids = get_rope_index(
                self.processor,
                input_ids=input_ids[0],
                image_grid_thw=image_grid_thw,
                attention_mask=attention_mask[0],
            )  # (3, seq_len)
        else:
            position_ids = compute_position_id_with_mask(attention_mask)

        row_dict["input_ids"] = input_ids[0]
        row_dict["attention_mask"] = attention_mask[0]
        row_dict["position_ids"] = position_ids[0]
        row_dict["raw_prompt_ids"] = self.tokenizer.encode(raw_prompt, add_special_tokens=False)

        # encode prompts without chat template
        if self.return_raw_chat:
            row_dict["raw_prompt"] = chat.tolist()

        # add index for each prompt
        index = row_dict.get("extra_info", {}).get("index", 0)
        row_dict["index"] = index

        return row_dict

    def __getstate__(self):
        if not self.serialize_dataset:
            state = self.__dict__.copy()

            if "dataframe" in state:
                del state["dataframe"]
            return state
        return self.__dict__.copy()
