# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import json
import copy

from omegaconf import ListConfig
from typing import List, Union
import copy
import pandas as pd

import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, PreTrainedTokenizer
from verl.utils.fs import copy_local_path_from_hdfs

from verl.utils.model import compute_position_id_with_mask
import verl.utils.torch_functional as verl_F

from uniform_eval.support.job_utils import get_reference


def collate_fn(data_list: list[dict]) -> dict:
    """Merge a list of per-sample dicts into a single batch dict.

    Values that are ``torch.Tensor`` instances are grouped per key and stacked
    along a new leading dimension. Every other value is grouped per key and
    packed into a NumPy array with ``dtype=object``.

    Args:
        data_list: The samples to batch, one dict per sample.

    Returns:
        A dict holding the stacked tensors first, followed by the object
        arrays. If a key collected both tensor and non-tensor values, the
        object array replaces the stacked tensor.
    """
    tensor_groups = {}
    object_groups = {}
    for sample in data_list:
        for name, value in sample.items():
            # Route each value to the bucket that matches its type.
            bucket = tensor_groups if isinstance(value, torch.Tensor) else object_groups
            bucket.setdefault(name, []).append(value)

    batch = {name: torch.stack(items, dim=0) for name, items in tensor_groups.items()}
    batch.update({name: np.array(items, dtype=object) for name, items in object_groups.items()})
    return batch


class RLHFDataset(Dataset):
    """Prompt dataset for RLHF, loaded from one or more JSONL files.

    Every JSON line is normalised by ``_uni_data_format`` into a record that
    carries the chat ``messages`` together with bookkeeping fields.
    ``__getitem__`` renders the chat through the tokenizer's chat template and
    returns the tokenized prompt alongside the normalised record.
    """

    def __init__(self,
                 jsonl_files: Union[str, List[str]],
                 tokenizer: PreTrainedTokenizer,
                 prompt_key='messages',
                 max_prompt_length=1024,
                 filter_prompts=True,
                 cache_dir='~/.cache/verl/rlhf',
                 chat_template_func=None,
                 return_raw_chat=False,
                 truncation='error'):
        """Fetch the data files and load all records into memory.

        Args:
            jsonl_files: A single path or a list/tuple/``ListConfig`` of paths.
            tokenizer: Tokenizer used for chat templating and tokenization.
            prompt_key: Record key that holds the chat; must be ``"messages"``.
            max_prompt_length: Maximum prompt length in tokens.
            filter_prompts: Whether prompts longer than ``max_prompt_length``
                are dropped while loading.
            cache_dir: Cache directory handed to ``copy_local_path_from_hdfs``;
                ``~`` is expanded.
            chat_template_func: Stored on the instance; not used by the code in
                this class.
            return_raw_chat: Whether ``__getitem__`` also returns the chat
                messages under ``raw_prompt``.
            truncation: Passed through to
                ``verl_F.tokenize_and_postprocess_data``.
        """
        # Always work on a private list: _download rewrites entries in place
        # and must not modify the caller's list or config object.
        if isinstance(jsonl_files, (list, tuple, ListConfig)):
            jsonl_files = list(jsonl_files)
        else:
            jsonl_files = [jsonl_files]

        self.jsonl_files = jsonl_files
        self.cache_dir = os.path.expanduser(cache_dir)
        self.tokenizer = tokenizer

        self.prompt_key = prompt_key
        self.max_prompt_length = max_prompt_length
        self.filter_prompts = filter_prompts

        self.return_raw_chat = return_raw_chat
        self.chat_template_func = chat_template_func
        self.truncation = truncation

        # Whether to keep the loaded records in the pickled state
        # (see __getstate__). Not stored by default.
        self.serialize_dataset = False
        self._download()
        self._read_files_and_tokenize()

    def _download(self, use_origin_parquet=False):
        """Replace each entry of ``self.jsonl_files`` with the helper's result.

        ``use_origin_parquet`` is accepted for interface compatibility and is
        ignored, because saving training files for resume is not supported.
        """
        from verl.utils.fs import copy_local_path_from_hdfs

        # NOTE(review): presumably the helper returns a local path and is a
        # no-op for paths that are already local - confirm.
        for i, jsonl_file in enumerate(self.jsonl_files):
            self.jsonl_files[i] = copy_local_path_from_hdfs(src=jsonl_file, cache_dir=self.cache_dir)

    def _uni_data_format(self, item):
        """Normalise one raw JSON record into the layout used by this dataset.

        Args:
            item: The dict decoded from one JSONL line.

        Returns:
            A new dict with ``data_source``, ``answer``, ``ability``, ``task``,
            ``reward_model``, ``extra_info``, ``job`` (a shallow copy of
            ``item``), ``messages`` and, when present in ``item``,
            ``rollout_item``.

        Raises:
            AssertionError: If ``prompt_key`` is not ``"messages"``, or if
                ``messages`` has to be built and ``item`` has no ``prompt``.
        """
        assert self.prompt_key == "messages", "\"prompt_key\" has to be set as \"messages\""

        job = item.copy()
        new_item = {}
        # The data source falls back to the task name, then to "default".
        new_item['data_source'] = item.get('data_source', item.get('task', "default"))
        for key in ('answer', 'ability', 'task'):
            new_item[key] = item.get(key, "default")

        for key in ('reward_model', 'extra_info'):
            new_item[key] = item.get(key, {})

        if self.prompt_key in item:
            new_item[self.prompt_key] = item[self.prompt_key]

        new_item['job'] = job
        if 'rollout_item' in item:  # for rollout
            new_item['rollout_item'] = item['rollout_item']

        # Build the chat from the job's reference turns when the record does
        # not ship ready-made messages.
        if "messages" not in new_item:
            messages = []
            assert "prompt" in job
            system_str = job.get("eval_args", {}).get("system_str", None)
            if system_str:
                messages.append({'role': 'system', 'content': system_str})

            # NOTE(review): assumes get_reference returns a non-empty list of
            # queries; the final query is the one left open for generation.
            queries, responses = get_reference(job)
            turn_idx = len(queries) - 1
            for i in range(turn_idx):
                messages.append({'role': 'user', 'content': queries[i]})
                messages.append({'role': 'assistant', 'content': responses[i]})
            messages.append({'role': 'user', 'content': queries[turn_idx]})

            new_item['messages'] = messages

        return new_item

    def _read_files_and_tokenize(self):
        """Load every JSONL file into ``self.dataframe`` and filter long prompts.

        Blank lines are skipped. When ``self.filter_prompts`` is true, records
        whose chat-templated prompt is longer than ``self.max_prompt_length``
        are dropped.
        """
        records = []
        for jsonl_file in self.jsonl_files:
            with open(jsonl_file, 'r', encoding='utf-8') as f:
                for line in f:
                    # A blank or trailing empty line is not a record.
                    if not line.strip():
                        continue
                    records.append(self._uni_data_format(json.loads(line)))
        self.dataframe = records

        print(f'original dataset len: {len(self.dataframe)}')

        if self.filter_prompts:
            tokenizer = self.tokenizer
            prompt_key = self.prompt_key
            max_len = self.max_prompt_length
            self.dataframe = [
                doc for doc in self.dataframe
                if len(tokenizer.apply_chat_template(doc[prompt_key], add_generation_prompt=True)) <= max_len
            ]

        print(f'filter dataset len: {len(self.dataframe)}')

    def resume_dataset_state(self):
        """Reload ``self.dataframe`` from the data files.

        The loaded records are never kept in the pickled state, so they are
        always fetched and parsed again here.
        """
        self.serialize_dataset = False
        self._download(use_origin_parquet=True)
        self._read_files_and_tokenize()

    def __len__(self):
        """Return the number of loaded records."""
        return len(self.dataframe)

    def __getitem__(self, item):
        """Return the record at index ``item`` with its tokenized prompt.

        The returned dict is a deep copy of the stored record, extended with
        ``prompt_with_chat_template``, ``input_ids``, ``attention_mask``,
        ``position_ids``, ``index`` and, when ``return_raw_chat`` is set,
        ``raw_prompt``.
        """
        row_dict = copy.deepcopy(self.dataframe[item])

        chat = row_dict[self.prompt_key]

        prompt_with_chat_template = self.tokenizer.apply_chat_template(chat, add_generation_prompt=True, tokenize=False)
        row_dict['prompt_with_chat_template'] = prompt_with_chat_template

        input_ids, attention_mask = verl_F.tokenize_and_postprocess_data(prompt=prompt_with_chat_template,
                                                                         tokenizer=self.tokenizer,
                                                                         max_length=self.max_prompt_length,
                                                                         pad_token_id=self.tokenizer.pad_token_id,
                                                                         left_pad=True,
                                                                         truncation=self.truncation)

        position_ids = compute_position_id_with_mask(attention_mask)

        # NOTE(review): presumably the helpers return batched tensors with a
        # leading dimension of 1; only the first row is kept.
        row_dict['input_ids'] = input_ids[0]
        row_dict['attention_mask'] = attention_mask[0]
        row_dict['position_ids'] = position_ids[0]

        # Return the chat messages without the chat template applied.
        if self.return_raw_chat:
            # Records decoded from JSON hold plain lists, which have no
            # .tolist(); convert only array-like chats.
            row_dict['raw_prompt'] = chat.tolist() if hasattr(chat, 'tolist') else chat

        # Add an index for each prompt; tolerate a null/empty extra_info.
        index = (row_dict.get("extra_info") or {}).get("index", 0)
        row_dict["index"] = index

        return row_dict

    def __getstate__(self):
        """Return the picklable state, omitting the records unless serialized."""
        state = self.__dict__.copy()
        if not self.serialize_dataset:
            state.pop('dataframe', None)
        return state
