# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
from omegaconf import ListConfig
import os
from typing import List, Union, Optional
import copy
import pandas as pd
from collections import defaultdict

import torch
import numpy as np
from torch.utils.data import Dataset
from transformers import PreTrainedTokenizer, ProcessorMixin

from verl.utils.model import compute_position_id_with_mask
import verl.utils.torch_functional as verl_F


def collate_fn(data_list: list[dict]) -> dict:
    """Merge a list of per-sample dicts into a single batch dict.

    Values that are ``torch.Tensor`` instances are grouped per key and stacked
    along a new leading dimension. Every other value is grouped per key and
    packed into a NumPy array with ``dtype=object``.

    Args:
        data_list: The samples to merge; each one maps field names to values.

    Returns:
        A plain dict holding the stacked tensor fields first, followed by the
        object-array fields. If a key shows up as a tensor in one sample and
        as a non-tensor in another, the object array is the value returned.
    """
    tensor_columns: dict = {}
    object_columns: dict = {}

    for sample in data_list:
        for name, value in sample.items():
            column = tensor_columns if isinstance(value, torch.Tensor) else object_columns
            column.setdefault(name, []).append(value)

    batch = {name: torch.stack(values, dim=0) for name, values in tensor_columns.items()}
    # update() keeps the position of an already-present key and replaces its
    # value, which matches merging the two mappings with ``{**a, **b}``.
    batch.update((name, np.array(values, dtype=object)) for name, values in object_columns.items())
    return batch


def process_image(image: dict, max_pixels: int = 2048 * 2048, min_pixels: int = 512 * 512):
    """Decode an image if necessary, clamp its pixel count and force RGB.

    Args:
        image: Either a dict whose ``'bytes'`` entry holds the encoded image,
            or an already-decoded image object. Anything that is not a dict is
            used as-is and must expose ``width``, ``height``, ``mode``,
            ``resize`` and ``convert``.
        max_pixels: Images with more pixels than this are scaled down,
            keeping the aspect ratio.
        min_pixels: Images with fewer pixels than this are scaled up,
            keeping the aspect ratio.

    Returns:
        The image, resized when it fell outside ``[min_pixels, max_pixels]``
        and converted to ``'RGB'`` when it had another mode.
    """
    import math

    if isinstance(image, dict):
        # Only the decoding path needs PIL and BytesIO.
        from io import BytesIO
        from PIL import Image

        image = Image.open(BytesIO(image['bytes']))

    def _rescale_to(img, target_pixels):
        """Resize ``img`` so that its area is approximately ``target_pixels``."""
        factor = math.sqrt(target_pixels / (img.width * img.height))
        # int() truncates, so a very thin image could end up with a side of 0,
        # which is not a valid size; keep every side at least one pixel long.
        width = max(1, int(img.width * factor))
        height = max(1, int(img.height * factor))
        return img.resize((width, height))

    if (image.width * image.height) > max_pixels:
        image = _rescale_to(image, max_pixels)

    if (image.width * image.height) < min_pixels:
        image = _rescale_to(image, min_pixels)

    if image.mode != 'RGB':
        image = image.convert('RGB')

    return image


class RLHFDataset(Dataset):
    """
    Prompt dataset for RLHF backed by one or more parquet files.

    Every row must hold a text prompt under ``prompt_key``. The text before the
    first blank-line separator (two consecutive newlines) is used as the system
    message and everything after it as the user message. Rows whose templated
    prompt tokenizes to more than ``max_prompt_length`` tokens are dropped when
    the files are loaded.

    Args:
        parquet_files: A single parquet path or a list of them. Each one is
            passed through ``copy_to_local`` with ``cache_dir``.
        tokenizer: Tokenizer used to render the chat template and to encode.
        processor: Multimodal processor; only used for rows that contain
            ``image_key``.
        prompt_key: Column holding the prompt text.
        image_key: Column holding the images of a row, if any.
        max_prompt_length: Maximum prompt length in tokens.
        filter_prompts: Stored on the instance. NOTE(review): it is never
            consulted; over-long prompts are always filtered out.
        cache_dir: Local cache directory (``~`` is expanded).
        chat_template_func: Stored on the instance; not used by this class.
        return_raw_chat: When true, ``raw_prompt`` (the system/user message
            list that was rendered) is added to every item.
        truncation: Forwarded to ``verl_F.tokenize_and_postprocess_data``.
        finetune_prompt: When greater than 0, every item carries this many
            prompt variants, concatenated along dim 0.
        control_output: When true, a short language-specific opener is
            appended after the generation prompt.
        first_no_lang: Strip the language instruction from the system prompt
            of the primary prompt (ignored when ``all_random`` is true).
        first_random_prompt: Replace " in English" by " in a randomly selected
            language" in the primary prompt (ignored when ``all_random`` is
            true, and when ``first_no_lang`` is true).
        all_random: When true, all ``finetune_prompt`` variants are sampled;
            otherwise the primary prompt is the first variant.
        language_num: Selects the built-in language list (10 or 15) used when
            ``langcode_to_name`` is empty.
        langcode_to_name: Language names to sample from for the variants.
            ``None`` or an empty list selects the built-in list.
    """

    # Opener appended after the generation prompt when ``control_output`` is on.
    _CONTROL_RESPONSE = {
        "Chinese": "好的 ",
        "English": "Okay,",
        "Japanese": "はい、",
        "German": "Gut,",
        "Arabic": "حسنًا،",
        "Spanish": "Bien,",
        "French": "D'accord,",
        "Italian": "Va bene,",
        "Indonesia": "Baik,",
        "Korean": "알겠습니다,",
        "Portuguese": "Tudo bem,",
        "Russian": "Хорошо,",
        "Thai": "ดี,",
        "Vietnamese": "Được rồi,",
    }

    # Built-in language lists, keyed by ``language_num``.
    _DEFAULT_LANGUAGES = {
        10: ("Bengali", "Chinese", "English", "French", "German", "Japanese", "Russian", "Swahili", "Spanish",
             "Thai"),
        15: ("Arabic", "Bengali", "Chinese", "English", "French", "German", "Japanese", "Korean", "Portuguese",
             "Russian", "Swahili", "Spanish", "Thai", "Telugu", "Vietnamese"),
    }

    def __init__(self,
                 parquet_files: Union[str, List[str]],
                 tokenizer: PreTrainedTokenizer,
                 processor: Optional[ProcessorMixin] = None,
                 prompt_key='question',
                 image_key='images',
                 max_prompt_length=1024,
                 filter_prompts=True,
                 cache_dir='~/.cache/verl/rlhf',
                 chat_template_func=None,
                 return_raw_chat=False,
                 truncation='left',
                 finetune_prompt=0,
                 control_output=False,
                 first_no_lang=False,
                 first_random_prompt=False,
                 all_random=False,
                 language_num=10,
                 langcode_to_name=None):
        if not isinstance(parquet_files, (list, ListConfig)):
            parquet_files = [parquet_files]

        self.parquet_files = copy.deepcopy(parquet_files)
        self.original_parquet_files = copy.deepcopy(parquet_files)  # use for resume
        self.cache_dir = os.path.expanduser(cache_dir)
        self.tokenizer = tokenizer
        self.processor = processor

        self.prompt_key = prompt_key
        self.image_key = image_key
        self.max_prompt_length = max_prompt_length
        self.filter_prompts = filter_prompts

        self.return_raw_chat = return_raw_chat
        self.chat_template_func = chat_template_func
        self.truncation = truncation

        # whether to store the dataset in state_dict()
        # default not store
        self.serialize_dataset = False
        self.finetune_prompt = finetune_prompt

        self.control_output = control_output
        self.first_no_lang = first_no_lang
        self.first_random_prompt = first_random_prompt
        self.all_random = all_random
        self.language_num = language_num
        # A fresh list per instance: a mutable default argument would be
        # shared by every dataset created without an explicit list.
        self.langcode_to_name = [] if langcode_to_name is None else langcode_to_name
        # Running counter used as a fallback sample index in __getitem__.
        self.i = 0

        self._download()
        self._read_files_and_tokenize()

    @staticmethod
    def _split_prompt(prompt):
        """Split a prompt at its first blank-line separator into (system, user) text."""
        system_prompt, _, user_content = prompt.partition("\n\n")
        return system_prompt, user_content

    @staticmethod
    def _build_messages(system_prompt, user_content):
        """Build the two-message chat passed to the tokenizer's chat template."""
        return [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_content}]

    def _render(self, system_prompt, user_content):
        """Render a system/user pair with the chat template, without tokenizing."""
        return self.tokenizer.apply_chat_template(self._build_messages(system_prompt, user_content),
                                                  add_generation_prompt=True,
                                                  tokenize=False)

    def _tokenize(self, prompt):
        """Tokenize one rendered prompt into left-padded (input_ids, attention_mask)."""
        return verl_F.tokenize_and_postprocess_data(prompt=prompt,
                                                    tokenizer=self.tokenizer,
                                                    max_length=self.max_prompt_length,
                                                    pad_token_id=self.tokenizer.pad_token_id,
                                                    left_pad=True,
                                                    truncation=self.truncation)

    def _resolve_languages(self):
        """Return the languages to sample from, filling in the built-in list when none was given."""
        if len(self.langcode_to_name) == 0 and self.language_num in self._DEFAULT_LANGUAGES:
            self.langcode_to_name = list(self._DEFAULT_LANGUAGES[self.language_num])
        return self.langcode_to_name

    def _download(self, use_origin_parquet=False):
        """Copy every parquet file to the local cache and record the local paths.

        Args:
            use_origin_parquet: Start from the original paths instead of the
                current (possibly already local) ones.
        """
        from verl.utils.fs import copy_to_local
        parquet_files = self.original_parquet_files if use_origin_parquet else self.parquet_files
        for i, parquet_file in enumerate(parquet_files):
            self.parquet_files[i] = copy_to_local(src=parquet_file, cache_dir=self.cache_dir)

    def _read_files_and_tokenize(self):
        """Load all parquet files into ``self.dataframe`` and drop over-long prompts."""
        # read parquet files and cache
        self.dataframe = pd.concat([pd.read_parquet(parquet_file) for parquet_file in self.parquet_files])

        print(f'original dataset len: {len(self.dataframe)}')

        # filter out too long prompts
        # NOTE(review): self.filter_prompts is not consulted; the filter always runs.
        tokenizer = self.tokenizer
        max_prompt_length = self.max_prompt_length

        def fits(prompt) -> bool:
            system_prompt, user_content = self._split_prompt(prompt)
            token_ids = tokenizer.apply_chat_template(self._build_messages(system_prompt, user_content),
                                                      add_generation_prompt=True)
            return len(token_ids) <= max_prompt_length

        # A positional boolean array also works for an empty frame and for the
        # repeated index labels that pd.concat leaves behind.
        keep = self.dataframe[self.prompt_key].map(fits).astype(bool).to_numpy()
        self.dataframe = self.dataframe[keep]

        print(f'filter dataset len: {len(self.dataframe)}')

    def resume_dataset_state(self):
        """Rebuild ``self.dataframe`` after the dataset was restored from a checkpoint."""
        self.serialize_dataset = not hasattr(self, 'original_parquet_files')
        # resume dataframe if not it's serialized in data.pt
        if not self.serialize_dataset:
            self._download(use_origin_parquet=True)  # download and resume from original parquet files
            self._read_files_and_tokenize()
        else:
            print(r'old dataloader ckpt file is used, please train from scratch for better ckpt performance')

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, item):
        """
        Build the model inputs for one row.

        The returned dict holds the remaining columns of the row plus
        ``input_ids``, ``attention_mask``, ``position_ids``, ``raw_prompt_ids``
        (the rendered prompt encoded without padding, so that it can be
        combined with other chat templates), ``index`` and, optionally,
        ``raw_prompt`` and the multimodal fields.

        Raises:
            ValueError: if prompt variants have to be sampled but there is no
                language to sample from.
        """
        row_dict: dict = self.dataframe.iloc[item].to_dict()

        chat = row_dict.pop(self.prompt_key)
        original_system_prompt, user_content = self._split_prompt(chat)

        system_prompt = original_system_prompt
        if not self.all_random:
            if self.first_no_lang:
                system_prompt = system_prompt.replace(" in English", "")
                system_prompt = system_prompt.replace(" in a randomly selected language", "")
            elif self.first_random_prompt:
                system_prompt = system_prompt.replace(" in English", " in a randomly selected language")

        prompt_with_chat_template = self._render(system_prompt, user_content)
        # The English opener is only added when the prompt still asks for English.
        asks_for_english = (not (self.first_no_lang or self.first_random_prompt) and
                            " in English" in system_prompt)
        if asks_for_english and self.control_output:
            prompt_with_chat_template += self._CONTROL_RESPONSE["English"]

        prompt_with_chat_templates = []
        if self.finetune_prompt > 0:
            languages = self._resolve_languages()
            if not self.all_random:
                prompt_with_chat_templates.append(prompt_with_chat_template)

            num_sampled = self.finetune_prompt - len(prompt_with_chat_templates)
            if num_sampled > 0 and len(languages) == 0:
                raise ValueError(f'no languages to sample from: langcode_to_name is empty and '
                                 f'language_num={self.language_num} has no built-in list')

            for _ in range(num_sampled):
                # random.sample (not random.choice) keeps seeded runs reproducible
                # with earlier versions of this class.
                lang = random.sample(languages, 1)[0]
                sampled_system_prompt = original_system_prompt.replace("English", lang)
                sampled_prompt = self._render(sampled_system_prompt, user_content)
                if self.control_output:
                    # Languages without a predefined opener get none instead
                    # of failing with a KeyError.
                    sampled_prompt += self._CONTROL_RESPONSE.get(lang, "")
                prompt_with_chat_templates.append(sampled_prompt)

        # Remember this before the image column is popped below; checking the
        # row again afterwards would always report "no image".
        is_multi_modal = self.image_key in row_dict
        image_grid_thw = None
        if is_multi_modal:  # expand image token
            raw_prompt = prompt_with_chat_template.replace('<image>', '<|vision_start|><|image_pad|><|vision_end|>')
            row_dict['multi_modal_data'] = {'image': [process_image(image) for image in row_dict.pop(self.image_key)]}
            image_inputs = self.processor.image_processor(row_dict['multi_modal_data']['image'], return_tensors='pt')
            image_grid_thw = image_inputs['image_grid_thw']
            row_dict['multi_modal_inputs'] = dict(image_inputs.items())

            if image_grid_thw is not None:
                merge_length = self.processor.image_processor.merge_size**2
                image_index = 0
                # Replace one '<image>' at a time so that each image gets its
                # own number of placeholder tokens.
                while '<image>' in prompt_with_chat_template:
                    num_placeholders = image_grid_thw[image_index].prod() // merge_length
                    prompt_with_chat_template = prompt_with_chat_template.replace(
                        '<image>',
                        '<|vision_start|>' + '<|placeholder|>' * num_placeholders + '<|vision_end|>',
                        1,
                    )
                    image_index += 1

                prompt_with_chat_template = prompt_with_chat_template.replace('<|placeholder|>',
                                                                              self.processor.image_token)
        else:
            raw_prompt = prompt_with_chat_template

        if self.finetune_prompt > 0:
            # NOTE(review): the variants were rendered before the image tokens
            # were expanded and get text-only position ids, so this path does
            # not support multimodal rows - confirm this is intended.
            encoded = [self._tokenize(prompt) for prompt in prompt_with_chat_templates]
            row_dict['input_ids'] = torch.cat([input_ids for input_ids, _ in encoded], dim=0)
            row_dict['attention_mask'] = torch.cat([attention_mask for _, attention_mask in encoded], dim=0)
            row_dict['position_ids'] = torch.cat(
                [compute_position_id_with_mask(attention_mask) for _, attention_mask in encoded], dim=0)
        else:
            input_ids, attention_mask = self._tokenize(prompt_with_chat_template)

            if is_multi_modal:
                from verl.models.transformers.qwen2_vl import get_rope_index

                # (3, seq_len); kept whole, indexing it with [0] would drop
                # two of the three position axes.
                # NOTE(review): assumes downstream accepts (3, seq_len) position ids - confirm.
                position_ids = get_rope_index(
                    self.processor,
                    input_ids=input_ids[0],
                    image_grid_thw=image_grid_thw,
                    attention_mask=attention_mask[0],
                )
            else:
                position_ids = compute_position_id_with_mask(attention_mask)[0]

            row_dict['input_ids'] = input_ids[0]
            row_dict['attention_mask'] = attention_mask[0]
            row_dict['position_ids'] = position_ids
        row_dict['raw_prompt_ids'] = self.tokenizer.encode(raw_prompt, add_special_tokens=False)

        # keep the prompt without chat template
        if self.return_raw_chat:
            # The prompt column is a plain string here (it has no ``tolist``),
            # so the chat is returned as the message list that was rendered.
            row_dict['raw_prompt'] = self._build_messages(system_prompt, user_content)

        # add index for each prompt; fall back to the running counter when the
        # row carries no usable ``extra_info``
        extra_info = row_dict.get("extra_info", {})
        row_dict["index"] = extra_info.get("index", self.i) if hasattr(extra_info, "get") else self.i
        self.i += 1

        return row_dict

    def __getstate__(self):
        """Return the picklable state, leaving out the dataframe unless ``serialize_dataset`` is set."""
        state = self.__dict__.copy()
        if not self.serialize_dataset:
            state.pop('dataframe', None)
        return state
