from .image_base import ImageBaseDataset
from .utils import build_judge, Spatial457_utils
from ..smp import *


class COCO_Caption_Scorer():
    """Score captions with the BLEU, ROUGE-L and CIDEr scorers of ``pycocoevalcap``.

    ``ref`` and ``gt`` are stored as given and forwarded to every scorer's
    ``compute_score(gt, ref)`` call.
    """

    def __init__(self, ref, gt):
        # The scorer classes are imported at construction time, not at module import time.
        from pycocoevalcap.bleu.bleu import Bleu
        from pycocoevalcap.rouge.rouge import Rouge
        from pycocoevalcap.cider.cider import Cider

        self.ref, self.gt = ref, gt
        print('setting up scorers...')
        # Each entry pairs a scorer with its label; a list label marks a multi-valued metric.
        bleu_labels = [f'Bleu_{order}' for order in range(1, 5)]
        self.scorers = [
            (Bleu(4), bleu_labels),
            (Rouge(), 'ROUGE_L'),
            (Cider(), 'CIDEr'),
        ]

    def compute_scores(self):
        """Run every configured scorer and return the scores multiplied by 100.

        Returns:
            dict: ``'Bleu'`` maps to a list with one value per element of the
            scorer's overall score; every other label maps to a single value.
        """
        results = {}
        for metric, label in self.scorers:
            print('computing %s score...' % (metric.method()))
            overall, per_item = metric.compute_score(self.gt, self.ref)

            if not isinstance(label, list):
                scaled = overall * 100
                print('%s: %0.3f' % (label, scaled))
                results[label] = scaled
                continue

            # Multi-valued metric: report each order, store the whole list under 'Bleu'.
            for value, _, name in zip(overall, per_item, label):
                print('%s: %0.3f' % (name, value * 100))
            results['Bleu'] = [value * 100 for value in overall]

        print('*****DONE*****')
        for name, value in results.items():
            print(f'{name}:{value}')
        return results
    

class MMMUProCaptionDataset(ImageBaseDataset):
    """Caption-generation prompts for the MMMU-Pro TSVs listed in ``DATASET_URL``.

    For the ``MMMU_Pro_V`` variants the prompt is a single image plus an
    answer-the-question instruction; for the other variants it is the question
    (with ``<image N>`` tags expanded to the matching images) followed by a
    captioning instruction.
    """

    TYPE = 'Caption'

    DATASET_URL = {
        'MMMU_Pro_10c_CAPTION': 'https://opencompass.openxlab.space/utils/VLMEval/MMMU_Pro_10c.tsv',
        'MMMU_Pro_10c_COT_CAPTION': 'https://opencompass.openxlab.space/utils/VLMEval/MMMU_Pro_10c.tsv',
        'MMMU_Pro_V_CAPTION': 'https://opencompass.openxlab.space/utils/VLMEval/MMMU_Pro_V.tsv',
        'MMMU_Pro_V_COT_CAPTION': 'https://opencompass.openxlab.space/utils/VLMEval/MMMU_Pro_V.tsv',
    }

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # The 'MMMU_Pro_V' variants get a constant question column.
        # NOTE(review): presumably because that TSV carries the question inside the image - confirm.
        if 'MMMU_Pro_V' in self.dataset_name:
            self.data['question'] = ['placeholder'] * len(self.data)

    @staticmethod
    def split_MMMU(msgs):
        """Interleave images into the text at each ``<image N>`` tag.

        Args:
            msgs: list of ``dict(type=..., value=...)`` parts with at most one
                ``'text'`` part (asserted) and any number of ``'image'`` parts.

        Returns:
            ``msgs`` unchanged when there is no text part or the text has no
            ``<image `` tag; otherwise a new list alternating text and image
            parts, where tag ``N`` (a single character) selects the N-th image.
        """
        text, images = None, []
        for s in msgs:
            if s['type'] == 'image':
                images.append(s['value'])
            elif s['type'] == 'text':
                assert text is None
                text = s['value']
        if text is None:
            # Nothing to expand; previously this fell through to None.split().
            return msgs
        text_segs = text.split('<image ')
        if len(text_segs) == 1:
            return msgs

        segs = [dict(type='text', value=text_segs[0])]
        for seg in text_segs[1:]:
            # Every later segment starts with "<digit>>" left over from the tag.
            assert istype(seg[0], int) and seg[1] == '>'
            image_idx = int(seg[0]) - 1
            segs.append(dict(type='image', value=images[image_idx]))
            segs.append(dict(type='text', value=seg[2:]))
        return segs

    def build_prompt(self, line):
        """Build the multi-modal message list for one record.

        Args:
            line: a data record, or an integer position into ``self.data``.

        Returns:
            list of ``dict(type=..., value=...)`` message parts.
        """
        if isinstance(line, int):
            line = self.data.iloc[line]

        if self.meta_only:
            tgt_path = toliststr(line['image_path'])
        else:
            tgt_path = self.dump_image(line)

        if 'MMMU_Pro_V' in self.dataset_name:
            question = 'Answer the following multiple-choice question in the image. '
            if 'COT' in self.dataset_name:
                question += (
                    "The last line of your response should be of the following format: 'Answer: $LETTER' "
                    "(without quotes) where LETTER is one of the options. Think step by step before answering. "
                )
            else:
                question += "Answer directly with the option letter from the given choices. "
            if isinstance(tgt_path, list):
                # This variant expects exactly one image per record.
                assert len(tgt_path) == 1
                tgt_path = tgt_path[0]
            return [dict(type='image', value=tgt_path), dict(type='text', value=question)]

        # Only the question is sent; the answer options are left out of the prompt.
        question = line['question']
        prompt = f'Question: {question}\n'

        msgs = []
        if isinstance(tgt_path, list):
            msgs.extend([dict(type='image', value=p) for p in tgt_path])
        else:
            msgs = [dict(type='image', value=tgt_path)]
        msgs.append(dict(type='text', value=prompt))
        msgs = self.split_MMMU(msgs)
        caption_prompt = (
            "Given the above image and a question, generate a caption for the image."
            "You need to pay attention to the following points: \n"
            "1. The caption MUST contain all necessary information to answer the question. A human should be able to answer the question based on the caption without seeing the image.\n"
            "2. But do not repeat the question in your response.\n"
            "3. DO NOT generate the answer to the question, you can only provide a caption describing the image.\n"
            "4. If there are multiple images, please generate a separate caption for each image."
        )
        msgs.append(dict(type='text', value=caption_prompt))
        return msgs

    def cot_postproc(self, response):
        """Extract the answer from a response that has exactly one ``Answer:`` line.

        Returns the single uppercase letter when that line contains only one
        distinct uppercase letter, the text after ``Answer:`` when it contains
        zero or several, and ``response`` unchanged when there is not exactly
        one such line.
        """
        lines = response.strip().split('\n')
        lines = [x.strip() for x in lines]
        cands = [x for x in lines if x.startswith('Answer:')]
        if len(cands) == 1:
            counter = defaultdict(lambda: 0)
            for ch in cands[0]:
                if ch in string.ascii_uppercase:
                    counter[ch] += 1
            if len(counter) == 1:
                return list(counter.keys())[0]
            else:
                # len('Answer:') == 7: drop the prefix, keep the rest verbatim.
                return cands[0][7:]
        return response

    def evaluate(self, eval_file, **judge_kwargs):
        """No-op: this class performs no evaluation and returns ``None``."""
        pass

class MMMUCaptionDataset(ImageBaseDataset):
    """Caption-generation prompts for the MMMU TSVs listed in ``DATASET_URL``."""

    TYPE = 'Caption'
    DATASET_URL = {
        'MMMU_DEV_VAL_CAPTION': 'https://opencompass.openxlab.space/utils/VLMEval/MMMU_DEV_VAL.tsv',
        'MMMU_TEST_CAPTION': 'https://opencompass.openxlab.space/utils/VLMEval/MMMU_TEST.tsv',
    }

    DATASET_MD5 = {
        'MMMU_DEV_VAL_CAPTION': '585e8ad75e73f75dcad265dfd0417d64',
        'MMMU_TEST_CAPTION': 'c19875d11a2d348d07e5eb4bdf33166d',
    }

    @classmethod
    def evaluate(cls, eval_file, **judge_kwargs):
        """No-op: this class performs no evaluation and returns ``None``."""
        return None

    @staticmethod
    def split_MMMU(msgs):
        """Replace each ``<image N>`` tag in the text part with the N-th image part.

        ``msgs`` must hold at most one ``'text'`` part (asserted). It is
        returned as-is when the text has no ``<image `` tag; otherwise a new
        list of alternating text/image parts is returned.
        """
        text = None
        images = []
        for part in msgs:
            kind = part['type']
            if kind == 'text':
                assert text is None
                text = part['value']
            elif kind == 'image':
                images.append(part['value'])

        head, *tagged = text.split('<image ')
        if not tagged:
            return msgs

        segs = [{'type': 'text', 'value': head}]
        for chunk in tagged:
            # chunk begins with "<digit>>", the remainder of the original tag.
            assert istype(chunk[0], int) and chunk[1] == '>'
            segs.append({'type': 'image', 'value': images[int(chunk[0]) - 1]})
            segs.append({'type': 'text', 'value': chunk[2:]})
        return segs

    def build_prompt(self, line):
        """Return the base prompt with image tags expanded, plus the captioning instruction."""
        msgs = self.split_MMMU(super().build_prompt(line))
        caption_prompt = (
            "Given the above image and a question, generate a caption for the image."
            "You need to pay attention to the following points: \n"
            "1. The caption MUST contain all necessary information to answer the question. A human should be able to answer the question based on the caption without seeing the image.\n"
            "2. But do not repeat the question in your response.\n"
            "3. DO NOT generate the answer to the question, you can only provide a caption describing the image.\n"
            "4. If there are multiple images, please generate a separate caption for each image."
        )
        msgs.append({'type': 'text', 'value': caption_prompt})
        return msgs
    
class MathVisionCaption(ImageBaseDataset):
    """Caption-generation prompts for the MathVision TSVs listed in ``DATASET_URL``."""

    TYPE = 'Caption'
    DATASET_URL = {
        'MathVisionCaption':
        'https://opencompass.openxlab.space/utils/VLMEval/MathVision.tsv',
        'MathVisionCaption_MINI':
        'https://opencompass.openxlab.space/utils/VLMEval/MathVision_MINI.tsv'
    }
    DATASET_MD5 = {
        'MathVisionCaption': '93f6de14f7916e598aa1b7165589831e',
        'MathVisionCaption_MINI': '060fe4fa5d868987ce179307bd5f8a33'
    }

    @classmethod
    def evaluate(cls, eval_file, **judge_kwargs):
        """No-op: this class performs no evaluation and returns ``None``."""
        return None

    def build_prompt(self, line):
        """Return the image part(s) followed by one text part: instructions plus the question.

        Args:
            line: a data record, or an integer position into ``self.data``.
        """
        record = self.data.iloc[line] if isinstance(line, int) else line
        tgt_path = toliststr(record['image_path']) if self.meta_only else self.dump_image(record)

        # Questions may look like "Hint: {hint} Question: {question}"; keep only
        # what follows the last "Question:" marker.
        original_question = record['question']
        if 'Question:' in original_question:
            original_question = original_question.rpartition('Question:')[2].strip()

        caption_prompt = (
            "Given the image and a question, generate a caption for the image."
            "You need to pay attention to the following points: \n"
            "1. The caption MUST contain all necessary information to answer the question. A human should be able to answer the question based on the caption without seeing the image.\n"
            "2. But do not repeat the question in your response.\n"
            "3. DO NOT generate the answer to the question, you can only provide a caption describing the image.\n"
            "4. If there are multiple images, please generate a separate caption for each image.\n"
            f"Question: {original_question}"
        )

        image_paths = tgt_path if isinstance(tgt_path, list) else [tgt_path]
        msgs = [{'type': 'image', 'value': p} for p in image_paths]
        msgs.append({'type': 'text', 'value': caption_prompt})
        return msgs


class MathVistaCaption(ImageBaseDataset):
    """Caption-generation prompts for the MathVista mini TSV listed in ``DATASET_URL``."""

    # NOTE(review): the other caption datasets in this file use TYPE = 'Caption';
    # confirm that 'VQA' is intended here.
    TYPE = 'VQA'
    DATASET_URL = {
        'MathVista_MINI_CAPTION':
        'https://opencompass.openxlab.space/utils/VLMEval/MathVista_MINI.tsv'
    }

    @classmethod
    def evaluate(cls, eval_file, **judge_kwargs):
        """No-op: this class performs no evaluation and returns ``None``."""
        return None

    def build_prompt(self, line):
        """Return the base prompt, trimmed to the question, plus the captioning instruction."""
        msgs = super().build_prompt(line)

        # In every text part, drop whatever precedes the last "Question:" marker
        # and re-attach the marker itself (without a following space).
        for part in msgs:
            if part['type'] == 'text' and 'Question:' in part['value']:
                part['value'] = 'Question:' + part['value'].split('Question:')[-1].strip()

        caption_prompt = (
            "Given the above image and a question, generate a caption for the image."
            "You need to pay attention to the following points: \n"
            "1. The caption MUST contain all necessary information to answer the question. A human should be able to answer the question based on the caption without seeing the image.\n"
            "2. But do not repeat the question in your response.\n"
            "3. DO NOT generate the answer to the question, you can only provide a caption describing the image.\n"
            "4. If there are multiple images, please generate a separate caption for each image."
        )
        msgs.append({'type': 'text', 'value': caption_prompt})
        return msgs
    
class ImageMCQCaptionDataset(ImageBaseDataset):
    """Caption-generation prompts for the multiple-choice TSVs listed in ``DATASET_URL``."""

    TYPE = 'Caption'

    DATASET_URL = {
        'ScienceQA_VAL_CAPTION': 'https://opencompass.openxlab.space/utils/benchmarks/ScienceQA/ScienceQA_VAL.tsv',
        'ScienceQA_TEST_CAPTION': 'https://opencompass.openxlab.space/utils/benchmarks/ScienceQA/ScienceQA_TEST.tsv',

        'SpatialEval_Caption': 'https://opencompass.openxlab.space/utils/VLMEval/SpatialEval.tsv',
        'RealWorldQA_Caption': 'https://opencompass.openxlab.space/utils/VLMEval/RealWorldQA.tsv',
    }

    def build_prompt(self, line):
        """Return image part(s), a question-plus-options text part, then the captioning instruction.

        Args:
            line: a data record, or an integer position into ``self.data``.
        """
        record = self.data.iloc[line] if isinstance(line, int) else line
        tgt_path = toliststr(record['image_path']) if self.meta_only else self.dump_image(record)

        question = record['question']
        prompt = f'Question: {question}\n'

        # Collect every option column (A, B, C, ...) that is present and not NaN;
        # when at least one exists the options are listed after the question.
        # Any 'hint' column is not used.
        choices = {
            letter: record[letter]
            for letter in string.ascii_uppercase
            if letter in record and not pd.isna(record[letter])
        }
        if choices:
            prompt += 'Options:\n' + ''.join(
                f'{letter}. {content}\n' for letter, content in choices.items()
            )

        image_paths = tgt_path if isinstance(tgt_path, list) else [tgt_path]
        msgs = [{'type': 'image', 'value': p} for p in image_paths]
        msgs.append({'type': 'text', 'value': prompt})

        caption_prompt = (
            "Given the above image and a question, generate a caption for the image."
            "You need to pay attention to the following points: \n"
            "1. The caption MUST contain all necessary information to answer the question. A human should be able to answer the question based on the caption without seeing the image.\n"
            "2. But do not repeat the question in your response.\n"
            "3. DO NOT generate the answer to the question, you can only provide a caption describing the image.\n"
            "4. If there are multiple images, please generate a separate caption for each image."
        )
        msgs.append({'type': 'text', 'value': caption_prompt})
        return msgs
    

class MathVerseCaption(ImageBaseDataset):
    """Caption-generation prompts for the MathVerse mini TSV listed in ``DATASET_URL``."""

    TYPE = 'Caption'
    DATASET_URL = {
        'MathVerse_MINI_Caption':
        'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIV.tsv',  # noqa
    }

    def build_prompt(self, line):
        """Return image part(s), the trimmed question text, then the captioning instruction.

        Args:
            line: a data record, or an integer position into ``self.data``.
        """
        record = self.data.iloc[line] if isinstance(line, int) else line
        tgt_path = toliststr(record['image_path']) if self.meta_only else self.dump_image(record)

        # Dataset names containing 'cot' read the chain-of-thought query column.
        question = record['query_cot'] if 'cot' in self.dataset_name else record['question']

        image_paths = tgt_path if isinstance(tgt_path, list) else [tgt_path]
        msgs = [{'type': 'image', 'value': p} for p in image_paths]
        msgs.append({'type': 'text', 'value': question})

        for part in msgs:
            if part['type'] != 'text':
                continue
            text = part['value']
            if 'Question:' in text:
                # Keep only what follows the last "Question:" marker.
                text = 'Question: ' + text.split('Question:')[-1].strip()
            if 'Choices:' in text:
                # Cut the answer choices off the end.
                text = text.split('Choices:')[0].strip()
            part['value'] = text

        caption_prompt = (
            "Given the above image and a question, generate a caption for the image."
            "You need to pay attention to the following points: \n"
            "1. The caption MUST contain all necessary information to answer the question. A human should be able to answer the question based on the caption without seeing the image.\n"
            "2. But do not repeat the question in your response.\n"
            "3. DO NOT generate the answer to the question, you can only provide a caption describing the image.\n"
            "4. If there are multiple images, please generate a separate caption for each image."
        )
        msgs.append({'type': 'text', 'value': caption_prompt})

        return msgs

    @classmethod
    def evaluate(cls, eval_file, **judge_kwargs):
        """No-op: this class performs no evaluation and returns ``None``."""
        return None

class OlympiadBenchCaption(ImageBaseDataset):
    """Caption-generation prompts for the OlympiadBench TSV listed in ``DATASET_URL``."""

    TYPE = 'Caption'
    DATASET_URL = {
        'OlympiadBenchCaption':
        'https://opencompass.openxlab.space/utils/VLMEval/OlympiadBench.tsv',
    }

    def dump_image(self, line):
        """Write the record's base64 image(s) under ``self.img_root`` and return their paths.

        A list of images is saved as ``{index}--{k}.jpg`` with ``k`` starting at 1;
        a single image is saved as ``{index}.jpg``. Files for which ``read_ok``
        already succeeds are not rewritten. Always returns a list.
        """
        os.makedirs(self.img_root, exist_ok=True)

        encoded = line['image']
        if isinstance(encoded, list):
            targets = [
                (osp.join(self.img_root, f"{line['index']}--{pos}.jpg"), payload)
                for pos, payload in enumerate(encoded, start=1)
            ]
        else:
            targets = [(osp.join(self.img_root, f"{line['index']}.jpg"), encoded)]

        saved = []
        for path, payload in targets:
            if not read_ok(path):
                decode_base64_to_image_file(payload, path)
            saved.append(path)
        return saved

    def build_prompt(self, line):
        """Return the task text, the image part(s), then the captioning instruction.

        The language, subject and problem kind are derived from substrings of
        ``line['source']`` and also stored on the instance as ``is_chinese``,
        ``is_math`` and ``is_theorem_proving``.
        """
        from .utils.olympiadbench import get_answer_type_text, make_input

        source = line['source']
        self.is_chinese = 'zh' in source
        self.is_math = 'maths' in source
        self.is_theorem_proving = 'TP' in source

        if self.is_chinese:
            subject_content = '数学' if self.is_math else '物理'
            if self.is_theorem_proving:
                prompt = (
                    f"Question: 以下是中国{subject_content}竞赛中的证明题。请根据题目的要求，运用逻辑推理及常用定理证明题目中的命题。"
                    "证明过程中使用的变量和公式请使用LaTeX格式表示。")
            else:
                answer_type_text = get_answer_type_text(
                    line['answer_type'],
                    is_chinese=True,
                    multiple_answer=line['is_multiple_answer'])
                multiple_answer_text = (
                    '\\boxed{用英文逗号连接的多个答案}'
                    if line['is_multiple_answer'] else '\\boxed{答案}'
                )
                unit_text = ''
                if line['unit']:
                    # The unit goes after the box, never inside it.
                    multiple_answer_text += '(单位)'
                    unit_text = '，注意答案的单位不要放在\\boxed{}中'
                prompt = (
                    f'Question: 以下是中国{subject_content}竞赛中的解答题{answer_type_text}。请根据题目的要求和所提供的信息计算得出答案。'
                    f'解答过程和结果中使用的变量和公式请使用LaTeX格式表示。请在最后以“所以最终答案是{multiple_answer_text}。”'
                    f'显式给出结果{unit_text}。')
        else:
            subject_content = 'Math' if self.is_math else 'Physics'
            if self.is_theorem_proving:
                prompt = (
                    f'Question: The following is a theorem proving problem from an International {subject_content} competition. '
                    'Please use logical reasoning and common theorems to prove the proposition in the problem '
                    'according to the given requirements. '
                    'Please use LaTeX format to represent the variables and formulas used in the proof.'
                )
            else:
                multiple_answer_text = (
                    '\\boxed{multiple answers connected with commas}'
                    if line['is_multiple_answer'] else '\\boxed{answer}'
                )
                unit_text = ''
                if line['unit']:
                    # The unit goes after the box, never inside it.
                    multiple_answer_text += '(unit)'
                    unit_text = ', note that the unit of the answer should not be included in \\boxed{}'
                answer_type_text = get_answer_type_text(
                    line['answer_type'],
                    is_chinese=False,
                    multiple_answer=line['is_multiple_answer'])
                prompt = (
                    f'Question: The following is an open-ended problem from an International {subject_content} competition. '
                    f'{answer_type_text}Please calculate the answer according to the given requirements and '
                    'the information provided. Please use LaTeX format to represent the variables and formulas '
                    'used in the solution process and results. Please end your solution with "So the final answer '
                    f'is {multiple_answer_text}." and give the result explicitly{unit_text}.'
                )

        # Non-math records prepend their context when one is present
        # (a missing context shows up as the string 'nan').
        if self.is_math:
            problem = line['question']
        elif 'context' in line.keys() and str(line['context']) != 'nan':
            problem = line['context'] + '\n' + line['question']
        else:
            problem = line['question']
        task_text = make_input(prompt, problem)

        ret = [{'type': 'text', 'value': task_text}]
        ret.extend({'type': 'image', 'value': path} for path in self.dump_image(line))

        caption_prompt = (
            "Given the above image and a question, generate a caption for the image."
            "You need to pay attention to the following points: \n"
            "1. The caption MUST contain all necessary information to answer the question. A human should be able to answer the question based on the caption without seeing the image.\n"
            "2. But do not repeat the question in your response.\n"
            "3. DO NOT generate the answer to the question, you can only provide a caption describing the image.\n"
            "4. If there are multiple images, please generate a separate caption for each image."
        )
        ret.append({'type': 'text', 'value': caption_prompt})
        return ret

    @classmethod
    def evaluate(cls, eval_file, **judge_kwargs):
        """No-op: this class performs no evaluation and returns ``None``."""
        return None
    

class LogicVistaCaption(ImageBaseDataset):
    """Caption-generation prompts for the LogicVista TSV listed in ``DATASET_URL``."""

    TYPE = 'Caption'
    DATASET_URL = {
        'LogicVistaCaption':
        'https://opencompass.openxlab.space/utils/VLMEval/LogicVista.tsv'
    }

    def evaluate(self, eval_file, **judge_kwargs):
        """No-op: this class performs no evaluation and returns ``None``."""
        return None

    def build_prompt(self, line):
        """Return the base class prompt with the captioning instruction appended."""
        caption_prompt = (
            "Given the above image and a question, generate a caption for the image."
            "You need to pay attention to the following points: \n"
            "1. The caption MUST contain all necessary information to answer the question. A human should be able to answer the question based on the caption without seeing the image.\n"
            "2. But do not repeat the question in your response.\n"
            "3. DO NOT generate the answer to the question, you can only provide a caption describing the image.\n"
            "4. If there are multiple images, please generate a separate caption for each image."
        )
        msgs = super().build_prompt(line)
        msgs.append({'type': 'text', 'value': caption_prompt})
        return msgs

class Spatial457Caption(ImageBaseDataset):
    """Caption-generation prompts for the Spatial457 TSV listed in ``DATASET_URL``.

    The captioning instruction is placed first, followed by the image part(s)
    and the question text.
    """

    TYPE = "Caption"
    # When ROBUST is True, if the model does not follow the format, all of the response will be treated as answers.
    ROBUST = True

    DATASET_URL = {
        "Spatial457Caption": "http://opencompass.openxlab.space/utils/VLMEval/Spatial457.tsv",
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.dataset_utils = Spatial457_utils()

    def evaluate(self, eval_file, **judge_kwargs):
        """No-op: this class performs no evaluation and returns ``None``."""
        pass

    def build_prompt(self, line):
        """Build the multi-modal message list for one record.

        Args:
            line: a data record, or an integer position into ``self.data``.

        Returns:
            list of message parts: instruction text, image part(s), question text.
        """
        if isinstance(line, int):
            line = self.data.iloc[line]

        if self.meta_only:
            tgt_path = toliststr(line['image_path'])
        else:
            tgt_path = self.dump_image(line)

        question = "Question: " + line['question']

        msgs = []
        if isinstance(tgt_path, list):
            msgs.extend([dict(type='image', value=p) for p in tgt_path])
        else:
            msgs = [dict(type='image', value=tgt_path)]
        msgs.append(dict(type='text', value=question))

        # The record's 'category' column is not needed to build the prompt, so
        # it is no longer read here (the previous unused lookup made the column mandatory).
        caption_prompt = (
            "Given a image and a question, generate a caption for the image."
            "You need to pay attention to the following points: \n"
            "1. The caption MUST contain all necessary information to answer the question. A human should be able to answer the question based on the caption without seeing the image.\n"
            "2. But do not repeat the question in your response.\n"
            "3. DO NOT generate the answer to the question, you can only provide a caption describing the image.\n"
            "4. If there are multiple images, please generate a separate caption for each image. \n"
        )

        # Unlike an appended instruction, this one leads the message list.
        msgs.insert(0, {"type": "text", "value": caption_prompt})

        return msgs


class MMERealWorldCaption(ImageBaseDataset):
    """Caption-generation prompts for the MME-RealWorld datasets.

    ``load_data`` either downloads the prepared Lite TSV or builds a TSV from
    the JSON files of a Hugging Face dataset snapshot; ``build_prompt`` puts
    the captioning instruction first, then the image part(s), then the
    question with its multiple-choice options.
    """

    TYPE = 'Caption'

    DATASET_MD5 = {
        'MME-RealWorld': '271c33ec814c39533c467ec6fb8a6f36',
        'MME-RealWorld-Lite': '4c17057d7d3b6c4a0d4397c3dae0881c',
        'MME-RealWorld-CN': 'daaa763d52a760a38606d5dedb3fe444',
    }

    @classmethod
    def supported_datasets(cls):
        """Return the dataset names this class serves."""
        return ['MMERealWorldCaption']

    def load_data(
        self, dataset="MME-RealWorld", repo_id="yifanzhang114/MME-RealWorld-Base64"
    ):
        """Load the dataset table, downloading and converting it when needed.

        Args:
            dataset: dataset name; the ``MMERealWorldCaption`` prefix is mapped
                back to ``MME-RealWorld`` before use.
            repo_id: Hugging Face dataset repository holding the raw data.

        Returns:
            The loaded table: the result of ``prepare_tsv`` with option columns
            filled in for the Lite variant, otherwise the result of ``load`` on
            the (possibly localized) TSV.
        """
        dataset = dataset.replace("MMERealWorldCaption", "MME-RealWorld")

        def check_integrity(pth):
            # Integrity here only means the TSV exists; the MD5 check is disabled.
            data_file = osp.join(pth, f"{dataset}.tsv")

            if not os.path.exists(data_file):
                print(f"{data_file} does not exist.")
                return False

            # if md5(data_file) != self.DATASET_MD5[dataset]:
            #     return False
            return True

        def generate_tsv(pth):
            # Flatten every JSON file under <pth>/<dataset> into one TSV, unless it already exists.
            tsv_file = os.path.join(pth, f"{dataset}.tsv")

            if os.path.exists(tsv_file):
                print(f"{tsv_file} already exists.")
                return

            json_dir = os.path.join(pth, dataset)
            json_files = [f for f in os.listdir(json_dir) if f.endswith(".json")]

            # The header depends only on the dataset name, so compute it once.
            choice_prompt = (
                "The choices are listed below:\n"
                if dataset in ["MME-RealWorld", "MME-RealWorld-Lite"]
                else "选项如下所示:\n"
            )

            data_list = []
            for json_file in json_files:
                # JSON is UTF-8 by specification; do not depend on the platform default encoding.
                with open(os.path.join(json_dir, json_file), "r", encoding="utf-8") as f:
                    data = json.load(f)
                    for item in tqdm(data):
                        data_list.append(
                            {
                                "index": item["index"],
                                "image": item["image"],
                                "question": item["question"],
                                "multi-choice options": choice_prompt
                                + "\n".join(item["multi-choice options"]),
                                # [4:] drops each option's 4-character prefix.
                                # NOTE(review): presumably of the form "(A) " - confirm.
                                "A": item["multi-choice options"][0][4:],
                                "B": item["multi-choice options"][1][4:],
                                "C": item["multi-choice options"][2][4:],
                                "D": item["multi-choice options"][3][4:],
                                "E": item["multi-choice options"][4][4:],
                                "answer": item["answer"],
                                "category": item["category"],
                                "l2-category": item["l2-category"],
                            }
                        )
            df = pd.DataFrame(data_list)
            df.to_csv(tsv_file, sep="\t", index=False)
            print(f"TSV file saved to {tsv_file}")

        # The Lite variant ships as a ready-made TSV and skips the snapshot path entirely.
        if dataset == "MME-RealWorld-Lite":
            url = 'https://huggingface.co/datasets/yifanzhang114/MME-RealWorld-Base64/resolve/main/mme_realworld_lite.tsv'  # noqa: E501
            file_md5 = (
                self.DATASET_MD5[dataset] if dataset in self.DATASET_MD5 else None
            )
            datas = self.prepare_tsv(url, file_md5)
            choice_prompt = "The choices are listed below:\n"
            for index, item in datas.iterrows():
                # NOTE(security): eval() runs on text from a downloaded file. If the
                # column is always a plain list literal, ast.literal_eval would be a
                # safer drop-in - confirm the data format before switching.
                options = eval(item["multi-choice options"])
                datas.loc[index, "multi-choice options"] = choice_prompt + "\n".join(
                    options
                )
                datas.loc[index, "A"] = options[0][4:]
                datas.loc[index, "B"] = options[1][4:]
                datas.loc[index, "C"] = options[2][4:]
                datas.loc[index, "D"] = options[3][4:]
                datas.loc[index, "E"] = options[4][4:]
            return datas

        update_flag = False
        cache_path = get_cache_path(repo_id)
        if cache_path is not None and check_integrity(cache_path):
            dataset_path = cache_path
            print(f"Using cached dataset from {cache_path}")
        else:
            from huggingface_hub import snapshot_download

            # Download or find the dataset path
            dataset_path = snapshot_download(repo_id=repo_id, repo_type="dataset")
            generate_tsv(dataset_path)
            update_flag = True

        data_path = os.path.join(dataset_path, f"{dataset}.tsv")
        if file_size(data_path, "GB") > 1:
            # TSVs above 1 GB are replaced by a "_local" copy produced by LOCALIZE,
            # regenerated when missing, forced via FORCE_LOCAL, or after a fresh download.
            local_path = data_path.replace(".tsv", "_local.tsv")
            if (
                not osp.exists(local_path)
                or os.environ.get("FORCE_LOCAL", None)
                or update_flag
            ):
                from vlmeval.tools import LOCALIZE

                LOCALIZE(data_path, local_path)
            data_path = local_path
        return load(data_path)

    def post_build(self, dataset):
        """Override the instance's ``TYPE`` with ``'MMERealWorld'``."""
        self.TYPE = 'MMERealWorld'

    # Given one data record, return the built prompt (a multi-modal message), can override
    def build_prompt(self, line):
        """Build the multi-modal message list for one record.

        Args:
            line: a data record, or an integer position into ``self.data``.

        Returns:
            list of message parts: instruction text, image part(s), then the
            question followed by its multiple-choice options.
        """
        if isinstance(line, int):
            line = self.data.iloc[line]

        if self.meta_only:
            tgt_path = toliststr(line['image_path'])
        else:
            tgt_path = self.dump_image(line)

        # Always bind `question`; previously it was only assigned when the
        # marker was missing, so questions that already contained "Question:"
        # raised UnboundLocalError below.
        question = line['question']
        if 'Question:' not in question:
            question = 'Question: ' + question

        choice_prompt = line['multi-choice options'] + '\n'

        question += ' ' + choice_prompt

        msgs = []
        if isinstance(tgt_path, list):
            msgs.extend([dict(type='image', value=p) for p in tgt_path])
        else:
            msgs = [dict(type='image', value=tgt_path)]
        msgs.append(dict(type='text', value=question))

        caption_prompt = (
            "Given a image and a question, generate a caption for the image."
            "You need to pay attention to the following points: \n"
            "1. The caption MUST contain all necessary information to answer the question. A human should be able to answer the question based on the caption without seeing the image.\n"
            "2. But do not repeat the question in your response.\n"
            "3. DO NOT generate the answer to the question, you can only provide a caption describing the image.\n"
            "4. If there are multiple images, please generate a separate caption for each image. \n"
        )

        msgs.insert(0, {"type": "text", "value": caption_prompt})

        return msgs

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """No-op: this class performs no evaluation and returns ``None``."""
        pass
    

class ImageYORNCaptionDataset(ImageBaseDataset):
    """Caption-prompt dataset for the benchmarks listed in ``DATASET_URL``.

    The model receives each record's question together with instructions
    asking for a caption of the image rather than an answer.
    """

    TYPE = 'Caption'

    DATASET_URL = {
        'MMECaption': 'https://opencompass.openxlab.space/utils/VLMEval/MME.tsv',
        'HallusionBenchCaption': 'https://opencompass.openxlab.space/utils/VLMEval/HallusionBench.tsv',
        'POPECaption': 'https://opencompass.openxlab.space/utils/VLMEval/POPE.tsv',
        'AMBERCaption': 'https://huggingface.co/datasets/yifanzhang114/AMBER_base64/resolve/main/AMBER.tsv',
    }

    def build_prompt(self, line):
        """Build the captioning prompt for one record.

        Args:
            line: A data record, or an integer position resolved through
                ``self.data.iloc``.

        Returns:
            A list of message dicts: the captioning instructions (text),
            one ``image`` entry per image path, then the question (text).
        """
        record = self.data.iloc[line] if isinstance(line, int) else line

        if self.meta_only:
            image_paths = toliststr(record['image_path'])
        else:
            image_paths = self.dump_image(record)
        # A single non-list path is treated as a one-element list.
        if not isinstance(image_paths, list):
            image_paths = [image_paths]

        question_text = 'Question: ' + record['question']

        instructions = (
            "Given a image and a question, generate a caption for the image."
            "You need to pay attention to the following points: \n"
            "1. The caption MUST contain all necessary information to answer the question. A human should be able to answer the question based on the caption without seeing the image.\n"
            "2. But do not repeat the question in your response.\n"
            "3. DO NOT generate the answer to the question, you can only provide a caption describing the image.\n"
            "4. If there are multiple images, please generate a separate caption for each image. \n"
        )

        # Order: instructions, images, question.
        messages = [{"type": "text", "value": instructions}]
        messages.extend(dict(type='image', value=path) for path in image_paths)
        messages.append(dict(type='text', value=question_text))
        return messages

    def evaluate(self, eval_file, **judge_kwargs):
        """Placeholder: no evaluation is implemented; returns None."""
        return None

class VLAAThinkingCaptionDataset(ImageBaseDataset):
    """Caption-prompt dataset backed by the local TSV files in ``DATASET_URL``."""

    TYPE = 'Caption'
    IMAGE_BASE_DIR = '/home/ec2-user/workspace/dataset/VLM/vlaa_thinking/images'

    DATASET_URL = {
        'VLAAThinkingCaption_RL': '/home/ec2-user/workspace/dataset/VLM/vlaa_thinking/VLAA-Thinking-GRPO-25K.tsv',
        'VLAAThinkingCaption_SFT': '/home/ec2-user/workspace/dataset/VLM/vlaa_thinking/VLAA-Thinking-SFT-126K.tsv',
    }

    def post_build(self, dataset):
        """Join ``IMAGE_BASE_DIR`` in front of every string ``image_path`` entry.

        Does nothing when ``self.data`` has no ``image_path`` column.

        Args:
            dataset: Value handed to the hook; it is not used here.
        """
        if 'image_path' not in self.data.columns:
            return

        print(f"Appending {self.IMAGE_BASE_DIR} to image_path in VLAAThinkingCaption dataset.")
        base_dir = self.IMAGE_BASE_DIR

        def _with_base_dir(path):
            # Non-string cells are left exactly as they are.
            return osp.join(base_dir, path) if isinstance(path, str) else path

        self.data['image_path'] = self.data['image_path'].apply(_with_base_dir)

    def build_prompt(self, line):
        """Build the captioning prompt for one record.

        Args:
            line: A data record, or an integer position resolved through
                ``self.data.iloc``.

        Returns:
            A list of message dicts: the captioning instructions (text),
            one ``image`` entry per image path, then the question with a
            trailing ``Caption for the image:`` cue (text).
        """
        record = self.data.iloc[line] if isinstance(line, int) else line

        if self.meta_only:
            image_paths = toliststr(record['image_path'])
        else:
            image_paths = self.dump_image(record)
        # A single non-list path is treated as a one-element list.
        if not isinstance(image_paths, list):
            image_paths = [image_paths]

        question_text = 'Question: ' + record['question'] + '\n' + "Caption for the image:"

        instructions = (
            "Given a image and a question, generate a caption for the image."
            "You need to pay attention to the following points: \n"
            "1. The caption MUST contain all necessary information to answer the question. A blind human should be able to answer the question based on the caption without seeing the image.\n"
            "2. Do not repeat the question in your response.\n"
            "3. DO not generate the answer to the question, you can only provide a caption describing the image.\n"
            "4. If there are multiple images, please generate a separate caption for each image. \n"
        )

        # Order: instructions, images, question.
        messages = [{"type": "text", "value": instructions}]
        messages.extend(dict(type='image', value=path) for path in image_paths)
        messages.append(dict(type='text', value=question_text))
        return messages

    def evaluate(self, eval_file, **judge_kwargs):
        """Placeholder: no evaluation is implemented; returns None."""
        return None

class ImageCaptionDataset(ImageBaseDataset):
    """Image captioning dataset scored with ``COCO_Caption_Scorer``."""

    TYPE = 'Caption'

    DATASET_URL = {
        'COCO_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/COCO_VAL.tsv',
    }

    DATASET_MD5 = {
        'COCO_VAL': '72a5079dead060269ac222c5aa5128af',
    }

    def load_data(self, dataset):
        """Load the dataset, adding a default captioning question if absent.

        Args:
            dataset: Dataset name forwarded to the parent ``load_data``.

        Returns:
            The loaded data, guaranteed to contain a ``question`` entry.
        """
        data = super().load_data(dataset)
        if 'question' not in data:
            default_question = (
                'Please describe this image in general. Directly provide the description, '
                'do not include prefix like "This image depicts". '
            )
            data['question'] = [default_question] * len(data)
        return data

    @classmethod
    def evaluate(self, eval_file, **kwargs):
        """Score the predictions in ``eval_file`` and save the result as JSON.

        Args:
            eval_file: Path of the prediction file; it is read with ``load``
                and must provide ``prediction`` and ``answer`` columns.
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            The score dictionary produced by
            ``COCO_Caption_Scorer.compute_scores``. The same dictionary is
            written next to ``eval_file`` as ``<name>_score.json``.
        """
        data = load(eval_file)
        ref, gt = {}, {}
        for i, (_, line) in enumerate(data.iterrows()):
            ref[str(i)] = [str(line['prediction'])]
            # NOTE(security): eval() executes whatever expression is stored in
            # the 'answer' column, so only run this on trusted files. If the
            # column always holds a plain Python literal (presumably a list of
            # reference captions; TODO confirm), ast.literal_eval would be a
            # safe drop-in replacement.
            gt[str(i)] = eval(line['answer'])

        scorer = COCO_Caption_Scorer(ref, gt)
        coco_caption_score_dict = scorer.compute_scores()
        # Derive the output path from the file stem. The previous
        # `.replace('.xlsx', ...)` left the path unchanged for any other
        # extension, so the scores overwrote the prediction file itself.
        score_pth = osp.splitext(eval_file)[0] + '_score.json'
        dump(coco_caption_score_dict, score_pth)
        return coco_caption_score_dict
