# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocess the DAPO-Math-17k dataset into a token-budget prompt format and save it as parquet
"""

import os
import datasets

from verl.utils.hdfs_io import copy, makedirs
import argparse

from verl.utils.reward_score.math import remove_boxed, last_boxed_only_string


def extract_solution(solution_str):
    """Extract the answer from ``solution_str`` using the verl math helpers.

    The string is first passed through ``last_boxed_only_string`` and the
    result is then handed to ``remove_boxed``; whatever ``remove_boxed``
    returns is returned unchanged.
    """
    # NOTE(review): presumably this yields the contents of the last boxed
    # expression in the solution; confirm against verl.utils.reward_score.math,
    # including what happens when no boxed expression is present.
    boxed_fragment = last_boxed_only_string(solution_str)
    return remove_boxed(boxed_fragment)


# Value written to the ``data_source`` field of every output row.
DATA_SOURCE = 'DAPO-Math-17k-HF_corMatch'

# Default location of the raw parquet file that is read as input.
DEFAULT_INPUT_FILE = '~/datasets/DAPO-Math-17k-HF/data/dapo-math-17k.parquet'

# Instruction prepended to every question; asks the model to emit a token
# budget followed by the solution.
INSTRUCTION_FOLLOWING = 'Answer the given question. You should first estimate the total number of tokens you will need to answer this question based on its difficulty. Then you think about the reasoning process in the mind and provide the user with the answer. The token budget and whole solution are enclosed within <budget> </budget> and <solution> </solution> tags, respectively, i.e., <budget> token budget here, just an integer </budget><solution> solution here, please output the final answer within \\boxed{} </solution>.\n\nQuestion: '

# Boilerplate removed from each raw prompt before the new instruction is
# prepended.
QUESTION_PREFIX = 'Solve the following math problem step by step. The last line of your response should be of the form Answer: $Answer (without quotes) where $Answer is the answer to the problem.\n\n'
QUESTION_SUFFIX = '\n\nRemember to put your answer on its own line after "Answer:".'


def first_occurrence_indices(keys):
    """Return the positions at which each distinct key first appears.

    Args:
        keys: Iterable of hashable values.

    Returns:
        A list of integer positions, in ascending order, one per distinct
        key. An empty iterable yields an empty list.
    """
    seen = set()
    kept = []
    for position, key in enumerate(keys):
        if key not in seen:
            seen.add(key)
            kept.append(position)
    return kept


def make_map_fn(split):
    """Build the per-example conversion function for ``Dataset.map``.

    Args:
        split: Name stored under ``extra_info['split']`` in every row.

    Returns:
        A function ``process_fn(example, idx)`` that returns the converted row.
    """

    def process_fn(example, idx):
        """Convert one raw example into the output row format.

        ``example`` must provide ``prompt`` (a list whose first item has a
        ``content`` string) and ``reward_model`` (a mapping with a
        ``ground_truth`` entry). Both keys are popped from ``example``.
        """
        question_raw = example.pop('prompt')[0]['content']
        # Strip the original answer-format boilerplate, then prepend the
        # budget-style instruction.
        stripped = question_raw.replace(QUESTION_PREFIX, '').replace(QUESTION_SUFFIX, '')
        question = INSTRUCTION_FOLLOWING + stripped

        answer = example.pop('reward_model')['ground_truth']
        return {
            "data_source": DATA_SOURCE,
            "prompt": [{
                "role": "user",
                "content": question
            }],
            "ability": "math",
            "reward_model": {
                "style": "rule",
                "ground_truth": answer
            },
            "extra_info": {
                'split': split,
                'index': idx,
                'answer': answer,
                # The untouched original prompt text is kept for reference.
                "question": question_raw,
            }
        }

    return process_fn


def main():
    """Load the raw parquet file, deduplicate, convert, and write the result.

    The converted data is written to ``<local_dir>/train_budget.parquet`` and,
    when ``--hdfs_dir`` is given, ``local_dir`` is copied to that location.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--local_dir', default='~/datasets/DAPO-Math-17k-HF/data')
    parser.add_argument('--hdfs_dir', default=None)
    parser.add_argument('--input_file', default=DEFAULT_INPUT_FILE,
                        help='Path to the raw DAPO-Math-17k parquet file.')

    args = parser.parse_args()

    # Expand "~" explicitly so every consumer of these paths (os.path.join,
    # the parquet writer, the HDFS copy) sees a real filesystem path.
    local_dir = os.path.expanduser(args.local_dir)
    input_file = os.path.expanduser(args.input_file)
    hdfs_dir = args.hdfs_dir

    print(f"Loading the {DATA_SOURCE} dataset from {input_file}...", flush=True)
    train_dataset = datasets.load_dataset('parquet', data_files=input_file, split='train')

    # Keep only the first row for each distinct raw question text. The same
    # text is stored unchanged in extra_info['question'] by the map step, so
    # one deduplication pass before mapping is sufficient.
    unique_indices = first_occurrence_indices(
        example['prompt'][0]['content'] for example in train_dataset)
    train_dataset = train_dataset.select(unique_indices)

    # with_indices=True supplies the row position used as extra_info['index'].
    train_dataset = train_dataset.map(function=make_map_fn('train'), with_indices=True)

    # Make sure the output directory exists before writing into it.
    os.makedirs(local_dir, exist_ok=True)
    train_dataset.to_parquet(os.path.join(local_dir, 'train_budget.parquet'))

    if hdfs_dir is not None:
        makedirs(hdfs_dir)

        copy(src=local_dir, dst=hdfs_dir)


if __name__ == '__main__':
    main()
