import json
from datasets import load_dataset
from transformers import AutoTokenizer
import random
random.seed(0)

# Common root that every dataset/model location below is built from.
path_prefix = ""

# Input datasets; the processing code indexes this list positionally
# (index 0 -> GSM file, index 1 -> Math file).
data_paths = [
    f"{path_prefix}/data/GSMtrain_r1_06.json",
    f"{path_prefix}/data/Mathtrain_r1_06.json",
]

# Location reserved for the merged training set.
output_path = f"{path_prefix}/data/s1r1-mixed-budget-06.json"

# Checkpoint directory whose tokenizer is used to count tokens.
modelname = f"{path_prefix}/DeepSeek-R1-Distill-Qwen-1___5B"

# Loaded once at import time and shared by the conversion helpers.
tokenizer = AutoTokenizer.from_pretrained(modelname)


def transferdata(d):
    """Turn a record graded 'Yes' into an instruction-tuning example.

    The record is modified in place: ``instruction``, ``input`` and
    ``output`` keys are written onto ``d``.  The budget placed in the
    output is the number of tokens the module-level ``tokenizer`` yields
    for ``d['deepseek_thinking_trajectory']`` (no special tokens).

    Returns:
        ``(d, token_count)`` for accepted records, or ``None`` when
        ``d['deepseek_grade']`` is not ``'Yes'`` or the trajectory is
        longer than 16000 tokens.
    """
    if d['deepseek_grade'] != 'Yes':
        return None

    prompt_prefix = (
        'Answer the given question. '
        'You should first estimate the total number of tokens you will need to answer this question based on its difficulty. '
        'Then you think about the reasoning process in the mind and provide the user with the answer. '
        'The token budget and whole solution are enclosed within <budget> </budget> and <solution> </solution> tags, respectively, i.e., '
        '<budget> token budget here, just an integer </budget><solution> solution here, please output the final answer within \\boxed{} </solution>.'
        '\n\nQuestion: '
    )
    d['instruction'] = prompt_prefix + d['question']
    d['input'] = ''

    trajectory = d['deepseek_thinking_trajectory']
    token_ids = tokenizer.encode(trajectory, add_special_tokens=False)
    n_tokens = len(token_ids)
    # Over-long trajectories are dropped; note the prompt keys above have
    # already been written to ``d`` at this point.
    if n_tokens > 16000:
        return None

    d['output'] = f"<budget>{n_tokens}</budget><solution>{d['deepseek_thinking_trajectory']}</solution>"
    return d, n_tokens

def transferdata_gsm(d):
    """Turn a GSM record with a truthy ``is_cor`` into a training example.

    The record is modified in place: ``instruction`` (fixed prompt plus
    ``d['question']``), ``input`` (empty string) and ``output`` (budget
    and solution tags built from ``d['token_count']`` and
    ``d['model_response']``) are written onto ``d``.

    Returns:
        ``(d, d['token_count'])`` when ``d['is_cor']`` is truthy,
        otherwise ``None``.
    """
    # presumably ``is_cor`` flags a correct model answer -- confirm upstream
    if not d['is_cor']:
        return None

    prompt_prefix = (
        'Answer the given question. '
        'You should first estimate the total number of tokens you will need to answer this question based on its difficulty. '
        'Then you think about the reasoning process in the mind and provide the user with the answer. '
        'The token budget and whole solution are enclosed within <budget> </budget> and <solution> </solution> tags, respectively, i.e., '
        '<budget> token budget here, just an integer </budget><solution> solution here, please output the final answer within \\boxed{} </solution>.'
        '\n\nQuestion: '
    )
    d['instruction'] = prompt_prefix + d['question']
    d['input'] = ''
    d['output'] = f"<budget>{d['token_count']}</budget><solution>{d['model_response']}</solution>"
    return d, d['token_count']

def transferdata_math(d):
    """Turn a Math record with a truthy ``is_cor`` into a training example.

    Same conversion as the GSM variant except that the question text is
    read from ``d['problem']``.  The record is modified in place:
    ``instruction``, ``input`` (empty string) and ``output`` (budget and
    solution tags built from ``d['token_count']`` and
    ``d['model_response']``) are written onto ``d``.

    Returns:
        ``(d, d['token_count'])`` when ``d['is_cor']`` is truthy,
        otherwise ``None``.
    """
    # presumably ``is_cor`` flags a correct model answer -- confirm upstream
    if not d['is_cor']:
        return None

    prompt_prefix = (
        'Answer the given question. '
        'You should first estimate the total number of tokens you will need to answer this question based on its difficulty. '
        'Then you think about the reasoning process in the mind and provide the user with the answer. '
        'The token budget and whole solution are enclosed within <budget> </budget> and <solution> </solution> tags, respectively, i.e., '
        '<budget> token budget here, just an integer </budget><solution> solution here, please output the final answer within \\boxed{} </solution>.'
        '\n\nQuestion: '
    )
    d['instruction'] = prompt_prefix + d['problem']
    d['input'] = ''
    d['output'] = f"<budget>{d['token_count']}</budget><solution>{d['model_response']}</solution>"
    return d, d['token_count']

def _safe_mean(total, count):
    """Return ``total / count``, or NaN when ``count`` is zero."""
    return total / count if count else float('nan')


# Accumulated training examples from every dataset, in load order.
train_data = []
# Per-dataset token budgets: index 0 = GSM, index 1 = Math.  The third
# slot is not filled by the code below.
tokencnt = [[], [], []]

# --- GSM ---------------------------------------------------------------
# A context manager closes the file handle deterministically; JSON text is
# UTF-8, so do not depend on the platform default encoding.
with open(data_paths[0], 'r', encoding='utf-8') as f:
    data = json.load(f)
for d in data:
    # Convert each record once; the converter returns None for rejected
    # records and a (record, token_count) pair otherwise.
    converted = transferdata_gsm(d)
    if converted is None:
        continue
    d1, tokencnt1 = converted
    train_data.append(d1)
    tokencnt[0].append(tokencnt1)
print(len(train_data))

# --- Math --------------------------------------------------------------
with open(data_paths[1], 'r', encoding='utf-8') as f:
    data = json.load(f)
for d in data:
    converted = transferdata_math(d)
    if converted is None:
        continue
    d1, tokencnt1 = converted
    train_data.append(d1)
    tokencnt[1].append(tokencnt1)

# NOTE(review): the running total is printed twice here; presumably a
# leftover from a removed third dataset. Kept so the log output is unchanged.
print(len(train_data))
print(len(train_data))

# Mean token budget per dataset and overall; an empty dataset reports NaN
# instead of raising ZeroDivisionError.
print('GSM8K: {}'.format(_safe_mean(sum(tokencnt[0]), len(tokencnt[0]))))
print('Math: {}'.format(_safe_mean(sum(tokencnt[1]), len(tokencnt[1]))))
print('Total: {}'.format(_safe_mean(sum(sum(i) for i in tokencnt), len(train_data))))
