from datasets import load_dataset, Dataset
from ast import literal_eval
from transformers import AutoTokenizer



# Fetch the "Gen-Verse/MBPP-ReasonFlux" dataset by its identifier. No split is
# requested here, so `mbpp` holds whatever load_dataset returns by default
# (presumably a DatasetDict covering every split -- TODO confirm).
mbpp = load_dataset("Gen-Verse/MBPP-ReasonFlux")

def get_tests(row):
    """Attach a ``tests`` list pairing each test input with its output.

    Walks ``row['test_input']`` and ``row['test_output']`` in lockstep and
    records every pair as ``{'input': ..., 'output': ...}``. Pairing stops at
    the shorter of the two sequences.

    Args:
        row: Mapping with ``test_input`` and ``test_output`` entries; it is
            modified in place.

    Returns:
        The same ``row`` object, now carrying a ``tests`` entry.
    """
    paired_cases = []
    for given, expected in zip(row['test_input'], row['test_output']):
        paired_cases.append({'input': given, 'output': expected})

    row['tests'] = paired_cases
    return row

# Run get_tests over every row so each example gains a 'tests' entry built
# from its 'test_input' / 'test_output' values.
mbpp = mbpp.map(get_tests)

def add_prompt(row):
    """Render the row's task as a chat-formatted prompt string.

    Builds a two-turn conversation (a fixed system instruction followed by
    the row's ``prompt`` as the user turn) and passes it through the
    module-level ``tokenizer``'s chat template, requesting untokenized text
    with the generation prompt appended. The rendered text is stored under
    ``row['instruct_prompt']``.

    Args:
        row: Mapping with a ``prompt`` entry; it is modified in place.

    Returns:
        The same ``row`` object, now carrying an ``instruct_prompt`` entry.
    """
    system_turn = {
        "role": "system",
        "content": "You are a coding assistant.  Your task is to output ONLY valid code for the given task.  Do not include explanations, comments, markdown formatting (```), or natural language.  Output exactly and only the code required. Use input() to input and print() to output.",
    }
    user_turn = {"role": "user", "content": row['prompt']}

    # tokenize=False asks for the rendered string rather than token ids.
    row['instruct_prompt'] = tokenizer.apply_chat_template(
        [system_turn, user_turn],
        tokenize=False,
        add_generation_prompt=True,
    )
    return row

# Tokenizer used by add_prompt() for chat templating and by the length filter
# below for counting tokens.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-7B-Instruct")

# Render the chat prompt for every row, then keep only the rows whose rendered
# prompt tokenizes to fewer than 512 tokens.
mbpp = mbpp.map(add_prompt).filter(
    lambda x: 512 > len(tokenizer.tokenize(x['instruct_prompt']))
)

# Write the processed dataset to disk.
mbpp.save_to_disk('datasets/mbpp_io')

