from datasets import load_dataset, Dataset, load_from_disk
from ast import literal_eval
from transformers import AutoTokenizer


# Load the "open-r1/codeforces" dataset.
cf = load_dataset("open-r1/codeforces")

# Best-effort de-duplication: drop every Codeforces task whose stripped
# description also appears in the locally saved CodeContests dataset. Any
# failure (for example, the local dataset is missing) is reported and the
# script carries on with the unfiltered data.
try:
    print("Filtering out codecontests tasks")
    cc = load_from_disk("datasets/codecontests")
    # A set gives constant-time membership checks; the filter below runs one
    # lookup per Codeforces row.
    cc_tasks = {
        description.strip()
        for split in ("train", "test")
        for description in cc[split]["description"]
        if description is not None
    }

    # The predicate must be a callable evaluated per row. Rows without a
    # description cannot match a CodeContests task, so they are kept here.
    cf = cf.filter(
        lambda x: x["description"] is None
        or x["description"].strip() not in cc_tasks
    )

except Exception as exc:
    print("Something went wrong with filtering out codecontests tasks")
    # Surface the cause so a silently skipped de-duplication is noticeable.
    print(f"Reason: {exc!r}")



# Drop tasks whose description is missing or blank.
cf = cf.filter(
    lambda row: row['description'] is not None and row['description'].strip() != ''
)
# Keep only tasks without a generated checker (no extra code for tests).
cf = cf.filter(lambda row: row['generated_checker'] in (None, ''))
# Keep only tasks flagged as executable.
# NOTE(review): the original comment says these have 3 correct solutions - confirm.
cf = cf.filter(lambda row: row['executable'])
# Keep only tasks that have no interaction format.
cf = cf.filter(lambda row: row['interaction_format'] is None)

def combine_tests(row):
    """Attach a ``tests`` field joining the example and official tests.

    A ``None`` value in ``row['examples']`` or ``row['official_tests']`` is
    treated as an empty list. The examples come first in the combined list.
    The row is updated in place and returned.
    """
    sample_cases = row['examples']
    judge_cases = row['official_tests']

    row['tests'] = (
        ([] if sample_cases is None else sample_cases)
        + ([] if judge_cases is None else judge_cases)
    )
    return row

# Build the combined ``tests`` field, then discard tasks that end up with no tests.
cf = cf.map(combine_tests).filter(lambda row: len(row['tests']) > 0)

# Tokenizer used to render the chat prompt and to measure prompt length.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-7B-Instruct")


def add_prompt(row):
    """Render the chat prompt for one task and store it in ``row['prompt']``.

    The user message starts with the task description. The input and output
    format sections are appended only when the corresponding field is neither
    ``None`` nor an empty string. The system and user messages are rendered as
    text through the module-level ``tokenizer``'s chat template, with the
    generation prompt appended. The row is updated in place and returned.
    """
    user_text = "Solve the following problem:\n\n" + row['description']

    optional_sections = (
        ("\n\nInput Format:\n", row['input_format']),
        ("\n\nOutput Format:\n", row['output_format']),
    )
    for header, body in optional_sections:
        if body == '' or body is None:
            continue
        user_text += header + body

    system_text = "You are a coding assistant.  Your task is to output ONLY valid code for the given task.  Do not include explanations, comments, markdown formatting (```), or natural language.  Output exactly and only the code required. Use input() to input and print() to output."
    chat = [
        {"role": "system", "content": system_text},
        {"role": "user", "content": user_text},
    ]

    row['prompt'] = tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True,
    )
    return row


# Render a prompt for every task, then keep only tasks whose prompt is
# shorter than 512 tokens under the tokenizer.
cf = cf.map(add_prompt).filter(
    lambda row: len(tokenizer.tokenize(row["prompt"])) < 512
)

# Write the prepared dataset to disk.
cf.save_to_disk("datasets/codeforces")
