import json
import os
import subprocess
from ast import literal_eval

from datasets import Dataset, concatenate_datasets, load_dataset
from transformers import AutoTokenizer


def clone_livecodebench():
    """Fetch the LiveCodeBench ``code_generation_lite`` dataset repository.

    Clones the Hugging Face dataset repository into ``code_generation_lite``
    under the current working directory and then pulls its Git LFS objects.
    Does nothing when that path already exists.

    The process working directory is never changed.

    Raises:
        subprocess.CalledProcessError: If any ``git`` command exits non-zero.
    """
    repo_dir = "code_generation_lite"
    # Only run if the data directory does not exist
    if os.path.exists(repo_dir):
        return

    subprocess.run(["git", "lfs", "install"], check=True)
    subprocess.run(
        ["git", "clone", "https://huggingface.co/datasets/livecodebench/code_generation_lite"],
        check=True,
    )
    # Run the pull inside the clone via ``cwd`` instead of os.chdir(): if the
    # pull fails, the process is not left stranded in the wrong directory.
    subprocess.run(["git", "lfs", "pull"], check=True, cwd=repo_dir)

# Make sure the raw JSONL files are on disk before they are loaded.
clone_livecodebench()

# Split name -> JSONL path. The first file is plain "test.jsonl"; the
# remaining ones carry their number ("test2.jsonl" ... "test6.jsonl").
data_files = {
    f"test{idx}": f"code_generation_lite/test{'' if idx == 1 else idx}.jsonl"
    for idx in range(1, 7)
}

# One dataset split per entry in data_files.
livecodebench = load_dataset("json", data_files=data_files)

def transform_public_tests(row):
    """Decode the serialized public test cases of one dataset row.

    Parses the string stored under ``public_test_cases`` and stores the
    result under a new ``public_tests`` key. JSON is tried first; text that
    is not valid JSON is parsed as a Python literal instead.

    Args:
        row: Mutable mapping for one example; must contain the key
            ``public_test_cases``.

    Returns:
        The same ``row`` object, with ``public_tests`` added.

    Raises:
        ValueError, SyntaxError: From ``literal_eval`` when the text is
            neither valid JSON nor a valid Python literal.
    """
    raw = row['public_test_cases']
    try:
        # JSON first: it understands true/false/null and JSON string escapes,
        # which literal_eval either rejects or decodes differently.
        parsed = json.loads(raw)
    except (TypeError, ValueError):
        # Not JSON (e.g. single-quoted Python literals, or a non-string
        # value): defer to literal_eval, which also keeps its error types.
        parsed = literal_eval(raw)
    row['public_tests'] = parsed
    return row
    
# Parse each row's serialized public tests (adds the "public_tests" column).
livecodebench = livecodebench.map(function=transform_public_tests)

# Tokenizer used both to render chat prompts and to count prompt tokens.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="Qwen/Qwen2.5-Coder-7B-Instruct",
)

def get_prompt(row):
    """Render the chat-formatted generation prompt for one problem.

    Builds a two-message conversation (a fixed system instruction followed by
    a user message wrapping ``row['question_content']``) and runs it through
    the chat template of the module-level ``tokenizer``.

    Args:
        row: Mapping containing the key ``question_content``.

    Returns:
        The value returned by ``tokenizer.apply_chat_template`` when called
        with ``tokenize=False`` and ``add_generation_prompt=True``.
    """
    system_message = {
        "role": "system",
        "content": "You are a coding assistant.  Your task is to output ONLY valid code for the given task.  Do not include explanations, comments, markdown formatting (```), or natural language.  Output exactly and only the code required. Use input() to input and print() to output.",
    }
    user_message = {
        "role": "user",
        "content": "Solve the following problem:\n\n" + row['question_content'],
    }
    return tokenizer.apply_chat_template(
        [system_message, user_message],
        tokenize=False,
        add_generation_prompt=True,
    )

def add_prompt(row):
    """Store the rendered prompt for ``row`` under its ``prompt`` key.

    Args:
        row: Mutable mapping for one example, passed through to
            ``get_prompt``.

    Returns:
        The same ``row`` object, with ``prompt`` set.
    """
    rendered = get_prompt(row)
    row['prompt'] = rendered
    return row

# Render the chat prompt for every row (adds the "prompt" column).
livecodebench = livecodebench.map(add_prompt)

# Keep only rows whose rendered prompt is shorter than 512 tokens.
livecodebench = livecodebench.filter(lambda x: len(tokenizer.tokenize(x['prompt'])) < 512)

# Merge every existing split into one combined "test" split.
# concatenate_datasets joins the datasets directly and keeps their features,
# rather than copying each row into a Python list and re-inferring the schema.
# The list of splits is built before "test" is inserted, so the combined
# split never includes itself.
livecodebench["test"] = concatenate_datasets(list(livecodebench.values()))

# Persist all splits (the per-file ones plus the combined "test") to disk.
livecodebench.save_to_disk('datasets/livecodebench')

