from datasets import load_dataset

# Numeric language ids mapped to the display names inserted into prompts.
# NOTE(review): the ids presumably follow the `language` field of the
# code_contests dataset — confirm against the dataset card.
language_mapping = {
    1: "Python 2",
    2: "C++",
    3: "Python 3",
    4: "Java",
}


def map_solution(x):
    """Turn one raw record into a ``solution`` / ``description`` pair.

    Only the first entry of ``x["solutions"]`` is used. Its language id is
    translated through ``language_mapping`` and appended to the record's
    description as a "Write a code in <language>." line; its solution text
    is returned unchanged.

    Args:
        x: A record with a ``description`` field and a ``solutions`` field
            holding parallel ``language`` and ``solution`` sequences.

    Returns:
        A dict with the keys ``solution`` and ``description``.

    Raises:
        KeyError: If the first language id is not in ``language_mapping``.
    """
    solutions = x["solutions"]
    # Resolve the language first so an unmapped id fails before anything else.
    language = language_mapping[solutions["language"][0]]
    first_solution = solutions["solution"][0]
    prompt = f"{x['description']}\nWrite a code in {language}."
    return {"solution": first_solution, "description": prompt}


# data = load_dataset("deepmind/code_contests")
# data = data.select_columns(["solutions", "description"])
# data = data.filter(
#     lambda x: (len(x["solutions"]["solution"]) > 0)
#     and (x["solutions"]["language"][0] in language_mapping)
# )
# data = data.map(
#     map_solution,
#     remove_columns=["solutions"],
# )
# data.save_to_disk("data_dir/code-contest")


def load_writingPrompts(
    num_samples=3000,
    output_dir="data_dir/writingPrompts-dist",
    num_proc=8,
):
    """Export a prefix of the writingprompts validation split to disk.

    Loads ``euclaise/writingprompts``, keeps the first ``num_samples`` rows of
    its ``validation`` split, adds ``solution`` and ``description`` fields to
    every row, and saves the result with ``save_to_disk``.

    Args:
        num_samples: Maximum number of validation rows to keep. If the split
            has fewer rows, all of them are kept instead of raising.
        output_dir: Directory the processed dataset is written to.
        num_proc: Number of worker processes passed to ``Dataset.map``.

    Returns:
        None. The result is only written to ``output_dir``.
    """
    dataset = load_dataset("euclaise/writingprompts")
    validation = dataset["validation"]
    # Clamp so a split shorter than ``num_samples`` does not make select() fail.
    keep = min(num_samples, len(validation))
    validation = validation.select(range(keep))

    def apply_chat_template(x):
        # Wrap the raw prompt in a fixed instruction; the story is the target.
        instruction = x["prompt"]
        instruction = f"Below is the beginning of a story. Write a story that continues from here.\n\n{instruction}"

        return {"solution": x["story"], "description": instruction}

    validation = validation.map(
        apply_chat_template,
        num_proc=num_proc,
    )
    validation.save_to_disk(output_dir)


# Module-level call: the export runs whenever this file is executed or imported.
# NOTE(review): consider an `if __name__ == "__main__":` guard if this module
# is ever imported for its helpers.
load_writingPrompts()
