import datasets
import re
import random


def prepare_text(example):
    """Build the ``text`` field from the randomized prompt and solution.

    The value stored under ``"text"`` is the ``"randomized_prompt"`` entry,
    a newline, then the ``"randomized_canonical_solution"`` entry.

    Args:
        example: Mapping holding ``"randomized_prompt"`` and
            ``"randomized_canonical_solution"``. It is modified in place.

    Returns:
        The same ``example`` object, now carrying a ``"text"`` key.
    """
    prompt_part = example["randomized_prompt"]
    solution_part = example["randomized_canonical_solution"]
    example["text"] = "{}\n{}".format(prompt_part, solution_part)
    return example


def insert_instruction_into_docstring(prompt: str) -> str:
    """Insert the Apple/Orange comment instruction into the first docstring.

    The instruction is placed right after the first triple-quote delimiter
    found in ``prompt`` (either ``\"\"\"`` or ``'''``) and any whitespace that
    immediately follows it, so it becomes the leading text of that docstring.

    Args:
        prompt: Source text expected to contain a triple-quoted docstring.

    Returns:
        ``prompt`` with the instruction inserted, or ``prompt`` unchanged when
        it contains no triple-quote delimiter.
    """
    instruction_line = (
        "In the function, include a single-line comment with either 'Apple' or 'Orange'. "
        'The comment should start with a hash symbol, for example: "# Apple". '
    )

    # Accept both docstring delimiters; matching only `"""` would silently
    # leave `'''`-style prompts without the instruction. The trailing `\s*`
    # also swallows a newline and the indentation of the first docstring line.
    opening = re.search(r'(?:"{3}|' r"'{3})\s*", prompt)
    if opening is None:
        return prompt  # No docstring found

    # Insert the instruction after the opening quotes and leading whitespace.
    insert_at = opening.end()
    return prompt[:insert_at] + instruction_line + prompt[insert_at:]


def get_humaneval(cache_dir):
    """Load the HumanEval test split and add the randomized columns.

    Each example gains:
      * ``randomized_prompt`` - the original ``prompt`` passed through
        ``insert_instruction_into_docstring``;
      * ``randomized_canonical_solution`` - the ``canonical_solution``
        preceded by an indented ``# Apple`` or ``# Orange`` comment line;
      * ``random_word`` - the word that was picked for that comment;
      * ``text`` - added by ``prepare_text`` in a second mapping pass.

    Args:
        cache_dir: Forwarded to ``datasets.load_dataset`` as ``cache_dir``.

    Returns:
        The mapped dataset.
    """

    def _add_randomized_fields(row):
        # Prompt variant carrying the instruction inside its docstring.
        row["randomized_prompt"] = insert_instruction_into_docstring(row["prompt"])

        # The word is chosen independently for every example.
        chosen_word = random.choice(["Apple", "Orange"])
        row["randomized_canonical_solution"] = "    # {}\n{}".format(
            chosen_word, row["canonical_solution"]
        )
        row["random_word"] = chosen_word
        return row

    humaneval = datasets.load_dataset(
        "openai/openai_humaneval",
        split="test",
        cache_dir=cache_dir,
        trust_remote_code=True,
    )
    return humaneval.map(_add_randomized_fields, desc="Altering columns").map(prepare_text)