import collections

# Ref: https://github.com/unslothai/unsloth/blob/f26d4e739ed507de7a9088da53d10fd02f58d160/unsloth/chat_templates.py#L1355


def standardize_sharegpt(
    dataset,
    aliases_for_system=("system",),
    aliases_for_user=(
        "user",
        "human",
        "input",
    ),
    aliases_for_assistant=(
        "gpt",
        "assistant",
        "output",
        "bing",
        "chatgpt",
        "bard",
    ),
    num_proc=8,
    messages_col="messages",
    conversations_col="conversations",
):
    """
    Standardize ShareGPT-style conversations to ``role`` / ``content`` messages.

    The first 10 rows of ``dataset`` are sampled to discover the two keys used
    by each message (for example ``from`` / ``value``) and to decide which of
    them carries the speaker role. Every message is then rewritten as
    ``{"role": ..., "content": ...}`` with the role mapped to one of
    ``"system"``, ``"user"`` or ``"assistant"``.

    Args:
        dataset: Object supporting ``dataset[:10][conversations_col]`` and
            ``dataset.map(fn, batched=True, desc=..., num_proc=...)``
            (presumably a Hugging Face ``datasets.Dataset`` -- confirm against
            callers). Each conversation is an iterable of two-key dicts.
        aliases_for_system: Role names that map to ``"system"``.
        aliases_for_user: Role names that map to ``"user"``.
        aliases_for_assistant: Role names that map to ``"assistant"``.
            If an alias appears in several groups, the later group wins
            (system, then user, then assistant).
        num_proc: Forwarded to ``dataset.map``.
        messages_col: Name of the output column holding the standardized
            conversations.
        conversations_col: Name of the input column holding the raw
            conversations.

    Returns:
        Whatever ``dataset.map`` returns for the standardizing function.

    Raises:
        AssertionError: If the sampled messages do not use exactly two
            distinct keys (this includes an empty sample).
        TypeError: If a sampled role value is not one of the given aliases.
    """
    # Alias -> canonical role. Insertion order reproduces the precedence of
    # the three groups when an alias is listed more than once.
    aliases_mapping = {}
    for canonical_role, aliases in (
        ("system", aliases_for_system),
        ("user", aliases_for_user),
        ("assistant", aliases_for_assistant),
    ):
        for alias in aliases:
            aliases_mapping[alias] = canonical_role
    all_aliases = set(aliases_mapping)

    # Collect the distinct values seen under every message key in the sample.
    uniques = collections.defaultdict(set)
    for convo in dataset[:10][conversations_col]:
        for message in convo:
            for key, value in message.items():
                uniques[key].add(value)

    # Raised explicitly (rather than via `assert`) so the check survives
    # `python -O`; AssertionError is kept so existing handlers still match.
    if len(uniques) != 2:
        raise AssertionError(
            f"Unsloth: expected every message to have exactly 2 keys (role and content), "
            f"but found {len(uniques)}: {list(uniques)}."
        )

    first_key, second_key = uniques

    # Prefer the key whose sampled values are all known aliases. Comparing
    # unique-value counts alone misfires on ties (e.g. one two-turn
    # conversation), so it is only the fallback when the alias test is
    # ambiguous (both keys or neither key qualify).
    alias_only_keys = [key for key in (first_key, second_key) if uniques[key] <= all_aliases]
    if len(alias_only_keys) == 1:
        role_is_first = alias_only_keys[0] == first_key
    else:
        role_is_first = len(uniques[first_key]) < len(uniques[second_key])

    if role_is_first:
        role_key, content_key = first_key, second_key
    else:
        role_key, content_key = second_key, first_key

    # Every sampled role must be a known alias.
    leftover_aliases = uniques[role_key] - all_aliases
    if leftover_aliases:
        raise TypeError(f"Unsloth: {list(leftover_aliases)} are not in aliases. Please update aliases.")

    def _standardize_dataset(examples):
        # Batched map function: `examples` maps column names to lists of rows.
        all_convos = [
            [
                {
                    "role": aliases_mapping[message[role_key]],
                    "content": message[content_key],
                }
                for message in convo
            ]
            for convo in examples[conversations_col]
        ]
        return {
            messages_col: all_convos,
        }

    return dataset.map(_standardize_dataset, batched=True, desc="Standardizing format", num_proc=num_proc)
