# Copyright 2024
# [ANONYMIZED_INSTITUTION],
# [ANONYMIZED_FACULTY],
# [ANONYMIZED_DEPARTMENT]
#
# Authors:
# AUTHOR_1 (author1@example.com)
# AUTHOR_2 (author2@example.com)
#
# Code generation tools and workflows:
# First versions of this code were potentially generated
# with the help of AI writing assistants including
# GitHub Copilot, ChatGPT, Microsoft Copilot, Google Gemini.
# Afterwards, the generated segments were manually reviewed and edited.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from pydantic import Field

from topollm.config_classes.config_base_model import ConfigBaseModel
from topollm.config_classes.constants import ITEM_SEP, KV_SEP, NAME_PREFIXES
from topollm.typing.enums import DescriptionType


class TokenizerConfig(ConfigBaseModel):
    """Tokenizer settings plus a helper that renders them as a description string."""

    # Toggle for adding a prefix space; off unless explicitly enabled.
    add_prefix_space: bool = Field(
        title="Add prefix space.",
        description="Whether to add prefix space.",
        default=False,
    )

    # Upper bound on the input sequence length; defaults to 512.
    max_length: int = Field(
        title="Maximum length of the input sequence.",
        description="The maximum length of the input sequence.",
        default=512,
    )

    # Toggle for returning the special tokens mask; on by default.
    return_special_tokens_mask: bool = Field(
        title="Return special tokens mask.",
        description="Whether to return special tokens mask.",
        default=True,
    )

    def get_config_description(
        self,
        description_type: DescriptionType = DescriptionType.LONG,
        short_description_separator: str = "-",
    ) -> str:
        """Render `add_prefix_space` and `max_length` as one description string.

        `return_special_tokens_mask` is not part of the rendered description.

        Args:
            description_type: Layout to produce. `LONG` joins each name prefix
                with its value via `KV_SEP` and the two entries via `ITEM_SEP`.
                `SHORT` uses the `*_short` name prefixes and places
                `short_description_separator` between every component.
            short_description_separator: Separator used by the `SHORT` layout
                only; ignored for `LONG`.

        Returns:
            The assembled description string.

        Raises:
            ValueError: If `description_type` is neither `LONG` nor `SHORT`.
        """
        if description_type == DescriptionType.LONG:
            prefix_space_entry: str = (
                f"{NAME_PREFIXES['add_prefix_space']}{KV_SEP}{self.add_prefix_space!s}"
            )
            max_length_entry: str = (
                f"{NAME_PREFIXES['max_length']}{KV_SEP}{self.max_length!s}"
            )
            return f"{prefix_space_entry}{ITEM_SEP}{max_length_entry}"

        if description_type == DescriptionType.SHORT:
            # The short form is a combined description, intended to be compact
            # enough to be used in the model name.
            sep = short_description_separator
            prefix_space_entry: str = (
                f"{NAME_PREFIXES['add_prefix_space_short']}{sep}{self.add_prefix_space!s}"
            )
            max_length_entry: str = (
                f"{NAME_PREFIXES['max_length_short']}{sep}{self.max_length!s}"
            )
            return f"{prefix_space_entry}{sep}{max_length_entry}"

        # Any other description type is rejected explicitly.
        msg: str = f"Unknown {description_type = }"
        raise ValueError(msg)
