from pydantic import BaseModel

class PRConfig(BaseModel):
    """Default settings for the "PR" stage.

    NOTE(review): what "PR" abbreviates is not visible in this file — confirm
    against the code that consumes this config.
    """

    # Enabled by default. Presumably toggles an LLM-based refinement step;
    # verify against the consumer.
    use_refine_llm: bool = True


class TEConfig(BaseModel):
    """Default settings for the "TE" stage.

    Groups decision thresholds, retrieval/chunking sizes, model identifiers
    and device selection. This class only declares values; how each one is
    consumed is defined elsewhere.

    NOTE(review): "TE" presumably means textual entailment, judging by the
    entailment/contradiction thresholds and the NLI-style model names below —
    confirm.
    """

    entail_threshold: float = 0.5     # optional: hard threshold for entailment
    contra_threshold: float = 0.5     # optional: hard threshold for contradiction
    # NOTE(review): undocumented in the original. Presumably a max-probability
    # cutoff used to flag a prediction as uncertain — confirm with the consumer.
    uncertain_maxp: float = 0.7
    topk: int = 1                     # number of chunks to retrieve per atom
    max_chunk_chars: int = 420        # max characters per chunk in char-level chunking
    chunk_overlap: int = 80           # overlap for char-level chunking
    emb_model: str = "BAAI/bge-m3"    # embedding model
    # Two model identifiers for the TE step. NOTE(review): they look like
    # Hugging Face Hub ids; how the two models are combined is not shown here.
    te_model1: str = "joeddav/xlm-roberta-large-xnli"
    te_model2: str = "facebook/bart-large-mnli"
    device: str = "auto"              # "auto" | "cpu" | "cuda:0"

    # Enabled by default. Presumably toggles LLM-based splitting of text into
    # the "atoms" mentioned for `topk` — verify against the consumer.
    use_atomize_llm: bool = True

    # NOTE(review): disabled tokenizer options, kept as found. ClassifyConfig
    # declares similar, active fields; confirm whether these are still needed
    # here or can be removed.
    # use_fast_tokenizer: bool = False
    # tokenizer_override1: str | None = "xlm-roberta-large"
    # tokenizer_override2: str | None = None


class ClassifyConfig(BaseModel):
    """Default settings for the classification stage.

    Holds model identifiers, three confidence-related thresholds, device
    selection and tokenizer options. This class only declares values; the
    decision logic that applies the thresholds lives elsewhere.
    """

    # Optional fine-tuned model identifier; unset (None) by default.
    # NOTE(review): presumably `zeroshot_model` is used when this is None —
    # confirm against the consumer.
    finetuned_model: str | None = None
    zeroshot_model: str = "MoritzLaurer/deberta-v3-large-zeroshot-v1"
    # NOTE(review): "MSP" presumably means maximum softmax probability — confirm.
    conf_threshold: float = 0.60      # tau_conf: MSP threshold
    margin_threshold: float = 0.20    # tau_margin: top1-top2 margin threshold
    entropy_threshold: float = 1.50   # tau_H: entropy threshold (nats)
    # NOTE(review): accepted values are not listed here; TEConfig.device
    # documents "auto" | "cpu" | "cuda:0" — presumably the same applies.
    device: str = "auto"

    # Tokenizer options. NOTE(review): presumably forwarded to the tokenizer
    # loader (fast vs. slow implementation, and a tokenizer name to load in
    # place of the model's own) — confirm against the consumer.
    use_fast_tokenizer: bool = False
    tokenizer_override: str | None = "microsoft/deberta-v3-large"


class ChunkConfig(BaseModel):
    """Default settings for text chunking.

    This class only declares values; the chunker that interprets them is
    defined elsewhere. Neither `mode` nor `lang` is validated here: both are
    plain `str` fields, so the values listed in the comments are conventions,
    not enforced choices.
    """

    mode: str = "sentence"            # "word" or "char" or "sentence"
    lang: str = "auto"                # "auto", "zh", or "en"
    max_words: int = 7                # max words per chunk
    overlap_words: int = 2            # overlap in words between consecutive chunks
    # NOTE(review): undocumented in the original. Presumably the minimum number
    # of words for a chunk to be kept; how it interacts with `max_words` (7)
    # and `overlap_words` (2) should be confirmed against the chunker.
    min_words: int = 5


# Module-level config instances, each built once at import time from its
# class defaults (no arguments are passed). Every importer of these names
# receives the same objects.
CFG_PR = PRConfig()
CFG_TE = TEConfig()
CFG_CLF = ClassifyConfig()
CFG_CHUNK = ChunkConfig()
