import os
import re

from sentence_transformers import SentenceTransformer, util
# Module-wide embedding model, constructed once when this module is imported.
# The model name/path is read from the EMBEDDING_MODEL_PATH environment
# variable; ``os.environ.get`` yields None when the variable is unset.
# NOTE(review): confirm how SentenceTransformer behaves when handed None here.
embedding_model = SentenceTransformer(os.environ.get("EMBEDDING_MODEL_PATH"))

def call_embedding_model(clarify_questions_list, groundtruth_questions_list):
    """Return the best cosine similarity between any clarify/ground-truth pair.

    Both inputs are encoded with the module-level ``embedding_model`` and
    compared with ``util.pytorch_cos_sim``. Every pairwise score is collected
    (clarify question as the outer index, ground-truth question as the inner
    one), the full list is printed to stdout, and the largest score is
    returned.

    Args:
        clarify_questions_list: Sized collection of texts to encode; its
            ``len`` drives the outer index. Presumably a list of strings --
            verify against callers.
        groundtruth_questions_list: Sized collection of reference texts; its
            ``len`` drives the inner index.

    Returns:
        The maximum pairwise score as a Python number (via ``.item()``).

    Raises:
        ValueError: From ``max`` when no pairs were collected, i.e. when
            either input has length zero (assuming encoding and scoring did
            not fail earlier).
    """
    clarify_vecs = embedding_model.encode(clarify_questions_list, convert_to_tensor=True)
    truth_vecs = embedding_model.encode(groundtruth_questions_list, convert_to_tensor=True)
    score_matrix = util.pytorch_cos_sim(clarify_vecs, truth_vecs)

    n_clarify = len(clarify_questions_list)
    n_truth = len(groundtruth_questions_list)

    # Flatten the score matrix row by row into plain Python numbers.
    similarity_list = [
        score_matrix[row][col].item()
        for row in range(n_clarify)
        for col in range(n_truth)
    ]
    print(similarity_list)
    return max(similarity_list)
def check_language_by_frequency(
    text, 
    char_threshold=0.20, 
    word_count_threshold=3, 
):
    chinese_chars = re.findall(r'[\u4e00-\u9fff]', text)
    english_chars = re.findall(r'[a-zA-Z]', text)

    chinese_count = len(chinese_chars)
    english_count = len(english_chars)
    total_lang_chars = chinese_count + english_count

    if total_lang_chars == 0:
        return "other"

    chinese_ratio = chinese_count / total_lang_chars
    english_ratio = english_count / total_lang_chars

    has_chinese = chinese_ratio > char_threshold
    has_english = english_ratio > char_threshold

    if has_chinese and not has_english:
        return "zh"
    if not has_chinese and has_english:
        return "en"
    if not has_chinese and not has_english:
        return "other"

    if has_chinese and has_english:

        is_chinese_dominant = chinese_count > english_count

        if is_chinese_dominant:
            english_words = re.findall(r'\b[a-zA-Z-]+\b', text.lower())
            word_count = len(english_words)
            
            if word_count <= word_count_threshold:
                return "zh"
        else:
            chinese_chunks = re.findall(r'[\u4e00-\u9fff]+', text)
            word_count = len(chinese_chunks)

            if word_count <= word_count_threshold:
                return "en"
        
        return "zh-en"
