import re

import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


# Regex for an optionally negative integer or decimal literal ("3", "-2", "4.75").
pattern = r'-?\d+(?:\.\d+)?'
# Cosine-similarity cutoff: encoding_sentence flags a condition whose similarity
# falls below this value.
threshold = 0.5
# Hugging Face model identifier of the sentence encoder.
model_card = "princeton-nlp/sup-simcse-roberta-large"
# Prefer the first CUDA device, but fall back to the CPU so the module can still
# be imported and used on machines without a GPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Weights and tokenizer files are cached under ../model/ (relative to the
# working directory).
tokenizer = AutoTokenizer.from_pretrained(model_card, cache_dir='../model/')
model = AutoModel.from_pretrained(model_card, cache_dir='../model/').to(device)


def encoding_sentence(conditions, question):
    """Score conditions against a question and flag low-similarity ones.

    All conditions and the question are embedded with the module-level
    ``model``/``tokenizer`` and compared by pairwise cosine similarity.

    Args:
        conditions: Sequence of condition sentences (strings).
        question: The question sentence (string).

    Returns:
        A ``(score, candidate_indices)`` tuple where

        * ``score`` is the reciprocal of the mean cosine similarity between
          each condition and the question (NaN when ``conditions`` is empty);
        * ``candidate_indices`` lists, in ascending order, the indices of the
          conditions that contain a number (per ``pattern``) and whose
          similarity to the question, or whose mean similarity to all the
          other sentences, is below ``threshold``.
    """
    # Nothing to compare: averaging an empty similarity vector is NaN, so
    # return that directly instead of running the model for no result.
    if not conditions:
        return float("nan"), []

    # Indices of the conditions that mention at least one number; a set keeps
    # the membership test below O(1).
    numeric_conditions = {
        idx for idx, condition in enumerate(conditions)
        if re.search(pattern, condition)
    }

    # The question goes last so that it is the final row/column of the matrix.
    sentences = [*conditions, question]
    with torch.no_grad():
        inputs = tokenizer(
            sentences, padding=True, truncation=True, return_tensors="pt"
        ).to(device)
        # Only the pooled output is needed, so the per-layer hidden states are
        # not requested.
        embeddings = model(**inputs, return_dict=True).pooler_output.cpu()

    similarity_matrix = cosine_similarity(embeddings)
    # Zero the self-similarities, then mask zeros so they do not enter the
    # row means (any other entry that is exactly 0 is masked as well).
    np.fill_diagonal(similarity_matrix, 0)
    # Mean similarity of each condition to every other sentence (the other
    # conditions and the question); the question's own row is dropped.
    ctoc = np.ma.masked_equal(similarity_matrix, 0).mean(axis=1)[:-1]
    # Similarity of each condition to the question.
    ctoq = similarity_matrix[-1][:-1]

    ic_candidate_index = [
        i for i in range(len(ctoc))
        if i in numeric_conditions
        and (ctoq[i] < threshold or ctoc[i] < threshold)
    ]
    return 1 / np.mean(ctoq), ic_candidate_index
