import torch
from transformers import AutoProcessor, AutoModel
from scipy import spatial
import clip
import pandas as pd
from PIL import Image
from transformers import AutoProcessor, AutoModel


class clip_infer:
    """Score how well an image matches a text prompt with a CLIP-style model.

    The wrapped ``model`` must expose ``encode_image`` and ``encode_text``;
    ``transform`` must turn an image into a tensor that supports
    ``.unsqueeze(0).to(device)``.
    """

    def __init__(self, device='cuda', model=None, transform=None):
        """Store the device, model and preprocessing transform.

        Args:
            device: Device that image and token tensors are moved to before
                they are passed to the model.
            model: Object providing ``eval``, ``encode_image`` and
                ``encode_text``. May be left as ``None`` and assigned to
                ``self.model`` later, before ``evaluate`` is called.
            transform: Callable applied to the image passed to ``evaluate``.
        """
        self.device = device
        self.model = model
        self.transform = transform
        # ``model`` defaults to None, so only switch to inference mode when a
        # model was actually supplied; otherwise construction with the
        # default arguments would fail with AttributeError.
        if self.model is not None:
            self.model.eval()

    def _encode_image(self, image):
        """Return the model's image features as a float32 CPU tensor."""
        # unsqueeze(0) adds the leading batch dimension of size one.
        image_input = self.transform(image).unsqueeze(0).to(self.device)
        return self.model.encode_image(image_input).detach().cpu().float()

    def evaluate(self, prompt, images_PIL):
        """Return the cosine similarity between an image and a text prompt.

        Args:
            prompt: Text handed to ``clip.tokenize`` (with ``truncate=True``).
                Expected to yield a single row of text features; the
                flattening below fails otherwise.
            images_PIL: Image handed to ``self.transform`` (presumably a
                single PIL image, judging by the name).

        Returns:
            ``1 - cosine_distance`` between the image and text feature
            vectors, as computed by ``scipy.spatial.distance.cosine``.

        Raises:
            ValueError: If ``model`` or ``transform`` has not been set.
        """
        if self.model is None or self.transform is None:
            raise ValueError(
                "clip_infer requires both 'model' and 'transform' to be set "
                "before calling evaluate()"
            )

        # One no_grad context covers both encoders; no gradients are needed
        # for scoring.
        with torch.no_grad():
            image_features = self._encode_image(images_PIL)
            tokens = clip.tokenize(prompt, truncate=True).to(self.device)
            text_features = self.model.encode_text(tokens).detach().cpu().float()

        # Drop the batch dimension so scipy receives two 1-D vectors.
        image_vector = image_features.view(image_features.shape[1])
        text_vector = text_features.view(text_features.shape[1])
        return 1 - spatial.distance.cosine(image_vector, text_vector)
