from tqdm import tqdm
from PIL import Image
import torch
import os
import numpy  as np

from transformers import CLIPProcessor, CLIPModel
# Load the CLIP weights and the matching pre-processor from one checkpoint so
# the tokenizer / image transforms always agree with the model.
# NOTE(review): the identifier has no organisation prefix, so it presumably
# names a local directory rather than a Hub repository - confirm.
CLIP_CHECKPOINT = "clip-vit-base-patch16"
model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

def get_clip_score(image_path, text):
    """Score how well each caption in ``text`` matches one image using CLIP.

    Uses the module-level ``processor`` and ``model``.

    Args:
        image_path: Path of the image file to open with PIL.
        text: Candidate captions, passed as a list of strings.

    Returns:
        The ``logits_per_image`` tensor produced by the model. With a single
        image this has shape ``(1, number_of_captions)``.
    """
    # Image.open keeps the file handle open; the context manager releases it
    # once the processor has read the pixel data.
    with Image.open(image_path) as image:
        inputs = processor(text=text, images=image, return_tensors="pt", padding=True)

    # Pure inference: skip autograd bookkeeping so no graph is retained.
    with torch.no_grad():
        outputs = model(**inputs)

    # Diagnostic output kept from the original script.
    print(outputs)
    logits_per_image = outputs.logits_per_image
    print(logits_per_image, logits_per_image.shape)  # shape: (1, number of captions)
    return logits_per_image

# Example run: score a single image against several candidate captions.
image_path = 'zero123_dataset/object_to_forget_angle/image/minion.png'
# The captions are handed over together as one list.
text = [
    'a minion standing',
    'shirtless putin at pride',
    'A cheerful yellow animated character in blue overalls waves with one hand, featuring a single central eye with a goggle.',
]
get_clip_score(image_path, text)
