from huggingface_hub import notebook_login
from huggingface_hub import login

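# Authenticate with the Hugging Face Hub only if the checkpoint requires it.
# Never hard-code an access token in source; read it from the environment, e.g.
# login(token=os.environ["HF_TOKEN"])
# NOTE: a literal token was previously committed on this line and should be revoked.
# notebook_login()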


import torch
from diffusers import StableDiffusionPipeline
import numpy as np
# from transformers import CLIPTextModel, CLIPTokenizer

# Stable Diffusion checkpoint on the Hugging Face Hub and the device to run on.
model_id = "CompVis/stable-diffusion-v1-4"
device = "cuda"

# Load the full pipeline in half precision and move it to the target device;
# only its tokenizer and text encoder are picked out here.
pipe = StableDiffusionPipeline.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
).to(device)

tokenizer = pipe.tokenizer
text_encoder = pipe.text_encoder


# One text prompt per object class, ten in total.
# NOTE(review): the subjects look like the CIFAR-10 classes in label order -- confirm.
# NOTE(review): "corner" in the deer prompt may be a typo (horns/antlers?); the
# string is left untouched because changing it would change the saved embeddings.
prompts = [
    "airplane in the sky in real life",
    "sedan car in real life",
    "bird in real life",
    "cat in real life",
    "deer with corner in real life",
    "dog in real life",
    "frog in real life",
    "horse in real life",
    "ship in the ocean in real life",
    "truck in real life",
]

# Encode every prompt on its own and keep a single vector per prompt: the text
# encoder's first output, batch item 0, token position 1.
# NOTE(review): position 0 is presumably the tokenizer's start-of-text token,
# making position 1 the first word of the prompt. If the text encoder is
# causally masked (as CLIP's text model is documented to be), the remainder of
# the prompt cannot influence this vector -- confirm this is the intent.
embeds = []
with torch.no_grad():  # inference only: do not build an autograd graph
    for prompt in prompts:  # follows len(prompts) instead of a hard-coded 10
        # Place the ids on whatever device the encoder lives on rather than
        # hard-coding "cuda".
        input_ids = tokenizer(prompt, return_tensors="pt").input_ids
        input_ids = input_ids.to(text_encoder.device)
        hidden_states = text_encoder(input_ids)[0]
        embeds.append(hidden_states[0][1].cpu())

# Stack directly in torch; no round trip through a NumPy array is needed.
pt_embeds = torch.stack(embeds)
# Keep `embeds` as a NumPy array sharing memory with `pt_embeds`, as before,
# in case later code relies on it.
embeds = pt_embeds.numpy()
print(pt_embeds.shape)

torch.save(pt_embeds, "./embeds.pt")
