from transformers import AutoModel, AutoTokenizer
import json 
from tqdm import tqdm 
import numpy as np
import torch 

# Use the first CUDA device when one is available; otherwise run on the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Hugging Face checkpoint name shared by the tokenizer and the model.
checkpoint = "salesforce/codet5p-110m-embedding"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
# Load the model and place it on the selected device in one step
# (Module.to returns the module itself).
model = AutoModel.from_pretrained(checkpoint, trust_remote_code=True).to(device)

# JSONL input: one JSON object per line; only its 'synthetic_text' value is kept.
path = 'livecodebench/execution2/test.jsonl'
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# locale default encoding.
with open(path, 'r', encoding='utf-8') as f:
    # Whitespace-only lines are skipped because json.loads rejects them.
    dataset = [
        json.loads(line)['synthetic_text']
        for line in f
        if line.strip()
    ]

# Report how many texts were loaded.
print(len(dataset))
# dataset = dataset[:10]
# Embed each text on its own and collect one NumPy array per text.
all_embeddings = []
with torch.no_grad():  # inference only, so no autograd graph is needed
    for item in tqdm(dataset):
        # truncation=True makes the max_length cap explicit instead of
        # relying on the tokenizer's implicit handling of a bare max_length.
        input_ids = tokenizer.encode(
            item,
            max_length=4096,
            truncation=True,
            return_tensors="pt",
        ).to(device)
        # NOTE(review): [0] is presumably the embedding vector of the single
        # encoded text — confirm against the checkpoint's model card.
        embedding = model(input_ids)[0]
        # Move to host memory first; .numpy() requires a CPU tensor.
        all_embeddings.append(embedding.cpu().numpy())

# Stack the per-text arrays row-wise into a single array, report its shape,
# and persist it to disk.
all_embeddings_np = np.vstack(all_embeddings)
print(all_embeddings_np.shape)
# print(all_embeddings_np)
np.save("lcb-codeexe-embeddings.npy", all_embeddings_np)