import json
import torch
from datasets import Dataset
import numpy as np
from util.globals import *
from transformers import AutoTokenizer, pipeline,AutoModelForCausalLM
# Load the evaluation records from disk and report how many there are.
# The encoding is given explicitly (JSON text is UTF-8) so the read does not
# depend on the machine's locale, matching the explicit UTF-8 used when the
# label file is written at the end of this script.
with open("/home/ssliang/unlearning/data/zsre_test.json", "r", encoding="utf-8") as f:
    dataset = json.load(f)
print(len(dataset))
# Run the causal LM over the first (up to) 500 'src' prompts and collect, for
# each prompt, its full logits matrix as a NumPy array of shape
# (padded_seq_len, vocab_size). The arrays are appended to `alldata`.
alldata = []
tokenizer = AutoTokenizer.from_pretrained("/home/ssliang/unlearning/opt-1.3b")
# Pad on the LEFT so that the final sequence position of every row is the last
# real prompt token. With right padding (presumably the tokenizer default --
# confirm), position -1 of every prompt shorter than the longest one is a pad
# slot, and any consumer that reads the logits at index -1 gets pad-position
# outputs instead of the prompt's next-token logits.
tokenizer.padding_side = "left"
model = AutoModelForCausalLM.from_pretrained("/home/ssliang/unlearning/opt-1.3b")
# Move the model to the GPU once instead of on every loop iteration.
model.to("cuda:0")

# Slicing (rather than indexing 0..499) tolerates datasets with < 500 records.
newdata = [record['src'] for record in dataset[:500]]
unlearntokenized = tokenizer(newdata, padding=True)
#print(unlearntokenized)

# Inference only: disable autograd so no graph is kept for each forward pass.
with torch.no_grad():
    for i in range(len(newdata)):
        print(i)
        # Wrap each row in a list to add the batch dimension -> shape (1, seq_len).
        input_ids = torch.tensor([unlearntokenized["input_ids"][i]]).to("cuda:0")
        attention_mask = torch.tensor([unlearntokenized["attention_mask"][i]]).to("cuda:0")
        outputs = model(input_ids, attention_mask=attention_mask)

        # Drop only the batch axis; a bare squeeze() would also collapse a
        # length-1 sequence axis.
        outputsnew = outputs.logits[0]
        alldata.append(outputsnew.cpu().numpy())
print('alldata', np.array(alldata).shape)
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans

# Project the logits at the final sequence position of every sample down to
# 2-D with t-SNE, then group the 2-D points into 5 clusters with KMeans.
# Neither estimator is given a random_state, so results vary between runs.
try:
    # Newer scikit-learn names the iteration budget `max_iter`; `n_iter` was
    # deprecated and later removed.
    tsne = TSNE(n_components=2, perplexity=3, max_iter=300)
except TypeError:
    # Older scikit-learn releases only accept `n_iter`.
    tsne = TSNE(n_components=2, perplexity=3, n_iter=300)
# assumes the arrays in `alldata` all share one shape so they stack into
# (n_samples, seq_len, vocab_size) -- TODO confirm
newarray = np.array(alldata)
X_embedded = tsne.fit_transform(newarray[:, -1, :])
print(X_embedded)
# NOTE: this rebinds `model`, which previously referred to the language model,
# to the KMeans estimator.
model = KMeans(n_clusters=5, verbose=1, max_iter=20, tol=0.01, n_init=3)
model.fit(X_embedded)
y_predict = model.labels_
print('predict', y_predict)
# Turn every cluster label into a list of its space-separated string pieces
# (a single-element list for a plain integer label).
split_list = []
for label in y_predict:
    split_list.append(str(label).split(' '))

# Write each entry as the text of its list with the surrounding brackets
# stripped, i.e. the quoted label such as '3'.
# NOTE(review): no separator or newline is written between labels, so the file
# is one continuous run like '3''1''0' -- confirm this is what readers expect.
with open("./labels_zsre.txt", 'w', encoding='utf-8') as f:
    f.writelines(
        str(pieces).strip('[').strip(']')
        for pieces in split_list
    )