import torch
import numpy as np
import pickle, json, time, re, sys, os
import networkx as nx
from multiprocessing import Pool
import dgl
from dgl import from_networkx
import dgl
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer

# Expose only GPU index 3 to CUDA for this process (and any child processes
# that inherit the environment).
# NOTE(review): this is set after `import torch` above; presumably it still
# takes effect because CUDA is not initialized until first use — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

def run_one_design(design_name,
                   folder_dir='/home/coguest5/hdl_fusion/data_collect/vlg/data/ori_vlg'):
    """Return the contents of ``<folder_dir>/<design_name>.v`` as one line.

    Args:
        design_name: Base name of the Verilog file, without the ``.v`` suffix.
        folder_dir: Directory containing the Verilog sources. Defaults to the
            original hard-coded dataset location.

    Returns:
        The file's text with every newline character removed. An empty file
        yields an empty string.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    with open(f'{folder_dir}/{design_name}.v', 'r') as f:
        # Newlines are dropped, not replaced by a space, so the last token of
        # one line is fused with the first token of the next.
        # NOTE(review): kept as-is so embeddings stay comparable with ones
        # already generated — confirm the fusing is intended.
        return f.read().replace('\n', '')


def _reset_dir(path):
    """Make ``path`` an empty directory, discarding anything already there."""
    import shutil  # local import: shutil is not imported at file level

    if os.path.islink(path) or os.path.isfile(path):
        # A plain file or a symlink: unlink it (rmtree refuses symlinks).
        os.remove(path)
    elif os.path.isdir(path):
        shutil.rmtree(path)
    os.makedirs(path)


def get_dataset(design_lst):
    """Embed each design's Verilog source and pickle the result.

    For every name in ``design_lst`` the directory ``../rtl_emb/<design>`` is
    wiped and recreated, the design text is loaded with ``run_one_design``,
    encoded with the sentence-embedding model, and the embedding is written
    to ``../rtl_emb/<design>/<design>.pkl``.

    Args:
        design_lst: Iterable of design names (file base names without ``.v``).

    Returns:
        None. Results are written to disk.
    """
    checkpoint = "intfloat/e5-mistral-7b-instruct"
    # The model is loaded once and reused for every design. No device is
    # passed, so the library's default device selection applies.
    model = SentenceTransformer(checkpoint)
    model.max_seq_length = 4096

    for design in design_lst:
        print("Current design: ", design)

        save_dir = f"../rtl_emb/{design}"
        # Start from an empty output directory. This is done in-process
        # rather than via a shell `rm -rf`, so design names containing spaces
        # or shell metacharacters cannot be misinterpreted.
        _reset_dir(save_dir)

        documents = run_one_design(design)
        document_embeddings = model.encode(documents)
        with open(f"{save_dir}/{design}.pkl", 'wb') as f:
            pickle.dump(document_embeddings, f)

    

if __name__ == '__main__':
    # Script entry point: read the design names from the dataset index and
    # generate an embedding for each one.
    # NOTE(review): the JSON file is presumably a list of design-name
    # strings — confirm against the dataset.
    with open("/home/coguest5/hdl_fusion/dataset/dataset_js/design_all.json", 'r') as f:
        design_lst_all = json.load(f)

    get_dataset(design_lst_all)