import spacy
import json
from rich.progress import track
from transformers import AutoTokenizer, AutoProcessor

#part_of_speech_pair_dict = {'DET_DET': 0, 'DET_ADJ': 1, 'DET_PROPN': 2, 'DET_NOUN': 3, 'DET_AUX': 4, 'DET_VERB': 5, 'DET_PUNCT': 6, 'DET_NUM': 7, 'DET_ADP': 8, 'DET_ADV': 9, 'DET_PART': 10, 'DET_SYM': 11, 'DET_PRON': 12, 'DET_SCONJ': 13, 'DET_X': 14, 'DET_CCONJ': 15, 'DET_INTJ': 16, 'DET_SPECIAL': 17, 'ADJ_DET': 18, 'ADJ_ADJ': 19, 'ADJ_PROPN': 20, 'ADJ_NOUN': 21, 'ADJ_AUX': 22, 'ADJ_VERB': 23, 'ADJ_PUNCT': 24, 'ADJ_NUM': 25, 'ADJ_ADP': 26, 'ADJ_ADV': 27, 'ADJ_PART': 28, 'ADJ_SYM': 29, 'ADJ_PRON': 30, 'ADJ_SCONJ': 31, 'ADJ_X': 32, 'ADJ_CCONJ': 33, 'ADJ_INTJ': 34, 'ADJ_SPECIAL': 35, 'PROPN_DET': 36, 'PROPN_ADJ': 37, 'PROPN_PROPN': 38, 'PROPN_NOUN': 39, 'PROPN_AUX': 40, 'PROPN_VERB': 41, 'PROPN_PUNCT': 42, 'PROPN_NUM': 43, 'PROPN_ADP': 44, 'PROPN_ADV': 45, 'PROPN_PART': 46, 'PROPN_SYM': 47, 'PROPN_PRON': 48, 'PROPN_SCONJ': 49, 'PROPN_X': 50, 'PROPN_CCONJ': 51, 'PROPN_INTJ': 52, 'PROPN_SPECIAL': 53, 'NOUN_DET': 54, 'NOUN_ADJ': 55, 'NOUN_PROPN': 56, 'NOUN_NOUN': 57, 'NOUN_AUX': 58, 'NOUN_VERB': 59, 'NOUN_PUNCT': 60, 'NOUN_NUM': 61, 'NOUN_ADP': 62, 'NOUN_ADV': 63, 'NOUN_PART': 64, 'NOUN_SYM': 65, 'NOUN_PRON': 66, 'NOUN_SCONJ': 67, 'NOUN_X': 68, 'NOUN_CCONJ': 69, 'NOUN_INTJ': 70, 'NOUN_SPECIAL': 71, 'AUX_DET': 72, 'AUX_ADJ': 73, 'AUX_PROPN': 74, 'AUX_NOUN': 75, 'AUX_AUX': 76, 'AUX_VERB': 77, 'AUX_PUNCT': 78, 'AUX_NUM': 79, 'AUX_ADP': 80, 'AUX_ADV': 81, 'AUX_PART': 82, 'AUX_SYM': 83, 'AUX_PRON': 84, 'AUX_SCONJ': 85, 'AUX_X': 86, 'AUX_CCONJ': 87, 'AUX_INTJ': 88, 'AUX_SPECIAL': 89, 'VERB_DET': 90, 'VERB_ADJ': 91, 'VERB_PROPN': 92, 'VERB_NOUN': 93, 'VERB_AUX': 94, 'VERB_VERB': 95, 'VERB_PUNCT': 96, 'VERB_NUM': 97, 'VERB_ADP': 98, 'VERB_ADV': 99, 'VERB_PART': 100, 'VERB_SYM': 101, 'VERB_PRON': 102, 'VERB_SCONJ': 103, 'VERB_X': 104, 'VERB_CCONJ': 105, 'VERB_INTJ': 106, 'VERB_SPECIAL': 107, 'PUNCT_DET': 108, 'PUNCT_ADJ': 109, 'PUNCT_PROPN': 110, 'PUNCT_NOUN': 111, 'PUNCT_AUX': 112, 'PUNCT_VERB': 113, 'PUNCT_PUNCT': 114, 'PUNCT_NUM': 115, 'PUNCT_ADP': 116, 'PUNCT_ADV': 117, 
#'PUNCT_PART': 118, 'PUNCT_SYM': 119, 'PUNCT_PRON': 120, 'PUNCT_SCONJ': 121, 'PUNCT_X': 122, 'PUNCT_CCONJ': 123, 'PUNCT_INTJ': 124, 'PUNCT_SPECIAL': 125, 'NUM_DET': 126, 'NUM_ADJ': 127, 'NUM_PROPN': 128, 'NUM_NOUN': 129, 'NUM_AUX': 130, 'NUM_VERB': 131, 'NUM_PUNCT': 132, 'NUM_NUM': 133, 'NUM_ADP': 134, 'NUM_ADV': 135, 'NUM_PART': 136, 'NUM_SYM': 137, 'NUM_PRON': 138, 'NUM_SCONJ': 139, 'NUM_X': 140, 'NUM_CCONJ': 141, 'NUM_INTJ': 142, 'NUM_SPECIAL': 143, 'ADP_DET': 144, 'ADP_ADJ': 145, 'ADP_PROPN': 146, 'ADP_NOUN': 147, 'ADP_AUX': 148, 'ADP_VERB': 149, 'ADP_PUNCT': 150, 'ADP_NUM': 151, 'ADP_ADP': 152, 'ADP_ADV': 153, 'ADP_PART': 154, 'ADP_SYM': 155, 'ADP_PRON': 156, 'ADP_SCONJ': 157, 'ADP_X': 158, 'ADP_CCONJ': 159, 'ADP_INTJ': 160, 'ADP_SPECIAL': 161, 'ADV_DET': 162, 'ADV_ADJ': 163, 'ADV_PROPN': 164, 'ADV_NOUN': 165, 'ADV_AUX': 166, 'ADV_VERB': 167, 'ADV_PUNCT': 168, 'ADV_NUM': 169, 'ADV_ADP': 170, 'ADV_ADV': 171, 'ADV_PART': 172, 'ADV_SYM': 173, 'ADV_PRON': 174, 'ADV_SCONJ': 175, 'ADV_X': 176, 'ADV_CCONJ': 177, 'ADV_INTJ': 178, 'ADV_SPECIAL': 179, 'PART_DET': 180, 'PART_ADJ': 181, 'PART_PROPN': 182, 'PART_NOUN': 183, 'PART_AUX': 184, 'PART_VERB': 185, 'PART_PUNCT': 186, 'PART_NUM': 187, 'PART_ADP': 188, 'PART_ADV': 189, 'PART_PART': 190, 'PART_SYM': 191, 'PART_PRON': 192, 'PART_SCONJ': 193, 'PART_X': 194, 'PART_CCONJ': 195, 'PART_INTJ': 196, 'PART_SPECIAL': 197, 'SYM_DET': 198, 'SYM_ADJ': 199, 'SYM_PROPN': 200, 'SYM_NOUN': 201, 'SYM_AUX': 202, 'SYM_VERB': 203, 'SYM_PUNCT': 204, 'SYM_NUM': 205, 'SYM_ADP': 206, 'SYM_ADV': 207, 'SYM_PART': 208, 'SYM_SYM': 209, 'SYM_PRON': 210, 'SYM_SCONJ': 211, 'SYM_X': 212, 'SYM_CCONJ': 213, 'SYM_INTJ': 214, 'SYM_SPECIAL': 215, 'PRON_DET': 216, 'PRON_ADJ': 217, 'PRON_PROPN': 218, 'PRON_NOUN': 219, 'PRON_AUX': 220, 'PRON_VERB': 221, 'PRON_PUNCT': 222, 'PRON_NUM': 223, 'PRON_ADP': 224, 'PRON_ADV': 225, 'PRON_PART': 226, 'PRON_SYM': 227, 'PRON_PRON': 228, 'PRON_SCONJ': 229, 'PRON_X': 230, 'PRON_CCONJ': 231, 'PRON_INTJ': 232,
#'PRON_SPECIAL': 233, 'SCONJ_DET': 234, 'SCONJ_ADJ': 235, 'SCONJ_PROPN': 236, 'SCONJ_NOUN': 237, 'SCONJ_AUX': 238, 'SCONJ_VERB': 239, 'SCONJ_PUNCT': 240, 'SCONJ_NUM': 241, 'SCONJ_ADP': 242, 'SCONJ_ADV': 243, 'SCONJ_PART': 244, 'SCONJ_SYM': 245, 'SCONJ_PRON': 246, 'SCONJ_SCONJ': 247, 'SCONJ_X': 248, 'SCONJ_CCONJ': 249, 'SCONJ_INTJ': 250, 'SCONJ_SPECIAL': 251, 'X_DET': 252, 'X_ADJ': 253, 'X_PROPN': 254, 'X_NOUN': 255, 'X_AUX': 256, 'X_VERB': 257, 'X_PUNCT': 258, 'X_NUM': 259, 'X_ADP': 260, 'X_ADV': 261, 'X_PART': 262, 'X_SYM': 263, 'X_PRON': 264, 'X_SCONJ': 265, 'X_X': 266, 'X_CCONJ': 267, 'X_INTJ': 268, 'X_SPECIAL': 269, 'CCONJ_DET': 270, 'CCONJ_ADJ': 271, 'CCONJ_PROPN': 272, 'CCONJ_NOUN': 273, 'CCONJ_AUX': 274, 'CCONJ_VERB': 275, 'CCONJ_PUNCT': 276, 'CCONJ_NUM': 277, 'CCONJ_ADP': 278, 'CCONJ_ADV': 279, 'CCONJ_PART': 280, 'CCONJ_SYM': 281, 'CCONJ_PRON': 282, 'CCONJ_SCONJ': 283, 'CCONJ_X': 284, 'CCONJ_CCONJ': 285, 'CCONJ_INTJ': 286, 'CCONJ_SPECIAL': 287, 'INTJ_DET': 288, 'INTJ_ADJ': 289, 'INTJ_PROPN': 290, 'INTJ_NOUN': 291, 'INTJ_AUX': 292, 'INTJ_VERB': 293, 'INTJ_PUNCT': 294, 'INTJ_NUM': 295, 'INTJ_ADP': 296, 'INTJ_ADV': 297, 'INTJ_PART': 298, 'INTJ_SYM': 299, 'INTJ_PRON': 300, 'INTJ_SCONJ': 301, 'INTJ_X': 302, 'INTJ_CCONJ': 303, 'INTJ_INTJ': 304, 'INTJ_SPECIAL': 305, 'SPECIAL_DET': 306, 'SPECIAL_ADJ': 307, 'SPECIAL_PROPN': 308, 'SPECIAL_NOUN': 309, 'SPECIAL_AUX': 310, 'SPECIAL_VERB': 311, 'SPECIAL_PUNCT': 312, 'SPECIAL_NUM': 313, 'SPECIAL_ADP': 314, 'SPECIAL_ADV': 315, 'SPECIAL_PART': 316, 'SPECIAL_SYM': 317, 'SPECIAL_PRON': 318, 'SPECIAL_SCONJ': 319, 'SPECIAL_X': 320, 'SPECIAL_CCONJ': 321, 'SPECIAL_INTJ': 322, 'SPECIAL_SPECIAL': 323}
# Integer id for every part-of-speech tag, numbered in listing order (0..17).
# 'SPECIAL' is the extra tag this script places in the first and last slot of
# each annotated sequence.
part_of_speech_dict = {
    tag: index
    for index, tag in enumerate((
        'DET', 'ADJ', 'PROPN', 'NOUN', 'AUX', 'VERB',
        'PUNCT', 'NUM', 'ADP', 'ADV', 'PART', 'SYM',
        'PRON', 'SCONJ', 'X', 'CCONJ', 'INTJ', 'SPECIAL',
    ))
}


# Load the English spaCy pipeline. The parser and NER components are disabled;
# the only annotation this script reads from the resulting Doc is each
# token's coarse POS tag (`pos_`).
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Hugging Face tokenizer loaded from a local bert-base-uncased snapshot
# directory. Below it is used only to count how many pieces each word is
# encoded into, so word-level tags can be repeated per piece.
tokenizer = AutoTokenizer.from_pretrained("../pretrained_weight/models--bert-base-uncased/snapshots/1dbc166cf8765166998eff31ade2eb64c8a40076")
 
# A tokenizer that never splits inside a word: text is cut on single spaces only.
class WhitespaceTokenizer:
    """Callable tokenizer that turns space-separated text into a spaCy Doc."""

    def __init__(self, vocab):
        # Vocabulary passed to every Doc this tokenizer constructs.
        self.vocab = vocab

    def __call__(self, text):
        """Return a Doc with one token per piece of ``text`` between single spaces."""
        return spacy.tokens.Doc(self.vocab, words=text.split(' '))
    
# Replace spaCy's default tokenizer so the pipeline receives exactly one
# token per space-separated word of the input sentence.
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)

# Dataset splits to annotate; each one is read from
# ./data/text_data/<split>.json and written to ./data/text_data/<split>_bert.json.
# Named `splits` (not `type`) so the builtin `type` is not shadowed.
splits = ['train', 'val', 'test']

for split in splits:
    # Context managers make sure the file handles are closed deterministically.
    with open(f"./data/text_data/{split}.json", "r", encoding="utf-8") as src:
        entry_list = json.load(src)

    for entry in track(entry_list):
        tokens = entry['token']
        doc = nlp(' '.join(tokens))

        # doc[idx] is used below as the tag source for tokens[idx], so the
        # word count must match. Raised explicitly (instead of `assert`) so the
        # check still runs under `python -O`.
        word_count = len(doc.text.split())
        if len(tokens) != word_count:
            raise ValueError(
                f"token/Doc length mismatch in {split!r} split: "
                f"{len(tokens)} tokens vs {word_count} words"
            )

        # The sequences start and end with a 'SPECIAL' slot (positions 0 and
        # len(tokens) + 1) -- presumably matching the special tokens the BERT
        # tokenizer adds around a sentence; TODO confirm against the consumer.
        part_of_speech = ['SPECIAL']
        position = [0]
        for idx, token in enumerate(tokens):
            # A word may be encoded into several pieces; repeat its POS tag
            # and its 1-based word index once per piece.
            piece_count = len(tokenizer.encode(token, add_special_tokens=False))
            part_of_speech.extend([doc[idx].pos_] * piece_count)
            position.extend([idx + 1] * piece_count)
        part_of_speech.append('SPECIAL')
        position.append(len(tokens) + 1)

        # Store tag ids (not tag names) alongside the per-piece word positions.
        entry["part_of_speech_single"] = [
            part_of_speech_dict[tag] for tag in part_of_speech
        ]
        entry["position_single"] = position

    with open(f"./data/text_data/{split}_bert.json", "w", encoding="utf-8") as dst:
        json.dump(entry_list, dst)
    
