import pickle
import numpy as np

# num_samples caps how many sequences are kept; N is the number of leading
# items taken from each kept sequence.
num_samples = 160_000
N = 512

# NOTE: unpickling can execute arbitrary code, so this file must come from a
# trusted source.
with open("list_of_tagged_sequence.pkl", mode="rb") as pkl_file:
    list_of_tagged_sequence = pickle.load(pkl_file)

# Tag vocabulary; each tag's index in this list is used as its integer id.
POS_list = [
    'ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET',
    'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN',
    'PUNCT', 'SCONJ', 'SPACE', 'SYM', 'VERB', 'X',
]
# Reverse lookup: tag string -> position in POS_list.
POS2id = dict(zip(POS_list, range(len(POS_list))))

# Encode the first N tags of every sequence that has at least N items as
# integer ids, stopping as soon as num_samples rows have been gathered.
data_id = []
for tagged_sequence in list_of_tagged_sequence:
    if len(data_id) == num_samples:
        break
    if len(tagged_sequence) >= N:
        # Each item unpacks as a pair; only its second element (the tag) is used.
        data_id.append([POS2id[tag] for _token, tag in tagged_sequence[:N]])
# Every kept row has exactly N entries, so this stacks into a 2-D int32 array
# whenever at least one sequence qualified.
data_id = np.asarray(data_id, dtype=np.int32)

# Expand each integer id into a boolean one-hot row by indexing an identity
# matrix with the id array; the result gains a trailing axis whose length is
# the number of tags. The width is derived from POS_list instead of a literal
# 18 so the lookup table cannot drift out of sync with the tag vocabulary.
data_one_hot = np.identity(len(POS_list), dtype=bool)[data_id]
# Average over axis 0 (the sample axis), giving one value per position.
# NOTE(review): this averages the raw integer tag ids, not data_one_hot. If
# per-position tag frequencies were intended, the mean should probably be
# taken over data_one_hot instead -- confirm before changing.
dynamics = np.mean(data_id, axis=0)