import pickle
import numpy as np

# Upper bound on the number of sequences kept, and the number of leading
# tokens taken from each kept sequence (shorter sequences are skipped).
num_samples = 160000
N = 512
# Tags to compare: data_a marks the positions tagged `a`, data_b marks the
# positions tagged `b`.
a = "PROPN"
b = "PROPN"

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load a pickle that comes from a trusted source.
with open("list_of_tagged_sequence.pkl", "rb") as f:
    list_of_tagged_sequence = pickle.load(f)

# Build one boolean indicator row per kept sequence: True where the token at
# that position carries the tag of interest.
data_a = []
data_b = []
for tagged_sequence in list_of_tagged_sequence:
    if len(data_a) == num_samples:
        break
    if len(tagged_sequence) < N:
        # Too short to supply N positions; skip rather than pad.
        continue
    # Extract the tags once and reuse them for both indicator rows.
    tags = [POS for _, POS in tagged_sequence[:N]]
    data_a.append([tag == a for tag in tags])
    data_b.append([tag == b for tag in tags])

if not data_a:
    # Without this check np.array([]) would be 1-D and later code indexing
    # the position axis would fail with an unhelpful IndexError.
    raise ValueError(
        f"no tagged sequence with at least {N} tokens was selected "
        f"(num_samples={num_samples})"
    )

# Shape: (number of kept sequences, N).
data_a = np.array(data_a, dtype=bool)
data_b = np.array(data_b, dtype=bool)

# The statistics below need at least one sample and a (samples, positions)
# layout; fail early with a clear message instead of an IndexError or NaNs.
if data_a.ndim != 2 or data_a.shape[0] == 0:
    raise ValueError(
        "data_a and data_b must be non-empty 2-D (samples, positions) arrays"
    )

# Per-position frequency of each indicator across samples.
mean_da = np.mean(data_a, axis=0)
mean_db = np.mean(data_b, axis=0)

# sum_dadb[i, j] counts the samples in which data_a is True at position i
# and data_b is True at position j. This equals data_a.T @ data_b; it is
# accumulated chunk by chunk so only a small float copy of the boolean data
# exists at any time. The counts are integers, so float64 sums are exact.
sum_dadb = np.zeros((data_a.shape[1], data_b.shape[1]))
chunk_size = 4096
for start in range(0, data_a.shape[0], chunk_size):
    chunk_a = data_a[start:start + chunk_size].astype(np.float64)
    chunk_b = data_b[start:start + chunk_size].astype(np.float64)
    sum_dadb += chunk_a.T @ chunk_b
mean_dadb = sum_dadb / data_a.shape[0]

# E[a_i * b_j] - E[a_i] * E[b_j]: the (unnormalised) covariance between the
# indicator at position i and the indicator at position j.
correlation = mean_dadb - mean_da[:, np.newaxis] * mean_db[np.newaxis, :]