import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the item table and derive two lookups from it:
#   label_to_id: value of the `label` column -> the row's index in `gt`
#   id_to_age:   the row's index in `gt`     -> value of the `score` column
gt = pd.read_csv("imdb-age/gt.csv")
label_to_id = dict(zip(gt["label"], gt.index))
id_to_age = dict(zip(gt.index, gt["score"]))

# KK is the number of distinct labels; W is a KK x KK float32 matrix of zeros
# that is indexed by the ids stored in label_to_id.
KK = len(label_to_id)
print("items", KK)
W = np.zeros((KK, KK), dtype=np.float32)

# Tally the crowd votes.
#
# Each row of crowd_labels.csv names a `left` item, a `right` item and the
# `label` the annotator picked.  For every usable row:
#   * W[a, b] is incremented, where a is the picked item and b the other one;
#   * (i, j, res) is appended to `data`, with i/j the ids of left/right and
#     res = 1 when the left item was picked, 0 when the right one was.
# Rows whose two items have the same value in id_to_age are skipped and only
# counted in `discard`; `used` counts the rows that were tallied.
df = pd.read_csv("imdb-age/crowd_labels.csv")
data = []
discard = 0
used = 0
for row in df.itertuples():
    i = label_to_id[row.left]
    j = label_to_id[row.right]
    i_s = id_to_age[i]
    j_s = id_to_age[j]
    if i_s == j_s:
        discard += 1
        continue

    used += 1
    winner = label_to_id[row.label]
    if winner == i:
        res = 1
        W[i, j] += 1
    elif winner == j:
        res = 0
        W[j, i] += 1
    else:
        # Raise explicitly rather than `assert False`: asserts are stripped
        # under `python -O`, which would let a stale `res` from the previous
        # row be appended silently.
        raise ValueError(
            f"crowd label row {row.Index}: picked label {row.label!r} is "
            f"neither the left item ({row.left!r}) nor the right item "
            f"({row.right!r})"
        )
    data.append((i, j, res))

# Empirical win fraction for each ordered pair: pp[a, b] is W[a, b] divided by
# the total number of tallied a-vs-b votes (W[a, b] + W[b, a]).  The small
# epsilon keeps the division defined for pairs with no votes; those entries
# come out as 0.
pp = W / (W + W.T + 1e-6)

# Per-item score: the sum of that item's win fractions across its row of pp.
B = np.sum(pp, axis=1)
# Debug aid (note that B.sort() sorts B in place):
# B.sort()
# plt.hist(B, bins=100)
# plt.show()

rng = np.random.default_rng(2383)  # fixed seed keeps the selection reproducible
D = 6
K = 2 ** (D + 1)
# Draw K row indices uniformly, with replacement, from the valid range
# 0..KK-1.  Generator.integers excludes the upper bound, so every index is in
# range; a ceil(random * KK) formulation yields 1..KK instead, which can index
# one past the last row of pp and practically never selects row 0.
sel_idx = rng.integers(0, KK, size=K, dtype=np.int32)
# print(sel_idx)
p = pp[sel_idx, :]
print(p)
print(B)
# pass
