import json
import os
import numpy as np
from sklearn.cluster import KMeans

# Number of clusters requested from KMeans further down in the script.
num_clusters = 10

# Directory holding the saved keypoint artefacts for the paper being analysed;
# "keypoint_embeddings.npy" and "keypoint_metadata.json" are read from here.
paper_directory = "/saved_keypoints/acl_2017/test/355/"

# Load the saved keypoint embeddings and their companion metadata list.
# Later in the script, entry i of the metadata is paired with row i of the
# embeddings, so the two files are expected to be in the same order.
keypoint_embeddings = np.load(os.path.join(paper_directory, "keypoint_embeddings.npy"))

# Open with an explicit UTF-8 encoding: without it, open() falls back to the
# platform's locale encoding, which can mis-decode non-ASCII keypoint text.
with open(os.path.join(paper_directory, "keypoint_metadata.json"), "r", encoding="utf-8") as f:
	keypoint_metadata = json.load(f)

# Select the keypoints (and their embeddings) that take part in clustering.
#
# Level 1 and level 2 keypoints are always retained (the original author notes
# that there is only one level 1 and one level 2 review per paper).
# For levels 3, 4 and 5 only a single review per level is retained, to avoid
# duplicates, and each of those comes from a different reviewer, so that
# keypoints from different levels do not land in similar clusters merely
# because they draw on the same reviewer's content.
retained_reviewer_by_level = {
	"level3": 1,
	"level4": 2,
	"level5": 3,
}

# Metadata entry i is paired with embedding row i below; fail early with a
# clear message if the two inputs do not line up.
if len(keypoint_metadata) != len(keypoint_embeddings):
	raise ValueError(
		f"keypoint_metadata has {len(keypoint_metadata)} entries but "
		f"keypoint_embeddings has {len(keypoint_embeddings)} rows"
	)

X_embeddings = []
X_metadata = []

for idx, entry in enumerate(keypoint_metadata):
	# Each metadata entry is a (level, reviewer_id, keypoint) triple.
	level, reviewer_id, _keypoint = entry

	# Levels absent from the table (level 1 and 2) are kept unconditionally;
	# reviewer_id is only converted to int for levels that are filtered.
	retained_reviewer = retained_reviewer_by_level.get(level)
	if retained_reviewer is not None and int(reviewer_id) != retained_reviewer:
		continue

	X_metadata.append(entry)
	X_embeddings.append(keypoint_embeddings[idx])

# Turn the list of retained embedding rows into a single NumPy array.
X_embeddings = np.array(X_embeddings)

# Sanity check: the array shape, then the number of retained metadata entries
# (one per line). Both lists were filled in lockstep, so the leading dimension
# of the shape should equal the count printed below it.
print(X_embeddings.shape, len(X_metadata), sep="\n")
# print(json.dumps(X_metadata, indent=4))

# Cluster the retained embeddings. random_state=0 fixes the seed so repeated
# runs produce the same assignment; n_init="auto" leaves the number of
# initialisations to scikit-learn.
kmeans = KMeans(n_clusters=num_clusters, n_init="auto", random_state=0)
kmeans.fit(X_embeddings)

# Two views of the same clustering result:
#   assigned_clusters: level name -> cluster ids assigned to that level's keypoints
#   cluster_keypoints: cluster id -> level names of the keypoints in that cluster
#                      (despite the name, it stores levels, not keypoint text)
assigned_clusters = {f"level{level_number}": [] for level_number in range(1, 6)}
cluster_keypoints = {cluster_id: [] for cluster_id in range(num_clusters)}

for (level, _reviewer_id, _keypoint), label in zip(X_metadata, kmeans.labels_):
	# Convert the NumPy integer label to a plain Python int once, so it
	# prints cleanly and matches the integer keys of cluster_keypoints.
	cluster_id = int(label)
	assigned_clusters[level].append(cluster_id)
	cluster_keypoints[cluster_id].append(level)

def _print_mapping(mapping):
	"""Print every entry of *mapping* as ``key: value`` on its own line."""
	for name, members in mapping.items():
		print(f"{name}: {members}")


# Report the cluster ids per level, then (after a blank separator line)
# the levels present in each cluster.
_print_mapping(assigned_clusters)
print()
_print_mapping(cluster_keypoints)