import numpy as np
from sklearn.cluster import AgglomerativeClustering
from collections import defaultdict
from sklearn.decomposition import PCA
from sklearn import manifold
import time
import codecs
import argparse
import dill as pickle

# Command-line interface. Every option except --delimeter is mandatory:
# the script dereferences each of them unconditionally below, so a missing
# one is reported by argparse up front instead of surfacing later as an
# AttributeError/TypeError (possibly after an expensive clustering run).
parser = argparse.ArgumentParser()
parser.add_argument("--vocab-file", required=True, help="numpy input vocabaulary file")
parser.add_argument("--point-file", required=True, help="numpy input vocabulary representations")
parser.add_argument("--output-path", required=True, help="output path clustering model and result files")
parser.add_argument("--cluster", required=True, help="cluster numbers comma separated (e.g. 5,10,15)")
parser.add_argument("--delimeter", default='|||', help="delimeter (default: '|||')")

args = parser.parse_args()

vocab_file = args.vocab_file
point_file = args.point_file
outputpath = args.output_path

# One clustering run is performed per requested cluster count.
Ks = [int(k) for k in args.cluster.split(',')]

print ("Run for the following cluster sizes", Ks)

# vocab[i] is written out next to the cluster label of points[i] further
# down, so the two arrays are expected to be index-aligned.
vocab = np.load(vocab_file)
points = np.load(point_file)
			
# For every requested cluster count: fit an agglomerative clustering on
# `points`, pickle the fitted model, and write one "<word><delimeter><label>"
# line per vocabulary entry to a text file in `outputpath`.
for K in Ks:
    starttime = time.time()
    print("Perform "+str(K)+" Clustering!")
    clustering = AgglomerativeClustering(n_clusters=K,compute_distances=True).fit(points)

    # Persist the fitted model so it can be reloaded without re-clustering.
    fn = f"{outputpath}/model-{K}-agglomerative-clustering.pkl"
    with open(fn, "wb") as fp:
        pickle.dump(clustering,fp)

    # Group vocabulary entries by assigned label; labels_[i] belongs to the
    # i-th row of `points`, which is paired with vocab[i].
    clusters = defaultdict(list)
    for i,label in enumerate(clustering.labels_):
        clusters[label].append(vocab[i])

    print("Write "+str(K)+" Clusters!")
    # The context manager closes the file even if a write fails, and the
    # explicit UTF-8 encoding keeps non-ASCII vocabulary entries from
    # depending on (or failing under) the platform's locale encoding.
    with open(f"{outputpath}/clusters-{K}.txt", "w", encoding="utf-8") as target:
        for key, words in clusters.items():
            for word in words:
                target.write(word+args.delimeter+str(key)+"\n")

    endtime = time.time()
    diff = endtime-starttime
    print(str(K)+": Time-taken: "+str(diff)+" sec")
