import numpy as np 
from code import *
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances

# Experiment script: on a sample of the "phy" dataset, compare the cost of
# (a) noisily corrupted reference labels and (b) algo1 run on those labels,
# both reported relative to an averaged k_means_pp baseline cost.
# k_means_pp, kmeans_cost_label, k_means_cost, noisy_oracle and algo1 are
# expected to come from the star-import at the top of the file.

# load data
print('loading data')
phy_data = np.loadtxt("datasets/phy.dat")
# In-place row shuffle; no seed is set, so the sample differs between runs.
np.random.shuffle(phy_data)
max_test_points = 10000
phy_test = phy_data[-max_test_points:, :]
# The slice silently yields fewer rows when the file is shorter than
# max_test_points, so every later size argument uses the real row count
# instead of repeating the literal.
n_points = phy_test.shape[0]
# Pairwise Euclidean distances between sample rows, squared element-wise.
distances_phy_test = euclidean_distances(phy_test, phy_test)**2
print('data loaded')

print('getting k means ++ baseline')
k = 10
# NOTE(review): 80 is forwarded unchanged as the third argument of
# kmeans_cost_label; its meaning is not visible in this file -- confirm
# against the helper's definition.
cost_label_arg = 80
# get k means ++ baseline
# NOTE(review): presumably k_means_pp takes (data, k, squared distances,
# number of points) -- confirm against its definition.
baseline_trials = 20
baseline_weight = 1.0 / baseline_trials  # equal weights, i.e. a running mean
baseline_phy_10 = 0.0
for _ in range(baseline_trials):
    pp_result = k_means_pp(phy_test, k, distances_phy_test, n_points)
    baseline_phy_10 += baseline_weight * kmeans_cost_label(phy_test, pp_result, cost_label_arg, k)[1]

# Reference ("true") labels: scikit-learn KMeans fitted on the same sample
# (its default initialisation is k-means++).
kmeans_scikit_phy_10 = KMeans(n_clusters=k).fit(phy_test)
true_labels_phy_10 = kmeans_scikit_phy_10.labels_

# The message below is a fixed string: keep it in sync with err and
# oracle_trials if either is changed.
print('noisy oracle: 50 perct corruption (avg over 10 trials)')
# noisy perturb true labels by 50%
err = 50.0
corruption_rate = err/100.0
oracle_trials = 10
pvals = np.linspace(.01, .15, 15)
cost_oracle = []
cost_algo = []
for _ in range(oracle_trials):
    # One noisy_oracle call per point index. The index is named idx so it
    # no longer shadows the cluster count k.
    noisy_orc_labels = np.array([
        noisy_oracle(true_labels_phy_10, idx, k, corruption_rate)
        for idx in range(n_points)
    ])
    cost_oracle.append(kmeans_cost_label(phy_test, noisy_orc_labels, cost_label_arg, k)[1])

    # Keep only the best (lowest) cost over the swept values of p.
    lowest = float('inf')
    for p in pvals:
        curr_cost = k_means_cost(phy_test, algo1(phy_test, noisy_orc_labels, k, p))[1]
        lowest = min(lowest, curr_cost)
    cost_algo.append(lowest)

# Costs are normalised by the baseline, so the baseline itself prints as 1.0.
print('k means ++', 1.0, 'Noisy predictor:', np.average(cost_oracle)/baseline_phy_10, 'Our Algo:', np.average(cost_algo)/ baseline_phy_10)
