import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm import tqdm as tqdm

# Fixed sizes used by this script.
NUM_SAMPLES = 200000  # how many leading entries of data['label'] are read
NUM_CLUSTERS = 2000   # cluster ids 0 .. NUM_CLUSTERS - 1 are examined
NUM_CLASSES = 3329    # length of the per-label vector; labels index into it


def summarize_cluster(cluster_labels: np.ndarray,
                      num_classes: int = NUM_CLASSES) -> dict:
    """Compute label statistics for the members of a single cluster.

    Args:
        cluster_labels: Integer labels of the samples assigned to the cluster.
            Every label is used as an index into a vector of length
            ``num_classes``, so it must be smaller than ``num_classes``.
        num_classes: Length of the returned per-label fraction vector.

    Returns:
        A dict with the keys

        * ``'num_samples'``: number of samples in the cluster,
        * ``'num_labels'``: number of distinct labels in the cluster,
        * ``'label_fractions'``: float array of length ``num_classes`` holding
          the share of the cluster's samples carrying each label,
        * ``'imbalance_ratio'``: count of the most frequent label divided by
          the count of the least frequent label that occurs in the cluster,
        * ``'label_sample_ratio'``: ``num_labels / num_samples``.

        For an empty cluster both ratios are NaN and all fractions are 0.
    """
    cluster_labels = np.asarray(cluster_labels)
    num_samples = len(cluster_labels)
    label_fractions = np.zeros(num_classes)

    if num_samples == 0:
        # Nothing to count; avoid argmax/min on empty arrays and 0/0.
        return {
            'num_samples': 0,
            'num_labels': 0,
            'label_fractions': label_fractions,
            'imbalance_ratio': float('nan'),
            'label_sample_ratio': float('nan'),
        }

    present, counts = np.unique(cluster_labels, return_counts=True)
    label_fractions[present] = counts / num_samples

    # Only labels that actually occur take part in the ratio. Taking the
    # minimum over the full num_classes-long count vector would be 0 whenever
    # any label is absent, and the ratio would collapse to the max count.
    imbalance_ratio = counts.max() / counts.min()

    return {
        'num_samples': num_samples,
        'num_labels': len(present),
        'label_fractions': label_fractions,
        'imbalance_ratio': imbalance_ratio,
        'label_sample_ratio': len(present) / num_samples,
    }


def main() -> None:
    """Load cluster assignments and labels, then write per-cluster stats.

    Reads ``200k_2000cluster/clara_labels.npy`` (cluster id per sample) and
    the ``'label'`` array of ``data.npz``, prints a few diagnostics, and
    writes one row per cluster to ``medoids_stats.csv``.
    """
    medoid_labels = np.load('200k_2000cluster/clara_labels.npy')
    # NpzFile keeps the archive open; the context manager closes it once the
    # needed array has been materialised.
    with np.load('data.npz') as data:
        labels = data['label'][:NUM_SAMPLES]

    dominant_percentage = []
    im_ratio = []
    num_labels = []
    num_samples_per_cluster = []
    unique_ratio = []
    for cluster_id in tqdm(range(NUM_CLUSTERS)):
        member_idx = np.where(medoid_labels == cluster_id)[0]
        stats = summarize_cluster(labels[member_idx], NUM_CLASSES)
        # tqdm.write prints to stdout without breaking the progress bar.
        tqdm.write(str(stats['num_labels']))
        num_samples_per_cluster.append(stats['num_samples'])
        num_labels.append(stats['num_labels'])
        dominant_percentage.append(stats['label_fractions'])
        im_ratio.append(stats['imbalance_ratio'])
        unique_ratio.append(stats['label_sample_ratio'])

    # One row per cluster, one column per label.
    dominant_percentage = np.array(dominant_percentage)
    print(dominant_percentage.shape)
    print(np.max(dominant_percentage, axis=1))
    print(np.min(dominant_percentage, axis=1))

    df = pd.DataFrame({
        'Unique Num': num_labels,
        'Imb_ratio': im_ratio,
        'Num_samples': num_samples_per_cluster,
        'Label Sample Ratio': unique_ratio,
    })
    df.to_csv('medoids_stats.csv', index=False)


if __name__ == "__main__":
    main()