import os.path
import sys
import h5py
import math
import gc
import numpy as np
#from numba import cuda
import matplotlib.pyplot as plt
import argparse
#from tensorflow.python.keras.layers import Lambda
#from sklearn.model_selection import train_test_split
#K-center: https://github.com/google/active-learning/blob/master/sampling_methods/kcenter_greedy.py
# Trace and metadata parameters
from pathlib import Path
from sklearn.cluster import KMeans
#from sklearn_extra.cluster import KMedoid
import copy
import time
from tqdm import tqdm as tqdm
from scipy.spatial import distance
import pandas as pd

# Subset sizes to evaluate: 20000, 30000, ..., 80000.
num_samples = np.arange(20000, 90000, 10000)
print(num_samples)

# Only the 'label' array is read from the archive (the 'data' array is not
# needed for this analysis). The archive is opened in a context manager so its
# file handle is closed as soon as the labels are in memory.
with np.load('data.npz') as data:
    labels = data['label'][:200000]

# Precomputed selection orders (index arrays into `labels`); the meaning of the
# two alpha settings is defined by whatever produced these files.
minmax_full_ids = np.load('200k_200_Norm_1/minmax_balance_alpha_0.5_160000_index.npy')
minmax_ids = np.load('200k_200_Norm_1/minmax_balance_alpha_1.0_160000_index.npy')

def _label_balance_stats(subset_labels):
    """Return ``(max_count / min_count, n_classes)`` for an array of labels.

    ``max_count`` and ``min_count`` are the largest and smallest number of
    occurrences of any label present in ``subset_labels``; ``n_classes`` is the
    number of distinct labels present.
    """
    classes, counts = np.unique(subset_labels, return_counts=True)
    return np.max(counts) / np.min(counts), len(classes)


# Per-sample-size results, one entry per value in `num_samples`.
all_minmax_ratio = []
all_mm_ratio = []
all_rnd_ratio = []
all_minmax_count = []
all_rnd_count = []
all_mm_count = []

# Draw random baselines from the labels that were actually loaded instead of a
# hard-coded 200000, so the indices can never run past the end of `labels`.
population_size = len(labels)

for num_sample in num_samples:
    random_ids = np.random.choice(population_size, num_sample, replace=False)

    # Take the first `num_sample` entries of each precomputed ordering.
    # The full index arrays must stay intact across iterations: rebinding
    # `minmax_ids` to its own prefix would permanently truncate it to the
    # smallest sample size, and indexing with the unsliced `minmax_full_ids`
    # would make every row report the same statistics.
    minmax_bal_ids = minmax_full_ids[:num_sample]
    mm_subset_ids = minmax_ids[:num_sample]

    rnd_ratio, rnd_classes = _label_balance_stats(labels[random_ids])
    mmbal_ratio, mmbal_classes = _label_balance_stats(labels[minmax_bal_ids])
    mm_ratio, mm_classes = _label_balance_stats(labels[mm_subset_ids])

    all_minmax_ratio.append(mmbal_ratio)
    all_rnd_ratio.append(rnd_ratio)
    all_mm_ratio.append(mm_ratio)
    all_minmax_count.append(mmbal_classes)
    all_rnd_count.append(rnd_classes)
    all_mm_count.append(mm_classes)

# Collect the per-sample-size series into one table; the key order below fixes
# the column order of the exported CSV.
res = {
    'all_minmax_ratio': all_minmax_ratio,
    'all_rnd_ratio': all_rnd_ratio,
    'all_minmax_count': all_minmax_count,
    'all_rnd_count': all_rnd_count,
    'all_mm_ratio': all_mm_ratio,
    'all_mm_count': all_mm_count,
}

# One row per evaluated sample size, written with the default integer index.
df = pd.DataFrame(res)
df.to_csv('analysis_distribution.csv')

