#!/usr/bin/env python
# coding: utf-8

# In[1]:


# Install the extra packages through IPython's %pip magic.  A bare
# "pip install ..." line only works through notebook automagic and is a
# syntax error once the notebook has been exported to a plain .py file.
get_ipython().run_line_magic('pip', 'install PIMS')


# In[3]:


get_ipython().run_line_magic('pip', 'install fastcluster')


# In[4]:


# IPython-only magic: draw matplotlib figures inline in the notebook output
get_ipython().run_line_magic('matplotlib', 'inline')

# importing relevant libraries
import warnings

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import sklearn.datasets
from fastcluster import single
from pynndescent import NNDescent
from scipy.cluster.hierarchy import cut_tree, fcluster, dendrogram
from scipy.spatial.distance import hamming
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve, auc#, plot_precision_recall_curve
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from tqdm import tqdm
from umap import UMAP
# suppress every warning for the rest of the session
warnings.filterwarnings('ignore')

# setting the matplotlib style sheet (nothing here changes whether figures are shown automatically)
plt.style.use('bmh')


# In[5]:


# Settings shared by every simulated dataset below.
ndim = 30              # total feature count: 2 informative coordinates + ndim-2 noise columns
noise = 0.1            # std-dev of the gaussian jitter and of the padding noise columns
num_samples = 10_000   # number of points generated per dataset


# In[27]:


# circles: two noisy concentric rings in 2-D with binary labels, used as training data
X_circle, y_circle = sklearn.datasets.make_circles(n_samples=num_samples, shuffle=True, noise=noise, random_state=None, factor=0.5)
#X_circle, y_circle = sklearn.datasets.make_moons(n_samples=num_samples, shuffle=True, noise=noise, random_state=None)


# In[28]:


# Embed the two informative coordinates in ndim dimensions: append ndim-2
# gaussian noise features, then shuffle the column order.  The permutation is
# kept in `column_order` so the held-out circles generated further down can be
# laid out exactly like the training data.
X_circle_ndim = np.concatenate([X_circle, np.random.normal(0, noise, (num_samples, ndim - 2))], axis=1)
column_order = np.random.permutation(X_circle_ndim.shape[1])
X_circle_ndim = X_circle_ndim[:, column_order]


# In[8]:


X_circle_ndim.shape


# In[29]:


plt.rcParams['axes.facecolor'] = 'white'
plt.rcParams['figure.figsize'] = 6, 6

plt.figure(dpi=150)

# two-colour map for the binary labels: label 0 -> red, label 1 -> blue
colors = ['red','blue']
#fig = plt.figure(figsize=(6, 6))
plt.scatter(X_circle[:,0],X_circle[:,1], s=10, c=y_circle, cmap=matplotlib.colors.ListedColormap(colors))
plt.show()


# In[ ]:


X_train = X_circle_ndim
y = y_circle


# In[ ]:


# np.random.shuffle(y)


# In[ ]:


et = ExtraTreesClassifier(n_estimators=100, min_samples_leaf=1,
                          max_features="sqrt", bootstrap=True, class_weight='balanced', n_jobs=-1)

# et = RandomForestClassifier(n_estimators=100, min_samples_leaf=1,
#                           max_features="sqrt", bootstrap=True, class_weight='balanced', n_jobs=-1)


# validation instance
skf = StratifiedKFold(n_splits=5, shuffle=True)

# getting the model validation predictions
preds = cross_val_predict(et, X_train, y, cv=skf, method='predict_proba')

# evaluating the model
#print('Area under the ROC Curve:', roc_auc_score(y_moon, preds, multi_class='ovo'))
print('Area under the ROC Curve:', roc_auc_score(y, preds[:,1]))

# refit on the full training set; every later cell queries this fitted model
et.fit(X_train, y)


# In[ ]:


# leaf reached in each tree, one row per sample and one column per tree
leaves_train = et.apply(X_train)
print(leaves_train.shape)
print(leaves_train)

# Pairwise Hamming distance between leaf vectors = fraction of trees in which
# two samples land in different leaves.  Only a subset is scored to keep the
# distance matrix small; the training rows are already shuffled, so the first
# rows are a random subset.
n_subset = min(500, leaves_train.shape[0])
distances_train = squareform(pdist(leaves_train[:n_subset], metric='hamming'))

# per-sample score: mean distance to the other n_subset - 1 samples
# (the zero self-distance on the diagonal is excluded from the average)
score_train = distances_train.sum(axis=0) / (n_subset - 1)

print(np.mean(score_train))
# np.cov of a 1-D array is its sample variance
print(np.cov(score_train))


# In[30]:


# fresh draw from the same generator: the in-distribution test set
X_circle, y_circle = sklearn.datasets.make_circles(n_samples=num_samples, shuffle=True, noise=noise, random_state=None, factor=0.5)


# In[31]:


# Same padding as for the training data, and the SAME column order: drawing a
# new permutation here would put the circle coordinates into columns that the
# fitted model only ever saw as noise.
X_circle_ndim = np.concatenate([X_circle, np.random.normal(0, noise, (num_samples, ndim - 2))], axis=1)
X_circle_ndim = X_circle_ndim[:, column_order]


# In[39]:


# scatter plot of the most recently drawn circles, also written to simulated_circles.png
plt.rcParams.update({'axes.facecolor': 'white', 'figure.figsize': (6, 6)})
plt.figure(dpi=150)

colors = ['red', 'blue']  # label 0 -> red, label 1 -> blue
class_cmap = matplotlib.colors.ListedColormap(colors)
plt.scatter(X_circle[:, 0], X_circle[:, 1], s=10, c=y_circle, cmap=class_cmap)
plt.savefig('simulated_circles.png', bbox_inches='tight')
plt.show()


# In[ ]:


X_test_in = X_circle_ndim


# In[ ]:


# leaf reached in each tree, one row per sample and one column per tree
leaves_test_in = et.apply(X_test_in)
print(leaves_test_in.shape)
print(leaves_test_in)

# Pairwise Hamming distance between leaf vectors = fraction of trees in which
# two samples land in different leaves.  Only the first n_subset rows are
# scored to keep the distance matrix small.
n_subset = min(500, leaves_test_in.shape[0])
distances_test_in = squareform(pdist(leaves_test_in[:n_subset], metric='hamming'))

# per-sample score: mean distance to the other n_subset - 1 samples
# (the zero self-distance on the diagonal is excluded from the average)
score_test_in = distances_test_in.sum(axis=0) / (n_subset - 1)

print(np.mean(score_test_in))
# np.cov of a 1-D array is its sample variance
print(np.cov(score_test_in))


# In[ ]:





# In[40]:


# lines: two noisy horizontal lines, y = +0.5 (label 1) and y = -0.5 (label 0),
# each spanning x in [-1, 1]; the figure is also saved to simulated_lines.png

plt.rcParams.update({'axes.facecolor': 'white', 'figure.figsize': (6, 6)})
plt.figure(dpi=150)

half = int(num_samples / 2)  # points per line

# upper line
x1 = np.linspace(-1, 1, half)
y1 = 0.5 + np.random.normal(0, noise, (half, 1))
label1 = np.ones((half, 1))

# lower line
x2 = np.linspace(-1, 1, half)
y2 = -0.5 + np.random.normal(0, noise, (half, 1))
label2 = np.zeros((half, 1))

labels = np.concatenate([label1, label2], axis=0)
print(labels.shape)

# x stays 1-D here while y is a column vector; scatter accepts both
X1 = np.concatenate([x1, x2])
Y1 = np.concatenate([y1, y2])
print(X1.shape)
#X_lines = np.concatenate([X1,Y1])
#X_lines.shape
line_cmap = matplotlib.colors.ListedColormap(colors)
plt.scatter(X1, Y1, s=10, c=labels, cmap=line_cmap)
plt.savefig('simulated_lines.png', bbox_inches='tight')
plt.show()


# In[ ]:


# Turn both coordinate arrays into column vectors.  -1 lets numpy infer the
# row count: the lines hold 2 * (num_samples // 2) points, which is not
# num_samples when num_samples is odd.
X1 = np.reshape(X1, (-1, 1))
print(X1.shape)
Y1 = np.reshape(Y1, (-1, 1))
print(Y1.shape)
X_lines = np.concatenate([X1, Y1], axis=1)
print(X_lines.shape)


# In[ ]:


# pad with ndim-2 gaussian noise features, one row per generated point
X_lines_ndim = np.concatenate([X_lines, np.random.normal(0, noise, (X_lines.shape[0], ndim - 2))], axis=1)
# NOTE(review): the circle data has its columns shuffled after padding, but
# here the two informative coordinates stay in columns 0 and 1 -- confirm
# whether this set is meant to share the training column layout.


# In[ ]:


X_test_out = X_lines_ndim


# In[ ]:


# leaf reached in each tree, one row per sample and one column per tree
leaves_test_out = et.apply(X_test_out)
print(leaves_test_out.shape)
print(leaves_test_out)

# The rows are still in generation order (the whole upper line from left to
# right, then the lower line), so the first 500 rows would only cover the left
# end of one line.  Score a random subset of rows instead.
n_subset = min(500, leaves_test_out.shape[0])
subset_rows = np.random.choice(leaves_test_out.shape[0], size=n_subset, replace=False)

# pairwise Hamming distance between leaf vectors = fraction of trees in which
# two samples land in different leaves
distances_test_out = squareform(pdist(leaves_test_out[subset_rows], metric='hamming'))

# per-sample score: mean distance to the other n_subset - 1 samples
# (the zero self-distance on the diagonal is excluded from the average)
score_test_out_Lines = distances_test_out.sum(axis=0) / (n_subset - 1)

print(np.mean(score_test_out_Lines))
# np.cov of a 1-D array is its sample variance
print(np.cov(score_test_out_Lines))


# In[42]:


# squares: an outer square with edges at +/-1 (label 1) around an inner square
# with edges at +/-0.5 (label 0).  Each edge is evenly spaced along its length
# and jittered with gaussian noise across it.  Saved to simulated_squares.png.

plt.rcParams.update({'axes.facecolor': 'white', 'figure.figsize': (6, 6)})
plt.figure(dpi=150)

side = int(num_samples / 8)  # points per edge (2 squares x 4 edges)


def _along(half_width):
    """Column vector of `side` evenly spaced values in [-half_width, half_width]."""
    return np.reshape(np.linspace(-half_width, half_width, side), (side, 1))


def _jittered(level):
    """Column vector of `side` values scattered around `level`."""
    return level + np.random.normal(0, noise, (side, 1))


# outer square: top, bottom, right, left
x1, y1 = _along(1), _jittered(1)
x2, y2 = _along(1), _jittered(-1)
y3, x3 = _along(1), _jittered(1)
y4, x4 = _along(1), _jittered(-1)
label1 = np.ones((4 * side, 1))

X1 = np.concatenate([x1, x2, x3, x4])
Y1 = np.concatenate([y1, y2, y3, y4])
print(X1.shape)

# inner square: top, bottom, right, left
xx1, yy1 = _along(0.5), _jittered(0.5)
xx2, yy2 = _along(0.5), _jittered(-0.5)
yy3, xx3 = _along(0.5), _jittered(0.5)
yy4, xx4 = _along(0.5), _jittered(-0.5)
label2 = np.zeros((4 * side, 1))

labels = np.concatenate([label1, label2], axis=0)
print(labels.shape)

# outer points first, then inner points, matching the order of `labels`
X1 = np.concatenate([X1, xx1, xx2, xx3, xx4])
Y1 = np.concatenate([Y1, yy1, yy2, yy3, yy4])
print(X1.shape)

square_cmap = matplotlib.colors.ListedColormap(colors)
plt.scatter(X1, Y1, s=10, c=labels, cmap=square_cmap)
plt.savefig('simulated_squares.png', bbox_inches='tight')
plt.show()


# In[ ]:


# stack the x and y column vectors into a two-column array
X_squares = np.concatenate([X1, Y1], axis=1)
print(X_squares.shape)

# Pad with ndim-2 gaussian noise features.  The row count is taken from the
# data itself: the squares hold 8 * (num_samples // 8) points, which equals
# num_samples only when num_samples is divisible by 8.
X_squares_ndim = np.concatenate([X_squares, np.random.normal(0, noise, (X_squares.shape[0], ndim - 2))], axis=1)
print(X_squares_ndim.shape)
# NOTE(review): the circle data has its columns shuffled after padding, but
# here the two informative coordinates stay in columns 0 and 1 -- confirm
# whether this set is meant to share the training column layout.


# In[ ]:


X_test_out = X_squares_ndim


# In[ ]:


# leaf reached in each tree, one row per sample and one column per tree
leaves_test_out = et.apply(X_test_out)
print(leaves_test_out.shape)
print(leaves_test_out)

# The rows are still in generation order (edge by edge, outer square first),
# so the first 500 rows would only cover part of a single outer edge.  Score a
# random subset of rows instead.
n_subset = min(500, leaves_test_out.shape[0])
subset_rows = np.random.choice(leaves_test_out.shape[0], size=n_subset, replace=False)

# pairwise Hamming distance between leaf vectors = fraction of trees in which
# two samples land in different leaves
distances_test_out = squareform(pdist(leaves_test_out[subset_rows], metric='hamming'))

# per-sample score: mean distance to the other n_subset - 1 samples
# (the zero self-distance on the diagonal is excluded from the average)
score_test_out_Squares = distances_test_out.sum(axis=0) / (n_subset - 1)

print(np.mean(score_test_out_Squares))
# np.cov of a 1-D array is its sample variance
print(np.cov(score_test_out_Squares))


# # Results

# In[ ]:


# side-by-side box plots of the per-sample scores of the three test sets
my_dict = {
    'Circles': score_test_in,
    'Lines': score_test_out_Lines,
    'Squares': score_test_out_Squares,
}

plt.figure(figsize=(15, 10))
plt.boxplot(list(my_dict.values()), labels=list(my_dict.keys()))

plt.show()


# In[ ]:


# How well does the score separate the held-out circles (label 1) from each
# other test set (label 0)?  The label vectors are sized from the score arrays
# rather than hard-coded, so they stay aligned if the subset size changes.
score_pred_Lines = np.concatenate([score_test_in, score_test_out_Lines])
score_pred_Squares = np.concatenate([score_test_in, score_test_out_Squares])

in_labels = np.ones(len(score_test_in))
score_true = np.concatenate([in_labels, np.zeros(len(score_test_out_Lines))])
score_true_Squares = np.concatenate([in_labels, np.zeros(len(score_test_out_Squares))])

print(roc_auc_score(score_true, score_pred_Lines))
print(roc_auc_score(score_true_Squares, score_pred_Squares))


# In[ ]:





# In[ ]:


# mean and sample standard deviation of the scores, one line per test set
# (np.cov of a 1-D array is its sample variance, hence the square root)
for scores in (score_test_in, score_test_out_Lines, score_test_out_Squares):
    print(np.mean(scores), np.sqrt(np.cov(scores)))


# In[ ]:




