#!/usr/bin/env python
# coding: utf-8

# In[1]:


# Mount Google Drive into the Colab filesystem at /content/gdrive.
# NOTE(review): nothing under /content/gdrive is read or written in the visible part of this
# file — confirm the mount is still needed.
from google.colab import drive
drive.mount('/content/gdrive')


# In[2]:


import torchtext
import torch
import numpy as np
from torchtext.vocab import GloVe
from torchtext.data import get_tokenizer
import matplotlib.pyplot as plt

# Plot appearance: 'bmh' style sheet, white axes background, 9x6 default figure size
plt.style.use('bmh')
plt.rcParams.update({
    'axes.facecolor': 'white',
    'figure.figsize': (9, 6),
})


# In[3]:


# Install portalocker (>=2.0.0) into the kernel's environment.
# Routed through the %pip line magic: a bare `pip install ...` line is not valid Python in this
# exported script and, inside a notebook, only works while IPython's automagic is enabled.
get_ipython().run_line_magic('pip', "install 'portalocker>=2.0.0'")


# In[10]:


# GloVe word vectors: the '6B' set with 300 dimensions per token.
global_vectors = GloVe(name='6B', dim=300)

# torchtext's "basic_english" tokenizer; later cells use it to split each text before the
# GloVe lookup.
tokenizer = get_tokenizer("basic_english")

# def tokenize(label, line):
#     return line.split()

# tokens = []
# for label, line in train_iter:
#     tokens += tokenize(label, line)


# In[ ]:


# After running the previous cell for the first time, restart the kernel and run all


# In[11]:


# Load both IMDB splits; test_iter is consumed later for the in-distribution test sample.
train_iter, test_iter = torchtext.datasets.IMDB(split=('train', 'test'))

# One 300-d row and one label slot per training review.
# Train-set sizes for reference: IMDB 25000, AGNEWS 120000, Amazon 3000000, DBpedia 560000, Yahoo 1400000
embeddings_IMDB_train = np.zeros((25000, 300))
labels_IMDB_train = np.zeros(25000)

# Represent every review by the mean of its tokens' GloVe vectors.
i = 0
for label, line in train_iter:
    embedding = global_vectors.get_vecs_by_tokens(tokenizer(line), lower_case_backup=True)
    embedding_mean = embedding.mean(dim=0, keepdim=True)
    embeddings_IMDB_train[i, :] = embedding_mean
    labels_IMDB_train[i] = label
    i += 1


# In[13]:


# Show the label array filled while iterating over train_iter.
labels_IMDB_train


# In[14]:


# Loop counter left by the most recent pass; when run right after the training loop this is
# the number of items read from train_iter.
print(i)


# In[17]:


# In-distribution evaluation sample: mean GloVe embedding of the first 500 items yielded by
# test_iter (the IMDB test split when the cells are run in order).
embeddings_IMDB_test = np.zeros((500, 300))
labels_IMDB_test = np.zeros(500)

i = 0
for label, line in test_iter:
    if i >= 500:
        break
    embedding = global_vectors.get_vecs_by_tokens(tokenizer(line), lower_case_backup=True)
    embeddings_IMDB_test[i, :] = embedding.mean(dim=0, keepdim=True)
    labels_IMDB_test[i] = label
    i += 1


# In[18]:


# Feature matrix and targets for the tree model (aliases of the IMDB training arrays, not copies)
latent_train, y = embeddings_IMDB_train, labels_IMDB_train
print(latent_train.shape)
print(y.shape)


# In[19]:


# Shuffle labels and features with one shared permutation so the rows stay aligned.
# The permutation comes from a seeded generator so the shuffle (and therefore which samples end
# up first in the arrays) is reproducible after a kernel restart.
indices = np.random.default_rng(0).permutation(y.shape[0])

y = y[indices]
latent_train = latent_train[indices, :]
print(y)


# In[20]:


# Keep at most the first 25000 shuffled samples; lower the bound to train on a smaller subset.
y = y[:25000]
latent_train = latent_train[:25000,:]


# In[21]:


# np.random.shuffle(y)
# print(y)


# # Tree Model

# In[22]:


# Install PIMS into the kernel's environment (needed for the `from pims import ...` below).
# Routed through the %pip line magic: a bare `pip install ...` line is not valid Python in this
# exported script and, inside a notebook, only works while IPython's automagic is enabled.
get_ipython().run_line_magic('pip', 'install PIMS')


# In[23]:


# Select matplotlib's inline backend so figures are rendered in the notebook output.
get_ipython().run_line_magic('matplotlib', 'inline')

# importing relevant libraries
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve, auc#plot_precision_recall_curve
from sklearn.datasets import make_classification
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
import sklearn
#from umap import UMAP
#from pynndescent import NNDescent
#from fastcluster import single
from scipy.cluster.hierarchy import cut_tree, fcluster, dendrogram
from scipy.spatial.distance import squareform
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from pims import ImageSequence
from PIL import Image
from scipy.spatial.distance import hamming

# Apply the 'bmh' matplotlib style sheet (nothing here turns off automatic plot showing).
plt.style.use('bmh')


# In[24]:


# Extra-trees forest whose leaf assignments are used later as a tree embedding.
# random_state is fixed so the fitted trees, and every score derived from them, are reproducible.
et = ExtraTreesClassifier(n_estimators=500, min_samples_leaf=100,
                          max_features="sqrt", bootstrap=True, class_weight='balanced',
                          n_jobs=-1, random_state=0)

# Alternative forest with the same hyper-parameters:
# et = RandomForestClassifier(n_estimators=500, min_samples_leaf=100,
#                           max_features="sqrt", bootstrap=True, class_weight='balanced', n_jobs=-1)

# validation instance: 5 stratified, shuffled folds (seeded so the folds are reproducible)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# getting the model validation predictions (out-of-fold class probabilities)
# NOTE(review): `preds` is not read anywhere in the visible part of this file because the
# evaluation line below is commented out — confirm whether this cross-validation is still wanted.
preds = cross_val_predict(et, latent_train, y, cv=skf, method='predict_proba')

# evaluating the model
#print('Area under the ROC Curve:', roc_auc_score(y, preds, multi_class='ovo'))


# In[25]:


# Fit the forest on the whole training set; later cells call et.apply() on this fitted model.
et.fit(latent_train,y)


# # Testing on ID Data

# In[26]:


# Leaf index reached by every training sample in each tree of the forest.
leaves_train = et.apply(latent_train)
print(leaves_train.shape)
print(leaves_train)

# Pairwise Hamming distance (fraction of trees whose leaf differs) among the first 500 samples.
# Each row is filled with one vectorised comparison instead of 500 separate scipy hamming()
# calls; the values are the same proportions hamming() computes for each pair.
distances_train = np.zeros((500,500))

for i in range(500):
    distances_train[i, :] = (leaves_train[:500] != leaves_train[i]).mean(axis=1)

# Mean distance from each sample to the other 499 (the diagonal contributes zero).
score_train = distances_train.sum(axis=0) / 499

# np.cov of a 1-D array is its sample variance.
print(np.mean(score_train))
print(np.cov(score_train))


# In[27]:


# In-distribution test sample (IMDB test embeddings).
latent_test_in = embeddings_IMDB_test

# Leaf index reached by every sample in each tree of the forest.
leaves_test_in = et.apply(latent_test_in)
print(leaves_test_in.shape)
print(leaves_test_in)

# Pairwise Hamming distance (fraction of trees whose leaf differs) among the 500 samples.
# Each row is filled with one vectorised comparison instead of 500 separate scipy hamming()
# calls; the values are the same proportions hamming() computes for each pair.
distances_test_in = np.zeros((500,500))

for i in range(500):
    distances_test_in[i, :] = (leaves_test_in[:500] != leaves_test_in[i]).mean(axis=1)

# Mean distance from each sample to the other 499 (the diagonal contributes zero).
score_test_in = distances_test_in.sum(axis=0) / 499

# np.cov of a 1-D array is its sample variance.
print(np.mean(score_test_in))
print(np.cov(score_test_in))


# In[27]:





# # Testing on OOD Data

# ## AGNEWS

# In[28]:


# Out-of-distribution sample: mean GloVe embedding of the first 500 AG_NEWS test items.
train_iter, test_iter = torchtext.datasets.AG_NEWS(split=('train', 'test'))

embeddings_AGNEWS = np.zeros((500, 300))
labels_AGNEWS = np.zeros((500, 1))

i = 0
for label, line in test_iter:
    if i >= 500:
        break
    embedding = global_vectors.get_vecs_by_tokens(tokenizer(line), lower_case_backup=True)
    embeddings_AGNEWS[i, :] = embedding.mean(dim=0, keepdim=True)
    labels_AGNEWS[i, :] = label
    i += 1


# In[29]:


# Out-of-distribution sample (AG_NEWS test embeddings).
latent_test_out_AGNEWS = embeddings_AGNEWS

# Leaf index reached by every sample in each tree of the forest.
leaves_test_out_AGNEWS = et.apply(latent_test_out_AGNEWS)
print(leaves_test_out_AGNEWS.shape)
print(leaves_test_out_AGNEWS)

# Pairwise Hamming distance (fraction of trees whose leaf differs) among the 500 samples.
# Each row is filled with one vectorised comparison instead of 500 separate scipy hamming()
# calls; the values are the same proportions hamming() computes for each pair.
distances_test_out_AGNEWS = np.zeros((500,500))

for i in range(500):
    distances_test_out_AGNEWS[i, :] = (leaves_test_out_AGNEWS[:500] != leaves_test_out_AGNEWS[i]).mean(axis=1)

# Mean distance from each sample to the other 499 (the diagonal contributes zero).
score_test_out_AGNEWS = distances_test_out_AGNEWS.sum(axis=0) / 499

# np.cov of a 1-D array is its sample variance.
print(np.mean(score_test_out_AGNEWS))
print(np.cov(score_test_out_AGNEWS))


# ## Amazon

# In[30]:


# Out-of-distribution sample: mean GloVe embedding of the first 500 AmazonReviewFull test items.
train_iter, test_iter = torchtext.datasets.AmazonReviewFull(split=('train', 'test'))

embeddings_Amazon = np.zeros((500, 300))
labels_Amazon = np.zeros((500, 1))

i = 0
for label, line in test_iter:
    if i >= 500:
        break
    embedding = global_vectors.get_vecs_by_tokens(tokenizer(line), lower_case_backup=True)
    embeddings_Amazon[i, :] = embedding.mean(dim=0, keepdim=True)
    labels_Amazon[i, :] = label
    i += 1


# In[31]:


# Out-of-distribution sample (AmazonReviewFull test embeddings).
latent_test_out_Amazon = embeddings_Amazon

# Leaf index reached by every sample in each tree of the forest.
leaves_test_out_Amazon = et.apply(latent_test_out_Amazon)
print(leaves_test_out_Amazon.shape)
print(leaves_test_out_Amazon)

# Pairwise Hamming distance (fraction of trees whose leaf differs) among the 500 samples.
# Each row is filled with one vectorised comparison instead of 500 separate scipy hamming()
# calls; the values are the same proportions hamming() computes for each pair.
distances_test_out_Amazon = np.zeros((500,500))

for i in range(500):
    distances_test_out_Amazon[i, :] = (leaves_test_out_Amazon[:500] != leaves_test_out_Amazon[i]).mean(axis=1)

# Mean distance from each sample to the other 499 (the diagonal contributes zero).
score_test_out_Amazon = distances_test_out_Amazon.sum(axis=0) / 499

# np.cov of a 1-D array is its sample variance.
print(np.mean(score_test_out_Amazon))
print(np.cov(score_test_out_Amazon))


# ## YahooAnswers

# In[36]:


# Out-of-distribution sample: mean GloVe embedding of the first 500 YahooAnswers test items.
train_iter, test_iter = torchtext.datasets.YahooAnswers(split=('train', 'test'))

embeddings_YahooAnswers = np.zeros((500, 300))
labels_YahooAnswers = np.zeros((500, 1))

i = 0
for label, line in test_iter:
    if i >= 500:
        break
    embedding = global_vectors.get_vecs_by_tokens(tokenizer(line), lower_case_backup=True)
    embeddings_YahooAnswers[i, :] = embedding.mean(dim=0, keepdim=True)
    labels_YahooAnswers[i, :] = label
    i += 1


# In[37]:


# Out-of-distribution sample (YahooAnswers test embeddings).
latent_test_out_YahooAnswers = embeddings_YahooAnswers

# Leaf index reached by every sample in each tree of the forest.
leaves_test_out_YahooAnswers = et.apply(latent_test_out_YahooAnswers)
print(leaves_test_out_YahooAnswers.shape)
print(leaves_test_out_YahooAnswers)

# Pairwise Hamming distance (fraction of trees whose leaf differs) among the 500 samples.
# Each row is filled with one vectorised comparison instead of 500 separate scipy hamming()
# calls; the values are the same proportions hamming() computes for each pair.
distances_test_out_YahooAnswers = np.zeros((500,500))

for i in range(500):
    distances_test_out_YahooAnswers[i, :] = (leaves_test_out_YahooAnswers[:500] != leaves_test_out_YahooAnswers[i]).mean(axis=1)

# Mean distance from each sample to the other 499 (the diagonal contributes zero).
score_test_out_YahooAnswers = distances_test_out_YahooAnswers.sum(axis=0) / 499

# np.cov of a 1-D array is its sample variance.
print(np.mean(score_test_out_YahooAnswers))
print(np.cov(score_test_out_YahooAnswers))


# ## YelpReviewFull

# In[38]:


# Out-of-distribution sample: mean GloVe embedding of the first 500 YelpReviewFull test items.
train_iter, test_iter = torchtext.datasets.YelpReviewFull(split=('train', 'test'))

embeddings_Yelp = np.zeros((500, 300))
labels_Yelp = np.zeros((500, 1))

i = 0
for label, line in test_iter:
    if i >= 500:
        break
    embedding = global_vectors.get_vecs_by_tokens(tokenizer(line), lower_case_backup=True)
    embeddings_Yelp[i, :] = embedding.mean(dim=0, keepdim=True)
    labels_Yelp[i, :] = label
    i += 1


# In[39]:


# Out-of-distribution sample (YelpReviewFull test embeddings).
latent_test_out_Yelp = embeddings_Yelp

# Leaf index reached by every sample in each tree of the forest.
leaves_test_out_Yelp = et.apply(latent_test_out_Yelp)
print(leaves_test_out_Yelp.shape)
print(leaves_test_out_Yelp)

# Pairwise Hamming distance (fraction of trees whose leaf differs) among the 500 samples.
# Each row is filled with one vectorised comparison instead of 500 separate scipy hamming()
# calls; the values are the same proportions hamming() computes for each pair.
distances_test_out_Yelp = np.zeros((500,500))

for i in range(500):
    distances_test_out_Yelp[i, :] = (leaves_test_out_Yelp[:500] != leaves_test_out_Yelp[i]).mean(axis=1)

# Mean distance from each sample to the other 499 (the diagonal contributes zero).
score_test_out_Yelp = distances_test_out_Yelp.sum(axis=0) / 499

# np.cov of a 1-D array is its sample variance.
print(np.mean(score_test_out_Yelp))
print(np.cov(score_test_out_Yelp))


# In[39]:





# In[39]:





# # Results

# In[40]:


# Per-sample scores of every dataset, keyed by the tick label used in the box plot
my_dict = {
    'IMDB': score_test_in,
    'AGNEWS': score_test_out_AGNEWS,
    'Amazon': score_test_out_Amazon,
    'YahooAnswers': score_test_out_YahooAnswers,
    'Yelp': score_test_out_Yelp,
}

# One box per dataset, in the insertion order of the dict
plt.figure(figsize=(10, 6))
plt.boxplot(list(my_dict.values()), labels=list(my_dict.keys()))
plt.show()


# In[41]:


# Ground truth for the detection task: the first 500 entries (IMDB) are labelled 1, the
# following 500 entries (the other dataset) are labelled 0.
score_true = np.concatenate([np.ones(500), np.zeros(500)])

# Score vectors laid out like score_true: IMDB scores first, then the other dataset's scores
score_pred_AGNEWS = np.concatenate((score_test_in, score_test_out_AGNEWS))
score_pred_Amazon = np.concatenate((score_test_in, score_test_out_Amazon))
score_pred_YahooAnswers = np.concatenate((score_test_in, score_test_out_YahooAnswers))
score_pred_Yelp = np.concatenate((score_test_in, score_test_out_Yelp))

# Area under the ROC curve, printed in the order AGNEWS, Amazon, YahooAnswers, Yelp
for scores in (score_pred_AGNEWS, score_pred_Amazon, score_pred_YahooAnswers, score_pred_Yelp):
    print(roc_auc_score(score_true, scores))


# In[42]:


# Precision-recall curve and the area under it for each IMDB-vs-other-dataset pairing
precision_AGNEWS, recall_AGNEWS, thresholds_AGNEWS = precision_recall_curve(score_true, score_pred_AGNEWS)
auc_precision_recall_AGNEWS = auc(recall_AGNEWS, precision_AGNEWS)

precision_Amazon, recall_Amazon, thresholds_Amazon = precision_recall_curve(score_true, score_pred_Amazon)
auc_precision_recall_Amazon = auc(recall_Amazon, precision_Amazon)

precision_YahooAnswers, recall_YahooAnswers, thresholds_YahooAnswers = precision_recall_curve(score_true, score_pred_YahooAnswers)
auc_precision_recall_YahooAnswers = auc(recall_YahooAnswers, precision_YahooAnswers)

precision_Yelp, recall_Yelp, thresholds_Yelp = precision_recall_curve(score_true, score_pred_Yelp)
auc_precision_recall_Yelp = auc(recall_Yelp, precision_Yelp)

# Printed in the order AGNEWS, Amazon, YahooAnswers, Yelp
for pr_auc in (auc_precision_recall_AGNEWS, auc_precision_recall_Amazon,
               auc_precision_recall_YahooAnswers, auc_precision_recall_Yelp):
    print(pr_auc)


# In[43]:


def compute_fpr95(y_true, y_pred_probs, target_tpr=0.95):
    """Return the false-positive rate at the first ROC point whose TPR reaches ``target_tpr``.

    The previous implementation picked the ROC point whose TPR was *closest* to 0.95, which can
    be a point with TPR below 0.95 and therefore under-reports the FPR. This version takes the
    first point along the curve whose TPR is at least ``target_tpr``.

    Parameters
    ----------
    y_true : array-like
        Binary ground-truth labels, passed unchanged to ``sklearn.metrics.roc_curve``.
    y_pred_probs : array-like
        Scores for the positive class, passed unchanged to ``sklearn.metrics.roc_curve``.
    target_tpr : float, default 0.95
        True-positive rate that must be reached.

    Returns
    -------
    float
        FPR at the first ROC point with ``tpr >= target_tpr``; if no point qualifies, the FPR of
        the last ROC point.
    """
    fpr, tpr, thresholds = sklearn.metrics.roc_curve(y_true, y_pred_probs)
    # roc_curve returns fpr and tpr in non-decreasing order, so the first qualifying entry is
    # the one with the lowest FPR.
    reached = fpr[tpr >= target_tpr]
    if reached.size == 0:
        return fpr[-1]
    return reached[0]

# FPR at 95% TPR for each IMDB-vs-other-dataset pairing
fpr95_score_AGNEWS = compute_fpr95(score_true, score_pred_AGNEWS)
fpr95_score_Amazon = compute_fpr95(score_true, score_pred_Amazon)
fpr95_score_YahooAnswers = compute_fpr95(score_true, score_pred_YahooAnswers)
fpr95_score_Yelp = compute_fpr95(score_true, score_pred_Yelp)

# Printed in the order AGNEWS, Amazon, YahooAnswers, Yelp
for fpr95_value in (fpr95_score_AGNEWS, fpr95_score_Amazon,
                    fpr95_score_YahooAnswers, fpr95_score_Yelp):
    print(fpr95_value)


# In[43]:





# In[44]:


# Mean and standard deviation (square root of the sample variance from np.cov) of the scores,
# printed in the order IMDB, AGNEWS, Amazon, YahooAnswers, Yelp
for dataset_scores in (score_test_in, score_test_out_AGNEWS, score_test_out_Amazon,
                       score_test_out_YahooAnswers, score_test_out_Yelp):
    print(np.mean(dataset_scores), np.sqrt(np.cov(dataset_scores)))


# In[44]:





# In[44]:




