#!/usr/bin/env python
# coding: utf-8

# In[274]:


# Mount Google Drive into the Colab filesystem at /content/gdrive.
# NOTE(review): nothing in the visible code reads from this mount point —
# confirm it is still needed before keeping this cell.
from google.colab import drive
drive.mount('/content/gdrive')


# In[275]:


# Install portalocker (>= 2.0.0) into the running kernel's environment.
# A bare `pip install ...` only works through IPython automagic and is not
# valid Python in this exported script, so the magic is invoked explicitly,
# in the same way as the matplotlib magic further down.
get_ipython().run_line_magic('pip', "install 'portalocker>=2.0.0'")


# In[279]:


import torchtext
import torch
import numpy as np
from torchtext.vocab import GloVe
from torchtext.data import get_tokenizer

# Pre-trained GloVe word vectors: the '6B' release with 300 dimensions per token.
global_vectors = GloVe(name='6B', dim=300)

# torchtext's "basic_english" tokenizer, used to split each document into tokens.
tokenizer = get_tokenizer("basic_english")


# # Computer vs. Sports and Politics

# In[280]:


from sklearn.datasets import fetch_20newsgroups
#from sklearn.feature_extraction.text import TfidfVectorizer

# Training corpus: the five computer-related newsgroups.
# random_state=None is passed explicitly, so no fixed seed is supplied to the loader.
categories_train = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x']
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories_train, random_state=None)

# vectorizer = TfidfVectorizer()
# vectors = vectorizer.fit_transform(newsgroups_train.data)


# In[281]:


# One row per training document: the mean of the document's 300-d GloVe token vectors.
# The row count is taken from the loaded corpus instead of a hard-coded 2936, so the
# matrix can neither overflow nor keep all-zero rows if the fetched subset differs in size.
embeddings_Computer_train = np.zeros((len(newsgroups_train.data), 300))
labels_Computer_train = newsgroups_train.target

for row, document in enumerate(newsgroups_train.data):
    token_vectors = global_vectors.get_vecs_by_tokens(tokenizer(document), lower_case_backup=True)
    embeddings_Computer_train[row, :] = torch.mean(token_vectors, dim=0).numpy()


# In[282]:


# Generic names read by the model-fitting cells that follow.
latent_train = embeddings_Computer_train
y = labels_Computer_train
print(latent_train.shape)
print(y.shape)
print(y)
# np.random.shuffle(y)
# print(y)


# In[283]:


# Install PIMS into the running kernel's environment (needed by `from pims import ...`).
# A bare `pip install ...` is IPython automagic and not valid Python in this
# exported script, so the magic is invoked explicitly.
get_ipython().run_line_magic('pip', 'install PIMS')


# In[284]:


# Render matplotlib figures inline in the notebook output.
get_ipython().run_line_magic('matplotlib', 'inline')

# importing relevant libraries
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve, auc#plot_precision_recall_curve
from sklearn.datasets import make_classification
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
import sklearn
#from umap import UMAP
#from pynndescent import NNDescent

#from fastcluster import single
from scipy.cluster.hierarchy import cut_tree, fcluster, dendrogram
from scipy.spatial.distance import squareform
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from pims import ImageSequence
from PIL import Image
from scipy.spatial.distance import hamming

# Apply the 'bmh' matplotlib style sheet to the figures drawn afterwards.
plt.style.use('bmh')


# In[285]:


# Extra-trees forest; its per-tree leaf assignments are what the later cells score.
et = ExtraTreesClassifier(
    n_estimators=100,
    min_samples_leaf=10,
    max_features="sqrt",
    bootstrap=True,
    class_weight='balanced',
    n_jobs=-1,
)

# Disabled alternative, kept for reference:
# et = RandomForestClassifier(n_estimators=100, min_samples_leaf=10,
#                           max_features=None, bootstrap=True, class_weight='balanced', n_jobs=-1)

# Shuffled, stratified 5-fold splitter used for validation.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
)

# Out-of-fold class-probability predictions for the training documents.
preds = cross_val_predict(
    et,
    latent_train,
    y,
    cv=skf,
    method='predict_proba',
)

# Model evaluation is currently switched off:
#print('Area under the ROC Curve:', roc_auc_score(y, preds, multi_class='ovo'))


# In[286]:


et.fit(latent_train, y)


# In[287]:


# Leaf index reached by every training document in each tree of the fitted forest.
leaves_train = et.apply(latent_train)
print(leaves_train.shape)
print(leaves_train)

# Pairwise Hamming distance (fraction of trees in which two documents fall into
# different leaves) among the first 500 training documents.  The broadcast
# comparison produces the same matrix as calling scipy's `hamming` on every
# pair, without 250,000 Python-level function calls.
leaves_scored = leaves_train[:500]
n_scored = leaves_scored.shape[0]
distances_train = (leaves_scored[:, None, :] != leaves_scored[None, :, :]).mean(axis=2)

# Mean distance from each document to the other n_scored - 1 documents; the
# zero self-distance on the diagonal adds nothing to the column sums.
score_train = distances_train.sum(axis=0) / (n_scored - 1)

print(np.mean(score_train))
# np.cov of a 1-D array is the sample variance of the scores.
print(np.cov(score_train))


# ## Testing on ID Data

# ### Computer

# In[288]:


# In-distribution test corpus: the same five computer newsgroups used for training.
categories_test = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x']
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories_test, random_state=None)


# In[289]:


# Embed at most the first n_eval_docs test documents as the mean of their GloVe token vectors.
n_eval_docs = 500
embeddings_Computer_test = np.zeros((n_eval_docs, 300))
# NOTE: the label vector is not truncated, so it covers every fetched document,
# not only the n_eval_docs rows that are embedded.
labels_Computer_test = newsgroups_test.target

for row, document in enumerate(newsgroups_test.data[:n_eval_docs]):
    token_vectors = global_vectors.get_vecs_by_tokens(tokenizer(document), lower_case_backup=True)
    embeddings_Computer_test[row, :] = torch.mean(token_vectors, dim=0).numpy()


# In[290]:


latent_test_in = embeddings_Computer_test

# Leaf index reached by every test document in each tree of the fitted forest.
leaves_test_in = et.apply(latent_test_in)
print(leaves_test_in.shape)
print(leaves_test_in)

# Pairwise Hamming distance between the leaf vectors of all embedded documents,
# computed with one broadcast comparison instead of a Python double loop over
# scipy's `hamming` (same values, far fewer function calls).
distances_test_in = (leaves_test_in[:, None, :] != leaves_test_in[None, :, :]).mean(axis=2)

# Mean distance from each document to all the others (self-distance is zero).
score_test_in = distances_test_in.sum(axis=0) / (len(leaves_test_in) - 1)

print(np.mean(score_test_in))
# np.cov of a 1-D array is the sample variance of the scores.
print(np.cov(score_test_in))


# ## Testing on OOD Data

# ### Sports

# In[291]:


# Out-of-distribution test corpus: the two sports newsgroups.
categories_test = ['rec.sport.baseball', 'rec.sport.hockey']
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories_test, random_state=None)


# In[292]:


# Embed at most the first n_eval_docs test documents as the mean of their GloVe token vectors.
n_eval_docs = 500
embeddings_Sports_test = np.zeros((n_eval_docs, 300))
# NOTE: the label vector is not truncated, so it covers every fetched document,
# not only the n_eval_docs rows that are embedded.
labels_Sports_test = newsgroups_test.target

for row, document in enumerate(newsgroups_test.data[:n_eval_docs]):
    token_vectors = global_vectors.get_vecs_by_tokens(tokenizer(document), lower_case_backup=True)
    embeddings_Sports_test[row, :] = torch.mean(token_vectors, dim=0).numpy()


# In[293]:


latent_test_out_Sports = embeddings_Sports_test

# Leaf index reached by every test document in each tree of the fitted forest.
leaves_test_out_Sports = et.apply(latent_test_out_Sports)
print(leaves_test_out_Sports.shape)
print(leaves_test_out_Sports)

# Pairwise Hamming distance between the leaf vectors of all embedded documents,
# computed with one broadcast comparison instead of a Python double loop over
# scipy's `hamming` (same values, far fewer function calls).
distances_test_out_Sports = (leaves_test_out_Sports[:, None, :] != leaves_test_out_Sports[None, :, :]).mean(axis=2)

# Mean distance from each document to all the others (self-distance is zero).
score_test_out_Sports = distances_test_out_Sports.sum(axis=0) / (len(leaves_test_out_Sports) - 1)

print(np.mean(score_test_out_Sports))
# np.cov of a 1-D array is the sample variance of the scores.
print(np.cov(score_test_out_Sports))


# ### Politics

# In[294]:


# Out-of-distribution test corpus: the three politics newsgroups.
categories_test = ['talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc']
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories_test, random_state=None)


# In[295]:


# Embed at most the first n_eval_docs test documents as the mean of their GloVe token vectors.
n_eval_docs = 500
embeddings_Politics_test = np.zeros((n_eval_docs, 300))
# NOTE: the label vector is not truncated, so it covers every fetched document,
# not only the n_eval_docs rows that are embedded.
labels_Politics_test = newsgroups_test.target

for row, document in enumerate(newsgroups_test.data[:n_eval_docs]):
    token_vectors = global_vectors.get_vecs_by_tokens(tokenizer(document), lower_case_backup=True)
    embeddings_Politics_test[row, :] = torch.mean(token_vectors, dim=0).numpy()


# In[296]:


latent_test_out_Politics = embeddings_Politics_test

# Leaf index reached by every test document in each tree of the fitted forest.
leaves_test_out_Politics = et.apply(latent_test_out_Politics)
print(leaves_test_out_Politics.shape)
print(leaves_test_out_Politics)

# Pairwise Hamming distance between the leaf vectors of all embedded documents,
# computed with one broadcast comparison instead of a Python double loop over
# scipy's `hamming` (same values, far fewer function calls).
distances_test_out_Politics = (leaves_test_out_Politics[:, None, :] != leaves_test_out_Politics[None, :, :]).mean(axis=2)

# Mean distance from each document to all the others (self-distance is zero).
score_test_out_Politics = distances_test_out_Politics.sum(axis=0) / (len(leaves_test_out_Politics) - 1)

print(np.mean(score_test_out_Politics))
# np.cov of a 1-D array is the sample variance of the scores.
print(np.cov(score_test_out_Politics))


# ## Results

# In[297]:


# Per-document scores of the in-distribution corpus and the two OOD corpora,
# keyed by the label shown under each box.
my_dict = dict(
    Computer=score_test_in,
    Sports=score_test_out_Sports,
    Politics=score_test_out_Politics,
)

# One box per corpus on a 10x6 inch figure.
plt.figure(figsize=(10, 6))
plt.boxplot(my_dict.values(), labels=my_dict.keys());
plt.show()


# In[298]:


# In-distribution scores first, then the scores of one OOD corpus.
score_pred_Sports = np.hstack((score_test_in, score_test_out_Sports))
score_pred_Politics = np.hstack((score_test_in, score_test_out_Politics))
# Ground truth in the same order: 1 for the 500 in-distribution rows, 0 for the 500 OOD rows.
score_true = np.repeat([1.0, 0.0], 500)


# Area under the ROC curve, Sports first and Politics second.
for ood_scores in (score_pred_Sports, score_pred_Politics):
    print(roc_auc_score(score_true, ood_scores))


# In[299]:


# Precision-recall curves for both OOD corpora.
pr_curve_Sports = precision_recall_curve(score_true, score_pred_Sports)
pr_curve_Politics = precision_recall_curve(score_true, score_pred_Politics)
precision_Sports, recall_Sports, thresholds_Sports = pr_curve_Sports
precision_Politics, recall_Politics, thresholds_Politics = pr_curve_Politics

# Area under each precision-recall curve (recall on x, precision on y).
auc_precision_recall_Sports = auc(x=recall_Sports, y=precision_Sports)
auc_precision_recall_Politics = auc(x=recall_Politics, y=precision_Politics)

for pr_area in (auc_precision_recall_Sports, auc_precision_recall_Politics):
    print(pr_area)


# In[300]:


def compute_fpr90(y_true, y_pred_probs, target_tpr=0.90):
    """Return the false-positive rate at the first ROC point reaching ``target_tpr``.

    Parameters
    ----------
    y_true : array-like
        Binary ground-truth labels, passed unchanged to ``sklearn.metrics.roc_curve``.
    y_pred_probs : array-like
        Scores for the positive class, passed unchanged to ``roc_curve``.
    target_tpr : float, default 0.90
        True-positive rate that has to be reached.

    Returns
    -------
    float
        FPR of the first ROC operating point whose TPR is >= ``target_tpr``.

    Notes
    -----
    The previous implementation picked the point whose TPR was *closest* to
    0.90, which can be a point with TPR below 0.90 and therefore report an
    FPR that is too low.
    """
    fpr, tpr, _thresholds = sklearn.metrics.roc_curve(y_true, y_pred_probs)
    # tpr is non-decreasing along the curve, so a binary search finds the
    # first operating point that attains the requested recall.
    idx = int(np.searchsorted(tpr, target_tpr, side="left"))
    # Guard against a target above every tpr value (e.g. target_tpr > 1).
    idx = min(idx, len(tpr) - 1)
    fpr90 = fpr[idx]
    return fpr90


# False-positive rate near 90% true-positive rate for each OOD corpus.
fpr90_score_Sports = compute_fpr90(y_true=score_true, y_pred_probs=score_pred_Sports)
fpr90_score_Politics = compute_fpr90(y_true=score_true, y_pred_probs=score_pred_Politics)

for fpr90_value in (fpr90_score_Sports, fpr90_score_Politics):
    print(fpr90_value)


# In[300]:





# In[300]:





# # Sports vs. Computer and Politics

# In[301]:


# Training corpus: the two sports newsgroups.
# random_state=None is passed explicitly, so no fixed seed is supplied to the loader.
categories_train = ['rec.sport.baseball', 'rec.sport.hockey']
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories_train, random_state=None)


# In[302]:


# One row per training document: the mean of the document's 300-d GloVe token vectors.
# The row count is taken from the loaded corpus instead of a hard-coded 1197, so the
# matrix can neither overflow nor keep all-zero rows if the fetched subset differs in size.
embeddings_Sports_train = np.zeros((len(newsgroups_train.data), 300))
labels_Sports_train = newsgroups_train.target

for row, document in enumerate(newsgroups_train.data):
    token_vectors = global_vectors.get_vecs_by_tokens(tokenizer(document), lower_case_backup=True)
    embeddings_Sports_train[row, :] = torch.mean(token_vectors, dim=0).numpy()


# In[303]:


# Generic names read by the model-fitting cells that follow.
latent_train = embeddings_Sports_train
y = labels_Sports_train
print(latent_train.shape)
print(y.shape)
print(y)
# np.random.shuffle(y)
# print(y)


# In[304]:


# Extra-trees forest; its per-tree leaf assignments are what the later cells score.
et = ExtraTreesClassifier(
    n_estimators=100,
    min_samples_leaf=10,
    max_features="sqrt",
    bootstrap=True,
    class_weight='balanced',
    n_jobs=-1,
)

# Disabled alternative, kept for reference:
# et = RandomForestClassifier(n_estimators=100, min_samples_leaf=10,
#                           max_features=None, bootstrap=True, class_weight='balanced', n_jobs=-1)

# Shuffled, stratified 5-fold splitter used for validation.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
)

# Out-of-fold class-probability predictions for the training documents.
preds = cross_val_predict(
    et,
    latent_train,
    y,
    cv=skf,
    method='predict_proba',
)

# Model evaluation is currently switched off:
#print('Area under the ROC Curve:', roc_auc_score(y, preds, multi_class='ovo'))


# In[305]:


et.fit(latent_train, y)


# In[306]:


# Leaf index reached by every training document in each tree of the fitted forest.
leaves_train = et.apply(latent_train)
print(leaves_train.shape)
print(leaves_train)

# Pairwise Hamming distance (fraction of trees in which two documents fall into
# different leaves) among the first 500 training documents.  The broadcast
# comparison produces the same matrix as calling scipy's `hamming` on every
# pair, without 250,000 Python-level function calls.
leaves_scored = leaves_train[:500]
n_scored = leaves_scored.shape[0]
distances_train = (leaves_scored[:, None, :] != leaves_scored[None, :, :]).mean(axis=2)

# Mean distance from each document to the other n_scored - 1 documents; the
# zero self-distance on the diagonal adds nothing to the column sums.
score_train = distances_train.sum(axis=0) / (n_scored - 1)

print(np.mean(score_train))
# np.cov of a 1-D array is the sample variance of the scores.
print(np.cov(score_train))


# ## Testing on ID Data

# ### Sports

# In[307]:


# In-distribution test corpus: the same two sports newsgroups used for training.
categories_test = ['rec.sport.baseball', 'rec.sport.hockey']
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories_test, random_state=None)


# In[308]:


# Embed at most the first n_eval_docs test documents as the mean of their GloVe token vectors.
n_eval_docs = 500
embeddings_Sports_test = np.zeros((n_eval_docs, 300))
# NOTE: the label vector is not truncated, so it covers every fetched document,
# not only the n_eval_docs rows that are embedded.
labels_Sports_test = newsgroups_test.target

for row, document in enumerate(newsgroups_test.data[:n_eval_docs]):
    token_vectors = global_vectors.get_vecs_by_tokens(tokenizer(document), lower_case_backup=True)
    embeddings_Sports_test[row, :] = torch.mean(token_vectors, dim=0).numpy()


# In[309]:


latent_test_in = embeddings_Sports_test

# Leaf index reached by every test document in each tree of the fitted forest.
leaves_test_in = et.apply(latent_test_in)
print(leaves_test_in.shape)
print(leaves_test_in)

# Pairwise Hamming distance between the leaf vectors of all embedded documents,
# computed with one broadcast comparison instead of a Python double loop over
# scipy's `hamming` (same values, far fewer function calls).
distances_test_in = (leaves_test_in[:, None, :] != leaves_test_in[None, :, :]).mean(axis=2)

# Mean distance from each document to all the others (self-distance is zero).
score_test_in = distances_test_in.sum(axis=0) / (len(leaves_test_in) - 1)

print(np.mean(score_test_in))
# np.cov of a 1-D array is the sample variance of the scores.
print(np.cov(score_test_in))


# ## Testing on OOD Data

# ### Computer

# In[310]:


# Out-of-distribution test corpus: the five computer newsgroups.
categories_test = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x']
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories_test, random_state=None)


# In[311]:


# Embed at most the first n_eval_docs test documents as the mean of their GloVe token vectors.
n_eval_docs = 500
embeddings_Computer_test = np.zeros((n_eval_docs, 300))
# NOTE: the label vector is not truncated, so it covers every fetched document,
# not only the n_eval_docs rows that are embedded.
labels_Computer_test = newsgroups_test.target

for row, document in enumerate(newsgroups_test.data[:n_eval_docs]):
    token_vectors = global_vectors.get_vecs_by_tokens(tokenizer(document), lower_case_backup=True)
    embeddings_Computer_test[row, :] = torch.mean(token_vectors, dim=0).numpy()


# In[312]:


latent_test_out_Computer = embeddings_Computer_test

# Leaf index reached by every test document in each tree of the fitted forest.
leaves_test_out_Computer = et.apply(latent_test_out_Computer)
print(leaves_test_out_Computer.shape)
print(leaves_test_out_Computer)

# Pairwise Hamming distance between the leaf vectors of all embedded documents,
# computed with one broadcast comparison instead of a Python double loop over
# scipy's `hamming` (same values, far fewer function calls).
distances_test_out_Computer = (leaves_test_out_Computer[:, None, :] != leaves_test_out_Computer[None, :, :]).mean(axis=2)

# Mean distance from each document to all the others (self-distance is zero).
score_test_out_Computer = distances_test_out_Computer.sum(axis=0) / (len(leaves_test_out_Computer) - 1)

print(np.mean(score_test_out_Computer))
# np.cov of a 1-D array is the sample variance of the scores.
print(np.cov(score_test_out_Computer))


# ### Politics

# In[313]:


# Out-of-distribution test corpus: the three politics newsgroups.
categories_test = ['talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc']
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories_test, random_state=None)


# In[314]:


# Embed at most the first n_eval_docs test documents as the mean of their GloVe token vectors.
n_eval_docs = 500
embeddings_Politics_test = np.zeros((n_eval_docs, 300))
# NOTE: the label vector is not truncated, so it covers every fetched document,
# not only the n_eval_docs rows that are embedded.
labels_Politics_test = newsgroups_test.target

for row, document in enumerate(newsgroups_test.data[:n_eval_docs]):
    token_vectors = global_vectors.get_vecs_by_tokens(tokenizer(document), lower_case_backup=True)
    embeddings_Politics_test[row, :] = torch.mean(token_vectors, dim=0).numpy()


# In[315]:


latent_test_out_Politics = embeddings_Politics_test

# Leaf index reached by every test document in each tree of the fitted forest.
leaves_test_out_Politics = et.apply(latent_test_out_Politics)
print(leaves_test_out_Politics.shape)
print(leaves_test_out_Politics)

# Pairwise Hamming distance between the leaf vectors of all embedded documents,
# computed with one broadcast comparison instead of a Python double loop over
# scipy's `hamming` (same values, far fewer function calls).
distances_test_out_Politics = (leaves_test_out_Politics[:, None, :] != leaves_test_out_Politics[None, :, :]).mean(axis=2)

# Mean distance from each document to all the others (self-distance is zero).
score_test_out_Politics = distances_test_out_Politics.sum(axis=0) / (len(leaves_test_out_Politics) - 1)

print(np.mean(score_test_out_Politics))
# np.cov of a 1-D array is the sample variance of the scores.
print(np.cov(score_test_out_Politics))


# ## Results

# In[316]:


# In-distribution scores first, then the scores of one OOD corpus.
score_pred_Computer = np.hstack((score_test_in, score_test_out_Computer))
score_pred_Politics = np.hstack((score_test_in, score_test_out_Politics))
# Ground truth in the same order: 1 for the 500 in-distribution rows, 0 for the 500 OOD rows.
score_true = np.repeat([1.0, 0.0], 500)


# Area under the ROC curve, Computer first and Politics second.
for ood_scores in (score_pred_Computer, score_pred_Politics):
    print(roc_auc_score(score_true, ood_scores))


# In[317]:


# Precision-recall curves for both OOD corpora.
pr_curve_Computer = precision_recall_curve(score_true, score_pred_Computer)
pr_curve_Politics = precision_recall_curve(score_true, score_pred_Politics)
precision_Computer, recall_Computer, thresholds_Computer = pr_curve_Computer
precision_Politics, recall_Politics, thresholds_Politics = pr_curve_Politics

# Area under each precision-recall curve (recall on x, precision on y).
auc_precision_recall_Computer = auc(x=recall_Computer, y=precision_Computer)
auc_precision_recall_Politics = auc(x=recall_Politics, y=precision_Politics)

for pr_area in (auc_precision_recall_Computer, auc_precision_recall_Politics):
    print(pr_area)


# In[318]:


# False-positive rate near 90% true-positive rate for each OOD corpus.
fpr90_score_Computer = compute_fpr90(y_true=score_true, y_pred_probs=score_pred_Computer)
fpr90_score_Politics = compute_fpr90(y_true=score_true, y_pred_probs=score_pred_Politics)

for fpr90_value in (fpr90_score_Computer, fpr90_score_Politics):
    print(fpr90_value)


# In[318]:





# In[318]:





# # Politics vs. Sports and Computer

# In[319]:


# Training corpus: the three politics newsgroups.
# random_state=None is passed explicitly, so no fixed seed is supplied to the loader.
categories_train = ['talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc']
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories_train, random_state=None)


# In[320]:


# One row per training document: the mean of the document's 300-d GloVe token vectors.
# The row count is taken from the loaded corpus instead of a hard-coded 1575, so the
# matrix can neither overflow nor keep all-zero rows if the fetched subset differs in size.
embeddings_Politics_train = np.zeros((len(newsgroups_train.data), 300))
labels_Politics_train = newsgroups_train.target

for row, document in enumerate(newsgroups_train.data):
    token_vectors = global_vectors.get_vecs_by_tokens(tokenizer(document), lower_case_backup=True)
    embeddings_Politics_train[row, :] = torch.mean(token_vectors, dim=0).numpy()


# In[321]:


# Generic names read by the model-fitting cells that follow.
latent_train = embeddings_Politics_train
y = labels_Politics_train
print(latent_train.shape)
print(y.shape)
print(y)
# np.random.shuffle(y)
# print(y)


# In[322]:


# Extra-trees forest; its per-tree leaf assignments are what the later cells score.
et = ExtraTreesClassifier(
    n_estimators=100,
    min_samples_leaf=10,
    max_features="sqrt",
    bootstrap=True,
    class_weight='balanced',
    n_jobs=-1,
)

# Disabled alternative, kept for reference:
# et = RandomForestClassifier(n_estimators=100, min_samples_leaf=10,
#                           max_features=None, bootstrap=True, class_weight='balanced', n_jobs=-1)

# Shuffled, stratified 5-fold splitter used for validation.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
)

# Out-of-fold class-probability predictions for the training documents.
preds = cross_val_predict(
    et,
    latent_train,
    y,
    cv=skf,
    method='predict_proba',
)

# Model evaluation is currently switched off:
#print('Area under the ROC Curve:', roc_auc_score(y, preds, multi_class='ovo'))


# In[323]:


et.fit(latent_train, y)


# In[324]:


# Leaf index reached by every training document in each tree of the fitted forest.
leaves_train = et.apply(latent_train)
print(leaves_train.shape)
print(leaves_train)

# Pairwise Hamming distance (fraction of trees in which two documents fall into
# different leaves) among the first 500 training documents.  The broadcast
# comparison produces the same matrix as calling scipy's `hamming` on every
# pair, without 250,000 Python-level function calls.
leaves_scored = leaves_train[:500]
n_scored = leaves_scored.shape[0]
distances_train = (leaves_scored[:, None, :] != leaves_scored[None, :, :]).mean(axis=2)

# Mean distance from each document to the other n_scored - 1 documents; the
# zero self-distance on the diagonal adds nothing to the column sums.
score_train = distances_train.sum(axis=0) / (n_scored - 1)

print(np.mean(score_train))
# np.cov of a 1-D array is the sample variance of the scores.
print(np.cov(score_train))


# ## Testing on ID Data

# ### Politics

# In[325]:


# In-distribution test corpus: the same three politics newsgroups used for training.
categories_test = ['talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc']
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories_test, random_state=None)


# In[326]:


# Embed at most the first n_eval_docs test documents as the mean of their GloVe token vectors.
n_eval_docs = 500
embeddings_Politics_test = np.zeros((n_eval_docs, 300))
# NOTE: the label vector is not truncated, so it covers every fetched document,
# not only the n_eval_docs rows that are embedded.
labels_Politics_test = newsgroups_test.target

for row, document in enumerate(newsgroups_test.data[:n_eval_docs]):
    token_vectors = global_vectors.get_vecs_by_tokens(tokenizer(document), lower_case_backup=True)
    embeddings_Politics_test[row, :] = torch.mean(token_vectors, dim=0).numpy()


# In[327]:


latent_test_in = embeddings_Politics_test

# Leaf index reached by every test document in each tree of the fitted forest.
leaves_test_in = et.apply(latent_test_in)
print(leaves_test_in.shape)
print(leaves_test_in)

# Pairwise Hamming distance between the leaf vectors of all embedded documents,
# computed with one broadcast comparison instead of a Python double loop over
# scipy's `hamming` (same values, far fewer function calls).
distances_test_in = (leaves_test_in[:, None, :] != leaves_test_in[None, :, :]).mean(axis=2)

# Mean distance from each document to all the others (self-distance is zero).
score_test_in = distances_test_in.sum(axis=0) / (len(leaves_test_in) - 1)

print(np.mean(score_test_in))
# np.cov of a 1-D array is the sample variance of the scores.
print(np.cov(score_test_in))


# ## Testing on OOD Data

# ### Sports

# In[328]:


# Out-of-distribution test corpus: the two sports newsgroups.
categories_test = ['rec.sport.baseball', 'rec.sport.hockey']
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories_test, random_state=None)


# In[329]:


# Embed at most the first n_eval_docs test documents as the mean of their GloVe token vectors.
n_eval_docs = 500
embeddings_Sports_test = np.zeros((n_eval_docs, 300))
# NOTE: the label vector is not truncated, so it covers every fetched document,
# not only the n_eval_docs rows that are embedded.
labels_Sports_test = newsgroups_test.target

for row, document in enumerate(newsgroups_test.data[:n_eval_docs]):
    token_vectors = global_vectors.get_vecs_by_tokens(tokenizer(document), lower_case_backup=True)
    embeddings_Sports_test[row, :] = torch.mean(token_vectors, dim=0).numpy()


# In[330]:


latent_test_out_Sports = embeddings_Sports_test

# Leaf index reached by every test document in each tree of the fitted forest.
leaves_test_out_Sports = et.apply(latent_test_out_Sports)
print(leaves_test_out_Sports.shape)
print(leaves_test_out_Sports)

# Pairwise Hamming distance between the leaf vectors of all embedded documents,
# computed with one broadcast comparison instead of a Python double loop over
# scipy's `hamming` (same values, far fewer function calls).
distances_test_out_Sports = (leaves_test_out_Sports[:, None, :] != leaves_test_out_Sports[None, :, :]).mean(axis=2)

# Mean distance from each document to all the others (self-distance is zero).
score_test_out_Sports = distances_test_out_Sports.sum(axis=0) / (len(leaves_test_out_Sports) - 1)

print(np.mean(score_test_out_Sports))
# np.cov of a 1-D array is the sample variance of the scores.
print(np.cov(score_test_out_Sports))


# ### Computer

# In[331]:


# Out-of-distribution test corpus: the five computer newsgroups.
categories_test = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x']
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories_test, random_state=None)


# In[332]:


# Embed at most the first n_eval_docs test documents as the mean of their GloVe token vectors.
n_eval_docs = 500
embeddings_Computer_test = np.zeros((n_eval_docs, 300))
# NOTE: the label vector is not truncated, so it covers every fetched document,
# not only the n_eval_docs rows that are embedded.
labels_Computer_test = newsgroups_test.target

for row, document in enumerate(newsgroups_test.data[:n_eval_docs]):
    token_vectors = global_vectors.get_vecs_by_tokens(tokenizer(document), lower_case_backup=True)
    embeddings_Computer_test[row, :] = torch.mean(token_vectors, dim=0).numpy()


# In[333]:


latent_test_out_Computer = embeddings_Computer_test

# Leaf index reached by every test document in each tree of the fitted forest.
leaves_test_out_Computer = et.apply(latent_test_out_Computer)
print(leaves_test_out_Computer.shape)
print(leaves_test_out_Computer)

# Pairwise Hamming distance between the leaf vectors of all embedded documents,
# computed with one broadcast comparison instead of a Python double loop over
# scipy's `hamming` (same values, far fewer function calls).
distances_test_out_Computer = (leaves_test_out_Computer[:, None, :] != leaves_test_out_Computer[None, :, :]).mean(axis=2)

# Mean distance from each document to all the others (self-distance is zero).
score_test_out_Computer = distances_test_out_Computer.sum(axis=0) / (len(leaves_test_out_Computer) - 1)

print(np.mean(score_test_out_Computer))
# np.cov of a 1-D array is the sample variance of the scores.
print(np.cov(score_test_out_Computer))


# ## Results

# In[334]:


# In-distribution scores first, then the scores of one OOD corpus.
score_pred_Computer = np.hstack((score_test_in, score_test_out_Computer))
score_pred_Sports = np.hstack((score_test_in, score_test_out_Sports))
# Ground truth in the same order: 1 for the 500 in-distribution rows, 0 for the 500 OOD rows.
score_true = np.repeat([1.0, 0.0], 500)


# Area under the ROC curve, Computer first and Sports second.
for ood_scores in (score_pred_Computer, score_pred_Sports):
    print(roc_auc_score(score_true, ood_scores))


# In[335]:


# Precision-recall curves for both OOD corpora.
pr_curve_Computer = precision_recall_curve(score_true, score_pred_Computer)
pr_curve_Sports = precision_recall_curve(score_true, score_pred_Sports)
precision_Computer, recall_Computer, thresholds_Computer = pr_curve_Computer
precision_Sports, recall_Sports, thresholds_Sports = pr_curve_Sports

# Area under each precision-recall curve (recall on x, precision on y).
auc_precision_recall_Computer = auc(x=recall_Computer, y=precision_Computer)
auc_precision_recall_Sports = auc(x=recall_Sports, y=precision_Sports)

for pr_area in (auc_precision_recall_Computer, auc_precision_recall_Sports):
    print(pr_area)


# In[336]:


# False-positive rate near 90% true-positive rate for each OOD corpus.
fpr90_score_Computer = compute_fpr90(y_true=score_true, y_pred_probs=score_pred_Computer)
fpr90_score_Sports = compute_fpr90(y_true=score_true, y_pred_probs=score_pred_Sports)

for fpr90_value in (fpr90_score_Computer, fpr90_score_Sports):
    print(fpr90_value)

