
import pickle
import numpy as np
import pandas as pd
from tools import Horo_distance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score


# Load the three objects stored in info.pkl: the index -> word list, the array
# of all relations, and the tree relations.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only run this script against an info.pkl from a trusted source.
with open('info.pkl', 'rb') as info_file:
    info_payload = pickle.load(info_file)
index2word, all_relations, tree_relations = info_payload
    
    

# ---- Experiment configuration ---------------------------------------------

# Embedding dimensions to evaluate; also the column labels of the result
# tables.
n_dims = [2, 3, 5, 10]

# Version numbers of the weight files available for each dimension.
n_files = [1, 2, 3]

# Root words whose trees are classified; also the row labels of the result
# tables.
roots = [
    'animal.n.01',
    'group.n.01',
    'worker.n.01',
    'mammal.n.01',
    'location.n.01',
]

# Number of random train/test splits evaluated per (dimension, root) pair.
N_trails = 100

# Candidate divisors applied to the weights before the distance computation:
# ten evenly spaced values from 1 to 1.5 inclusive.
hparameter_scale = np.linspace(1, 1.5, num=10)

# Result tables (rows: roots, columns: dimensions), initialised to 0.0.
# df_F1 receives the mean test F1 and df_F1std its standard deviation.
df_F1 = pd.DataFrame(0.0, index=roots, columns=n_dims)
df_F1std = pd.DataFrame(0.0, index=roots, columns=n_dims)

# Experiment: for every embedding dimension and every root word, run N_trails
# random train/test splits.  In each trial a one-feature logistic regression
# is fitted on the Horo_distance values to predict membership of the root's
# tree, once per candidate scale in hparameter_scale.  The scale with the best
# *train* F1 is selected and its *test* F1 is recorded.  The mean and standard
# deviation of the recorded test scores are stored in df_F1 / df_F1std.

# Weight matrices keyed by (n_dim, n_file).  Each CSV is parsed once instead
# of once per trial; the arrays are only read afterwards, so sharing is safe.
_weights_cache = {}
_n_words = len(index2word)

for n_dim in n_dims:
    F1s_roots = {}
    for root in roots:
        idx_treeroot = index2word.index(root)
        # Column 1 of every relation row whose column 0 equals the root index
        # (presumably the members of the root's tree -- confirm against the
        # code that writes info.pkl).
        tree = list(all_relations[all_relations[:, 0] == idx_treeroot, 1])
        print(root+"lenght of tree: %d" % (len(tree)))

        # Everything below depends only on the root, so it is built once per
        # root rather than once per trial / per hyperparameter.
        tree_members = list(set(tree))
        notree = list(set(range(_n_words)) - set(tree))
        n_train_true = int(len(tree) * 0.8)
        n_train_false = int(len(notree) * 0.8)

        F1s_roots[root] = []
        for n_trail in range(N_trails):
            # Draw the embedding version from the *values* of n_files.  The
            # previous code drew a position and added 1, which only matched
            # the file names while n_files was exactly [1, 2, ..., k].
            n_file = np.random.choice(n_files)
            cache_key = (n_dim, int(n_file))
            if cache_key not in _weights_cache:
                filename = 'weights_d'+str(n_dim)+'_v'+str(n_file)
                # The first CSV column is dropped (presumably a saved index
                # column -- confirm against the writer of these files).
                _weights_cache[cache_key] = pd.read_csv(filename).iloc[:, 1:].values
            wts = _weights_cache[cache_key]

            # Unit-norm direction of the root's embedding, passed to
            # Horo_distance as its second argument.
            wt_root = wts[idx_treeroot, :]
            wt_spectral = wt_root/(np.linalg.norm(wt_root))

            # Train split: 80% of the tree entries plus 80% of the non-tree
            # words, each sampled without replacement; the rest is the test
            # split.
            mask_train = np.zeros(_n_words, dtype=bool)
            idxtrain_true = np.random.choice(tree, n_train_true, replace=False)
            idxtrain_false = np.random.choice(notree, n_train_false, replace=False)
            mask_train[idxtrain_true] = True
            mask_train[idxtrain_false] = True

            # Binary labels: 1 for tree members, 0 otherwise.  They do not
            # depend on the scale hyperparameter, so they are built once per
            # trial.
            Y = np.zeros(wts.shape[0], dtype=int)
            Y[tree_members] = 1
            Y_train = Y[mask_train]
            Y_test = Y[~mask_train]

            # Column 0: train F1, column 1: test F1, one row per scale.
            F1s_hparameters = np.zeros((len(hparameter_scale), 2))
            for ith_hparameter, hparameter in enumerate(hparameter_scale):
                ds = Horo_distance(wts/hparameter, wt_spectral)
                # One distance per word, shaped (n_samples, 1) for sklearn.
                X = ds.ravel().reshape(-1, 1)
                X_train = X[mask_train]
                X_test = X[~mask_train]
                clf = LogisticRegression().fit(X_train, Y_train)
                F1s_hparameters[ith_hparameter, 0] = f1_score(Y_train, clf.predict(X_train))
                F1s_hparameters[ith_hparameter, 1] = f1_score(Y_test, clf.predict(X_test))

            # Model selection uses the train score only; the matching test
            # score is what gets reported.
            hpara_idx = np.argmax(F1s_hparameters[:, 0])
            print(root+", dim: %d, n_trial:%d, Poincare_version:%d, hpara:%f, train_F1:%f, test_F1: %f" %
                  (n_dim, n_trail, n_file, hparameter_scale[hpara_idx], F1s_hparameters[hpara_idx, 0], F1s_hparameters[hpara_idx, 1]))
            F1s_roots[root].append(F1s_hparameters[hpara_idx, 1])

    # Aggregate the trials of this dimension into the result tables.
    for root in roots:
        df_F1.loc[root, n_dim] = np.mean(F1s_roots[root])
        df_F1std.loc[root, n_dim] = np.std(F1s_roots[root])
        
            
                
                

                
# Print the table of mean test F1 scores, then the table of their standard
# deviations (rows: roots, columns: embedding dimensions).
for summary_table in (df_F1, df_F1std):
    print(summary_table)
                

                            
            

                    


