### playing around with node2vec

import argparse
import networkx as nx
from node2vec import Node2Vec
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, confusion_matrix
from sympy.utilities.iterables import multiset_permutations
import numpy.random as nprand
import numpy as np
import pandas as pd
from pathlib import Path
import os
from datetime import date



### Command-line interface
###   --nodes : read below as n, the second argument of planted_partition_graph
###   --K     : read below as K, the first argument of planted_partition_graph
###   --ratio : q is set to ratio * p
###   --sims  : number of loop iterations (independent simulated graphs)
###   --notes : free-text suffix appended to the output file name

parser = argparse.ArgumentParser()

## (flag, type, default) -- argparse derives the attribute name from the
## long flag, so args.nodes, args.K, ... are available without an explicit dest.
for flag, kind, fallback in (
    ('--nodes', int, 50),
    ('--K', int, 2),
    ('--ratio', float, 0.2),
    ('--sims', int, 1),
    # ('--within', float, 0.5),
    # ('--between', float, 0.01),
    ('--notes', str, ''),
):
    parser.add_argument(flag, type=kind, default=fallback)

args = parser.parse_args()


### Simulation settings derived from the command-line arguments

n, K = args.nodes, args.K
nsims = args.sims
notes = args.notes

## Edge probabilities handed to nx.planted_partition_graph(K, n, p, q).
## Alternatives previously tried for p: 0.05 (dense), 5/n (sparse).
p = np.log(n) / n  ## relatively sparse
q = p * args.ratio

## can change node2vec params here also

## one entry is appended per simulation
all_results = []

## Output file stem: settings, the run date (dd_mm_YYYY) and the optional notes.
## p and ratio are encoded as truncated integer percentages.
today = date.today()
curr_date = today.strftime("%d_%m_%Y")
file_name = (
    f"node2vec_n{n}_K{K}_p{int(p*100)}"
    f"_ratio{int(args.ratio*100)}_{curr_date}{notes}"
)

# Create a simple SBM
# nodes = [n, n]
# p = 0.5
# q = 0.005
# probs = [ [p, q], [q, p]]

# graph = nx.stochastic_block_model(nodes, probs)

## Quantities that do not change between simulations.
## planted_partition_graph numbers nodes consecutively block by block, so the
## true label vector is 0 repeated n times, then 1 repeated n times, and so on.
n_total = n * K
true_array = np.repeat(np.arange(K), n)
## 10% of the nodes are used as labelled training data
n_train = np.rint(n_total * 0.1).astype(int)

for i in range(nsims):
    ## Simulate a graph with K blocks of n nodes each
    graph = nx.planted_partition_graph(K, n, p, q)

    node2vec = Node2Vec(graph, dimensions=64, walk_length=80, num_walks=10, workers=4)

    # Embed nodes
    model = node2vec.fit(window=10, min_count=1, batch_words=4)

    ## One embedding row per node, in graph.nodes() order; the fitted model
    ## is queried with the string form of each node id.
    X = np.array([model.wv.get_vector(str(node)) for node in graph.nodes()])

    ## Random train/test split over ALL n * K nodes.
    ## (Previously the test set was built from a hard-coded range(0, 200),
    ## which raised IndexError when n * K < 200 and silently ignored nodes
    ## beyond index 199 when n * K > 200.)
    training = np.random.choice(n_total, n_train, replace=False)
    test = np.setdiff1d(np.arange(n_total), training)

    train_X, train_Y = X[training], true_array[training]
    X_test, Y_test = X[test], true_array[test]

    ## Fit on the labelled nodes, score on the held-out nodes
    log_reg = LogisticRegression(random_state=0).fit(train_X, train_Y)
    accuracy = log_reg.score(X_test, Y_test)

    ## NOTE(review): each value is wrapped in a one-element list, so every
    ## DataFrame/CSV cell built from all_results holds a list such as [50]
    ## rather than a scalar. Kept as-is so the output format is unchanged.
    results = {'n': [n], 'K': [K], 'p': [p], 'q':[q], 'acc':[accuracy], 'sim':[i]}
    all_results.append(results)





## Collect the per-simulation records into one table
df = pd.DataFrame(all_results)

## Results go to <cwd>/subfolder/<run date>/<file_name>_out.csv
filepath = Path.cwd() / "subfolder" / curr_date / f"{file_name}_out.csv"

## Create the dated output directory on first use
filepath.parent.mkdir(parents=True, exist_ok=True)

df.to_csv(filepath)
