{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "[STEP 1]: Upload cora dataset.\n",
      "| # of nodes : 2708\n",
      "| # of edges : 5278.0\n",
      "| # of features : 1433\n",
      "| # of clases   : 7\n",
      "| # of train set : 140\n",
      "| # of val set   : 500\n",
      "| # of test set  : 1000\n",
      "node selection begin\n",
      "node selection end\n",
      "xxxxxxxxxx Evaluation begin xxxxxxxxxx\n",
      "0.781 0.779\n",
      "0.776 0.778\n",
      "0.774 0.768\n",
      "0.77 0.773\n",
      "0.768 0.764\n",
      "0.765 0.761\n",
      "0.762 0.763\n",
      "0.761 0.764\n",
      "0.755 0.753\n",
      "0.752 0.747\n",
      "xxxxxxxxxx Evaluation end xxxxxxxxxx\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pickle as pkl\n",
    "import networkx as nx\n",
    "import scipy.sparse as sp\n",
    "import torch\n",
    "import random\n",
    "import copy\n",
    "import sys\n",
    "import os\n",
    "import time\n",
    "import argparse\n",
    "import json\n",
    "import numpy as np\n",
    "import numpy.linalg as la\n",
    "import torch.nn.functional as F\n",
    "import torch.optim as optim\n",
    "import torch.nn as nn\n",
    "import pandas as pd\n",
    "from scipy.sparse import csgraph\n",
    "from torch.backends import cudnn\n",
    "from torch.optim import lr_scheduler\n",
    "from utils import *\n",
    "from graphConvolution import *\n",
    "\n",
    "# ---- hyperparameters ----\n",
    "num_node = 2708      # number of graph nodes; sizes every per-node array below\n",
    "num_coreset = 140    # labelling budget: selection stops once this many nodes are picked\n",
    "# alternative budget: num_coreset = int((num_node-1500)*0.01)\n",
    "num_class = 7        # number of label classes\n",
    "oracle_acc = 0.7     # probability that the noisy oracle returns the true label\n",
    "th = 0.05            # a node counts as activated when (2-hop weight * reliability) > th\n",
    "batch_size = 5       # reliabilities are refreshed every batch_size selections\n",
    "hidden_size = 128    # hidden width of the GCN\n",
    "num_val = 500\n",
    "num_test = 1000\n",
    "seed = 0             # shared seed for every random number generator seeded below\n",
    "\n",
    "# ---- device and determinism ----\n",
    "os.environ[\"CUDA_VISIBLE_DEVICES\"] = '1'\n",
    "cudnn.benchmark = False            # if benchmark=True, deterministic will be False\n",
    "cudnn.deterministic = True\n",
    "# cudnn determinism alone does not make a run repeatable: the label noise is drawn with\n",
    "# `random` and dropout uses torch's generator, so both (plus numpy) are seeded here.\n",
    "random.seed(seed)\n",
    "np.random.seed(seed)\n",
    "torch.manual_seed(seed)\n",
    "torch.cuda.manual_seed_all(seed)\n",
    "\n",
    "def get_reliable_score(similarity):\n",
    "    \"\"\"Turn a similarity value into a label-reliability score.\n",
    "\n",
    "    Computes a / (a + b) with a = oracle_acc * similarity and\n",
    "    b = (1 - oracle_acc) * (1 - similarity) / (num_class - 1), using the\n",
    "    module-level `oracle_acc` and `num_class`.\n",
    "\n",
    "    similarity: scalar (or array) similarity; the caller passes min-max scaled values.\n",
    "    Returns the score with the same shape as `similarity`.\n",
    "    \"\"\"\n",
    "    agree = oracle_acc*similarity\n",
    "    disagree = (1-oracle_acc)*(1-similarity)/(num_class-1)\n",
    "    return agree/(agree+disagree)\n",
    "    \n",
    "def get_activated_node_dense(node,reliable_score,activated_node):\n",
    "    \"\"\"Count the nodes that `node` would activate under the current mask.\n",
    "\n",
    "    A position is activated when adj_matrix2[node] * reliable_score exceeds `th`;\n",
    "    the resulting 0/1 vector is multiplied element-wise by `activated_node`.\n",
    "\n",
    "    Returns (count, activated_vector), where count is the dot product of the\n",
    "    module-level `num_ones` with activated_vector.\n",
    "    \"\"\"\n",
    "    above_threshold = (adj_matrix2[node]*reliable_score) > th\n",
    "    activated_vector = (above_threshold+0)*activated_node\n",
    "    return num_ones.dot(activated_vector), activated_vector\n",
    "\n",
    "def get_max_reliable_info_node_dense(idx_used,high_score_nodes,activated_node,train_class,labels):\n",
    "    \"\"\"Pick the candidate whose selection activates the largest count.\n",
    "\n",
    "    Every node in `high_score_nodes` is scored with get_activated_node_dense using\n",
    "    `oracle_acc` as its reliability. Ties keep the earliest candidate because only a\n",
    "    strictly larger count replaces the current best.\n",
    "\n",
    "    `idx_used`, `train_class` and `labels` are accepted but not read here.\n",
    "\n",
    "    Returns (node, activated_vector, count). If no candidate has a count above zero\n",
    "    the placeholders (0, 0, 0) are returned, so callers must check the count.\n",
    "    \"\"\"\n",
    "    best_node, best_vector, best_count = 0, 0, 0\n",
    "    for candidate in high_score_nodes:\n",
    "        candidate_count, candidate_vector = get_activated_node_dense(candidate, oracle_acc, activated_node)\n",
    "        if candidate_count > best_count:\n",
    "            best_node, best_vector, best_count = candidate, candidate_vector, candidate_count\n",
    "    return best_node, best_vector, best_count\n",
    "\n",
    "def update_reliability(idx_used,train_class,labels,num_node):\n",
    "    \"\"\"Refresh the reliability of every selected node and rebuild the activation mask.\n",
    "\n",
    "    For each node in `idx_used` the new reliability is the average of\n",
    "    get_reliable_score(similarity to peer) over the selected nodes sharing its label,\n",
    "    weighted by each peer's current entry in the module-level `reliability_list`;\n",
    "    a label missing from `train_class` falls back to `oracle_acc`.\n",
    "    `reliability_list` is updated in place as the loop proceeds, so later nodes see\n",
    "    the refreshed values of earlier ones.\n",
    "\n",
    "    Returns a length-`num_node` array holding 1 for nodes that no selected node\n",
    "    activates (adj_matrix2[node] * reliability > th) and 0 for the others.\n",
    "    \"\"\"\n",
    "    activation_hits = np.zeros(num_node)\n",
    "    for node in idx_used:\n",
    "        node_label = labels[node].item()\n",
    "        if node_label not in train_class:\n",
    "            reliable_score = oracle_acc\n",
    "        else:\n",
    "            total_score = 0.0\n",
    "            reliable_score = 0\n",
    "            for peer in train_class[node_label]:\n",
    "                peer_weight = reliability_list[peer]\n",
    "                total_score += peer_weight\n",
    "                reliable_score += peer_weight*get_reliable_score(similarity_feature[node][peer])\n",
    "            reliable_score = reliable_score/total_score\n",
    "        reliability_list[node] = reliable_score\n",
    "        activation_hits += ((adj_matrix2[node]*reliable_score)>th)+0\n",
    "    return np.ones(num_node)-((activation_hits>0)+0)\n",
    "\n",
    "def my_cross_entropy(x_pred,x_traget,eps=1e-12):\n",
    "    \"\"\"Cross entropy between predicted probabilities and (weighted) target rows.\n",
    "\n",
    "    x_pred:   tensor of probabilities, one row per sample.\n",
    "    x_traget: tensor of the same shape holding the target weights.\n",
    "    eps:      lower bound applied to x_pred before the logarithm.\n",
    "\n",
    "    Returns -sum(x_traget * log(x_pred)) divided by the number of rows of x_pred.\n",
    "    \"\"\"\n",
    "    # A probability that underflows to 0 gives log(0) = -inf, and 0 * -inf is NaN,\n",
    "    # which would poison the whole loss; clamping keeps every term finite.\n",
    "    logged_x_pred = torch.log(torch.clamp(x_pred, min=eps))\n",
    "    cost_value = -torch.sum(x_traget*logged_x_pred)/x_pred.size()[0]\n",
    "    return cost_value\n",
    "\n",
    "def aug_normalized_adjacency(adj):\n",
    "    \"\"\"Add self-loops to `adj` and divide every row by its sum.\n",
    "\n",
    "    The scaling is D^-1 (A + I), applied from the left only, i.e. a row\n",
    "    normalisation rather than the symmetric D^-1/2 (A + I) D^-1/2 form.\n",
    "    Rows whose inverse sum is infinite are scaled by 0 instead.\n",
    "\n",
    "    adj: square scipy sparse matrix (anything `adj + sp.eye(n)` accepts).\n",
    "    Returns the normalised matrix in COO format.\n",
    "    \"\"\"\n",
    "    adj_self_loops = sp.coo_matrix(adj + sp.eye(adj.shape[0]))\n",
    "    degree = np.array(adj_self_loops.sum(1))\n",
    "    inv_degree = np.power(degree, -1.0).flatten()\n",
    "    inv_degree[np.isinf(inv_degree)] = 0.\n",
    "    scaled = sp.diags(inv_degree).dot(adj_self_loops)\n",
    "    return scaled.tocoo()\n",
    "\n",
    "def random_pick(some_list, probabilities):\n",
    "    \"\"\"Draw one element of `some_list` according to `probabilities`.\n",
    "\n",
    "    A single uniform draw in [0, 1] is compared against the running sum of the\n",
    "    probabilities; the first item whose cumulative probability exceeds the draw is\n",
    "    returned. If the draw is never exceeded, the last item visited is returned.\n",
    "    \"\"\"\n",
    "    draw = random.uniform(0,1)\n",
    "    running_total = 0.0\n",
    "    for item, weight in zip(some_list, probabilities):\n",
    "        running_total += weight\n",
    "        if draw < running_total:\n",
    "            return item\n",
    "    # Draw not exceeded by the cumulative sum: fall back to the last item visited.\n",
    "    return item\n",
    "\n",
    "\n",
    "def compute_cos_sim(vec_a,vec_b):\n",
    "    \"\"\"Cosine similarity: dot(vec_a, vec_b) divided by the product of their norms.\"\"\"\n",
    "    inner = vec_a.dot(vec_b.T)\n",
    "    scale = la.norm(vec_a)*la.norm(vec_b)\n",
    "    return inner/scale\n",
    "# two-layer GCN classifier\n",
    "\n",
    "class GCN(nn.Module):\n",
    "    \"\"\"Two stacked GraphConvolution layers with dropout before each of them.\n",
    "\n",
    "    nfeat / nhid / nclass: input, hidden and output widths of the two layers.\n",
    "    dropout: drop probability passed to F.dropout (only active in training mode).\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, nfeat, nhid, nclass, dropout):\n",
    "        super().__init__()\n",
    "        self.dropout = dropout\n",
    "        self.gc1 = GraphConvolution(nfeat, nhid, bias=True)\n",
    "        self.gc2 = GraphConvolution(nhid, nclass, bias=True)\n",
    "\n",
    "    def forward(self, x, adj):\n",
    "        \"\"\"Return the output of the second layer; no softmax is applied here.\"\"\"\n",
    "        hidden = F.dropout(x, p=self.dropout, training=self.training)\n",
    "        hidden = F.relu(self.gc1(hidden, adj))\n",
    "        hidden = F.dropout(hidden, p=self.dropout, training=self.training)\n",
    "        return self.gc2(hidden, adj)\n",
    "   \n",
    "def train(epoch, model,record):\n",
    "    \"\"\"Run one optimisation step, then record validation and test accuracy.\n",
    "\n",
    "    epoch:  index of the current epoch (not used by the body).\n",
    "    model:  network called as model(features_GCN, adj).\n",
    "    record: dict updated in place with record[val_accuracy] = test_accuracy; an\n",
    "            epoch that reaches an already recorded validation accuracy overwrites it.\n",
    "\n",
    "    Reads the module-level optimizer, features_GCN, adj, labels, reliability_list\n",
    "    and the idx_train / idx_val / idx_test index tensors.\n",
    "    \"\"\"\n",
    "    model.train()\n",
    "    optimizer.zero_grad()\n",
    "    output = model(features_GCN, adj)\n",
    "    output_ = F.softmax(output,dim=1)\n",
    "    # One-hot targets scaled row-wise by each node's reliability.\n",
    "    one_hot_labels = F.one_hot(labels, num_classes=num_class)\n",
    "    weight_one_hot_labels = torch.mul(one_hot_labels,reliability_list)\n",
    "    loss_train = my_cross_entropy(output_[idx_train],weight_one_hot_labels[idx_train])\n",
    "    loss_train.backward()\n",
    "    optimizer.step()\n",
    "\n",
    "    # Evaluation only needs predictions, so no autograd graph is built for it.\n",
    "    # The former validation/test nll_loss values were taken on raw scores and never\n",
    "    # used, so they are no longer computed.\n",
    "    model.eval()\n",
    "    with torch.no_grad():\n",
    "        output = model(features_GCN, adj)\n",
    "        acc_val = accuracy(output[idx_val], labels[idx_val])\n",
    "        acc_test = accuracy(output[idx_test], labels[idx_test])\n",
    "    record[acc_val.item()] = acc_test.item()\n",
    "#read data\n",
    "adj, features, labels, idx_train, idx_val, idx_test = load_data(dataset=\"cora\")\n",
    "\n",
    "reliability_list = np.ones(num_node)    # per-node reliability, starts at 1\n",
    "num_zeros = np.zeros(num_node)\n",
    "num_ones = np.ones(num_node)\n",
    "# Lists of 0-d tensors: labels are overwritten element by element further down.\n",
    "labels = list(labels.cpu())\n",
    "idx_val = list(idx_val.cpu())\n",
    "idx_test = list(idx_test.cpu())\n",
    "# Membership is tested against a set of plain ints; scanning the tensor lists with\n",
    "# `in` would compare every node id against every validation/test tensor.\n",
    "idx_held_out = {int(i) for i in idx_val} | {int(i) for i in idx_test}\n",
    "# Nodes that may be selected for training: everything outside validation and test.\n",
    "idx_avaliable = [i for i in range(num_node) if i not in idx_held_out]\n",
    "\n",
    "# add noise\n",
    "# Row r of prob_list is the distribution of the reported label when the stored label\n",
    "# is r: oracle_acc on the diagonal, the remainder spread evenly over the other classes.\n",
    "label_list = list(range(num_class))\n",
    "flip_prob = (1-oracle_acc)/(num_class-1)\n",
    "prob_list = [[oracle_acc if col == row else flip_prob for col in range(num_class)]\n",
    "             for row in range(num_class)]\n",
    "# Only selectable nodes get a resampled label; validation and test labels stay as loaded.\n",
    "for idx in idx_avaliable:\n",
    "    current_label = labels[idx].item()\n",
    "    labels[idx] = torch.tensor(random_pick(label_list, prob_list[current_label]))\n",
    "\n",
    "#compute normalized distance\n",
    "# Normalised adjacency with self-loops (still a scipy sparse matrix here).\n",
    "adj = aug_normalized_adjacency(adj)\n",
    "# Dense GPU copy and its square, i.e. the two-step propagation weights.\n",
    "adj_matrix = torch.FloatTensor(adj.todense()).cuda()\n",
    "adj_matrix2_gpu = torch.mm(adj_matrix, adj_matrix)\n",
    "features_gpu = features.cuda()\n",
    "# Features propagated over two steps, brought back to numpy for the similarity stage.\n",
    "aax_feature = np.array(torch.mm(adj_matrix2_gpu, features_gpu).cpu())\n",
    "adj_matrix2 = np.array(adj_matrix2_gpu.cpu())\n",
    "features = features_gpu.cpu()\n",
    "del adj_matrix2_gpu, features_gpu    # the GPU temporaries are not needed afterwards\n",
    "# Inputs of the GCN itself: sparse normalised adjacency and float features on the GPU.\n",
    "adj = sparse_mx_to_torch_sparse_tensor(adj).float().cuda()\n",
    "features_GCN = torch.FloatTensor(copy.deepcopy(features)).cuda()\n",
    "\n",
    "# Pairwise cosine similarity of the propagated features, computed with one matrix\n",
    "# product instead of a Python loop over all num_node*(num_node-1)/2 pairs.\n",
    "node_emb = np.asarray(aax_feature[:num_node], dtype=np.float64)\n",
    "node_norm = la.norm(node_emb, axis=1)\n",
    "similarity_feature = node_emb.dot(node_emb.T)/np.outer(node_norm, node_norm)\n",
    "np.fill_diagonal(similarity_feature, 1.0)    # self-similarity is exactly 1\n",
    "# Min-max rescale to [0, 1]; update_reliability feeds these values to get_reliable_score.\n",
    "sim_min = np.min(similarity_feature)\n",
    "dis_range = np.max(similarity_feature) - sim_min\n",
    "similarity_feature = (similarity_feature - sim_min)/dis_range\n",
    "\n",
    "#choose nodes\n",
    "# Greedy selection: repeatedly take the available node that activates the most\n",
    "# not-yet-activated nodes, until the budget is spent or nothing new can be activated.\n",
    "print(\"node selection begin\")\n",
    "activated_node = np.ones(num_node)    # 1 = not activated yet\n",
    "idx_train = []\n",
    "train_class = dict()                  # label -> selected nodes carrying that label\n",
    "idx_avaliable_temp = copy.deepcopy(idx_avaliable)\n",
    "count = 0\n",
    "while True:\n",
    "    max_ral_node,max_activated_node,max_activated_num = get_max_reliable_info_node_dense(idx_train,idx_avaliable_temp,activated_node,train_class,labels)\n",
    "    if max_activated_num <= 0:\n",
    "        # No candidate activates anything: the returned node is only the placeholder 0,\n",
    "        # which may be a validation/test node or already selected, so stop before\n",
    "        # it is appended to the training set or removed from the candidate lists.\n",
    "        break\n",
    "    idx_train.append(max_ral_node)\n",
    "    idx_avaliable.remove(max_ral_node)\n",
    "    idx_avaliable_temp.remove(max_ral_node)\n",
    "    node_label = labels[max_ral_node].item()\n",
    "    train_class.setdefault(node_label, []).append(max_ral_node)\n",
    "    count += 1\n",
    "    if count%batch_size == 0:\n",
    "        activated_node = update_reliability(idx_train,train_class,labels,num_node)\n",
    "    # NOTE(review): on refresh steps update_reliability has already processed\n",
    "    # max_ral_node, so this subtraction can push entries to -1 - confirm this is intended.\n",
    "    activated_node = activated_node - max_activated_node\n",
    "    if count >= num_coreset:\n",
    "        break\n",
    "print(\"node selection end\")\n",
    "\n",
    "# Move labels, index sets and reliabilities to the GPU for training.\n",
    "labels = torch.LongTensor(labels).cuda()\n",
    "idx_train = torch.LongTensor(idx_train).cuda()\n",
    "idx_val = torch.LongTensor(idx_val).cuda()\n",
    "idx_test = torch.LongTensor(idx_test).cuda()\n",
    "# Column vector so it scales the one-hot label rows inside train().\n",
    "reliability_list = torch.FloatTensor(reliability_list).unsqueeze(1).cuda()\n",
    "\n",
    "#train\n",
    "print('xxxxxxxxxx Evaluation begin xxxxxxxxxx')\n",
    "t_total = time.time()\n",
    "record = {}    # validation accuracy -> test accuracy, filled by train()\n",
    "model = GCN(\n",
    "    nfeat=features_GCN.shape[1],\n",
    "    nhid=hidden_size,\n",
    "    nclass=labels.max().item() + 1,\n",
    "    dropout=0.85,\n",
    ")\n",
    "model.cuda()\n",
    "optimizer = optim.Adam(model.parameters(), lr=0.05, weight_decay=5e-4)\n",
    "for epoch in range(400):\n",
    "    train(epoch,model,record)\n",
    "\n",
    "# Report the ten best validation accuracies with the test accuracy recorded for each.\n",
    "bit_list = sorted(record.keys(), reverse=True)\n",
    "for key in bit_list[:10]:\n",
    "    print(key,record[key])\n",
    "print('xxxxxxxxxx Evaluation end xxxxxxxxxx')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
