{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "bronze-chamber",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os.path\n",
    "import os\n",
    "import torch\n",
    "import torch.nn.functional as F\n",
    "import torch.nn as nn\n",
    "import torch_geometric.transforms as T\n",
    "from torch_geometric.datasets import Planetoid,TUDataset\n",
    "from torch_geometric.nn import GATConv, GCNConv,GINConv\n",
    "import numpy as np\n",
    "import igraph as ig\n",
    "from torch_geometric.nn import global_mean_pool,global_add_pool\n",
    "from functools import reduce\n",
    "import pickle\n",
    "from sklearn.model_selection import KFold,StratifiedKFold\n",
    "import torch.optim as optim\n",
    "import csv\n",
    "import pandas as pd\n",
    "from torch_geometric.loader import DataLoader\n",
    "from torch_geometric.utils import degree\n",
    "from torch_geometric.utils.convert import to_networkx\n",
    "\n",
    "import networkx as nx\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "dense-wyoming",
   "metadata": {},
   "outputs": [],
   "source": [
    "import utils"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fallen-advance",
   "metadata": {},
   "source": [
    "# 1. Load dataset\n",
    "\n",
    "### A. dataset_name\n",
    "\n",
    "Supported values: `ENZYMES`, `PROTEINS_full`.\n",
    "\n",
    "### B. Check data information"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "julian-incident",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloading https://www.chrsmrrs.com/graphkerneldatasets/PROTEINS_full.zip\n",
      "Extracting dataset/PROTEINS_full/PROTEINS_full.zip\n",
      "Processing...\n",
      "Done!\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Dataset: PROTEINS_full(1113):\n",
      "====================\n",
      "Number of graphs: 1113\n",
      "Number of features: 32\n",
      "Number of classes: 2\n",
      "=============================================================\n",
      "AVERAGE # H1 CYCLES: 34.83647798742138\n",
      "AVERAGE MAGNITUDE # CYCLES: 3.7217758564407077\n",
      "# GRAPH WITH CYCLES: 1112\n"
     ]
    }
   ],
   "source": [
    "# Load the TU benchmark, including its continuous node attributes.\n",
    "dataset_name = 'PROTEINS_full'\n",
    "dataset = TUDataset(root='dataset', name=dataset_name, use_node_attr=True)\n",
    "\n",
    "print()\n",
    "print(f'Dataset: {dataset}:')\n",
    "print('====================')\n",
    "print(f'Number of graphs: {len(dataset)}')\n",
    "print(f'Number of features: {dataset.num_features}')\n",
    "print(f'Number of classes: {dataset.num_classes}')\n",
    "\n",
    "print('=============================================================')\n",
    "# Cycle statistics, computed on the undirected version of every graph.\n",
    "total_cycle = 0\n",
    "total_magnitude_cycle = 0\n",
    "nonzero_count = 0\n",
    "for graph_idx in range(len(dataset)):\n",
    "    Xgraph = to_networkx(dataset[graph_idx], to_undirected=True)\n",
    "    # Number of independent cycles: |E| - |V| + number of connected components.\n",
    "    total_cycle += (\n",
    "        Xgraph.number_of_edges()\n",
    "        - Xgraph.number_of_nodes()\n",
    "        + nx.number_connected_components(Xgraph)\n",
    "    )\n",
    "    node_each_cycle = nx.cycle_basis(Xgraph)\n",
    "    # Graphs without any cycle contribute 0 to the magnitude total.\n",
    "    if node_each_cycle:\n",
    "        nonzero_count += 1\n",
    "        total_magnitude_cycle += sum(len(cycle) for cycle in node_each_cycle) / len(node_each_cycle)\n",
    "\n",
    "# Both averages are taken over all graphs, including those without cycles.\n",
    "avg_total_cycle = total_cycle / len(dataset)\n",
    "avg_total_magnitude = total_magnitude_cycle / len(dataset)\n",
    "print(f'AVERAGE # H1 CYCLES: {avg_total_cycle}')\n",
    "print(f'AVERAGE MAGNITUDE # CYCLES: {avg_total_magnitude}')\n",
    "print(f'# GRAPH WITH CYCLES: {nonzero_count}')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "subsequent-water",
   "metadata": {},
   "source": [
    "# 2. Preprocessing\n",
    "## A. Normalize \n",
    "- ENZYMES,PROTEINS_full (normalize=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "incredible-warehouse",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the project's preprocessing with normalize=True and rebind `dataset` to the result.\n",
    "# NOTE(review): `dataset` is overwritten, so re-running this cell alone feeds already-processed\n",
    "# data back into utils.data_load - confirm that helper tolerates it, or re-run the loading cell first.\n",
    "dataset=utils.data_load(dataset,normalize=True)\n",
    "# max_node is used below as the side length of the dense adjacency matrices.\n",
    "max_node=utils.max_node_dataset(dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "dramatic-february",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "100\n",
      "200\n",
      "300\n",
      "400\n",
      "500\n",
      "600\n",
      "700\n",
      "800\n",
      "900\n",
      "1000\n",
      "1100\n"
     ]
    }
   ],
   "source": [
    "# Build (or load from cache) one graph object per sample, extended with:\n",
    "#   data.xx          - node features without their last 3 columns\n",
    "#   data.cycle_index - (2, E) index tensor of the non-zero entries of the cycle adjacency\n",
    "file = f'./dataset/{dataset_name}/H1_ver2'\n",
    "\n",
    "if os.path.isfile(file):\n",
    "    # NOTE(review): torch.load unpickles arbitrary objects - only load cache files created locally.\n",
    "    NEWDATA = torch.load(file)\n",
    "    print('file')\n",
    "\n",
    "else:\n",
    "    NEWDATA = []\n",
    "    for i in range(len(dataset)):\n",
    "        data = dataset[i]\n",
    "        v1 = data.edge_index[0, :]\n",
    "        v2 = data.edge_index[1, :]\n",
    "\n",
    "        # Dense adjacency, zero-padded to max_node x max_node.\n",
    "        adj = torch.zeros((max_node, max_node))\n",
    "        adj[v1, v2] = 1\n",
    "        list_adj = adj.numpy()\n",
    "        # This symmetry test used to be evaluated and thrown away; report violations instead.\n",
    "        if not (list_adj == list_adj.T).all():\n",
    "            print(f'warning: adjacency of graph {i} is not symmetric')\n",
    "\n",
    "        _, _, _, _, sum_sub_adj = utils.make_cycle_adj_speed_nosl(list_adj, data)\n",
    "\n",
    "        if i % 100 == 0:\n",
    "            print(i)\n",
    "\n",
    "        # An empty result from the helper is replaced by an all-zero cycle adjacency.\n",
    "        if len(sum_sub_adj) == 0:\n",
    "            sum_sub_adj = np.zeros_like(list_adj)\n",
    "        # new_adj[0] is the graph adjacency, new_adj[1] the cycle adjacency.\n",
    "        new_adj = np.stack((list_adj, sum_sub_adj), 0)\n",
    "\n",
    "        # Sanity check: edge_index must list exactly the non-zero entries of the dense adjacency,\n",
    "        # in row-major order. Compared element-wise, because a sum of signed differences can\n",
    "        # cancel out to zero even when the two index lists differ.\n",
    "        adj_rows, adj_cols = np.where(new_adj[0] == 1)\n",
    "        edge_index_np = data.edge_index.cpu().numpy()\n",
    "        if not (np.array_equal(edge_index_np[0], adj_rows) and np.array_equal(edge_index_np[1], adj_cols)):\n",
    "            print('error')\n",
    "\n",
    "        # ------ merge the new fields into the graph object\n",
    "        data.xx = data.x[:, :-3]\n",
    "        cycle_rows, cycle_cols = np.where(new_adj[1] != 0)\n",
    "        data.cycle_index = torch.stack(\n",
    "            (torch.as_tensor(cycle_rows, dtype=torch.long), torch.as_tensor(cycle_cols, dtype=torch.long)), 0\n",
    "        ).contiguous()\n",
    "        # Disabled: per-edge cycle weights (would have to be a FloatTensor).\n",
    "        #data.cycle_attr = torch.FloatTensor(new_adj[1][cycle_rows, cycle_cols])\n",
    "        NEWDATA.append(data)\n",
    "\n",
    "    torch.save(NEWDATA, file)\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "hydraulic-steal",
   "metadata": {},
   "source": [
    "## C. stratified 10-fold"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "insured-dining",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "folder_on\n",
      "0\n",
      "1\n",
      "2\n",
      "3\n",
      "4\n",
      "5\n",
      "6\n",
      "7\n",
      "8\n",
      "9\n"
     ]
    }
   ],
   "source": [
    "# Graph labels as a flat integer array; they are only used to stratify the folds.\n",
    "# Assumes every graph carries exactly one label in `y`.\n",
    "dataset_class = np.array([int(dataset[i].y) for i in range(len(dataset))])\n",
    "\n",
    "N_FOLDS = 10     # number of outer (test) folds and of inner (validation) folds\n",
    "SPLIT_SEED = 0   # fixed seed so that regenerated splits are reproducible\n",
    "\n",
    "folder = f'./dataset/{dataset_name}/kfold_data'\n",
    "if os.path.isdir(folder):\n",
    "    print('folder_on')\n",
    "\n",
    "    # Splits already exist on disk: check that every (train, valid, test) triple covers the dataset.\n",
    "    for j in range(N_FOLDS):\n",
    "        print(j)\n",
    "        test_index = torch.as_tensor(np.loadtxt(f'{folder}/test_idx-{j}.txt', dtype=np.int32), dtype=torch.long)\n",
    "        for k in range(N_FOLDS):\n",
    "            train_index = torch.as_tensor(np.loadtxt(f'{folder}/train_total_{j}/train_idx-{k}.txt', dtype=np.int32), dtype=torch.long)\n",
    "            valid_index = torch.as_tensor(np.loadtxt(f'{folder}/train_total_{j}/valid_idx-{k}.txt', dtype=np.int32), dtype=torch.long)\n",
    "            all_index = reduce(np.union1d, (train_index, valid_index, test_index))\n",
    "            assert len(dataset) == len(all_index)\n",
    "\n",
    "else:\n",
    "    os.makedirs(folder)\n",
    "    kkf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SPLIT_SEED)\n",
    "    kkf2 = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SPLIT_SEED)\n",
    "    print(kkf)\n",
    "\n",
    "    # StratifiedKFold only inspects the labels, so a zero placeholder of matching length stands\n",
    "    # in for the graphs; this avoids packing graph objects into a numpy array.\n",
    "    outer_split = kkf.split(np.zeros(len(dataset_class)), dataset_class)\n",
    "    for j, (train_total_index, test_index) in enumerate(outer_split):\n",
    "        np.savetxt(f'{folder}/train_total_idx-{j}.txt', train_total_index.astype(np.int64), fmt='%i', delimiter='\\t')\n",
    "        np.savetxt(f'{folder}/test_idx-{j}.txt', test_index.astype(np.int64), fmt='%i', delimiter='\\t')\n",
    "        assert len(dataset) == len(reduce(np.union1d, (test_index, train_total_index)))\n",
    "        os.mkdir(f'{folder}/train_total_{j}')\n",
    "\n",
    "        # Inner split: divide this fold's training part into train / validation indices.\n",
    "        dataset_class_train = dataset_class[train_total_index]\n",
    "        inner_split = kkf2.split(np.zeros(len(train_total_index)), dataset_class_train)\n",
    "        for k, (ii, jj) in enumerate(inner_split):\n",
    "            # ii / jj are positions inside train_total_index; map them back to dataset indices.\n",
    "            train_index = train_total_index[ii]\n",
    "            valid_index = train_total_index[jj]\n",
    "            np.savetxt(f'{folder}/train_total_{j}/valid_idx-{k}.txt', valid_index.astype(np.int64), fmt='%i', delimiter='\\t')\n",
    "            np.savetxt(f'{folder}/train_total_{j}/train_idx-{k}.txt', train_index.astype(np.int64), fmt='%i', delimiter='\\t')\n",
    "            assert len(train_total_index) == len(reduce(np.union1d, (valid_index, train_index)))\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cleared-memory",
   "metadata": {},
   "source": [
    "# 3. Train & Test\n",
    "\n",
    "### Baseline-GNNs\n",
    "- from nets import GCN, GAT, GIN\n",
    "- Option Cy2C=False\n",
    "\n",
    "### Cy2C-GNNs\n",
    "- from nets import Cy2C_GCN, Cy2C_GAT, Cy2C_GIN\n",
    "- Option Cy2C=True(default)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "emotional-shoot",
   "metadata": {},
   "source": [
    "## A. Cy2C-GNNs "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "supposed-hundred",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:root:The OGB package is out of date. Your version is 1.3.3, while the latest version is 1.3.4.\n"
     ]
    }
   ],
   "source": [
    "from nets import Cy2C_GCN, Cy2C_GAT, Cy2C_GIN\n",
    "from Trainer_part import Trainer\n",
    "\n",
    "# Preferred GPU index. 'cuda:2' is an invalid device on machines with fewer than three GPUs,\n",
    "# so fall back to the first GPU, and to the CPU when CUDA is unavailable.\n",
    "GPU_INDEX = 2\n",
    "if torch.cuda.device_count() > GPU_INDEX:\n",
    "    device = torch.device(f'cuda:{GPU_INDEX}')\n",
    "elif torch.cuda.is_available():\n",
    "    device = torch.device('cuda:0')\n",
    "else:\n",
    "    device = torch.device('cpu')\n",
    "\n",
    "# model_name[i] is the label used in run names for class_name[i]; keep both lists aligned.\n",
    "model_name=['Cy2C_GCN', 'Cy2C_GAT', 'Cy2C_GIN']\n",
    "class_name=[Cy2C_GCN, Cy2C_GAT, Cy2C_GIN]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "configured-malpractice",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Learning rate and batch size shared by every run of the Cy2C grid search.\n",
    "# The next line is a run name kept by the author as a note on an earlier result.\n",
    "#(BF)Cy2C_GAT_3_drop0.0(0.3)_deacy1e-05_besttest_0.8035714285714286_0.0357142857142857_finaltest_0.7767857142857143_0.008928571428571397\n",
    "lr=1e-3\n",
    "batch_size=32"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "stuck-angola",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hyper-parameter grid, enumerated in the same order as the equivalent nested loops:\n",
    "# model -> decay -> hidden_dim -> drop_ini -> drop_mid -> n_layer.\n",
    "# NOTE(review): drop_ini only appears in the run name; it is never passed to Trainer,\n",
    "# so runs that differ only in drop_ini use the same Trainer arguments.\n",
    "cy2c_grid = [\n",
    "    (i, CLASS_NAME, decay, hidden_dim, drop_ini, drop_mid, n_layer)\n",
    "    for i, CLASS_NAME in enumerate(class_name)\n",
    "    for decay in [0.0, 0.0001]\n",
    "    for hidden_dim in [64,128]\n",
    "    for drop_ini in [0.0,0.2,0.4]\n",
    "    for drop_mid in [0.0, 0.2, 0.4]\n",
    "    for n_layer in [1,2,3,4,5]\n",
    "]\n",
    "\n",
    "for i, CLASS_NAME, decay, hidden_dim, drop_ini, drop_mid, n_layer in cy2c_grid:\n",
    "    print(n_layer)\n",
    "    name=f'(10FOLD){model_name[i]}_{n_layer}({hidden_dim})_drop{drop_ini}({drop_mid})_deacy{decay}'\n",
    "    print('=====================================')\n",
    "    print('=====',name,'=====',dataset_name,'=====')\n",
    "    print('=====================================')\n",
    "    # The input width is num_node_features-3, matching data.xx (features without the last 3 columns).\n",
    "    trainer = Trainer(\n",
    "        name, dataset_name, NEWDATA, device, CLASS_NAME,\n",
    "        dataset.num_node_features-3, dataset.num_classes,\n",
    "        lr=lr, hidden_dim=hidden_dim, n_layer=n_layer, num_workers=2,\n",
    "        drop_mid=drop_mid, small_fold=1, batch_size=batch_size,\n",
    "        decay=decay, main_fold=10, Cy2C=True,\n",
    "    )\n",
    "    trainer.train()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "continent-minnesota",
   "metadata": {},
   "source": [
    "## B. Baseline-GNNs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "thousand-wagner",
   "metadata": {},
   "outputs": [],
   "source": [
    "from nets import GCN, GAT, GIN\n",
    "from Trainer_part import Trainer\n",
    "\n",
    "# Preferred GPU index. 'cuda:2' is an invalid device on machines with fewer than three GPUs,\n",
    "# so fall back to the first GPU, and to the CPU when CUDA is unavailable.\n",
    "GPU_INDEX = 2\n",
    "if torch.cuda.device_count() > GPU_INDEX:\n",
    "    device = torch.device(f'cuda:{GPU_INDEX}')\n",
    "elif torch.cuda.is_available():\n",
    "    device = torch.device('cuda:0')\n",
    "else:\n",
    "    device = torch.device('cpu')\n",
    "\n",
    "# model_name[i] is the label used in run names for class_name[i]; keep both lists aligned.\n",
    "model_name=['GCN', 'GAT', 'GIN']\n",
    "class_name=[GCN, GAT, GIN]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "pursuant-modem",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Learning rate and batch size shared by every run of the baseline grid search.\n",
    "# Note that this rebinds `lr`, replacing the value set for the Cy2C runs.\n",
    "lr = 1e-4\n",
    "batch_size=32"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "responsible-judge",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\n",
      "=====================================\n",
      "===== GCN_1(136)_drop0.0(0.0)_deacy0.0 ===== PROTEINS_full =====\n",
      "=====================================\n",
      "load mainfold, subfold== 0 0\n"
     ]
    }
   ],
   "source": [
    "# Baseline grid, enumerated in the same order as the equivalent nested loops:\n",
    "# model -> decay -> hidden_dim -> drop_ini -> drop_mid -> n_layer.\n",
    "# NOTE(review): drop_ini only appears in the run name; it is never passed to Trainer.\n",
    "baseline_grid = [\n",
    "    (i, CLASS_NAME, decay, hidden_dim, drop_ini, drop_mid, n_layer)\n",
    "    for i, CLASS_NAME in enumerate(class_name)\n",
    "    for decay in [0.0]\n",
    "    for hidden_dim in [136]\n",
    "    for drop_ini in [0.0]\n",
    "    for drop_mid in [0.0]\n",
    "    for n_layer in [1,2,3,4,5]\n",
    "]\n",
    "\n",
    "for i, CLASS_NAME, decay, hidden_dim, drop_ini, drop_mid, n_layer in baseline_grid:\n",
    "    print(n_layer)\n",
    "    name=f'{model_name[i]}_{n_layer}({hidden_dim})_drop{drop_ini}({drop_mid})_deacy{decay}'\n",
    "    print('=====================================')\n",
    "    print('=====',name,'=====',dataset_name,'=====')\n",
    "    print('=====================================')\n",
    "    # The input width is num_node_features-3, matching data.xx (features without the last 3 columns).\n",
    "    trainer = Trainer(\n",
    "        name, dataset_name, NEWDATA, device, CLASS_NAME,\n",
    "        dataset.num_node_features-3, dataset.num_classes,\n",
    "        lr=lr, hidden_dim=hidden_dim, n_layer=n_layer, num_workers=2,\n",
    "        drop_mid=drop_mid, small_fold=1, batch_size=batch_size,\n",
    "        decay=decay, main_fold=10, Cy2C=False,\n",
    "    )\n",
    "    trainer.train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fifth-playing",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "GYY",
   "language": "python",
   "name": "gyy"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
