{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Creates _s.npy, _X.npy _y.npy, label_*.npy of the cora dataset with sensitive attribute of feature w_1177."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import networkx as nx\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "edgelist = pd.read_csv(\"cora.cites\", sep='\\t', header=None, names=[\"target\", \"source\"])\n",
    "edgelist[\"label\"] = \"cites\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "Gnx = nx.from_pandas_edgelist(edgelist, edge_attr=\"label\")\n",
    "nx.set_node_attributes(Gnx, \"paper\", \"label\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'label': 'paper'}"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Gnx.nodes[1103985]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "feature_names = [\"w_{}\".format(ii) for ii in range(1433)]\n",
    "column_names =  feature_names + [\"subject\"]\n",
    "node_data = pd.read_csv(\"cora.content\", sep='\\t', header=None, names=column_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>w_0</th>\n",
       "      <th>w_1</th>\n",
       "      <th>w_2</th>\n",
       "      <th>w_3</th>\n",
       "      <th>w_4</th>\n",
       "      <th>w_5</th>\n",
       "      <th>w_6</th>\n",
       "      <th>w_7</th>\n",
       "      <th>w_8</th>\n",
       "      <th>w_9</th>\n",
       "      <th>...</th>\n",
       "      <th>w_1424</th>\n",
       "      <th>w_1425</th>\n",
       "      <th>w_1426</th>\n",
       "      <th>w_1427</th>\n",
       "      <th>w_1428</th>\n",
       "      <th>w_1429</th>\n",
       "      <th>w_1430</th>\n",
       "      <th>w_1431</th>\n",
       "      <th>w_1432</th>\n",
       "      <th>subject</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Genetic_Algorithms</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1 rows × 1434 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    w_0  w_1  w_2  w_3  w_4  w_5  w_6  w_7  w_8  w_9  ...  w_1424  w_1425  \\\n",
       "35    0    0    0    0    0    0    0    0    0    0  ...       0       0   \n",
       "\n",
       "    w_1426  w_1427  w_1428  w_1429  w_1430  w_1431  w_1432             subject  \n",
       "35       0       0       0       0       0       0       0  Genetic_Algorithms  \n",
       "\n",
       "[1 rows x 1434 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "node_data[node_data.index==35]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Case_Based',\n",
       " 'Genetic_Algorithms',\n",
       " 'Neural_Networks',\n",
       " 'Probabilistic_Methods',\n",
       " 'Reinforcement_Learning',\n",
       " 'Rule_Learning',\n",
       " 'Theory'}"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "set(node_data[\"subject\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "31336      2\n",
      "1061127    5\n",
      "1106406    4\n",
      "13195      4\n",
      "37879      3\n",
      "          ..\n",
      "1128975    1\n",
      "1128977    1\n",
      "1128978    1\n",
      "117328     0\n",
      "24043      2\n",
      "Name: subject, Length: 2708, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "from sklearn.preprocessing import LabelEncoder\n",
    "le = LabelEncoder()\n",
    "node_data['subject'] = le.fit_transform(node_data['subject'])\n",
    "node_data['subject'] = node_data['subject'].astype('int64')\n",
    "print(node_data['subject'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index([  31336, 1061127, 1106406,   13195,   37879, 1126012, 1107140, 1102850,\n",
       "         31349, 1106418,\n",
       "       ...\n",
       "        626531, 1131180, 1130454, 1131184, 1128974, 1128975, 1128977, 1128978,\n",
       "        117328,   24043],\n",
       "      dtype='int64', length=2708)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nodes = node_data.index.unique()\n",
    "nodes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0., 0., 0., ..., 0., 0., 0.],\n",
       "       [0., 0., 0., ..., 0., 0., 0.],\n",
       "       [0., 0., 0., ..., 0., 0., 0.],\n",
       "       ...,\n",
       "       [0., 0., 0., ..., 0., 0., 0.],\n",
       "       [0., 0., 0., ..., 0., 0., 0.],\n",
       "       [0., 0., 0., ..., 0., 0., 0.]])"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "node_to_index = {node: index for index, node in enumerate(nodes)}\n",
    "\n",
    "# The matrix is symmetric\n",
    "coassoc_matrix = np.zeros((len(nodes), len(nodes)))\n",
    "\n",
    "for index, edge in edgelist.drop([\"label\"], axis=1).iterrows():\n",
    "    if edge['source'] in node_to_index and edge['target'] in node_to_index:\n",
    "        i = node_to_index[edge['source']]\n",
    "        j = node_to_index[edge['target']]\n",
    "        coassoc_matrix[i, j] = 1\n",
    "        coassoc_matrix[j, i] = 1  \n",
    "\n",
    "coassoc_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([2, 5, 4, ..., 1, 0, 2], dtype=int64)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "subject_array = node_data['subject'].values\n",
    "np.save('cora_y.npy', subject_array)\n",
    "subject_array"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.save('cora_coassoc.npy', coassoc_matrix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0, 0, 0, ..., 0, 0, 0],\n",
       "       [0, 0, 0, ..., 0, 0, 0],\n",
       "       [0, 0, 0, ..., 0, 0, 0],\n",
       "       ...,\n",
       "       [0, 0, 0, ..., 0, 0, 0],\n",
       "       [0, 0, 0, ..., 0, 0, 0],\n",
       "       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "node_data_X = node_data.drop(columns=['subject', 'w_1177'])\n",
    "np.save('cora_X.npy', node_data_X.values)\n",
    "node_data_X.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import LabelEncoder\n",
    "\n",
    "label_encoder = LabelEncoder()\n",
    "y_ravel = node_data['subject'].values.ravel()\n",
    "y_labels = label_encoder.fit_transform(y_ravel)\n",
    "np.save('labels_cora.npy', y_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 1, 1, ..., 1, 0, 0], dtype=int64)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.save('cora_s.npy', node_data['w_1177'].values)\n",
    "node_data['w_1177'].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{0, 1, 2, 3, 4, 5, 6}"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "set(node_data[\"subject\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "31336      2\n",
       "1061127    5\n",
       "1106406    4\n",
       "13195      4\n",
       "37879      3\n",
       "          ..\n",
       "1128975    1\n",
       "1128977    1\n",
       "1128978    1\n",
       "117328     0\n",
       "24043      2\n",
       "Name: subject, Length: 2708, dtype: int64"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "node_data[\"subject\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "w_19:\n",
      "w_19\n",
      "0    2148\n",
      "1     560\n",
      "Name: count, dtype: int64\n",
      "Skipping w_444\n",
      "w_507:\n",
      "w_507\n",
      "0    2032\n",
      "1     676\n",
      "Name: count, dtype: int64\n",
      "w_1177:\n",
      "w_1177\n",
      "0    1625\n",
      "1    1083\n",
      "Name: count, dtype: int64\n",
      "w_1209:\n",
      "w_1209\n",
      "0    2124\n",
      "1     584\n",
      "Name: count, dtype: int64\n",
      "w_1263:\n",
      "w_1263\n",
      "0    1728\n",
      "1     980\n",
      "Name: count, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "for column in node_data.drop([\"subject\"],axis=1).columns:\n",
    "    try:\n",
    "        counts = node_data[column].value_counts().loc[[0, 1]]\n",
    "        if all(counts >= 500):\n",
    "            print(f\"{column}:\\n{counts}\")\n",
    "    except:\n",
    "        print(f\"Skipping {column}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "158172     0\n",
       "1130780    0\n",
       "1108570    0\n",
       "153598     0\n",
       "310530     0\n",
       "          ..\n",
       "14062      1\n",
       "595056     1\n",
       "37879      1\n",
       "976284     1\n",
       "592986     1\n",
       "Name: w_1177, Length: 1000, dtype: int64"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sample 500 rows where \"w_1177\" is 0\n",
    "zeros_sample = node_data[node_data['w_1177'] == 0].sample(n=500)\n",
    "\n",
    "# Sample 500 rows where \"w_1177\" is 1\n",
    "ones_sample = node_data[node_data['w_1177'] == 1].sample(n=500)\n",
    "\n",
    "# Concatenate the two samples\n",
    "sampled_data = pd.concat([zeros_sample, ones_sample])\n",
    "\n",
    "sampled_data['w_1177']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index([ 158172, 1130780, 1108570,  153598,  310530, 1106849,    1688,  157401,\n",
       "       1132948, 1128881,\n",
       "       ...\n",
       "        321861,   25794, 1125944,  919885,      40,   14062,  595056,   37879,\n",
       "        976284,  592986],\n",
       "      dtype='int64', length=1000)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nodes = sampled_data.index.unique()\n",
    "nodes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0., 0., 0., ..., 0., 0., 0.],\n",
       "       [0., 0., 0., ..., 0., 0., 0.],\n",
       "       [0., 0., 0., ..., 0., 0., 0.],\n",
       "       ...,\n",
       "       [0., 0., 0., ..., 0., 0., 0.],\n",
       "       [0., 0., 0., ..., 0., 0., 0.],\n",
       "       [0., 0., 0., ..., 0., 0., 0.]])"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "node_to_index = {node: index for index, node in enumerate(nodes)}\n",
    "\n",
    "# The matrix is symmetric\n",
    "coassoc_matrix = np.zeros((len(nodes), len(nodes)))\n",
    "\n",
    "for index, edge in edgelist.drop([\"label\"], axis=1).iterrows():\n",
    "    if edge['source'] in node_to_index and edge['target'] in node_to_index:\n",
    "        i = node_to_index[edge['source']]\n",
    "        j = node_to_index[edge['target']]\n",
    "        coassoc_matrix[i, j] = 1\n",
    "        coassoc_matrix[j, i] = 1  \n",
    "\n",
    "coassoc_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1426"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.count_nonzero(coassoc_matrix == 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "998574"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.count_nonzero(coassoc_matrix == 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([3, 3, 6, 4, 0, 2, 1, 2, 2, 5, 1, 2, 1, 0, 5, 2, 6, 0, 1, 2, 1, 3,\n",
       "       0, 1, 3, 4, 2, 4, 2, 6, 2, 2, 3, 2, 5, 0, 2, 2, 2, 4, 6, 2, 0, 5,\n",
       "       1, 2, 3, 2, 4, 5, 3, 4, 6, 3, 2, 6, 3, 3, 3, 0, 3, 0, 4, 5, 1, 3,\n",
       "       2, 4, 0, 2, 0, 1, 6, 1, 1, 2, 2, 3, 5, 6, 3, 3, 2, 6, 1, 3, 2, 0,\n",
       "       3, 1, 3, 0, 4, 2, 3, 0, 2, 2, 2, 2, 0, 6, 6, 3, 3, 2, 2, 1, 3, 1,\n",
       "       6, 1, 2, 2, 2, 6, 2, 3, 1, 1, 1, 0, 3, 3, 2, 0, 3, 2, 2, 2, 2, 2,\n",
       "       2, 0, 1, 4, 3, 1, 5, 2, 2, 1, 3, 1, 3, 2, 3, 5, 2, 2, 2, 0, 1, 1,\n",
       "       3, 6, 6, 3, 6, 3, 2, 2, 2, 2, 3, 1, 0, 1, 4, 2, 6, 1, 2, 2, 6, 4,\n",
       "       2, 0, 4, 1, 2, 3, 5, 2, 3, 2, 3, 3, 2, 1, 1, 2, 2, 3, 6, 6, 6, 3,\n",
       "       6, 0, 3, 2, 2, 0, 2, 6, 2, 3, 2, 2, 5, 1, 2, 2, 1, 2, 1, 6, 2, 2,\n",
       "       3, 3, 2, 1, 0, 1, 1, 2, 0, 3, 2, 2, 3, 2, 6, 2, 4, 6, 6, 4, 2, 2,\n",
       "       3, 0, 2, 6, 1, 6, 3, 3, 2, 2, 0, 1, 2, 1, 4, 3, 1, 4, 2, 0, 2, 2,\n",
       "       2, 0, 1, 2, 2, 2, 1, 3, 3, 2, 2, 3, 1, 4, 4, 1, 2, 4, 0, 0, 2, 2,\n",
       "       6, 3, 4, 0, 2, 3, 4, 6, 2, 0, 4, 2, 5, 2, 4, 2, 2, 6, 2, 3, 1, 2,\n",
       "       5, 1, 5, 6, 6, 2, 3, 2, 6, 5, 5, 0, 3, 3, 3, 2, 2, 2, 6, 3, 3, 6,\n",
       "       2, 6, 2, 6, 1, 3, 1, 2, 5, 2, 3, 5, 2, 3, 2, 2, 2, 1, 2, 6, 6, 2,\n",
       "       6, 6, 0, 2, 5, 0, 3, 2, 2, 1, 6, 2, 1, 2, 6, 3, 6, 3, 2, 1, 5, 2,\n",
       "       6, 3, 2, 2, 6, 2, 2, 5, 2, 1, 2, 2, 0, 0, 1, 2, 1, 5, 2, 5, 1, 4,\n",
       "       1, 2, 3, 1, 2, 1, 6, 3, 3, 2, 2, 2, 2, 3, 2, 0, 6, 2, 2, 2, 2, 6,\n",
       "       0, 1, 3, 2, 0, 4, 6, 0, 2, 2, 3, 3, 0, 6, 2, 3, 4, 2, 2, 2, 5, 2,\n",
       "       0, 3, 6, 5, 3, 6, 3, 4, 2, 2, 2, 6, 1, 2, 5, 4, 6, 3, 3, 6, 6, 0,\n",
       "       6, 3, 6, 1, 0, 1, 4, 5, 2, 2, 4, 1, 3, 5, 3, 1, 5, 2, 3, 3, 2, 1,\n",
       "       1, 5, 2, 3, 2, 3, 4, 1, 0, 4, 3, 2, 0, 1, 3, 2, 1, 6, 5, 2, 3, 6,\n",
       "       0, 4, 2, 6, 1, 0, 4, 1, 0, 3, 2, 0, 2, 3, 3, 0, 5, 3, 0, 0, 0, 3,\n",
       "       6, 2, 3, 2, 3, 2, 5, 1, 1, 6, 2, 5, 1, 4, 0, 3, 0, 1, 3, 2, 1, 1,\n",
       "       2, 6, 3, 1, 4, 2, 3, 2, 1, 6, 3, 1, 1, 0, 0, 3, 0, 0, 2, 2, 1, 1,\n",
       "       4, 5, 1, 1, 4, 0, 1, 6, 1, 2, 0, 0, 6, 1, 2, 3, 0, 0, 5, 4, 2, 2,\n",
       "       2, 2, 3, 2, 0, 4, 6, 2, 0, 0, 0, 6, 2, 3, 2, 2, 0, 3, 0, 1, 2, 2,\n",
       "       5, 1, 6, 6, 1, 0, 5, 3, 4, 0, 2, 1, 2, 2, 3, 2, 6, 0, 5, 5, 2, 1,\n",
       "       5, 0, 1, 1, 0, 0, 0, 2, 3, 3, 2, 3, 4, 6, 0, 0, 0, 6, 2, 1, 6, 0,\n",
       "       2, 4, 0, 3, 0, 2, 6, 1, 2, 2, 0, 5, 2, 6, 5, 3, 1, 2, 0, 3, 6, 0,\n",
       "       1, 3, 2, 3, 3, 2, 1, 2, 2, 0, 1, 1, 0, 6, 2, 3, 2, 3, 1, 2, 3, 1,\n",
       "       2, 4, 1, 6, 5, 3, 2, 2, 6, 0, 2, 2, 2, 2, 2, 2, 0, 6, 0, 3, 2, 4,\n",
       "       3, 4, 1, 5, 2, 3, 2, 4, 3, 2, 4, 1, 2, 6, 6, 1, 2, 5, 4, 2, 2, 5,\n",
       "       2, 4, 5, 2, 2, 3, 4, 5, 2, 6, 1, 2, 5, 3, 2, 2, 0, 4, 1, 1, 2, 4,\n",
       "       3, 3, 5, 2, 1, 6, 6, 2, 0, 2, 0, 2, 2, 2, 1, 3, 2, 4, 5, 2, 5, 2,\n",
       "       3, 1, 3, 3, 0, 2, 2, 1, 2, 0, 5, 5, 6, 2, 2, 1, 4, 6, 1, 2, 5, 0,\n",
       "       6, 2, 3, 0, 3, 1, 2, 0, 6, 1, 2, 2, 6, 2, 4, 6, 3, 4, 2, 1, 3, 4,\n",
       "       2, 1, 6, 6, 6, 2, 2, 2, 2, 2, 3, 1, 3, 3, 3, 6, 0, 5, 1, 5, 6, 2,\n",
       "       0, 5, 6, 5, 1, 0, 6, 1, 2, 6, 2, 4, 3, 0, 6, 2, 1, 6, 0, 2, 2, 2,\n",
       "       1, 2, 2, 3, 6, 1, 2, 1, 2, 2, 2, 3, 3, 1, 6, 0, 1, 2, 0, 0, 2, 5,\n",
       "       2, 6, 1, 3, 5, 2, 3, 2, 1, 4, 1, 1, 0, 4, 4, 2, 6, 0, 6, 2, 3, 4,\n",
       "       0, 2, 2, 1, 6, 2, 6, 2, 5, 6, 4, 3, 6, 1, 6, 3, 5, 5, 2, 3, 2, 2,\n",
       "       1, 5, 6, 2, 2, 6, 1, 3, 6, 0, 5, 4, 0, 2, 3, 5, 3, 4, 0, 6, 6, 4,\n",
       "       2, 2, 2, 3, 6, 3, 2, 1, 6, 3, 1, 6, 6, 1, 3, 4, 2, 4, 2, 6, 2, 2,\n",
       "       6, 2, 0, 2, 1, 1, 1, 3, 6, 1], dtype=int64)"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "subject_array = sampled_data['subject'].values\n",
    "np.save('cora_y.npy', subject_array)\n",
    "subject_array"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.save('cora_coassoc.npy', coassoc_matrix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0, 0, 0, ..., 0, 0, 0],\n",
       "       [0, 0, 0, ..., 0, 0, 0],\n",
       "       [0, 0, 0, ..., 0, 0, 0],\n",
       "       ...,\n",
       "       [0, 0, 0, ..., 0, 0, 0],\n",
       "       [0, 0, 0, ..., 0, 0, 0],\n",
       "       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sampled_data_X = sampled_data.drop(columns=['subject', 'w_1177'])\n",
    "np.save('cora_X.npy', sampled_data_X.values)\n",
    "sampled_data_X.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "7"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(np.unique(sampled_data['subject'].values))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import LabelEncoder\n",
    "\n",
    "label_encoder = LabelEncoder()\n",
    "y_ravel = sampled_data['subject'].values.ravel()\n",
    "y_labels = label_encoder.fit_transform(y_ravel)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([3, 3, 6, 4, 0, 2, 1, 2, 2, 5, 1, 2, 1, 0, 5, 2, 6, 0, 1, 2, 1, 3,\n",
       "       0, 1, 3, 4, 2, 4, 2, 6, 2, 2, 3, 2, 5, 0, 2, 2, 2, 4, 6, 2, 0, 5,\n",
       "       1, 2, 3, 2, 4, 5, 3, 4, 6, 3, 2, 6, 3, 3, 3, 0, 3, 0, 4, 5, 1, 3,\n",
       "       2, 4, 0, 2, 0, 1, 6, 1, 1, 2, 2, 3, 5, 6, 3, 3, 2, 6, 1, 3, 2, 0,\n",
       "       3, 1, 3, 0, 4, 2, 3, 0, 2, 2, 2, 2, 0, 6, 6, 3, 3, 2, 2, 1, 3, 1,\n",
       "       6, 1, 2, 2, 2, 6, 2, 3, 1, 1, 1, 0, 3, 3, 2, 0, 3, 2, 2, 2, 2, 2,\n",
       "       2, 0, 1, 4, 3, 1, 5, 2, 2, 1, 3, 1, 3, 2, 3, 5, 2, 2, 2, 0, 1, 1,\n",
       "       3, 6, 6, 3, 6, 3, 2, 2, 2, 2, 3, 1, 0, 1, 4, 2, 6, 1, 2, 2, 6, 4,\n",
       "       2, 0, 4, 1, 2, 3, 5, 2, 3, 2, 3, 3, 2, 1, 1, 2, 2, 3, 6, 6, 6, 3,\n",
       "       6, 0, 3, 2, 2, 0, 2, 6, 2, 3, 2, 2, 5, 1, 2, 2, 1, 2, 1, 6, 2, 2,\n",
       "       3, 3, 2, 1, 0, 1, 1, 2, 0, 3, 2, 2, 3, 2, 6, 2, 4, 6, 6, 4, 2, 2,\n",
       "       3, 0, 2, 6, 1, 6, 3, 3, 2, 2, 0, 1, 2, 1, 4, 3, 1, 4, 2, 0, 2, 2,\n",
       "       2, 0, 1, 2, 2, 2, 1, 3, 3, 2, 2, 3, 1, 4, 4, 1, 2, 4, 0, 0, 2, 2,\n",
       "       6, 3, 4, 0, 2, 3, 4, 6, 2, 0, 4, 2, 5, 2, 4, 2, 2, 6, 2, 3, 1, 2,\n",
       "       5, 1, 5, 6, 6, 2, 3, 2, 6, 5, 5, 0, 3, 3, 3, 2, 2, 2, 6, 3, 3, 6,\n",
       "       2, 6, 2, 6, 1, 3, 1, 2, 5, 2, 3, 5, 2, 3, 2, 2, 2, 1, 2, 6, 6, 2,\n",
       "       6, 6, 0, 2, 5, 0, 3, 2, 2, 1, 6, 2, 1, 2, 6, 3, 6, 3, 2, 1, 5, 2,\n",
       "       6, 3, 2, 2, 6, 2, 2, 5, 2, 1, 2, 2, 0, 0, 1, 2, 1, 5, 2, 5, 1, 4,\n",
       "       1, 2, 3, 1, 2, 1, 6, 3, 3, 2, 2, 2, 2, 3, 2, 0, 6, 2, 2, 2, 2, 6,\n",
       "       0, 1, 3, 2, 0, 4, 6, 0, 2, 2, 3, 3, 0, 6, 2, 3, 4, 2, 2, 2, 5, 2,\n",
       "       0, 3, 6, 5, 3, 6, 3, 4, 2, 2, 2, 6, 1, 2, 5, 4, 6, 3, 3, 6, 6, 0,\n",
       "       6, 3, 6, 1, 0, 1, 4, 5, 2, 2, 4, 1, 3, 5, 3, 1, 5, 2, 3, 3, 2, 1,\n",
       "       1, 5, 2, 3, 2, 3, 4, 1, 0, 4, 3, 2, 0, 1, 3, 2, 1, 6, 5, 2, 3, 6,\n",
       "       0, 4, 2, 6, 1, 0, 4, 1, 0, 3, 2, 0, 2, 3, 3, 0, 5, 3, 0, 0, 0, 3,\n",
       "       6, 2, 3, 2, 3, 2, 5, 1, 1, 6, 2, 5, 1, 4, 0, 3, 0, 1, 3, 2, 1, 1,\n",
       "       2, 6, 3, 1, 4, 2, 3, 2, 1, 6, 3, 1, 1, 0, 0, 3, 0, 0, 2, 2, 1, 1,\n",
       "       4, 5, 1, 1, 4, 0, 1, 6, 1, 2, 0, 0, 6, 1, 2, 3, 0, 0, 5, 4, 2, 2,\n",
       "       2, 2, 3, 2, 0, 4, 6, 2, 0, 0, 0, 6, 2, 3, 2, 2, 0, 3, 0, 1, 2, 2,\n",
       "       5, 1, 6, 6, 1, 0, 5, 3, 4, 0, 2, 1, 2, 2, 3, 2, 6, 0, 5, 5, 2, 1,\n",
       "       5, 0, 1, 1, 0, 0, 0, 2, 3, 3, 2, 3, 4, 6, 0, 0, 0, 6, 2, 1, 6, 0,\n",
       "       2, 4, 0, 3, 0, 2, 6, 1, 2, 2, 0, 5, 2, 6, 5, 3, 1, 2, 0, 3, 6, 0,\n",
       "       1, 3, 2, 3, 3, 2, 1, 2, 2, 0, 1, 1, 0, 6, 2, 3, 2, 3, 1, 2, 3, 1,\n",
       "       2, 4, 1, 6, 5, 3, 2, 2, 6, 0, 2, 2, 2, 2, 2, 2, 0, 6, 0, 3, 2, 4,\n",
       "       3, 4, 1, 5, 2, 3, 2, 4, 3, 2, 4, 1, 2, 6, 6, 1, 2, 5, 4, 2, 2, 5,\n",
       "       2, 4, 5, 2, 2, 3, 4, 5, 2, 6, 1, 2, 5, 3, 2, 2, 0, 4, 1, 1, 2, 4,\n",
       "       3, 3, 5, 2, 1, 6, 6, 2, 0, 2, 0, 2, 2, 2, 1, 3, 2, 4, 5, 2, 5, 2,\n",
       "       3, 1, 3, 3, 0, 2, 2, 1, 2, 0, 5, 5, 6, 2, 2, 1, 4, 6, 1, 2, 5, 0,\n",
       "       6, 2, 3, 0, 3, 1, 2, 0, 6, 1, 2, 2, 6, 2, 4, 6, 3, 4, 2, 1, 3, 4,\n",
       "       2, 1, 6, 6, 6, 2, 2, 2, 2, 2, 3, 1, 3, 3, 3, 6, 0, 5, 1, 5, 6, 2,\n",
       "       0, 5, 6, 5, 1, 0, 6, 1, 2, 6, 2, 4, 3, 0, 6, 2, 1, 6, 0, 2, 2, 2,\n",
       "       1, 2, 2, 3, 6, 1, 2, 1, 2, 2, 2, 3, 3, 1, 6, 0, 1, 2, 0, 0, 2, 5,\n",
       "       2, 6, 1, 3, 5, 2, 3, 2, 1, 4, 1, 1, 0, 4, 4, 2, 6, 0, 6, 2, 3, 4,\n",
       "       0, 2, 2, 1, 6, 2, 6, 2, 5, 6, 4, 3, 6, 1, 6, 3, 5, 5, 2, 3, 2, 2,\n",
       "       1, 5, 6, 2, 2, 6, 1, 3, 6, 0, 5, 4, 0, 2, 3, 5, 3, 4, 0, 6, 6, 4,\n",
       "       2, 2, 2, 3, 6, 3, 2, 1, 6, 3, 1, 6, 6, 1, 3, 4, 2, 4, 2, 6, 2, 2,\n",
       "       6, 2, 0, 2, 1, 1, 1, 3, 6, 1], dtype=int64)"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.save('labels_cora.npy', y_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Save the 'w_1177' column as a .npy file\n",
    "np.save('cora_s.npy', sampled_data['w_1177'].values)\n",
    "sampled_data['w_1177'].values"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "dl2023",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
