{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d06885b5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import math\n",
    "import os\n",
    "\n",
    "import networkx as nx\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import scipy.stats\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "d6316883",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Raw per-edge influence scores. The file is read with header=None, so the\n",
    "# four columns are named explicitly right after loading.\n",
    "df_edge = pd.read_csv('result_data/citeseer_edge_influence_002.csv', header=None)\n",
    "df_edge.columns = [\n",
    "    'actual_influence',\n",
    "    'predicted_influence',\n",
    "    'from_edges',\n",
    "    'to_edges',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "9d793f94",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>actual_influence</th>\n",
       "      <th>predicted_influence</th>\n",
       "      <th>from_edges</th>\n",
       "      <th>to_edges</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.752125</td>\n",
       "      <td>-0.290519</td>\n",
       "      <td>0.0</td>\n",
       "      <td>628.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.041634</td>\n",
       "      <td>-0.046335</td>\n",
       "      <td>1.0</td>\n",
       "      <td>158.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-0.037618</td>\n",
       "      <td>-0.037273</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2919.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.122478</td>\n",
       "      <td>0.045500</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2933.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-0.072385</td>\n",
       "      <td>-0.077310</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1097.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12426</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>3322.0</td>\n",
       "      <td>3322.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12427</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>3323.0</td>\n",
       "      <td>3323.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12428</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>3324.0</td>\n",
       "      <td>3324.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12429</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>3325.0</td>\n",
       "      <td>3325.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12430</th>\n",
       "      <td>0.281748</td>\n",
       "      <td>0.214657</td>\n",
       "      <td>3326.0</td>\n",
       "      <td>3326.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>12431 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       actual_influence  predicted_influence  from_edges  to_edges\n",
       "0              0.752125            -0.290519         0.0     628.0\n",
       "1             -0.041634            -0.046335         1.0     158.0\n",
       "2             -0.037618            -0.037273         1.0    2919.0\n",
       "3              0.122478             0.045500         1.0    2933.0\n",
       "4             -0.072385            -0.077310         1.0    1097.0\n",
       "...                 ...                  ...         ...       ...\n",
       "12426          0.000000             0.000000      3322.0    3322.0\n",
       "12427          0.000000             0.000000      3323.0    3323.0\n",
       "12428          0.000000             0.000000      3324.0    3324.0\n",
       "12429          0.000000             0.000000      3325.0    3325.0\n",
       "12430          0.281748             0.214657      3326.0    3326.0\n",
       "\n",
       "[12431 rows x 4 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect the raw edge table right after loading.\n",
    "df_edge"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "8514289e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_actual_edge_influence(df_edge):\n",
    "    new_infl = []\n",
    "    new_infl_pred = []\n",
    "    df_edge_copy = df_edge.copy()\n",
    "    for i in tqdm(range(len(df_edge))):\n",
    "        f = df_edge.loc[i, ['from_edges']].values[0]\n",
    "        t = df_edge.loc[i, ['to_edges']].values[0]\n",
    "\n",
    "        act_1 = df_edge[(df_edge['from_edges'] == f) & (df_edge['to_edges'] == t)].actual_influence.values[0]\n",
    "        act_2 = df_edge[(df_edge['from_edges'] == t) & (df_edge['to_edges'] == f)].actual_influence.values[0]\n",
    "        \n",
    "        pred_1 = df_edge[(df_edge['from_edges'] == f) & (df_edge['to_edges'] == t)].predicted_influence.values[0]\n",
    "        pred_2 = df_edge[(df_edge['from_edges'] == t) & (df_edge['to_edges'] == f)].predicted_influence.values[0]\n",
    "        \n",
    "        assert (act_1 == act_2)\n",
    "        assert (pred_1 == pred_2)\n",
    "        new_infl.append(act_1)\n",
    "        new_infl_pred.append(pred_1)\n",
    "    return new_infl, new_infl_pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "1ac07fa6",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████| 12431/12431 [00:23<00:00, 534.13it/s]\n"
     ]
    }
   ],
   "source": [
    "# Work on a copy so the raw table keeps its original columns.\n",
    "df_edge_copy = df_edge.copy()\n",
    "(\n",
    "    df_edge_copy['influence'],\n",
    "    df_edge_copy['pred_influence'],\n",
    ") = get_actual_edge_influence(df_edge)\n",
    "\n",
    "# Keep only the symmetry-checked score columns next to the edge endpoints.\n",
    "df_edge_preprocessed = df_edge_copy.drop(['actual_influence', 'predicted_influence'], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "1420a1c7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Order rows by descending influence, renumber them 0..n-1, then write to disk.\n",
    "df_edge_preprocessed = (\n",
    "    df_edge_preprocessed\n",
    "    .sort_values(by='influence', ascending=False)\n",
    "    .reset_index(drop=True)\n",
    ")\n",
    "df_edge_preprocessed.to_csv('result_data/citeseer_df_edge_preprocessed_two_edge.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "6bc2148d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read back the sorted table from the same path it was written to above.\n",
    "df_edge_infl = pd.read_csv('result_data/citeseer_df_edge_preprocessed_two_edge.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "408ec333",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rows whose two endpoints coincide. df_self_loop is not referenced again in the cells shown here.\n",
    "df_self_loop = df_edge_infl.loc[df_edge_infl['from_edges'].eq(df_edge_infl['to_edges'])]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "2f660f8a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE: no self-loop filter is applied here. df_edge is still the raw table loaded\n",
    "# at the top (columns actual_influence / predicted_influence), including rows where\n",
    "# from_edges == to_edges. The later cells rely on exactly that table.\n",
    "#\n",
    "# Renumber rows 0..n-1 and store node ids as integers. The cast goes through\n",
    "# DataFrame.astype instead of `df_edge.loc[:, col] = ...` because pandas >= 2.0\n",
    "# writes such .loc assignments in place and would leave the columns as float64.\n",
    "df_edge = df_edge.reset_index(drop=True).astype({'from_edges': int, 'to_edges': int})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "48f7127a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): no self-loop filter has been applied to df_edge before this point, so the\n",
    "# written table still contains rows with from_edges == to_edges even though the file name\n",
    "# says 'no_self_loop'. Confirm what the consumers of this file expect.\n",
    "df_edge.to_csv('result_data/citeseer_df_edge_preprocessed_two_edge_no_self_loop.csv', index = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "9b3e965f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>actual_influence</th>\n",
       "      <th>predicted_influence</th>\n",
       "      <th>from_edges</th>\n",
       "      <th>to_edges</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.752125</td>\n",
       "      <td>-0.290519</td>\n",
       "      <td>0</td>\n",
       "      <td>628</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.041634</td>\n",
       "      <td>-0.046335</td>\n",
       "      <td>1</td>\n",
       "      <td>158</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-0.037618</td>\n",
       "      <td>-0.037273</td>\n",
       "      <td>1</td>\n",
       "      <td>2919</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.122478</td>\n",
       "      <td>0.045500</td>\n",
       "      <td>1</td>\n",
       "      <td>2933</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-0.072385</td>\n",
       "      <td>-0.077310</td>\n",
       "      <td>1</td>\n",
       "      <td>1097</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12426</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>3322</td>\n",
       "      <td>3322</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12427</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>3323</td>\n",
       "      <td>3323</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12428</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>3324</td>\n",
       "      <td>3324</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12429</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>3325</td>\n",
       "      <td>3325</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12430</th>\n",
       "      <td>0.281748</td>\n",
       "      <td>0.214657</td>\n",
       "      <td>3326</td>\n",
       "      <td>3326</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>12431 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       actual_influence  predicted_influence  from_edges  to_edges\n",
       "0              0.752125            -0.290519           0       628\n",
       "1             -0.041634            -0.046335           1       158\n",
       "2             -0.037618            -0.037273           1      2919\n",
       "3              0.122478             0.045500           1      2933\n",
       "4             -0.072385            -0.077310           1      1097\n",
       "...                 ...                  ...         ...       ...\n",
       "12426          0.000000             0.000000        3322      3322\n",
       "12427          0.000000             0.000000        3323      3323\n",
       "12428          0.000000             0.000000        3324      3324\n",
       "12429          0.000000             0.000000        3325      3325\n",
       "12430          0.281748             0.214657        3326      3326\n",
       "\n",
       "[12431 rows x 4 columns]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect the table after renumbering the rows and casting node ids to int.\n",
    "df_edge"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "2d4a2504",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Alias only (no copy): df_edge_new and df_edge refer to the same DataFrame object.\n",
    "df_edge_new = df_edge"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "acc927e6",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████| 12431/12431 [00:11<00:00, 1118.47it/s]\n"
     ]
    }
   ],
   "source": [
    "# For every row (f, t), find the first row that holds the reversed pair (t, f).\n",
    "# orig_index[k] is a row that has such a partner; redundant_index[k] is the partner's row.\n",
    "# A self-loop is matched with the first row of that same self-loop.\n",
    "#\n",
    "# A dict of first occurrences replaces the per-row boolean-mask scan over the whole\n",
    "# frame, so the search is a single pass instead of quadratic.\n",
    "# Assumes df_edge_new carries the 0..n-1 index set when df_edge was renumbered,\n",
    "# so row positions and row labels coincide.\n",
    "edge_pairs = list(zip(df_edge_new['from_edges'].to_numpy(), df_edge_new['to_edges'].to_numpy()))\n",
    "\n",
    "first_row_of_pair = {}\n",
    "for row, pair in enumerate(edge_pairs):\n",
    "    first_row_of_pair.setdefault(pair, row)\n",
    "\n",
    "orig_index = []\n",
    "redundant_index = []\n",
    "for i, (temp_f, temp_t) in enumerate(tqdm(edge_pairs)):\n",
    "    partner_row = first_row_of_pair.get((temp_t, temp_f))\n",
    "    if partner_row is not None:\n",
    "        redundant_index.append(partner_row)\n",
    "        orig_index.append(i)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "642b6609",
   "metadata": {},
   "outputs": [],
   "source": [
    "# a: partner (reversed-edge) row for each matched row; b: the matched row itself.\n",
    "a, b = np.asarray(redundant_index), np.asarray(orig_index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "84d7f0fb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pick one row per matched pair: walking the pairs in order, a row that has already\n",
    "# been kept as somebody's partner is skipped; otherwise its partner row is kept.\n",
    "# kept_rows mirrors new_index as a set so the membership test is O(1) rather than\n",
    "# a linear scan of the growing list.\n",
    "new_index = []\n",
    "kept_rows = set()\n",
    "for orig_row, partner_row in zip(b, a):\n",
    "    if orig_row in kept_rows:\n",
    "        continue\n",
    "    new_index.append(partner_row)\n",
    "    kept_rows.add(partner_row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8ae55a02",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "261d8dc0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select the kept rows by label and renumber them 0..n-1.\n",
    "df_edge_new_0 = df_edge_new.loc[new_index].reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "b10637d3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the deduplicated edge table without the row index.\n",
    "df_edge_new_0.to_csv('result_data/citeseer_edge_influence_new_version.csv', index = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "f7db4be1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>actual_influence</th>\n",
       "      <th>predicted_influence</th>\n",
       "      <th>from_edges</th>\n",
       "      <th>to_edges</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.752125</td>\n",
       "      <td>-0.290519</td>\n",
       "      <td>628</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.041634</td>\n",
       "      <td>-0.046335</td>\n",
       "      <td>158</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-0.037618</td>\n",
       "      <td>-0.037273</td>\n",
       "      <td>2919</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.122478</td>\n",
       "      <td>0.045500</td>\n",
       "      <td>2933</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-0.072385</td>\n",
       "      <td>-0.077310</td>\n",
       "      <td>1097</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7874</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>3322</td>\n",
       "      <td>3322</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7875</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>3323</td>\n",
       "      <td>3323</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7876</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>3324</td>\n",
       "      <td>3324</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7877</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>3325</td>\n",
       "      <td>3325</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7878</th>\n",
       "      <td>0.281748</td>\n",
       "      <td>0.214657</td>\n",
       "      <td>3326</td>\n",
       "      <td>3326</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>7879 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      actual_influence  predicted_influence  from_edges  to_edges\n",
       "0             0.752125            -0.290519         628         0\n",
       "1            -0.041634            -0.046335         158         1\n",
       "2            -0.037618            -0.037273        2919         1\n",
       "3             0.122478             0.045500        2933         1\n",
       "4            -0.072385            -0.077310        1097         1\n",
       "...                ...                  ...         ...       ...\n",
       "7874          0.000000             0.000000        3322      3322\n",
       "7875          0.000000             0.000000        3323      3323\n",
       "7876          0.000000             0.000000        3324      3324\n",
       "7877          0.000000             0.000000        3325      3325\n",
       "7878          0.281748             0.214657        3326      3326\n",
       "\n",
       "[7879 rows x 4 columns]"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect the deduplicated table.\n",
    "df_edge_new_0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "id": "c5ce6a94",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SpearmanrResult(correlation=0.11423523927599526, pvalue=6.003071705508103e-185)"
      ]
     },
     "execution_count": 67,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Spearman rank correlation between actual and predicted influence over all\n",
    "# deduplicated rows. scipy.stats comes from the imports cell at the top of the notebook.\n",
    "scipy.stats.spearmanr(\n",
    "    df_edge_new_0['actual_influence'].to_numpy(),\n",
    "    df_edge_new_0['predicted_influence'].to_numpy(),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "id": "3a52ce12",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pull both score columns out as NumPy arrays (a and b are re-bound here; they\n",
    "# no longer hold the row-index arrays from the deduplication step).\n",
    "a, b = (df_edge_new_0[col].to_numpy() for col in ('actual_influence', 'predicted_influence'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "id": "cd87131c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Round both score vectors to 5 decimals, then collect the positions where the\n",
    "# rounded actual AND the rounded predicted influence are both non-zero.\n",
    "a, b = np.around(a, 5), np.around(b, 5)\n",
    "index_ = np.flatnonzero((a != 0) & (b != 0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "id": "e9e86649",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SpearmanrResult(correlation=0.2655048139347416, pvalue=0.0)"
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Spearman rank correlation restricted to the rows where both rounded scores are non-zero.\n",
    "scipy.stats.spearmanr(a[index_], b[index_])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "id": "009be2d3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.9504624216344937, 0.0)"
      ]
     },
     "execution_count": 78,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Pearson correlation on the same subset of rows (both rounded scores non-zero).\n",
    "scipy.stats.pearsonr(a[index_], b[index_])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
