{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "d5c6bc41",
   "metadata": {},
   "source": [
    "# Module"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "4176ba56",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import glob\n",
    "from pathlib import Path\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c8ff9512",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "181ce6ac",
   "metadata": {},
   "source": [
    "# Choose Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "7a33e0cd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ======== CONFIGURATION ======== #\n",
    "\n",
    "# To switch datasets, uncomment the appropriate lines below:\n",
    "\n",
    "# Dataset: Brain\n",
    "input_edgelist_path = \"../data/brain/brain.txt\"\n",
    "dataset_name = \"brain\"\n",
    "\n",
    "# Dataset: School\n",
    "#input_edgelist_path = \"../data/school/school.txt\"\n",
    "#dataset_name = \"school\"\n",
    "\n",
    "# Dataset: Stock\n",
    "#input_edgelist_path = \"../data/stock/stock.txt\"\n",
    "#dataset_name = \"stock\"\n",
    "\n",
    "# Dataset: Synthetic Experiment 1\n",
    "#input_edgelist_path = \"../data/synthetic_exp2.1_n200/synthetic_exp2.1_n200.txt\"\n",
    "#dataset_name = \"synthetic_exp2.1\"\n",
    "\n",
    "# Dataset: Synthetic Experiment 2\n",
    "#input_edgelist_path = \"../data/synthetic_exp2.2_n200/synthetic_exp2.2_n200.txt\"\n",
    "#dataset_name = \"synthetic_exp2.2\"\n",
    "\n",
    "# ======== OUTPUT SETUP ======== #\n",
    "output_dir = f\"./processed_data/{dataset_name}\"\n",
    "Path(output_dir).mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "# Feature dimensions\n",
    "node_feat_dim = 1\n",
    "edge_feat_dim = 1\n",
    "\n",
    "## SET THIS TO 1 FOR small datasets (synthetic and school)\n",
    "if (dataset_name == \"school\") | (dataset_name == \"synthetic_exp2.1\") | (dataset_name == \"synthetic_exp2.2\"):\n",
    "    random = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "83c7da68",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "0953b706",
   "metadata": {},
   "source": [
    "# Format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "676e828f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processed dataset saved to: ./processed_data/school\n",
      "Number of unique nodes: 327\n",
      "Number of edges (original + reverse, no exact duplicates): 22488\n"
     ]
    }
   ],
   "source": [
    "# ======== LOAD AND AUGMENT EDGES ======== #\n",
    "df = pd.read_csv(input_edgelist_path, sep=' ', header=None, names=['u', 'i', 'ts'])\n",
    "df_rev = pd.DataFrame({\"u\": df[\"i\"], \"i\": df[\"u\"], \"ts\": df[\"ts\"]})\n",
    "\n",
    "# Combine and drop exact duplicates\n",
    "df_full = pd.concat([df, df_rev])\n",
    "df_full.drop_duplicates(inplace=True)  # <- ADD THIS LINE\n",
    "df_full = df_full.sort_values('ts').reset_index(drop=True)\n",
    "\n",
    "# ======== ADD LABEL AND EDGE INDEX ======== #\n",
    "df_full['label'] = 1.0\n",
    "df_full['idx'] = df_full.index\n",
    "\n",
    "# ======== GENERATE FEATURES ======== #\n",
    "num_nodes = max(df_full['u'].max(), df_full['i'].max())\n",
    "#node_feats = np.zeros((num_nodes + 1, node_feat_dim))  # node 0 is dummy\n",
    "#edge_feats = np.zeros((df_full.shape[0] + 1, edge_feat_dim))  # edge 0 is dummy\n",
    "\n",
    "# Node features\n",
    "if random == 1:\n",
    "    node_feats = np.random.randn(num_nodes + 1, node_feat_dim)  # random 1D feature\n",
    "else:\n",
    "    node_feats = np.zeros((num_nodes + 1, node_feat_dim))  # all-zero features\n",
    "\n",
    "# Edge features\n",
    "if random == 1:\n",
    "    edge_feats = np.random.randn(df_full.shape[0]  + 1, edge_feat_dim)  # random 1D feature\n",
    "else:\n",
    "    edge_feats  = np.zeros((df_full.shape[0] + 1, edge_feat_dim))\n",
    "\n",
    "\n",
    "# ======== SAVE FILES ======== #\n",
    "df_full.to_csv(f\"{output_dir}/ml_{dataset_name}.csv\", index=False)\n",
    "np.save(f\"{output_dir}/ml_{dataset_name}.npy\", edge_feats)\n",
    "np.save(f\"{output_dir}/ml_{dataset_name}_node.npy\", node_feats)\n",
    "\n",
    "# ======== LOG ======== #\n",
    "print(f\"Processed dataset saved to: {output_dir}\")\n",
    "print(f\"Number of unique nodes: {len(set(df_full['u']).union(df_full['i']))}\")\n",
    "print(f\"Number of edges (original + reverse, no exact duplicates): {df_full.shape[0]}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "71050b8e",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
