{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "701405a4-565f-4576-9634-4e60988889d3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Standard library\n",
    "import os\n",
    "import warnings\n",
    "\n",
    "# Third-party\n",
    "import numpy as np\n",
    "\n",
    "# Project-local helper module (adjust the import path if it is not importable from here)\n",
    "from benchmarking_utils_arxiv import run_benchmark\n",
    "\n",
    "# Silence all warnings from this point on so the benchmark log stays readable.\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "# Output directory: created up front, left untouched when it already exists.\n",
    "OUT_DIR = \"./benchmark_outputs\"\n",
    "os.makedirs(OUT_DIR, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "9cd5bb2e-0414-4035-a2eb-07b20dd1b9d6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# --- Configuration ---\n",
    "datasets = [\"arxiv\"]  # names handled by module\n",
    "\n",
    "seeds = [46]\n",
    "\n",
    "# Two label splits are benchmarked, selected through mask_frac.\n",
    "# NOTE(review): with the pre-computed masks under ./masks, the recorded run of this notebook shows\n",
    "#   mask_frac = 0.7 -> masks/Arxiv/70_30 -> Masked=50802,  Unmasked=118541  (~30% of nodes masked)\n",
    "#   mask_frac = 0.3 -> masks/Arxiv/30_70 -> Masked=118540, Unmasked=50803   (~70% of nodes masked)\n",
    "# so for these custom masks mask_frac matches the UNMASKED (labeled) share, not the masked share.\n",
    "# Confirm against the module before attributing a result to a split.\n",
    "mask_fracs = [0.7, 0.3]\n",
    "\n",
    "# Embedding / classifier lists (None -> module defaults)\n",
    "# Wider selection, kept for reference:\n",
    "# embedding_methods = ['random', 'given', 'deepwalk', 'node2vec', 'dgi', 'fuse', 'vgae']\n",
    "embedding_methods = ['fuse']\n",
    "classifiers = ['gcn', 'gat', 'graphsage']\n",
    "\n",
    "# Embedding dimensionality\n",
    "emb_dim = 150\n",
    "\n",
    "# Training / model hyperparams (keep default-ish).\n",
    "# vgae_epochs / dgi_epochs are presumably unused while only 'fuse' is selected -- TODO confirm.\n",
    "vgae_epochs = 200\n",
    "dgi_epochs = 200\n",
    "fuse_iterations = 200\n",
    "\n",
    "# device for PyG models\n",
    "device = 'cpu'  # or 'cuda' if available and configured\n",
    "\n",
    "# Where outputs will be stored (OUT_DIR comes from the setup cell)\n",
    "save_dir = OUT_DIR\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "1d93c591-5d04-4e27-86aa-c8175aefc80c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Using custom mask: ./masks\\Arxiv\\70_30\\Arxiv_70_30_masked_indices_seed46.npy\n",
      "Downloading http://snap.stanford.edu/ogb/data/nodeproppred/arxiv.zip\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloaded 0.08 GB: 100%|██████████| 81/81 [00:16<00:00,  4.84it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Extracting .\\arxiv.zip\n",
      "Loading necessary files...\n",
      "This might take a while.\n",
      "Processing graphs...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 1/1 [00:00<?, ?it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saving...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[arxiv][seed=46][mf=0.7] Masked=50802, Unmasked=118541\n",
      "[arxiv][seed=46][mask_frac=0.7] Running fuse …\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Random walks per node: 100%|██████████| 169343/169343 [02:19<00:00, 1217.79it/s]\n",
      "Computing attention weights: 100%|██████████| 169343/169343 [00:22<00:00, 7509.72it/s] \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Embedding: fuse, Classifier: gcn\n",
      "Accuracy: 0.3899\n",
      "F1-Score: 0.1016\n",
      "Kappa: 0.3049\n",
      "Embedding generation time: 1329.69s\n",
      "Classifier runtime: 75.59s\n",
      "--------------------------------------------------\n",
      "Embedding: fuse, Classifier: gat\n",
      "Accuracy: 0.2615\n",
      "F1-Score: 0.0424\n",
      "Kappa: 0.1398\n",
      "Embedding generation time: 1329.69s\n",
      "Classifier runtime: 237.57s\n",
      "--------------------------------------------------\n",
      "Embedding: fuse, Classifier: graphsage\n",
      "Accuracy: 0.2954\n",
      "F1-Score: 0.0564\n",
      "Kappa: 0.1946\n",
      "Embedding generation time: 1329.69s\n",
      "Classifier runtime: 98.43s\n",
      "--------------------------------------------------\n",
      "Using custom mask: ./masks\\Arxiv\\30_70\\Arxiv_30_70_masked_indices_seed46.npy\n",
      "[arxiv][seed=46][mf=0.3] Masked=118540, Unmasked=50803\n",
      "[arxiv][seed=46][mask_frac=0.3] Running fuse …\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Random walks per node: 100%|██████████| 169343/169343 [02:48<00:00, 1003.57it/s]\n",
      "Computing attention weights: 100%|██████████| 169343/169343 [00:17<00:00, 9417.99it/s] \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Embedding: fuse, Classifier: gcn\n",
      "Accuracy: 0.3604\n",
      "F1-Score: 0.0528\n",
      "Kappa: 0.2655\n",
      "Embedding generation time: 1812.09s\n",
      "Classifier runtime: 21.35s\n",
      "--------------------------------------------------\n",
      "Embedding: fuse, Classifier: gat\n",
      "Accuracy: 0.3273\n",
      "F1-Score: 0.0978\n",
      "Kappa: 0.2258\n",
      "Embedding generation time: 1812.09s\n",
      "Classifier runtime: 73.46s\n",
      "--------------------------------------------------\n",
      "Embedding: fuse, Classifier: graphsage\n",
      "Accuracy: 0.3321\n",
      "F1-Score: 0.0706\n",
      "Kappa: 0.2412\n",
      "Embedding generation time: 1812.09s\n",
      "Classifier runtime: 35.10s\n",
      "--------------------------------------------------\n"
     ]
    }
   ],
   "source": [
    "# Collect every setting from the configuration cell, then hand them to the benchmark in one call.\n",
    "run_config = dict(\n",
    "    datasets=datasets,\n",
    "    seeds=seeds,\n",
    "    mask_fracs=mask_fracs,\n",
    "    emb_dim=emb_dim,\n",
    "    embedding_methods=embedding_methods,\n",
    "    classifiers=classifiers,\n",
    "    vgae_epochs=vgae_epochs,\n",
    "    dgi_epochs=dgi_epochs,\n",
    "    fuse_iterations=fuse_iterations,\n",
    "    save_dir=save_dir,\n",
    "    device=device,\n",
    "    masks_root=\"./masks\",\n",
    "    verbose=True,\n",
    ")\n",
    "res = run_benchmark(**run_config)"
   ]
  },
  {
   "cell_type": "raw",
   "id": "7f5d3f6f-6e5a-498d-9d0a-5b42c8ac5eb7",
   "metadata": {},
   "source": [
    "import pandas as pd\n",
    "\n",
    "print(\"Per-run results saved at:\", save_dir)\n",
    "\n",
    "# First rows of the per-run table.\n",
    "display(res['per_run'].head())\n",
    "\n",
    "# Up to 20 rows of each aggregate table, ordered by the listed columns.\n",
    "display(\n",
    "    res['avg_by_model_and_classifier']\n",
    "    .sort_values(['dataset', 'mask_frac', 'embedding', 'classifier'])\n",
    "    .head(20)\n",
    ")\n",
    "display(\n",
    "    res['avg_embedding_times']\n",
    "    .sort_values(['dataset', 'mask_frac', 'avg_embedding_time'])\n",
    "    .head(20)\n",
    ")\n",
    "# These DataFrames can also be written to separate CSV files (they are saved by the module)."
   ]
  },
  {
   "cell_type": "raw",
   "id": "ed9e7684-6a6e-4995-89b6-f325d22a2090",
   "metadata": {},
   "source": [
    "import os\n",
    "\n",
    "import numpy as np\n",
    "\n",
    "# Load one embedding written by the benchmark run for inspection.\n",
    "# Dataset / seed / method are taken from the configuration cell so the path refers to a run\n",
    "# this notebook actually performs (here: arxiv, seed 46) instead of an unrelated dataset.\n",
    "dataset = datasets[0]\n",
    "seed = seeds[0]\n",
    "# embedding_methods may be None (module defaults); fall back to 'fuse' in that case.\n",
    "emb_name = embedding_methods[0] if embedding_methods else \"fuse\"\n",
    "\n",
    "# NOTE(review): directory layout is assumed, not verified -- confirm against what run_benchmark writes.\n",
    "path = os.path.join(save_dir, dataset, f\"seed_{seed}\", f\"embedding_{emb_name}.npy\")\n",
    "E = np.load(path)\n",
    "print(\"Loaded embedding shape:\", E.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6dfe209d-8cef-4078-870e-f26e3ff9c998",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
