{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "df7be55e-9568-4956-a8d9-abef2eb568b3",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "# Load avg_by_model_and_classifier_arxiv.csv; process_split below reads its\n",
    "# mask_frac, classifier, embedding, accuracy_pm and f1_pm columns.\n",
    "df = pd.read_csv('avg_by_model_and_classifier_arxiv.csv')\n",
    "\n",
    "def process_split(df, mask_frac_value, output_file):\n",
    "    \"\"\"Average the 'mean ± std' metrics per (classifier, embedding) for one split.\n",
    "\n",
    "    Rows whose mask_frac equals mask_frac_value are grouped by classifier and\n",
    "    embedding. Within each group the means and the stds found in accuracy_pm\n",
    "    and f1_pm are averaged separately (plain arithmetic mean of each) and\n",
    "    written back as 'mean ± std' strings with four decimals.\n",
    "\n",
    "    Args:\n",
    "        df: DataFrame with mask_frac, classifier, embedding, accuracy_pm and\n",
    "            f1_pm columns; the *_pm cells are 'mean ± std' strings.\n",
    "        mask_frac_value: mask_frac value that selects the split.\n",
    "        output_file: path of the CSV file to write (index not included).\n",
    "\n",
    "    Returns:\n",
    "        The aggregated DataFrame that was written, with columns Classifier,\n",
    "        Embedding, Accuracy and F1.\n",
    "    \"\"\"\n",
    "    df_filtered = df[df['mask_frac'] == mask_frac_value]\n",
    "\n",
    "    def compute_avg_pm(series):\n",
    "        # Missing cells are skipped; calling .split on a NaN would raise.\n",
    "        pairs = [item.split('±') for item in series.dropna()]\n",
    "        if not pairs:\n",
    "            return \"\"\n",
    "        means = [float(mean_str.strip()) for mean_str, _ in pairs]\n",
    "        stds = [float(std_str.strip()) for _, std_str in pairs]\n",
    "        avg_mean = sum(means) / len(means)\n",
    "        avg_std = sum(stds) / len(stds)\n",
    "        return f\"{avg_mean:.4f} ± {avg_std:.4f}\"\n",
    "\n",
    "    # Group by classifier and embedding, compute aggregated averages, then\n",
    "    # rename the columns for presentation (rename returns a new frame, so no\n",
    "    # in-place mutation is needed).\n",
    "    df_grouped = (\n",
    "        df_filtered.groupby(['classifier', 'embedding'])\n",
    "        .agg({'accuracy_pm': compute_avg_pm, 'f1_pm': compute_avg_pm})\n",
    "        .reset_index()\n",
    "        .rename(columns={\n",
    "            'classifier': 'Classifier',\n",
    "            'embedding': 'Embedding',\n",
    "            'accuracy_pm': 'Accuracy',\n",
    "            'f1_pm': 'F1',\n",
    "        })\n",
    "    )\n",
    "\n",
    "    # Save the result and hand it back so callers can inspect it\n",
    "    df_grouped.to_csv(output_file, index=False)\n",
    "    return df_grouped\n",
    "\n",
    "# Write one summary file per split:\n",
    "# mask_frac 0.3 is the 30-70 split, mask_frac 0.7 is the 70-30 split.\n",
    "for split_frac, split_csv in (\n",
    "    (0.3, 'analysis_results_30_70.csv'),\n",
    "    (0.7, 'analysis_results_70_30.csv'),\n",
    "):\n",
    "    process_split(df, split_frac, split_csv)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "4b612e9c-1630-4e4f-bddd-67c1b0bd910a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved datasetwise_results_30_70.csv\n",
      "Saved datasetwise_results_70_30.csv\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Load avg_by_model_and_classifier_arxiv.csv; this rebinds `df` for the\n",
    "# dataset-wise tables built by process_split_datasetwise below.\n",
    "df = pd.read_csv('avg_by_model_and_classifier_arxiv.csv')\n",
    "\n",
    "def process_split_datasetwise(df, mask_frac_value, output_file):\n",
    "    \"\"\"Build the per-dataset accuracy table for one split and save it as CSV.\n",
    "\n",
    "    Rows whose mask_frac equals mask_frac_value are averaged per\n",
    "    (classifier, embedding, dataset) and pivoted so that every dataset\n",
    "    becomes an 'Accuracy_' + dataset column. An Accuracy_Avg column (average\n",
    "    over the dataset columns of a row) and a final Average/Average row\n",
    "    (average over the rows of each column) are appended.\n",
    "\n",
    "    Every average treats the mean and the std of a 'mean ± std' string\n",
    "    separately and takes the plain arithmetic mean of each.\n",
    "\n",
    "    Args:\n",
    "        df: DataFrame with mask_frac, classifier, embedding, dataset and\n",
    "            accuracy_pm columns; accuracy_pm cells are 'mean ± std' strings.\n",
    "        mask_frac_value: mask_frac value that selects the split.\n",
    "        output_file: path of the CSV file to write (index not included).\n",
    "\n",
    "    Returns:\n",
    "        The table that was written, with classifier and embedding as\n",
    "        regular columns.\n",
    "    \"\"\"\n",
    "    # Filter for the chosen mask fraction\n",
    "    df_filtered = df[df['mask_frac'] == mask_frac_value]\n",
    "\n",
    "    def compute_avg_pm(values):\n",
    "        # Single implementation shared by the group, row and column averages.\n",
    "        # Missing cells (NaN left by the pivot) are skipped; no usable cell\n",
    "        # at all gives an empty string.\n",
    "        means, stds = [], []\n",
    "        for item in values:\n",
    "            if pd.isna(item):\n",
    "                continue\n",
    "            mean_str, std_str = item.split('±')\n",
    "            means.append(float(mean_str.strip()))\n",
    "            stds.append(float(std_str.strip()))\n",
    "        if not means:\n",
    "            return \"\"\n",
    "        return f\"{sum(means)/len(means):.4f} ± {sum(stds)/len(stds):.4f}\"\n",
    "\n",
    "    # Group by classifier, embedding, dataset\n",
    "    df_grouped = (\n",
    "        df_filtered.groupby(['classifier', 'embedding', 'dataset'])\n",
    "        .agg({'accuracy_pm': compute_avg_pm})\n",
    "        .reset_index()\n",
    "    )\n",
    "\n",
    "    # Pivot so datasets become columns; each cell already holds exactly one\n",
    "    # string per (classifier, embedding, dataset), so 'first' just picks it.\n",
    "    pivot_acc = df_grouped.pivot_table(\n",
    "        index=['classifier', 'embedding'],\n",
    "        columns='dataset',\n",
    "        values='accuracy_pm',\n",
    "        aggfunc='first'\n",
    "    )\n",
    "\n",
    "    # Prefix the dataset column names\n",
    "    pivot_acc.columns = [f\"Accuracy_{col}\" for col in pivot_acc.columns]\n",
    "\n",
    "    # Add \"Average across datasets\" column\n",
    "    pivot_acc[\"Accuracy_Avg\"] = pivot_acc.apply(compute_avg_pm, axis=1)\n",
    "\n",
    "    # Add \"Average across models\" row. classifier and embedding are index\n",
    "    # levels here, so every column of pivot_acc is a value column.\n",
    "    avg_row = {col: compute_avg_pm(pivot_acc[col]) for col in pivot_acc.columns}\n",
    "\n",
    "    # Give the extra row a MultiIndex with the same level names as the pivot\n",
    "    # so reset_index() below yields 'classifier' and 'embedding' columns.\n",
    "    avg_index = pd.MultiIndex.from_tuples(\n",
    "        [(\"Average\", \"Average\")], names=pivot_acc.index.names\n",
    "    )\n",
    "    avg_df = pd.DataFrame([avg_row], index=avg_index)\n",
    "\n",
    "    # Concatenate and turn the index levels back into columns\n",
    "    final_df = pd.concat([pivot_acc, avg_df]).reset_index()\n",
    "\n",
    "    # Save\n",
    "    final_df.to_csv(output_file, index=False)\n",
    "\n",
    "    print(f\"Saved {output_file}\")\n",
    "    return final_df\n",
    "\n",
    "# Build and save the dataset-wise table for each split, keeping both frames\n",
    "results_30_70, results_70_30 = [\n",
    "    process_split_datasetwise(df, split_frac, split_csv)\n",
    "    for split_frac, split_csv in [\n",
    "        (0.3, 'datasetwise_results_30_70.csv'),\n",
    "        (0.7, 'datasetwise_results_70_30.csv'),\n",
    "    ]\n",
    "]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "64c0a1e5-159a-42ec-af60-f29f4ee0ad6b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved 'formatted_runtime_table_with_average.csv'\n",
      "embedding  dataset  split  DeepWalk  Node2Vec     VGAE     DGI     FUSE  \\\n",
      "0              NaN  30-70  12996.76  12038.33  1098.25  758.04  1698.52   \n",
      "1              NaN  70-30  13029.78  12899.23  1072.42  633.06  1360.30   \n",
      "2          Average  30-70  12996.76  12038.33  1098.25  758.04  1698.52   \n",
      "3          Average  70-30  13029.78  12899.23  1072.42  633.06  1360.30   \n",
      "4          Average    All  13013.27  12468.78  1085.33  695.55  1529.41   \n",
      "\n",
      "embedding  Avg_all_embeddings  \n",
      "0                     5717.98  \n",
      "1                     5798.96  \n",
      "2                     5717.98  \n",
      "3                     5798.96  \n",
      "4                     5758.47  \n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Load the CSV data\n",
    "df = pd.read_csv('avg_embedding_times_arxiv.csv')\n",
    "\n",
    "# Filter out 'random' and 'given' embeddings. The explicit copy gives an\n",
    "# independent frame, so the column assignments below never write into a\n",
    "# slice of the frame returned by read_csv.\n",
    "df = df[~df['embedding'].isin(['random', 'given'])].copy()\n",
    "\n",
    "# Create a mapping for better display names. Embeddings without an entry\n",
    "# become NaN here and therefore do not appear as pivot columns below.\n",
    "embedding_name_map = {\n",
    "    'deepwalk': 'DeepWalk',\n",
    "    'node2vec': 'Node2Vec',\n",
    "    'vgae': 'VGAE',\n",
    "    'dgi': 'DGI',\n",
    "    'fuse': 'FUSE'\n",
    "}\n",
    "df['embedding'] = df['embedding'].map(embedding_name_map)\n",
    "\n",
    "# Create a new column to represent the split type. A mask_frac other than\n",
    "# 0.3 or 0.7 keeps its numeric value as the label instead of being reported\n",
    "# as '30-70'.\n",
    "split_name_map = {0.3: '30-70', 0.7: '70-30'}\n",
    "df['split'] = df['mask_frac'].map(split_name_map).fillna(df['mask_frac'].astype(str))\n",
    "\n",
    "# Pivot the data to get dataset × split rows and embeddings as columns\n",
    "pivot_df = df.pivot_table(\n",
    "    index=['dataset', 'split'],\n",
    "    columns='embedding',\n",
    "    values='avg_embedding_time',\n",
    "    aggfunc='mean'\n",
    ").reset_index()\n",
    "\n",
    "# Rename datasets. Names without an entry in the map are kept unchanged;\n",
    "# a plain .map() would turn them into NaN and leave those rows unlabeled.\n",
    "dataset_name_map = {\n",
    "    'cora': 'Cora',\n",
    "    'citeseer': 'CiteSeer',\n",
    "    'photo': 'Amazon-Photo',\n",
    "    'wikics': 'WikiCS',\n",
    "    'pubmed': 'PubMed'\n",
    "}\n",
    "pivot_df['dataset'] = pivot_df['dataset'].map(dataset_name_map).fillna(pivot_df['dataset'])\n",
    "\n",
    "# Define the column order; reindex also creates (filled with NaN) any\n",
    "# embedding column that is missing from the data.\n",
    "embed_cols = ['DeepWalk', 'Node2Vec', 'VGAE', 'DGI', 'FUSE']\n",
    "column_order = ['dataset', 'split'] + embed_cols\n",
    "pivot_df = pivot_df.reindex(columns=column_order)\n",
    "\n",
    "# Make sure the embedding columns are numeric (unparsable values become NaN)\n",
    "pivot_df[embed_cols] = pivot_df[embed_cols].apply(pd.to_numeric, errors='coerce')\n",
    "\n",
    "# Columns that carry timing values in every table built below\n",
    "avg_cols = embed_cols + ['Avg_all_embeddings']\n",
    "\n",
    "# --- ROW-WISE: add column that averages across embedding columns for each (dataset, split) ---\n",
    "pivot_df['Avg_all_embeddings'] = pivot_df[embed_cols].mean(axis=1)\n",
    "\n",
    "# --- COLUMN-WISE: add average row per split (average across datasets for each split) ---\n",
    "overall_avg_by_split = pivot_df.groupby('split')[avg_cols].mean().reset_index()\n",
    "overall_avg_by_split['dataset'] = 'Average'\n",
    "# place columns in same order\n",
    "overall_avg_by_split = overall_avg_by_split[['dataset', 'split'] + avg_cols]\n",
    "\n",
    "# --- GLOBAL AVERAGE: average across both splits and all datasets ---\n",
    "global_avg = pivot_df[avg_cols].mean().to_frame().T\n",
    "global_avg['split'] = 'All'\n",
    "global_avg['dataset'] = 'Average'\n",
    "global_avg = global_avg[['dataset', 'split'] + avg_cols]\n",
    "\n",
    "# --- Combine original rows + per-split average rows + global average row ---\n",
    "final_df = pd.concat([pivot_df, overall_avg_by_split, global_avg], ignore_index=True)\n",
    "\n",
    "# Sort so real datasets come first and the 'Average' rows end up at the bottom;\n",
    "# the helper column only exists for the duration of the sort.\n",
    "final_df['__is_avg'] = final_df['dataset'].eq('Average')\n",
    "final_df = final_df.sort_values(['__is_avg', 'dataset', 'split']).drop(columns='__is_avg').reset_index(drop=True)\n",
    "\n",
    "# Round numeric columns for readability\n",
    "final_df[avg_cols] = final_df[avg_cols].round(2)\n",
    "\n",
    "# Save\n",
    "final_df.to_csv('formatted_runtime_table_with_average.csv', index=False)\n",
    "\n",
    "print(\"Saved 'formatted_runtime_table_with_average.csv'\")\n",
    "print(final_df.tail(8))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "9dbb85f7-d667-4cb0-b8c8-139dd91d04b2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved dataset_wise_total_times_30_70.csv\n",
      "Saved dataset_wise_total_times_70_30.csv\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Load your file\n",
    "df = pd.read_csv(\"per_run_results_all_arxiv.csv\")\n",
    "\n",
    "# Filter only classifiers of interest. The explicit copy gives an independent\n",
    "# frame, so adding a column below does not write into a slice of df.\n",
    "df_filtered = df[df[\"classifier\"].isin([\"gat\", \"gcn\", \"graphsage\"])].copy()\n",
    "\n",
    "# Add new column for total time (embedding time plus training time)\n",
    "df_filtered[\"total_time_seconds\"] = (\n",
    "    df_filtered[\"embedding_time_seconds\"] + df_filtered[\"train_time_seconds\"]\n",
    ")\n",
    "\n",
    "def make_split_table(df, mask_frac_value, output_file):\n",
    "    \"\"\"Build the dataset-wise mean total-time table for one split and save it.\n",
    "\n",
    "    Rows whose mask_frac equals mask_frac_value are averaged per\n",
    "    (embedding, classifier, dataset) and pivoted so that every dataset\n",
    "    becomes a column. An Avg_all_datasets column (mean over the dataset\n",
    "    columns of a row) and a final Average/Average row (mean over the rows\n",
    "    of each column) are appended.\n",
    "\n",
    "    Args:\n",
    "        df: DataFrame with mask_frac, embedding, classifier, dataset and\n",
    "            total_time_seconds columns.\n",
    "        mask_frac_value: mask_frac value that selects the split.\n",
    "        output_file: path of the CSV file to write (index not included).\n",
    "\n",
    "    Returns:\n",
    "        The table that was written; the timing columns keep a float dtype.\n",
    "    \"\"\"\n",
    "    # Filter for a given split (mask_frac)\n",
    "    df_split = df[df[\"mask_frac\"] == mask_frac_value]\n",
    "\n",
    "    # Mean total time per (embedding, classifier) row and dataset column;\n",
    "    # pivot_table does the grouping and the averaging in one step.\n",
    "    pivot_table = df_split.pivot_table(\n",
    "        index=[\"embedding\", \"classifier\"],\n",
    "        columns=\"dataset\",\n",
    "        values=\"total_time_seconds\",\n",
    "        aggfunc=\"mean\",\n",
    "    ).reset_index()\n",
    "\n",
    "    # Add average across datasets (row-wise)\n",
    "    value_cols = [\n",
    "        col for col in pivot_table.columns if col not in (\"embedding\", \"classifier\")\n",
    "    ]\n",
    "    pivot_table[\"Avg_all_datasets\"] = pivot_table[value_cols].mean(axis=1)\n",
    "    value_cols.append(\"Avg_all_datasets\")\n",
    "\n",
    "    # Add average row (column-wise). Building it as a one-row frame keeps the\n",
    "    # averages as floats; going through a mixed float/str Series would turn\n",
    "    # every timing column of the result into object dtype.\n",
    "    avg_row = pd.DataFrame([{\n",
    "        \"embedding\": \"Average\",\n",
    "        \"classifier\": \"Average\",\n",
    "        **pivot_table[value_cols].mean().to_dict(),\n",
    "    }])\n",
    "\n",
    "    # Append average row\n",
    "    pivot_table = pd.concat([pivot_table, avg_row], ignore_index=True)\n",
    "\n",
    "    # Save\n",
    "    pivot_table.to_csv(output_file, index=False)\n",
    "    print(f\"Saved {output_file}\")\n",
    "    return pivot_table\n",
    "\n",
    "# Build and save the total-time table for each split, keeping both frames\n",
    "table_30_70, table_70_30 = [\n",
    "    make_split_table(df_filtered, split_frac, split_csv)\n",
    "    for split_frac, split_csv in [\n",
    "        (0.3, \"dataset_wise_total_times_30_70.csv\"),\n",
    "        (0.7, \"dataset_wise_total_times_70_30.csv\"),\n",
    "    ]\n",
    "]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "80469938-8d6e-405c-91a1-6d609eb314b5",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
