{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "515e7e7b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import torch\n",
    "import torch_geometric.nn\n",
    "import torch_geometric.data as data\n",
    "from torch_geometric.utils.convert import to_networkx\n",
    "\n",
    "import torch.nn as nn\n",
    "\n",
    "import scanpy as sc\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "def sigmoid(x):\n",
    "    \"\"\"Return the element-wise logistic function 1 / (1 + exp(-x)).\n",
    "\n",
    "    Evaluated as exp(-log(1 + exp(-x))) through np.logaddexp, so large\n",
    "    negative inputs do not overflow inside np.exp (the naive form emits a\n",
    "    RuntimeWarning there).\n",
    "\n",
    "    Parameters:\n",
    "        x: scalar or array-like of real numbers.\n",
    "\n",
    "    Returns:\n",
    "        NumPy float or ndarray shaped like x, with values in [0, 1].\n",
    "    \"\"\"\n",
    "    return np.exp(-np.logaddexp(0.0, np.negative(x)))\n",
    "\n",
    "# Run on the GPU when CUDA is available, otherwise fall back to the CPU.\n",
    "if torch.cuda.is_available():\n",
    "    device = torch.device('cuda')\n",
    "else:\n",
    "    device = torch.device('cpu')\n",
    "\n",
    "from torch_geometric.nn import TransformerConv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "04ef840a",
   "metadata": {},
   "outputs": [],
   "source": [
    "def set_seed(seed):\n",
    "    \"\"\"Seed every random number generator this notebook relies on.\n",
    "\n",
    "    Seeds Python's `random`, NumPy and PyTorch (CPU and all CUDA devices),\n",
    "    exports PYTHONHASHSEED and switches cuDNN to deterministic mode.\n",
    "\n",
    "    Parameters:\n",
    "        seed: integer seed shared by all generators.\n",
    "    \"\"\"\n",
    "    # Imported locally because the notebook's import cell does not provide them.\n",
    "    import os\n",
    "    import random\n",
    "\n",
    "    random.seed(seed)\n",
    "    np.random.seed(seed)\n",
    "    # Hash randomization is fixed when the interpreter starts, so this only\n",
    "    # takes effect in processes launched after this call.\n",
    "    os.environ['PYTHONHASHSEED'] = str(seed)\n",
    "    torch.manual_seed(seed)\n",
    "    torch.cuda.manual_seed_all(seed)\n",
    "    # Disable cuDNN autotuning and request deterministic kernels.\n",
    "    torch.backends.cudnn.benchmark = False\n",
    "    torch.backends.cudnn.deterministic = True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b65a61ee",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Seed all generators handled by set_seed with 0 before any data is processed.\n",
    "set_seed(seed=0)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "40d6683d",
   "metadata": {},
   "source": [
    "# Here we use the heart dataset as an example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e9b7a48c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# for specific encoder/decoder\n",
    "# tissue_list = { \n",
    "#                \"heart\":[233, 676, 783, 947,266, 223, 233, 978, 928, 852, 839, 733]}\n",
    "\n",
    "# Sample identifiers, keyed by the tissue prefix that appears in the CSV file names.\n",
    "# NOTE: a later cell rebinds `tissue_list` to a plain list, so re-run this cell\n",
    "# from the top before re-running the loading loop below.\n",
    "tissue_list = {\n",
    "    'scrna_heart': [\n",
    "        'D4', 'H2', 'H3', 'D6', 'D2', 'H7', 'D11', 'D3',\n",
    "        'D1', 'D5', 'H4', 'D7', 'H6', 'H5', 'G19',\n",
    "    ],\n",
    "}\n",
    "\n",
    "# construct graph batch\n",
    "# based on simulation results\n",
    "graph_list = []  # one torch_geometric Data object per sample\n",
    "cor_list = []    # the matrix read from each *_pvalue.csv (its non-zero entries define the edges)\n",
    "label_list = []  # tissue key of each graph\n",
    "\n",
    "for tissue, sample_ids in tissue_list.items():\n",
    "    for i in sample_ids:\n",
    "        print(i)\n",
    "        # f-strings also accept non-string ids such as the integers in the commented-out list above.\n",
    "        pathway_count = f'./heart_atlas/{tissue}_{i}_rna_expression.csv'\n",
    "        pathway_matrix = f'./heart_atlas/{tissue}_{i}_pvalue.csv'\n",
    "\n",
    "        pd_adata_new = pd.read_csv(pathway_count, index_col=0)\n",
    "        correlation = pd.read_csv(pathway_matrix, index_col=0)\n",
    "        cor_list.append(correlation)\n",
    "\n",
    "        print(correlation.shape)\n",
    "        print(pd_adata_new.shape)\n",
    "        adata = sc.AnnData(pd_adata_new)\n",
    "\n",
    "        # Every non-zero entry becomes an edge from its row index to its column index.\n",
    "        # The indices stay integral end to end: a float32 round-trip would silently\n",
    "        # round node ids above 2**24.\n",
    "        edges_new = np.vstack(np.nonzero(correlation.values))\n",
    "        graph = data.Data(\n",
    "            x=torch.FloatTensor(adata.X.copy()),\n",
    "            edge_index=torch.as_tensor(edges_new, dtype=torch.long),\n",
    "        )\n",
    "\n",
    "        # Extra attributes read by the later embedding cells.\n",
    "        graph.gene_list = pd_adata_new.index\n",
    "        graph.show_index = f'{tissue}__{i}'\n",
    "\n",
    "        graph_list.append(graph)\n",
    "        label_list.append(tissue)\n",
    "\n",
    "# Number of graphs that were built.\n",
    "count = len(graph_list)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "e6916d01",
   "metadata": {},
   "source": [
    "# PCA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b5427ae5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the list of graph objects assembled by the loading loop.\n",
    "graph_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f177f611",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Accumulators filled once per graph by the PCA loop:\n",
    "# embeddings, gene identifiers and per-node sample labels.\n",
    "emb_list, gene_list, tissue_list = [], [], []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "555e12ac",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of entries currently in tissue_list (0 when run right after the reset in the previous cell).\n",
    "len(tissue_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b85460f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# For every graph: standardize its node features, keep a 32-component PCA,\n",
    "# and record the matching gene identifiers and sample labels.\n",
    "for graph in graph_list:\n",
    "    node_features = graph.x.cpu().numpy()\n",
    "    adata = sc.AnnData(node_features)\n",
    "    sc.pp.scale(adata)\n",
    "    sc.tl.pca(adata, 32)\n",
    "\n",
    "    emb_list.append(adata.obsm['X_pca'])\n",
    "    gene_list.append(graph.gene_list)\n",
    "    # One copy of the sample label per node of this graph.\n",
    "    tissue_list.append([graph.show_index] * len(graph.x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bf101d53",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stack the per-graph PCA embeddings (one entry of emb_list per graph) into a single AnnData.\n",
    "adata = sc.AnnData(np.concatenate(emb_list))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b073bd66",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display the summary of the combined AnnData.\n",
    "adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "baeff996",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Annotate each row with its gene identifier and with the sample label\n",
    "# (graph.show_index) of the graph it came from.\n",
    "adata.obs['gene'] = np.concatenate(gene_list)\n",
    "adata.obs['tissue'] = np.concatenate(tissue_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "29414787",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inspect the per-row sample labels.\n",
    "adata.obs['tissue']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "26f356ac",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Neighbor graph computed on adata.X itself (use_rep='X'), followed by UMAP.\n",
    "sc.pp.neighbors(adata, use_rep='X')\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aec1a16d",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# UMAP colored by the per-row sample label.\n",
    "sc.pl.umap(adata, color='tissue')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "91351db5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Leiden clustering; the next cell plots the result via color='leiden'.\n",
    "sc.tl.leiden(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "29b36239",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# UMAP colored by Leiden cluster.\n",
    "sc.pl.umap(adata, color='leiden')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "09d62bf1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the tissue prefix, i.e. the text before the first '__' of each sample label.\n",
    "adata.obs['tissue_new'] = [label.partition('__')[0] for label in adata.obs['tissue']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a81e183f",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# UMAP colored by the tissue prefix stored in 'tissue_new'.\n",
    "sc.pl.umap(adata, color='tissue_new')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e112e992",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the PCA-based AnnData to disk (path is relative to the working directory).\n",
    "adata.write_h5ad(\"heart_global/heart_umi_PCA.h5ad\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d2d9517c",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "6f635764",
   "metadata": {},
   "source": [
    "# Gene2vec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "be1012fb",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from torch_geometric.utils.convert import to_networkx\n",
    "\n",
    "import numpy as np\n",
    "\n",
    "import scanpy as sc\n",
    "import numpy as np\n",
    "\n",
    "\n",
    "def sigmoid(x):\n",
    "    \"\"\"Return the element-wise logistic function 1 / (1 + exp(-x)).\n",
    "\n",
    "    Evaluated as exp(-log(1 + exp(-x))) through np.logaddexp, so large\n",
    "    negative inputs do not overflow inside np.exp (the naive form emits a\n",
    "    RuntimeWarning there).\n",
    "\n",
    "    Parameters:\n",
    "        x: scalar or array-like of real numbers.\n",
    "\n",
    "    Returns:\n",
    "        NumPy float or ndarray shaped like x, with values in [0, 1].\n",
    "    \"\"\"\n",
    "    return np.exp(-np.logaddexp(0.0, np.negative(x)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d4319b34",
   "metadata": {},
   "outputs": [],
   "source": [
    "import gensim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "690bcfb6",
   "metadata": {},
   "outputs": [],
   "source": [
    "####training parameters########\n",
    "dimension = 32  # dimension of the embedding\n",
    "num_workers = 32  # number of worker threads\n",
    "sg = 1  # sg =1, skip-gram, sg =0, CBOW\n",
    "max_iter = 10  # number of iterations\n",
    "window_size = 1  # The maximum distance between the gene and predicted gene within a gene list\n",
    "txtOutput = True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "62d4f541",
   "metadata": {},
   "outputs": [],
   "source": [
    "# model = gensim.models.Word2Vec(gene_pairs, vector_size=dimension, window=window_size, min_count=1, workers=num_workers,sg=sg, )\n",
    "# model.train(gene_pairs,total_examples=model.corpus_count,epochs=max_iter)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ae448bfb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# vector = model.wv['ENSG00000158747.15'] "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c4ce9413",
   "metadata": {},
   "outputs": [],
   "source": [
    "# vector_list = np.zeros((1000,32))\n",
    "# gene_list = []\n",
    "# for num,i in enumerate(edge_list.index):\n",
    "#     vector_list[num] = model.wv[i] \n",
    "#     gene_list.append(i)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "15959f8b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# gene_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e56d9ce7",
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_list(model, edge_list):\n",
    "    \"\"\"Collect the trained embedding of every gene in `edge_list.index`.\n",
    "\n",
    "    Parameters:\n",
    "        model: trained gensim Word2Vec model; `model.wv[gene]` must return that gene's vector.\n",
    "        edge_list: DataFrame whose index holds the gene identifiers.\n",
    "\n",
    "    Returns:\n",
    "        (vector_list, gene_list): a float array of shape\n",
    "        (n_genes, model.wv.vector_size) whose rows follow `edge_list.index`,\n",
    "        and the list of those gene identifiers.\n",
    "    \"\"\"\n",
    "    gene_list = list(edge_list.index)\n",
    "    # Size the matrix from the inputs instead of assuming 1000 genes x 32 dimensions,\n",
    "    # which zero-padded or failed for any other gene count or embedding size.\n",
    "    vector_list = np.zeros((len(gene_list), model.wv.vector_size))\n",
    "    for row, gene in enumerate(gene_list):\n",
    "        vector_list[row] = model.wv[gene]\n",
    "\n",
    "    return vector_list, gene_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4320435e",
   "metadata": {},
   "outputs": [],
   "source": [
    "vec_list = []         # one embedding matrix per graph\n",
    "gene_list_final = []  # gene identifiers matching the rows of each matrix\n",
    "\n",
    "# Train one gene2vec model per graph on that graph's edges.\n",
    "# zip keeps the results aligned with graph_list, which later cells use for the labels.\n",
    "for _graph, edge_list in zip(graph_list, cor_list):\n",
    "    # Every non-zero entry yields a two-gene 'sentence' [row gene, column gene].\n",
    "    row_pos, col_pos = np.nonzero(edge_list.values)\n",
    "    gene_pairs = [\n",
    "        [row_gene, col_gene]\n",
    "        for row_gene, col_gene in zip(edge_list.index[row_pos], edge_list.columns[col_pos])\n",
    "    ]\n",
    "\n",
    "    # NOTE(review): passing gene_pairs to the constructor already trains with gensim's\n",
    "    # default number of epochs, so train() adds max_iter further epochs - confirm intended.\n",
    "    # NOTE(review): set_seed does not reach gensim; Word2Vec has its own `seed` argument and\n",
    "    # multi-worker training is presumably not run-to-run reproducible - confirm if that matters.\n",
    "    model = gensim.models.Word2Vec(\n",
    "        gene_pairs,\n",
    "        vector_size=dimension,\n",
    "        window=window_size,\n",
    "        min_count=1,\n",
    "        workers=num_workers,\n",
    "        sg=sg,\n",
    "    )\n",
    "    model.train(gene_pairs, total_examples=model.corpus_count, epochs=max_iter)\n",
    "\n",
    "    print('finish gene2vec training')\n",
    "    # Reuse the helper defined above instead of repeating its body inline.\n",
    "    vector_list, gene_list = generate_list(model, edge_list)\n",
    "\n",
    "    vec_list.append(vector_list)\n",
    "    gene_list_final.append(gene_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2b6246c7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-node sample labels for every graph, in graph order.\n",
    "# Built without rebinding `label_list`, which the loading cell fills with one tissue per graph.\n",
    "tissue_list = [[graph.show_index] * len(graph.x) for graph in graph_list]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "479cfbef",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flatten the per-graph label lists; concatenating the lists directly also works\n",
    "# when graphs have different numbers of nodes (np.array would reject ragged input).\n",
    "np.concatenate(tissue_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3dadca10",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stack the per-graph gene2vec matrices row-wise into a single AnnData; concatenating\n",
    "# the list directly also works when the matrices have different numbers of rows.\n",
    "adata = sc.AnnData(np.concatenate(vec_list))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9fe70ecb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-row sample label; concatenated directly so ragged per-graph lists are accepted.\n",
    "adata.obs['tissue'] = np.concatenate(tissue_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "60d7ce89",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-row gene identifier; concatenated directly so ragged per-graph lists are accepted.\n",
    "adata.obs['gene'] = np.concatenate(gene_list_final)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "36d3c259",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Neighbor graph computed on adata.X itself (use_rep='X'), followed by UMAP.\n",
    "sc.pp.neighbors(adata, use_rep='X')\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0e4c29a3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Leiden clustering; the next cell plots the result via color='leiden'.\n",
    "sc.tl.leiden(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "41d78fe3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# UMAP colored by Leiden cluster.\n",
    "sc.pl.umap(adata, color='leiden')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1a24bcad",
   "metadata": {},
   "outputs": [],
   "source": [
    "# UMAP colored by the per-row sample label.\n",
    "sc.pl.umap(adata, color='tissue')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "11d30257",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the gene2vec-based AnnData to disk (path is relative to the working directory).\n",
    "adata.write_h5ad('heart_global/heart_umi_gene2vec.h5ad')"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "fc413854",
   "metadata": {},
   "source": [
    "# scBERT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dc276b5e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Please see the scBERT code:\n",
    "# https://github.com/TencentAILabHealthcare/scBERT"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "b09f66f6",
   "metadata": {},
   "source": [
    "# GIANT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fb3605da",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Please see the GIANT code:\n",
    "# https://github.com/chenhcs/GIANT"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "88ce389e",
   "metadata": {},
   "source": [
    "# GAE/VGAE/MAE/WSMAE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8128832e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Please see the separate file for {method} benchmark.py"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "9d501535",
   "metadata": {},
   "source": [
    "# SUGRL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2ca88f0c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Please see the SUGRL code:\n",
    "# https://github.com/YujieMo/SUGRL"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "6df568e3",
   "metadata": {},
   "source": [
    "# GRACE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "533ebf9d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Please see the GRACE code:\n",
    "# https://github.com/CRIPAC-DIG/GRACE"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "2c69368c",
   "metadata": {},
   "source": [
    "# GPS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "61f5623e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Please see the GPS documentation:\n",
    "# https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.conv.GPSConv.html#torch_geometric.nn.conv.GPSConv"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "f2fe82dc",
   "metadata": {},
   "source": [
    "# Graphormer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d2dfab39",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Please see the Graphormer code:\n",
    "# https://github.com/microsoft/Graphormer"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
