{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "67d38e7b-067d-4ec6-aaf4-58c4d9d2d004",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2.2.2+cu121\n"
     ]
    }
   ],
   "source": [
    "# Show the installed PyTorch version string.\n",
    "import torch\n",
    "print(torch.__version__)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "327d87f1-cfb2-489f-aa06-98b1eb0b5d2e",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import os\n",
    "os.environ[\"CUDA_DEVICE_ORDER\"] = \"PCI_BUS_ID\"\n",
    "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"6\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "d1383b40-1b0e-4cba-a27a-16e0c68f7493",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of CUDA devices PyTorch reports for this process.\n",
    "torch.cuda.device_count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "a9e0d790-0c55-491c-8224-48230370df00",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/minhnh/python_venv/nlp/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "2024-04-05 09:07:07.497737: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
      "2024-04-05 09:07:07.497818: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
      "2024-04-05 09:07:07.498754: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
      "2024-04-05 09:07:07.505809: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
      "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
      "2024-04-05 09:07:08.427440: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
     ]
    }
   ],
   "source": [
    "from datasets import load_dataset, Dataset, DatasetDict\n",
    "import pandas as pd\n",
    "\n",
    "from huggingface_hub.hf_api import HfFolder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "7deeda1f-055c-47b6-9ffa-434ef6027ee3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the Hugging Face access token from the environment instead of\n",
    "# hardcoding it in the notebook; a missing HF_TOKEN fails loudly here\n",
    "# (KeyError) rather than silently saving an empty token.\n",
    "HfFolder.save_token(os.environ[\"HF_TOKEN\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "7523200d-80ea-41bd-a5ae-c80e286914ba",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/minhnh/python_venv/nlp/lib/python3.9/site-packages/datasets/load.py:2483: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
      "You can remove this warning by passing 'token=<use_auth_token>' instead.\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "# `token=True` replaces `use_auth_token=True`, which is deprecated since\n",
    "# datasets 2.14.0 and removed in 3.0.0.\n",
    "dataset = load_dataset(\"TeeA/text2sql_vi\", token=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "71e78237-81d2-4ae0-85ea-58fc1f1dc60c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'schema_syll': 'CREATE TABLE lab(subject_id text,hadm_id text,itemid text,charttime text,flag text,value_unit text,label text,fluid text) CREATE TABLE thủ tục(subject_id text,hadm_id text,icd9_code text,short_title text,long_title text) CREATE TABLE nhân khẩu học(subject_id text,hadm_id text,name text,marital_status text,age text,dob text,giới tính text,ngôn ngữ text,tôn giáo text,loại_nhập viện text,ngày_ở text,bảo hiểm text,dân tộc text,hết hạn_cờ text,vị trí_nhập viện text,vị trí xuất viện text,chẩn đoán text,dod text,dob_year text,dod_year text,thời gian nhập viện text,dischtime text,admityear text) CREATE TABLE đơn thuốc(subject_id text,hadm_id text,icustay_id text,drug_type text,drug text,formulary_drug_cd text,route text,drug_dose text) CREATE TABLE chẩn đoán(subject_id text,hadm_id text,icd9_code text,short_title text,long_title text)',\n",
       " 'schema_word': 'CREATE TABLE lab(subject_id text,hadm_id text,itemid text,charttime text,flag text,value_unit text,label text,fluid text) CREATE TABLE thủ_tục(subject_id text,hadm_id text,icd9_code text,short_title text,long_title text) CREATE TABLE nhân_khẩu học(subject_id text,hadm_id text,name text,marital_status text,age text,dob text,giới_tính text,ngôn_ngữ text,tôn_giáo text,loại_nhập_viện text,ngày_ở text,bảo_hiểm text,dân_tộc text,hết hạn_cờ text,vị trí_nhập_viện text,vị_trí xuất_viện text,chẩn_đoán text,dod text,dob_year text,dod_year text,thời_gian nhập_viện text,dischtime text,admityear text) CREATE TABLE đơn thuốc(subject_id text,hadm_id text,icustay_id text,drug_type text,drug text,formulary_drug_cd text,route text,drug_dose text) CREATE TABLE chẩn_đoán(subject_id text,hadm_id text,icd9_code text,short_title text,long_title text)',\n",
       " 'query_syll': 'SELECT COUNT(DISTINCTnhân khẩu học.subject_id) FROM nhân khẩu học INNER JOIN thủ tục ON nhân khẩu học.hadm_id = thủ tục.hadm_id WHERE nhân khẩu học.gender = \"F\" AND thủ tục.long_title = \"chuyển nhịp nhĩ\"',\n",
       " 'source': 'mimicsql_data',\n",
       " 'question_syll': 'cho tôi xem số lượng bệnh nhân nữ đã trải qua chuyển nhịp nhĩ.',\n",
       " 'question_word': 'cho tôi xem số_lượng bệnh_nhân nữ đã trải qua chuyển nhịp nhĩ .',\n",
       " 'query_word': 'SELECT COUNT( DISTINCT nhân_khẩu học.subject_id) FROM nhân_khẩu học INNER JOIN thủ_tục ON nhân_khẩu học.hadm_id = thủ_tục.hadm_id WHERE nhân_khẩu học.gender = \"F\" AND thủ_tục.long_title = \"chuyển nhịp nhĩ\"'}"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the first training example to inspect the available fields.\n",
    "dataset['train'][0]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "baeb045d-90bd-4c58-a121-098d798cb3a3",
   "metadata": {},
   "source": [
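    "**Meaning level** — embed each word-segmented question (`question_word`) with PhoBERT and keep one vector per question."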
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "25018252-6008-47df-acf4-353e58072f39",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of RobertaModel were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "from transformers import AutoModel, AutoTokenizer\n",
    "\n",
    "# Encoder and tokenizer are loaded from the same checkpoint name.\n",
    "# device_map=\"auto\" leaves weight placement to the loader.\n",
    "phobert = AutoModel.from_pretrained(\n",
    "    \"vinai/phobert-base-v2\",\n",
    "    device_map=\"auto\",\n",
    ")\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"vinai/phobert-base-v2\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "10cffd76-06d1-4fad-9765-87156975eae7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The `question_word` column of the training split (one question string per row).\n",
    "instruction = dataset['train']['question_word']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "c096afda-b557-43eb-8a45-f1cfc8efe006",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 243964/243964 [28:33<00:00, 142.35it/s]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "# Run on whichever device the model was loaded onto instead of assuming 'cuda:0'.\n",
    "device = phobert.device\n",
    "\n",
    "# NOTE(review): every entry keeps the complete model output on `device`;\n",
    "# the full run therefore holds one output object per question in memory.\n",
    "list_emb = []\n",
    "for i, question in enumerate(tqdm(instruction)):\n",
    "    # Encode one question as a batch of size 1, capped at 254 token ids\n",
    "    # (presumably to stay under the model's position limit -- confirm).\n",
    "    input_ids = torch.tensor(\n",
    "        [tokenizer.encode(question, truncation=True, max_length=254)]\n",
    "    )\n",
    "\n",
    "    # Inference only: no gradients are needed.\n",
    "    with torch.no_grad():\n",
    "        features = phobert(input_ids.to(device))\n",
    "    list_emb.append((i, features))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "ac0b034c-8aba-4947-a3e9-604acd0cfd05",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "243964\n"
     ]
    }
   ],
   "source": [
    "import pickle\n",
    "\n",
    "len_emb = len(list_emb)\n",
    "print(len_emb)\n",
    "\n",
    "# Take position 0 along the second axis of each output's first element\n",
    "# (presumably the first-token hidden state -- confirm) as a CPU NumPy array.\n",
    "list_emb_1 = []\n",
    "for idx, output in list_emb:\n",
    "    first_position = output[0][:, 0, :].to('cpu').numpy()\n",
    "    list_emb_1.append((idx, first_position))\n",
    "\n",
    "# squeeze(0) removes the leading axis when its length is 1.\n",
    "list_emb_2 = []\n",
    "for idx, vector in list_emb_1:\n",
    "    list_emb_2.append((idx, vector.squeeze(0)))\n",
    "\n",
    "# Persist the (index, vector) pairs locally.\n",
    "with open(\"my_array_word.pickle\", \"wb\") as out_file:\n",
    "    pickle.dump(list_emb_2, out_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "c6e28e6c-4c93-442d-acfe-701b90d583e3",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import json\n",
    "import numpy as np\n",
    "\n",
    "class NumpyEncoder(json.JSONEncoder):\n",
    "    def default(self, obj):\n",
    "        if isinstance(obj, np.ndarray):\n",
    "            return obj.tolist()\n",
    "        return json.JSONEncoder.default(self, obj)\n",
    "\n",
    "# One record per entry of list_emb_2: its index and its embedding array.\n",
    "list_data = [{'index': idx, 'emb': vector} for idx, vector in list_emb_2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "844f7e60-f491-4844-9e5b-e218ad8c0753",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]\n",
      "Creating parquet from Arrow format:   0%|          | 0/122 [00:00<?, ?ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:   2%|▏         | 3/122 [00:00<00:04, 29.49ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:   6%|▌         | 7/122 [00:00<00:03, 31.71ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:   9%|▉         | 11/122 [00:00<00:03, 31.37ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  12%|█▏        | 15/122 [00:00<00:03, 31.38ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  16%|█▌        | 19/122 [00:00<00:05, 17.38ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  19%|█▉        | 23/122 [00:01<00:04, 20.28ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  22%|██▏       | 27/122 [00:01<00:04, 22.43ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  25%|██▍       | 30/122 [00:01<00:03, 23.67ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  27%|██▋       | 33/122 [00:01<00:05, 15.91ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  30%|███       | 37/122 [00:01<00:04, 19.68ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  34%|███▎      | 41/122 [00:01<00:03, 22.39ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  36%|███▌      | 44/122 [00:01<00:03, 23.73ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  39%|███▉      | 48/122 [00:02<00:02, 26.92ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  43%|████▎     | 52/122 [00:02<00:02, 29.06ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  46%|████▌     | 56/122 [00:02<00:02, 29.83ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  49%|████▉     | 60/122 [00:02<00:01, 31.91ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  52%|█████▏    | 64/122 [00:02<00:03, 18.66ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  56%|█████▌    | 68/122 [00:02<00:02, 21.54ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  59%|█████▉    | 72/122 [00:03<00:02, 23.94ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  62%|██████▏   | 76/122 [00:03<00:01, 26.73ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  66%|██████▌   | 80/122 [00:03<00:01, 28.86ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  69%|██████▉   | 84/122 [00:03<00:01, 30.52ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  72%|███████▏  | 88/122 [00:03<00:01, 31.93ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  75%|███████▌  | 92/122 [00:03<00:00, 32.19ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  79%|███████▊  | 96/122 [00:03<00:00, 32.16ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  82%|████████▏ | 100/122 [00:03<00:00, 33.22ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  85%|████████▌ | 104/122 [00:03<00:00, 33.90ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  89%|████████▊ | 108/122 [00:04<00:00, 34.69ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  92%|█████████▏| 112/122 [00:04<00:00, 34.42ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  95%|█████████▌| 116/122 [00:04<00:00, 34.04ba/s]\u001B[A\n",
      "Creating parquet from Arrow format: 100%|██████████| 122/122 [00:04<00:00, 26.97ba/s]\u001B[A\n",
      "Uploading the dataset shards:  50%|█████     | 1/2 [00:18<00:18, 18.69s/it]\n",
      "Creating parquet from Arrow format:   0%|          | 0/122 [00:00<?, ?ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:   3%|▎         | 4/122 [00:00<00:03, 32.25ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:   7%|▋         | 8/122 [00:00<00:03, 33.07ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  10%|▉         | 12/122 [00:00<00:03, 32.33ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  13%|█▎        | 16/122 [00:00<00:06, 16.96ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  16%|█▋        | 20/122 [00:00<00:04, 20.69ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  20%|█▉        | 24/122 [00:01<00:04, 23.75ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  23%|██▎       | 28/122 [00:01<00:03, 26.56ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  26%|██▌       | 32/122 [00:01<00:03, 28.91ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  30%|██▉       | 36/122 [00:01<00:05, 17.10ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  32%|███▏      | 39/122 [00:01<00:04, 19.09ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  35%|███▌      | 43/122 [00:01<00:03, 22.04ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  39%|███▊      | 47/122 [00:02<00:03, 24.62ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  42%|████▏     | 51/122 [00:02<00:02, 26.75ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  45%|████▌     | 55/122 [00:02<00:02, 28.50ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  48%|████▊     | 59/122 [00:02<00:02, 29.62ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  52%|█████▏    | 63/122 [00:02<00:01, 31.64ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  55%|█████▍    | 67/122 [00:02<00:01, 32.39ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  58%|█████▊    | 71/122 [00:03<00:02, 17.73ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  61%|██████▏   | 75/122 [00:03<00:02, 20.52ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  64%|██████▍   | 78/122 [00:03<00:01, 22.22ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  67%|██████▋   | 82/122 [00:03<00:01, 25.02ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  70%|███████   | 86/122 [00:03<00:01, 27.37ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  74%|███████▍  | 90/122 [00:03<00:01, 21.33ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  77%|███████▋  | 94/122 [00:03<00:01, 23.73ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  80%|████████  | 98/122 [00:04<00:00, 26.08ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  84%|████████▎ | 102/122 [00:04<00:00, 28.02ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  87%|████████▋ | 106/122 [00:04<00:00, 28.70ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  90%|█████████ | 110/122 [00:04<00:00, 30.16ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  93%|█████████▎| 114/122 [00:04<00:00, 31.01ba/s]\u001B[A\n",
      "Creating parquet from Arrow format:  97%|█████████▋| 118/122 [00:04<00:00, 31.82ba/s]\u001B[A\n",
      "Creating parquet from Arrow format: 100%|██████████| 122/122 [00:04<00:00, 25.38ba/s]\u001B[A\n",
      "Uploading the dataset shards: 100%|██████████| 2/2 [00:40<00:00, 20.10s/it]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/datasets/hoangphu7122002ai/phobert_t2sql_embedding_word/commit/95b63e9e3b51d14e35c17c956244ce8b826d47e6', commit_message='Upload dataset', commit_description='', oid='95b63e9e3b51d14e35c17c956244ce8b826d47e6', pr_url=None, pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame(list_data)\n",
    "from datasets import Dataset\n",
    "\n",
    "# Use a dedicated name: rebinding `dataset` here would replace the text2sql\n",
    "# dataset loaded earlier in the notebook, so re-running a cell that reads\n",
    "# dataset['train'] would then fail.\n",
    "embedding_dataset = Dataset.from_pandas(df)\n",
    "embedding_dataset.push_to_hub(\"hoangphu7122002ai/phobert_t2sql_embedding_word\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "NLP",
   "language": "python",
   "name": "nlp"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
