{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.7.12","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"! pip install datasets transformers seqeval\nimport os\nimport math\nimport random\nimport csv\nimport sys\n\nimport numpy as np\nimport pandas as pd\nfrom sklearn import metrics\nfrom sklearn.metrics import f1_score, precision_score, recall_score\nfrom sklearn.metrics import classification_report\nimport statistics as stats\nfrom datasets import load_dataset, load_metric\n\nfrom transformers import AutoModelForTokenClassification, TrainingArguments, Trainer\nfrom transformers import AutoTokenizer\n\nimport random \nimport numpy as np\nimport torch\n\ndef random_seed(seed_value): \n    np.random.seed(seed_value) \n    torch.manual_seed(seed_value)\n    random.seed(seed_value) \n\n\nrandom_seed(42)","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","execution":{"iopub.status.busy":"2022-06-15T18:37:24.838306Z","iopub.execute_input":"2022-06-15T18:37:24.839083Z","iopub.status.idle":"2022-06-15T18:37:35.463318Z","shell.execute_reply.started":"2022-06-15T18:37:24.839047Z","shell.execute_reply":"2022-06-15T18:37:35.462218Z"},"trusted":true},"execution_count":32,"outputs":[{"name":"stdout","text":"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\nTo disable this warning, you can either:\n\t- Avoid using `tokenizers` before the fork if possible\n\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\nRequirement already satisfied: datasets in /opt/conda/lib/python3.7/site-packages (2.1.0)\nRequirement already satisfied: transformers in /opt/conda/lib/python3.7/site-packages (4.18.0)\nRequirement already satisfied: seqeval in /opt/conda/lib/python3.7/site-packages (1.2.2)\nRequirement already satisfied: pyarrow>=5.0.0 in /opt/conda/lib/python3.7/site-packages (from datasets) (5.0.0)\nRequirement already satisfied: xxhash in /opt/conda/lib/python3.7/site-packages (from datasets) (3.0.0)\nRequirement already satisfied: multiprocess in /opt/conda/lib/python3.7/site-packages (from datasets) (0.70.13)\nRequirement already satisfied: numpy>=1.17 in /opt/conda/lib/python3.7/site-packages (from datasets) (1.21.6)\nRequirement already satisfied: aiohttp in /opt/conda/lib/python3.7/site-packages (from datasets) (3.8.1)\nRequirement already satisfied: huggingface-hub<1.0.0,>=0.1.0 in /opt/conda/lib/python3.7/site-packages (from datasets) (0.5.1)\nRequirement already satisfied: fsspec[http]>=2021.05.0 in /opt/conda/lib/python3.7/site-packages (from datasets) (2022.5.0)\nRequirement already satisfied: tqdm>=4.62.1 in /opt/conda/lib/python3.7/site-packages (from datasets) (4.64.0)\nRequirement already satisfied: packaging in /opt/conda/lib/python3.7/site-packages (from datasets) (21.3)\nRequirement already satisfied: responses<0.19 in /opt/conda/lib/python3.7/site-packages (from datasets) (0.18.0)\nRequirement already satisfied: dill in /opt/conda/lib/python3.7/site-packages (from datasets) (0.3.5.1)\nRequirement already satisfied: pandas in /opt/conda/lib/python3.7/site-packages (from datasets) (1.3.5)\nRequirement already satisfied: requests>=2.19.0 in /opt/conda/lib/python3.7/site-packages (from datasets) (2.27.1)\nRequirement already satisfied: importlib-metadata in /opt/conda/lib/python3.7/site-packages (from datasets) (4.11.4)\nRequirement already satisfied: filelock in /opt/conda/lib/python3.7/site-packages (from transformers) (3.6.0)\nRequirement already satisfied: sacremoses in /opt/conda/lib/python3.7/site-packages (from transformers) (0.0.53)\nRequirement already satisfied: regex!=2019.12.17 in /opt/conda/lib/python3.7/site-packages (from transformers) (2021.11.10)\nRequirement already satisfied: tokenizers!=0.11.3,<0.13,>=0.11.1 in /opt/conda/lib/python3.7/site-packages (from transformers) (0.12.1)\nRequirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.7/site-packages (from transformers) (6.0)\nRequirement already satisfied: scikit-learn>=0.21.3 in /opt/conda/lib/python3.7/site-packages (from seqeval) (1.0.2)\nRequirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.7/site-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (4.2.0)\nRequirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /opt/conda/lib/python3.7/site-packages (from packaging->datasets) (3.0.9)\nRequirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.7/site-packages (from requests>=2.19.0->datasets) (1.26.9)\nRequirement already satisfied: charset-normalizer~=2.0.0 in /opt/conda/lib/python3.7/site-packages (from requests>=2.19.0->datasets) (2.0.12)\nRequirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.7/site-packages (from requests>=2.19.0->datasets) (3.3)\nRequirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.7/site-packages (from requests>=2.19.0->datasets) (2022.5.18.1)\nRequirement already satisfied: scipy>=1.1.0 in /opt/conda/lib/python3.7/site-packages (from scikit-learn>=0.21.3->seqeval) (1.7.3)\nRequirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/lib/python3.7/site-packages (from scikit-learn>=0.21.3->seqeval) (3.1.0)\nRequirement already satisfied: joblib>=0.11 in /opt/conda/lib/python3.7/site-packages (from scikit-learn>=0.21.3->seqeval) (1.1.0)\nRequirement already satisfied: asynctest==0.13.0 in /opt/conda/lib/python3.7/site-packages (from aiohttp->datasets) (0.13.0)\nRequirement already satisfied: yarl<2.0,>=1.0 in /opt/conda/lib/python3.7/site-packages (from aiohttp->datasets) (1.7.2)\nRequirement already satisfied: frozenlist>=1.1.1 in /opt/conda/lib/python3.7/site-packages (from aiohttp->datasets) (1.3.0)\nRequirement already satisfied: aiosignal>=1.1.2 in /opt/conda/lib/python3.7/site-packages (from aiohttp->datasets) (1.2.0)\nRequirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.7/site-packages (from aiohttp->datasets) (21.4.0)\nRequirement already satisfied: multidict<7.0,>=4.5 in /opt/conda/lib/python3.7/site-packages (from aiohttp->datasets) (6.0.2)\nRequirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /opt/conda/lib/python3.7/site-packages (from aiohttp->datasets) (4.0.2)\nRequirement already satisfied: zipp>=0.5 in /opt/conda/lib/python3.7/site-packages (from importlib-metadata->datasets) (3.8.0)\nRequirement already satisfied: python-dateutil>=2.7.3 in /opt/conda/lib/python3.7/site-packages (from pandas->datasets) (2.8.2)\nRequirement already satisfied: pytz>=2017.3 in /opt/conda/lib/python3.7/site-packages (from pandas->datasets) (2022.1)\nRequirement already satisfied: click in /opt/conda/lib/python3.7/site-packages (from sacremoses->transformers) (8.0.4)\nRequirement already satisfied: six in /opt/conda/lib/python3.7/site-packages (from sacremoses->transformers) (1.16.0)\n\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n\u001b[0m","output_type":"stream"}]},{"cell_type":"code","source":"os.environ[\"WANDB_DISABLED\"] = \"true\"","metadata":{"execution":{"iopub.status.busy":"2022-06-15T18:47:20.541048Z","iopub.execute_input":"2022-06-15T18:47:20.541703Z","iopub.status.idle":"2022-06-15T18:47:20.565030Z","shell.execute_reply.started":"2022-06-15T18:47:20.541648Z","shell.execute_reply":"2022-06-15T18:47:20.550752Z"},"trusted":true},"execution_count":41,"outputs":[]},{"cell_type":"code","source":"ncbi = load_dataset(\"ncbi_disease\")","metadata":{"execution":{"iopub.status.busy":"2022-06-15T18:26:38.626003Z","iopub.execute_input":"2022-06-15T18:26:38.626963Z","iopub.status.idle":"2022-06-15T18:26:45.414622Z","shell.execute_reply.started":"2022-06-15T18:26:38.626925Z","shell.execute_reply":"2022-06-15T18:26:45.413797Z"},"trusted":true},"execution_count":2,"outputs":[{"output_type":"display_data","data":{"text/plain":"Downloading builder script:   0%|          | 0.00/2.28k [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"8ac4c3601786449db0040bb24aa4ff71"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"Downloading metadata:   0%|          | 0.00/1.55k [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"cda7835273c44fdda92b871f372ae615"}},"metadata":{}},{"name":"stdout","text":"Downloading and preparing dataset ncbi_disease/ncbi_disease (download: 1.47 MiB, generated: 3.04 MiB, post-processed: Unknown size, total: 4.52 MiB) to /root/.cache/huggingface/datasets/ncbi_disease/ncbi_disease/1.0.0/92314c7992b0b8a5ea2ad101be33f365b684a2cc011e0ffa29c691e6d32b2d03...\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"d40bf50b625340aab6db699fbf371100"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"Downloading data:   0%|          | 0.00/284k [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"51f1a6c8654d445893d9150e7f7b766a"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"Downloading data:   0%|          | 0.00/51.2k [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"bb214334bca14dc8bdc15005cb537552"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"Downloading data:   0%|          | 0.00/52.4k [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"cde8f2804def4e7693d807021f45e7b2"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"81d13b0c2be04fd8874e0be9e2e96328"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"Generating train split:   0%|          | 0/5433 [00:00<?, ? examples/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":""}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"Generating validation split:   0%|          | 0/924 [00:00<?, ? examples/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":""}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"Generating test split:   0%|          | 0/941 [00:00<?, ? examples/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":""}},"metadata":{}},{"name":"stdout","text":"Dataset ncbi_disease downloaded and prepared to /root/.cache/huggingface/datasets/ncbi_disease/ncbi_disease/1.0.0/92314c7992b0b8a5ea2ad101be33f365b684a2cc011e0ffa29c691e6d32b2d03. Subsequent calls will reuse this data.\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"  0%|          | 0/3 [00:00<?, ?it/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"42c31d528ce8420fa5f401a38f4ed027"}},"metadata":{}}]},{"cell_type":"code","source":"ncbi[\"train\"][0]","metadata":{"execution":{"iopub.status.busy":"2022-06-15T18:26:45.418491Z","iopub.execute_input":"2022-06-15T18:26:45.420457Z","iopub.status.idle":"2022-06-15T18:26:45.434565Z","shell.execute_reply.started":"2022-06-15T18:26:45.420415Z","shell.execute_reply":"2022-06-15T18:26:45.433733Z"},"trusted":true},"execution_count":3,"outputs":[{"execution_count":3,"output_type":"execute_result","data":{"text/plain":"{'id': '0',\n 'tokens': ['Identification',\n  'of',\n  'APC2',\n  ',',\n  'a',\n  'homologue',\n  'of',\n  'the',\n  'adenomatous',\n  'polyposis',\n  'coli',\n  'tumour',\n  'suppressor',\n  '.'],\n 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0]}"},"metadata":{}}]},{"cell_type":"code","source":"label_list = ncbi[\"train\"].features[f\"ner_tags\"].feature.names\nprint(len(label_list))\nlabel_list","metadata":{"execution":{"iopub.status.busy":"2022-06-15T18:26:45.438870Z","iopub.execute_input":"2022-06-15T18:26:45.441328Z","iopub.status.idle":"2022-06-15T18:26:45.453121Z","shell.execute_reply.started":"2022-06-15T18:26:45.441288Z","shell.execute_reply":"2022-06-15T18:26:45.452353Z"},"trusted":true},"execution_count":4,"outputs":[{"name":"stdout","text":"3\n","output_type":"stream"},{"execution_count":4,"output_type":"execute_result","data":{"text/plain":"['O', 'B-Disease', 'I-Disease']"},"metadata":{}}]},{"cell_type":"code","source":"MODEL_NAME = \"allenai/scibert_scivocab_uncased\"","metadata":{"execution":{"iopub.status.busy":"2022-06-15T18:26:45.457283Z","iopub.execute_input":"2022-06-15T18:26:45.457615Z","iopub.status.idle":"2022-06-15T18:26:45.465031Z","shell.execute_reply.started":"2022-06-15T18:26:45.457584Z","shell.execute_reply":"2022-06-15T18:26:45.464022Z"},"trusted":true},"execution_count":5,"outputs":[]},{"cell_type":"code","source":"tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)","metadata":{"execution":{"iopub.status.busy":"2022-06-15T18:26:45.466543Z","iopub.execute_input":"2022-06-15T18:26:45.467159Z","iopub.status.idle":"2022-06-15T18:26:50.161017Z","shell.execute_reply.started":"2022-06-15T18:26:45.467120Z","shell.execute_reply":"2022-06-15T18:26:50.160027Z"},"trusted":true},"execution_count":6,"outputs":[{"output_type":"display_data","data":{"text/plain":"Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"853068cf615940f59338b7a08088bfdf"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"Downloading:   0%|          | 0.00/223k [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"4c693da417b64e9da1e2181809624c2a"}},"metadata":{}}]},{"cell_type":"code","source":"label_all_tokens = True","metadata":{"execution":{"iopub.status.busy":"2022-06-15T18:34:15.509911Z","iopub.execute_input":"2022-06-15T18:34:15.510867Z","iopub.status.idle":"2022-06-15T18:34:15.515064Z","shell.execute_reply.started":"2022-06-15T18:34:15.510819Z","shell.execute_reply":"2022-06-15T18:34:15.513989Z"},"trusted":true},"execution_count":22,"outputs":[]},{"cell_type":"code","source":"def tokenize_and_align_labels(examples):\n    tokenized_inputs = tokenizer(examples[\"tokens\"], truncation=True, is_split_into_words=True)\n\n    labels = []\n    for i, label in enumerate(examples[f\"ner_tags\"]):\n        word_ids = tokenized_inputs.word_ids(batch_index=i)\n        previous_word_idx = None\n        label_ids = []\n        for word_idx in word_ids:\n            # Special tokens have a word id that is None. We set the label to -100 so they are automatically\n            # ignored in the loss function.\n            if word_idx is None:\n                label_ids.append(-100)\n            # We set the label for the first token of each word.\n            elif word_idx != previous_word_idx:\n                label_ids.append(label[word_idx])\n            # For the other tokens in a word, we set the label to either the current label or -100, depending on\n            # the label_all_tokens flag.\n            else:\n                label_ids.append(label[word_idx] if label_all_tokens else -100)\n            previous_word_idx = word_idx\n\n        labels.append(label_ids)\n\n    tokenized_inputs[\"labels\"] = labels\n    return tokenized_inputs","metadata":{"execution":{"iopub.status.busy":"2022-06-15T18:34:15.796873Z","iopub.execute_input":"2022-06-15T18:34:15.797529Z","iopub.status.idle":"2022-06-15T18:34:15.804932Z","shell.execute_reply.started":"2022-06-15T18:34:15.797497Z","shell.execute_reply":"2022-06-15T18:34:15.803878Z"},"trusted":true},"execution_count":23,"outputs":[]},{"cell_type":"code","source":"tokenize_and_align_labels(ncbi['train'][:5])","metadata":{"execution":{"iopub.status.busy":"2022-06-15T18:34:15.984967Z","iopub.execute_input":"2022-06-15T18:34:15.985252Z","iopub.status.idle":"2022-06-15T18:34:15.992571Z","shell.execute_reply.started":"2022-06-15T18:34:15.985226Z","shell.execute_reply":"2022-06-15T18:34:15.991797Z"},"trusted":true},"execution_count":24,"outputs":[{"execution_count":24,"output_type":"execute_result","data":{"text/plain":"{'input_ids': [[102, 3366, 131, 16036, 30132, 422, 106, 28563, 131, 111, 24243, 861, 153, 7503, 1070, 5703, 7421, 14957, 205, 103], [102, 111, 24243, 861, 153, 7503, 1070, 5703, 145, 16036, 546, 7421, 579, 14957, 787, 3151, 111, 12157, 10636, 3430, 214, 8437, 106, 1127, 190, 18260, 12242, 4655, 239, 12186, 145, 20362, 579, 239, 12186, 546, 422, 10361, 30111, 1352, 6030, 107, 137, 6130, 14025, 205, 103], [102, 1127, 2256, 6844, 111, 2857, 4750, 131, 6130, 14025, 205, 103], [102, 121, 3662, 5722, 576, 422, 1738, 131, 16036, 3412, 147, 111, 5091, 131, 6130, 14025, 121, 111, 6438, 422, 582, 256, 10172, 147, 137, 13500, 111, 6168, 30122, 579, 286, 3148, 1491, 145, 6329, 121, 260, 158, 1901, 260, 170, 1901, 546, 205, 103], [102, 1530, 422, 185, 2024, 111, 3366, 137, 6110, 1187, 131, 16036, 28563, 30113, 205, 103]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 0, 0, -100], [-100, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100], [-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100], [-100, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100], [-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]]}"},"metadata":{}}]},{"cell_type":"code","source":"tokenized_datasets = ncbi.map(tokenize_and_align_labels, batched=True)","metadata":{"execution":{"iopub.status.busy":"2022-06-15T18:34:33.854277Z","iopub.execute_input":"2022-06-15T18:34:33.854642Z","iopub.status.idle":"2022-06-15T18:34:35.921975Z","shell.execute_reply.started":"2022-06-15T18:34:33.854611Z","shell.execute_reply":"2022-06-15T18:34:35.921002Z"},"trusted":true},"execution_count":25,"outputs":[{"output_type":"display_data","data":{"text/plain":"  0%|          | 0/6 [00:00<?, ?ba/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"606e96c3c19d46a692219f3b98633f19"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"  0%|          | 0/1 [00:00<?, ?ba/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"6d58f55003294a2cb9635af28ff55e52"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"  0%|          | 0/1 [00:00<?, ?ba/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"39ecb4de3071407cb12f2fa3e5ccaf69"}},"metadata":{}}]},{"cell_type":"code","source":"model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, num_labels=len(label_list))","metadata":{"execution":{"iopub.status.busy":"2022-06-15T18:35:08.483274Z","iopub.execute_input":"2022-06-15T18:35:08.483649Z","iopub.status.idle":"2022-06-15T18:35:31.192668Z","shell.execute_reply.started":"2022-06-15T18:35:08.483618Z","shell.execute_reply":"2022-06-15T18:35:31.191791Z"},"trusted":true},"execution_count":26,"outputs":[{"output_type":"display_data","data":{"text/plain":"Downloading:   0%|          | 0.00/422M [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"e02f3139296448e090ed135c2336b710"}},"metadata":{}},{"name":"stderr","text":"Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']\n- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\nSome weights of BertForTokenClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']\nYou should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n","output_type":"stream"}]},{"cell_type":"code","source":"args = TrainingArguments(\"SCIBERT-VOCAB-UNCASED-finetuned-NER\",\n    evaluation_strategy = \"epoch\",\n    learning_rate=2e-5,\n    per_device_train_batch_size=32,\n    per_device_eval_batch_size=64,\n    num_train_epochs=4,\n    weight_decay=0.01\n)","metadata":{"execution":{"iopub.status.busy":"2022-06-15T18:36:38.607366Z","iopub.execute_input":"2022-06-15T18:36:38.608187Z","iopub.status.idle":"2022-06-15T18:36:38.684261Z","shell.execute_reply.started":"2022-06-15T18:36:38.608139Z","shell.execute_reply":"2022-06-15T18:36:38.683490Z"},"trusted":true},"execution_count":28,"outputs":[]},{"cell_type":"code","source":"from transformers import DataCollatorForTokenClassification\n\ndata_collator = DataCollatorForTokenClassification(tokenizer)","metadata":{"execution":{"iopub.status.busy":"2022-06-15T18:36:45.215134Z","iopub.execute_input":"2022-06-15T18:36:45.215485Z","iopub.status.idle":"2022-06-15T18:36:45.220362Z","shell.execute_reply.started":"2022-06-15T18:36:45.215455Z","shell.execute_reply":"2022-06-15T18:36:45.219246Z"},"trusted":true},"execution_count":29,"outputs":[]},{"cell_type":"code","source":"metric = load_metric(\"seqeval\")","metadata":{"execution":{"iopub.status.busy":"2022-06-15T18:37:35.465517Z","iopub.execute_input":"2022-06-15T18:37:35.465947Z","iopub.status.idle":"2022-06-15T18:37:36.273844Z","shell.execute_reply.started":"2022-06-15T18:37:35.465907Z","shell.execute_reply":"2022-06-15T18:37:36.273022Z"},"trusted":true},"execution_count":33,"outputs":[{"output_type":"display_data","data":{"text/plain":"Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"8f2f825f1f5749018577af661e9f09dd"}},"metadata":{}}]},{"cell_type":"code","source":"import numpy as np\n\ndef compute_metrics(p):\n    predictions, labels = p\n    predictions = np.argmax(predictions, axis=2)\n\n    # Remove ignored index (special tokens)\n    true_predictions = [\n        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]\n        for prediction, label in zip(predictions, labels)\n    ]\n    true_labels = [\n        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]\n        for prediction, label in zip(predictions, labels)\n    ]\n\n    results = metric.compute(predictions=true_predictions, references=true_labels)\n    return {\n        \"precision\": results[\"overall_precision\"],\n        \"recall\": results[\"overall_recall\"],\n        \"f1\": results[\"overall_f1\"],\n        \"accuracy\": results[\"overall_accuracy\"],\n    }","metadata":{"execution":{"iopub.status.busy":"2022-06-15T18:38:18.128642Z","iopub.execute_input":"2022-06-15T18:38:18.129256Z","iopub.status.idle":"2022-06-15T18:38:18.136559Z","shell.execute_reply.started":"2022-06-15T18:38:18.129226Z","shell.execute_reply":"2022-06-15T18:38:18.135527Z"},"trusted":true},"execution_count":35,"outputs":[]},{"cell_type":"code","source":"trainer = Trainer(\n    model,\n    args,\n    train_dataset=tokenized_datasets[\"train\"],\n    eval_dataset=tokenized_datasets[\"validation\"],\n    data_collator=data_collator,\n    tokenizer=tokenizer,\n    compute_metrics=compute_metrics\n)","metadata":{"execution":{"iopub.status.busy":"2022-06-15T18:38:25.820598Z","iopub.execute_input":"2022-06-15T18:38:25.821030Z","iopub.status.idle":"2022-06-15T18:38:31.564342Z","shell.execute_reply.started":"2022-06-15T18:38:25.820993Z","shell.execute_reply":"2022-06-15T18:38:31.562833Z"},"trusted":true},"execution_count":36,"outputs":[{"name":"stdout","text":"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\nTo disable this warning, you can either:\n\t- Avoid using `tokenizers` before the fork if possible\n\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\nhuggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\nTo disable this warning, you can either:\n\t- Avoid using `tokenizers` before the fork if possible\n\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\nhuggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\nTo disable this warning, you can either:\n\t- Avoid using `tokenizers` before the fork if possible\n\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n","output_type":"stream"}]},{"cell_type":"code","source":"trainer.train()","metadata":{"execution":{"iopub.status.busy":"2022-06-15T18:38:37.332423Z","iopub.execute_input":"2022-06-15T18:38:37.332828Z","iopub.status.idle":"2022-06-15T18:41:58.275924Z","shell.execute_reply.started":"2022-06-15T18:38:37.332792Z","shell.execute_reply":"2022-06-15T18:41:58.274986Z"},"trusted":true},"execution_count":37,"outputs":[{"name":"stderr","text":"The following columns in the training set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: id, ner_tags, tokens. If id, ner_tags, tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.\n/opt/conda/lib/python3.7/site-packages/transformers/optimization.py:309: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n  FutureWarning,\n***** Running training *****\n  Num examples = 5433\n  Num Epochs = 4\n  Instantaneous batch size per device = 32\n  Total train batch size (w. parallel, distributed & accumulation) = 32\n  Gradient Accumulation steps = 1\n  Total optimization steps = 680\nAutomatic Weights & Biases logging enabled, to disable set os.environ[\"WANDB_DISABLED\"] = \"true\"\n\u001b[34m\u001b[1mwandb\u001b[0m: You can find your API key in your browser here: https://wandb.ai/authorize\n","output_type":"stream"},{"output_type":"stream","name":"stdin","text":"\u001b[34m\u001b[1mwandb\u001b[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:  ········································\n"},{"name":"stderr","text":"\u001b[34m\u001b[1mwandb\u001b[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc\n","output_type":"stream"},{"name":"stdout","text":"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\nTo disable this warning, you can either:\n\t- Avoid using `tokenizers` before the fork if possible\n\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\nhuggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\nTo disable this warning, you can either:\n\t- Avoid using `tokenizers` before the fork if possible\n\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"<IPython.core.display.HTML object>","text/html":"wandb version 0.12.18 is available!  To upgrade, please run:\n $ pip install wandb --upgrade"},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"<IPython.core.display.HTML object>","text/html":"Tracking run with wandb version 0.12.16"},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"<IPython.core.display.HTML object>","text/html":"Run data is saved locally in <code>/kaggle/working/wandb/run-20220615_183851-2m8bzqan</code>"},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"<IPython.core.display.HTML object>","text/html":"Syncing run <strong><a href=\"https://wandb.ai/iammanavk/huggingface/runs/2m8bzqan\" target=\"_blank\">SCIBERT-VOCAB-UNCASED-finetuned-NER</a></strong> to <a href=\"https://wandb.ai/iammanavk/huggingface\" target=\"_blank\">Weights & Biases</a> (<a href=\"https://wandb.me/run\" target=\"_blank\">docs</a>)<br/>"},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"<IPython.core.display.HTML object>","text/html":"\n    <div>\n      \n      <progress value='680' max='680' style='width:300px; height:20px; vertical-align: middle;'></progress>\n      [680/680 03:00, Epoch 4/4]\n    </div>\n    <table border=\"1\" class=\"dataframe\">\n  <thead>\n <tr style=\"text-align: left;\">\n      <th>Epoch</th>\n      <th>Training Loss</th>\n      <th>Validation Loss</th>\n      <th>Precision</th>\n      <th>Recall</th>\n      <th>F1</th>\n      <th>Accuracy</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <td>1</td>\n      <td>No log</td>\n      <td>0.053764</td>\n      <td>0.784705</td>\n      <td>0.843610</td>\n      <td>0.813092</td>\n      <td>0.981666</td>\n    </tr>\n    <tr>\n      <td>2</td>\n      <td>No log</td>\n      <td>0.058505</td>\n      <td>0.820779</td>\n      <td>0.847185</td>\n      <td>0.833773</td>\n      <td>0.983319</td>\n    </tr>\n    <tr>\n      <td>3</td>\n      <td>0.056900</td>\n      <td>0.061393</td>\n      <td>0.826353</td>\n      <td>0.846291</td>\n      <td>0.836203</td>\n      <td>0.983394</td>\n    </tr>\n    <tr>\n      <td>4</td>\n      <td>0.056900</td>\n      <td>0.064349</td>\n      <td>0.828274</td>\n      <td>0.853441</td>\n      <td>0.840669</td>\n      <td>0.983807</td>\n    </tr>\n  </tbody>\n</table><p>"},"metadata":{}},{"name":"stderr","text":"The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: id, ner_tags, tokens. If id, ner_tags, tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.\n***** Running Evaluation *****\n  Num examples = 924\n  Batch size = 64\nThe following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: id, ner_tags, tokens. If id, ner_tags, tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.\n***** Running Evaluation *****\n  Num examples = 924\n  Batch size = 64\nSaving model checkpoint to SCIBERT-VOCAB-UNCASED-finetuned-NER/checkpoint-500\nConfiguration saved in SCIBERT-VOCAB-UNCASED-finetuned-NER/checkpoint-500/config.json\nModel weights saved in SCIBERT-VOCAB-UNCASED-finetuned-NER/checkpoint-500/pytorch_model.bin\ntokenizer config file saved in SCIBERT-VOCAB-UNCASED-finetuned-NER/checkpoint-500/tokenizer_config.json\nSpecial tokens file saved in SCIBERT-VOCAB-UNCASED-finetuned-NER/checkpoint-500/special_tokens_map.json\nThe following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: id, ner_tags, tokens. If id, ner_tags, tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.\n***** Running Evaluation *****\n  Num examples = 924\n  Batch size = 64\nThe following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: id, ner_tags, tokens. If id, ner_tags, tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.\n***** Running Evaluation *****\n  Num examples = 924\n  Batch size = 64\n\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n\n","output_type":"stream"},{"execution_count":37,"output_type":"execute_result","data":{"text/plain":"TrainOutput(global_step=680, training_loss=0.04527099938953624, metrics={'train_runtime': 200.8927, 'train_samples_per_second': 108.177, 'train_steps_per_second': 3.385, 'total_flos': 774297676783980.0, 'train_loss': 0.04527099938953624, 'epoch': 4.0})"},"metadata":{}}]},{"cell_type":"code","source":"trainer.evaluate()","metadata":{"execution":{"iopub.status.busy":"2022-06-15T18:41:58.277971Z","iopub.execute_input":"2022-06-15T18:41:58.278504Z","iopub.status.idle":"2022-06-15T18:42:01.400906Z","shell.execute_reply.started":"2022-06-15T18:41:58.278462Z","shell.execute_reply":"2022-06-15T18:42:01.399839Z"},"trusted":true},"execution_count":38,"outputs":[{"name":"stderr","text":"The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: id, ner_tags, tokens. If id, ner_tags, tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.\n***** Running Evaluation *****\n  Num examples = 924\n  Batch size = 64\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"<IPython.core.display.HTML object>","text/html":"\n    <div>\n      \n      <progress value='45' max='15' style='width:300px; height:20px; vertical-align: middle;'></progress>\n      [15/15 00:39]\n    </div>\n    "},"metadata":{}},{"execution_count":38,"output_type":"execute_result","data":{"text/plain":"{'eval_loss': 0.06434859335422516,\n 'eval_precision': 0.8282740676496098,\n 'eval_recall': 0.8534405719392315,\n 'eval_f1': 0.840669014084507,\n 'eval_accuracy': 0.9838073411729346,\n 'eval_runtime': 3.1049,\n 'eval_samples_per_second': 297.59,\n 'eval_steps_per_second': 4.831,\n 'epoch': 4.0}"},"metadata":{}}]},{"cell_type":"code","source":"predictions, labels, _ = trainer.predict(tokenized_datasets[\"test\"])\npredictions = np.argmax(predictions, axis=2)\n\n# Remove ignored index (special tokens)\ntrue_predictions = [\n    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]\n    for prediction, label in zip(predictions, labels)\n]\ntrue_labels = [\n    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]\n    for prediction, label in zip(predictions, labels)\n]\n\nresults = metric.compute(predictions=true_predictions, references=true_labels)\nresults","metadata":{"execution":{"iopub.status.busy":"2022-06-15T18:42:35.865592Z","iopub.execute_input":"2022-06-15T18:42:35.866328Z","iopub.status.idle":"2022-06-15T18:42:40.103440Z","shell.execute_reply.started":"2022-06-15T18:42:35.866288Z","shell.execute_reply":"2022-06-15T18:42:40.102589Z"},"trusted":true},"execution_count":40,"outputs":[{"name":"stderr","text":"The following columns in the test set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: id, ner_tags, tokens. If id, ner_tags, tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.\n***** Running Prediction *****\n  Num examples = 941\n  Batch size = 64\n","output_type":"stream"},{"execution_count":40,"output_type":"execute_result","data":{"text/plain":"{'Disease': {'precision': 0.8628205128205129,\n  'recall': 0.8997326203208557,\n  'f1': 0.8808900523560209,\n  'number': 1496},\n 'overall_precision': 0.8628205128205129,\n 'overall_recall': 0.8997326203208557,\n 'overall_f1': 0.8808900523560209,\n 'overall_accuracy': 0.9820055144391235}"},"metadata":{}}]},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]}]}