{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "412e41ee-72d3-4e71-bd3a-703b37429c57",
   "metadata": {},
   "source": [
    "# PyTorch AO (torchao) with int8_weight_only"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "10e1acc3-50b8-4d40-bdf3-0133c113cc4b",
   "metadata": {},
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "a9935ae2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import argparse\n",
    "import os\n",
    "\n",
    "import torch\n",
    "from torch.optim import AdamW\n",
    "from torch.utils.data import DataLoader\n",
    "from peft import (\n",
    "    get_peft_config,\n",
    "    get_peft_model,\n",
    "    get_peft_model_state_dict,\n",
    "    set_peft_model_state_dict,\n",
    "    LoraConfig,\n",
    "    PeftType,\n",
    "    PrefixTuningConfig,\n",
    "    PromptEncoderConfig,\n",
    ")\n",
    "\n",
    "import evaluate\n",
    "from datasets import load_dataset\n",
    "from transformers import AutoModelForSequenceClassification, AutoTokenizer, TorchAoConfig, get_linear_schedule_with_warmup, set_seed\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "eafdd532-b1eb-4aac-8077-3386a84c7cdb",
   "metadata": {},
   "source": [
    "## Parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "e3b13308",
   "metadata": {},
   "outputs": [],
   "source": [
    "batch_size = 16\n",
    "model_name_or_path = \"google/gemma-2-2b\"\n",
    "task = \"mrpc\"\n",
    "device = \"cuda\"\n",
    "num_epochs = 5\n",
    "lr = 2e-5\n",
    "\n",
    "lora_rank = 16\n",
    "lora_alpha = 32\n",
    "lora_dropout = 0.1"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c7fb69bf-0182-4111-b715-e2e659b42b1d",
   "metadata": {},
   "source": [
    "## Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "d2f4d25e-30b9-431f-95c3-adb390dc6fcd",
   "metadata": {},
   "outputs": [],
   "source": [
    "if any(k in model_name_or_path for k in (\"gpt\", \"opt\", \"bloom\")):\n",
    "    padding_side = \"left\"\n",
    "else:\n",
    "    padding_side = \"right\"\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side=padding_side)\n",
    "if getattr(tokenizer, \"pad_token_id\") is None:\n",
    "    tokenizer.pad_token_id = tokenizer.eos_token_id\n",
    "\n",
    "datasets = load_dataset(\"glue\", task)\n",
    "metric = evaluate.load(\"glue\", task)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "1ea852bc-a040-4244-8fd3-516307cecd14",
   "metadata": {},
   "outputs": [],
   "source": [
    "def tokenize_function(examples):\n",
    "    # max_length=None => use the model max length (it's actually the default)\n",
    "    outputs = tokenizer(examples[\"sentence1\"], examples[\"sentence2\"], truncation=True, max_length=None)\n",
    "    return outputs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "cf5ef289-f42f-4582-bd5e-9852ad8beff2",
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenized_datasets = datasets.map(\n",
    "    tokenize_function,\n",
    "    batched=True,\n",
    "    remove_columns=[\"idx\", \"sentence1\", \"sentence2\"],\n",
    ")\n",
    "\n",
    "# We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the\n",
    "# transformers library\n",
    "tokenized_datasets = tokenized_datasets.rename_column(\"label\", \"labels\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "739b3655-9db0-48bc-8542-308c6d5e0b8b",
   "metadata": {},
   "outputs": [],
   "source": [
    "def collate_fn(examples):\n",
    "    return tokenizer.pad(examples, padding=\"longest\", return_tensors=\"pt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "0288f311-8475-4a0e-99af-e4b909d10e01",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Instantiate dataloaders.\n",
    "train_dataloader = DataLoader(\n",
    "    tokenized_datasets[\"train\"],\n",
    "    shuffle=True,\n",
    "    collate_fn=collate_fn,\n",
    "    batch_size=batch_size,\n",
    ")\n",
    "eval_dataloader = DataLoader(\n",
    "    tokenized_datasets[\"validation\"],\n",
    "    shuffle=False,\n",
    "    collate_fn=collate_fn,\n",
    "    batch_size=batch_size,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fcaf6f9e-c9d1-445a-9f08-18ef462f67ce",
   "metadata": {},
   "source": [
    "## Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "e5dfff56-ea80-4561-aeaf-43216bbb9af7",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "512d9dc10a4d4ecc88b9440575b0973a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b and are newly initialized: ['score.weight']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    }
   ],
   "source": [
    "quant_config = TorchAoConfig(quant_type=\"int8_weight_only\")\n",
    "model = AutoModelForSequenceClassification.from_pretrained(\n",
    "    model_name_or_path, return_dict=True, device_map=0, torch_dtype=torch.bfloat16, quantization_config=quant_config\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "0526f571",
   "metadata": {},
   "outputs": [],
   "source": [
    "peft_config = LoraConfig(\n",
    "    task_type=\"SEQ_CLS\",\n",
    "    r=lora_rank,\n",
    "    lora_alpha=lora_alpha,\n",
    "    lora_dropout=lora_dropout,\n",
    "    target_modules=[\"q_proj\", \"v_proj\"],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "ceeae329-e931-4d52-8a28-9c87e5cdb4cf",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "trainable params: 3,199,488 || all params: 2,617,545,984 || trainable%: 0.1222\n"
     ]
    }
   ],
   "source": [
    "model = get_peft_model(model, peft_config)\n",
    "model.print_trainable_parameters()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1b3d2544-3028-4e2a-9c56-d4d7d9d674de",
   "metadata": {},
   "source": [
    "## Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "0d2d0381",
   "metadata": {},
   "outputs": [],
   "source": [
    "optimizer = AdamW(params=model.parameters(), lr=lr)\n",
    "\n",
    "# Instantiate scheduler\n",
    "lr_scheduler = get_linear_schedule_with_warmup(\n",
    "    optimizer=optimizer,\n",
    "    num_warmup_steps=0.06 * (len(train_dataloader) * num_epochs),\n",
    "    num_training_steps=(len(train_dataloader) * num_epochs),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "f04c88ca-84eb-4184-afe6-3869b6f96b76",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "PeftModelForSequenceClassification(\n",
       "  (base_model): LoraModel(\n",
       "    (model): Gemma2ForSequenceClassification(\n",
       "      (model): Gemma2Model(\n",
       "        (embed_tokens): Embedding(256000, 2304, padding_idx=0)\n",
       "        (layers): ModuleList(\n",
       "          (0-25): 26 x Gemma2DecoderLayer(\n",
       "            (self_attn): Gemma2Attention(\n",
       "              (q_proj): lora.TorchaoLoraLinear(\n",
       "                (base_layer): Linear(in_features=2304, out_features=2048, weight=AffineQuantizedTensor(shape=torch.Size([2048, 2304]), block_size=(1, 2304), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None))\n",
       "                (lora_dropout): ModuleDict(\n",
       "                  (default): Dropout(p=0.1, inplace=False)\n",
       "                )\n",
       "                (lora_A): ModuleDict(\n",
       "                  (default): Linear(in_features=2304, out_features=16, bias=False)\n",
       "                )\n",
       "                (lora_B): ModuleDict(\n",
       "                  (default): Linear(in_features=16, out_features=2048, bias=False)\n",
       "                )\n",
       "                (lora_embedding_A): ParameterDict()\n",
       "                (lora_embedding_B): ParameterDict()\n",
       "                (lora_magnitude_vector): ModuleDict()\n",
       "              )\n",
       "              (k_proj): Linear(in_features=2304, out_features=1024, weight=AffineQuantizedTensor(shape=torch.Size([1024, 2304]), block_size=(1, 2304), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None))\n",
       "              (v_proj): lora.TorchaoLoraLinear(\n",
       "                (base_layer): Linear(in_features=2304, out_features=1024, weight=AffineQuantizedTensor(shape=torch.Size([1024, 2304]), block_size=(1, 2304), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None))\n",
       "                (lora_dropout): ModuleDict(\n",
       "                  (default): Dropout(p=0.1, inplace=False)\n",
       "                )\n",
       "                (lora_A): ModuleDict(\n",
       "                  (default): Linear(in_features=2304, out_features=16, bias=False)\n",
       "                )\n",
       "                (lora_B): ModuleDict(\n",
       "                  (default): Linear(in_features=16, out_features=1024, bias=False)\n",
       "                )\n",
       "                (lora_embedding_A): ParameterDict()\n",
       "                (lora_embedding_B): ParameterDict()\n",
       "                (lora_magnitude_vector): ModuleDict()\n",
       "              )\n",
       "              (o_proj): Linear(in_features=2048, out_features=2304, weight=AffineQuantizedTensor(shape=torch.Size([2304, 2048]), block_size=(1, 2048), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None))\n",
       "              (rotary_emb): Gemma2RotaryEmbedding()\n",
       "            )\n",
       "            (mlp): Gemma2MLP(\n",
       "              (gate_proj): Linear(in_features=2304, out_features=9216, weight=AffineQuantizedTensor(shape=torch.Size([9216, 2304]), block_size=(1, 2304), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None))\n",
       "              (up_proj): Linear(in_features=2304, out_features=9216, weight=AffineQuantizedTensor(shape=torch.Size([9216, 2304]), block_size=(1, 2304), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None))\n",
       "              (down_proj): Linear(in_features=9216, out_features=2304, weight=AffineQuantizedTensor(shape=torch.Size([2304, 9216]), block_size=(1, 9216), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None))\n",
       "              (act_fn): PytorchGELUTanh()\n",
       "            )\n",
       "            (input_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
       "            (post_attention_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
       "            (pre_feedforward_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
       "            (post_feedforward_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
       "          )\n",
       "        )\n",
       "        (norm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
       "      )\n",
       "      (score): ModulesToSaveWrapper(\n",
       "        (original_module): Linear(in_features=2304, out_features=2, bias=False)\n",
       "        (modules_to_save): ModuleDict(\n",
       "          (default): Linear(in_features=2304, out_features=2, bias=False)\n",
       "        )\n",
       "      )\n",
       "    )\n",
       "  )\n",
       ")"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.config.use_cache = False\n",
    "model.to(device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "fa0e73be",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|                                                                                                                                                                                                                                                       | 0/230 [00:00<?, ?it/s]You're using a GemmaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 230/230 [00:31<00:00,  7.19it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:01<00:00, 16.19it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch 1 | train loss 1.0672 | {'accuracy': 0.6715686274509803, 'f1': 0.7751677852348994}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 230/230 [00:31<00:00,  7.26it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:01<00:00, 16.19it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch 2 | train loss 0.6261 | {'accuracy': 0.7377450980392157, 'f1': 0.8201680672268907}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 230/230 [00:31<00:00,  7.25it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:01<00:00, 16.15it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch 3 | train loss 0.4743 | {'accuracy': 0.7867647058823529, 'f1': 0.8502581755593803}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 230/230 [00:31<00:00,  7.30it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:01<00:00, 16.17it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch 4 | train loss 0.4006 | {'accuracy': 0.803921568627451, 'f1': 0.8586572438162544}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 230/230 [00:31<00:00,  7.26it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:01<00:00, 16.10it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch 5 | train loss 0.3585 | {'accuracy': 0.8235294117647058, 'f1': 0.8791946308724832}\n",
      "CPU times: user 2min 8s, sys: 38 s, total: 2min 46s\n",
      "Wall time: 2min 46s\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "for epoch in range(1, num_epochs + 1):\n",
    "    model.train()\n",
    "    train_losses = []\n",
    "    for step, batch in enumerate(tqdm(train_dataloader)):\n",
    "        batch.to(device)\n",
    "        outputs = model(**batch)\n",
    "        loss = outputs.loss\n",
    "        if not torch.isfinite(loss):\n",
    "            raise ValueError(\"non-finite loss encountered\")\n",
    "\n",
    "        loss.backward()\n",
    "        optimizer.step()\n",
    "        lr_scheduler.step()\n",
    "        optimizer.zero_grad()\n",
    "        train_losses.append(loss.item())\n",
    "\n",
    "    model.eval()\n",
    "    for step, batch in enumerate(tqdm(eval_dataloader)):\n",
    "        batch.to(device)\n",
    "        with torch.no_grad():\n",
    "            outputs = model(**batch)\n",
    "        predictions = outputs.logits.argmax(dim=-1)\n",
    "        predictions, references = predictions, batch[\"labels\"]\n",
    "        metric.add_batch(\n",
    "            predictions=predictions,\n",
    "            references=references,\n",
    "        )\n",
    "\n",
    "    eval_metric = metric.compute()\n",
    "    train_loss = sum(train_losses) / len(train_losses)\n",
    "    print(f\"epoch {epoch} | train loss {train_loss:.4f} |\", eval_metric)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "6a1f937b-a0a5-40ec-8e41-5a5a18c6bff6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# memory: 18098MiB"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  },
  "vscode": {
   "interpreter": {
    "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
