{
 "cells": [
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "import os\n",
    "\n",
    "import torch\n",
    "from transformers import (\n",
    "    AutoTokenizer,\n",
    "    default_data_collator,\n",
    "    AutoModelForSeq2SeqLM,\n",
    "    Seq2SeqTrainingArguments,\n",
    "    Seq2SeqTrainer,\n",
    "    GenerationConfig,\n",
    ")\n",
    "from peft import get_peft_model, PromptTuningInit, PromptTuningConfig, TaskType\n",
    "from datasets import load_dataset\n",
    "\n",
    "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n",
    "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
    "\n",
    "device = \"cuda\"\n",
    "model_name_or_path = \"t5-large\"\n",
    "tokenizer_name_or_path = \"t5-large\"\n",
    "\n",
    "checkpoint_name = \"financial_sentiment_analysis_prefix_tuning_v1.pt\"\n",
    "text_column = \"sentence\"\n",
    "label_column = \"text_label\"\n",
    "max_length = 8\n",
    "lr = 1e0\n",
    "num_epochs = 5\n",
    "batch_size = 8"
   ],
   "id": "99ff309b4e897cd6"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "# creating model\n",
    "peft_config = peft_config = PromptTuningConfig(\n",
    "    task_type=TaskType.SEQ_2_SEQ_LM,\n",
    "    prompt_tuning_init=PromptTuningInit.TEXT,\n",
    "    num_virtual_tokens=20,\n",
    "    prompt_tuning_init_text=\"What is the sentiment of this article?\\n\",\n",
    "    inference_mode=False,\n",
    "    tokenizer_name_or_path=model_name_or_path,\n",
    ")\n",
    "\n",
    "model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)\n",
    "model = get_peft_model(model, peft_config)\n",
    "model.print_trainable_parameters()\n",
    "model"
   ],
   "id": "3204a0e11ac3a271"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "# loading dataset\n",
    "dataset = load_dataset(\"financial_phrasebank\", \"sentences_allagree\")\n",
    "dataset = dataset[\"train\"].train_test_split(test_size=0.1)\n",
    "dataset[\"validation\"] = dataset[\"test\"]\n",
    "del dataset[\"test\"]\n",
    "\n",
    "classes = dataset[\"train\"].features[\"label\"].names\n",
    "dataset = dataset.map(\n",
    "    lambda x: {\"text_label\": [classes[label] for label in x[\"label\"]]},\n",
    "    batched=True,\n",
    "    num_proc=1,\n",
    ")\n",
    "\n",
    "dataset[\"train\"][0]"
   ],
   "id": "fe9a2fb06b0faba7"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "# data preprocessing\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)\n",
    "\n",
    "\n",
    "def preprocess_function(examples):\n",
    "    inputs = examples[text_column]\n",
    "    targets = examples[label_column]\n",
    "    model_inputs = tokenizer(inputs, max_length=max_length, padding=\"max_length\", truncation=True, return_tensors=\"pt\")\n",
    "    labels = tokenizer(targets, max_length=2, padding=\"max_length\", truncation=True, return_tensors=\"pt\")\n",
    "    labels = labels[\"input_ids\"]\n",
    "    labels[labels == tokenizer.pad_token_id] = -100\n",
    "    model_inputs[\"labels\"] = labels\n",
    "    return model_inputs\n",
    "\n",
    "\n",
    "processed_datasets = dataset.map(\n",
    "    preprocess_function,\n",
    "    batched=True,\n",
    "    num_proc=1,\n",
    "    remove_columns=dataset[\"train\"].column_names,\n",
    "    load_from_cache_file=False,\n",
    "    desc=\"Running tokenizer on dataset\",\n",
    ")\n",
    "\n",
    "train_dataset = processed_datasets[\"train\"].shuffle()\n",
    "eval_dataset = processed_datasets[\"validation\"]"
   ],
   "id": "b32bd1131cc673b0"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "# training and evaluation\n",
    "\n",
    "\n",
    "def compute_metrics(eval_preds):\n",
    "    preds, labels = eval_preds\n",
    "    preds = tokenizer.batch_decode(preds, skip_special_tokens=True)\n",
    "    labels = tokenizer.batch_decode(labels, skip_special_tokens=True)\n",
    "\n",
    "    correct = 0\n",
    "    total = 0\n",
    "    for pred, true in zip(preds, labels):\n",
    "        if pred.strip() == true.strip():\n",
    "            correct += 1\n",
    "        total += 1\n",
    "    accuracy = correct / total\n",
    "    return {\"accuracy\": accuracy}\n",
    "\n",
    "\n",
    "training_args = Seq2SeqTrainingArguments(\n",
    "    \"out\",\n",
    "    per_device_train_batch_size=batch_size,\n",
    "    learning_rate=lr,\n",
    "    num_train_epochs=num_epochs,\n",
    "    eval_strategy=\"epoch\",\n",
    "    logging_strategy=\"epoch\",\n",
    "    save_strategy=\"no\",\n",
    "    report_to=[],\n",
    "    predict_with_generate=True,\n",
    "    generation_config=GenerationConfig(max_length=max_length),\n",
    ")\n",
    "trainer = Seq2SeqTrainer(\n",
    "    model=model,\n",
    "    tokenizer=tokenizer,\n",
    "    args=training_args,\n",
    "    train_dataset=train_dataset,\n",
    "    eval_dataset=eval_dataset,\n",
    "    data_collator=default_data_collator,\n",
    "    compute_metrics=compute_metrics,\n",
    ")\n",
    "trainer.train()"
   ],
   "id": "1137b6f44600b5c2"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "# saving model\n",
    "peft_model_id = f\"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\"\n",
    "model.save_pretrained(peft_model_id)"
   ],
   "id": "15ba2574e53d43df"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "ckpt = f\"{peft_model_id}/adapter_model.bin\"\n",
    "!du -h $ckpt"
   ],
   "id": "fb4cdb2fd2a776b2"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "from peft import PeftModel, PeftConfig\n",
    "\n",
    "peft_model_id = f\"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\"\n",
    "\n",
    "config = PeftConfig.from_pretrained(peft_model_id)\n",
    "model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path)\n",
    "model = PeftModel.from_pretrained(model, peft_model_id)"
   ],
   "id": "ae009f6f62635242"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "model.eval()\n",
    "i = 107\n",
    "inputs = tokenizer(dataset[\"validation\"][text_column][i], return_tensors=\"pt\")\n",
    "print(dataset[\"validation\"][text_column][i])\n",
    "print(inputs)\n",
    "\n",
    "with torch.no_grad():\n",
    "    outputs = model.generate(input_ids=inputs[\"input_ids\"], max_new_tokens=10)\n",
    "    print(outputs)\n",
    "    print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))"
   ],
   "id": "78a2a1cc8d46237f"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "",
   "id": "62d39b5e09b619fa"
  }
 ],
 "metadata": {},
 "nbformat": 5,
 "nbformat_minor": 9
}
