{
  "cells": [
    {
      "cell_type": "code",
      "source": [
        "!pip install transformers accelerate -U"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "1ymE0R3EuIZd",
        "outputId": "014eff3e-f4f2-43bd-9688-19db92ad8d2c"
      },
      "id": "1ymE0R3EuIZd",
      "execution_count": 2,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.32.1)\n",
            "Requirement already satisfied: accelerate in /usr/local/lib/python3.10/dist-packages (0.22.0)\n",
            "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.12.2)\n",
            "Requirement already satisfied: huggingface-hub<1.0,>=0.15.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.16.4)\n",
            "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.23.5)\n",
            "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.1)\n",
            "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.1)\n",
            "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2023.6.3)\n",
            "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.31.0)\n",
            "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.13.3)\n",
            "Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.3.3)\n",
            "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.66.1)\n",
            "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate) (5.9.5)\n",
            "Requirement already satisfied: torch>=1.10.0 in /usr/local/lib/python3.10/dist-packages (from accelerate) (2.0.1+cu118)\n",
            "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.15.1->transformers) (2023.6.0)\n",
            "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.15.1->transformers) (4.7.1)\n",
            "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (1.12)\n",
            "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (3.1)\n",
            "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (3.1.2)\n",
            "Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (2.0.0)\n",
            "Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.10.0->accelerate) (3.27.2)\n",
            "Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.10.0->accelerate) (16.0.6)\n",
            "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.2.0)\n",
            "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4)\n",
            "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.4)\n",
            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2023.7.22)\n",
            "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.10.0->accelerate) (2.1.3)\n",
            "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.10.0->accelerate) (1.3.0)\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import reservoirtransformers\n",
        "import math\n",
        "import os\n",
        "import warnings\n",
        "from dataclasses import dataclass\n",
        "from typing import List, Optional, Tuple, Union\n",
        "\n",
        "import torch\n",
        "import torch.utils.checkpoint\n",
        "from torch import nn\n",
        "from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss\n",
        "import torch.nn.functional as F"
      ],
      "metadata": {
        "id": "mhmjveElujN2"
      },
      "id": "mhmjveElujN2",
      "execution_count": 1,
      "outputs": []
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "id": "7eb58cec",
      "metadata": {
        "id": "7eb58cec"
      },
      "outputs": [],
      "source": [
        "from configuration import ReservoirTConfig\n",
        "\n",
        "configuration = ReservoirTConfig()\n",
        "\n",
        "configuration.output_size=7\n",
        "configuration.re_output_size=20\n",
        "configuration.max_sequence_length=1399\n",
        "configuration.sequence_length=384\n",
        "configuration.pred_len=336\n",
        "configuration.hidden_size=8\n",
        "configuration.num_attention_heads=4\n",
        "configuration.hidden_dropout_prob=0.0\n",
        "configuration.num_hidden_layers=4\n",
        "configuration.num_reservoirs = 10\n",
        "configuration.reservoir_size = [30, 15, 20, 25, 30, 35, 40, 45, 50, 50]\n",
        "configuration.spectral_radius = [0.6, 0.8, 0.55, 0.6, 0.5, 0.4, 0.3, 0.2, 0.81, 0.05]\n",
        "configuration.sparsity = [0.6, 0.55, 0.5, 0.45, 0.4, 0.35, 0.3, 0.25, 0.2, 0.15]\n",
        "configuration.leaky = [0.3, 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39]\n",
        "configuration.attention_probs_dropout_prob=0.0\n",
        "#bert_model = TabularBertForRegression(config=configuration).to(\"cpu\", dtype=float)\n",
        "model = reservoirtransformers.ReservoirTTimeSeries(config=configuration).to(\"cpu\", dtype=float)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 4,
      "id": "0f6bf41e",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 70
        },
        "id": "0f6bf41e",
        "outputId": "4bcca8e6-3a14-4a61-a6b6-ec87de124ab0"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "\"\\nzero_col = np.zeros((X.shape[0], 1))\\n\\n# Concatenate the original array with the zero column along the second axis (columns)\\nX = np.hstack((X, zero_col))\\n#X =  dataset.drop(['ate'], axis = 1).values\\n\\n#X_train, X_test, y_train, y_test =train_test_split(X.values, y, test_size=0.2, shuffle=False)\\n\""
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "string"
            }
          },
          "metadata": {},
          "execution_count": 4
        }
      ],
      "source": [
        "import numpy as np\n",
        "# prepare data for lstm\n",
        "from sklearn.preprocessing import StandardScaler\n",
        "from pandas import read_csv\n",
        "from pandas import DataFrame\n",
        "import random\n",
        "from sklearn.model_selection import train_test_split\n",
        "from pandas import concat\n",
        "from sklearn.preprocessing import LabelEncoder\n",
        "from sklearn.preprocessing import MinMaxScaler\n",
        "dataset= read_csv('/content/ETTm2.csv')\n",
        "dataset=dataset.dropna()\n",
        "dataset = dataset.drop(['date'], axis = 1)\n",
        "dataset = dataset.dropna()\n",
        "#data = dataset.values[0:14000]\n",
        "\n",
        "\n",
        "y = dataset.OT.values\n",
        "\n",
        "\n",
        "X = dataset.values\n",
        "\n",
        "scaler = StandardScaler()\n",
        "X = scaler.fit_transform(X)\n",
        "\n",
        "\n",
        "\n",
        "#X=X[1:]\n",
        "\n",
        "#Reservoir_id = np.array([[0] * len(X[0])] + X[:-1].tolist())\n",
        "# Create a zero column of shape (100, 1)\n",
        "'''\n",
        "zero_col = np.zeros((X.shape[0], 1))\n",
        "\n",
        "# Concatenate the original array with the zero column along the second axis (columns)\n",
        "X = np.hstack((X, zero_col))\n",
        "#X =  dataset.drop(['ate'], axis = 1).values\n",
        "\n",
        "#X_train, X_test, y_train, y_test =train_test_split(X.values, y, test_size=0.2, shuffle=False)\n",
        "'''"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 5,
      "id": "baffd6ba",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "baffd6ba",
        "outputId": "5f05db1d-37d3-4adc-d252-d0d96ab90597"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "(69680, 7)"
            ]
          },
          "metadata": {},
          "execution_count": 5
        }
      ],
      "source": [
        "X.shape"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 5,
      "id": "6d0706c5",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 138,
          "referenced_widgets": [
            "3a1b2a42cc844ef982e99d3f70abc1e5",
            "f597bf3b256a4746a664b326b56b96e9",
            "8b423b40e2af4e37897ab4180e5ff95c",
            "8c97c27b3c674299a01619b12ce16e5e",
            "df136c951ee049ea82e6fee406d18971",
            "28af17880ea04062bee002f2bfa91c97",
            "4c09b64952914682955da61e18bec753",
            "5f449c4f26d144a29aeefce9a57988a8",
            "428191df92ae40ddbe14e790be9c9bf4",
            "3b43eb0d025c4d1e8a2c3ffc5fa44625",
            "fb2880e8783746d8b29401cff8f86440"
          ]
        },
        "id": "6d0706c5",
        "outputId": "66c38a8f-2242-4789-96c5-270291f5f40d"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "creating seq\n"
          ]
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "  0%|          | 0/68961 [00:00<?, ?it/s]"
            ],
            "application/vnd.jupyter.widget-view+json": {
              "version_major": 2,
              "version_minor": 0,
              "model_id": "3a1b2a42cc844ef982e99d3f70abc1e5"
            }
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-5-1feea5ab717f>:11: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at ../torch/csrc/utils/tensor_new.cpp:245.)\n",
            "  return torch.tensor(sequences), torch.tensor(targets)\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "torch.Size([68961, 384, 7])\n"
          ]
        }
      ],
      "source": [
        "from tqdm.auto import tqdm\n",
        "# 1. Preprocess the data into the required format\n",
        "def create_sequences(data, seq_length, pred_length):\n",
        "    sequences = []\n",
        "    targets = []\n",
        "    print('creating seq')\n",
        "    for i in tqdm(range(len(data) - seq_length - pred_length + 1)):\n",
        "\n",
        "        sequences.append(data[i:i+seq_length])\n",
        "        targets.append(data[i+seq_length:i+seq_length+pred_length])\n",
        "    return torch.tensor(sequences), torch.tensor(targets)\n",
        "\n",
        "X, y = create_sequences(data=X, seq_length=configuration.sequence_length, pred_length=configuration.pred_len)\n",
        "# Zeros tensor of shape [16941, 384, 1]\n",
        "print(X.shape)\n",
        "zeros = torch.zeros((X.size(0), X.size(1), 1), dtype=X.dtype)\n",
        "\n",
        "# Concatenate along the last dimension\n",
        "X = torch.cat((X, zeros), dim=-1)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 7,
      "id": "3e13e0d0",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "3e13e0d0",
        "outputId": "8faccd22-70ec-4485-ac84-733a08027c42"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "(torch.Size([69201, 384, 10]), torch.Size([69201, 96, 7]))"
            ]
          },
          "metadata": {},
          "execution_count": 7
        }
      ],
      "source": [
        "X.shape, y.shape"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 6,
      "id": "7955f82a",
      "metadata": {
        "id": "7955f82a"
      },
      "outputs": [],
      "source": [
        "\n",
        "batch=64\n",
        "indices = np.arange(len(X))\n",
        "barrier = int(len(indices)/batch)*batch\n",
        "indices = indices[0:barrier]\n",
        "soft_border = int((configuration.sequence_length/batch))+16\n",
        "\n",
        "indices = [indices[i:i+batch] for i in range(0, len(indices), batch)]\n",
        "\n",
        "border1 = int(len(indices)*0.7)\n",
        "border2 = border1+int(len(indices)*0.1)\n",
        "border3 = border2+int(len(indices)*0.2)\n",
        "\n",
        "train_ind = indices[0:border1]\n",
        "val_ind = indices[border1-soft_border: border2]\n",
        "test_ind = indices[border2-soft_border: border3]\n",
        "\n",
        "random.shuffle(train_ind)\n",
        "random.shuffle(val_ind)\n",
        "#random.shuffle(test_ind)\n",
        "\n",
        "\n",
        "X_train = [X[item] for sublist in train_ind for item in sublist]\n",
        "y_train = [y[item] for sublist in train_ind for item in sublist]\n",
        "\n",
        "X_val = [X[item] for sublist in val_ind for item in sublist]\n",
        "y_val = [y[item] for sublist in val_ind for item in sublist]\n",
        "\n",
        "X_test = [X[item] for sublist in test_ind for item in sublist]\n",
        "y_test = [y[item] for sublist in test_ind for item in sublist]\n",
        "\n",
        "#train_indices, test_indices =train_test_split(indices,  test_size=0.2, shuffle=False)\n",
        "#indices = [item for sublist in indices for item in sublist]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 7,
      "id": "26d5bda1",
      "metadata": {
        "id": "26d5bda1"
      },
      "outputs": [],
      "source": [
        "import torch\n",
        "from torch.utils.data import Dataset, DataLoader\n",
        "\n",
        "class CustomDataset(Dataset):\n",
        "    def __init__(self, tokenized_inputs,  labels=None, pos=None):\n",
        "        self.tokenized_inputs = tokenized_inputs\n",
        "        self.labels = labels\n",
        "        self.pos = pos\n",
        "        self.id_list = None\n",
        "        self.re = None\n",
        "\n",
        "    def __len__(self):\n",
        "        return len(self.tokenized_inputs)\n",
        "\n",
        "    def __getitem__(self, idx):\n",
        "        if self.labels is not None:\n",
        "            return {\n",
        "                \"inputs_embeds\": torch.tensor(self.tokenized_inputs[idx]),\n",
        "                \"labels_ids\": torch.tensor(self.labels[idx]),\n",
        "                #\"id\": torch.tensor(self.id_list[idx]),  # Include the id directly\n",
        "                #\"reservoir_ids\": torch.tensor(self.re[idx]),\n",
        "            }\n",
        "        else:\n",
        "            return {\n",
        "                \"inputs_embeds\": torch.tensor(self.tokenized_inputs[idx]),\n",
        "            }\n",
        "\n",
        "# Assuming you have X_train, y_train, X_test, y_test, trainpos, and testpos defined\n",
        "\n",
        "\n",
        "train_dataset = CustomDataset(X_train, y_train)\n",
        "\n",
        "test_dataset = CustomDataset(X_test, y_test)\n",
        "\n",
        "val_dataset = CustomDataset(X_val, y_val)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 8,
      "id": "13806390",
      "metadata": {
        "id": "13806390"
      },
      "outputs": [],
      "source": [
        "from transformers import Trainer, TrainingArguments\n",
        "from torch.utils.data import DataLoader\n",
        "from torch.cuda.amp import GradScaler\n",
        "class CustomTrainer(Trainer):\n",
        "    def __init__(self, *args, gradient_accumulation_steps=1, **kwargs):\n",
        "        super().__init__(*args, **kwargs)\n",
        "        self.gradient_accumulation_steps = gradient_accumulation_steps\n",
        "        self.scaler = GradScaler()\n",
        "\n",
        "    def training_step(self, model, inputs):\n",
        "        model.train()\n",
        "        inputs = self._prepare_inputs(inputs)\n",
        "        loss = self.compute_loss(model, inputs)\n",
        "\n",
        "        if self.args.n_gpu > 1:\n",
        "            loss = loss.mean()  # mean() to average on multi-gpu parallel training\n",
        "\n",
        "        loss = loss / self.gradient_accumulation_steps\n",
        "        self.scaler.scale(loss).backward()\n",
        "\n",
        "        return loss.detach()\n",
        "\n",
        "\n",
        "    def get_train_dataloader(self) -> DataLoader:\n",
        "        \"\"\"\n",
        "        Returns the training [`~torch.utils.data.DataLoader`].\n",
        "        Will use no sampler if `train_dataset` does not implement `__len__`, a random sampler (adapted to distributed\n",
        "        training if necessary) otherwise.\n",
        "        Subclass and override this method if you want to inject some custom behavior.\n",
        "        \"\"\"\n",
        "        if self.train_dataset is None:\n",
        "            raise ValueError(\"Trainer: training requires a train_dataset.\")\n",
        "\n",
        "        train_dataset = self.train_dataset\n",
        "\n",
        "\n",
        "        loader =  DataLoader(\n",
        "            train_dataset,\n",
        "            batch_size=self._train_batch_size,\n",
        "            drop_last=self.args.dataloader_drop_last,\n",
        "            shuffle = False,\n",
        "        )\n",
        "        return loader\n",
        "\n",
        "#bert_model = TabularBertForSequenceClassification(config=configuration).to(\"cpu\", dtype=float)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 9,
      "id": "cb4d10de",
      "metadata": {
        "scrolled": true,
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 1000
        },
        "id": "cb4d10de",
        "outputId": "174688c5-5fbb-4cc3-cc85-f8d42ce2f4f1"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-7-5df45e9992a5>:18: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"inputs_embeds\": torch.tensor(self.tokenized_inputs[idx]),\n",
            "<ipython-input-7-5df45e9992a5>:19: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"labels_ids\": torch.tensor(self.labels[idx]),\n",
            "Could not estimate the number of tokens of the input, floating-point operations will not be computed\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "{'loss': 0.6782, 'learning_rate': 0.0009995573262505534, 'epoch': 0.07}\n",
            "{'eval_loss': 0.8157996192982309, 'eval_r2_score': -0.1818431358076915, 'eval_mse': 0.8157996192982303, 'eval_runtime': 46.6468, 'eval_samples_per_second': 176.99, 'eval_steps_per_second': 2.765, 'epoch': 0.07}\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-7-5df45e9992a5>:18: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"inputs_embeds\": torch.tensor(self.tokenized_inputs[idx]),\n",
            "<ipython-input-7-5df45e9992a5>:19: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"labels_ids\": torch.tensor(self.labels[idx]),\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "{'loss': 1.0382, 'learning_rate': 0.0009991146525011067, 'epoch': 0.13}\n",
            "{'eval_loss': 0.6301944444442849, 'eval_r2_score': 0.08704176764472515, 'eval_mse': 0.630194444444284, 'eval_runtime': 46.4809, 'eval_samples_per_second': 177.621, 'eval_steps_per_second': 2.775, 'epoch': 0.13}\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-7-5df45e9992a5>:18: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"inputs_embeds\": torch.tensor(self.tokenized_inputs[idx]),\n",
            "<ipython-input-7-5df45e9992a5>:19: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"labels_ids\": torch.tensor(self.labels[idx]),\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "{'loss': 0.5176, 'learning_rate': 0.00099867197875166, 'epoch': 0.2}\n",
            "{'eval_loss': 0.5139392962470085, 'eval_r2_score': 0.2554597782064919, 'eval_mse': 0.5139392962470096, 'eval_runtime': 46.8861, 'eval_samples_per_second': 176.086, 'eval_steps_per_second': 2.751, 'epoch': 0.2}\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-7-5df45e9992a5>:18: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"inputs_embeds\": torch.tensor(self.tokenized_inputs[idx]),\n",
            "<ipython-input-7-5df45e9992a5>:19: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"labels_ids\": torch.tensor(self.labels[idx]),\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "{'loss': 1.1041, 'learning_rate': 0.0009982293050022134, 'epoch': 0.27}\n",
            "{'eval_loss': 0.44930733079850166, 'eval_r2_score': 0.3490916492102899, 'eval_mse': 0.4493073307985012, 'eval_runtime': 46.4371, 'eval_samples_per_second': 177.789, 'eval_steps_per_second': 2.778, 'epoch': 0.27}\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-7-5df45e9992a5>:18: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"inputs_embeds\": torch.tensor(self.tokenized_inputs[idx]),\n",
            "<ipython-input-7-5df45e9992a5>:19: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"labels_ids\": torch.tensor(self.labels[idx]),\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "{'loss': 0.3507, 'learning_rate': 0.0009977866312527667, 'epoch': 0.33}\n",
            "{'eval_loss': 0.43647193980916593, 'eval_r2_score': 0.36768618931218733, 'eval_mse': 0.43647193980916615, 'eval_runtime': 46.598, 'eval_samples_per_second': 177.175, 'eval_steps_per_second': 2.768, 'epoch': 0.33}\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-7-5df45e9992a5>:18: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"inputs_embeds\": torch.tensor(self.tokenized_inputs[idx]),\n",
            "<ipython-input-7-5df45e9992a5>:19: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"labels_ids\": torch.tensor(self.labels[idx]),\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "{'loss': 0.6587, 'learning_rate': 0.00099734395750332, 'epoch': 0.4}\n",
            "{'eval_loss': 0.4282414848658383, 'eval_r2_score': 0.37960959114916515, 'eval_mse': 0.42824148486583846, 'eval_runtime': 47.0955, 'eval_samples_per_second': 175.303, 'eval_steps_per_second': 2.739, 'epoch': 0.4}\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-7-5df45e9992a5>:18: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"inputs_embeds\": torch.tensor(self.tokenized_inputs[idx]),\n",
            "<ipython-input-7-5df45e9992a5>:19: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"labels_ids\": torch.tensor(self.labels[idx]),\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "{'loss': 0.7274, 'learning_rate': 0.0009969012837538734, 'epoch': 0.46}\n",
            "{'eval_loss': 0.3574402877234974, 'eval_r2_score': 0.48217878445379403, 'eval_mse': 0.35744028772349745, 'eval_runtime': 51.1255, 'eval_samples_per_second': 161.485, 'eval_steps_per_second': 2.523, 'epoch': 0.46}\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-7-5df45e9992a5>:18: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"inputs_embeds\": torch.tensor(self.tokenized_inputs[idx]),\n",
            "<ipython-input-7-5df45e9992a5>:19: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"labels_ids\": torch.tensor(self.labels[idx]),\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "{'loss': 0.4895, 'learning_rate': 0.0009964586100044267, 'epoch': 0.53}\n",
            "{'eval_loss': 0.38594331703266765, 'eval_r2_score': 0.4408866475835356, 'eval_mse': 0.3859433170326668, 'eval_runtime': 47.3498, 'eval_samples_per_second': 174.362, 'eval_steps_per_second': 2.724, 'epoch': 0.53}\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-7-5df45e9992a5>:18: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"inputs_embeds\": torch.tensor(self.tokenized_inputs[idx]),\n",
            "<ipython-input-7-5df45e9992a5>:19: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"labels_ids\": torch.tensor(self.labels[idx]),\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "{'loss': 0.3955, 'learning_rate': 0.00099601593625498, 'epoch': 0.6}\n",
            "{'eval_loss': 0.3077084490037781, 'eval_r2_score': 0.554224947300199, 'eval_mse': 0.30770844900377814, 'eval_runtime': 46.8529, 'eval_samples_per_second': 176.211, 'eval_steps_per_second': 2.753, 'epoch': 0.6}\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-7-5df45e9992a5>:18: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"inputs_embeds\": torch.tensor(self.tokenized_inputs[idx]),\n",
            "<ipython-input-7-5df45e9992a5>:19: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"labels_ids\": torch.tensor(self.labels[idx]),\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "{'loss': 0.3129, 'learning_rate': 0.0009955732625055334, 'epoch': 0.66}\n",
            "{'eval_loss': 0.2968212123234154, 'eval_r2_score': 0.5699972100399985, 'eval_mse': 0.2968212123234157, 'eval_runtime': 46.7499, 'eval_samples_per_second': 176.599, 'eval_steps_per_second': 2.759, 'epoch': 0.66}\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-7-5df45e9992a5>:18: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"inputs_embeds\": torch.tensor(self.tokenized_inputs[idx]),\n",
            "<ipython-input-7-5df45e9992a5>:19: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"labels_ids\": torch.tensor(self.labels[idx]),\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "{'loss': 0.5435, 'learning_rate': 0.0009951305887560867, 'epoch': 0.73}\n",
            "{'eval_loss': 0.32251617244865793, 'eval_r2_score': 0.5327731031263512, 'eval_mse': 0.32251617244865805, 'eval_runtime': 46.2815, 'eval_samples_per_second': 178.387, 'eval_steps_per_second': 2.787, 'epoch': 0.73}\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-7-5df45e9992a5>:18: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"inputs_embeds\": torch.tensor(self.tokenized_inputs[idx]),\n",
            "<ipython-input-7-5df45e9992a5>:19: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"labels_ids\": torch.tensor(self.labels[idx]),\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "{'loss': 0.3573, 'learning_rate': 0.00099468791500664, 'epoch': 0.8}\n",
            "{'eval_loss': 0.3212089399365049, 'eval_r2_score': 0.5346668816166152, 'eval_mse': 0.3212089399365051, 'eval_runtime': 46.8304, 'eval_samples_per_second': 176.296, 'eval_steps_per_second': 2.755, 'epoch': 0.8}\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-7-5df45e9992a5>:18: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"inputs_embeds\": torch.tensor(self.tokenized_inputs[idx]),\n",
            "<ipython-input-7-5df45e9992a5>:19: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"labels_ids\": torch.tensor(self.labels[idx]),\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "{'loss': 0.3892, 'learning_rate': 0.0009942452412571934, 'epoch': 0.86}\n",
            "{'eval_loss': 0.3286775333938976, 'eval_r2_score': 0.5238471831233127, 'eval_mse': 0.32867753339389794, 'eval_runtime': 46.5151, 'eval_samples_per_second': 177.491, 'eval_steps_per_second': 2.773, 'epoch': 0.86}\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-7-5df45e9992a5>:18: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"inputs_embeds\": torch.tensor(self.tokenized_inputs[idx]),\n",
            "<ipython-input-7-5df45e9992a5>:19: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"labels_ids\": torch.tensor(self.labels[idx]),\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "{'loss': 0.3843, 'learning_rate': 0.0009938025675077468, 'epoch': 0.93}\n",
            "{'eval_loss': 0.28943687838120336, 'eval_r2_score': 0.580694842369887, 'eval_mse': 0.2894368783812032, 'eval_runtime': 46.7565, 'eval_samples_per_second': 176.574, 'eval_steps_per_second': 2.759, 'epoch': 0.93}\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-7-5df45e9992a5>:18: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"inputs_embeds\": torch.tensor(self.tokenized_inputs[idx]),\n",
            "<ipython-input-7-5df45e9992a5>:19: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"labels_ids\": torch.tensor(self.labels[idx]),\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "{'loss': 0.478, 'learning_rate': 0.0009933598937583, 'epoch': 1.0}\n",
            "{'eval_loss': 0.28925133507052836, 'eval_r2_score': 0.5809636376511398, 'eval_mse': 0.2892513350705292, 'eval_runtime': 46.9808, 'eval_samples_per_second': 175.731, 'eval_steps_per_second': 2.746, 'epoch': 1.0}\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-7-5df45e9992a5>:18: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"inputs_embeds\": torch.tensor(self.tokenized_inputs[idx]),\n",
            "<ipython-input-7-5df45e9992a5>:19: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"labels_ids\": torch.tensor(self.labels[idx]),\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "{'loss': 0.2893, 'learning_rate': 0.0009929172200088536, 'epoch': 1.06}\n",
            "{'eval_loss': 0.2831267337820616, 'eval_r2_score': 0.5898363041995272, 'eval_mse': 0.2831267337820614, 'eval_runtime': 46.52, 'eval_samples_per_second': 177.472, 'eval_steps_per_second': 2.773, 'epoch': 1.06}\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-7-5df45e9992a5>:18: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"inputs_embeds\": torch.tensor(self.tokenized_inputs[idx]),\n",
            "<ipython-input-7-5df45e9992a5>:19: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"labels_ids\": torch.tensor(self.labels[idx]),\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "{'loss': 0.5025, 'learning_rate': 0.0009924745462594068, 'epoch': 1.13}\n",
            "{'eval_loss': 0.29449547686020755, 'eval_r2_score': 0.5733664865484405, 'eval_mse': 0.29449547686020716, 'eval_runtime': 46.6915, 'eval_samples_per_second': 176.82, 'eval_steps_per_second': 2.763, 'epoch': 1.13}\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-7-5df45e9992a5>:18: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"inputs_embeds\": torch.tensor(self.tokenized_inputs[idx]),\n",
            "<ipython-input-7-5df45e9992a5>:19: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"labels_ids\": torch.tensor(self.labels[idx]),\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "{'loss': 0.2366, 'learning_rate': 0.00099203187250996, 'epoch': 1.2}\n",
            "{'eval_loss': 0.2507262365934877, 'eval_r2_score': 0.6367746752078538, 'eval_mse': 0.25072623659348797, 'eval_runtime': 46.66, 'eval_samples_per_second': 176.94, 'eval_steps_per_second': 2.765, 'epoch': 1.2}\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-7-5df45e9992a5>:18: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"inputs_embeds\": torch.tensor(self.tokenized_inputs[idx]),\n",
            "<ipython-input-7-5df45e9992a5>:19: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"labels_ids\": torch.tensor(self.labels[idx]),\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "{'loss': 0.5971, 'learning_rate': 0.0009915891987605137, 'epoch': 1.26}\n",
            "{'eval_loss': 0.26010194497096095, 'eval_r2_score': 0.6231921528247437, 'eval_mse': 0.2601019449709607, 'eval_runtime': 46.3923, 'eval_samples_per_second': 177.96, 'eval_steps_per_second': 2.781, 'epoch': 1.26}\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-7-5df45e9992a5>:18: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"inputs_embeds\": torch.tensor(self.tokenized_inputs[idx]),\n",
            "<ipython-input-7-5df45e9992a5>:19: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"labels_ids\": torch.tensor(self.labels[idx]),\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "{'loss': 0.2994, 'learning_rate': 0.0009911465250110668, 'epoch': 1.33}\n",
            "{'eval_loss': 0.2995267140860969, 'eval_r2_score': 0.5660777687807703, 'eval_mse': 0.29952671408609666, 'eval_runtime': 46.9005, 'eval_samples_per_second': 176.032, 'eval_steps_per_second': 2.751, 'epoch': 1.33}\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-7-5df45e9992a5>:18: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"inputs_embeds\": torch.tensor(self.tokenized_inputs[idx]),\n",
            "<ipython-input-7-5df45e9992a5>:19: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"labels_ids\": torch.tensor(self.labels[idx]),\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "{'loss': 0.3591, 'learning_rate': 0.0009907038512616201, 'epoch': 1.39}\n",
            "{'eval_loss': 0.3112384655776482, 'eval_r2_score': 0.5491110372683403, 'eval_mse': 0.3112384655776482, 'eval_runtime': 46.587, 'eval_samples_per_second': 177.217, 'eval_steps_per_second': 2.769, 'epoch': 1.39}\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-7-5df45e9992a5>:18: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"inputs_embeds\": torch.tensor(self.tokenized_inputs[idx]),\n",
            "<ipython-input-7-5df45e9992a5>:19: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"labels_ids\": torch.tensor(self.labels[idx]),\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "{'loss': 0.454, 'learning_rate': 0.0009902611775121737, 'epoch': 1.46}\n",
            "{'eval_loss': 0.5302427655586827, 'eval_r2_score': 0.23184105757944973, 'eval_mse': 0.5302427655586828, 'eval_runtime': 46.3354, 'eval_samples_per_second': 178.179, 'eval_steps_per_second': 2.784, 'epoch': 1.46}\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-7-5df45e9992a5>:18: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"inputs_embeds\": torch.tensor(self.tokenized_inputs[idx]),\n",
            "<ipython-input-7-5df45e9992a5>:19: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"labels_ids\": torch.tensor(self.labels[idx]),\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "{'loss': 0.255, 'learning_rate': 0.0009898185037627268, 'epoch': 1.53}\n",
            "{'eval_loss': 0.2527385845851758, 'eval_r2_score': 0.6338594009118454, 'eval_mse': 0.2527385845851765, 'eval_runtime': 47.0306, 'eval_samples_per_second': 175.545, 'eval_steps_per_second': 2.743, 'epoch': 1.53}\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-7-5df45e9992a5>:18: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"inputs_embeds\": torch.tensor(self.tokenized_inputs[idx]),\n",
            "<ipython-input-7-5df45e9992a5>:19: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"labels_ids\": torch.tensor(self.labels[idx]),\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "{'loss': 0.2316, 'learning_rate': 0.0009893758300132803, 'epoch': 1.59}\n",
            "{'eval_loss': 0.22032022134805643, 'eval_r2_score': 0.6808236543382802, 'eval_mse': 0.22032022134805623, 'eval_runtime': 46.2879, 'eval_samples_per_second': 178.362, 'eval_steps_per_second': 2.787, 'epoch': 1.59}\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-7-5df45e9992a5>:18: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"inputs_embeds\": torch.tensor(self.tokenized_inputs[idx]),\n",
            "<ipython-input-7-5df45e9992a5>:19: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"labels_ids\": torch.tensor(self.labels[idx]),\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "{'loss': 0.2076, 'learning_rate': 0.0009889331562638337, 'epoch': 1.66}\n",
            "{'eval_loss': 0.24330190583364364, 'eval_r2_score': 0.6475302506444246, 'eval_mse': 0.24330190583364367, 'eval_runtime': 46.8718, 'eval_samples_per_second': 176.14, 'eval_steps_per_second': 2.752, 'epoch': 1.66}\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-7-5df45e9992a5>:18: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"inputs_embeds\": torch.tensor(self.tokenized_inputs[idx]),\n",
            "<ipython-input-7-5df45e9992a5>:19: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"labels_ids\": torch.tensor(self.labels[idx]),\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "{'loss': 0.4079, 'learning_rate': 0.0009884904825143868, 'epoch': 1.73}\n",
            "{'eval_loss': 0.2483579680378207, 'eval_r2_score': 0.6402055691063728, 'eval_mse': 0.24835796803782073, 'eval_runtime': 46.8313, 'eval_samples_per_second': 176.292, 'eval_steps_per_second': 2.755, 'epoch': 1.73}\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-7-5df45e9992a5>:18: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"inputs_embeds\": torch.tensor(self.tokenized_inputs[idx]),\n",
            "<ipython-input-7-5df45e9992a5>:19: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"labels_ids\": torch.tensor(self.labels[idx]),\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "{'loss': 0.2823, 'learning_rate': 0.0009880478087649404, 'epoch': 1.79}\n",
            "{'eval_loss': 0.25760457020039984, 'eval_r2_score': 0.6268100819832126, 'eval_mse': 0.2576045702003997, 'eval_runtime': 46.4538, 'eval_samples_per_second': 177.725, 'eval_steps_per_second': 2.777, 'epoch': 1.79}\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-7-5df45e9992a5>:18: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"inputs_embeds\": torch.tensor(self.tokenized_inputs[idx]),\n",
            "<ipython-input-7-5df45e9992a5>:19: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"labels_ids\": torch.tensor(self.labels[idx]),\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "{'loss': 0.2727, 'learning_rate': 0.0009876051350154937, 'epoch': 1.86}\n",
            "{'eval_loss': 0.32003583582614326, 'eval_r2_score': 0.5363663492403095, 'eval_mse': 0.32003583582614287, 'eval_runtime': 46.9421, 'eval_samples_per_second': 175.876, 'eval_steps_per_second': 2.748, 'epoch': 1.86}\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-7-5df45e9992a5>:18: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"inputs_embeds\": torch.tensor(self.tokenized_inputs[idx]),\n",
            "<ipython-input-7-5df45e9992a5>:19: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"labels_ids\": torch.tensor(self.labels[idx]),\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "{'loss': 0.2384, 'learning_rate': 0.0009871624612660468, 'epoch': 1.93}\n",
            "{'eval_loss': 0.25463475645596423, 'eval_r2_score': 0.6311124301401134, 'eval_mse': 0.2546347564559645, 'eval_runtime': 46.5823, 'eval_samples_per_second': 177.235, 'eval_steps_per_second': 2.769, 'epoch': 1.93}\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-7-5df45e9992a5>:18: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"inputs_embeds\": torch.tensor(self.tokenized_inputs[idx]),\n",
            "<ipython-input-7-5df45e9992a5>:19: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"labels_ids\": torch.tensor(self.labels[idx]),\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "{'loss': 0.3666, 'learning_rate': 0.0009867197875166004, 'epoch': 1.99}\n",
            "{'eval_loss': 0.3047233868148099, 'eval_r2_score': 0.5585493857707958, 'eval_mse': 0.3047233868148096, 'eval_runtime': 46.3858, 'eval_samples_per_second': 177.985, 'eval_steps_per_second': 2.781, 'epoch': 1.99}\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-7-5df45e9992a5>:18: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"inputs_embeds\": torch.tensor(self.tokenized_inputs[idx]),\n",
            "<ipython-input-7-5df45e9992a5>:19: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"labels_ids\": torch.tensor(self.labels[idx]),\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "{'loss': 0.2745, 'learning_rate': 0.0009862771137671537, 'epoch': 2.06}\n",
            "{'eval_loss': 0.2781263867546574, 'eval_r2_score': 0.5970802715552324, 'eval_mse': 0.27812638675465795, 'eval_runtime': 47.0033, 'eval_samples_per_second': 175.647, 'eval_steps_per_second': 2.744, 'epoch': 2.06}\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-7-5df45e9992a5>:18: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"inputs_embeds\": torch.tensor(self.tokenized_inputs[idx]),\n",
            "<ipython-input-7-5df45e9992a5>:19: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
            "  \"labels_ids\": torch.tensor(self.labels[idx]),\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "{'loss': 0.3716, 'learning_rate': 0.000985834440017707, 'epoch': 2.12}\n"
          ]
        },
        {
          "output_type": "error",
          "ename": "KeyboardInterrupt",
          "evalue": "ignored",
          "traceback": [
            "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
            "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
            "\u001b[0;32m<ipython-input-9-7ec879d4ca03>\u001b[0m in \u001b[0;36m<cell line: 71>\u001b[0;34m()\u001b[0m\n\u001b[1;32m     69\u001b[0m )\n\u001b[1;32m     70\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 71\u001b[0;31m \u001b[0mtrainer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/trainer.py\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[1;32m   1553\u001b[0m                 \u001b[0mhf_hub_utils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0menable_progress_bars\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1554\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1555\u001b[0;31m             return inner_training_loop(\n\u001b[0m\u001b[1;32m   1556\u001b[0m                 \u001b[0margs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1557\u001b[0m                 \u001b[0mresume_from_checkpoint\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mresume_from_checkpoint\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/trainer.py\u001b[0m in \u001b[0;36m_inner_training_loop\u001b[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m   1927\u001b[0m                     \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontrol\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcallback_handler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mon_step_end\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstate\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontrol\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1928\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1929\u001b[0;31m                     \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_maybe_log_save_evaluate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtr_loss\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrial\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mepoch\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mignore_keys_for_eval\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1930\u001b[0m                 \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1931\u001b[0m                     \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontrol\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcallback_handler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mon_substep_end\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstate\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontrol\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/trainer.py\u001b[0m in \u001b[0;36m_maybe_log_save_evaluate\u001b[0;34m(self, tr_loss, model, trial, epoch, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m   2254\u001b[0m                     \u001b[0mmetrics\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataset_metrics\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2255\u001b[0m             \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2256\u001b[0;31m                 \u001b[0mmetrics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevaluate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mignore_keys\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mignore_keys_for_eval\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   2257\u001b[0m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_report_to_hp_search\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrial\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstate\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mglobal_step\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmetrics\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2258\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/trainer.py\u001b[0m in \u001b[0;36mevaluate\u001b[0;34m(self, eval_dataset, ignore_keys, metric_key_prefix)\u001b[0m\n\u001b[1;32m   2970\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2971\u001b[0m         \u001b[0meval_loop\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprediction_loop\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0muse_legacy_prediction_loop\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevaluation_loop\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2972\u001b[0;31m         output = eval_loop(\n\u001b[0m\u001b[1;32m   2973\u001b[0m             \u001b[0meval_dataloader\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2974\u001b[0m             \u001b[0mdescription\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"Evaluation\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/trainer.py\u001b[0m in \u001b[0;36mevaluation_loop\u001b[0;34m(self, dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix)\u001b[0m\n\u001b[1;32m   3159\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   3160\u001b[0m             \u001b[0;31m# Prediction step\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3161\u001b[0;31m             \u001b[0mloss\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlogits\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabels\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprediction_step\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprediction_loss_only\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mignore_keys\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mignore_keys\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   3162\u001b[0m             \u001b[0mmain_input_name\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"main_input_name\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"input_ids\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   3163\u001b[0m             \u001b[0minputs_decode\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_prepare_input\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mmain_input_name\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minclude_inputs_for_metrics\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/trainer.py\u001b[0m in \u001b[0;36mprediction_step\u001b[0;34m(self, model, inputs, prediction_loss_only, ignore_keys)\u001b[0m\n\u001b[1;32m   3374\u001b[0m                 \u001b[0;32mif\u001b[0m \u001b[0mhas_labels\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mloss_without_labels\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   3375\u001b[0m                     \u001b[0;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompute_loss_context_manager\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3376\u001b[0;31m                         \u001b[0mloss\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moutputs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompute_loss\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreturn_outputs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   3377\u001b[0m                     \u001b[0mloss\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mloss\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmean\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdetach\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   3378\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/trainer.py\u001b[0m in \u001b[0;36mcompute_loss\u001b[0;34m(self, model, inputs, return_outputs)\u001b[0m\n\u001b[1;32m   2705\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2706\u001b[0m             \u001b[0mlabels\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2707\u001b[0;31m         \u001b[0moutputs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   2708\u001b[0m         \u001b[0;31m# Save past state if it exists\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2709\u001b[0m         \u001b[0;31m# TODO: this needs to be fixed and made cleaner later.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1499\u001b[0m                 \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_pre_hooks\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_hooks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1500\u001b[0m                 or _global_forward_hooks or _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mforward_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1502\u001b[0m         \u001b[0;31m# Do not call functions when jit is used\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1503\u001b[0m         \u001b[0mfull_backward_hooks\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnon_full_backward_hooks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/content/reservoirtransformers.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, input_ids, reservoir_ids, state_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, labels_ids, output_attentions, output_hidden_states, return_dict, id)\u001b[0m\n\u001b[1;32m   1127\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1128\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1129\u001b[0;31m         outputs = self.bert(\n\u001b[0m\u001b[1;32m   1130\u001b[0m             \u001b[0minput_ids\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1131\u001b[0m             \u001b[0mreservoir_ids\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mreservoir_outputs\u001b[0m\u001b[0;34m/\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnum_reservoirs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1499\u001b[0m                 \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_pre_hooks\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_hooks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1500\u001b[0m                 or _global_forward_hooks or _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mforward_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1502\u001b[0m         \u001b[0;31m# Do not call functions when jit is used\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1503\u001b[0m         \u001b[0mfull_backward_hooks\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnon_full_backward_hooks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/content/reservoirtransformers.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, input_ids, reservoir_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m    846\u001b[0m             \u001b[0mpast_key_values_length\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mpast_key_values_length\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    847\u001b[0m         )\n\u001b[0;32m--> 848\u001b[0;31m         encoder_outputs = self.encoder(\n\u001b[0m\u001b[1;32m    849\u001b[0m             \u001b[0membedding_output\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    850\u001b[0m             \u001b[0mattention_mask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mextended_attention_mask\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1499\u001b[0m                 \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_pre_hooks\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_hooks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1500\u001b[0m                 or _global_forward_hooks or _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mforward_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1502\u001b[0m         \u001b[0;31m# Do not call functions when jit is used\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1503\u001b[0m         \u001b[0mfull_backward_hooks\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnon_full_backward_hooks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/content/reservoirtransformers.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m    563\u001b[0m                 )\n\u001b[1;32m    564\u001b[0m             \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 565\u001b[0;31m                 layer_outputs = layer_module(\n\u001b[0m\u001b[1;32m    566\u001b[0m                     \u001b[0mhidden_states\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    567\u001b[0m                     \u001b[0mattention_mask\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1499\u001b[0m                 \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_pre_hooks\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_hooks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1500\u001b[0m                 or _global_forward_hooks or _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mforward_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1502\u001b[0m         \u001b[0;31m# Do not call functions when jit is used\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1503\u001b[0m         \u001b[0mfull_backward_hooks\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnon_full_backward_hooks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/content/reservoirtransformers.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_value, output_attentions)\u001b[0m\n\u001b[1;32m    490\u001b[0m             \u001b[0mpresent_key_value\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpresent_key_value\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mcross_attn_present_key_value\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    491\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 492\u001b[0;31m         layer_output = apply_chunking_to_forward(\n\u001b[0m\u001b[1;32m    493\u001b[0m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfeed_forward_chunk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchunk_size_feed_forward\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mseq_len_dim\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mattention_output\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    494\u001b[0m         )\n",
            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/pytorch_utils.py\u001b[0m in \u001b[0;36mapply_chunking_to_forward\u001b[0;34m(forward_fn, chunk_size, chunk_dim, *input_tensors)\u001b[0m\n\u001b[1;32m    237\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutput_chunks\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdim\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mchunk_dim\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    238\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 239\u001b[0;31m     \u001b[0;32mreturn\u001b[0m \u001b[0mforward_fn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput_tensors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    240\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    241\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/content/reservoirtransformers.py\u001b[0m in \u001b[0;36mfeed_forward_chunk\u001b[0;34m(self, attention_output)\u001b[0m\n\u001b[1;32m    503\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mfeed_forward_chunk\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mattention_output\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    504\u001b[0m         \u001b[0mintermediate_output\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mintermediate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mattention_output\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 505\u001b[0;31m         \u001b[0mlayer_output\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutput\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mintermediate_output\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mattention_output\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    506\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mlayer_output\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    507\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1499\u001b[0m                 \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_pre_hooks\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_hooks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1500\u001b[0m                 or _global_forward_hooks or _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mforward_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1502\u001b[0m         \u001b[0;31m# Do not call functions when jit is used\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1503\u001b[0m         \u001b[0mfull_backward_hooks\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnon_full_backward_hooks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/content/reservoirtransformers.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, hidden_states, input_tensor)\u001b[0m\n\u001b[1;32m    417\u001b[0m         \u001b[0mhidden_states\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdense\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhidden_states\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    418\u001b[0m         \u001b[0mhidden_states\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdropout\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhidden_states\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 419\u001b[0;31m         \u001b[0mhidden_states\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mLayerNorm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhidden_states\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0minput_tensor\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    420\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mhidden_states\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    421\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1499\u001b[0m                 \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_pre_hooks\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_hooks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1500\u001b[0m                 or _global_forward_hooks or _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mforward_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1502\u001b[0m         \u001b[0;31m# Do not call functions when jit is used\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1503\u001b[0m         \u001b[0mfull_backward_hooks\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnon_full_backward_hooks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/normalization.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, input)\u001b[0m\n\u001b[1;32m    188\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    189\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mTensor\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mTensor\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 190\u001b[0;31m         return F.layer_norm(\n\u001b[0m\u001b[1;32m    191\u001b[0m             input, self.normalized_shape, self.weight, self.bias, self.eps)\n\u001b[1;32m    192\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/functional.py\u001b[0m in \u001b[0;36mlayer_norm\u001b[0;34m(input, normalized_shape, weight, bias, eps)\u001b[0m\n\u001b[1;32m   2513\u001b[0m             \u001b[0mlayer_norm\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mweight\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbias\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnormalized_shape\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mweight\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mweight\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbias\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbias\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0meps\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0meps\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2514\u001b[0m         )\n\u001b[0;32m-> 2515\u001b[0;31m     \u001b[0;32mreturn\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlayer_norm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnormalized_shape\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mweight\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbias\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0meps\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackends\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcudnn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0menabled\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   2516\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2517\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
          ]
        }
      ],
      "source": [
        "from transformers import BertForSequenceClassification, Trainer, TrainingArguments\n",
        "from sklearn.metrics import mean_squared_error\n",
        "from transformers import Trainer, TrainingArguments\n",
        "from transformers import EarlyStoppingCallback, IntervalStrategy\n",
        "from sklearn.metrics import r2_score, accuracy_score\n",
        "import numpy as np\n",
        "\n",
        "def compute_metrics1(p):\n",
        "\n",
        "    preds = p.predictions.flatten()\n",
        "    labels = p.label_ids.flatten()\n",
        "\n",
        "    r2 = r2_score(labels, preds)\n",
        "    mse = mean_squared_error(labels, preds)\n",
        "\n",
        "    return {\"r2_score\": r2, \"mse\": mse}\n",
        "\n",
        "def compute_metrics_classification(p):\n",
        "    preds = np.argmax(p.predictions , axis=1)\n",
        "    labels = p.label_ids\n",
        "\n",
        "    accuracy = accuracy_score(labels, preds)\n",
        "    return {\"accuracy_score\": accuracy}\n",
        "\n",
        "def compute_metrics(p):\n",
        "\n",
        "\n",
        "    prediction_scores, labels_ids = p\n",
        "    #print('here')\n",
        "    #print(prediction_scores)\n",
        "\n",
        "    mask = labels_ids != 100\n",
        "    #print(mask)\n",
        "    masked_predictions = prediction_scores[mask]\n",
        "    masked_labels = labels_ids[mask]\n",
        "\n",
        "    mse = mean_squared_error(masked_predictions, masked_labels)\n",
        "    return {\"mse\": mse}\n",
        "\n",
        "training_args = TrainingArguments(\n",
        "    output_dir='./results_task1',\n",
        "    num_train_epochs=150,\n",
        "    label_names=[\"labels_ids\"],\n",
        "    disable_tqdm = True,\n",
        "    #label_names=[\"labels_mask\"],\n",
        "    do_eval=True,\n",
        "    learning_rate=0.001,\n",
        "    per_device_train_batch_size=batch,\n",
        "    per_device_eval_batch_size=batch,\n",
        "    logging_dir='./logs',\n",
        "    logging_strategy=\"steps\",\n",
        "    logging_steps=50,\n",
        "    evaluation_strategy=\"steps\",\n",
        "    eval_steps = 50,\n",
        "    save_strategy=\"steps\",\n",
        "    save_steps=50,\n",
        "\n",
        "    save_total_limit=2,\n",
        "    load_best_model_at_end=True,\n",
        ")\n",
        "\n",
        "trainer = CustomTrainer(\n",
        "    model=model,\n",
        "    args=training_args,\n",
        "    train_dataset=train_dataset,\n",
        "    eval_dataset=val_dataset,\n",
        "    compute_metrics=compute_metrics1, #compute_metrics1,#compute_metrics_classification,\n",
        "    callbacks = [EarlyStoppingCallback(early_stopping_patience=50)]\n",
        ")\n",
        "\n",
        "trainer.train()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 10,
      "id": "cac7d37e",
      "metadata": {
        "id": "cac7d37e"
      },
      "outputs": [],
      "source": [
        "model = reservoirtransformers.ReservoirTTimeSeries.from_pretrained(\"/content/results_task1/checkpoint-1200\", config=configuration).to(\"cuda\", dtype=float)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 11,
      "id": "827f3400",
      "metadata": {
        "id": "827f3400"
      },
      "outputs": [],
      "source": [
        "cnt = 0\n",
        "ln = len(X_test)\n",
        "y_pred = []\n",
        "y_test1 = []\n",
        "while cnt < ln:\n",
        "    #print(cnt, ln)\n",
        "    input_ids = torch.stack(X_test[cnt:cnt+batch], dim=0)\n",
        "    #y_test1 = y_test1 + [k.detach().numpy().flatten() for k in y_test[cnt:cnt+64]]\n",
        "\n",
        "    output = model(inputs_embeds = input_ids.to(model.device))['logits']\n",
        "    y_pred = y_pred + list(output.cpu().detach().numpy().reshape(output.size(0), -1))\n",
        "    #y_test = y_test + labels_ids\n",
        "\n",
        "    cnt=cnt+batch\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 12,
      "id": "1fae404b",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "1fae404b",
        "outputId": "d8ed1618-078a-4ca8-a255-cffbad6d13d4"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "0.21056613594457688"
            ]
          },
          "metadata": {},
          "execution_count": 12
        }
      ],
      "source": [
        "from sklearn.metrics import mean_squared_error, mean_absolute_error\n",
        "y_test1 = [i.detach().numpy().flatten() for i in y_test]\n",
        "\n",
        "mse = mean_squared_error(y_test1, y_pred)\n",
        "mse"
      ]
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "AOenxRwjeFH6"
      },
      "id": "AOenxRwjeFH6",
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.11.4"
    },
    "colab": {
      "provenance": [],
      "gpuType": "T4"
    },
    "accelerator": "GPU",
    "widgets": {
      "application/vnd.jupyter.widget-state+json": {
        "3a1b2a42cc844ef982e99d3f70abc1e5": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HBoxModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HBoxView",
            "box_style": "",
            "children": [
              "IPY_MODEL_f597bf3b256a4746a664b326b56b96e9",
              "IPY_MODEL_8b423b40e2af4e37897ab4180e5ff95c",
              "IPY_MODEL_8c97c27b3c674299a01619b12ce16e5e"
            ],
            "layout": "IPY_MODEL_df136c951ee049ea82e6fee406d18971"
          }
        },
        "f597bf3b256a4746a664b326b56b96e9": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_28af17880ea04062bee002f2bfa91c97",
            "placeholder": "​",
            "style": "IPY_MODEL_4c09b64952914682955da61e18bec753",
            "value": "100%"
          }
        },
        "8b423b40e2af4e37897ab4180e5ff95c": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "FloatProgressModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "ProgressView",
            "bar_style": "success",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_5f449c4f26d144a29aeefce9a57988a8",
            "max": 68961,
            "min": 0,
            "orientation": "horizontal",
            "style": "IPY_MODEL_428191df92ae40ddbe14e790be9c9bf4",
            "value": 68961
          }
        },
        "8c97c27b3c674299a01619b12ce16e5e": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_3b43eb0d025c4d1e8a2c3ffc5fa44625",
            "placeholder": "​",
            "style": "IPY_MODEL_fb2880e8783746d8b29401cff8f86440",
            "value": " 68961/68961 [00:00&lt;00:00, 184083.62it/s]"
          }
        },
        "df136c951ee049ea82e6fee406d18971": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "28af17880ea04062bee002f2bfa91c97": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "4c09b64952914682955da61e18bec753": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "5f449c4f26d144a29aeefce9a57988a8": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "428191df92ae40ddbe14e790be9c9bf4": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "ProgressStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "bar_color": null,
            "description_width": ""
          }
        },
        "3b43eb0d025c4d1e8a2c3ffc5fa44625": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "fb2880e8783746d8b29401cff8f86440": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        }
      }
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}