{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Compute the Activation Covariance Matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\PC\\miniconda3\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.\n",
      "  - If you're using `trust_remote_code=True`, you can get rid of this warning by loading the model with an auto class. See https://huggingface.co/docs/transformers/en/model_doc/auto#auto-classes\n",
      "  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).\n",
      "  - If you are not the owner of the model architecture class, please contact the model code owner to update it.\n",
      "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']\n",
      "- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "import sys\n",
    "import os\n",
    "import pickle\n",
    "sys.path.append(\"./util\")\n",
    "import featureM_utility as util\n",
    "from datasets import load_dataset\n",
    "# Load model directly\n",
    "from transformers import BertTokenizer, BertForMaskedLM\n",
    "\n",
    "# Model identifier from Hugging Face hub\n",
    "model_name = \"bert-base-uncased\"\n",
    "\n",
    "# Load from Hugging Face (will download and cache if needed)\n",
    "\n",
    "model = BertForMaskedLM.from_pretrained(model_name)\n",
    "tokenizer = BertTokenizer.from_pretrained(model_name)\n",
    "\n",
    "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
    "model.eval()\n",
    "\n",
    "model.to(device)\n",
    "\n",
    "replacedM = util.replaceWithFMmodules( model=model, device=device)\n",
    "\n",
    "DATASET = \"WikiText\" # WikiText\n",
    "\n",
    "\n",
    "# Load some dataset \n",
    "def hugginfaceFMgeneration( model, input_ids, attention_mask):\n",
    "\n",
    "    batch_size = 16\n",
    "\n",
    "    for i in range(0, input_ids.size(0), batch_size):\n",
    "        input_batch = input_ids[i:i+batch_size]\n",
    "        attn_batch = attention_mask[i:i+batch_size]\n",
    "\n",
    "        with torch.no_grad():\n",
    "            outputs = model(input_ids=input_batch, attention_mask=attn_batch)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "using WikiText\n"
     ]
    }
   ],
   "source": [
    "# Load the dataset\n",
    "if( DATASET == \"BookCorpus\"):\n",
    "    dataset = load_dataset(\"bookcorpus\", split=\"train\", trust_remote_code=True) #\n",
    "    evalSentences = 10000\n",
    "    dataset = dataset.select(range(evalSentences))\n",
    "    print(\"using Bookcorpus\")\n",
    "\n",
    "if( DATASET == \"WikiText\"):\n",
    "    dataset = load_dataset(\"wikitext\", \"wikitext-2-raw-v1\", split=\"test\", trust_remote_code=True)\n",
    "    print(\"using WikiText\")\n",
    "\n",
    "encodings = tokenizer(dataset[\"text\"], return_tensors=\"pt\", truncation=True, padding=True)\n",
    "\n",
    "input_ids = encodings.input_ids.to(device)\n",
    "attention_mask = encodings.attention_mask.to(device)\n",
    "\n",
    "\n",
    "hugginfaceFMgeneration( model, input_ids, attention_mask)\n",
    "\n",
    "# compute the FM\n",
    "util.setComputationMode(model, mode=\"FM\")\n",
    "hugginfaceFMgeneration(  model, input_ids, attention_mask)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# save the FM to a dictionary\n",
    "modules = [(name, module) for name, module in model.named_modules()  if isinstance(module, util.FeatureLayer) ]\n",
    "\n",
    "dictionary = {}\n",
    "for name, module in modules:\n",
    "    # only load valid ones\n",
    "    if( len(module.weight.shape) == 2 and (not \"pooler\" in name) and name != \"head.module.layers.0\" ):\n",
    "        # add it to the dictionary\n",
    "        dictionary[name] = {\"weight\":  module.weight.detach().cpu().numpy().copy(), \"FM\" : module.correlationM.detach().cpu().numpy().copy() } \n",
    "        \n",
    "with open(f'Data/{DATASET}_Bert', 'wb') as file:\n",
    "    pickle.dump(dictionary, file)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
