{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading checkpoint shards: 100%|██████████████████| 8/8 [00:02<00:00,  2.87it/s]\n",
      "Loading checkpoint shards: 100%|██████████████████| 8/8 [00:02<00:00,  3.49it/s]\n",
      "====rmu Config====\n",
      "model_name_or_path=HuggingFaceH4/zephyr-7b-beta\n",
      "module_str={model_name}.model.layers[{layer_id}]\n",
      "output_dir=models/zephyr_rmu\n",
      "retain_corpora=['wikitext', 'wikitext']\n",
      "forget_corpora=['bio-forget-corpus', 'cyber-forget-corpus']\n",
      "alpha=[1200.0, 1200.0]\n",
      "steering_coeffs=6.5,6.5\n",
      "lr=5e-05\n",
      "min_len=0\n",
      "max_len=2000\n",
      "batch_size=4\n",
      "max_num_batches=150\n",
      "layer_id=7\n",
      "layer_ids=[5, 6, 7]\n",
      "param_ids=[6]\n",
      "seed=42\n",
      "verbose=True\n",
      "steering_coeff_list=[6.5, 6.5]\n",
      "=====\n",
      "/home/hua2bv/.local/lib/python3.11/site-packages/transformers/optimization.py:429: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
      "  warnings.warn(\n",
      "======= Epoch 0 =======\n",
      "  0%|                                                   | 0/150 [00:00<?, ?it/s]/sfs/weka/scratch/hua2bv/unlearning/wmdp/rmu/unlearn.py:68: UserWarning: Using a target size (torch.Size([1, 1, 4096])) that is different to the input size (torch.Size([4, 512, 4096])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.\n",
      "  unlearn_loss = torch.nn.functional.mse_loss(\n",
      "loss: 0.09375 | unlearn_loss: 0.09375 | retain_loss: 0 | param_change: 2.217e-07\n",
      "unlearn_cosine_sim=1.0\n",
      "retain_cosine_sim=1.0\n",
      "Topic 0 updated_forget_activations.norm= 3.171875\n",
      "Topic 0 frozen_forget_activations.norm= 3.171875\n",
      "Topic 0 updated_retain_activations.norm= 4.46875\n",
      "Topic 0 frozen_retain_activations.norm= 4.46875\n",
      "  1%|▎                                          | 1/150 [00:02<05:27,  2.20s/it]/sfs/weka/scratch/hua2bv/unlearning/wmdp/rmu/unlearn.py:68: UserWarning: Using a target size (torch.Size([1, 1, 4096])) that is different to the input size (torch.Size([4, 768, 4096])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.\n",
      "  unlearn_loss = torch.nn.functional.mse_loss(\n",
      "loss: 0.05225 | unlearn_loss: 0.05225 | retain_loss: 4.387e-05 | param_change: 6.594e-07\n",
      "unlearn_cosine_sim=1.0\n",
      "retain_cosine_sim=1.0\n",
      "Topic 1 updated_forget_activations.norm= 2.515625\n",
      "Topic 1 frozen_forget_activations.norm= 2.515625\n",
      "Topic 1 updated_retain_activations.norm= 4.46875\n",
      "Topic 1 frozen_retain_activations.norm= 4.46875\n",
      "  1%|▌                                          | 2/150 [00:03<04:30,  1.83s/it]loss: 0.1011 | unlearn_loss: 0.09717 | retain_loss: 0.004089 | param_change: 4.053e-05\n",
      "unlearn_cosine_sim=1.0\n",
      "retain_cosine_sim=1.0\n",
      "Topic 0 updated_forget_activations.norm= 3.171875\n",
      "Topic 0 frozen_forget_activations.norm= 3.1875\n",
      "Topic 0 updated_retain_activations.norm= 4.28125\n",
      "Topic 0 frozen_retain_activations.norm= 4.28125\n",
      "  2%|▊                                          | 3/150 [00:04<03:39,  1.49s/it]loss: 0.3066 | unlearn_loss: 0.0542 | retain_loss: 0.252 | param_change: 0.0001945\n",
      "unlearn_cosine_sim=0.99609375\n",
      "retain_cosine_sim=0.9375\n",
      "Topic 1 updated_forget_activations.norm= 2.5\n",
      "Topic 1 frozen_forget_activations.norm= 2.5\n",
      "Topic 1 updated_retain_activations.norm= 4.53125\n",
      "Topic 1 frozen_retain_activations.norm= 4.28125\n",
      "  3%|█▏                                         | 4/150 [00:06<03:23,  1.39s/it]loss: 0.2266 | unlearn_loss: 0.09326 | retain_loss: 0.1338 | param_change: 0.0001163\n",
      "unlearn_cosine_sim=1.0\n",
      "retain_cosine_sim=0.94921875\n",
      "Topic 0 updated_forget_activations.norm= 3.140625\n",
      "Topic 0 frozen_forget_activations.norm= 3.15625\n",
      "Topic 0 updated_retain_activations.norm= 4.03125\n",
      "Topic 0 frozen_retain_activations.norm= 4.0625\n",
      "  3%|█▍                                         | 5/150 [00:07<03:05,  1.28s/it]loss: 0.1465 | unlearn_loss: 0.06348 | retain_loss: 0.0835 | param_change: 6.723e-05\n",
      "unlearn_cosine_sim=0.9921875\n",
      "retain_cosine_sim=0.96875\n",
      "Topic 1 updated_forget_activations.norm= 2.625\n",
      "Topic 1 frozen_forget_activations.norm= 2.65625\n",
      "Topic 1 updated_retain_activations.norm= 4.03125\n",
      "Topic 1 frozen_retain_activations.norm= 4.0625\n",
      "  4%|█▋                                         | 6/150 [00:08<03:07,  1.30s/it]loss: 0.1729 | unlearn_loss: 0.0918 | retain_loss: 0.08105 | param_change: 5.436e-05\n",
      "unlearn_cosine_sim=1.0\n",
      "retain_cosine_sim=0.96875\n",
      "Topic 0 updated_forget_activations.norm= 3.125\n",
      "Topic 0 frozen_forget_activations.norm= 3.15625\n",
      "Topic 0 updated_retain_activations.norm= 6.375\n",
      "Topic 0 frozen_retain_activations.norm= 6.375\n",
      "  5%|██                                         | 7/150 [00:09<02:52,  1.20s/it]loss: 0.1074 | unlearn_loss: 0.05396 | retain_loss: 0.05371 | param_change: 4.292e-05\n",
      "unlearn_cosine_sim=0.99609375\n",
      "retain_cosine_sim=0.98046875\n",
      "Topic 1 updated_forget_activations.norm= 2.53125\n",
      "Topic 1 frozen_forget_activations.norm= 2.53125\n",
      "Topic 1 updated_retain_activations.norm= 6.40625\n",
      "Topic 1 frozen_retain_activations.norm= 6.375\n",
      "  5%|██▎                                        | 8/150 [00:10<02:48,  1.19s/it]loss: 0.1318 | unlearn_loss: 0.09668 | retain_loss: 0.0354 | param_change: 2.885e-05\n",
      "unlearn_cosine_sim=1.0\n",
      "retain_cosine_sim=0.98828125\n",
      "Topic 0 updated_forget_activations.norm= 3.15625\n",
      "Topic 0 frozen_forget_activations.norm= 3.1875\n",
      "Topic 0 updated_retain_activations.norm= 4.96875\n",
      "Topic 0 frozen_retain_activations.norm= 4.96875\n",
      "  6%|██▌                                        | 9/150 [00:11<02:42,  1.15s/it]loss: 0.07373 | unlearn_loss: 0.0437 | retain_loss: 0.03015 | param_change: 2.372e-05\n",
      "unlearn_cosine_sim=1.0\n",
      "retain_cosine_sim=0.98828125\n",
      "Topic 1 updated_forget_activations.norm= 2.375\n",
      "Topic 1 frozen_forget_activations.norm= 2.390625\n",
      "Topic 1 updated_retain_activations.norm= 4.96875\n",
      "Topic 1 frozen_retain_activations.norm= 4.96875\n",
      "  7%|██▊                                       | 10/150 [00:12<02:43,  1.17s/it]loss: 0.1172 | unlearn_loss: 0.09277 | retain_loss: 0.02417 | param_change: 2.182e-05\n",
      "unlearn_cosine_sim=1.0\n",
      "retain_cosine_sim=0.9921875\n",
      "Topic 0 updated_forget_activations.norm= 3.1875\n",
      "Topic 0 frozen_forget_activations.norm= 3.203125\n",
      "Topic 0 updated_retain_activations.norm= 6.125\n",
      "Topic 0 frozen_retain_activations.norm= 6.125\n",
      "  7%|███                                       | 11/150 [00:13<02:34,  1.11s/it]loss: 0.07324 | unlearn_loss: 0.05396 | retain_loss: 0.01904 | param_change: 1.448e-05\n",
      "unlearn_cosine_sim=0.9921875\n",
      "retain_cosine_sim=0.9921875\n",
      "Topic 1 updated_forget_activations.norm= 2.546875\n",
      "Topic 1 frozen_forget_activations.norm= 2.546875\n",
      "Topic 1 updated_retain_activations.norm= 6.125\n",
      "Topic 1 frozen_retain_activations.norm= 6.125\n",
      "  8%|███▎                                      | 12/150 [00:15<02:34,  1.12s/it]loss: 0.1133 | unlearn_loss: 0.09277 | retain_loss: 0.02051 | param_change: 1.657e-05\n",
      "unlearn_cosine_sim=1.0\n",
      "retain_cosine_sim=0.9921875\n",
      "Topic 0 updated_forget_activations.norm= 3.109375\n",
      "Topic 0 frozen_forget_activations.norm= 3.140625\n",
      "Topic 0 updated_retain_activations.norm= 4.9375\n",
      "Topic 0 frozen_retain_activations.norm= 4.90625\n",
      "  9%|███▋                                      | 13/150 [00:16<02:29,  1.09s/it]loss: 0.06152 | unlearn_loss: 0.04395 | retain_loss: 0.01746 | param_change: 1.562e-05\n",
      "unlearn_cosine_sim=0.9921875\n",
      "retain_cosine_sim=0.9921875\n",
      "Topic 1 updated_forget_activations.norm= 2.375\n",
      "Topic 1 frozen_forget_activations.norm= 2.375\n",
      "Topic 1 updated_retain_activations.norm= 4.9375\n",
      "Topic 1 frozen_retain_activations.norm= 4.90625\n",
      "  9%|███▉                                      | 14/150 [00:17<02:33,  1.13s/it]loss: 0.1338 | unlearn_loss: 0.0957 | retain_loss: 0.03809 | param_change: 2.372e-05\n",
      "unlearn_cosine_sim=1.0\n",
      "retain_cosine_sim=0.984375\n",
      "Topic 0 updated_forget_activations.norm= 3.125\n",
      "Topic 0 frozen_forget_activations.norm= 3.171875\n",
      "Topic 0 updated_retain_activations.norm= 4.75\n",
      "Topic 0 frozen_retain_activations.norm= 4.75\n",
      " 10%|████▏                                     | 15/150 [00:18<02:28,  1.10s/it]loss: 0.06299 | unlearn_loss: 0.03418 | retain_loss: 0.02881 | param_change: 1.669e-05\n",
      "unlearn_cosine_sim=1.0\n",
      "retain_cosine_sim=0.98828125\n",
      "Topic 1 updated_forget_activations.norm= 2.265625\n",
      "Topic 1 frozen_forget_activations.norm= 2.28125\n",
      "Topic 1 updated_retain_activations.norm= 4.75\n",
      "Topic 1 frozen_retain_activations.norm= 4.75\n",
      " 11%|████▍                                     | 16/150 [00:19<02:32,  1.14s/it]loss: 0.1138 | unlearn_loss: 0.09277 | retain_loss: 0.02112 | param_change: 1.52e-05\n",
      "unlearn_cosine_sim=1.0\n",
      "retain_cosine_sim=0.9921875\n",
      "Topic 0 updated_forget_activations.norm= 3.125\n",
      "Topic 0 frozen_forget_activations.norm= 3.140625\n",
      "Topic 0 updated_retain_activations.norm= 5.03125\n",
      "Topic 0 frozen_retain_activations.norm= 5.0\n",
      " 11%|████▊                                     | 17/150 [00:20<02:27,  1.11s/it]loss: 0.05127 | unlearn_loss: 0.03394 | retain_loss: 0.01721 | param_change: 1.502e-05\n",
      "unlearn_cosine_sim=0.99609375\n",
      "retain_cosine_sim=0.9921875\n",
      "Topic 1 updated_forget_activations.norm= 2.21875\n",
      "Topic 1 frozen_forget_activations.norm= 2.21875\n",
      "Topic 1 updated_retain_activations.norm= 5.03125\n",
      "Topic 1 frozen_retain_activations.norm= 5.0\n",
      " 12%|█████                                     | 18/150 [00:21<02:29,  1.14s/it]loss: 0.1079 | unlearn_loss: 0.09424 | retain_loss: 0.01379 | param_change: 1.067e-05\n",
      "unlearn_cosine_sim=1.0\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 3.125\n",
      "Topic 0 frozen_forget_activations.norm= 3.171875\n",
      "Topic 0 updated_retain_activations.norm= 4.84375\n",
      "Topic 0 frozen_retain_activations.norm= 4.84375\n",
      " 13%|█████▎                                    | 19/150 [00:22<02:27,  1.12s/it]loss: 0.04736 | unlearn_loss: 0.03394 | retain_loss: 0.01349 | param_change: 1.067e-05\n",
      "unlearn_cosine_sim=0.99609375\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 2.203125\n",
      "Topic 1 frozen_forget_activations.norm= 2.21875\n",
      "Topic 1 updated_retain_activations.norm= 4.84375\n",
      "Topic 1 frozen_retain_activations.norm= 4.84375\n",
      " 13%|█████▌                                    | 20/150 [00:24<02:30,  1.16s/it]loss: 0.1152 | unlearn_loss: 0.09424 | retain_loss: 0.02087 | param_change: 1.657e-05\n",
      "unlearn_cosine_sim=1.0\n",
      "retain_cosine_sim=0.9921875\n",
      "Topic 0 updated_forget_activations.norm= 3.125\n",
      "Topic 0 frozen_forget_activations.norm= 3.15625\n",
      "Topic 0 updated_retain_activations.norm= 10.1875\n",
      "Topic 0 frozen_retain_activations.norm= 10.1875\n",
      " 14%|█████▉                                    | 21/150 [00:25<02:20,  1.09s/it]loss: 0.07178 | unlearn_loss: 0.05469 | retain_loss: 0.01697 | param_change: 1.15e-05\n",
      "unlearn_cosine_sim=0.98828125\n",
      "retain_cosine_sim=0.9921875\n",
      "Topic 1 updated_forget_activations.norm= 2.515625\n",
      "Topic 1 frozen_forget_activations.norm= 2.515625\n",
      "Topic 1 updated_retain_activations.norm= 10.1875\n",
      "Topic 1 frozen_retain_activations.norm= 10.1875\n",
      " 15%|██████▏                                   | 22/150 [00:26<02:19,  1.09s/it]loss: 0.1099 | unlearn_loss: 0.09473 | retain_loss: 0.01526 | param_change: 1.442e-05\n",
      "unlearn_cosine_sim=0.99609375\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 3.09375\n",
      "Topic 0 frozen_forget_activations.norm= 3.140625\n",
      "Topic 0 updated_retain_activations.norm= 7.03125\n",
      "Topic 0 frozen_retain_activations.norm= 7.0\n",
      " 15%|██████▍                                   | 23/150 [00:27<02:14,  1.06s/it]loss: 0.07617 | unlearn_loss: 0.06299 | retain_loss: 0.01331 | param_change: 1.311e-05\n",
      "unlearn_cosine_sim=0.96875\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 2.75\n",
      "Topic 1 frozen_forget_activations.norm= 2.703125\n",
      "Topic 1 updated_retain_activations.norm= 7.03125\n",
      "Topic 1 frozen_retain_activations.norm= 7.0\n",
      " 16%|██████▋                                   | 24/150 [00:28<02:15,  1.08s/it]loss: 0.1069 | unlearn_loss: 0.09668 | retain_loss: 0.01038 | param_change: 8.345e-06\n",
      "unlearn_cosine_sim=0.99609375\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 3.109375\n",
      "Topic 0 frozen_forget_activations.norm= 3.15625\n",
      "Topic 0 updated_retain_activations.norm= 7.03125\n",
      "Topic 0 frozen_retain_activations.norm= 7.03125\n",
      " 17%|███████                                   | 25/150 [00:29<02:11,  1.05s/it]loss: 0.05273 | unlearn_loss: 0.04346 | retain_loss: 0.009399 | param_change: 8.166e-06\n",
      "unlearn_cosine_sim=0.9921875\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 2.390625\n",
      "Topic 1 frozen_forget_activations.norm= 2.40625\n",
      "Topic 1 updated_retain_activations.norm= 7.03125\n",
      "Topic 1 frozen_retain_activations.norm= 7.03125\n",
      " 17%|███████▎                                  | 26/150 [00:30<02:25,  1.17s/it]loss: 0.1133 | unlearn_loss: 0.0918 | retain_loss: 0.02136 | param_change: 1.645e-05\n",
      "unlearn_cosine_sim=0.99609375\n",
      "retain_cosine_sim=0.9921875\n",
      "Topic 0 updated_forget_activations.norm= 3.09375\n",
      "Topic 0 frozen_forget_activations.norm= 3.140625\n",
      "Topic 0 updated_retain_activations.norm= 4.09375\n",
      "Topic 0 frozen_retain_activations.norm= 4.125\n",
      " 18%|███████▌                                  | 27/150 [00:31<02:21,  1.15s/it]loss: 0.06006 | unlearn_loss: 0.04419 | retain_loss: 0.01599 | param_change: 1.192e-05\n",
      "unlearn_cosine_sim=0.99609375\n",
      "retain_cosine_sim=0.9921875\n",
      "Topic 1 updated_forget_activations.norm= 2.328125\n",
      "Topic 1 frozen_forget_activations.norm= 2.359375\n",
      "Topic 1 updated_retain_activations.norm= 4.09375\n",
      "Topic 1 frozen_retain_activations.norm= 4.125\n",
      " 19%|███████▊                                  | 28/150 [00:33<02:25,  1.19s/it]loss: 0.104 | unlearn_loss: 0.09375 | retain_loss: 0.01013 | param_change: 1.448e-05\n",
      "unlearn_cosine_sim=0.99609375\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 3.125\n",
      "Topic 0 frozen_forget_activations.norm= 3.1875\n",
      "Topic 0 updated_retain_activations.norm= 6.09375\n",
      "Topic 0 frozen_retain_activations.norm= 6.09375\n",
      " 19%|████████                                  | 29/150 [00:34<02:15,  1.12s/it]loss: 0.07324 | unlearn_loss: 0.06299 | retain_loss: 0.01001 | param_change: 1.311e-05\n",
      "unlearn_cosine_sim=0.98046875\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 2.65625\n",
      "Topic 1 frozen_forget_activations.norm= 2.640625\n",
      "Topic 1 updated_retain_activations.norm= 6.09375\n",
      "Topic 1 frozen_retain_activations.norm= 6.09375\n",
      " 20%|████████▍                                 | 30/150 [00:35<02:14,  1.12s/it]loss: 0.104 | unlearn_loss: 0.09375 | retain_loss: 0.01044 | param_change: 1.24e-05\n",
      "unlearn_cosine_sim=0.99609375\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 3.09375\n",
      "Topic 0 frozen_forget_activations.norm= 3.15625\n",
      "Topic 0 updated_retain_activations.norm= 4.21875\n",
      "Topic 0 frozen_retain_activations.norm= 4.1875\n",
      " 21%|████████▋                                 | 31/150 [00:36<02:11,  1.10s/it]loss: 0.05273 | unlearn_loss: 0.04346 | retain_loss: 0.009399 | param_change: 1.174e-05\n",
      "unlearn_cosine_sim=0.9921875\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 2.359375\n",
      "Topic 1 frozen_forget_activations.norm= 2.375\n",
      "Topic 1 updated_retain_activations.norm= 4.21875\n",
      "Topic 1 frozen_retain_activations.norm= 4.1875\n",
      " 21%|████████▉                                 | 32/150 [00:37<02:15,  1.15s/it]loss: 0.1094 | unlearn_loss: 0.09277 | retain_loss: 0.01672 | param_change: 1.621e-05\n",
      "unlearn_cosine_sim=0.99609375\n",
      "retain_cosine_sim=0.9921875\n",
      "Topic 0 updated_forget_activations.norm= 3.078125\n",
      "Topic 0 frozen_forget_activations.norm= 3.140625\n",
      "Topic 0 updated_retain_activations.norm= 3.59375\n",
      "Topic 0 frozen_retain_activations.norm= 3.59375\n",
      " 22%|█████████▏                                | 33/150 [00:38<02:14,  1.15s/it]loss: 0.04736 | unlearn_loss: 0.03394 | retain_loss: 0.01337 | param_change: 1.103e-05\n",
      "unlearn_cosine_sim=0.99609375\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 2.234375\n",
      "Topic 1 frozen_forget_activations.norm= 2.28125\n",
      "Topic 1 updated_retain_activations.norm= 3.578125\n",
      "Topic 1 frozen_retain_activations.norm= 3.59375\n",
      " 23%|█████████▌                                | 34/150 [00:39<02:17,  1.19s/it]loss: 0.1084 | unlearn_loss: 0.0957 | retain_loss: 0.01245 | param_change: 2.36e-05\n",
      "unlearn_cosine_sim=0.99609375\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 3.09375\n",
      "Topic 0 frozen_forget_activations.norm= 3.15625\n",
      "Topic 0 updated_retain_activations.norm= 4.4375\n",
      "Topic 0 frozen_retain_activations.norm= 4.46875\n",
      " 23%|█████████▊                                | 35/150 [00:41<02:16,  1.19s/it]loss: 0.04346 | unlearn_loss: 0.03394 | retain_loss: 0.009399 | param_change: 1.341e-05\n",
      "unlearn_cosine_sim=0.99609375\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 2.234375\n",
      "Topic 1 frozen_forget_activations.norm= 2.265625\n",
      "Topic 1 updated_retain_activations.norm= 4.46875\n",
      "Topic 1 frozen_retain_activations.norm= 4.46875\n",
      " 24%|██████████                                | 36/150 [00:42<02:15,  1.19s/it]loss: 0.103 | unlearn_loss: 0.09375 | retain_loss: 0.00946 | param_change: 1.127e-05\n",
      "unlearn_cosine_sim=0.99609375\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 3.09375\n",
      "Topic 0 frozen_forget_activations.norm= 3.15625\n",
      "Topic 0 updated_retain_activations.norm= 5.3125\n",
      "Topic 0 frozen_retain_activations.norm= 5.3125\n",
      " 25%|██████████▎                               | 37/150 [00:43<02:07,  1.13s/it]loss: 0.04297 | unlearn_loss: 0.03394 | retain_loss: 0.009033 | param_change: 1.228e-05\n",
      "unlearn_cosine_sim=0.99609375\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 2.28125\n",
      "Topic 1 frozen_forget_activations.norm= 2.3125\n",
      "Topic 1 updated_retain_activations.norm= 5.3125\n",
      "Topic 1 frozen_retain_activations.norm= 5.3125\n",
      " 25%|██████████▋                               | 38/150 [00:44<02:09,  1.16s/it]loss: 0.1084 | unlearn_loss: 0.09619 | retain_loss: 0.01245 | param_change: 2.42e-05\n",
      "unlearn_cosine_sim=0.99609375\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 3.09375\n",
      "Topic 0 frozen_forget_activations.norm= 3.15625\n",
      "Topic 0 updated_retain_activations.norm= 5.21875\n",
      "Topic 0 frozen_retain_activations.norm= 5.1875\n",
      " 26%|██████████▉                               | 39/150 [00:45<02:03,  1.11s/it]loss: 0.05127 | unlearn_loss: 0.0437 | retain_loss: 0.007507 | param_change: 1.18e-05\n",
      "unlearn_cosine_sim=0.99609375\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 2.359375\n",
      "Topic 1 frozen_forget_activations.norm= 2.390625\n",
      "Topic 1 updated_retain_activations.norm= 5.1875\n",
      "Topic 1 frozen_retain_activations.norm= 5.1875\n",
      " 27%|███████████▏                              | 40/150 [00:46<02:04,  1.13s/it]loss: 0.104 | unlearn_loss: 0.09375 | retain_loss: 0.01007 | param_change: 1.514e-05\n",
      "unlearn_cosine_sim=0.99609375\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 3.0625\n",
      "Topic 0 frozen_forget_activations.norm= 3.15625\n",
      "Topic 0 updated_retain_activations.norm= 3.75\n",
      "Topic 0 frozen_retain_activations.norm= 3.765625\n",
      " 27%|███████████▍                              | 41/150 [00:47<02:02,  1.12s/it]loss: 0.06445 | unlearn_loss: 0.05371 | retain_loss: 0.01086 | param_change: 1.943e-05\n",
      "unlearn_cosine_sim=0.9921875\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 2.5\n",
      "Topic 1 frozen_forget_activations.norm= 2.53125\n",
      "Topic 1 updated_retain_activations.norm= 3.734375\n",
      "Topic 1 frozen_retain_activations.norm= 3.765625\n",
      " 28%|███████████▊                              | 42/150 [00:49<02:07,  1.18s/it]loss: 0.1094 | unlearn_loss: 0.09863 | retain_loss: 0.0105 | param_change: 2.003e-05\n",
      "unlearn_cosine_sim=0.99609375\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 3.09375\n",
      "Topic 0 frozen_forget_activations.norm= 3.15625\n",
      "Topic 0 updated_retain_activations.norm= 4.0625\n",
      "Topic 0 frozen_retain_activations.norm= 4.0625\n",
      " 29%|████████████                              | 43/150 [00:50<02:02,  1.14s/it]loss: 0.07031 | unlearn_loss: 0.06299 | retain_loss: 0.007202 | param_change: 1.067e-05\n",
      "unlearn_cosine_sim=0.984375\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 2.6875\n",
      "Topic 1 frozen_forget_activations.norm= 2.703125\n",
      "Topic 1 updated_retain_activations.norm= 4.0625\n",
      "Topic 1 frozen_retain_activations.norm= 4.0625\n",
      " 29%|████████████▎                             | 44/150 [00:51<02:04,  1.17s/it]loss: 0.104 | unlearn_loss: 0.09619 | retain_loss: 0.007812 | param_change: 1.419e-05\n",
      "unlearn_cosine_sim=0.99609375\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 3.125\n",
      "Topic 0 frozen_forget_activations.norm= 3.1875\n",
      "Topic 0 updated_retain_activations.norm= 3.953125\n",
      "Topic 0 frozen_retain_activations.norm= 3.9375\n",
      " 30%|████████████▌                             | 45/150 [00:52<02:00,  1.15s/it]loss: 0.05103 | unlearn_loss: 0.04346 | retain_loss: 0.007568 | param_change: 1.538e-05\n",
      "unlearn_cosine_sim=0.98828125\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 2.390625\n",
      "Topic 1 frozen_forget_activations.norm= 2.40625\n",
      "Topic 1 updated_retain_activations.norm= 3.96875\n",
      "Topic 1 frozen_retain_activations.norm= 3.9375\n",
      " 31%|████████████▉                             | 46/150 [00:53<02:03,  1.19s/it]loss: 0.1133 | unlearn_loss: 0.09668 | retain_loss: 0.0166 | param_change: 2.301e-05\n",
      "unlearn_cosine_sim=0.9921875\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 3.109375\n",
      "Topic 0 frozen_forget_activations.norm= 3.1875\n",
      "Topic 0 updated_retain_activations.norm= 41.5\n",
      "Topic 0 frozen_retain_activations.norm= 41.5\n",
      " 31%|█████████████▏                            | 47/150 [00:54<01:55,  1.12s/it]loss: 0.04712 | unlearn_loss: 0.03394 | retain_loss: 0.01318 | param_change: 2.742e-05\n",
      "unlearn_cosine_sim=0.99609375\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 2.234375\n",
      "Topic 1 frozen_forget_activations.norm= 2.28125\n",
      "Topic 1 updated_retain_activations.norm= 41.5\n",
      "Topic 1 frozen_retain_activations.norm= 41.5\n",
      " 32%|█████████████▍                            | 48/150 [00:55<01:51,  1.10s/it]loss: 0.106 | unlearn_loss: 0.0957 | retain_loss: 0.01031 | param_change: 1.991e-05\n",
      "unlearn_cosine_sim=0.9921875\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 3.09375\n",
      "Topic 0 frozen_forget_activations.norm= 3.171875\n",
      "Topic 0 updated_retain_activations.norm= 31.875\n",
      "Topic 0 frozen_retain_activations.norm= 31.875\n",
      " 33%|█████████████▋                            | 49/150 [00:56<01:43,  1.03s/it]loss: 0.0437 | unlearn_loss: 0.03394 | retain_loss: 0.009705 | param_change: 2.694e-05\n",
      "unlearn_cosine_sim=0.99609375\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 2.25\n",
      "Topic 1 frozen_forget_activations.norm= 2.296875\n",
      "Topic 1 updated_retain_activations.norm= 31.875\n",
      "Topic 1 frozen_retain_activations.norm= 31.875\n",
      " 33%|██████████████                            | 50/150 [00:57<01:45,  1.06s/it]loss: 0.1025 | unlearn_loss: 0.09424 | retain_loss: 0.00824 | param_change: 4.363e-05\n",
      "unlearn_cosine_sim=0.9921875\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 3.09375\n",
      "Topic 0 frozen_forget_activations.norm= 3.15625\n",
      "Topic 0 updated_retain_activations.norm= 44.5\n",
      "Topic 0 frozen_retain_activations.norm= 44.5\n",
      " 34%|██████████████▎                           | 51/150 [00:58<01:38,  1.01it/s]loss: 0.08398 | unlearn_loss: 0.05322 | retain_loss: 0.03052 | param_change: 5.15e-05\n",
      "unlearn_cosine_sim=0.9921875\n",
      "retain_cosine_sim=0.98828125\n",
      "Topic 1 updated_forget_activations.norm= 2.484375\n",
      "Topic 1 frozen_forget_activations.norm= 2.515625\n",
      "Topic 1 updated_retain_activations.norm= 44.5\n",
      "Topic 1 frozen_retain_activations.norm= 44.5\n",
      " 35%|██████████████▌                           | 52/150 [00:59<01:39,  1.02s/it]loss: 0.1064 | unlearn_loss: 0.09229 | retain_loss: 0.0141 | param_change: 3.767e-05\n",
      "unlearn_cosine_sim=0.99609375\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 3.078125\n",
      "Topic 0 frozen_forget_activations.norm= 3.140625\n",
      "Topic 0 updated_retain_activations.norm= 31.75\n",
      "Topic 0 frozen_retain_activations.norm= 31.75\n",
      " 35%|██████████████▊                           | 53/150 [01:00<01:34,  1.03it/s]loss: 0.05078 | unlearn_loss: 0.03394 | retain_loss: 0.01672 | param_change: 5.317e-05\n",
      "unlearn_cosine_sim=0.99609375\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 2.234375\n",
      "Topic 1 frozen_forget_activations.norm= 2.28125\n",
      "Topic 1 updated_retain_activations.norm= 31.75\n",
      "Topic 1 frozen_retain_activations.norm= 31.75\n",
      " 36%|███████████████                           | 54/150 [01:01<01:36,  1.00s/it]loss: 0.1104 | unlearn_loss: 0.09375 | retain_loss: 0.01685 | param_change: 1.931e-05\n",
      "unlearn_cosine_sim=0.98828125\n",
      "retain_cosine_sim=0.9921875\n",
      "Topic 0 updated_forget_activations.norm= 3.046875\n",
      "Topic 0 frozen_forget_activations.norm= 3.140625\n",
      "Topic 0 updated_retain_activations.norm= 4.34375\n",
      "Topic 0 frozen_retain_activations.norm= 4.34375\n",
      " 37%|███████████████▍                          | 55/150 [01:02<01:36,  1.02s/it]loss: 0.05566 | unlearn_loss: 0.04346 | retain_loss: 0.01215 | param_change: 1.264e-05\n",
      "unlearn_cosine_sim=0.9921875\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 2.328125\n",
      "Topic 1 frozen_forget_activations.norm= 2.390625\n",
      "Topic 1 updated_retain_activations.norm= 4.34375\n",
      "Topic 1 frozen_retain_activations.norm= 4.34375\n",
      " 37%|███████████████▋                          | 56/150 [01:03<01:42,  1.09s/it]loss: 0.0918 | unlearn_loss: 0.08105 | retain_loss: 0.01056 | param_change: 9.06e-06\n",
      "unlearn_cosine_sim=0.9921875\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 2.921875\n",
      "Topic 0 frozen_forget_activations.norm= 3.0\n",
      "Topic 0 updated_retain_activations.norm= 4.375\n",
      "Topic 0 frozen_retain_activations.norm= 4.34375\n",
      " 38%|███████████████▉                          | 57/150 [01:05<01:40,  1.09s/it]loss: 0.05347 | unlearn_loss: 0.04321 | retain_loss: 0.01025 | param_change: 1.174e-05\n",
      "unlearn_cosine_sim=0.9921875\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 2.34375\n",
      "Topic 1 frozen_forget_activations.norm= 2.390625\n",
      "Topic 1 updated_retain_activations.norm= 4.375\n",
      "Topic 1 frozen_retain_activations.norm= 4.34375\n",
      " 39%|████████████████▏                         | 58/150 [01:06<01:43,  1.13s/it]loss: 0.1084 | unlearn_loss: 0.0918 | retain_loss: 0.01648 | param_change: 1.895e-05\n",
      "unlearn_cosine_sim=0.984375\n",
      "retain_cosine_sim=0.9921875\n",
      "Topic 0 updated_forget_activations.norm= 3.0625\n",
      "Topic 0 frozen_forget_activations.norm= 3.171875\n",
      "Topic 0 updated_retain_activations.norm= 3.296875\n",
      "Topic 0 frozen_retain_activations.norm= 3.28125\n",
      " 39%|████████████████▌                         | 59/150 [01:07<01:45,  1.16s/it]loss: 0.04639 | unlearn_loss: 0.03369 | retain_loss: 0.01282 | param_change: 1.293e-05\n",
      "unlearn_cosine_sim=0.9921875\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 2.21875\n",
      "Topic 1 frozen_forget_activations.norm= 2.28125\n",
      "Topic 1 updated_retain_activations.norm= 3.296875\n",
      "Topic 1 frozen_retain_activations.norm= 3.28125\n",
      " 40%|████████████████▊                         | 60/150 [01:08<01:51,  1.24s/it]loss: 0.1045 | unlearn_loss: 0.09229 | retain_loss: 0.01215 | param_change: 1.329e-05\n",
      "unlearn_cosine_sim=0.984375\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 3.046875\n",
      "Topic 0 frozen_forget_activations.norm= 3.140625\n",
      "Topic 0 updated_retain_activations.norm= 3.0\n",
      "Topic 0 frozen_retain_activations.norm= 3.015625\n",
      " 41%|█████████████████                         | 61/150 [01:10<01:50,  1.24s/it]loss: 0.04517 | unlearn_loss: 0.03394 | retain_loss: 0.01117 | param_change: 1.597e-05\n",
      "unlearn_cosine_sim=0.99609375\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 2.28125\n",
      "Topic 1 frozen_forget_activations.norm= 2.328125\n",
      "Topic 1 updated_retain_activations.norm= 3.0\n",
      "Topic 1 frozen_retain_activations.norm= 3.015625\n",
      " 41%|█████████████████▎                        | 62/150 [01:11<01:52,  1.28s/it]loss: 0.1035 | unlearn_loss: 0.09277 | retain_loss: 0.01086 | param_change: 1.55e-05\n",
      "unlearn_cosine_sim=0.9921875\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 3.09375\n",
      "Topic 0 frozen_forget_activations.norm= 3.15625\n",
      "Topic 0 updated_retain_activations.norm= 2.984375\n",
      "Topic 0 frozen_retain_activations.norm= 3.0\n",
      " 42%|█████████████████▋                        | 63/150 [01:12<01:50,  1.27s/it]loss: 0.04395 | unlearn_loss: 0.03369 | retain_loss: 0.01013 | param_change: 1.86e-05\n",
      "unlearn_cosine_sim=0.9921875\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 2.1875\n",
      "Topic 1 frozen_forget_activations.norm= 2.25\n",
      "Topic 1 updated_retain_activations.norm= 3.015625\n",
      "Topic 1 frozen_retain_activations.norm= 3.0\n",
      " 43%|█████████████████▉                        | 64/150 [01:14<01:53,  1.32s/it]loss: 0.1094 | unlearn_loss: 0.09814 | retain_loss: 0.01099 | param_change: 2.027e-05\n",
      "unlearn_cosine_sim=0.98828125\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 3.109375\n",
      "Topic 0 frozen_forget_activations.norm= 3.1875\n",
      "Topic 0 updated_retain_activations.norm= 3.703125\n",
      "Topic 0 frozen_retain_activations.norm= 3.71875\n",
      " 43%|██████████████████▏                       | 65/150 [01:15<01:47,  1.27s/it]loss: 0.0437 | unlearn_loss: 0.03369 | retain_loss: 0.01007 | param_change: 1.979e-05\n",
      "unlearn_cosine_sim=0.9921875\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 2.234375\n",
      "Topic 1 frozen_forget_activations.norm= 2.296875\n",
      "Topic 1 updated_retain_activations.norm= 3.75\n",
      "Topic 1 frozen_retain_activations.norm= 3.71875\n",
      " 44%|██████████████████▍                       | 66/150 [01:16<01:46,  1.27s/it]loss: 0.106 | unlearn_loss: 0.09277 | retain_loss: 0.013 | param_change: 1.764e-05\n",
      "unlearn_cosine_sim=0.98828125\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 3.09375\n",
      "Topic 0 frozen_forget_activations.norm= 3.15625\n",
      "Topic 0 updated_retain_activations.norm= 3.78125\n",
      "Topic 0 frozen_retain_activations.norm= 3.78125\n",
      " 45%|██████████████████▊                       | 67/150 [01:17<01:41,  1.22s/it]loss: 0.05371 | unlearn_loss: 0.04297 | retain_loss: 0.01074 | param_change: 1.222e-05\n",
      "unlearn_cosine_sim=0.9921875\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 2.390625\n",
      "Topic 1 frozen_forget_activations.norm= 2.4375\n",
      "Topic 1 updated_retain_activations.norm= 3.78125\n",
      "Topic 1 frozen_retain_activations.norm= 3.78125\n",
      " 45%|███████████████████                       | 68/150 [01:19<01:41,  1.24s/it]loss: 0.1016 | unlearn_loss: 0.0918 | retain_loss: 0.009521 | param_change: 9.716e-06\n",
      "unlearn_cosine_sim=0.98046875\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 3.0625\n",
      "Topic 0 frozen_forget_activations.norm= 3.15625\n",
      "Topic 0 updated_retain_activations.norm= 4.125\n",
      "Topic 0 frozen_retain_activations.norm= 4.125\n",
      " 46%|███████████████████▎                      | 69/150 [01:20<01:38,  1.21s/it]loss: 0.05176 | unlearn_loss: 0.04321 | retain_loss: 0.008423 | param_change: 9.179e-06\n",
      "unlearn_cosine_sim=0.9921875\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 2.328125\n",
      "Topic 1 frozen_forget_activations.norm= 2.390625\n",
      "Topic 1 updated_retain_activations.norm= 4.125\n",
      "Topic 1 frozen_retain_activations.norm= 4.125\n",
      " 47%|███████████████████▌                      | 70/150 [01:21<01:37,  1.22s/it]loss: 0.1001 | unlearn_loss: 0.09277 | retain_loss: 0.007355 | param_change: 1.079e-05\n",
      "unlearn_cosine_sim=0.9765625\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 3.0625\n",
      "Topic 0 frozen_forget_activations.norm= 3.140625\n",
      "Topic 0 updated_retain_activations.norm= 4.03125\n",
      "Topic 0 frozen_retain_activations.norm= 4.03125\n",
      " 47%|███████████████████▉                      | 71/150 [01:22<01:33,  1.18s/it]loss: 0.06006 | unlearn_loss: 0.05298 | retain_loss: 0.00708 | param_change: 1.729e-05\n",
      "unlearn_cosine_sim=0.98828125\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 2.484375\n",
      "Topic 1 frozen_forget_activations.norm= 2.515625\n",
      "Topic 1 updated_retain_activations.norm= 4.0625\n",
      "Topic 1 frozen_retain_activations.norm= 4.03125\n",
      " 48%|████████████████████▏                     | 72/150 [01:23<01:33,  1.20s/it]loss: 0.1016 | unlearn_loss: 0.0918 | retain_loss: 0.009705 | param_change: 1.693e-05\n",
      "unlearn_cosine_sim=0.97265625\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 3.078125\n",
      "Topic 0 frozen_forget_activations.norm= 3.15625\n",
      "Topic 0 updated_retain_activations.norm= 4.53125\n",
      "Topic 0 frozen_retain_activations.norm= 4.5625\n",
      " 49%|████████████████████▍                     | 73/150 [01:24<01:31,  1.18s/it]loss: 0.04126 | unlearn_loss: 0.03345 | retain_loss: 0.007874 | param_change: 1.091e-05\n",
      "unlearn_cosine_sim=0.98828125\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 2.203125\n",
      "Topic 1 frozen_forget_activations.norm= 2.265625\n",
      "Topic 1 updated_retain_activations.norm= 4.53125\n",
      "Topic 1 frozen_retain_activations.norm= 4.5625\n",
      " 49%|████████████████████▋                     | 74/150 [01:26<01:29,  1.18s/it]loss: 0.1079 | unlearn_loss: 0.09229 | retain_loss: 0.01556 | param_change: 2.444e-05\n",
      "unlearn_cosine_sim=0.984375\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 3.125\n",
      "Topic 0 frozen_forget_activations.norm= 3.203125\n",
      "Topic 0 updated_retain_activations.norm= 3.4375\n",
      "Topic 0 frozen_retain_activations.norm= 3.40625\n",
      " 50%|█████████████████████                     | 75/150 [01:27<01:30,  1.21s/it]loss: 0.05273 | unlearn_loss: 0.0437 | retain_loss: 0.009094 | param_change: 1.037e-05\n",
      "unlearn_cosine_sim=0.984375\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 2.3125\n",
      "Topic 1 frozen_forget_activations.norm= 2.375\n",
      "Topic 1 updated_retain_activations.norm= 3.40625\n",
      "Topic 1 frozen_retain_activations.norm= 3.40625\n",
      " 51%|█████████████████████▎                    | 76/150 [01:28<01:33,  1.27s/it]loss: 0.1021 | unlearn_loss: 0.09521 | retain_loss: 0.006805 | param_change: 1.025e-05\n",
      "unlearn_cosine_sim=0.96484375\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 3.09375\n",
      "Topic 0 frozen_forget_activations.norm= 3.15625\n",
      "Topic 0 updated_retain_activations.norm= 4.9375\n",
      "Topic 0 frozen_retain_activations.norm= 4.90625\n",
      " 51%|█████████████████████▌                    | 77/150 [01:29<01:28,  1.21s/it]loss: 0.06885 | unlearn_loss: 0.06299 | retain_loss: 0.006012 | param_change: 1.067e-05\n",
      "unlearn_cosine_sim=0.984375\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 2.546875\n",
      "Topic 1 frozen_forget_activations.norm= 2.59375\n",
      "Topic 1 updated_retain_activations.norm= 4.9375\n",
      "Topic 1 frozen_retain_activations.norm= 4.90625\n",
      " 52%|█████████████████████▊                    | 78/150 [01:31<01:26,  1.20s/it]loss: 0.1045 | unlearn_loss: 0.09033 | retain_loss: 0.0141 | param_change: 2.372e-05\n",
      "unlearn_cosine_sim=0.9765625\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 3.078125\n",
      "Topic 0 frozen_forget_activations.norm= 3.140625\n",
      "Topic 0 updated_retain_activations.norm= 3.671875\n",
      "Topic 0 frozen_retain_activations.norm= 3.703125\n",
      " 53%|██████████████████████                    | 79/150 [01:32<01:24,  1.18s/it]/sfs/weka/scratch/hua2bv/unlearning/wmdp/rmu/unlearn.py:68: UserWarning: Using a target size (torch.Size([1, 1, 4096])) that is different to the input size (torch.Size([4, 668, 4096])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.\n",
      "  unlearn_loss = torch.nn.functional.mse_loss(\n",
      "loss: 0.09814 | unlearn_loss: 0.08545 | retain_loss: 0.01276 | param_change: 2.718e-05\n",
      "unlearn_cosine_sim=0.98046875\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 2.84375\n",
      "Topic 1 frozen_forget_activations.norm= 2.890625\n",
      "Topic 1 updated_retain_activations.norm= 3.703125\n",
      "Topic 1 frozen_retain_activations.norm= 3.703125\n",
      " 53%|██████████████████████▍                   | 80/150 [01:33<01:23,  1.19s/it]loss: 0.105 | unlearn_loss: 0.09131 | retain_loss: 0.01349 | param_change: 2.992e-05\n",
      "unlearn_cosine_sim=0.94921875\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 3.125\n",
      "Topic 0 frozen_forget_activations.norm= 3.15625\n",
      "Topic 0 updated_retain_activations.norm= 4.09375\n",
      "Topic 0 frozen_retain_activations.norm= 4.125\n",
      " 54%|██████████████████████▋                   | 81/150 [01:34<01:21,  1.17s/it]loss: 0.05762 | unlearn_loss: 0.04419 | retain_loss: 0.01343 | param_change: 3.719e-05\n",
      "unlearn_cosine_sim=0.98046875\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 2.34375\n",
      "Topic 1 frozen_forget_activations.norm= 2.390625\n",
      "Topic 1 updated_retain_activations.norm= 4.1875\n",
      "Topic 1 frozen_retain_activations.norm= 4.125\n",
      " 55%|██████████████████████▉                   | 82/150 [01:35<01:21,  1.20s/it]loss: 0.1045 | unlearn_loss: 0.09277 | retain_loss: 0.01166 | param_change: 1.729e-05\n",
      "unlearn_cosine_sim=0.953125\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 3.125\n",
      "Topic 0 frozen_forget_activations.norm= 3.171875\n",
      "Topic 0 updated_retain_activations.norm= 4.0625\n",
      "Topic 0 frozen_retain_activations.norm= 4.03125\n",
      " 55%|███████████████████████▏                  | 83/150 [01:36<01:17,  1.16s/it]loss: 0.04248 | unlearn_loss: 0.03345 | retain_loss: 0.008911 | param_change: 1.276e-05\n",
      "unlearn_cosine_sim=0.98046875\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 2.1875\n",
      "Topic 1 frozen_forget_activations.norm= 2.25\n",
      "Topic 1 updated_retain_activations.norm= 4.0625\n",
      "Topic 1 frozen_retain_activations.norm= 4.03125\n",
      " 56%|███████████████████████▌                  | 84/150 [01:38<01:17,  1.18s/it]loss: 0.1006 | unlearn_loss: 0.09375 | retain_loss: 0.006989 | param_change: 8.702e-06\n",
      "unlearn_cosine_sim=0.97265625\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 3.109375\n",
      "Topic 0 frozen_forget_activations.norm= 3.15625\n",
      "Topic 0 updated_retain_activations.norm= 3.921875\n",
      "Topic 0 frozen_retain_activations.norm= 3.921875\n",
      " 57%|███████████████████████▊                  | 85/150 [01:39<01:15,  1.16s/it]loss: 0.03979 | unlearn_loss: 0.0332 | retain_loss: 0.006653 | param_change: 8.762e-06\n",
      "unlearn_cosine_sim=0.9765625\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 2.1875\n",
      "Topic 1 frozen_forget_activations.norm= 2.25\n",
      "Topic 1 updated_retain_activations.norm= 3.921875\n",
      "Topic 1 frozen_retain_activations.norm= 3.921875\n",
      " 57%|████████████████████████                  | 86/150 [01:40<01:15,  1.18s/it]loss: 0.09961 | unlearn_loss: 0.09131 | retain_loss: 0.008362 | param_change: 1.365e-05\n",
      "unlearn_cosine_sim=0.94140625\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 3.125\n",
      "Topic 0 frozen_forget_activations.norm= 3.15625\n",
      "Topic 0 updated_retain_activations.norm= 4.78125\n",
      "Topic 0 frozen_retain_activations.norm= 4.78125\n",
      " 58%|████████████████████████▎                 | 87/150 [01:41<01:12,  1.15s/it]loss: 0.05005 | unlearn_loss: 0.04297 | retain_loss: 0.006989 | param_change: 1.144e-05\n",
      "unlearn_cosine_sim=0.96875\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 2.34375\n",
      "Topic 1 frozen_forget_activations.norm= 2.375\n",
      "Topic 1 updated_retain_activations.norm= 4.78125\n",
      "Topic 1 frozen_retain_activations.norm= 4.78125\n",
      " 59%|████████████████████████▋                 | 88/150 [01:42<01:12,  1.16s/it]loss: 0.1025 | unlearn_loss: 0.0957 | retain_loss: 0.00708 | param_change: 1.061e-05\n",
      "unlearn_cosine_sim=0.9375\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 3.1875\n",
      "Topic 0 frozen_forget_activations.norm= 3.171875\n",
      "Topic 0 updated_retain_activations.norm= 4.3125\n",
      "Topic 0 frozen_retain_activations.norm= 4.34375\n",
      " 59%|████████████████████████▉                 | 89/150 [01:43<01:09,  1.13s/it]loss: 0.05835 | unlearn_loss: 0.05273 | retain_loss: 0.005554 | param_change: 6.735e-06\n",
      "unlearn_cosine_sim=0.96875\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 2.484375\n",
      "Topic 1 frozen_forget_activations.norm= 2.5\n",
      "Topic 1 updated_retain_activations.norm= 4.34375\n",
      "Topic 1 frozen_retain_activations.norm= 4.34375\n",
      " 60%|█████████████████████████▏                | 90/150 [01:44<01:10,  1.17s/it]loss: 0.1011 | unlearn_loss: 0.0918 | retain_loss: 0.00946 | param_change: 1.574e-05\n",
      "unlearn_cosine_sim=0.90625\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 3.203125\n",
      "Topic 0 frozen_forget_activations.norm= 3.15625\n",
      "Topic 0 updated_retain_activations.norm= 6.5625\n",
      "Topic 0 frozen_retain_activations.norm= 6.53125\n",
      " 61%|█████████████████████████▍                | 91/150 [01:45<01:05,  1.11s/it]loss: 0.04102 | unlearn_loss: 0.0332 | retain_loss: 0.007935 | param_change: 1.264e-05\n",
      "unlearn_cosine_sim=0.96875\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 2.25\n",
      "Topic 1 frozen_forget_activations.norm= 2.265625\n",
      "Topic 1 updated_retain_activations.norm= 6.53125\n",
      "Topic 1 frozen_retain_activations.norm= 6.53125\n",
      " 61%|█████████████████████████▊                | 92/150 [01:47<01:04,  1.11s/it]loss: 0.1094 | unlearn_loss: 0.0957 | retain_loss: 0.01349 | param_change: 2.789e-05\n",
      "unlearn_cosine_sim=0.94921875\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 3.171875\n",
      "Topic 0 frozen_forget_activations.norm= 3.1875\n",
      "Topic 0 updated_retain_activations.norm= 5.03125\n",
      "Topic 0 frozen_retain_activations.norm= 5.0625\n",
      " 62%|██████████████████████████                | 93/150 [01:48<01:02,  1.10s/it]loss: 0.06445 | unlearn_loss: 0.03296 | retain_loss: 0.03174 | param_change: 8.297e-05\n",
      "unlearn_cosine_sim=0.94921875\n",
      "retain_cosine_sim=0.98828125\n",
      "Topic 1 updated_forget_activations.norm= 2.28125\n",
      "Topic 1 frozen_forget_activations.norm= 2.25\n",
      "Topic 1 updated_retain_activations.norm= 5.15625\n",
      "Topic 1 frozen_retain_activations.norm= 5.0625\n",
      " 63%|██████████████████████████▎               | 94/150 [01:49<01:02,  1.11s/it]loss: 0.123 | unlearn_loss: 0.09082 | retain_loss: 0.03198 | param_change: 3.386e-05\n",
      "unlearn_cosine_sim=0.8828125\n",
      "retain_cosine_sim=0.984375\n",
      "Topic 0 updated_forget_activations.norm= 3.296875\n",
      "Topic 0 frozen_forget_activations.norm= 3.15625\n",
      "Topic 0 updated_retain_activations.norm= 3.671875\n",
      "Topic 0 frozen_retain_activations.norm= 3.71875\n",
      " 63%|██████████████████████████▌               | 95/150 [01:50<01:02,  1.13s/it]loss: 0.06738 | unlearn_loss: 0.0332 | retain_loss: 0.03394 | param_change: 2.73e-05\n",
      "unlearn_cosine_sim=0.9609375\n",
      "retain_cosine_sim=0.984375\n",
      "Topic 1 updated_forget_activations.norm= 2.25\n",
      "Topic 1 frozen_forget_activations.norm= 2.25\n",
      "Topic 1 updated_retain_activations.norm= 3.671875\n",
      "Topic 1 frozen_retain_activations.norm= 3.71875\n",
      " 64%|██████████████████████████▉               | 96/150 [01:51<01:03,  1.18s/it]loss: 0.1289 | unlearn_loss: 0.0957 | retain_loss: 0.03369 | param_change: 3.099e-05\n",
      "unlearn_cosine_sim=0.91015625\n",
      "retain_cosine_sim=0.984375\n",
      "Topic 0 updated_forget_activations.norm= 3.3125\n",
      "Topic 0 frozen_forget_activations.norm= 3.1875\n",
      "Topic 0 updated_retain_activations.norm= 3.5625\n",
      "Topic 0 frozen_retain_activations.norm= 3.625\n",
      " 65%|███████████████████████████▏              | 97/150 [01:52<01:02,  1.18s/it]loss: 0.05518 | unlearn_loss: 0.03345 | retain_loss: 0.02161 | param_change: 2.48e-05\n",
      "unlearn_cosine_sim=0.97265625\n",
      "retain_cosine_sim=0.9921875\n",
      "Topic 1 updated_forget_activations.norm= 2.234375\n",
      "Topic 1 frozen_forget_activations.norm= 2.25\n",
      "Topic 1 updated_retain_activations.norm= 3.65625\n",
      "Topic 1 frozen_retain_activations.norm= 3.625\n",
      " 65%|███████████████████████████▍              | 98/150 [01:54<01:04,  1.25s/it]loss: 0.1099 | unlearn_loss: 0.09473 | retain_loss: 0.01532 | param_change: 1.884e-05\n",
      "unlearn_cosine_sim=0.8984375\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 3.359375\n",
      "Topic 0 frozen_forget_activations.norm= 3.15625\n",
      "Topic 0 updated_retain_activations.norm= 4.3125\n",
      "Topic 0 frozen_retain_activations.norm= 4.34375\n",
      " 66%|███████████████████████████▋              | 99/150 [01:55<01:00,  1.19s/it]loss: 0.04663 | unlearn_loss: 0.03296 | retain_loss: 0.01373 | param_change: 1.538e-05\n",
      "unlearn_cosine_sim=0.92578125\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 2.34375\n",
      "Topic 1 frozen_forget_activations.norm= 2.25\n",
      "Topic 1 updated_retain_activations.norm= 4.34375\n",
      "Topic 1 frozen_retain_activations.norm= 4.34375\n",
      " 67%|███████████████████████████▎             | 100/150 [01:56<01:01,  1.22s/it]loss: 0.1113 | unlearn_loss: 0.09229 | retain_loss: 0.01917 | param_change: 2.742e-05\n",
      "unlearn_cosine_sim=0.83984375\n",
      "retain_cosine_sim=0.9921875\n",
      "Topic 0 updated_forget_activations.norm= 3.53125\n",
      "Topic 0 frozen_forget_activations.norm= 3.171875\n",
      "Topic 0 updated_retain_activations.norm= 6.40625\n",
      "Topic 0 frozen_retain_activations.norm= 6.4375\n",
      " 67%|███████████████████████████▌             | 101/150 [01:57<00:56,  1.15s/it]loss: 0.05444 | unlearn_loss: 0.03296 | retain_loss: 0.02148 | param_change: 5.722e-05\n",
      "unlearn_cosine_sim=0.9140625\n",
      "retain_cosine_sim=0.9921875\n",
      "Topic 1 updated_forget_activations.norm= 2.375\n",
      "Topic 1 frozen_forget_activations.norm= 2.25\n",
      "Topic 1 updated_retain_activations.norm= 6.5\n",
      "Topic 1 frozen_retain_activations.norm= 6.4375\n",
      " 68%|███████████████████████████▉             | 102/150 [01:58<00:56,  1.17s/it]loss: 0.1055 | unlearn_loss: 0.0918 | retain_loss: 0.01379 | param_change: 2.503e-05\n",
      "unlearn_cosine_sim=0.86328125\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 3.421875\n",
      "Topic 0 frozen_forget_activations.norm= 3.15625\n",
      "Topic 0 updated_retain_activations.norm= 4.46875\n",
      "Topic 0 frozen_retain_activations.norm= 4.4375\n",
      " 69%|████████████████████████████▏            | 103/150 [02:00<00:54,  1.16s/it]loss: 0.04419 | unlearn_loss: 0.03271 | retain_loss: 0.01154 | param_change: 1.86e-05\n",
      "unlearn_cosine_sim=0.890625\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 2.40625\n",
      "Topic 1 frozen_forget_activations.norm= 2.25\n",
      "Topic 1 updated_retain_activations.norm= 4.4375\n",
      "Topic 1 frozen_retain_activations.norm= 4.4375\n",
      " 69%|████████████████████████████▍            | 104/150 [02:01<00:54,  1.18s/it]loss: 0.104 | unlearn_loss: 0.09033 | retain_loss: 0.01361 | param_change: 1.436e-05\n",
      "unlearn_cosine_sim=0.8046875\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 3.59375\n",
      "Topic 0 frozen_forget_activations.norm= 3.140625\n",
      "Topic 0 updated_retain_activations.norm= 3.640625\n",
      "Topic 0 frozen_retain_activations.norm= 3.640625\n",
      " 70%|████████████████████████████▋            | 105/150 [02:02<00:52,  1.17s/it]loss: 0.04565 | unlearn_loss: 0.03247 | retain_loss: 0.01318 | param_change: 1.848e-05\n",
      "unlearn_cosine_sim=0.84765625\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 2.59375\n",
      "Topic 1 frozen_forget_activations.norm= 2.25\n",
      "Topic 1 updated_retain_activations.norm= 3.625\n",
      "Topic 1 frozen_retain_activations.norm= 3.640625\n",
      " 71%|████████████████████████████▉            | 106/150 [02:03<00:55,  1.25s/it]loss: 0.1094 | unlearn_loss: 0.08984 | retain_loss: 0.01965 | param_change: 2.062e-05\n",
      "unlearn_cosine_sim=0.71875\n",
      "retain_cosine_sim=0.9921875\n",
      "Topic 0 updated_forget_activations.norm= 4.0\n",
      "Topic 0 frozen_forget_activations.norm= 3.140625\n",
      "Topic 0 updated_retain_activations.norm= 4.3125\n",
      "Topic 0 frozen_retain_activations.norm= 4.375\n",
      " 71%|█████████████████████████████▏           | 107/150 [02:04<00:51,  1.21s/it]loss: 0.04785 | unlearn_loss: 0.03149 | retain_loss: 0.01636 | param_change: 1.669e-05\n",
      "unlearn_cosine_sim=0.734375\n",
      "retain_cosine_sim=0.9921875\n",
      "Topic 1 updated_forget_activations.norm= 2.78125\n",
      "Topic 1 frozen_forget_activations.norm= 2.203125\n",
      "Topic 1 updated_retain_activations.norm= 4.34375\n",
      "Topic 1 frozen_retain_activations.norm= 4.375\n",
      " 72%|█████████████████████████████▌           | 108/150 [02:06<00:53,  1.27s/it]loss: 0.1074 | unlearn_loss: 0.0918 | retain_loss: 0.01562 | param_change: 1.74e-05\n",
      "unlearn_cosine_sim=0.79296875\n",
      "retain_cosine_sim=0.9921875\n",
      "Topic 0 updated_forget_activations.norm= 3.796875\n",
      "Topic 0 frozen_forget_activations.norm= 3.1875\n",
      "Topic 0 updated_retain_activations.norm= 4.65625\n",
      "Topic 0 frozen_retain_activations.norm= 4.6875\n",
      " 73%|█████████████████████████████▊           | 109/150 [02:07<00:49,  1.20s/it]loss: 0.04346 | unlearn_loss: 0.03088 | retain_loss: 0.01263 | param_change: 1.74e-05\n",
      "unlearn_cosine_sim=0.64453125\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 3.109375\n",
      "Topic 1 frozen_forget_activations.norm= 2.203125\n",
      "Topic 1 updated_retain_activations.norm= 4.6875\n",
      "Topic 1 frozen_retain_activations.norm= 4.6875\n",
      " 73%|██████████████████████████████           | 110/150 [02:09<00:54,  1.37s/it]loss: 0.1045 | unlearn_loss: 0.08984 | retain_loss: 0.01483 | param_change: 1.55e-05\n",
      "unlearn_cosine_sim=0.58984375\n",
      "retain_cosine_sim=0.9921875\n",
      "Topic 0 updated_forget_activations.norm= 4.625\n",
      "Topic 0 frozen_forget_activations.norm= 3.15625\n",
      "Topic 0 updated_retain_activations.norm= 5.96875\n",
      "Topic 0 frozen_retain_activations.norm= 5.9375\n",
      " 74%|██████████████████████████████▎          | 111/150 [02:10<00:48,  1.25s/it]loss: 0.04346 | unlearn_loss: 0.03052 | retain_loss: 0.01288 | param_change: 1.448e-05\n",
      "unlearn_cosine_sim=0.57421875\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 3.734375\n",
      "Topic 1 frozen_forget_activations.norm= 2.25\n",
      "Topic 1 updated_retain_activations.norm= 5.96875\n",
      "Topic 1 frozen_retain_activations.norm= 5.9375\n",
      " 75%|██████████████████████████████▌          | 112/150 [02:11<00:46,  1.23s/it]loss: 0.1025 | unlearn_loss: 0.08936 | retain_loss: 0.01294 | param_change: 1.502e-05\n",
      "unlearn_cosine_sim=0.625\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 4.75\n",
      "Topic 0 frozen_forget_activations.norm= 3.1875\n",
      "Topic 0 updated_retain_activations.norm= 4.71875\n",
      "Topic 0 frozen_retain_activations.norm= 4.6875\n",
      " 75%|██████████████████████████████▉          | 113/150 [02:12<00:43,  1.18s/it]loss: 0.04053 | unlearn_loss: 0.02979 | retain_loss: 0.01068 | param_change: 1.27e-05\n",
      "unlearn_cosine_sim=0.482421875\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 4.5625\n",
      "Topic 1 frozen_forget_activations.norm= 2.25\n",
      "Topic 1 updated_retain_activations.norm= 4.71875\n",
      "Topic 1 frozen_retain_activations.norm= 4.6875\n",
      " 76%|███████████████████████████████▏         | 114/150 [02:13<00:43,  1.21s/it]loss: 0.1011 | unlearn_loss: 0.08838 | retain_loss: 0.01251 | param_change: 1.562e-05\n",
      "unlearn_cosine_sim=0.5\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 5.46875\n",
      "Topic 0 frozen_forget_activations.norm= 3.171875\n",
      "Topic 0 updated_retain_activations.norm= 3.875\n",
      "Topic 0 frozen_retain_activations.norm= 3.859375\n",
      " 77%|███████████████████████████████▍         | 115/150 [02:14<00:41,  1.19s/it]loss: 0.03882 | unlearn_loss: 0.02893 | retain_loss: 0.009827 | param_change: 1.091e-05\n",
      "unlearn_cosine_sim=0.416015625\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 4.78125\n",
      "Topic 1 frozen_forget_activations.norm= 2.21875\n",
      "Topic 1 updated_retain_activations.norm= 3.84375\n",
      "Topic 1 frozen_retain_activations.norm= 3.859375\n",
      " 77%|███████████████████████████████▋         | 116/150 [02:16<00:42,  1.24s/it]loss: 0.104 | unlearn_loss: 0.09082 | retain_loss: 0.01318 | param_change: 1.609e-05\n",
      "unlearn_cosine_sim=0.71875\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 4.0625\n",
      "Topic 0 frozen_forget_activations.norm= 3.1875\n",
      "Topic 0 updated_retain_activations.norm= 3.921875\n",
      "Topic 0 frozen_retain_activations.norm= 3.921875\n",
      " 78%|███████████████████████████████▉         | 117/150 [02:17<00:39,  1.20s/it]loss: 0.03857 | unlearn_loss: 0.02783 | retain_loss: 0.01086 | param_change: 1.103e-05\n",
      "unlearn_cosine_sim=0.365234375\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 5.25\n",
      "Topic 1 frozen_forget_activations.norm= 2.25\n",
      "Topic 1 updated_retain_activations.norm= 3.921875\n",
      "Topic 1 frozen_retain_activations.norm= 3.921875\n",
      " 79%|████████████████████████████████▎        | 118/150 [02:18<00:39,  1.24s/it]loss: 0.09912 | unlearn_loss: 0.08789 | retain_loss: 0.01135 | param_change: 1.389e-05\n",
      "unlearn_cosine_sim=0.6796875\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 4.3125\n",
      "Topic 0 frozen_forget_activations.norm= 3.140625\n",
      "Topic 0 updated_retain_activations.norm= 4.0625\n",
      "Topic 0 frozen_retain_activations.norm= 4.0625\n",
      " 79%|████████████████████████████████▌        | 119/150 [02:19<00:37,  1.20s/it]loss: 0.03833 | unlearn_loss: 0.0282 | retain_loss: 0.01013 | param_change: 1.109e-05\n",
      "unlearn_cosine_sim=0.466796875\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 4.40625\n",
      "Topic 1 frozen_forget_activations.norm= 2.21875\n",
      "Topic 1 updated_retain_activations.norm= 4.0625\n",
      "Topic 1 frozen_retain_activations.norm= 4.0625\n",
      " 80%|████████████████████████████████▊        | 120/150 [02:21<00:37,  1.25s/it]loss: 0.1045 | unlearn_loss: 0.09033 | retain_loss: 0.0141 | param_change: 1.383e-05\n",
      "unlearn_cosine_sim=0.7734375\n",
      "retain_cosine_sim=0.9921875\n",
      "Topic 0 updated_forget_activations.norm= 4.1875\n",
      "Topic 0 frozen_forget_activations.norm= 3.1875\n",
      "Topic 0 updated_retain_activations.norm= 4.15625\n",
      "Topic 0 frozen_retain_activations.norm= 4.1875\n",
      " 81%|█████████████████████████████████        | 121/150 [02:22<00:34,  1.21s/it]loss: 0.03809 | unlearn_loss: 0.0271 | retain_loss: 0.01111 | param_change: 1.121e-05\n",
      "unlearn_cosine_sim=0.390625\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 5.0\n",
      "Topic 1 frozen_forget_activations.norm= 2.265625\n",
      "Topic 1 updated_retain_activations.norm= 4.15625\n",
      "Topic 1 frozen_retain_activations.norm= 4.1875\n",
      " 81%|█████████████████████████████████▎       | 122/150 [02:23<00:34,  1.23s/it]loss: 0.103 | unlearn_loss: 0.08984 | retain_loss: 0.01324 | param_change: 2.217e-05\n",
      "unlearn_cosine_sim=0.322265625\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 7.0\n",
      "Topic 0 frozen_forget_activations.norm= 3.15625\n",
      "Topic 0 updated_retain_activations.norm= 4.25\n",
      "Topic 0 frozen_retain_activations.norm= 4.25\n",
      " 82%|█████████████████████████████████▌       | 123/150 [02:24<00:31,  1.17s/it]loss: 0.03662 | unlearn_loss: 0.02649 | retain_loss: 0.01001 | param_change: 1.246e-05\n",
      "unlearn_cosine_sim=0.365234375\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 5.15625\n",
      "Topic 1 frozen_forget_activations.norm= 2.25\n",
      "Topic 1 updated_retain_activations.norm= 4.21875\n",
      "Topic 1 frozen_retain_activations.norm= 4.25\n",
      " 83%|█████████████████████████████████▉       | 124/150 [02:25<00:31,  1.20s/it]loss: 0.103 | unlearn_loss: 0.08789 | retain_loss: 0.01508 | param_change: 1.824e-05\n",
      "unlearn_cosine_sim=0.53515625\n",
      "retain_cosine_sim=0.9921875\n",
      "Topic 0 updated_forget_activations.norm= 5.65625\n",
      "Topic 0 frozen_forget_activations.norm= 3.1875\n",
      "Topic 0 updated_retain_activations.norm= 4.90625\n",
      "Topic 0 frozen_retain_activations.norm= 4.9375\n",
      " 83%|██████████████████████████████████▏      | 125/150 [02:26<00:29,  1.16s/it]loss: 0.0376 | unlearn_loss: 0.02649 | retain_loss: 0.01099 | param_change: 1.168e-05\n",
      "unlearn_cosine_sim=0.373046875\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 5.34375\n",
      "Topic 1 frozen_forget_activations.norm= 2.25\n",
      "Topic 1 updated_retain_activations.norm= 4.9375\n",
      "Topic 1 frozen_retain_activations.norm= 4.9375\n",
      " 84%|██████████████████████████████████▍      | 126/150 [02:28<00:28,  1.19s/it]loss: 0.09521 | unlearn_loss: 0.08496 | retain_loss: 0.01031 | param_change: 1.27e-05\n",
      "unlearn_cosine_sim=0.51171875\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 5.34375\n",
      "Topic 0 frozen_forget_activations.norm= 3.125\n",
      "Topic 0 updated_retain_activations.norm= 4.5\n",
      "Topic 0 frozen_retain_activations.norm= 4.5\n",
      " 85%|██████████████████████████████████▋      | 127/150 [02:29<00:26,  1.14s/it]loss: 0.03564 | unlearn_loss: 0.02673 | retain_loss: 0.008972 | param_change: 1.037e-05\n",
      "unlearn_cosine_sim=0.3984375\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 4.9375\n",
      "Topic 1 frozen_forget_activations.norm= 2.203125\n",
      "Topic 1 updated_retain_activations.norm= 4.5\n",
      "Topic 1 frozen_retain_activations.norm= 4.5\n",
      " 85%|██████████████████████████████████▉      | 128/150 [02:30<00:26,  1.19s/it]loss: 0.09668 | unlearn_loss: 0.08496 | retain_loss: 0.01172 | param_change: 1.335e-05\n",
      "unlearn_cosine_sim=0.27734375\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 7.4375\n",
      "Topic 0 frozen_forget_activations.norm= 3.15625\n",
      "Topic 0 updated_retain_activations.norm= 4.125\n",
      "Topic 0 frozen_retain_activations.norm= 4.125\n",
      " 86%|███████████████████████████████████▎     | 129/150 [02:31<00:24,  1.16s/it]loss: 0.03516 | unlearn_loss: 0.02527 | retain_loss: 0.009888 | param_change: 1.037e-05\n",
      "unlearn_cosine_sim=0.2890625\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 6.0\n",
      "Topic 1 frozen_forget_activations.norm= 2.21875\n",
      "Topic 1 updated_retain_activations.norm= 4.125\n",
      "Topic 1 frozen_retain_activations.norm= 4.125\n",
      " 87%|███████████████████████████████████▌     | 130/150 [02:32<00:23,  1.19s/it]loss: 0.09668 | unlearn_loss: 0.08545 | retain_loss: 0.01147 | param_change: 1.174e-05\n",
      "unlearn_cosine_sim=0.361328125\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 6.75\n",
      "Topic 0 frozen_forget_activations.norm= 3.140625\n",
      "Topic 0 updated_retain_activations.norm= 5.1875\n",
      "Topic 0 frozen_retain_activations.norm= 5.1875\n",
      " 87%|███████████████████████████████████▊     | 131/150 [02:33<00:21,  1.14s/it]loss: 0.03564 | unlearn_loss: 0.02551 | retain_loss: 0.01007 | param_change: 9.716e-06\n",
      "unlearn_cosine_sim=0.298828125\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 6.40625\n",
      "Topic 1 frozen_forget_activations.norm= 2.25\n",
      "Topic 1 updated_retain_activations.norm= 5.1875\n",
      "Topic 1 frozen_retain_activations.norm= 5.1875\n",
      " 88%|████████████████████████████████████     | 132/150 [02:34<00:20,  1.16s/it]loss: 0.09814 | unlearn_loss: 0.08789 | retain_loss: 0.01025 | param_change: 1.204e-05\n",
      "unlearn_cosine_sim=0.3125\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 7.09375\n",
      "Topic 0 frozen_forget_activations.norm= 3.15625\n",
      "Topic 0 updated_retain_activations.norm= 4.625\n",
      "Topic 0 frozen_retain_activations.norm= 4.625\n",
      " 89%|████████████████████████████████████▎    | 133/150 [02:36<00:19,  1.12s/it]loss: 0.03613 | unlearn_loss: 0.02759 | retain_loss: 0.008545 | param_change: 9.596e-06\n",
      "unlearn_cosine_sim=0.5\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 4.59375\n",
      "Topic 1 frozen_forget_activations.norm= 2.234375\n",
      "Topic 1 updated_retain_activations.norm= 4.625\n",
      "Topic 1 frozen_retain_activations.norm= 4.625\n",
      " 89%|████████████████████████████████████▋    | 134/150 [02:37<00:18,  1.16s/it]loss: 0.0957 | unlearn_loss: 0.08691 | retain_loss: 0.00885 | param_change: 1.472e-05\n",
      "unlearn_cosine_sim=0.396484375\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 5.90625\n",
      "Topic 0 frozen_forget_activations.norm= 3.15625\n",
      "Topic 0 updated_retain_activations.norm= 4.28125\n",
      "Topic 0 frozen_retain_activations.norm= 4.28125\n",
      " 90%|████████████████████████████████████▉    | 135/150 [02:38<00:16,  1.13s/it]loss: 0.03174 | unlearn_loss: 0.02454 | retain_loss: 0.007233 | param_change: 1.049e-05\n",
      "unlearn_cosine_sim=0.18359375\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 7.71875\n",
      "Topic 1 frozen_forget_activations.norm= 2.203125\n",
      "Topic 1 updated_retain_activations.norm= 4.25\n",
      "Topic 1 frozen_retain_activations.norm= 4.28125\n",
      " 91%|█████████████████████████████████████▏   | 136/150 [02:39<00:16,  1.21s/it]loss: 0.09521 | unlearn_loss: 0.08496 | retain_loss: 0.01007 | param_change: 1.174e-05\n",
      "unlearn_cosine_sim=0.326171875\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 6.46875\n",
      "Topic 0 frozen_forget_activations.norm= 3.1875\n",
      "Topic 0 updated_retain_activations.norm= 3.765625\n",
      "Topic 0 frozen_retain_activations.norm= 3.75\n",
      " 91%|█████████████████████████████████████▍   | 137/150 [02:40<00:15,  1.21s/it]loss: 0.03271 | unlearn_loss: 0.02429 | retain_loss: 0.008362 | param_change: 8.047e-06\n",
      "unlearn_cosine_sim=0.18359375\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 7.53125\n",
      "Topic 1 frozen_forget_activations.norm= 2.203125\n",
      "Topic 1 updated_retain_activations.norm= 3.75\n",
      "Topic 1 frozen_retain_activations.norm= 3.75\n",
      " 92%|█████████████████████████████████████▋   | 138/150 [02:42<00:16,  1.41s/it]loss: 0.0957 | unlearn_loss: 0.08643 | retain_loss: 0.009094 | param_change: 1.246e-05\n",
      "unlearn_cosine_sim=0.39453125\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 6.0625\n",
      "Topic 0 frozen_forget_activations.norm= 3.15625\n",
      "Topic 0 updated_retain_activations.norm= 4.1875\n",
      "Topic 0 frozen_retain_activations.norm= 4.21875\n",
      " 93%|█████████████████████████████████████▉   | 139/150 [02:43<00:14,  1.32s/it]loss: 0.03271 | unlearn_loss: 0.02441 | retain_loss: 0.008362 | param_change: 1.085e-05\n",
      "unlearn_cosine_sim=0.2236328125\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 6.9375\n",
      "Topic 1 frozen_forget_activations.norm= 2.25\n",
      "Topic 1 updated_retain_activations.norm= 4.1875\n",
      "Topic 1 frozen_retain_activations.norm= 4.21875\n",
      " 93%|██████████████████████████████████████▎  | 140/150 [02:45<00:13,  1.31s/it]loss: 0.08496 | unlearn_loss: 0.0752 | retain_loss: 0.009583 | param_change: 1.198e-05\n",
      "unlearn_cosine_sim=0.236328125\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 7.40625\n",
      "Topic 0 frozen_forget_activations.norm= 3.0\n",
      "Topic 0 updated_retain_activations.norm= 5.46875\n",
      "Topic 0 frozen_retain_activations.norm= 5.5\n",
      " 94%|██████████████████████████████████████▌  | 141/150 [02:46<00:11,  1.22s/it]loss: 0.03271 | unlearn_loss: 0.02539 | retain_loss: 0.007446 | param_change: 8.523e-06\n",
      "unlearn_cosine_sim=0.310546875\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 6.0\n",
      "Topic 1 frozen_forget_activations.norm= 2.21875\n",
      "Topic 1 updated_retain_activations.norm= 5.5\n",
      "Topic 1 frozen_retain_activations.norm= 5.5\n",
      " 95%|██████████████████████████████████████▊  | 142/150 [02:47<00:09,  1.22s/it]loss: 0.1001 | unlearn_loss: 0.0874 | retain_loss: 0.01282 | param_change: 1.311e-05\n",
      "unlearn_cosine_sim=0.24609375\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 7.5\n",
      "Topic 0 frozen_forget_activations.norm= 3.1875\n",
      "Topic 0 updated_retain_activations.norm= 5.34375\n",
      "Topic 0 frozen_retain_activations.norm= 5.34375\n",
      " 95%|███████████████████████████████████████  | 143/150 [02:48<00:08,  1.17s/it]loss: 0.03613 | unlearn_loss: 0.02576 | retain_loss: 0.0105 | param_change: 1.079e-05\n",
      "unlearn_cosine_sim=0.357421875\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 5.5\n",
      "Topic 1 frozen_forget_activations.norm= 2.25\n",
      "Topic 1 updated_retain_activations.norm= 5.34375\n",
      "Topic 1 frozen_retain_activations.norm= 5.34375\n",
      " 96%|███████████████████████████████████████▎ | 144/150 [02:49<00:07,  1.17s/it]loss: 0.1001 | unlearn_loss: 0.08789 | retain_loss: 0.01233 | param_change: 1.788e-05\n",
      "unlearn_cosine_sim=0.43359375\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 6.0625\n",
      "Topic 0 frozen_forget_activations.norm= 3.15625\n",
      "Topic 0 updated_retain_activations.norm= 5.4375\n",
      "Topic 0 frozen_retain_activations.norm= 5.40625\n",
      " 97%|███████████████████████████████████████▋ | 145/150 [02:50<00:05,  1.12s/it]loss: 0.03394 | unlearn_loss: 0.02502 | retain_loss: 0.00885 | param_change: 1.103e-05\n",
      "unlearn_cosine_sim=0.296875\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 5.78125\n",
      "Topic 1 frozen_forget_activations.norm= 2.25\n",
      "Topic 1 updated_retain_activations.norm= 5.4375\n",
      "Topic 1 frozen_retain_activations.norm= 5.40625\n",
      " 97%|███████████████████████████████████████▉ | 146/150 [02:51<00:04,  1.14s/it]loss: 0.09375 | unlearn_loss: 0.08594 | retain_loss: 0.007996 | param_change: 1.079e-05\n",
      "unlearn_cosine_sim=0.423828125\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 6.4375\n",
      "Topic 0 frozen_forget_activations.norm= 3.1875\n",
      "Topic 0 updated_retain_activations.norm= 5.0\n",
      "Topic 0 frozen_retain_activations.norm= 5.0\n",
      " 98%|████████████████████████████████████████▏| 147/150 [02:52<00:03,  1.11s/it]loss: 0.03174 | unlearn_loss: 0.02527 | retain_loss: 0.006409 | param_change: 9.954e-06\n",
      "unlearn_cosine_sim=0.3125\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 5.6875\n",
      "Topic 1 frozen_forget_activations.norm= 2.265625\n",
      "Topic 1 updated_retain_activations.norm= 5.0\n",
      "Topic 1 frozen_retain_activations.norm= 5.0\n",
      " 99%|████████████████████████████████████████▍| 148/150 [02:54<00:02,  1.15s/it]loss: 0.1001 | unlearn_loss: 0.0874 | retain_loss: 0.01276 | param_change: 1.74e-05\n",
      "unlearn_cosine_sim=0.216796875\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 0 updated_forget_activations.norm= 7.875\n",
      "Topic 0 frozen_forget_activations.norm= 3.21875\n",
      "Topic 0 updated_retain_activations.norm= 4.71875\n",
      "Topic 0 frozen_retain_activations.norm= 4.71875\n",
      " 99%|████████████████████████████████████████▋| 149/150 [02:55<00:01,  1.10s/it]loss: 0.03638 | unlearn_loss: 0.02502 | retain_loss: 0.01135 | param_change: 2.217e-05\n",
      "unlearn_cosine_sim=0.28125\n",
      "retain_cosine_sim=0.99609375\n",
      "Topic 1 updated_forget_activations.norm= 6.1875\n",
      "Topic 1 frozen_forget_activations.norm= 2.234375\n",
      "Topic 1 updated_retain_activations.norm= 4.71875\n",
      "Topic 1 frozen_retain_activations.norm= 4.71875\n",
      "100%|█████████████████████████████████████████| 150/150 [02:56<00:00,  1.18s/it]\n",
      "Saved model to models/zephyr_rmu\n"
     ]
    }
   ],
   "source": [
    "# Best run so far: unlearn the bio/cyber forget corpora with rmu.unlearn.\n",
    "import os\n",
    "\n",
    "# Processes launched with `!` inherit the kernel environment, so the training\n",
    "# run below is restricted to GPUs 0 and 1.\n",
    "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0,1\"\n",
    "\n",
    "# Single source of truth for the checkpoint directory: the cleanup step and the\n",
    "# training command must always point at the same path.\n",
    "OUTPUT_DIR = \"models/zephyr_rmu\"\n",
    "\n",
    "# Remove any checkpoint left by a previous run so stale files cannot linger.\n",
    "!rm -rf -- \"{OUTPUT_DIR}\"\n",
    "!python3 -m rmu.unlearn --max_num_batches 150 --batch_size=4 --retain_corpora wikitext,wikitext --forget_corpora bio-forget-corpus,cyber-forget-corpus --steering_coeffs 6.5,6.5 --alpha 1200,1200 --lr 5e-5 --seed 42 --output_dir \"{OUTPUT_DIR}\" --verbose\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2024-05-11:16:44:34,459 INFO     [__main__.py:251] Verbosity set to INFO\n",
      "2024-05-11:16:44:39,652 INFO     [__main__.py:335] Selected Tasks: ['mmlu', 'wmdp']\n",
      "2024-05-11:16:44:39,652 INFO     [__main__.py:336] Loading selected tasks...\n",
      "2024-05-11:16:44:39,661 INFO     [evaluator.py:131] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234\n",
      "2024-05-11:16:44:40,551 WARNING  [logging.py:61] Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.\n",
      "2024-05-11:16:44:40,552 INFO     [huggingface.py:162] Using device 'cuda'\n",
      "Loading checkpoint shards: 100%|██████████████████| 3/3 [00:03<00:00,  1.02s/it]\n",
      "You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers\n",
      "/home/hua2bv/.local/lib/python3.11/site-packages/datasets/load.py:1486: FutureWarning: The repository for hails/mmlu_no_train contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/hails/mmlu_no_train\n",
      "You can avoid this message in future by passing the argument `trust_remote_code=True`.\n",
      "Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.\n",
      "  warnings.warn(\n",
      "2024-05-11:16:45:08,023 WARNING  [task.py:322] [Task: wmdp_chem] has_training_docs and has_validation_docs are False, using test_docs as fewshot_docs but this is not recommended.\n",
      "2024-05-11:16:45:08,023 WARNING  [task.py:322] [Task: wmdp_chem] has_training_docs and has_validation_docs are False, using test_docs as fewshot_docs but this is not recommended.\n",
      "2024-05-11:16:45:08,557 WARNING  [task.py:322] [Task: wmdp_cyber] has_training_docs and has_validation_docs are False, using test_docs as fewshot_docs but this is not recommended.\n",
      "2024-05-11:16:45:08,557 WARNING  [task.py:322] [Task: wmdp_cyber] has_training_docs and has_validation_docs are False, using test_docs as fewshot_docs but this is not recommended.\n",
      "2024-05-11:16:45:09,412 WARNING  [task.py:322] [Task: wmdp_bio] has_training_docs and has_validation_docs are False, using test_docs as fewshot_docs but this is not recommended.\n",
      "2024-05-11:16:45:09,412 WARNING  [task.py:322] [Task: wmdp_bio] has_training_docs and has_validation_docs are False, using test_docs as fewshot_docs but this is not recommended.\n",
      "2024-05-11:16:45:09,454 INFO     [task.py:395] Building contexts for wmdp_bio on rank 0...\n",
      "100%|██████████████████████████████████████| 1273/1273 [00:01<00:00, 990.17it/s]\n",
      "2024-05-11:16:45:10,779 INFO     [task.py:395] Building contexts for wmdp_cyber on rank 0...\n",
      "100%|██████████████████████████████████████| 1987/1987 [00:02<00:00, 916.26it/s]\n",
      "2024-05-11:16:45:13,008 INFO     [task.py:395] Building contexts for wmdp_chem on rank 0...\n",
      "100%|████████████████████████████████████████| 408/408 [00:00<00:00, 987.57it/s]\n",
      "2024-05-11:16:45:13,434 INFO     [task.py:395] Building contexts for mmlu_world_religions on rank 0...\n",
      "100%|███████████████████████████████████████| 171/171 [00:00<00:00, 1002.22it/s]\n",
      "2024-05-11:16:45:13,611 INFO     [task.py:395] Building contexts for mmlu_high_school_world_history on rank 0...\n",
      "100%|████████████████████████████████████████| 237/237 [00:00<00:00, 959.84it/s]\n",
      "2024-05-11:16:45:13,867 INFO     [task.py:395] Building contexts for mmlu_prehistory on rank 0...\n",
      "100%|████████████████████████████████████████| 324/324 [00:00<00:00, 997.72it/s]\n",
      "2024-05-11:16:45:14,203 INFO     [task.py:395] Building contexts for mmlu_moral_disputes on rank 0...\n",
      "100%|████████████████████████████████████████| 346/346 [00:00<00:00, 998.06it/s]\n",
      "2024-05-11:16:45:14,562 INFO     [task.py:395] Building contexts for mmlu_philosophy on rank 0...\n",
      "100%|████████████████████████████████████████| 311/311 [00:00<00:00, 988.31it/s]\n",
      "2024-05-11:16:45:14,888 INFO     [task.py:395] Building contexts for mmlu_formal_logic on rank 0...\n",
      "100%|███████████████████████████████████████| 126/126 [00:00<00:00, 1001.86it/s]\n",
      "2024-05-11:16:45:15,019 INFO     [task.py:395] Building contexts for mmlu_high_school_european_history on rank 0...\n",
      "100%|████████████████████████████████████████| 165/165 [00:00<00:00, 993.53it/s]\n",
      "2024-05-11:16:45:15,192 INFO     [task.py:395] Building contexts for mmlu_professional_law on rank 0...\n",
      "100%|██████████████████████████████████████| 1534/1534 [00:01<00:00, 992.44it/s]\n",
      "2024-05-11:16:45:16,794 INFO     [task.py:395] Building contexts for mmlu_international_law on rank 0...\n",
      "100%|████████████████████████████████████████| 121/121 [00:00<00:00, 989.26it/s]\n",
      "2024-05-11:16:45:16,921 INFO     [task.py:395] Building contexts for mmlu_jurisprudence on rank 0...\n",
      "100%|████████████████████████████████████████| 108/108 [00:00<00:00, 988.14it/s]\n",
      "2024-05-11:16:45:17,035 INFO     [task.py:395] Building contexts for mmlu_high_school_us_history on rank 0...\n",
      "100%|████████████████████████████████████████| 204/204 [00:00<00:00, 994.14it/s]\n",
      "2024-05-11:16:45:17,248 INFO     [task.py:395] Building contexts for mmlu_moral_scenarios on rank 0...\n",
      "100%|████████████████████████████████████████| 895/895 [00:00<00:00, 997.79it/s]\n",
      "2024-05-11:16:45:18,178 INFO     [task.py:395] Building contexts for mmlu_logical_fallacies on rank 0...\n",
      "100%|████████████████████████████████████████| 163/163 [00:00<00:00, 997.91it/s]\n",
      "2024-05-11:16:45:18,348 INFO     [task.py:395] Building contexts for mmlu_econometrics on rank 0...\n",
      "100%|████████████████████████████████████████| 114/114 [00:00<00:00, 995.54it/s]\n",
      "2024-05-11:16:45:18,467 INFO     [task.py:395] Building contexts for mmlu_high_school_microeconomics on rank 0...\n",
      "100%|████████████████████████████████████████| 238/238 [00:00<00:00, 995.54it/s]\n",
      "2024-05-11:16:45:18,714 INFO     [task.py:395] Building contexts for mmlu_high_school_psychology on rank 0...\n",
      "100%|████████████████████████████████████████| 545/545 [00:00<00:00, 989.03it/s]\n",
      "2024-05-11:16:45:19,285 INFO     [task.py:395] Building contexts for mmlu_security_studies on rank 0...\n",
      "100%|████████████████████████████████████████| 245/245 [00:00<00:00, 985.56it/s]\n",
      "2024-05-11:16:45:19,543 INFO     [task.py:395] Building contexts for mmlu_high_school_geography on rank 0...\n",
      "100%|████████████████████████████████████████| 198/198 [00:00<00:00, 986.15it/s]\n",
      "2024-05-11:16:45:19,751 INFO     [task.py:395] Building contexts for mmlu_high_school_macroeconomics on rank 0...\n",
      "100%|████████████████████████████████████████| 390/390 [00:00<00:00, 989.16it/s]\n",
      "2024-05-11:16:45:20,160 INFO     [task.py:395] Building contexts for mmlu_sociology on rank 0...\n",
      "100%|████████████████████████████████████████| 201/201 [00:00<00:00, 986.20it/s]\n",
      "2024-05-11:16:45:20,371 INFO     [task.py:395] Building contexts for mmlu_public_relations on rank 0...\n",
      "100%|████████████████████████████████████████| 110/110 [00:00<00:00, 983.69it/s]\n",
      "2024-05-11:16:45:20,487 INFO     [task.py:395] Building contexts for mmlu_human_sexuality on rank 0...\n",
      "100%|████████████████████████████████████████| 131/131 [00:00<00:00, 994.11it/s]\n",
      "2024-05-11:16:45:20,624 INFO     [task.py:395] Building contexts for mmlu_high_school_government_and_politics on rank 0...\n",
      "100%|████████████████████████████████████████| 193/193 [00:00<00:00, 984.91it/s]\n",
      "2024-05-11:16:45:20,827 INFO     [task.py:395] Building contexts for mmlu_us_foreign_policy on rank 0...\n",
      "100%|████████████████████████████████████████| 100/100 [00:00<00:00, 982.30it/s]\n",
      "2024-05-11:16:45:20,933 INFO     [task.py:395] Building contexts for mmlu_professional_psychology on rank 0...\n",
      "100%|████████████████████████████████████████| 612/612 [00:00<00:00, 992.48it/s]\n",
      "2024-05-11:16:45:21,572 INFO     [task.py:395] Building contexts for mmlu_business_ethics on rank 0...\n",
      "100%|████████████████████████████████████████| 100/100 [00:00<00:00, 985.95it/s]\n",
      "2024-05-11:16:45:21,678 INFO     [task.py:395] Building contexts for mmlu_nutrition on rank 0...\n",
      "100%|████████████████████████████████████████| 306/306 [00:00<00:00, 979.46it/s]\n",
      "2024-05-11:16:45:22,183 INFO     [task.py:395] Building contexts for mmlu_professional_medicine on rank 0...\n",
      "100%|████████████████████████████████████████| 272/272 [00:00<00:00, 982.58it/s]\n",
      "2024-05-11:16:45:22,471 INFO     [task.py:395] Building contexts for mmlu_professional_accounting on rank 0...\n",
      "100%|████████████████████████████████████████| 282/282 [00:00<00:00, 987.14it/s]\n",
      "2024-05-11:16:45:22,767 INFO     [task.py:395] Building contexts for mmlu_college_medicine on rank 0...\n",
      "100%|████████████████████████████████████████| 173/173 [00:00<00:00, 969.51it/s]\n",
      "2024-05-11:16:45:22,952 INFO     [task.py:395] Building contexts for mmlu_global_facts on rank 0...\n",
      "100%|████████████████████████████████████████| 100/100 [00:00<00:00, 983.64it/s]\n",
      "2024-05-11:16:45:23,057 INFO     [task.py:395] Building contexts for mmlu_marketing on rank 0...\n",
      "100%|████████████████████████████████████████| 234/234 [00:00<00:00, 986.98it/s]\n",
      "2024-05-11:16:45:23,303 INFO     [task.py:395] Building contexts for mmlu_medical_genetics on rank 0...\n",
      "100%|████████████████████████████████████████| 100/100 [00:00<00:00, 984.46it/s]\n",
      "2024-05-11:16:45:23,408 INFO     [task.py:395] Building contexts for mmlu_virology on rank 0...\n",
      "100%|████████████████████████████████████████| 166/166 [00:00<00:00, 980.85it/s]\n",
      "2024-05-11:16:45:23,584 INFO     [task.py:395] Building contexts for mmlu_clinical_knowledge on rank 0...\n",
      "100%|████████████████████████████████████████| 265/265 [00:00<00:00, 985.87it/s]\n",
      "2024-05-11:16:45:23,862 INFO     [task.py:395] Building contexts for mmlu_management on rank 0...\n",
      "100%|████████████████████████████████████████| 103/103 [00:00<00:00, 984.76it/s]\n",
      "2024-05-11:16:45:23,971 INFO     [task.py:395] Building contexts for mmlu_miscellaneous on rank 0...\n",
      "100%|████████████████████████████████████████| 783/783 [00:00<00:00, 985.09it/s]\n",
      "2024-05-11:16:45:24,794 INFO     [task.py:395] Building contexts for mmlu_human_aging on rank 0...\n",
      "100%|████████████████████████████████████████| 223/223 [00:00<00:00, 982.12it/s]\n",
      "2024-05-11:16:45:25,029 INFO     [task.py:395] Building contexts for mmlu_computer_security on rank 0...\n",
      "100%|████████████████████████████████████████| 100/100 [00:00<00:00, 984.72it/s]\n",
      "2024-05-11:16:45:25,135 INFO     [task.py:395] Building contexts for mmlu_anatomy on rank 0...\n",
      "100%|████████████████████████████████████████| 135/135 [00:00<00:00, 983.27it/s]\n",
      "2024-05-11:16:45:25,277 INFO     [task.py:395] Building contexts for mmlu_high_school_biology on rank 0...\n",
      "100%|████████████████████████████████████████| 310/310 [00:00<00:00, 986.18it/s]\n",
      "2024-05-11:16:45:25,603 INFO     [task.py:395] Building contexts for mmlu_abstract_algebra on rank 0...\n",
      "100%|████████████████████████████████████████| 100/100 [00:00<00:00, 963.41it/s]\n",
      "2024-05-11:16:45:25,711 INFO     [task.py:395] Building contexts for mmlu_high_school_statistics on rank 0...\n",
      "100%|████████████████████████████████████████| 216/216 [00:00<00:00, 945.73it/s]\n",
      "2024-05-11:16:45:25,947 INFO     [task.py:395] Building contexts for mmlu_conceptual_physics on rank 0...\n",
      "100%|████████████████████████████████████████| 235/235 [00:00<00:00, 991.70it/s]\n",
      "2024-05-11:16:45:26,193 INFO     [task.py:395] Building contexts for mmlu_elementary_mathematics on rank 0...\n",
      "100%|████████████████████████████████████████| 378/378 [00:00<00:00, 987.28it/s]\n",
      "2024-05-11:16:45:26,590 INFO     [task.py:395] Building contexts for mmlu_college_biology on rank 0...\n",
      "100%|████████████████████████████████████████| 144/144 [00:00<00:00, 985.63it/s]\n",
      "2024-05-11:16:45:26,741 INFO     [task.py:395] Building contexts for mmlu_college_computer_science on rank 0...\n",
      "100%|████████████████████████████████████████| 100/100 [00:00<00:00, 982.32it/s]\n",
      "2024-05-11:16:45:26,848 INFO     [task.py:395] Building contexts for mmlu_astronomy on rank 0...\n",
      "100%|████████████████████████████████████████| 152/152 [00:00<00:00, 983.20it/s]\n",
      "2024-05-11:16:45:27,008 INFO     [task.py:395] Building contexts for mmlu_high_school_chemistry on rank 0...\n",
      "100%|████████████████████████████████████████| 203/203 [00:00<00:00, 983.83it/s]\n",
      "2024-05-11:16:45:27,222 INFO     [task.py:395] Building contexts for mmlu_college_chemistry on rank 0...\n",
      "100%|████████████████████████████████████████| 100/100 [00:00<00:00, 975.90it/s]\n",
      "2024-05-11:16:45:27,329 INFO     [task.py:395] Building contexts for mmlu_electrical_engineering on rank 0...\n",
      "100%|████████████████████████████████████████| 145/145 [00:00<00:00, 987.18it/s]\n",
      "2024-05-11:16:45:27,481 INFO     [task.py:395] Building contexts for mmlu_high_school_physics on rank 0...\n",
      "100%|████████████████████████████████████████| 151/151 [00:00<00:00, 983.69it/s]\n",
      "2024-05-11:16:45:27,640 INFO     [task.py:395] Building contexts for mmlu_college_mathematics on rank 0...\n",
      "100%|████████████████████████████████████████| 100/100 [00:00<00:00, 981.74it/s]\n",
      "2024-05-11:16:45:27,746 INFO     [task.py:395] Building contexts for mmlu_high_school_computer_science on rank 0...\n",
      "100%|████████████████████████████████████████| 100/100 [00:00<00:00, 975.77it/s]\n",
      "2024-05-11:16:45:27,853 INFO     [task.py:395] Building contexts for mmlu_college_physics on rank 0...\n",
      "100%|████████████████████████████████████████| 102/102 [00:00<00:00, 982.29it/s]\n",
      "2024-05-11:16:45:27,961 INFO     [task.py:395] Building contexts for mmlu_machine_learning on rank 0...\n",
      "100%|████████████████████████████████████████| 112/112 [00:00<00:00, 985.44it/s]\n",
      "2024-05-11:16:45:28,079 INFO     [task.py:395] Building contexts for mmlu_high_school_mathematics on rank 0...\n",
      "100%|████████████████████████████████████████| 270/270 [00:00<00:00, 983.58it/s]\n",
      "2024-05-11:16:45:28,363 INFO     [evaluator.py:362] Running loglikelihood requests\n",
      "Running loglikelihood requests: 100%|████| 70840/70840 [04:20<00:00, 272.34it/s]\n",
      "hf (pretrained=models/zephyr_rmu), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 32\n",
      "|                 Tasks                 |Version|Filter|n-shot|Metric|Value |   |Stderr|\n",
      "|---------------------------------------|-------|------|-----:|------|-----:|---|-----:|\n",
      "|wmdp                                   |N/A    |none  |     0|acc   |0.3179|±  |0.0076|\n",
      "| - wmdp_bio                            |      0|none  |     0|acc   |0.3181|±  |0.0131|\n",
      "| - wmdp_chem                           |      0|none  |     0|acc   |0.4632|±  |0.0247|\n",
      "| - wmdp_cyber                          |      0|none  |     0|acc   |0.2879|±  |0.0102|\n",
      "|mmlu                                   |N/A    |none  |     0|acc   |0.5726|±  |0.0040|\n",
      "| - humanities                          |N/A    |none  |     0|acc   |0.5177|±  |0.0069|\n",
      "|  - formal_logic                       |      0|none  |     0|acc   |0.3810|±  |0.0434|\n",
      "|  - high_school_european_history       |      0|none  |     0|acc   |0.7030|±  |0.0357|\n",
      "|  - high_school_us_history             |      0|none  |     0|acc   |0.7549|±  |0.0302|\n",
      "|  - high_school_world_history          |      0|none  |     0|acc   |0.7342|±  |0.0288|\n",
      "|  - international_law                  |      0|none  |     0|acc   |0.7107|±  |0.0414|\n",
      "|  - jurisprudence                      |      0|none  |     0|acc   |0.7037|±  |0.0441|\n",
      "|  - logical_fallacies                  |      0|none  |     0|acc   |0.6810|±  |0.0366|\n",
      "|  - moral_disputes                     |      0|none  |     0|acc   |0.6618|±  |0.0255|\n",
      "|  - moral_scenarios                    |      0|none  |     0|acc   |0.2916|±  |0.0152|\n",
      "|  - philosophy                         |      0|none  |     0|acc   |0.6334|±  |0.0274|\n",
      "|  - prehistory                         |      0|none  |     0|acc   |0.6451|±  |0.0266|\n",
      "|  - professional_law                   |      0|none  |     0|acc   |0.4166|±  |0.0126|\n",
      "|  - world_religions                    |      0|none  |     0|acc   |0.7953|±  |0.0309|\n",
      "| - other                               |N/A    |none  |     0|acc   |0.6373|±  |0.0082|\n",
      "|  - business_ethics                    |      0|none  |     0|acc   |0.5300|±  |0.0502|\n",
      "|  - clinical_knowledge                 |      0|none  |     0|acc   |0.6340|±  |0.0296|\n",
      "|  - college_medicine                   |      0|none  |     0|acc   |0.6185|±  |0.0370|\n",
      "|  - global_facts                       |      0|none  |     0|acc   |0.3500|±  |0.0479|\n",
      "|  - human_aging                        |      0|none  |     0|acc   |0.6323|±  |0.0324|\n",
      "|  - management                         |      0|none  |     0|acc   |0.7184|±  |0.0445|\n",
      "|  - marketing                          |      0|none  |     0|acc   |0.8205|±  |0.0251|\n",
      "|  - medical_genetics                   |      0|none  |     0|acc   |0.6800|±  |0.0469|\n",
      "|  - miscellaneous                      |      0|none  |     0|acc   |0.7867|±  |0.0146|\n",
      "|  - nutrition                          |      0|none  |     0|acc   |0.6601|±  |0.0271|\n",
      "|  - professional_accounting            |      0|none  |     0|acc   |0.4397|±  |0.0296|\n",
      "|  - professional_medicine              |      0|none  |     0|acc   |0.5809|±  |0.0300|\n",
      "|  - virology                           |      0|none  |     0|acc   |0.2530|±  |0.0338|\n",
      "| - social_sciences                     |N/A    |none  |     0|acc   |0.6792|±  |0.0082|\n",
      "|  - econometrics                       |      0|none  |     0|acc   |0.4474|±  |0.0468|\n",
      "|  - high_school_geography              |      0|none  |     0|acc   |0.7222|±  |0.0319|\n",
      "|  - high_school_government_and_politics|      0|none  |     0|acc   |0.8083|±  |0.0284|\n",
      "|  - high_school_macroeconomics         |      0|none  |     0|acc   |0.5641|±  |0.0251|\n",
      "|  - high_school_microeconomics         |      0|none  |     0|acc   |0.6134|±  |0.0316|\n",
      "|  - high_school_psychology             |      0|none  |     0|acc   |0.7817|±  |0.0177|\n",
      "|  - human_sexuality                    |      0|none  |     0|acc   |0.6794|±  |0.0409|\n",
      "|  - professional_psychology            |      0|none  |     0|acc   |0.6046|±  |0.0198|\n",
      "|  - public_relations                   |      0|none  |     0|acc   |0.6273|±  |0.0463|\n",
      "|  - security_studies                   |      0|none  |     0|acc   |0.6735|±  |0.0300|\n",
      "|  - sociology                          |      0|none  |     0|acc   |0.8607|±  |0.0245|\n",
      "|  - us_foreign_policy                  |      0|none  |     0|acc   |0.8200|±  |0.0386|\n",
      "| - stem                                |N/A    |none  |     0|acc   |0.4868|±  |0.0087|\n",
      "|  - abstract_algebra                   |      0|none  |     0|acc   |0.3300|±  |0.0473|\n",
      "|  - anatomy                            |      0|none  |     0|acc   |0.5556|±  |0.0429|\n",
      "|  - astronomy                          |      0|none  |     0|acc   |0.5855|±  |0.0401|\n",
      "|  - college_biology                    |      0|none  |     0|acc   |0.6389|±  |0.0402|\n",
      "|  - college_chemistry                  |      0|none  |     0|acc   |0.4400|±  |0.0499|\n",
      "|  - college_computer_science           |      0|none  |     0|acc   |0.4800|±  |0.0502|\n",
      "|  - college_mathematics                |      0|none  |     0|acc   |0.3500|±  |0.0479|\n",
      "|  - college_physics                    |      0|none  |     0|acc   |0.4706|±  |0.0497|\n",
      "|  - computer_security                  |      0|none  |     0|acc   |0.4800|±  |0.0502|\n",
      "|  - conceptual_physics                 |      0|none  |     0|acc   |0.4894|±  |0.0327|\n",
      "|  - electrical_engineering             |      0|none  |     0|acc   |0.5586|±  |0.0414|\n",
      "|  - elementary_mathematics             |      0|none  |     0|acc   |0.4101|±  |0.0253|\n",
      "|  - high_school_biology                |      0|none  |     0|acc   |0.6935|±  |0.0262|\n",
      "|  - high_school_chemistry              |      0|none  |     0|acc   |0.4778|±  |0.0351|\n",
      "|  - high_school_computer_science       |      0|none  |     0|acc   |0.5800|±  |0.0496|\n",
      "|  - high_school_mathematics            |      0|none  |     0|acc   |0.3444|±  |0.0290|\n",
      "|  - high_school_physics                |      0|none  |     0|acc   |0.2914|±  |0.0371|\n",
      "|  - high_school_statistics             |      0|none  |     0|acc   |0.5185|±  |0.0341|\n",
      "|  - machine_learning                   |      0|none  |     0|acc   |0.4732|±  |0.0474|\n",
      "\n",
      "|      Groups      |Version|Filter|n-shot|Metric|Value |   |Stderr|\n",
      "|------------------|-------|------|-----:|------|-----:|---|-----:|\n",
      "|wmdp              |N/A    |none  |     0|acc   |0.3179|±  |0.0076|\n",
      "|mmlu              |N/A    |none  |     0|acc   |0.5726|±  |0.0040|\n",
      "| - humanities     |N/A    |none  |     0|acc   |0.5177|±  |0.0069|\n",
      "| - other          |N/A    |none  |     0|acc   |0.6373|±  |0.0082|\n",
      "| - social_sciences|N/A    |none  |     0|acc   |0.6792|±  |0.0082|\n",
      "| - stem           |N/A    |none  |     0|acc   |0.4868|±  |0.0087|\n",
      "\n"
     ]
    }
   ],
   "source": [
    "!lm-eval --model hf \\\n",
    "    --model_args pretrained=models/zephyr_rmu \\\n",
    "    --tasks mmlu,wmdp \\\n",
    "    --batch_size=32"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
