{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading checkpoint shards: 100%|████████████████| 15/15 [00:39<00:00,  2.66s/it]\n",
      "Loading checkpoint shards: 100%|████████████████| 15/15 [00:22<00:00,  1.48s/it]\n",
      "====CUT Config====\n",
      "model_name_or_path=01-ai/Yi-34B-Chat\n",
      "module_str={model_name}.model.layers[{layer_id}]\n",
      "output_dir=models/yi_cut\n",
      "retain_corpora=['wikitext', 'wikitext']\n",
      "forget_corpora=['bio-forget-corpus', 'cyber-forget-corpus']\n",
      "alpha=[350.0, 350.0]\n",
      "steering_coeffs=300,300\n",
      "lr=5e-05\n",
      "min_len=0\n",
      "max_len=2000\n",
      "batch_size=2\n",
      "max_num_batches=400\n",
      "layer_id=15\n",
      "layer_ids=[13, 14, 15]\n",
      "param_ids=[6]\n",
      "seed=42\n",
      "steering_coeff_list=[300.0, 300.0]\n",
      "=====\n",
      "/data/long_phan/anaconda3/lib/python3.10/site-packages/transformers/optimization.py:429: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
      "  warnings.warn(\n",
      "======= Epoch 0 =======\n",
      "  0%|                                                   | 0/400 [00:00<?, ?it/s]2024-04-17 01:50:10.510628: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.\n",
      "2024-04-17 01:50:10.548103: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
      "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
      "2024-04-17 01:50:11.471543: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
      "/data/long_phan/wmdp/wmdp/wmdp/cut/unlearn.py:68: UserWarning: Using a target size (torch.Size([1, 1, 7168])) that is different to the input size (torch.Size([2, 512, 7168])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.\n",
      "  unlearn_loss = torch.nn.functional.mse_loss(\n",
      "loss: 18.25 | unlearn_loss: 18.25 | retain_loss: 0 | param_change: 5.186e-06\n",
      "  0%|                                           | 1/400 [00:06<44:49,  6.74s/it]/data/long_phan/wmdp/wmdp/wmdp/cut/unlearn.py:68: UserWarning: Using a target size (torch.Size([1, 1, 7168])) that is different to the input size (torch.Size([2, 768, 7168])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.\n",
      "  unlearn_loss = torch.nn.functional.mse_loss(\n",
      "loss: 19.62 | unlearn_loss: 19.62 | retain_loss: 0.01294 | param_change: 4.435e-05\n",
      "  0%|▏                                          | 2/400 [00:08<25:54,  3.91s/it]loss: 22.88 | unlearn_loss: 22.38 | retain_loss: 0.4492 | param_change: 0.000246\n",
      "  1%|▎                                          | 3/400 [00:10<18:35,  2.81s/it]loss: 20.25 | unlearn_loss: 20.25 | retain_loss: 0.02576 | param_change: 6.533e-05\n",
      "  1%|▍                                          | 4/400 [00:12<16:01,  2.43s/it]loss: 21.25 | unlearn_loss: 19.88 | retain_loss: 1.328 | param_change: 0.001289\n",
      "  1%|▌                                          | 5/400 [00:13<13:41,  2.08s/it]loss: 21.25 | unlearn_loss: 18.5 | retain_loss: 2.703 | param_change: 0.002884\n",
      "  2%|▋                                          | 6/400 [00:15<12:55,  1.97s/it]loss: 22.38 | unlearn_loss: 21.75 | retain_loss: 0.6406 | param_change: 0.0007515\n",
      "  2%|▊                                          | 7/400 [00:16<11:50,  1.81s/it]/data/long_phan/wmdp/wmdp/wmdp/cut/unlearn.py:68: UserWarning: Using a target size (torch.Size([1, 1, 7168])) that is different to the input size (torch.Size([2, 669, 7168])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.\n",
      "  unlearn_loss = torch.nn.functional.mse_loss(\n",
      "loss: 21.62 | unlearn_loss: 20.88 | retain_loss: 0.8047 | param_change: 0.0008926\n",
      "  2%|▊                                          | 8/400 [00:18<11:36,  1.78s/it]loss: 17.62 | unlearn_loss: 15.94 | retain_loss: 1.711 | param_change: 0.0006027\n",
      "  2%|▉                                          | 9/400 [00:20<11:13,  1.72s/it]loss: 21.75 | unlearn_loss: 20.5 | retain_loss: 1.203 | param_change: 0.0004673\n",
      "  2%|█                                         | 10/400 [00:21<11:30,  1.77s/it]loss: 16.25 | unlearn_loss: 16 | retain_loss: 0.2285 | param_change: 0.0002432\n",
      "  3%|█▏                                        | 11/400 [00:23<10:45,  1.66s/it]/data/long_phan/wmdp/wmdp/wmdp/cut/unlearn.py:68: UserWarning: Using a target size (torch.Size([1, 1, 7168])) that is different to the input size (torch.Size([2, 596, 7168])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.\n",
      "  unlearn_loss = torch.nn.functional.mse_loss(\n",
      "loss: 22.75 | unlearn_loss: 22.75 | retain_loss: 0.05908 | param_change: 0.0001011\n",
      "  3%|█▎                                        | 12/400 [00:24<10:04,  1.56s/it]loss: 15.56 | unlearn_loss: 15.56 | retain_loss: 0.007355 | param_change: 1.669e-05\n",
      "  3%|█▎                                        | 13/400 [00:25<09:29,  1.47s/it]loss: 20 | unlearn_loss: 20 | retain_loss: 0.007385 | param_change: 4.172e-05\n",
      "  4%|█▍                                        | 14/400 [00:27<09:41,  1.51s/it]loss: 23.12 | unlearn_loss: 19.25 | retain_loss: 3.844 | param_change: 0.002518\n",
      "  4%|█▌                                        | 15/400 [00:28<09:24,  1.47s/it]loss: 20.88 | unlearn_loss: 19.88 | retain_loss: 1.039 | param_change: 0.0009155\n",
      "  4%|█▋                                        | 16/400 [00:30<09:48,  1.53s/it]loss: 20.5 | unlearn_loss: 19.25 | retain_loss: 1.297 | param_change: 0.001068\n",
      "  4%|█▊                                        | 17/400 [00:32<09:40,  1.52s/it]loss: 19.88 | unlearn_loss: 17.88 | retain_loss: 2.016 | param_change: 0.001366\n",
      "  4%|█▉                                        | 18/400 [00:33<10:04,  1.58s/it]loss: 23.75 | unlearn_loss: 22.75 | retain_loss: 1.023 | param_change: 0.001038\n",
      "  5%|█▉                                        | 19/400 [00:35<09:50,  1.55s/it]loss: 20.25 | unlearn_loss: 19.62 | retain_loss: 0.6562 | param_change: 0.0007591\n",
      "  5%|██                                        | 20/400 [00:37<10:19,  1.63s/it]loss: 20.25 | unlearn_loss: 19 | retain_loss: 1.305 | param_change: 0.001015\n",
      "  5%|██▏                                       | 21/400 [00:38<09:47,  1.55s/it]loss: 22.25 | unlearn_loss: 21.38 | retain_loss: 0.8438 | param_change: 0.0008278\n",
      "  6%|██▎                                       | 22/400 [00:40<10:00,  1.59s/it]loss: 19.5 | unlearn_loss: 19.12 | retain_loss: 0.4297 | param_change: 0.000391\n",
      "  6%|██▍                                       | 23/400 [00:41<09:37,  1.53s/it]loss: 20.38 | unlearn_loss: 19.88 | retain_loss: 0.543 | param_change: 0.0006599\n",
      "  6%|██▌                                       | 24/400 [00:43<09:57,  1.59s/it]loss: 18.62 | unlearn_loss: 18.25 | retain_loss: 0.3848 | param_change: 0.0005531\n",
      "  6%|██▋                                       | 25/400 [00:44<09:34,  1.53s/it]loss: 21.75 | unlearn_loss: 21.5 | retain_loss: 0.2539 | param_change: 0.0003223\n",
      "  6%|██▋                                       | 26/400 [00:46<09:48,  1.57s/it]loss: 15.81 | unlearn_loss: 15.5 | retain_loss: 0.2969 | param_change: 0.0003948\n",
      "  7%|██▊                                       | 27/400 [00:47<09:27,  1.52s/it]loss: 19.75 | unlearn_loss: 19.5 | retain_loss: 0.2314 | param_change: 0.0002575\n",
      "  7%|██▉                                       | 28/400 [00:49<09:57,  1.61s/it]loss: 16.12 | unlearn_loss: 15.75 | retain_loss: 0.3203 | param_change: 7.153e-05\n",
      "  7%|███                                       | 29/400 [00:50<09:37,  1.56s/it]loss: 19.12 | unlearn_loss: 18.88 | retain_loss: 0.3105 | param_change: 5.937e-05\n",
      "  8%|███▏                                      | 30/400 [00:52<09:59,  1.62s/it]loss: 20.38 | unlearn_loss: 19.25 | retain_loss: 1.086 | param_change: 0.001312\n",
      "  8%|███▎                                      | 31/400 [00:54<09:24,  1.53s/it]loss: 19.5 | unlearn_loss: 18.75 | retain_loss: 0.7852 | param_change: 0.001083\n",
      "  8%|███▎                                      | 32/400 [00:55<09:38,  1.57s/it]loss: 16.25 | unlearn_loss: 15.31 | retain_loss: 0.9258 | param_change: 0.001022\n",
      "  8%|███▍                                      | 33/400 [00:57<09:23,  1.53s/it]loss: 18.38 | unlearn_loss: 18.12 | retain_loss: 0.3008 | param_change: 0.0002184\n",
      "  8%|███▌                                      | 34/400 [00:58<09:48,  1.61s/it]loss: 18 | unlearn_loss: 15.81 | retain_loss: 2.219 | param_change: 0.001907\n",
      "  9%|███▋                                      | 35/400 [01:00<09:12,  1.51s/it]loss: 19.88 | unlearn_loss: 18.12 | retain_loss: 1.773 | param_change: 0.001678\n",
      "  9%|███▊                                      | 36/400 [01:01<09:11,  1.52s/it]loss: 19.62 | unlearn_loss: 18 | retain_loss: 1.602 | param_change: 0.001289\n",
      "  9%|███▉                                      | 37/400 [01:03<09:02,  1.50s/it]loss: 19.38 | unlearn_loss: 18.12 | retain_loss: 1.281 | param_change: 0.001122\n",
      " 10%|███▉                                      | 38/400 [01:04<09:22,  1.55s/it]loss: 19.38 | unlearn_loss: 18.25 | retain_loss: 1.109 | param_change: 0.0004501\n",
      " 10%|████                                      | 39/400 [01:06<09:05,  1.51s/it]loss: 19.38 | unlearn_loss: 18.38 | retain_loss: 0.9648 | param_change: 0.0004101\n",
      " 10%|████▏                                     | 40/400 [01:08<09:34,  1.59s/it]loss: 19.38 | unlearn_loss: 19 | retain_loss: 0.4141 | param_change: 0.0002518\n",
      " 10%|████▎                                     | 41/400 [01:09<08:50,  1.48s/it]loss: 20.75 | unlearn_loss: 20.38 | retain_loss: 0.373 | param_change: 0.000288\n",
      " 10%|████▍                                     | 42/400 [01:10<09:04,  1.52s/it]loss: 17.12 | unlearn_loss: 15.69 | retain_loss: 1.453 | param_change: 0.001007\n",
      " 11%|████▌                                     | 43/400 [01:12<08:31,  1.43s/it]loss: 21.38 | unlearn_loss: 20.38 | retain_loss: 0.9766 | param_change: 0.0006485\n",
      " 11%|████▌                                     | 44/400 [01:13<08:37,  1.45s/it]loss: 17.5 | unlearn_loss: 15.75 | retain_loss: 1.812 | param_change: 0.001038\n",
      " 11%|████▋                                     | 45/400 [01:14<08:13,  1.39s/it]loss: 23 | unlearn_loss: 21.5 | retain_loss: 1.453 | param_change: 0.0009689\n",
      " 12%|████▊                                     | 46/400 [01:16<08:26,  1.43s/it]loss: 23.5 | unlearn_loss: 23.12 | retain_loss: 0.416 | param_change: 0.0004501\n",
      " 12%|████▉                                     | 47/400 [01:17<08:05,  1.38s/it]/data/long_phan/wmdp/wmdp/wmdp/cut/unlearn.py:68: UserWarning: Using a target size (torch.Size([1, 1, 7168])) that is different to the input size (torch.Size([2, 229, 7168])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.\n",
      "  unlearn_loss = torch.nn.functional.mse_loss(\n",
      "loss: 34.25 | unlearn_loss: 34 | retain_loss: 0.3398 | param_change: 0.0002613\n",
      " 12%|█████                                     | 48/400 [01:18<07:09,  1.22s/it]loss: 20.25 | unlearn_loss: 19.88 | retain_loss: 0.3418 | param_change: 0.0002804\n",
      " 12%|█████▏                                    | 49/400 [01:19<07:24,  1.27s/it]loss: 19.25 | unlearn_loss: 18.88 | retain_loss: 0.3359 | param_change: 0.0002594\n",
      " 12%|█████▎                                    | 50/400 [01:21<07:57,  1.36s/it]loss: 16.62 | unlearn_loss: 15.38 | retain_loss: 1.219 | param_change: 0.0009575\n",
      " 13%|█████▎                                    | 51/400 [01:22<07:39,  1.32s/it]loss: 22.5 | unlearn_loss: 21.38 | retain_loss: 1.141 | param_change: 0.0009727\n",
      " 13%|█████▍                                    | 52/400 [01:24<07:59,  1.38s/it]loss: 16.38 | unlearn_loss: 15.38 | retain_loss: 1.023 | param_change: 0.0007744\n",
      " 13%|█████▌                                    | 53/400 [01:25<08:18,  1.44s/it]loss: 19.88 | unlearn_loss: 19 | retain_loss: 0.8906 | param_change: 0.0007401\n",
      " 14%|█████▋                                    | 54/400 [01:27<09:01,  1.57s/it]loss: 16.25 | unlearn_loss: 15.69 | retain_loss: 0.5039 | param_change: 0.0004692\n",
      " 14%|█████▊                                    | 55/400 [01:28<08:26,  1.47s/it]loss: 19.25 | unlearn_loss: 19 | retain_loss: 0.2773 | param_change: 0.0001888\n",
      " 14%|█████▉                                    | 56/400 [01:30<08:40,  1.51s/it]loss: 19.62 | unlearn_loss: 19 | retain_loss: 0.6641 | param_change: 0.0006676\n",
      " 14%|█████▉                                    | 57/400 [01:31<08:09,  1.43s/it]/data/long_phan/wmdp/wmdp/wmdp/cut/unlearn.py:68: UserWarning: Using a target size (torch.Size([1, 1, 7168])) that is different to the input size (torch.Size([2, 289, 7168])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.\n",
      "  unlearn_loss = torch.nn.functional.mse_loss(\n",
      "loss: 31.75 | unlearn_loss: 31.12 | retain_loss: 0.5742 | param_change: 0.000679\n",
      " 14%|██████                                    | 58/400 [01:32<07:17,  1.28s/it]loss: 16 | unlearn_loss: 15.69 | retain_loss: 0.3477 | param_change: 0.0003605\n",
      " 15%|██████▏                                   | 59/400 [01:34<07:28,  1.32s/it]loss: 20.12 | unlearn_loss: 19.75 | retain_loss: 0.3223 | param_change: 0.0003815\n",
      " 15%|██████▎                                   | 60/400 [01:35<08:00,  1.41s/it]loss: 18.25 | unlearn_loss: 18 | retain_loss: 0.2676 | param_change: 0.0002432\n",
      " 15%|██████▍                                   | 61/400 [01:37<08:08,  1.44s/it]loss: 18.12 | unlearn_loss: 17.88 | retain_loss: 0.2354 | param_change: 0.0001431\n",
      " 16%|██████▌                                   | 62/400 [01:39<08:48,  1.56s/it]loss: 16.5 | unlearn_loss: 15.81 | retain_loss: 0.6289 | param_change: 0.0004177\n",
      " 16%|██████▌                                   | 63/400 [01:40<08:13,  1.46s/it]loss: 21.25 | unlearn_loss: 20.75 | retain_loss: 0.4922 | param_change: 0.0003204\n",
      " 16%|██████▋                                   | 64/400 [01:41<08:14,  1.47s/it]loss: 16.5 | unlearn_loss: 15.5 | retain_loss: 0.9844 | param_change: 0.0008545\n",
      " 16%|██████▊                                   | 65/400 [01:43<08:42,  1.56s/it]loss: 19.62 | unlearn_loss: 18.75 | retain_loss: 0.8398 | param_change: 0.0007744\n",
      " 16%|██████▉                                   | 66/400 [01:45<09:32,  1.71s/it]loss: 16.25 | unlearn_loss: 15.44 | retain_loss: 0.8555 | param_change: 0.0002937\n",
      " 17%|███████                                   | 67/400 [01:47<09:20,  1.68s/it]loss: 20.38 | unlearn_loss: 19.62 | retain_loss: 0.7539 | param_change: 0.0002728\n",
      " 17%|███████▏                                  | 68/400 [01:49<09:42,  1.75s/it]loss: 19.25 | unlearn_loss: 19.12 | retain_loss: 0.1709 | param_change: 9.584e-05\n",
      " 17%|███████▏                                  | 69/400 [01:50<09:12,  1.67s/it]loss: 19.5 | unlearn_loss: 19.25 | retain_loss: 0.1973 | param_change: 0.0001469\n",
      " 18%|███████▎                                  | 70/400 [01:52<09:15,  1.68s/it]loss: 15.38 | unlearn_loss: 15 | retain_loss: 0.3867 | param_change: 0.0002995\n",
      " 18%|███████▍                                  | 71/400 [01:53<08:36,  1.57s/it]loss: 19.5 | unlearn_loss: 19.12 | retain_loss: 0.3242 | param_change: 0.0002003\n",
      " 18%|███████▌                                  | 72/400 [01:55<08:29,  1.55s/it]loss: 18.25 | unlearn_loss: 18.12 | retain_loss: 0.1001 | param_change: 6.676e-05\n",
      " 18%|███████▋                                  | 73/400 [01:56<08:07,  1.49s/it]loss: 19.5 | unlearn_loss: 19.38 | retain_loss: 0.1006 | param_change: 6.723e-05\n",
      " 18%|███████▊                                  | 74/400 [01:58<08:19,  1.53s/it]loss: 15.69 | unlearn_loss: 15.25 | retain_loss: 0.4238 | param_change: 0.000515\n",
      " 19%|███████▉                                  | 75/400 [01:59<08:00,  1.48s/it]loss: 19.38 | unlearn_loss: 19 | retain_loss: 0.3633 | param_change: 0.0004673\n",
      " 19%|███████▉                                  | 76/400 [02:01<08:23,  1.55s/it]loss: 22.5 | unlearn_loss: 21.5 | retain_loss: 0.9492 | param_change: 0.0008698\n",
      " 19%|████████                                  | 77/400 [02:02<08:07,  1.51s/it]loss: 20 | unlearn_loss: 19.38 | retain_loss: 0.5625 | param_change: 0.0005493\n",
      " 20%|████████▏                                 | 78/400 [02:04<08:27,  1.57s/it]loss: 19.5 | unlearn_loss: 19.25 | retain_loss: 0.2676 | param_change: 0.0002518\n",
      " 20%|████████▎                                 | 79/400 [02:05<08:16,  1.55s/it]loss: 19.62 | unlearn_loss: 19.25 | retain_loss: 0.4062 | param_change: 0.0005379\n",
      " 20%|████████▍                                 | 80/400 [02:07<08:37,  1.62s/it]loss: 20 | unlearn_loss: 20 | retain_loss: 0.03247 | param_change: 8.523e-06\n",
      " 20%|████████▌                                 | 81/400 [02:09<08:22,  1.57s/it]loss: 19.25 | unlearn_loss: 19.25 | retain_loss: 0.03223 | param_change: 8.583e-06\n",
      " 20%|████████▌                                 | 82/400 [02:10<08:41,  1.64s/it]loss: 17 | unlearn_loss: 15.44 | retain_loss: 1.625 | param_change: 0.00103\n",
      " 21%|████████▋                                 | 83/400 [02:12<08:41,  1.64s/it]/data/long_phan/wmdp/wmdp/wmdp/cut/unlearn.py:68: UserWarning: Using a target size (torch.Size([1, 1, 7168])) that is different to the input size (torch.Size([2, 614, 7168])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.\n",
      "  unlearn_loss = torch.nn.functional.mse_loss(\n",
      "loss: 22.62 | unlearn_loss: 21.38 | retain_loss: 1.195 | param_change: 0.000843\n",
      " 21%|████████▊                                 | 84/400 [02:14<08:54,  1.69s/it]loss: 16.5 | unlearn_loss: 16.12 | retain_loss: 0.3652 | param_change: 0.0003433\n",
      " 21%|████████▉                                 | 85/400 [02:15<08:42,  1.66s/it]/data/long_phan/wmdp/wmdp/wmdp/cut/unlearn.py:68: UserWarning: Using a target size (torch.Size([1, 1, 7168])) that is different to the input size (torch.Size([2, 762, 7168])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.\n",
      "  unlearn_loss = torch.nn.functional.mse_loss(\n",
      "loss: 21.88 | unlearn_loss: 21.62 | retain_loss: 0.2852 | param_change: 0.0003986\n",
      " 22%|█████████                                 | 86/400 [02:17<09:01,  1.72s/it]loss: 16.88 | unlearn_loss: 16.25 | retain_loss: 0.5703 | param_change: 0.0005074\n",
      " 22%|█████████▏                                | 87/400 [02:19<08:36,  1.65s/it]/data/long_phan/wmdp/wmdp/wmdp/cut/unlearn.py:68: UserWarning: Using a target size (torch.Size([1, 1, 7168])) that is different to the input size (torch.Size([2, 280, 7168])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.\n",
      "  unlearn_loss = torch.nn.functional.mse_loss(\n",
      "loss: 32.25 | unlearn_loss: 31.75 | retain_loss: 0.4082 | param_change: 0.0003262\n",
      " 22%|█████████▏                                | 88/400 [02:20<07:51,  1.51s/it]loss: 16.75 | unlearn_loss: 16.62 | retain_loss: 0.07861 | param_change: 4.315e-05\n",
      " 22%|█████████▎                                | 89/400 [02:21<07:42,  1.49s/it]loss: 22.88 | unlearn_loss: 22.75 | retain_loss: 0.06689 | param_change: 0.000103\n",
      " 22%|█████████▍                                | 90/400 [02:23<07:56,  1.54s/it]loss: 21 | unlearn_loss: 18.88 | retain_loss: 2.172 | param_change: 0.001175\n",
      " 23%|█████████▌                                | 91/400 [02:25<07:55,  1.54s/it]loss: 21.12 | unlearn_loss: 19.12 | retain_loss: 2.016 | param_change: 0.001114\n",
      " 23%|█████████▋                                | 92/400 [02:26<08:29,  1.65s/it]loss: 21.88 | unlearn_loss: 21.25 | retain_loss: 0.6211 | param_change: 0.0004883\n",
      " 23%|█████████▊                                | 93/400 [02:28<07:55,  1.55s/it]loss: 19 | unlearn_loss: 18.62 | retain_loss: 0.4121 | param_change: 0.0003662\n",
      " 24%|█████████▊                                | 94/400 [02:29<07:55,  1.55s/it]loss: 19 | unlearn_loss: 19 | retain_loss: 0.03296 | param_change: 1.597e-05\n",
      " 24%|█████████▉                                | 95/400 [02:31<07:16,  1.43s/it]loss: 19.25 | unlearn_loss: 19.25 | retain_loss: 0.02954 | param_change: 1.431e-05\n",
      " 24%|██████████                                | 96/400 [02:32<07:23,  1.46s/it]loss: 20.12 | unlearn_loss: 19.88 | retain_loss: 0.2207 | param_change: 0.000227\n",
      " 24%|██████████▏                               | 97/400 [02:33<06:58,  1.38s/it]loss: 19.25 | unlearn_loss: 19 | retain_loss: 0.2246 | param_change: 0.0002689\n",
      " 24%|██████████▎                               | 98/400 [02:35<07:18,  1.45s/it]loss: 20 | unlearn_loss: 19.75 | retain_loss: 0.2969 | param_change: 0.0002995\n",
      " 25%|██████████▍                               | 99/400 [02:36<06:58,  1.39s/it]loss: 19.5 | unlearn_loss: 19.25 | retain_loss: 0.2227 | param_change: 0.0002441\n",
      " 25%|██████████▎                              | 100/400 [02:38<07:13,  1.45s/it]loss: 15.88 | unlearn_loss: 15.81 | retain_loss: 0.07764 | param_change: 4.029e-05\n",
      " 25%|██████████▎                              | 101/400 [02:39<06:48,  1.37s/it]/data/long_phan/wmdp/wmdp/wmdp/cut/unlearn.py:68: UserWarning: Using a target size (torch.Size([1, 1, 7168])) that is different to the input size (torch.Size([2, 551, 7168])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.\n",
      "  unlearn_loss = torch.nn.functional.mse_loss(\n",
      "loss: 22.88 | unlearn_loss: 22.75 | retain_loss: 0.07129 | param_change: 3.982e-05\n",
      " 26%|██████████▍                              | 102/400 [02:40<06:38,  1.34s/it]loss: 21.25 | unlearn_loss: 21 | retain_loss: 0.2432 | param_change: 0.0002995\n",
      " 26%|██████████▌                              | 103/400 [02:41<06:25,  1.30s/it]loss: 18.88 | unlearn_loss: 18.75 | retain_loss: 0.1328 | param_change: 0.0001831\n",
      " 26%|██████████▋                              | 104/400 [02:43<06:48,  1.38s/it]loss: 15.62 | unlearn_loss: 15.56 | retain_loss: 0.08594 | param_change: 8.059e-05\n",
      " 26%|██████████▊                              | 105/400 [02:44<06:32,  1.33s/it]loss: 19.12 | unlearn_loss: 19 | retain_loss: 0.1128 | param_change: 0.000144\n",
      " 26%|██████████▊                              | 106/400 [02:46<06:49,  1.39s/it]loss: 18.88 | unlearn_loss: 18.88 | retain_loss: 0.0437 | param_change: 3.314e-05\n",
      " 27%|██████████▉                              | 107/400 [02:47<06:33,  1.34s/it]loss: 19.12 | unlearn_loss: 19.12 | retain_loss: 0.03467 | param_change: 2.658e-05\n",
      " 27%|███████████                              | 108/400 [02:48<06:52,  1.41s/it]loss: 16.5 | unlearn_loss: 16.12 | retain_loss: 0.4297 | param_change: 0.0003471\n",
      " 27%|███████████▏                             | 109/400 [02:50<06:35,  1.36s/it]loss: 19.88 | unlearn_loss: 19.5 | retain_loss: 0.373 | param_change: 0.0003262\n",
      " 28%|███████████▎                             | 110/400 [02:51<06:54,  1.43s/it]loss: 18.5 | unlearn_loss: 18.25 | retain_loss: 0.1885 | param_change: 0.0001216\n",
      " 28%|███████████▍                             | 111/400 [02:53<07:10,  1.49s/it]loss: 19.38 | unlearn_loss: 19.12 | retain_loss: 0.1895 | param_change: 0.0001535\n",
      " 28%|███████████▍                             | 112/400 [02:55<07:47,  1.62s/it]loss: 22.12 | unlearn_loss: 21.62 | retain_loss: 0.4941 | param_change: 0.0002108\n",
      " 28%|███████████▌                             | 113/400 [02:56<07:17,  1.53s/it]loss: 19.38 | unlearn_loss: 19 | retain_loss: 0.4355 | param_change: 0.0001974\n",
      " 28%|███████████▋                             | 114/400 [02:58<07:17,  1.53s/it]loss: 20.38 | unlearn_loss: 20.25 | retain_loss: 0.09424 | param_change: 7.868e-05\n",
      " 29%|███████████▊                             | 115/400 [02:59<07:14,  1.53s/it]loss: 19.25 | unlearn_loss: 19.12 | retain_loss: 0.08789 | param_change: 7.248e-05\n",
      " 29%|███████████▉                             | 116/400 [03:01<07:36,  1.61s/it]loss: 19.25 | unlearn_loss: 19 | retain_loss: 0.2695 | param_change: 0.0001879\n",
      " 29%|███████████▉                             | 117/400 [03:02<07:22,  1.56s/it]loss: 19 | unlearn_loss: 18.75 | retain_loss: 0.1953 | param_change: 0.0001373\n",
      " 30%|████████████                             | 118/400 [03:04<07:39,  1.63s/it]loss: 19.12 | unlearn_loss: 18.88 | retain_loss: 0.2891 | param_change: 0.0002127\n",
      " 30%|████████████▏                            | 119/400 [03:06<07:55,  1.69s/it]loss: 19.25 | unlearn_loss: 19 | retain_loss: 0.2402 | param_change: 0.000185\n",
      " 30%|████████████▎                            | 120/400 [03:08<08:31,  1.83s/it]loss: 19.38 | unlearn_loss: 19 | retain_loss: 0.4023 | param_change: 0.0004101\n",
      " 30%|████████████▍                            | 121/400 [03:10<08:04,  1.74s/it]loss: 19 | unlearn_loss: 18.62 | retain_loss: 0.3574 | param_change: 0.00037\n",
      " 30%|████████████▌                            | 122/400 [03:12<08:08,  1.76s/it]loss: 16.25 | unlearn_loss: 16.12 | retain_loss: 0.1572 | param_change: 0.0001354\n",
      " 31%|████████████▌                            | 123/400 [03:13<08:15,  1.79s/it]loss: 18.62 | unlearn_loss: 18.5 | retain_loss: 0.1465 | param_change: 0.0001168\n",
      " 31%|████████████▋                            | 124/400 [03:16<08:42,  1.89s/it]loss: 16.25 | unlearn_loss: 15.38 | retain_loss: 0.9023 | param_change: 0.0006294\n",
      " 31%|████████████▊                            | 125/400 [03:17<08:02,  1.76s/it]loss: 19.25 | unlearn_loss: 18.75 | retain_loss: 0.5117 | param_change: 0.0004272\n",
      " 32%|████████████▉                            | 126/400 [03:19<08:04,  1.77s/it]loss: 15.56 | unlearn_loss: 15.44 | retain_loss: 0.1553 | param_change: 0.0001898\n",
      " 32%|█████████████                            | 127/400 [03:21<08:12,  1.80s/it]loss: 18.5 | unlearn_loss: 18.38 | retain_loss: 0.1719 | param_change: 0.0002346\n",
      " 32%|█████████████                            | 128/400 [03:23<08:41,  1.92s/it]loss: 22.62 | unlearn_loss: 22.5 | retain_loss: 0.1235 | param_change: 0.0001121\n",
      " 32%|█████████████▏                           | 129/400 [03:25<08:19,  1.84s/it]loss: 18.75 | unlearn_loss: 18.62 | retain_loss: 0.1055 | param_change: 8.774e-05\n",
      " 32%|█████████████▎                           | 130/400 [03:26<08:26,  1.87s/it]loss: 18.5 | unlearn_loss: 18.38 | retain_loss: 0.1216 | param_change: 4.768e-05\n",
      " 33%|█████████████▍                           | 131/400 [03:28<08:03,  1.80s/it]loss: 18.75 | unlearn_loss: 18.62 | retain_loss: 0.1426 | param_change: 5.937e-05\n",
      " 33%|█████████████▌                           | 132/400 [03:30<08:06,  1.82s/it]loss: 18.62 | unlearn_loss: 15.25 | retain_loss: 3.375 | param_change: 0.0019\n",
      " 33%|█████████████▋                           | 133/400 [03:31<07:34,  1.70s/it]loss: 20.62 | unlearn_loss: 18.75 | retain_loss: 1.922 | param_change: 0.00135\n",
      " 34%|█████████████▋                           | 134/400 [03:33<07:37,  1.72s/it]loss: 16.5 | unlearn_loss: 16.38 | retain_loss: 0.1147 | param_change: 6.628e-05\n",
      " 34%|█████████████▊                           | 135/400 [03:35<07:32,  1.71s/it]loss: 21.88 | unlearn_loss: 21.5 | retain_loss: 0.3691 | param_change: 0.0001049\n",
      " 34%|█████████████▉                           | 136/400 [03:37<07:50,  1.78s/it]loss: 17.5 | unlearn_loss: 15.25 | retain_loss: 2.312 | param_change: 0.001534\n",
      " 34%|██████████████                           | 137/400 [03:38<07:23,  1.69s/it]loss: 21 | unlearn_loss: 19 | retain_loss: 1.969 | param_change: 0.001381\n",
      " 34%|██████████████▏                          | 138/400 [03:40<07:22,  1.69s/it]loss: 18.62 | unlearn_loss: 15.75 | retain_loss: 2.906 | param_change: 0.001183\n",
      " 35%|██████████████▏                          | 139/400 [03:41<07:07,  1.64s/it]loss: 20.75 | unlearn_loss: 19.12 | retain_loss: 1.57 | param_change: 0.0006447\n",
      " 35%|██████████████▎                          | 140/400 [03:43<07:18,  1.69s/it]loss: 23.5 | unlearn_loss: 21.62 | retain_loss: 1.898 | param_change: 0.0008812\n",
      " 35%|██████████████▍                          | 141/400 [03:45<07:03,  1.64s/it]loss: 23.5 | unlearn_loss: 21 | retain_loss: 2.516 | param_change: 0.001266\n",
      " 36%|██████████████▌                          | 142/400 [03:47<07:17,  1.70s/it]loss: 21.75 | unlearn_loss: 21.75 | retain_loss: 0.03662 | param_change: 1.228e-05\n",
      " 36%|██████████████▋                          | 143/400 [03:48<07:06,  1.66s/it]loss: 21.75 | unlearn_loss: 21.75 | retain_loss: 0.0354 | param_change: 9.394e-05\n",
      " 36%|██████████████▊                          | 144/400 [03:50<07:21,  1.73s/it]loss: 27 | unlearn_loss: 21.62 | retain_loss: 5.406 | param_change: 0.002716\n",
      " 36%|██████████████▊                          | 145/400 [03:51<06:43,  1.58s/it]loss: 18.88 | unlearn_loss: 18.25 | retain_loss: 0.6797 | param_change: 0.0004997\n",
      " 36%|██████████████▉                          | 146/400 [03:53<06:39,  1.57s/it]loss: 16 | unlearn_loss: 15.44 | retain_loss: 0.6016 | param_change: 0.0002375\n",
      " 37%|███████████████                          | 147/400 [03:54<06:32,  1.55s/it]loss: 19.5 | unlearn_loss: 18.62 | retain_loss: 0.8867 | param_change: 0.0002842\n",
      " 37%|███████████████▏                         | 148/400 [03:56<06:42,  1.60s/it]loss: 16.12 | unlearn_loss: 15.06 | retain_loss: 1.008 | param_change: 0.0003185\n",
      " 37%|███████████████▎                         | 149/400 [03:57<06:15,  1.50s/it]loss: 19.5 | unlearn_loss: 18.5 | retain_loss: 0.9766 | param_change: 0.0003109\n",
      " 38%|███████████████▍                         | 150/400 [03:59<06:15,  1.50s/it]loss: 20.12 | unlearn_loss: 15.19 | retain_loss: 4.906 | param_change: 0.001022\n",
      " 38%|███████████████▍                         | 151/400 [04:01<06:43,  1.62s/it]loss: 24.12 | unlearn_loss: 19.5 | retain_loss: 4.594 | param_change: 0.0009651\n",
      " 38%|███████████████▌                         | 152/400 [04:03<07:23,  1.79s/it]loss: 21.25 | unlearn_loss: 15.25 | retain_loss: 6.062 | param_change: 0.001312\n",
      " 38%|███████████████▋                         | 153/400 [04:04<06:56,  1.69s/it]loss: 23.38 | unlearn_loss: 19.38 | retain_loss: 3.953 | param_change: 0.001038\n",
      " 38%|███████████████▊                         | 154/400 [04:06<06:55,  1.69s/it]loss: 19.12 | unlearn_loss: 18.75 | retain_loss: 0.3926 | param_change: 0.0002003\n",
      " 39%|███████████████▉                         | 155/400 [04:07<06:30,  1.59s/it]loss: 19 | unlearn_loss: 18.62 | retain_loss: 0.3613 | param_change: 0.0001974\n",
      " 39%|███████████████▉                         | 156/400 [04:09<06:29,  1.59s/it]loss: 17.5 | unlearn_loss: 15.75 | retain_loss: 1.719 | param_change: 0.0005035\n",
      " 39%|████████████████                         | 157/400 [04:11<06:32,  1.62s/it]/data/long_phan/wmdp/wmdp/wmdp/cut/unlearn.py:68: UserWarning: Using a target size (torch.Size([1, 1, 7168])) that is different to the input size (torch.Size([2, 634, 7168])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.\n",
      "  unlearn_loss = torch.nn.functional.mse_loss(\n",
      "loss: 22.25 | unlearn_loss: 20.75 | retain_loss: 1.508 | param_change: 0.0005608\n",
      " 40%|████████████████▏                        | 158/400 [04:12<06:40,  1.66s/it]loss: 19.12 | unlearn_loss: 15.44 | retain_loss: 3.703 | param_change: 0.0009308\n",
      " 40%|████████████████▎                        | 159/400 [04:14<06:31,  1.62s/it]/data/long_phan/wmdp/wmdp/wmdp/cut/unlearn.py:68: UserWarning: Using a target size (torch.Size([1, 1, 7168])) that is different to the input size (torch.Size([2, 390, 7168])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.\n",
      "  unlearn_loss = torch.nn.functional.mse_loss(\n",
      "loss: 28.62 | unlearn_loss: 24.88 | retain_loss: 3.797 | param_change: 0.0009384\n",
      " 40%|████████████████▍                        | 160/400 [04:15<06:16,  1.57s/it]loss: 18.25 | unlearn_loss: 14.88 | retain_loss: 3.438 | param_change: 0.0009651\n",
      " 40%|████████████████▌                        | 161/400 [04:17<06:18,  1.58s/it]loss: 22.38 | unlearn_loss: 20.62 | retain_loss: 1.773 | param_change: 0.0006752\n",
      " 40%|████████████████▌                        | 162/400 [04:19<06:33,  1.66s/it]loss: 19.25 | unlearn_loss: 18.38 | retain_loss: 0.918 | param_change: 0.000515\n",
      " 41%|████████████████▋                        | 163/400 [04:20<06:19,  1.60s/it]loss: 19 | unlearn_loss: 18.38 | retain_loss: 0.6016 | param_change: 0.0001526\n",
      " 41%|████████████████▊                        | 164/400 [04:22<06:28,  1.65s/it]loss: 19.12 | unlearn_loss: 18 | retain_loss: 1.086 | param_change: 0.0005035\n",
      " 41%|████████████████▉                        | 165/400 [04:24<06:11,  1.58s/it]loss: 20 | unlearn_loss: 18.38 | retain_loss: 1.562 | param_change: 0.0006294\n",
      " 42%|█████████████████                        | 166/400 [04:25<06:21,  1.63s/it]loss: 15.88 | unlearn_loss: 15.38 | retain_loss: 0.5273 | param_change: 0.0002184\n",
      " 42%|█████████████████                        | 167/400 [04:27<06:13,  1.60s/it]loss: 19 | unlearn_loss: 18.5 | retain_loss: 0.5039 | param_change: 0.0002213\n",
      " 42%|█████████████████▏                       | 168/400 [04:29<06:25,  1.66s/it]loss: 16.25 | unlearn_loss: 15.19 | retain_loss: 1.109 | param_change: 0.0006676\n",
      " 42%|█████████████████▎                       | 169/400 [04:30<06:00,  1.56s/it]loss: 19 | unlearn_loss: 18.12 | retain_loss: 0.8633 | param_change: 0.0005608\n",
      " 42%|█████████████████▍                       | 170/400 [04:32<06:04,  1.58s/it]loss: 22 | unlearn_loss: 21.88 | retain_loss: 0.09082 | param_change: 1.824e-05\n",
      " 43%|█████████████████▌                       | 171/400 [04:33<06:06,  1.60s/it]loss: 18.5 | unlearn_loss: 18.38 | retain_loss: 0.08887 | param_change: 1.562e-05\n",
      " 43%|█████████████████▋                       | 172/400 [04:35<06:26,  1.69s/it]loss: 16.75 | unlearn_loss: 15.44 | retain_loss: 1.289 | param_change: 0.0005455\n",
      " 43%|█████████████████▋                       | 173/400 [04:37<06:08,  1.62s/it]loss: 19.5 | unlearn_loss: 18.5 | retain_loss: 0.9648 | param_change: 0.0004559\n",
      " 44%|█████████████████▊                       | 174/400 [04:38<06:13,  1.65s/it]loss: 19.62 | unlearn_loss: 18.88 | retain_loss: 0.7539 | param_change: 0.0003262\n",
      " 44%|█████████████████▉                       | 175/400 [04:40<05:47,  1.54s/it]loss: 20.38 | unlearn_loss: 19.62 | retain_loss: 0.8008 | param_change: 0.0004997\n",
      " 44%|██████████████████                       | 176/400 [04:41<05:47,  1.55s/it]loss: 18.25 | unlearn_loss: 18.12 | retain_loss: 0.1221 | param_change: 4.077e-05\n",
      " 44%|██████████████████▏                      | 177/400 [04:42<05:29,  1.48s/it]/data/long_phan/wmdp/wmdp/wmdp/cut/unlearn.py:68: UserWarning: Using a target size (torch.Size([1, 1, 7168])) that is different to the input size (torch.Size([2, 420, 7168])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.\n",
      "  unlearn_loss = torch.nn.functional.mse_loss(\n",
      "loss: 24.5 | unlearn_loss: 24.38 | retain_loss: 0.1289 | param_change: 5.674e-05\n",
      " 44%|██████████████████▏                      | 178/400 [04:44<05:09,  1.40s/it]loss: 18.88 | unlearn_loss: 18.62 | retain_loss: 0.2402 | param_change: 8.011e-05\n",
      " 45%|██████████████████▎                      | 179/400 [04:45<05:20,  1.45s/it]loss: 19.12 | unlearn_loss: 18.88 | retain_loss: 0.2412 | param_change: 9.394e-05\n",
      " 45%|██████████████████▍                      | 180/400 [04:47<05:46,  1.58s/it]loss: 20 | unlearn_loss: 18.62 | retain_loss: 1.328 | param_change: 0.0004272\n",
      " 45%|██████████████████▌                      | 181/400 [04:49<05:31,  1.51s/it]loss: 20 | unlearn_loss: 18.75 | retain_loss: 1.273 | param_change: 0.0004158\n",
      " 46%|██████████████████▋                      | 182/400 [04:50<05:37,  1.55s/it]loss: 22.5 | unlearn_loss: 21.12 | retain_loss: 1.391 | param_change: 0.0009155\n",
      " 46%|██████████████████▊                      | 183/400 [04:51<05:19,  1.47s/it]loss: 19.5 | unlearn_loss: 18.75 | retain_loss: 0.7266 | param_change: 0.0005569\n",
      " 46%|██████████████████▊                      | 184/400 [04:53<05:27,  1.52s/it]loss: 19 | unlearn_loss: 18.88 | retain_loss: 0.1777 | param_change: 4.387e-05\n",
      " 46%|██████████████████▉                      | 185/400 [04:54<05:21,  1.49s/it]loss: 19 | unlearn_loss: 18.88 | retain_loss: 0.167 | param_change: 3.815e-05\n",
      " 46%|███████████████████                      | 186/400 [04:56<05:37,  1.58s/it]loss: 22.5 | unlearn_loss: 18.38 | retain_loss: 4.094 | param_change: 0.001755\n",
      " 47%|███████████████████▏                     | 187/400 [04:58<05:19,  1.50s/it]loss: 21.75 | unlearn_loss: 18.62 | retain_loss: 3.094 | param_change: 0.001373\n",
      " 47%|███████████████████▎                     | 188/400 [04:59<05:21,  1.52s/it]loss: 15.19 | unlearn_loss: 15 | retain_loss: 0.1787 | param_change: 5.937e-05\n",
      " 47%|███████████████████▎                     | 189/400 [05:01<05:29,  1.56s/it]loss: 18.88 | unlearn_loss: 18.75 | retain_loss: 0.1631 | param_change: 3.672e-05\n",
      " 48%|███████████████████▍                     | 190/400 [05:03<05:52,  1.68s/it]loss: 20.5 | unlearn_loss: 18.12 | retain_loss: 2.406 | param_change: 0.001076\n",
      " 48%|███████████████████▌                     | 191/400 [05:04<05:31,  1.59s/it]loss: 21.75 | unlearn_loss: 19.5 | retain_loss: 2.312 | param_change: 0.001038\n",
      " 48%|███████████████████▋                     | 192/400 [05:06<05:37,  1.62s/it]loss: 18.12 | unlearn_loss: 18 | retain_loss: 0.1436 | param_change: 5.436e-05\n",
      " 48%|███████████████████▊                     | 193/400 [05:08<05:47,  1.68s/it]loss: 19.12 | unlearn_loss: 19 | retain_loss: 0.1416 | param_change: 4.911e-05\n",
      " 48%|███████████████████▉                     | 194/400 [05:10<06:08,  1.79s/it]loss: 21.5 | unlearn_loss: 21.25 | retain_loss: 0.2002 | param_change: 7.486e-05\n",
      " 49%|███████████████████▉                     | 195/400 [05:11<06:01,  1.76s/it]loss: 19.12 | unlearn_loss: 18.88 | retain_loss: 0.1914 | param_change: 6.294e-05\n",
      " 49%|████████████████████                     | 196/400 [05:13<06:16,  1.85s/it]loss: 16.38 | unlearn_loss: 15.69 | retain_loss: 0.7031 | param_change: 0.0003242\n",
      " 49%|████████████████████▏                    | 197/400 [05:15<05:56,  1.76s/it]loss: 19.62 | unlearn_loss: 19 | retain_loss: 0.6016 | param_change: 0.000288\n",
      " 50%|████████████████████▎                    | 198/400 [05:17<06:06,  1.81s/it]loss: 19.5 | unlearn_loss: 18.5 | retain_loss: 1.008 | param_change: 0.0004883\n",
      " 50%|████████████████████▍                    | 199/400 [05:18<05:27,  1.63s/it]loss: 20.38 | unlearn_loss: 19.75 | retain_loss: 0.625 | param_change: 0.0002842\n",
      " 50%|████████████████████▌                    | 200/400 [05:20<05:20,  1.60s/it]loss: 19.5 | unlearn_loss: 17.75 | retain_loss: 1.812 | param_change: 0.0008583\n",
      " 50%|████████████████████▌                    | 201/400 [05:21<05:03,  1.52s/it]loss: 19.38 | unlearn_loss: 17.5 | retain_loss: 1.828 | param_change: 0.0008812\n",
      " 50%|████████████████████▋                    | 202/400 [05:23<05:12,  1.58s/it]loss: 17.75 | unlearn_loss: 17.62 | retain_loss: 0.1611 | param_change: 9.537e-05\n",
      " 51%|████████████████████▊                    | 203/400 [05:24<04:53,  1.49s/it]loss: 19.25 | unlearn_loss: 19.12 | retain_loss: 0.1523 | param_change: 6.962e-05\n",
      " 51%|████████████████████▉                    | 204/400 [05:26<04:56,  1.51s/it]loss: 16.62 | unlearn_loss: 15.19 | retain_loss: 1.391 | param_change: 0.0007362\n",
      " 51%|█████████████████████                    | 205/400 [05:27<04:39,  1.43s/it]loss: 19.88 | unlearn_loss: 19 | retain_loss: 0.9336 | param_change: 0.0005112\n",
      " 52%|█████████████████████                    | 206/400 [05:28<04:44,  1.47s/it]loss: 18 | unlearn_loss: 17.38 | retain_loss: 0.5898 | param_change: 0.0001936\n",
      " 52%|█████████████████████▏                   | 207/400 [05:30<04:41,  1.46s/it]loss: 19.88 | unlearn_loss: 19.38 | retain_loss: 0.5547 | param_change: 0.0002089\n",
      " 52%|█████████████████████▎                   | 208/400 [05:32<04:56,  1.55s/it]loss: 14.94 | unlearn_loss: 14.62 | retain_loss: 0.3242 | param_change: 0.000164\n",
      " 52%|█████████████████████▍                   | 209/400 [05:33<05:04,  1.60s/it]loss: 19.88 | unlearn_loss: 19.5 | retain_loss: 0.3301 | param_change: 0.0001612\n",
      " 52%|█████████████████████▌                   | 210/400 [05:35<05:33,  1.76s/it]loss: 21.25 | unlearn_loss: 21.25 | retain_loss: 0.04395 | param_change: 3.338e-05\n",
      " 53%|█████████████████████▋                   | 211/400 [05:37<05:10,  1.64s/it]loss: 19.62 | unlearn_loss: 19.62 | retain_loss: 0.04346 | param_change: 1.234e-05\n",
      " 53%|█████████████████████▋                   | 212/400 [05:38<05:08,  1.64s/it]loss: 21.5 | unlearn_loss: 21.25 | retain_loss: 0.1914 | param_change: 0.0001316\n",
      " 53%|█████████████████████▊                   | 213/400 [05:40<04:59,  1.60s/it]loss: 18.88 | unlearn_loss: 18.75 | retain_loss: 0.1846 | param_change: 7.153e-05\n",
      " 54%|█████████████████████▉                   | 214/400 [05:42<05:18,  1.71s/it]loss: 21.12 | unlearn_loss: 20.75 | retain_loss: 0.3418 | param_change: 0.0002174\n",
      " 54%|██████████████████████                   | 215/400 [05:43<05:06,  1.65s/it]loss: 19.12 | unlearn_loss: 18.75 | retain_loss: 0.3262 | param_change: 0.0001812\n",
      " 54%|██████████████████████▏                  | 216/400 [05:45<05:14,  1.71s/it]loss: 17.75 | unlearn_loss: 17 | retain_loss: 0.7422 | param_change: 0.0003052\n",
      " 54%|██████████████████████▏                  | 217/400 [05:47<04:57,  1.63s/it]loss: 19.5 | unlearn_loss: 18.75 | retain_loss: 0.6992 | param_change: 0.0002575\n",
      " 55%|██████████████████████▎                  | 218/400 [05:49<05:15,  1.73s/it]loss: 18.12 | unlearn_loss: 18 | retain_loss: 0.1128 | param_change: 6.199e-05\n",
      " 55%|██████████████████████▍                  | 219/400 [05:50<04:51,  1.61s/it]loss: 18.62 | unlearn_loss: 18.5 | retain_loss: 0.106 | param_change: 3.886e-05\n",
      " 55%|██████████████████████▌                  | 220/400 [05:52<05:09,  1.72s/it]loss: 17.12 | unlearn_loss: 17 | retain_loss: 0.1484 | param_change: 0.0001268\n",
      " 55%|██████████████████████▋                  | 221/400 [05:53<04:46,  1.60s/it]loss: 18.62 | unlearn_loss: 18.5 | retain_loss: 0.1436 | param_change: 4.458e-05\n",
      " 56%|██████████████████████▊                  | 222/400 [05:55<04:45,  1.61s/it]loss: 17 | unlearn_loss: 14.38 | retain_loss: 2.578 | param_change: 0.001244\n",
      " 56%|██████████████████████▊                  | 223/400 [05:56<04:29,  1.52s/it]loss: 20.12 | unlearn_loss: 18.62 | retain_loss: 1.531 | param_change: 0.0008278\n",
      " 56%|██████████████████████▉                  | 224/400 [05:58<04:35,  1.56s/it]loss: 15.38 | unlearn_loss: 14.69 | retain_loss: 0.6719 | param_change: 0.0005074\n",
      " 56%|███████████████████████                  | 225/400 [05:59<04:27,  1.53s/it]loss: 20.12 | unlearn_loss: 19.25 | retain_loss: 0.918 | param_change: 0.0003662\n",
      " 56%|███████████████████████▏                 | 226/400 [06:01<04:38,  1.60s/it]loss: 12.25 | unlearn_loss: 11.56 | retain_loss: 0.6641 | param_change: 0.0002747\n",
      " 57%|███████████████████████▎                 | 227/400 [06:02<04:23,  1.52s/it]loss: 19.5 | unlearn_loss: 18.88 | retain_loss: 0.5977 | param_change: 0.0002317\n",
      " 57%|███████████████████████▎                 | 228/400 [06:04<04:31,  1.58s/it]loss: 8.812 | unlearn_loss: 8.312 | retain_loss: 0.5078 | param_change: 0.0005074\n",
      " 57%|███████████████████████▍                 | 229/400 [06:06<04:18,  1.51s/it]loss: 19.88 | unlearn_loss: 19.5 | retain_loss: 0.373 | param_change: 0.0002155\n",
      " 57%|███████████████████████▌                 | 230/400 [06:07<04:23,  1.55s/it]loss: 12.38 | unlearn_loss: 11.81 | retain_loss: 0.582 | param_change: 0.0007706\n",
      " 58%|███████████████████████▋                 | 231/400 [06:09<04:26,  1.58s/it]loss: 19.88 | unlearn_loss: 19.5 | retain_loss: 0.373 | param_change: 0.0002108\n",
      " 58%|███████████████████████▊                 | 232/400 [06:11<04:49,  1.73s/it]loss: 20.38 | unlearn_loss: 15.56 | retain_loss: 4.812 | param_change: 0.001427\n",
      " 58%|███████████████████████▉                 | 233/400 [06:13<04:44,  1.71s/it]loss: 22.88 | unlearn_loss: 19.62 | retain_loss: 3.297 | param_change: 0.001144\n",
      " 58%|███████████████████████▉                 | 234/400 [06:14<04:54,  1.77s/it]loss: 11.25 | unlearn_loss: 9.688 | retain_loss: 1.57 | param_change: 0.0009727\n",
      " 59%|████████████████████████                 | 235/400 [06:16<04:36,  1.68s/it]loss: 21.62 | unlearn_loss: 19.38 | retain_loss: 2.219 | param_change: 0.0007629\n",
      " 59%|████████████████████████▏                | 236/400 [06:18<04:40,  1.71s/it]loss: 12.06 | unlearn_loss: 10.5 | retain_loss: 1.562 | param_change: 0.0006027\n",
      " 59%|████████████████████████▎                | 237/400 [06:19<04:27,  1.64s/it]loss: 20.5 | unlearn_loss: 19.25 | retain_loss: 1.266 | param_change: 0.0004807\n",
      " 60%|████████████████████████▍                | 238/400 [06:21<04:33,  1.69s/it]loss: 10 | unlearn_loss: 8.812 | retain_loss: 1.188 | param_change: 0.0007935\n",
      " 60%|████████████████████████▍                | 239/400 [06:23<04:24,  1.64s/it]loss: 20.25 | unlearn_loss: 19.12 | retain_loss: 1.156 | param_change: 0.0006371\n",
      " 60%|████████████████████████▌                | 240/400 [06:24<04:34,  1.71s/it]loss: 16.5 | unlearn_loss: 16.25 | retain_loss: 0.2246 | param_change: 0.0003757\n",
      " 60%|████████████████████████▋                | 241/400 [06:26<04:24,  1.67s/it]loss: 19.38 | unlearn_loss: 19.12 | retain_loss: 0.2119 | param_change: 0.0001011\n",
      " 60%|████████████████████████▊                | 242/400 [06:28<04:33,  1.73s/it]loss: 13.31 | unlearn_loss: 13.19 | retain_loss: 0.09473 | param_change: 0.0001535\n",
      " 61%|████████████████████████▉                | 243/400 [06:29<04:22,  1.67s/it]loss: 19.5 | unlearn_loss: 19.38 | retain_loss: 0.105 | param_change: 3.719e-05\n",
      " 61%|█████████████████████████                | 244/400 [06:31<04:30,  1.73s/it]loss: 6.406 | unlearn_loss: 5.656 | retain_loss: 0.7422 | param_change: 0.0004444\n",
      " 61%|█████████████████████████                | 245/400 [06:33<04:15,  1.65s/it]loss: 19.62 | unlearn_loss: 19 | retain_loss: 0.5859 | param_change: 0.0002537\n",
      " 62%|█████████████████████████▏               | 246/400 [06:34<04:17,  1.67s/it]loss: 5.406 | unlearn_loss: 5 | retain_loss: 0.4121 | param_change: 0.00037\n",
      " 62%|█████████████████████████▎               | 247/400 [06:36<04:07,  1.62s/it]loss: 19.12 | unlearn_loss: 18.62 | retain_loss: 0.4512 | param_change: 0.0002279\n",
      " 62%|█████████████████████████▍               | 248/400 [06:38<04:14,  1.68s/it]loss: 8.125 | unlearn_loss: 7.281 | retain_loss: 0.8125 | param_change: 0.0005264\n",
      " 62%|█████████████████████████▌               | 249/400 [06:39<03:55,  1.56s/it]loss: 18.88 | unlearn_loss: 18.12 | retain_loss: 0.7461 | param_change: 0.0004654\n",
      " 62%|█████████████████████████▋               | 250/400 [06:41<03:54,  1.56s/it]loss: 11.38 | unlearn_loss: 11.12 | retain_loss: 0.2285 | param_change: 0.0002518\n",
      " 63%|█████████████████████████▋               | 251/400 [06:42<03:47,  1.52s/it]loss: 19.62 | unlearn_loss: 19.38 | retain_loss: 0.1973 | param_change: 0.0001144\n",
      " 63%|█████████████████████████▊               | 252/400 [06:44<03:53,  1.58s/it]loss: 9.125 | unlearn_loss: 8.75 | retain_loss: 0.3789 | param_change: 0.0003262\n",
      " 63%|█████████████████████████▉               | 253/400 [06:45<03:47,  1.55s/it]loss: 16.5 | unlearn_loss: 16.12 | retain_loss: 0.3613 | param_change: 0.0001187\n",
      " 64%|██████████████████████████               | 254/400 [06:47<03:54,  1.61s/it]loss: 5.906 | unlearn_loss: 5.75 | retain_loss: 0.1436 | param_change: 0.0004444\n",
      " 64%|██████████████████████████▏              | 255/400 [06:48<03:44,  1.55s/it]loss: 19.12 | unlearn_loss: 19 | retain_loss: 0.1216 | param_change: 7.105e-05\n",
      " 64%|██████████████████████████▏              | 256/400 [06:50<03:54,  1.63s/it]loss: 10.5 | unlearn_loss: 10.38 | retain_loss: 0.1143 | param_change: 5.984e-05\n",
      " 64%|██████████████████████████▎              | 257/400 [06:52<03:42,  1.55s/it]loss: 19.12 | unlearn_loss: 19 | retain_loss: 0.1113 | param_change: 3.743e-05\n",
      " 64%|██████████████████████████▍              | 258/400 [06:53<03:47,  1.60s/it]loss: 7.75 | unlearn_loss: 7.625 | retain_loss: 0.1157 | param_change: 8.202e-05\n",
      " 65%|██████████████████████████▌              | 259/400 [06:55<03:43,  1.59s/it]loss: 18.88 | unlearn_loss: 18.75 | retain_loss: 0.1162 | param_change: 4.911e-05\n",
      " 65%|██████████████████████████▋              | 260/400 [06:57<03:55,  1.68s/it]loss: 10.5 | unlearn_loss: 10 | retain_loss: 0.5 | param_change: 0.0002899\n",
      " 65%|██████████████████████████▊              | 261/400 [06:58<03:39,  1.58s/it]loss: 19.38 | unlearn_loss: 18.88 | retain_loss: 0.4863 | param_change: 0.0002747\n",
      " 66%|██████████████████████████▊              | 262/400 [07:00<03:42,  1.61s/it]loss: 7.438 | unlearn_loss: 7.344 | retain_loss: 0.1074 | param_change: 0.0001106\n",
      " 66%|██████████████████████████▉              | 263/400 [07:01<03:29,  1.53s/it]loss: 18.75 | unlearn_loss: 18.62 | retain_loss: 0.106 | param_change: 6.485e-05\n",
      " 66%|███████████████████████████              | 264/400 [07:03<03:31,  1.56s/it]loss: 4.75 | unlearn_loss: 4.562 | retain_loss: 0.1719 | param_change: 0.0002613\n",
      " 66%|███████████████████████████▏             | 265/400 [07:04<03:21,  1.50s/it]loss: 18.38 | unlearn_loss: 18.25 | retain_loss: 0.165 | param_change: 0.0001063\n",
      " 66%|███████████████████████████▎             | 266/400 [07:06<03:28,  1.56s/it]loss: 10.75 | unlearn_loss: 10.38 | retain_loss: 0.4062 | param_change: 0.0002375\n",
      " 67%|███████████████████████████▎             | 267/400 [07:07<03:21,  1.51s/it]loss: 19 | unlearn_loss: 18.62 | retain_loss: 0.3418 | param_change: 0.0001926\n",
      " 67%|███████████████████████████▍             | 268/400 [07:09<03:30,  1.60s/it]loss: 12.12 | unlearn_loss: 12.06 | retain_loss: 0.09082 | param_change: 9.012e-05\n",
      " 67%|███████████████████████████▌             | 269/400 [07:10<03:25,  1.57s/it]loss: 17.75 | unlearn_loss: 17.62 | retain_loss: 0.09424 | param_change: 0.0001202\n",
      " 68%|███████████████████████████▋             | 270/400 [07:12<03:38,  1.68s/it]loss: 4.906 | unlearn_loss: 4.344 | retain_loss: 0.5547 | param_change: 0.0003681\n",
      " 68%|███████████████████████████▊             | 271/400 [07:14<03:17,  1.53s/it]loss: 17.88 | unlearn_loss: 17.38 | retain_loss: 0.5391 | param_change: 0.0002422\n",
      " 68%|███████████████████████████▉             | 272/400 [07:15<03:21,  1.57s/it]loss: 7.312 | unlearn_loss: 7.125 | retain_loss: 0.1963 | param_change: 0.0002613\n",
      " 68%|███████████████████████████▉             | 273/400 [07:17<03:19,  1.57s/it]loss: 17.12 | unlearn_loss: 17 | retain_loss: 0.1816 | param_change: 0.0001898\n",
      " 68%|████████████████████████████             | 274/400 [07:19<03:35,  1.71s/it]loss: 4.312 | unlearn_loss: 4.156 | retain_loss: 0.1592 | param_change: 0.000104\n",
      " 69%|████████████████████████████▏            | 275/400 [07:21<03:32,  1.70s/it]loss: 15.75 | unlearn_loss: 15.56 | retain_loss: 0.1631 | param_change: 0.0002613\n",
      " 69%|████████████████████████████▎            | 276/400 [07:23<03:53,  1.88s/it]loss: 4.062 | unlearn_loss: 3.922 | retain_loss: 0.1289 | param_change: 0.0001364\n",
      " 69%|████████████████████████████▍            | 277/400 [07:24<03:41,  1.80s/it]loss: 14.75 | unlearn_loss: 14.62 | retain_loss: 0.1279 | param_change: 0.0002613\n",
      " 70%|████████████████████████████▍            | 278/400 [07:26<03:41,  1.82s/it]loss: 15.62 | unlearn_loss: 15.31 | retain_loss: 0.3184 | param_change: 0.0002804\n",
      " 70%|████████████████████████████▌            | 279/400 [07:28<03:26,  1.71s/it]loss: 14.75 | unlearn_loss: 14.44 | retain_loss: 0.293 | param_change: 0.0003986\n",
      " 70%|████████████████████████████▋            | 280/400 [07:30<03:27,  1.73s/it]loss: 7.5 | unlearn_loss: 7.375 | retain_loss: 0.1216 | param_change: 6.962e-05\n",
      " 70%|████████████████████████████▊            | 281/400 [07:31<03:11,  1.61s/it]loss: 14.12 | unlearn_loss: 14 | retain_loss: 0.1221 | param_change: 0.0005341\n",
      " 70%|████████████████████████████▉            | 282/400 [07:33<03:14,  1.64s/it]loss: 7.5 | unlearn_loss: 6.688 | retain_loss: 0.8164 | param_change: 0.0002785\n",
      " 71%|█████████████████████████████            | 283/400 [07:34<02:58,  1.53s/it]loss: 12.25 | unlearn_loss: 11.5 | retain_loss: 0.7383 | param_change: 0.0003433\n",
      " 71%|█████████████████████████████            | 284/400 [07:35<02:55,  1.51s/it]loss: 8.375 | unlearn_loss: 7.406 | retain_loss: 0.957 | param_change: 0.0005608\n",
      " 71%|█████████████████████████████▏           | 285/400 [07:37<02:50,  1.48s/it]loss: 14.5 | unlearn_loss: 13.69 | retain_loss: 0.7891 | param_change: 0.0006866\n",
      " 72%|█████████████████████████████▎           | 286/400 [07:38<02:55,  1.54s/it]loss: 7.375 | unlearn_loss: 7.25 | retain_loss: 0.1299 | param_change: 9.918e-05\n",
      " 72%|█████████████████████████████▍           | 287/400 [07:40<02:48,  1.49s/it]loss: 11.81 | unlearn_loss: 11.69 | retain_loss: 0.1338 | param_change: 0.00033\n",
      " 72%|█████████████████████████████▌           | 288/400 [07:41<02:53,  1.55s/it]loss: 8.438 | unlearn_loss: 7.031 | retain_loss: 1.398 | param_change: 0.001038\n",
      " 72%|█████████████████████████████▌           | 289/400 [07:43<02:44,  1.48s/it]loss: 11.56 | unlearn_loss: 10.38 | retain_loss: 1.18 | param_change: 0.0007133\n",
      " 72%|█████████████████████████████▋           | 290/400 [07:44<02:47,  1.52s/it]loss: 11.38 | unlearn_loss: 10.88 | retain_loss: 0.5195 | param_change: 0.0001345\n",
      " 73%|█████████████████████████████▊           | 291/400 [07:46<02:37,  1.45s/it]loss: 11 | unlearn_loss: 10.5 | retain_loss: 0.4785 | param_change: 0.0002651\n",
      " 73%|█████████████████████████████▉           | 292/400 [07:47<02:42,  1.51s/it]loss: 18.12 | unlearn_loss: 16.62 | retain_loss: 1.531 | param_change: 0.0006714\n",
      " 73%|██████████████████████████████           | 293/400 [07:49<02:30,  1.41s/it]loss: 10.62 | unlearn_loss: 9.375 | retain_loss: 1.242 | param_change: 0.0006599\n",
      " 74%|██████████████████████████████▏          | 294/400 [07:50<02:32,  1.44s/it]loss: 3.969 | unlearn_loss: 3.688 | retain_loss: 0.2852 | param_change: 0.0001063\n",
      " 74%|██████████████████████████████▏          | 295/400 [07:51<02:31,  1.44s/it]loss: 9.75 | unlearn_loss: 9.5 | retain_loss: 0.2715 | param_change: 0.0003986\n",
      " 74%|██████████████████████████████▎          | 296/400 [07:53<02:40,  1.54s/it]loss: 8.25 | unlearn_loss: 6.688 | retain_loss: 1.562 | param_change: 0.0008125\n",
      " 74%|██████████████████████████████▍          | 297/400 [07:55<02:34,  1.50s/it]loss: 9.312 | unlearn_loss: 8.188 | retain_loss: 1.102 | param_change: 0.0007973\n",
      " 74%|██████████████████████████████▌          | 298/400 [07:56<02:40,  1.57s/it]loss: 6.406 | unlearn_loss: 6.062 | retain_loss: 0.3359 | param_change: 0.0001307\n",
      " 75%|██████████████████████████████▋          | 299/400 [07:58<02:34,  1.52s/it]loss: 8.75 | unlearn_loss: 8.375 | retain_loss: 0.4023 | param_change: 0.0004539\n",
      " 75%|██████████████████████████████▊          | 300/400 [07:59<02:36,  1.57s/it]loss: 4.938 | unlearn_loss: 3.906 | retain_loss: 1.039 | param_change: 0.0003242\n",
      " 75%|██████████████████████████████▊          | 301/400 [08:01<02:27,  1.49s/it]loss: 8.688 | unlearn_loss: 7.688 | retain_loss: 0.9883 | param_change: 0.0004597\n",
      " 76%|██████████████████████████████▉          | 302/400 [08:02<02:29,  1.53s/it]loss: 11.31 | unlearn_loss: 10.56 | retain_loss: 0.7461 | param_change: 0.0004292\n",
      " 76%|███████████████████████████████          | 303/400 [08:04<02:27,  1.52s/it]loss: 8.188 | unlearn_loss: 7.531 | retain_loss: 0.6289 | param_change: 0.0004272\n",
      " 76%|███████████████████████████████▏         | 304/400 [08:06<02:35,  1.62s/it]loss: 4.406 | unlearn_loss: 3.797 | retain_loss: 0.6094 | param_change: 0.0002203\n",
      " 76%|███████████████████████████████▎         | 305/400 [08:08<02:39,  1.68s/it]loss: 10.88 | unlearn_loss: 10.31 | retain_loss: 0.5625 | param_change: 0.0002995\n",
      " 76%|███████████████████████████████▎         | 306/400 [08:10<02:47,  1.78s/it]loss: 5.594 | unlearn_loss: 4.906 | retain_loss: 0.6953 | param_change: 0.0003185\n",
      " 77%|███████████████████████████████▍         | 307/400 [08:11<02:31,  1.63s/it]loss: 7.781 | unlearn_loss: 7.094 | retain_loss: 0.6875 | param_change: 0.0002689\n",
      " 77%|███████████████████████████████▌         | 308/400 [08:12<02:28,  1.61s/it]loss: 7.188 | unlearn_loss: 6.688 | retain_loss: 0.5117 | param_change: 0.0001822\n",
      " 77%|███████████████████████████████▋         | 309/400 [08:14<02:19,  1.53s/it]loss: 7.688 | unlearn_loss: 7.219 | retain_loss: 0.4531 | param_change: 0.0001411\n",
      " 78%|███████████████████████████████▊         | 310/400 [08:15<02:22,  1.58s/it]loss: 3.812 | unlearn_loss: 3.25 | retain_loss: 0.5625 | param_change: 0.0001621\n",
      " 78%|███████████████████████████████▉         | 311/400 [08:17<02:11,  1.47s/it]loss: 7.375 | unlearn_loss: 6.906 | retain_loss: 0.4805 | param_change: 0.0001488\n",
      " 78%|███████████████████████████████▉         | 312/400 [08:18<02:14,  1.52s/it]loss: 3.875 | unlearn_loss: 3.469 | retain_loss: 0.4062 | param_change: 0.0001793\n",
      " 78%|████████████████████████████████         | 313/400 [08:20<02:07,  1.47s/it]loss: 7.531 | unlearn_loss: 7.156 | retain_loss: 0.3789 | param_change: 0.0001965\n",
      " 78%|████████████████████████████████▏        | 314/400 [08:21<02:10,  1.52s/it]loss: 12.31 | unlearn_loss: 11.75 | retain_loss: 0.543 | param_change: 0.0002556\n",
      " 79%|████████████████████████████████▎        | 315/400 [08:23<02:00,  1.42s/it]loss: 8.312 | unlearn_loss: 7.906 | retain_loss: 0.3848 | param_change: 0.000145\n",
      " 79%|████████████████████████████████▍        | 316/400 [08:24<02:02,  1.46s/it]loss: 9.625 | unlearn_loss: 9.25 | retain_loss: 0.3789 | param_change: 0.000227\n",
      " 79%|████████████████████████████████▍        | 317/400 [08:25<01:53,  1.37s/it]loss: 7.031 | unlearn_loss: 6.75 | retain_loss: 0.2773 | param_change: 0.0001307\n",
      " 80%|████████████████████████████████▌        | 318/400 [08:27<01:55,  1.41s/it]loss: 11.81 | unlearn_loss: 10.5 | retain_loss: 1.328 | param_change: 0.0006027\n",
      " 80%|████████████████████████████████▋        | 319/400 [08:28<01:50,  1.36s/it]loss: 7.781 | unlearn_loss: 6.781 | retain_loss: 0.9883 | param_change: 0.0004997\n",
      " 80%|████████████████████████████████▊        | 320/400 [08:30<02:04,  1.56s/it]loss: 7.094 | unlearn_loss: 6.844 | retain_loss: 0.2637 | param_change: 8.678e-05\n",
      " 80%|████████████████████████████████▉        | 321/400 [08:31<01:55,  1.46s/it]loss: 7.469 | unlearn_loss: 7.25 | retain_loss: 0.2285 | param_change: 0.000124\n",
      " 80%|█████████████████████████████████        | 322/400 [08:33<01:58,  1.52s/it]loss: 6.656 | unlearn_loss: 6.438 | retain_loss: 0.2285 | param_change: 7.82e-05\n",
      " 81%|█████████████████████████████████        | 323/400 [08:34<01:50,  1.43s/it]loss: 7.156 | unlearn_loss: 6.938 | retain_loss: 0.2041 | param_change: 8.059e-05\n",
      " 81%|█████████████████████████████████▏       | 324/400 [08:36<01:55,  1.52s/it]loss: 8 | unlearn_loss: 7.781 | retain_loss: 0.2178 | param_change: 9.918e-05\n",
      " 81%|█████████████████████████████████▎       | 325/400 [08:37<01:47,  1.43s/it]loss: 7.125 | unlearn_loss: 6.906 | retain_loss: 0.207 | param_change: 9.918e-05\n",
      " 82%|█████████████████████████████████▍       | 326/400 [08:39<01:56,  1.57s/it]loss: 6.438 | unlearn_loss: 6.062 | retain_loss: 0.373 | param_change: 0.0001831\n",
      " 82%|█████████████████████████████████▌       | 327/400 [08:40<01:47,  1.47s/it]loss: 7.188 | unlearn_loss: 6.844 | retain_loss: 0.3477 | param_change: 0.0001822\n",
      " 82%|█████████████████████████████████▌       | 328/400 [08:42<01:48,  1.51s/it]loss: 7.625 | unlearn_loss: 7.406 | retain_loss: 0.2256 | param_change: 0.0001187\n",
      " 82%|█████████████████████████████████▋       | 329/400 [08:43<01:41,  1.42s/it]loss: 7.906 | unlearn_loss: 7.719 | retain_loss: 0.1973 | param_change: 0.0001111\n",
      " 82%|█████████████████████████████████▊       | 330/400 [08:45<01:44,  1.49s/it]loss: 9.812 | unlearn_loss: 9 | retain_loss: 0.8164 | param_change: 0.0006599\n",
      " 83%|█████████████████████████████████▉       | 331/400 [08:46<01:36,  1.40s/it]loss: 7.469 | unlearn_loss: 6.844 | retain_loss: 0.6289 | param_change: 0.0004044\n",
      " 83%|██████████████████████████████████       | 332/400 [08:48<01:40,  1.48s/it]loss: 10 | unlearn_loss: 9.562 | retain_loss: 0.4375 | param_change: 0.0002365\n",
      " 83%|██████████████████████████████████▏      | 333/400 [08:49<01:33,  1.40s/it]loss: 7.406 | unlearn_loss: 6.969 | retain_loss: 0.4512 | param_change: 0.0002956\n",
      " 84%|██████████████████████████████████▏      | 334/400 [08:50<01:36,  1.46s/it]loss: 3.5 | unlearn_loss: 3.156 | retain_loss: 0.3398 | param_change: 0.0001678\n",
      " 84%|██████████████████████████████████▎      | 335/400 [08:52<01:30,  1.39s/it]loss: 6.469 | unlearn_loss: 6.219 | retain_loss: 0.2539 | param_change: 0.0001602\n",
      " 84%|██████████████████████████████████▍      | 336/400 [08:53<01:34,  1.47s/it]loss: 6.781 | unlearn_loss: 6.656 | retain_loss: 0.1123 | param_change: 5.484e-05\n",
      " 84%|██████████████████████████████████▌      | 337/400 [08:54<01:28,  1.40s/it]loss: 7.031 | unlearn_loss: 6.938 | retain_loss: 0.09668 | param_change: 5.841e-05\n",
      " 84%|██████████████████████████████████▋      | 338/400 [08:56<01:30,  1.46s/it]loss: 3.938 | unlearn_loss: 3.594 | retain_loss: 0.3496 | param_change: 0.0001831\n",
      " 85%|██████████████████████████████████▋      | 339/400 [08:57<01:24,  1.39s/it]loss: 8.125 | unlearn_loss: 7.812 | retain_loss: 0.3008 | param_change: 0.0001593\n",
      " 85%|██████████████████████████████████▊      | 340/400 [08:59<01:28,  1.48s/it]loss: 4.156 | unlearn_loss: 3.875 | retain_loss: 0.2695 | param_change: 0.000145\n",
      " 85%|██████████████████████████████████▉      | 341/400 [09:00<01:22,  1.40s/it]loss: 6.688 | unlearn_loss: 6.469 | retain_loss: 0.2227 | param_change: 0.0001101\n",
      " 86%|███████████████████████████████████      | 342/400 [09:02<01:26,  1.48s/it]loss: 3.859 | unlearn_loss: 3.641 | retain_loss: 0.2188 | param_change: 0.0001268\n",
      " 86%|███████████████████████████████████▏     | 343/400 [09:03<01:19,  1.39s/it]loss: 6.875 | unlearn_loss: 6.75 | retain_loss: 0.123 | param_change: 8.202e-05\n",
      " 86%|███████████████████████████████████▎     | 344/400 [09:05<01:21,  1.45s/it]loss: 3.859 | unlearn_loss: 3.328 | retain_loss: 0.5273 | param_change: 0.0002155\n",
      " 86%|███████████████████████████████████▎     | 345/400 [09:06<01:16,  1.39s/it]loss: 7.406 | unlearn_loss: 6.969 | retain_loss: 0.4512 | param_change: 0.0001812\n",
      " 86%|███████████████████████████████████▍     | 346/400 [09:07<01:17,  1.44s/it]loss: 7.688 | unlearn_loss: 7.344 | retain_loss: 0.3418 | param_change: 0.0001101\n",
      " 87%|███████████████████████████████████▌     | 347/400 [09:09<01:18,  1.47s/it]loss: 8.75 | unlearn_loss: 8.438 | retain_loss: 0.332 | param_change: 0.0001268\n",
      " 87%|███████████████████████████████████▋     | 348/400 [09:11<01:23,  1.60s/it]loss: 7.438 | unlearn_loss: 6.188 | retain_loss: 1.25 | param_change: 0.0006828\n",
      " 87%|███████████████████████████████████▊     | 349/400 [09:12<01:16,  1.50s/it]loss: 7.688 | unlearn_loss: 6.688 | retain_loss: 1.008 | param_change: 0.0005875\n",
      " 88%|███████████████████████████████████▉     | 350/400 [09:14<01:16,  1.53s/it]loss: 6.656 | unlearn_loss: 6.156 | retain_loss: 0.5117 | param_change: 0.0003681\n",
      " 88%|███████████████████████████████████▉     | 351/400 [09:15<01:11,  1.45s/it]loss: 7.5 | unlearn_loss: 7.188 | retain_loss: 0.3262 | param_change: 0.0001402\n",
      " 88%|████████████████████████████████████     | 352/400 [09:17<01:11,  1.49s/it]loss: 6 | unlearn_loss: 5.719 | retain_loss: 0.2891 | param_change: 0.0001545\n",
      " 88%|████████████████████████████████████▏    | 353/400 [09:18<01:07,  1.44s/it]loss: 7.281 | unlearn_loss: 6.938 | retain_loss: 0.3359 | param_change: 0.0002003\n",
      " 88%|████████████████████████████████████▎    | 354/400 [09:20<01:09,  1.52s/it]loss: 6.656 | unlearn_loss: 6.156 | retain_loss: 0.4922 | param_change: 0.0003433\n",
      " 89%|████████████████████████████████████▍    | 355/400 [09:21<01:04,  1.43s/it]loss: 7.25 | unlearn_loss: 6.719 | retain_loss: 0.5195 | param_change: 0.0003605\n",
      " 89%|████████████████████████████████████▍    | 356/400 [09:22<01:04,  1.47s/it]loss: 3.625 | unlearn_loss: 2.906 | retain_loss: 0.7227 | param_change: 0.0003567\n",
      " 89%|████████████████████████████████████▌    | 357/400 [09:24<01:04,  1.49s/it]loss: 8.5 | unlearn_loss: 7.844 | retain_loss: 0.6367 | param_change: 0.0003338\n",
      " 90%|████████████████████████████████████▋    | 358/400 [09:26<01:07,  1.60s/it]loss: 10.31 | unlearn_loss: 10.12 | retain_loss: 0.1992 | param_change: 4.172e-05\n",
      " 90%|████████████████████████████████████▊    | 359/400 [09:27<01:01,  1.51s/it]loss: 6.594 | unlearn_loss: 6.406 | retain_loss: 0.1914 | param_change: 7.01e-05\n",
      " 90%|████████████████████████████████████▉    | 360/400 [09:29<01:01,  1.55s/it]loss: 6.938 | unlearn_loss: 6.719 | retain_loss: 0.2256 | param_change: 0.0001097\n",
      " 90%|█████████████████████████████████████    | 361/400 [09:30<00:58,  1.49s/it]loss: 6.844 | unlearn_loss: 6.625 | retain_loss: 0.2109 | param_change: 0.0001006\n",
      " 90%|█████████████████████████████████████    | 362/400 [09:32<00:59,  1.56s/it]loss: 6.938 | unlearn_loss: 5.969 | retain_loss: 0.9844 | param_change: 0.0004921\n",
      " 91%|█████████████████████████████████████▏   | 363/400 [09:33<00:54,  1.46s/it]loss: 8.188 | unlearn_loss: 7.406 | retain_loss: 0.7656 | param_change: 0.0004025\n",
      " 91%|█████████████████████████████████████▎   | 364/400 [09:35<00:53,  1.48s/it]loss: 6.719 | unlearn_loss: 6.438 | retain_loss: 0.2754 | param_change: 0.0001793\n",
      " 91%|█████████████████████████████████████▍   | 365/400 [09:36<00:54,  1.55s/it]loss: 7.156 | unlearn_loss: 6.844 | retain_loss: 0.3262 | param_change: 0.0001841\n",
      " 92%|█████████████████████████████████████▌   | 366/400 [09:38<00:56,  1.66s/it]loss: 8.25 | unlearn_loss: 7.812 | retain_loss: 0.4512 | param_change: 0.0003357\n",
      " 92%|█████████████████████████████████████▌   | 367/400 [09:40<00:51,  1.56s/it]loss: 7.625 | unlearn_loss: 7.188 | retain_loss: 0.4238 | param_change: 0.0003319\n",
      " 92%|█████████████████████████████████████▋   | 368/400 [09:41<00:51,  1.59s/it]loss: 6.125 | unlearn_loss: 5.812 | retain_loss: 0.3105 | param_change: 0.0001745\n",
      " 92%|█████████████████████████████████████▊   | 369/400 [09:43<00:48,  1.56s/it]loss: 7.062 | unlearn_loss: 6.781 | retain_loss: 0.2969 | param_change: 0.0001564\n",
      " 92%|█████████████████████████████████████▉   | 370/400 [09:44<00:48,  1.62s/it]loss: 4.688 | unlearn_loss: 4.531 | retain_loss: 0.1484 | param_change: 0.0002861\n",
      " 93%|██████████████████████████████████████   | 371/400 [09:46<00:44,  1.54s/it]loss: 7.062 | unlearn_loss: 6.938 | retain_loss: 0.1387 | param_change: 7.391e-05\n",
      " 93%|██████████████████████████████████████▏  | 372/400 [09:48<00:44,  1.58s/it]loss: 10.5 | unlearn_loss: 10.38 | retain_loss: 0.104 | param_change: 5.484e-05\n",
      " 93%|██████████████████████████████████████▏  | 373/400 [09:49<00:40,  1.50s/it]loss: 7.438 | unlearn_loss: 7.344 | retain_loss: 0.1001 | param_change: 6.819e-05\n",
      " 94%|██████████████████████████████████████▎  | 374/400 [09:50<00:40,  1.54s/it]loss: 10.5 | unlearn_loss: 10.25 | retain_loss: 0.2617 | param_change: 0.0001307\n",
      " 94%|██████████████████████████████████████▍  | 375/400 [09:52<00:37,  1.51s/it]loss: 7.438 | unlearn_loss: 7.188 | retain_loss: 0.25 | param_change: 9.441e-05\n",
      " 94%|██████████████████████████████████████▌  | 376/400 [09:54<00:37,  1.58s/it]loss: 4 | unlearn_loss: 3.297 | retain_loss: 0.6992 | param_change: 0.0004749\n",
      " 94%|██████████████████████████████████████▋  | 377/400 [09:55<00:34,  1.50s/it]loss: 8.375 | unlearn_loss: 7.844 | retain_loss: 0.5117 | param_change: 0.0003414\n",
      " 94%|██████████████████████████████████████▋  | 378/400 [09:57<00:33,  1.52s/it]loss: 3.578 | unlearn_loss: 3.484 | retain_loss: 0.1001 | param_change: 6.771e-05\n",
      " 95%|██████████████████████████████████████▊  | 379/400 [09:58<00:30,  1.44s/it]loss: 6.5 | unlearn_loss: 6.406 | retain_loss: 0.09277 | param_change: 3.195e-05\n",
      " 95%|██████████████████████████████████████▉  | 380/400 [09:59<00:29,  1.47s/it]loss: 3.312 | unlearn_loss: 3.125 | retain_loss: 0.1953 | param_change: 0.0001278\n",
      " 95%|███████████████████████████████████████  | 381/400 [10:01<00:26,  1.39s/it]loss: 7.156 | unlearn_loss: 6.938 | retain_loss: 0.21 | param_change: 0.0001373\n",
      " 96%|███████████████████████████████████████▏ | 382/400 [10:02<00:25,  1.44s/it]loss: 3.766 | unlearn_loss: 3.312 | retain_loss: 0.4512 | param_change: 0.000288\n",
      " 96%|███████████████████████████████████████▎ | 383/400 [10:03<00:23,  1.41s/it]loss: 7.125 | unlearn_loss: 6.688 | retain_loss: 0.4355 | param_change: 0.0002823\n",
      " 96%|███████████████████████████████████████▎ | 384/400 [10:05<00:23,  1.49s/it]loss: 7.031 | unlearn_loss: 6.594 | retain_loss: 0.4238 | param_change: 0.0003624\n",
      " 96%|███████████████████████████████████████▍ | 385/400 [10:06<00:21,  1.41s/it]loss: 8.688 | unlearn_loss: 8.375 | retain_loss: 0.3398 | param_change: 0.0002842\n",
      " 96%|███████████████████████████████████████▌ | 386/400 [10:08<00:20,  1.46s/it]loss: 6.031 | unlearn_loss: 5.938 | retain_loss: 0.105 | param_change: 4.268e-05\n",
      " 97%|███████████████████████████████████████▋ | 387/400 [10:09<00:18,  1.42s/it]loss: 7.344 | unlearn_loss: 7.25 | retain_loss: 0.105 | param_change: 4.649e-05\n",
      " 97%|███████████████████████████████████████▊ | 388/400 [10:11<00:18,  1.50s/it]loss: 4.125 | unlearn_loss: 3.562 | retain_loss: 0.5469 | param_change: 0.0002708\n",
      " 97%|███████████████████████████████████████▊ | 389/400 [10:14<00:20,  1.84s/it]loss: 7.656 | unlearn_loss: 7.188 | retain_loss: 0.4824 | param_change: 0.000205\n",
      " 98%|███████████████████████████████████████▉ | 390/400 [10:17<00:24,  2.47s/it]loss: 8.75 | unlearn_loss: 8.562 | retain_loss: 0.1719 | param_change: 7.534e-05\n",
      " 98%|████████████████████████████████████████ | 391/400 [10:19<00:18,  2.11s/it]loss: 7.125 | unlearn_loss: 6.938 | retain_loss: 0.1738 | param_change: 8.059e-05\n",
      " 98%|████████████████████████████████████████▏| 392/400 [10:20<00:15,  1.96s/it]loss: 9.625 | unlearn_loss: 9.25 | retain_loss: 0.3496 | param_change: 0.0001774\n",
      " 98%|████████████████████████████████████████▎| 393/400 [10:22<00:12,  1.80s/it]loss: 7.531 | unlearn_loss: 7.188 | retain_loss: 0.3379 | param_change: 0.0001101\n",
      " 98%|████████████████████████████████████████▍| 394/400 [10:24<00:10,  1.79s/it]loss: 7.281 | unlearn_loss: 7.031 | retain_loss: 0.2393 | param_change: 8.583e-05\n",
      " 99%|████████████████████████████████████████▍| 395/400 [10:25<00:08,  1.70s/it]loss: 7.344 | unlearn_loss: 7.125 | retain_loss: 0.2207 | param_change: 0.0001101\n",
      " 99%|████████████████████████████████████████▌| 396/400 [10:27<00:07,  1.75s/it]loss: 3.859 | unlearn_loss: 3.531 | retain_loss: 0.332 | param_change: 0.0001326\n",
      " 99%|████████████████████████████████████████▋| 397/400 [10:28<00:04,  1.66s/it]loss: 9 | unlearn_loss: 8.75 | retain_loss: 0.2812 | param_change: 0.0001764\n",
      "100%|████████████████████████████████████████▊| 398/400 [10:30<00:03,  1.68s/it]loss: 6.219 | unlearn_loss: 5.938 | retain_loss: 0.2812 | param_change: 0.0001335\n",
      "100%|████████████████████████████████████████▉| 399/400 [10:32<00:01,  1.70s/it]loss: 7.969 | unlearn_loss: 7.719 | retain_loss: 0.2461 | param_change: 0.0001297\n",
      "100%|█████████████████████████████████████████| 400/400 [10:34<00:00,  1.59s/it]\n",
      "Saved model to models/yi_cut\n"
     ]
    }
   ],
   "source": [
    "# Best configuration found so far (original author's note): RMU unlearning of\n",
    "# 01-ai/Yi-34B-Chat on the bio and cyber forget corpora, retaining on wikitext.\n",
    "# NOTE(review): the stored output of this cell reports output_dir=models/yi_cut,\n",
    "# whereas the command below writes to models/yi_rmu -- the output looks stale;\n",
    "# confirm that downstream evaluation cells load the same path.\n",
    "import os\n",
    "\n",
    "# Expose four GPUs to the training subprocess launched below.\n",
    "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0,1,2,3\"\n",
    "\n",
    "!python3 -m rmu.unlearn --model_name 01-ai/Yi-34B-Chat --batch_size 2 --max_num_batches 400 --layer_ids 13,14,15 --layer_id 15 --retain_corpora wikitext,wikitext --forget_corpora bio-forget-corpus,cyber-forget-corpus --steering_coeffs 300,300 --alpha 350,350 --lr 5e-5 --seed 42 --output_dir models/yi_rmu"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The following values were not passed to `accelerate launch` and had defaults used instead:\n",
      "\t`--num_processes` was set to a value of `4`\n",
      "\t\tMore than one GPU was found, enabling multi-GPU training.\n",
      "\t\tIf this was unintended please pass in `--num_processes=1`.\n",
      "\t`--num_machines` was set to a value of `1`\n",
      "\t`--mixed_precision` was set to a value of `'no'`\n",
      "\t`--dynamo_backend` was set to a value of `'no'`\n",
      "To avoid this warning pass in values for each of the problematic parameters or run `accelerate config`.\n",
      "2024-04-17 03:36:14.852468: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.\n",
      "2024-04-17 03:36:14.856661: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.\n",
      "2024-04-17 03:36:14.856955: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.\n",
      "2024-04-17 03:36:14.857010: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.\n",
      "2024-04-17 03:36:14.908433: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
      "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
      "2024-04-17 03:36:14.911869: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
      "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
      "2024-04-17 03:36:14.912722: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
      "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
      "2024-04-17 03:36:14.913125: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
      "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
      "2024-04-17 03:36:16.744548: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
      "2024-04-17 03:36:16.744648: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
      "2024-04-17 03:36:16.745169: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
      "2024-04-17 03:36:16.746126: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
      "2024-04-17:03:36:22,849 INFO     [__main__.py:251] Verbosity set to INFO\n",
      "2024-04-17:03:36:22,848 INFO     [__main__.py:251] Verbosity set to INFO\n",
      "2024-04-17:03:36:22,849 INFO     [__main__.py:251] Verbosity set to INFO\n",
      "2024-04-17:03:36:22,849 INFO     [__main__.py:251] Verbosity set to INFO\n",
      "2024-04-17:03:36:28,300 INFO     [__main__.py:335] Selected Tasks: ['mmlu', 'wmdp']\n",
      "2024-04-17:03:36:28,300 INFO     [__main__.py:335] Selected Tasks: ['mmlu', 'wmdp']\n",
      "2024-04-17:03:36:28,300 INFO     [__main__.py:335] Selected Tasks: ['mmlu', 'wmdp']\n",
      "2024-04-17:03:36:28,300 INFO     [__main__.py:335] Selected Tasks: ['mmlu', 'wmdp']\n",
      "2024-04-17:03:36:28,306 INFO     [evaluator.py:131] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234\n",
      "2024-04-17:03:36:28,306 INFO     [evaluator.py:131] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234\n",
      "2024-04-17:03:36:28,306 INFO     [evaluator.py:131] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234\n",
      "2024-04-17:03:36:28,306 INFO     [evaluator.py:131] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234\n",
      "2024-04-17:03:36:28,306 INFO     [evaluator.py:177] Initializing hf model, with arguments: {'pretrained': 'models/yi_cut'}\n",
      "2024-04-17:03:36:28,306 INFO     [evaluator.py:177] Initializing hf model, with arguments: {'pretrained': 'models/yi_cut'}\n",
      "2024-04-17:03:36:28,306 INFO     [evaluator.py:177] Initializing hf model, with arguments: {'pretrained': 'models/yi_cut'}\n",
      "2024-04-17:03:36:28,306 INFO     [evaluator.py:177] Initializing hf model, with arguments: {'pretrained': 'models/yi_cut'}\n",
      "2024-04-17:03:36:29,881 WARNING  [logging.py:61] Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.\n",
      "Loading checkpoint shards: 100%|████████████████| 15/15 [00:40<00:00,  2.68s/it]\n",
      "Loading checkpoint shards: 100%|████████████████| 15/15 [00:40<00:00,  2.68s/it]\n",
      "Loading checkpoint shards: 100%|████████████████| 15/15 [00:40<00:00,  2.68s/it]\n",
      "Loading checkpoint shards: 100%|████████████████| 15/15 [00:40<00:00,  2.69s/it]\n",
      "[2024-04-17 03:37:11,266] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n",
      "[2024-04-17 03:37:11,312] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n",
      "[2024-04-17 03:37:11,339] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n",
      "[2024-04-17 03:37:11,431] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n",
      "You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers\n",
      "You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers\n",
      "You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers\n",
      "You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers\n",
      "2024-04-17:03:37:12,507 INFO     [huggingface.py:332] Using 4 devices with data parallelism\n",
      "/data/long_phan/anaconda3/lib/python3.10/site-packages/datasets/load.py:1429: FutureWarning: The repository for hails/mmlu_no_train contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/hails/mmlu_no_train\n",
      "You can avoid this message in future by passing the argument `trust_remote_code=True`.\n",
      "Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.\n",
      "  warnings.warn(\n",
      "/data/long_phan/anaconda3/lib/python3.10/site-packages/datasets/load.py:1429: FutureWarning: The repository for hails/mmlu_no_train contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/hails/mmlu_no_train\n",
      "You can avoid this message in future by passing the argument `trust_remote_code=True`.\n",
      "Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.\n",
      "  warnings.warn(\n",
      "/data/long_phan/anaconda3/lib/python3.10/site-packages/datasets/load.py:1429: FutureWarning: The repository for hails/mmlu_no_train contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/hails/mmlu_no_train\n",
      "You can avoid this message in future by passing the argument `trust_remote_code=True`.\n",
      "Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.\n",
      "  warnings.warn(\n",
      "/data/long_phan/anaconda3/lib/python3.10/site-packages/datasets/load.py:1429: FutureWarning: The repository for hails/mmlu_no_train contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/hails/mmlu_no_train\n",
      "You can avoid this message in future by passing the argument `trust_remote_code=True`.\n",
      "Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.\n",
      "  warnings.warn(\n",
      "2024-04-17:03:38:19,018 WARNING  [task.py:322] [Task: wmdp_bio] has_training_docs and has_validation_docs are False, using test_docs as fewshot_docs but this is not recommended.\n",
      "2024-04-17:03:38:19,018 WARNING  [task.py:322] [Task: wmdp_bio] has_training_docs and has_validation_docs are False, using test_docs as fewshot_docs but this is not recommended.\n",
      "2024-04-17:03:38:19,332 WARNING  [task.py:322] [Task: wmdp_bio] has_training_docs and has_validation_docs are False, using test_docs as fewshot_docs but this is not recommended.\n",
      "2024-04-17:03:38:19,332 WARNING  [task.py:322] [Task: wmdp_bio] has_training_docs and has_validation_docs are False, using test_docs as fewshot_docs but this is not recommended.\n",
      "2024-04-17:03:38:19,763 WARNING  [task.py:322] [Task: wmdp_chem] has_training_docs and has_validation_docs are False, using test_docs as fewshot_docs but this is not recommended.\n",
      "2024-04-17:03:38:19,763 WARNING  [task.py:322] [Task: wmdp_chem] has_training_docs and has_validation_docs are False, using test_docs as fewshot_docs but this is not recommended.\n",
      "2024-04-17:03:38:19,820 WARNING  [task.py:322] [Task: wmdp_bio] has_training_docs and has_validation_docs are False, using test_docs as fewshot_docs but this is not recommended.\n",
      "2024-04-17:03:38:19,821 WARNING  [task.py:322] [Task: wmdp_bio] has_training_docs and has_validation_docs are False, using test_docs as fewshot_docs but this is not recommended.\n",
      "2024-04-17:03:38:19,862 WARNING  [task.py:322] [Task: wmdp_bio] has_training_docs and has_validation_docs are False, using test_docs as fewshot_docs but this is not recommended.\n",
      "2024-04-17:03:38:19,862 WARNING  [task.py:322] [Task: wmdp_bio] has_training_docs and has_validation_docs are False, using test_docs as fewshot_docs but this is not recommended.\n",
      "2024-04-17:03:38:20,076 WARNING  [task.py:322] [Task: wmdp_chem] has_training_docs and has_validation_docs are False, using test_docs as fewshot_docs but this is not recommended.\n",
      "2024-04-17:03:38:20,076 WARNING  [task.py:322] [Task: wmdp_chem] has_training_docs and has_validation_docs are False, using test_docs as fewshot_docs but this is not recommended.\n",
      "2024-04-17:03:38:20,528 WARNING  [task.py:322] [Task: wmdp_cyber] has_training_docs and has_validation_docs are False, using test_docs as fewshot_docs but this is not recommended.\n",
      "2024-04-17:03:38:20,528 WARNING  [task.py:322] [Task: wmdp_cyber] has_training_docs and has_validation_docs are False, using test_docs as fewshot_docs but this is not recommended.\n",
      "2024-04-17:03:38:20,588 INFO     [task.py:395] Building contexts for wmdp_cyber on rank 2...\n",
      "2024-04-17:03:38:20,616 WARNING  [task.py:322] [Task: wmdp_chem] has_training_docs and has_validation_docs are False, using test_docs as fewshot_docs but this is not recommended.\n",
      "2024-04-17:03:38:20,616 WARNING  [task.py:322] [Task: wmdp_chem] has_training_docs and has_validation_docs are False, using test_docs as fewshot_docs but this is not recommended.\n",
      "2024-04-17:03:38:20,628 WARNING  [task.py:322] [Task: wmdp_chem] has_training_docs and has_validation_docs are False, using test_docs as fewshot_docs but this is not recommended.\n",
      "2024-04-17:03:38:20,628 WARNING  [task.py:322] [Task: wmdp_chem] has_training_docs and has_validation_docs are False, using test_docs as fewshot_docs but this is not recommended.\n",
      " 16%|██████▊                                  | 82/497 [00:00<00:00, 810.80it/s]2024-04-17:03:38:20,810 WARNING  [task.py:322] [Task: wmdp_cyber] has_training_docs and has_validation_docs are False, using test_docs as fewshot_docs but this is not recommended.\n",
      "2024-04-17:03:38:20,810 WARNING  [task.py:322] [Task: wmdp_cyber] has_training_docs and has_validation_docs are False, using test_docs as fewshot_docs but this is not recommended.\n",
      " 33%|█████████████▎                          | 165/497 [00:00<00:00, 819.59it/s]2024-04-17:03:38:20,870 INFO     [task.py:395] Building contexts for wmdp_cyber on rank 0...\n",
      "100%|████████████████████████████████████████| 497/497 [00:00<00:00, 825.87it/s]\n",
      " 65%|██████████████████████████▏             | 325/497 [00:00<00:00, 812.89it/s]2024-04-17:03:38:21,325 WARNING  [task.py:322] [Task: wmdp_cyber] has_training_docs and has_validation_docs are False, using test_docs as fewshot_docs but this is not recommended.\n",
      "2024-04-17:03:38:21,325 WARNING  [task.py:322] [Task: wmdp_cyber] has_training_docs and has_validation_docs are False, using test_docs as fewshot_docs but this is not recommended.\n",
      "2024-04-17:03:38:21,384 INFO     [task.py:395] Building contexts for wmdp_cyber on rank 1...\n",
      "2024-04-17:03:38:21,422 WARNING  [task.py:322] [Task: wmdp_cyber] has_training_docs and has_validation_docs are False, using test_docs as fewshot_docs but this is not recommended.\n",
      "2024-04-17:03:38:21,422 WARNING  [task.py:322] [Task: wmdp_cyber] has_training_docs and has_validation_docs are False, using test_docs as fewshot_docs but this is not recommended.\n",
      "  0%|                                                   | 0/497 [00:00<?, ?it/s]2024-04-17:03:38:21,481 INFO     [task.py:395] Building contexts for wmdp_cyber on rank 3...\n",
      "100%|████████████████████████████████████████| 497/497 [00:00<00:00, 812.40it/s]\n",
      "100%|████████████████████████████████████████| 497/497 [00:00<00:00, 826.05it/s]\n",
      "100%|████████████████████████████████████████| 496/496 [00:00<00:00, 820.97it/s]\n",
      "2024-04-17:03:38:31,797 INFO     [task.py:395] Building contexts for wmdp_chem on rank 2...\n",
      "2024-04-17:03:38:31,797 INFO     [task.py:395] Building contexts for wmdp_chem on rank 0...\n",
      "2024-04-17:03:38:31,797 INFO     [task.py:395] Building contexts for wmdp_chem on rank 3...\n",
      "2024-04-17:03:38:31,797 INFO     [task.py:395] Building contexts for wmdp_chem on rank 1...\n",
      "100%|████████████████████████████████████████| 102/102 [00:00<00:00, 823.09it/s]\n",
      "100%|████████████████████████████████████████| 102/102 [00:00<00:00, 817.09it/s]\n",
      "100%|████████████████████████████████████████| 102/102 [00:00<00:00, 815.30it/s]\n",
      "100%|████████████████████████████████████████| 102/102 [00:00<00:00, 805.04it/s]\n",
      "2024-04-17:03:38:31,937 INFO     [task.py:395] Building contexts for wmdp_bio on rank 0...\n",
      "2024-04-17:03:38:31,937 INFO     [task.py:395] Building contexts for wmdp_bio on rank 1...\n",
      "2024-04-17:03:38:31,937 INFO     [task.py:395] Building contexts for wmdp_bio on rank 3...\n",
      "2024-04-17:03:38:31,937 INFO     [task.py:395] Building contexts for wmdp_bio on rank 2...\n",
      "100%|████████████████████████████████████████| 318/318 [00:00<00:00, 829.95it/s]\n",
      "100%|████████████████████████████████████████| 318/318 [00:00<00:00, 830.93it/s]\n",
      "100%|████████████████████████████████████████| 318/318 [00:00<00:00, 830.40it/s]\n",
      "100%|████████████████████████████████████████| 319/319 [00:00<00:00, 830.80it/s]\n",
      "2024-04-17:03:38:32,356 INFO     [task.py:395] Building contexts for mmlu_international_law on rank 3...\n",
      "2024-04-17:03:38:32,356 INFO     [task.py:395] Building contexts for mmlu_international_law on rank 2...\n",
      "2024-04-17:03:38:32,356 INFO     [task.py:395] Building contexts for mmlu_international_law on rank 1...\n",
      "2024-04-17:03:38:32,356 INFO     [task.py:395] Building contexts for mmlu_international_law on rank 0...\n",
      "100%|██████████████████████████████████████████| 30/30 [00:00<00:00, 831.81it/s]\n",
      "100%|██████████████████████████████████████████| 30/30 [00:00<00:00, 830.08it/s]\n",
      "100%|██████████████████████████████████████████| 30/30 [00:00<00:00, 806.80it/s]\n",
      "100%|██████████████████████████████████████████| 31/31 [00:00<00:00, 786.44it/s]\n",
      "2024-04-17:03:38:32,401 INFO     [task.py:395] Building contexts for mmlu_high_school_world_history on rank 2...\n",
      "2024-04-17:03:38:32,401 INFO     [task.py:395] Building contexts for mmlu_high_school_world_history on rank 1...\n",
      "2024-04-17:03:38:32,401 INFO     [task.py:395] Building contexts for mmlu_high_school_world_history on rank 3...\n",
      "2024-04-17:03:38:32,401 INFO     [task.py:395] Building contexts for mmlu_high_school_world_history on rank 0...\n",
      "100%|██████████████████████████████████████████| 59/59 [00:00<00:00, 831.56it/s]\n",
      "100%|██████████████████████████████████████████| 59/59 [00:00<00:00, 826.02it/s]\n",
      "100%|██████████████████████████████████████████| 59/59 [00:00<00:00, 828.71it/s]\n",
      "100%|██████████████████████████████████████████| 60/60 [00:00<00:00, 823.96it/s]\n",
      "2024-04-17:03:38:32,483 INFO     [task.py:395] Building contexts for mmlu_philosophy on rank 3...\n",
      "2024-04-17:03:38:32,484 INFO     [task.py:395] Building contexts for mmlu_philosophy on rank 2...\n",
      "2024-04-17:03:38:32,484 INFO     [task.py:395] Building contexts for mmlu_philosophy on rank 1...\n",
      "2024-04-17:03:38:32,484 INFO     [task.py:395] Building contexts for mmlu_philosophy on rank 0...\n",
      "100%|██████████████████████████████████████████| 77/77 [00:00<00:00, 835.95it/s]\n",
      "100%|██████████████████████████████████████████| 78/78 [00:00<00:00, 833.08it/s]\n",
      "100%|██████████████████████████████████████████| 78/78 [00:00<00:00, 829.96it/s]\n",
      "100%|██████████████████████████████████████████| 78/78 [00:00<00:00, 825.85it/s]\n",
      "2024-04-17:03:38:32,589 INFO     [task.py:395] Building contexts for mmlu_logical_fallacies on rank 2...\n",
      "2024-04-17:03:38:32,589 INFO     [task.py:395] Building contexts for mmlu_logical_fallacies on rank 3...\n",
      "2024-04-17:03:38:32,589 INFO     [task.py:395] Building contexts for mmlu_logical_fallacies on rank 1...\n",
      "2024-04-17:03:38:32,589 INFO     [task.py:395] Building contexts for mmlu_logical_fallacies on rank 0...\n",
      "100%|██████████████████████████████████████████| 40/40 [00:00<00:00, 830.86it/s]\n",
      "100%|██████████████████████████████████████████| 41/41 [00:00<00:00, 831.24it/s]\n",
      "100%|██████████████████████████████████████████| 41/41 [00:00<00:00, 830.33it/s]\n",
      "100%|██████████████████████████████████████████| 41/41 [00:00<00:00, 818.76it/s]\n",
      "2024-04-17:03:38:32,645 INFO     [task.py:395] Building contexts for mmlu_high_school_european_history on rank 1...\n",
      "2024-04-17:03:38:32,645 INFO     [task.py:395] Building contexts for mmlu_high_school_european_history on rank 3...\n",
      "2024-04-17:03:38:32,645 INFO     [task.py:395] Building contexts for mmlu_high_school_european_history on rank 2...\n",
      "2024-04-17:03:38:32,645 INFO     [task.py:395] Building contexts for mmlu_high_school_european_history on rank 0...\n",
      "100%|██████████████████████████████████████████| 41/41 [00:00<00:00, 826.88it/s]\n",
      "100%|██████████████████████████████████████████| 41/41 [00:00<00:00, 821.16it/s]\n",
      "100%|██████████████████████████████████████████| 41/41 [00:00<00:00, 819.02it/s]\n",
      "100%|██████████████████████████████████████████| 42/42 [00:00<00:00, 821.89it/s]\n",
      "2024-04-17:03:38:32,703 INFO     [task.py:395] Building contexts for mmlu_moral_scenarios on rank 1...\n",
      "2024-04-17:03:38:32,703 INFO     [task.py:395] Building contexts for mmlu_moral_scenarios on rank 3...\n",
      "2024-04-17:03:38:32,703 INFO     [task.py:395] Building contexts for mmlu_moral_scenarios on rank 2...\n",
      "2024-04-17:03:38:32,703 INFO     [task.py:395] Building contexts for mmlu_moral_scenarios on rank 0...\n",
      "100%|████████████████████████████████████████| 224/224 [00:00<00:00, 840.99it/s]\n",
      "100%|████████████████████████████████████████| 224/224 [00:00<00:00, 832.40it/s]\n",
      "100%|████████████████████████████████████████| 223/223 [00:00<00:00, 824.67it/s]\n",
      "100%|████████████████████████████████████████| 224/224 [00:00<00:00, 826.79it/s]\n",
      "2024-04-17:03:38:33,005 INFO     [task.py:395] Building contexts for mmlu_professional_law on rank 1...\n",
      "2024-04-17:03:38:33,005 INFO     [task.py:395] Building contexts for mmlu_professional_law on rank 3...\n",
      "2024-04-17:03:38:33,005 INFO     [task.py:395] Building contexts for mmlu_professional_law on rank 2...\n",
      "2024-04-17:03:38:33,005 INFO     [task.py:395] Building contexts for mmlu_professional_law on rank 0...\n",
      "100%|████████████████████████████████████████| 383/383 [00:00<00:00, 540.19it/s]\n",
      "100%|████████████████████████████████████████| 384/384 [00:00<00:00, 532.39it/s]\n",
      "100%|████████████████████████████████████████| 384/384 [00:00<00:00, 508.01it/s]\n",
      "100%|████████████████████████████████████████| 383/383 [00:00<00:00, 501.87it/s]\n",
      "2024-04-17:03:38:33,820 INFO     [task.py:395] Building contexts for mmlu_high_school_us_history on rank 2...\n",
      "2024-04-17:03:38:33,820 INFO     [task.py:395] Building contexts for mmlu_high_school_us_history on rank 1...\n",
      "2024-04-17:03:38:33,821 INFO     [task.py:395] Building contexts for mmlu_high_school_us_history on rank 0...\n",
      "2024-04-17:03:38:33,821 INFO     [task.py:395] Building contexts for mmlu_high_school_us_history on rank 3...\n",
      "100%|██████████████████████████████████████████| 51/51 [00:00<00:00, 823.19it/s]\n",
      "100%|██████████████████████████████████████████| 51/51 [00:00<00:00, 826.94it/s]\n",
      "100%|██████████████████████████████████████████| 51/51 [00:00<00:00, 820.91it/s]\n",
      "100%|██████████████████████████████████████████| 51/51 [00:00<00:00, 809.37it/s]\n",
      "2024-04-17:03:38:33,893 INFO     [task.py:395] Building contexts for mmlu_formal_logic on rank 0...\n",
      "2024-04-17:03:38:33,893 INFO     [task.py:395] Building contexts for mmlu_formal_logic on rank 1...\n",
      "2024-04-17:03:38:33,893 INFO     [task.py:395] Building contexts for mmlu_formal_logic on rank 3...\n",
      "2024-04-17:03:38:33,893 INFO     [task.py:395] Building contexts for mmlu_formal_logic on rank 2...\n",
      "100%|██████████████████████████████████████████| 31/31 [00:00<00:00, 817.95it/s]\n",
      "100%|██████████████████████████████████████████| 32/32 [00:00<00:00, 817.67it/s]\n",
      "100%|██████████████████████████████████████████| 32/32 [00:00<00:00, 814.31it/s]\n",
      "100%|██████████████████████████████████████████| 31/31 [00:00<00:00, 798.29it/s]\n",
      "2024-04-17:03:38:33,938 INFO     [task.py:395] Building contexts for mmlu_jurisprudence on rank 1...\n",
      "2024-04-17:03:38:33,938 INFO     [task.py:395] Building contexts for mmlu_jurisprudence on rank 2...\n",
      "2024-04-17:03:38:33,938 INFO     [task.py:395] Building contexts for mmlu_jurisprudence on rank 0...\n",
      "2024-04-17:03:38:33,938 INFO     [task.py:395] Building contexts for mmlu_jurisprudence on rank 3...\n",
      "100%|██████████████████████████████████████████| 27/27 [00:00<00:00, 825.04it/s]\n",
      "100%|██████████████████████████████████████████| 27/27 [00:00<00:00, 818.11it/s]\n",
      "100%|██████████████████████████████████████████| 27/27 [00:00<00:00, 821.54it/s]\n",
      "100%|██████████████████████████████████████████| 27/27 [00:00<00:00, 814.63it/s]\n",
      "2024-04-17:03:38:33,976 INFO     [task.py:395] Building contexts for mmlu_moral_disputes on rank 2...\n",
      "2024-04-17:03:38:33,976 INFO     [task.py:395] Building contexts for mmlu_moral_disputes on rank 3...\n",
      "2024-04-17:03:38:33,976 INFO     [task.py:395] Building contexts for mmlu_moral_disputes on rank 0...\n",
      "2024-04-17:03:38:33,976 INFO     [task.py:395] Building contexts for mmlu_moral_disputes on rank 1...\n",
      "100%|██████████████████████████████████████████| 86/86 [00:00<00:00, 834.97it/s]\n",
      "100%|██████████████████████████████████████████| 86/86 [00:00<00:00, 836.92it/s]\n",
      "100%|██████████████████████████████████████████| 87/87 [00:00<00:00, 824.52it/s]\n",
      "100%|██████████████████████████████████████████| 87/87 [00:00<00:00, 822.12it/s]\n",
      "2024-04-17:03:38:34,095 INFO     [task.py:395] Building contexts for mmlu_prehistory on rank 2...\n",
      "2024-04-17:03:38:34,095 INFO     [task.py:395] Building contexts for mmlu_prehistory on rank 3...\n",
      "2024-04-17:03:38:34,095 INFO     [task.py:395] Building contexts for mmlu_prehistory on rank 1...\n",
      "2024-04-17:03:38:34,095 INFO     [task.py:395] Building contexts for mmlu_prehistory on rank 0...\n",
      "100%|██████████████████████████████████████████| 81/81 [00:00<00:00, 830.60it/s]\n",
      "100%|██████████████████████████████████████████| 81/81 [00:00<00:00, 828.09it/s]\n",
      "100%|██████████████████████████████████████████| 81/81 [00:00<00:00, 819.28it/s]\n",
      "100%|██████████████████████████████████████████| 81/81 [00:00<00:00, 817.79it/s]\n",
      "2024-04-17:03:38:34,206 INFO     [task.py:395] Building contexts for mmlu_world_religions on rank 2...\n",
      "2024-04-17:03:38:34,206 INFO     [task.py:395] Building contexts for mmlu_world_religions on rank 3...\n",
      "2024-04-17:03:38:34,206 INFO     [task.py:395] Building contexts for mmlu_world_religions on rank 1...\n",
      "2024-04-17:03:38:34,206 INFO     [task.py:395] Building contexts for mmlu_world_religions on rank 0...\n",
      "100%|██████████████████████████████████████████| 42/42 [00:00<00:00, 826.89it/s]\n",
      "100%|██████████████████████████████████████████| 43/43 [00:00<00:00, 828.88it/s]\n",
      "100%|██████████████████████████████████████████| 43/43 [00:00<00:00, 822.70it/s]\n",
      "100%|██████████████████████████████████████████| 43/43 [00:00<00:00, 813.69it/s]\n",
      "2024-04-17:03:38:34,266 INFO     [task.py:395] Building contexts for mmlu_public_relations on rank 1...\n",
      "2024-04-17:03:38:34,266 INFO     [task.py:395] Building contexts for mmlu_public_relations on rank 3...\n",
      "2024-04-17:03:38:34,266 INFO     [task.py:395] Building contexts for mmlu_public_relations on rank 2...\n",
      "2024-04-17:03:38:34,267 INFO     [task.py:395] Building contexts for mmlu_public_relations on rank 0...\n",
      "100%|██████████████████████████████████████████| 27/27 [00:00<00:00, 832.47it/s]\n",
      "100%|██████████████████████████████████████████| 27/27 [00:00<00:00, 821.24it/s]\n",
      "100%|██████████████████████████████████████████| 28/28 [00:00<00:00, 827.50it/s]\n",
      "100%|██████████████████████████████████████████| 28/28 [00:00<00:00, 797.71it/s]\n",
      "2024-04-17:03:38:34,306 INFO     [task.py:395] Building contexts for mmlu_high_school_macroeconomics on rank 1...\n",
      "2024-04-17:03:38:34,306 INFO     [task.py:395] Building contexts for mmlu_high_school_macroeconomics on rank 3...\n",
      "2024-04-17:03:38:34,307 INFO     [task.py:395] Building contexts for mmlu_high_school_macroeconomics on rank 2...\n",
      "2024-04-17:03:38:34,307 INFO     [task.py:395] Building contexts for mmlu_high_school_macroeconomics on rank 0...\n",
      "100%|██████████████████████████████████████████| 97/97 [00:00<00:00, 830.13it/s]\n",
      "100%|██████████████████████████████████████████| 97/97 [00:00<00:00, 824.54it/s]\n",
      "100%|██████████████████████████████████████████| 98/98 [00:00<00:00, 824.62it/s]\n",
      "100%|██████████████████████████████████████████| 98/98 [00:00<00:00, 822.81it/s]\n",
      "2024-04-17:03:38:34,439 INFO     [task.py:395] Building contexts for mmlu_professional_psychology on rank 2...\n",
      "2024-04-17:03:38:34,439 INFO     [task.py:395] Building contexts for mmlu_professional_psychology on rank 3...\n",
      "2024-04-17:03:38:34,439 INFO     [task.py:395] Building contexts for mmlu_professional_psychology on rank 1...\n",
      "2024-04-17:03:38:34,440 INFO     [task.py:395] Building contexts for mmlu_professional_psychology on rank 0...\n",
      "100%|████████████████████████████████████████| 153/153 [00:00<00:00, 839.06it/s]\n",
      "100%|████████████████████████████████████████| 153/153 [00:00<00:00, 828.00it/s]\n",
      "100%|████████████████████████████████████████| 153/153 [00:00<00:00, 830.68it/s]\n",
      "100%|████████████████████████████████████████| 153/153 [00:00<00:00, 829.26it/s]\n",
      "2024-04-17:03:38:34,645 INFO     [task.py:395] Building contexts for mmlu_high_school_microeconomics on rank 2...\n",
      "2024-04-17:03:38:34,645 INFO     [task.py:395] Building contexts for mmlu_high_school_microeconomics on rank 3...\n",
      "2024-04-17:03:38:34,645 INFO     [task.py:395] Building contexts for mmlu_high_school_microeconomics on rank 1...\n",
      "2024-04-17:03:38:34,645 INFO     [task.py:395] Building contexts for mmlu_high_school_microeconomics on rank 0...\n",
      "100%|██████████████████████████████████████████| 59/59 [00:00<00:00, 827.06it/s]\n",
      "100%|██████████████████████████████████████████| 59/59 [00:00<00:00, 825.50it/s]\n",
      "100%|██████████████████████████████████████████| 60/60 [00:00<00:00, 834.32it/s]\n",
      "100%|██████████████████████████████████████████| 60/60 [00:00<00:00, 829.86it/s]\n",
      "2024-04-17:03:38:34,726 INFO     [task.py:395] Building contexts for mmlu_sociology on rank 2...\n",
      "2024-04-17:03:38:34,726 INFO     [task.py:395] Building contexts for mmlu_sociology on rank 3...\n",
      "2024-04-17:03:38:34,726 INFO     [task.py:395] Building contexts for mmlu_sociology on rank 1...\n",
      "2024-04-17:03:38:34,726 INFO     [task.py:395] Building contexts for mmlu_sociology on rank 0...\n",
      "100%|██████████████████████████████████████████| 50/50 [00:00<00:00, 840.46it/s]\n",
      "100%|██████████████████████████████████████████| 50/50 [00:00<00:00, 831.41it/s]\n",
      "100%|██████████████████████████████████████████| 50/50 [00:00<00:00, 821.07it/s]\n",
      "100%|██████████████████████████████████████████| 51/51 [00:00<00:00, 826.62it/s]\n",
      "2024-04-17:03:38:34,796 INFO     [task.py:395] Building contexts for mmlu_high_school_geography on rank 3...\n",
      "2024-04-17:03:38:34,797 INFO     [task.py:395] Building contexts for mmlu_high_school_geography on rank 1...\n",
      "2024-04-17:03:38:34,797 INFO     [task.py:395] Building contexts for mmlu_high_school_geography on rank 2...\n",
      "2024-04-17:03:38:34,797 INFO     [task.py:395] Building contexts for mmlu_high_school_geography on rank 0...\n",
      "100%|██████████████████████████████████████████| 49/49 [00:00<00:00, 826.95it/s]\n",
      "100%|██████████████████████████████████████████| 49/49 [00:00<00:00, 829.77it/s]\n",
      "100%|██████████████████████████████████████████| 50/50 [00:00<00:00, 832.57it/s]\n",
      "100%|██████████████████████████████████████████| 50/50 [00:00<00:00, 829.81it/s]\n",
      "2024-04-17:03:38:34,865 INFO     [task.py:395] Building contexts for mmlu_econometrics on rank 2...\n",
      "2024-04-17:03:38:34,865 INFO     [task.py:395] Building contexts for mmlu_econometrics on rank 1...\n",
      "2024-04-17:03:38:34,865 INFO     [task.py:395] Building contexts for mmlu_econometrics on rank 3...\n",
      "2024-04-17:03:38:34,865 INFO     [task.py:395] Building contexts for mmlu_econometrics on rank 0...\n",
      "100%|██████████████████████████████████████████| 28/28 [00:00<00:00, 829.05it/s]\n",
      "100%|██████████████████████████████████████████| 28/28 [00:00<00:00, 824.10it/s]\n",
      "100%|██████████████████████████████████████████| 29/29 [00:00<00:00, 822.34it/s]\n",
      "100%|██████████████████████████████████████████| 29/29 [00:00<00:00, 822.25it/s]\n",
      "2024-04-17:03:38:34,905 INFO     [task.py:395] Building contexts for mmlu_us_foreign_policy on rank 3...\n",
      "2024-04-17:03:38:34,905 INFO     [task.py:395] Building contexts for mmlu_us_foreign_policy on rank 2...\n",
      "2024-04-17:03:38:34,905 INFO     [task.py:395] Building contexts for mmlu_us_foreign_policy on rank 1...\n",
      "2024-04-17:03:38:34,905 INFO     [task.py:395] Building contexts for mmlu_us_foreign_policy on rank 0...\n",
      "100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 824.39it/s]\n",
      "\n",
      "100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 813.58it/s]\n",
      "100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 823.68it/s]\n",
      "2024-04-17:03:38:34,939 INFO     [task.py:395] Building contexts for mmlu_human_sexuality on rank 2...\n",
      "2024-04-17:03:38:34,940 INFO     [task.py:395] Building contexts for mmlu_human_sexuality on rank 1...\n",
      "2024-04-17:03:38:34,940 INFO     [task.py:395] Building contexts for mmlu_human_sexuality on rank 3...\n",
      "2024-04-17:03:38:34,940 INFO     [task.py:395] Building contexts for mmlu_human_sexuality on rank 0...\n",
      "100%|██████████████████████████████████████████| 32/32 [00:00<00:00, 822.98it/s]\n",
      "100%|██████████████████████████████████████████| 33/33 [00:00<00:00, 829.17it/s]\n",
      "100%|██████████████████████████████████████████| 33/33 [00:00<00:00, 823.27it/s]\n",
      "100%|██████████████████████████████████████████| 33/33 [00:00<00:00, 821.08it/s]\n",
      "2024-04-17:03:38:34,985 INFO     [task.py:395] Building contexts for mmlu_security_studies on rank 2...\n",
      "2024-04-17:03:38:34,985 INFO     [task.py:395] Building contexts for mmlu_security_studies on rank 3...\n",
      "2024-04-17:03:38:34,985 INFO     [task.py:395] Building contexts for mmlu_security_studies on rank 0...\n",
      "2024-04-17:03:38:34,985 INFO     [task.py:395] Building contexts for mmlu_security_studies on rank 1...\n",
      "100%|██████████████████████████████████████████| 61/61 [00:00<00:00, 834.73it/s]\n",
      "100%|██████████████████████████████████████████| 61/61 [00:00<00:00, 832.57it/s]\n",
      "100%|██████████████████████████████████████████| 61/61 [00:00<00:00, 826.33it/s]\n",
      "100%|██████████████████████████████████████████| 62/62 [00:00<00:00, 824.57it/s]\n",
      "2024-04-17:03:38:35,069 INFO     [task.py:395] Building contexts for mmlu_high_school_psychology on rank 3...\n",
      "2024-04-17:03:38:35,069 INFO     [task.py:395] Building contexts for mmlu_high_school_psychology on rank 1...\n",
      "2024-04-17:03:38:35,069 INFO     [task.py:395] Building contexts for mmlu_high_school_psychology on rank 0...\n",
      "2024-04-17:03:38:35,069 INFO     [task.py:395] Building contexts for mmlu_high_school_psychology on rank 2...\n",
      "100%|████████████████████████████████████████| 136/136 [00:00<00:00, 830.76it/s]\n",
      "100%|████████████████████████████████████████| 136/136 [00:00<00:00, 829.13it/s]\n",
      "100%|████████████████████████████████████████| 137/137 [00:00<00:00, 829.65it/s]\n",
      "100%|████████████████████████████████████████| 136/136 [00:00<00:00, 828.55it/s]\n",
      "2024-04-17:03:38:35,254 INFO     [task.py:395] Building contexts for mmlu_high_school_government_and_politics on rank 1...\n",
      "2024-04-17:03:38:35,254 INFO     [task.py:395] Building contexts for mmlu_high_school_government_and_politics on rank 0...\n",
      "2024-04-17:03:38:35,254 INFO     [task.py:395] Building contexts for mmlu_high_school_government_and_politics on rank 3...\n",
      "2024-04-17:03:38:35,254 INFO     [task.py:395] Building contexts for mmlu_high_school_government_and_politics on rank 2...\n",
      "100%|██████████████████████████████████████████| 48/48 [00:00<00:00, 834.97it/s]\n",
      "100%|██████████████████████████████████████████| 48/48 [00:00<00:00, 826.78it/s]\n",
      "100%|██████████████████████████████████████████| 48/48 [00:00<00:00, 822.29it/s]\n",
      "100%|██████████████████████████████████████████| 49/49 [00:00<00:00, 829.56it/s]\n",
      "2024-04-17:03:38:35,320 INFO     [task.py:395] Building contexts for mmlu_human_aging on rank 2...\n",
      "2024-04-17:03:38:35,320 INFO     [task.py:395] Building contexts for mmlu_human_aging on rank 3...\n",
      "2024-04-17:03:38:35,320 INFO     [task.py:395] Building contexts for mmlu_human_aging on rank 1...\n",
      "2024-04-17:03:38:35,320 INFO     [task.py:395] Building contexts for mmlu_human_aging on rank 0...\n",
      "100%|██████████████████████████████████████████| 55/55 [00:00<00:00, 826.34it/s]\n",
      "100%|██████████████████████████████████████████| 56/56 [00:00<00:00, 836.11it/s]\n",
      "100%|██████████████████████████████████████████| 56/56 [00:00<00:00, 829.17it/s]\n",
      "100%|██████████████████████████████████████████| 56/56 [00:00<00:00, 825.75it/s]\n",
      "2024-04-17:03:38:35,396 INFO     [task.py:395] Building contexts for mmlu_global_facts on rank 3...\n",
      "2024-04-17:03:38:35,396 INFO     [task.py:395] Building contexts for mmlu_global_facts on rank 2...\n",
      "2024-04-17:03:38:35,396 INFO     [task.py:395] Building contexts for mmlu_global_facts on rank 0...\n",
      "2024-04-17:03:38:35,396 INFO     [task.py:395] Building contexts for mmlu_global_facts on rank 1...\n",
      "100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 831.97it/s]\n",
      "100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 827.22it/s]\n",
      "100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 824.38it/s]\n",
      "100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 819.03it/s]\n",
      "2024-04-17:03:38:35,431 INFO     [task.py:395] Building contexts for mmlu_medical_genetics on rank 3...\n",
      "2024-04-17:03:38:35,431 INFO     [task.py:395] Building contexts for mmlu_medical_genetics on rank 1...\n",
      "2024-04-17:03:38:35,431 INFO     [task.py:395] Building contexts for mmlu_medical_genetics on rank 2...\n",
      "2024-04-17:03:38:35,431 INFO     [task.py:395] Building contexts for mmlu_medical_genetics on rank 0...\n",
      "100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 827.06it/s]\n",
      "100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 817.94it/s]\n",
      "100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 823.67it/s]\n",
      "100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 817.32it/s]\n",
      "2024-04-17:03:38:35,465 INFO     [task.py:395] Building contexts for mmlu_virology on rank 3...\n",
      "2024-04-17:03:38:35,465 INFO     [task.py:395] Building contexts for mmlu_virology on rank 2...\n",
      "2024-04-17:03:38:35,465 INFO     [task.py:395] Building contexts for mmlu_virology on rank 0...\n",
      "2024-04-17:03:38:35,466 INFO     [task.py:395] Building contexts for mmlu_virology on rank 1...\n",
      "100%|██████████████████████████████████████████| 41/41 [00:00<00:00, 827.62it/s]\n",
      "100%|██████████████████████████████████████████| 41/41 [00:00<00:00, 831.34it/s]\n",
      "100%|██████████████████████████████████████████| 42/42 [00:00<00:00, 832.15it/s]\n",
      "100%|██████████████████████████████████████████| 42/42 [00:00<00:00, 827.10it/s]\n",
      "2024-04-17:03:38:35,522 INFO     [task.py:395] Building contexts for mmlu_professional_medicine on rank 1...\n",
      "2024-04-17:03:38:35,522 INFO     [task.py:395] Building contexts for mmlu_professional_medicine on rank 3...\n",
      "2024-04-17:03:38:35,522 INFO     [task.py:395] Building contexts for mmlu_professional_medicine on rank 2...\n",
      "2024-04-17:03:38:35,523 INFO     [task.py:395] Building contexts for mmlu_professional_medicine on rank 0...\n",
      "100%|██████████████████████████████████████████| 68/68 [00:00<00:00, 834.82it/s]\n",
      "100%|██████████████████████████████████████████| 68/68 [00:00<00:00, 833.18it/s]\n",
      "100%|██████████████████████████████████████████| 68/68 [00:00<00:00, 831.90it/s]\n",
      "100%|██████████████████████████████████████████| 68/68 [00:00<00:00, 821.69it/s]\n",
      "2024-04-17:03:38:35,616 INFO     [task.py:395] Building contexts for mmlu_miscellaneous on rank 3...\n",
      "2024-04-17:03:38:35,616 INFO     [task.py:395] Building contexts for mmlu_miscellaneous on rank 0...\n",
      "2024-04-17:03:38:35,616 INFO     [task.py:395] Building contexts for mmlu_miscellaneous on rank 2...\n",
      "2024-04-17:03:38:35,616 INFO     [task.py:395] Building contexts for mmlu_miscellaneous on rank 1...\n",
      "100%|████████████████████████████████████████| 195/195 [00:00<00:00, 832.52it/s]\n",
      "100%|████████████████████████████████████████| 196/196 [00:00<00:00, 834.98it/s]\n",
      "100%|████████████████████████████████████████| 196/196 [00:00<00:00, 832.64it/s]\n",
      "100%|████████████████████████████████████████| 196/196 [00:00<00:00, 830.84it/s]\n",
      "2024-04-17:03:38:35,877 INFO     [task.py:395] Building contexts for mmlu_business_ethics on rank 0...\n",
      "2024-04-17:03:38:35,878 INFO     [task.py:395] Building contexts for mmlu_business_ethics on rank 3...\n",
      "2024-04-17:03:38:35,878 INFO     [task.py:395] Building contexts for mmlu_business_ethics on rank 1...\n",
      "2024-04-17:03:38:35,878 INFO     [task.py:395] Building contexts for mmlu_business_ethics on rank 2...\n",
      "100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 828.97it/s]\n",
      "100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 819.78it/s]\n",
      "\n",
      "100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 827.69it/s]\n",
      "2024-04-17:03:38:35,912 INFO     [task.py:395] Building contexts for mmlu_professional_accounting on rank 0...\n",
      "2024-04-17:03:38:35,912 INFO     [task.py:395] Building contexts for mmlu_professional_accounting on rank 1...\n",
      "2024-04-17:03:38:35,912 INFO     [task.py:395] Building contexts for mmlu_professional_accounting on rank 3...\n",
      "2024-04-17:03:38:35,912 INFO     [task.py:395] Building contexts for mmlu_professional_accounting on rank 2...\n",
      "100%|██████████████████████████████████████████| 70/70 [00:00<00:00, 834.00it/s]\n",
      "100%|██████████████████████████████████████████| 70/70 [00:00<00:00, 825.40it/s]\n",
      "100%|██████████████████████████████████████████| 71/71 [00:00<00:00, 830.11it/s]\n",
      "100%|██████████████████████████████████████████| 71/71 [00:00<00:00, 830.18it/s]\n",
      "2024-04-17:03:38:36,008 INFO     [task.py:395] Building contexts for mmlu_nutrition on rank 0...\n",
      "2024-04-17:03:38:36,008 INFO     [task.py:395] Building contexts for mmlu_nutrition on rank 2...\n",
      "2024-04-17:03:38:36,008 INFO     [task.py:395] Building contexts for mmlu_nutrition on rank 3...\n",
      "2024-04-17:03:38:36,008 INFO     [task.py:395] Building contexts for mmlu_nutrition on rank 1...\n",
      "100%|██████████████████████████████████████████| 76/76 [00:00<00:00, 832.49it/s]\n",
      "100%|██████████████████████████████████████████| 76/76 [00:00<00:00, 829.33it/s]\n",
      "100%|██████████████████████████████████████████| 77/77 [00:00<00:00, 832.30it/s]\n",
      "100%|██████████████████████████████████████████| 77/77 [00:00<00:00, 817.08it/s]\n",
      "2024-04-17:03:38:36,113 INFO     [task.py:395] Building contexts for mmlu_management on rank 2...\n",
      "2024-04-17:03:38:36,113 INFO     [task.py:395] Building contexts for mmlu_management on rank 1...\n",
      "2024-04-17:03:38:36,113 INFO     [task.py:395] Building contexts for mmlu_management on rank 3...\n",
      "2024-04-17:03:38:36,113 INFO     [task.py:395] Building contexts for mmlu_management on rank 0...\n",
      "100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 814.74it/s]\n",
      "100%|██████████████████████████████████████████| 26/26 [00:00<00:00, 826.61it/s]\n",
      "100%|██████████████████████████████████████████| 26/26 [00:00<00:00, 825.83it/s]\n",
      "100%|██████████████████████████████████████████| 26/26 [00:00<00:00, 819.56it/s]\n",
      "2024-04-17:03:38:36,149 INFO     [task.py:395] Building contexts for mmlu_clinical_knowledge on rank 2...\n",
      "2024-04-17:03:38:36,149 INFO     [task.py:395] Building contexts for mmlu_clinical_knowledge on rank 1...\n",
      "2024-04-17:03:38:36,149 INFO     [task.py:395] Building contexts for mmlu_clinical_knowledge on rank 3...\n",
      "2024-04-17:03:38:36,149 INFO     [task.py:395] Building contexts for mmlu_clinical_knowledge on rank 0...\n",
      "100%|██████████████████████████████████████████| 66/66 [00:00<00:00, 829.53it/s]\n",
      "100%|██████████████████████████████████████████| 66/66 [00:00<00:00, 825.32it/s]\n",
      "100%|██████████████████████████████████████████| 66/66 [00:00<00:00, 820.23it/s]\n",
      "100%|██████████████████████████████████████████| 67/67 [00:00<00:00, 827.48it/s]\n",
      "2024-04-17:03:38:36,239 INFO     [task.py:395] Building contexts for mmlu_college_medicine on rank 1...\n",
      "2024-04-17:03:38:36,240 INFO     [task.py:395] Building contexts for mmlu_college_medicine on rank 2...\n",
      "2024-04-17:03:38:36,240 INFO     [task.py:395] Building contexts for mmlu_college_medicine on rank 3...\n",
      "2024-04-17:03:38:36,240 INFO     [task.py:395] Building contexts for mmlu_college_medicine on rank 0...\n",
      "100%|██████████████████████████████████████████| 43/43 [00:00<00:00, 827.48it/s]\n",
      "100%|██████████████████████████████████████████| 43/43 [00:00<00:00, 827.12it/s]\n",
      "100%|██████████████████████████████████████████| 43/43 [00:00<00:00, 820.97it/s]\n",
      "100%|██████████████████████████████████████████| 44/44 [00:00<00:00, 824.88it/s]\n",
      "2024-04-17:03:38:36,301 INFO     [task.py:395] Building contexts for mmlu_marketing on rank 2...\n",
      "2024-04-17:03:38:36,301 INFO     [task.py:395] Building contexts for mmlu_marketing on rank 3...\n",
      "2024-04-17:03:38:36,301 INFO     [task.py:395] Building contexts for mmlu_marketing on rank 1...\n",
      "2024-04-17:03:38:36,301 INFO     [task.py:395] Building contexts for mmlu_marketing on rank 0...\n",
      "100%|██████████████████████████████████████████| 58/58 [00:00<00:00, 837.16it/s]\n",
      "100%|██████████████████████████████████████████| 58/58 [00:00<00:00, 830.97it/s]\n",
      "100%|██████████████████████████████████████████| 59/59 [00:00<00:00, 829.35it/s]\n",
      "\n",
      "2024-04-17:03:38:36,381 INFO     [task.py:395] Building contexts for mmlu_college_chemistry on rank 3...\n",
      "2024-04-17:03:38:36,381 INFO     [task.py:395] Building contexts for mmlu_college_chemistry on rank 2...\n",
      "2024-04-17:03:38:36,381 INFO     [task.py:395] Building contexts for mmlu_college_chemistry on rank 1...\n",
      "2024-04-17:03:38:36,381 INFO     [task.py:395] Building contexts for mmlu_college_chemistry on rank 0...\n",
      "100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 825.47it/s]\n",
      "100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 829.67it/s]\n",
      "100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 824.06it/s]\n",
      "100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 824.50it/s]\n",
      "2024-04-17:03:38:36,415 INFO     [task.py:395] Building contexts for mmlu_machine_learning on rank 1...\n",
      "2024-04-17:03:38:36,415 INFO     [task.py:395] Building contexts for mmlu_machine_learning on rank 2...\n",
      "2024-04-17:03:38:36,416 INFO     [task.py:395] Building contexts for mmlu_machine_learning on rank 3...\n",
      "2024-04-17:03:38:36,416 INFO     [task.py:395] Building contexts for mmlu_machine_learning on rank 0...\n",
      "100%|██████████████████████████████████████████| 28/28 [00:00<00:00, 834.08it/s]\n",
      "100%|██████████████████████████████████████████| 28/28 [00:00<00:00, 831.74it/s]\n",
      "100%|██████████████████████████████████████████| 28/28 [00:00<00:00, 825.16it/s]\n",
      "100%|██████████████████████████████████████████| 28/28 [00:00<00:00, 822.45it/s]\n",
      "2024-04-17:03:38:36,454 INFO     [task.py:395] Building contexts for mmlu_high_school_computer_science on rank 3...\n",
      "2024-04-17:03:38:36,454 INFO     [task.py:395] Building contexts for mmlu_high_school_computer_science on rank 1...\n",
      "2024-04-17:03:38:36,454 INFO     [task.py:395] Building contexts for mmlu_high_school_computer_science on rank 2...\n",
      "2024-04-17:03:38:36,454 INFO     [task.py:395] Building contexts for mmlu_high_school_computer_science on rank 0...\n",
      "100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 831.44it/s]\n",
      "100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 826.42it/s]\n",
      "100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 816.84it/s]\n",
      "100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 829.46it/s]\n",
      "2024-04-17:03:38:36,489 INFO     [task.py:395] Building contexts for mmlu_high_school_physics on rank 2...\n",
      "2024-04-17:03:38:36,489 INFO     [task.py:395] Building contexts for mmlu_high_school_physics on rank 0...\n",
      "2024-04-17:03:38:36,489 INFO     [task.py:395] Building contexts for mmlu_high_school_physics on rank 3...\n",
      "2024-04-17:03:38:36,489 INFO     [task.py:395] Building contexts for mmlu_high_school_physics on rank 1...\n",
      "100%|██████████████████████████████████████████| 37/37 [00:00<00:00, 822.42it/s]\n",
      "100%|██████████████████████████████████████████| 38/38 [00:00<00:00, 830.78it/s]\n",
      "100%|██████████████████████████████████████████| 38/38 [00:00<00:00, 831.77it/s]\n",
      "100%|██████████████████████████████████████████| 38/38 [00:00<00:00, 831.11it/s]\n",
      "2024-04-17:03:38:36,541 INFO     [task.py:395] Building contexts for mmlu_conceptual_physics on rank 3...\n",
      "2024-04-17:03:38:36,541 INFO     [task.py:395] Building contexts for mmlu_conceptual_physics on rank 2...\n",
      "2024-04-17:03:38:36,541 INFO     [task.py:395] Building contexts for mmlu_conceptual_physics on rank 0...\n",
      "2024-04-17:03:38:36,541 INFO     [task.py:395] Building contexts for mmlu_conceptual_physics on rank 1...\n",
      "100%|██████████████████████████████████████████| 58/58 [00:00<00:00, 827.51it/s]\n",
      "100%|██████████████████████████████████████████| 59/59 [00:00<00:00, 833.69it/s]\n",
      "100%|██████████████████████████████████████████| 59/59 [00:00<00:00, 837.65it/s]\n",
      "100%|██████████████████████████████████████████| 59/59 [00:00<00:00, 836.27it/s]\n",
      "2024-04-17:03:38:36,620 INFO     [task.py:395] Building contexts for mmlu_high_school_statistics on rank 2...\n",
      "2024-04-17:03:38:36,620 INFO     [task.py:395] Building contexts for mmlu_high_school_statistics on rank 0...\n",
      "2024-04-17:03:38:36,620 INFO     [task.py:395] Building contexts for mmlu_high_school_statistics on rank 3...\n",
      "2024-04-17:03:38:36,620 INFO     [task.py:395] Building contexts for mmlu_high_school_statistics on rank 1...\n",
      "100%|██████████████████████████████████████████| 54/54 [00:00<00:00, 839.29it/s]\n",
      "100%|██████████████████████████████████████████| 54/54 [00:00<00:00, 833.72it/s]\n",
      "100%|██████████████████████████████████████████| 54/54 [00:00<00:00, 831.26it/s]\n",
      "100%|██████████████████████████████████████████| 54/54 [00:00<00:00, 826.58it/s]\n",
      "2024-04-17:03:38:36,694 INFO     [task.py:395] Building contexts for mmlu_college_mathematics on rank 1...\n",
      "2024-04-17:03:38:36,695 INFO     [task.py:395] Building contexts for mmlu_college_mathematics on rank 2...\n",
      "2024-04-17:03:38:36,695 INFO     [task.py:395] Building contexts for mmlu_college_mathematics on rank 0...\n",
      "2024-04-17:03:38:36,696 INFO     [task.py:395] Building contexts for mmlu_college_mathematics on rank 3...\n",
      "100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 824.16it/s]\n",
      "100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 836.82it/s]\n",
      "100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 823.91it/s]\n",
      "100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 813.99it/s]\n",
      "2024-04-17:03:38:36,730 INFO     [task.py:395] Building contexts for mmlu_high_school_biology on rank 2...\n",
      "2024-04-17:03:38:36,730 INFO     [task.py:395] Building contexts for mmlu_high_school_biology on rank 1...\n",
      "2024-04-17:03:38:36,731 INFO     [task.py:395] Building contexts for mmlu_high_school_biology on rank 0...\n",
      "2024-04-17:03:38:36,731 INFO     [task.py:395] Building contexts for mmlu_high_school_biology on rank 3...\n",
      "100%|██████████████████████████████████████████| 77/77 [00:00<00:00, 837.25it/s]\n",
      "100%|██████████████████████████████████████████| 77/77 [00:00<00:00, 833.31it/s]\n",
      "100%|██████████████████████████████████████████| 78/78 [00:00<00:00, 835.73it/s]\n",
      "100%|██████████████████████████████████████████| 78/78 [00:00<00:00, 816.95it/s]\n",
      "2024-04-17:03:38:36,837 INFO     [task.py:395] Building contexts for mmlu_high_school_mathematics on rank 3...\n",
      "2024-04-17:03:38:36,837 INFO     [task.py:395] Building contexts for mmlu_high_school_mathematics on rank 2...\n",
      "2024-04-17:03:38:36,837 INFO     [task.py:395] Building contexts for mmlu_high_school_mathematics on rank 1...\n",
      "2024-04-17:03:38:36,837 INFO     [task.py:395] Building contexts for mmlu_high_school_mathematics on rank 0...\n",
      "100%|██████████████████████████████████████████| 67/67 [00:00<00:00, 836.06it/s]\n",
      "100%|██████████████████████████████████████████| 67/67 [00:00<00:00, 834.43it/s]\n",
      "100%|██████████████████████████████████████████| 68/68 [00:00<00:00, 834.89it/s]\n",
      "100%|██████████████████████████████████████████| 68/68 [00:00<00:00, 828.38it/s]\n",
      "2024-04-17:03:38:36,928 INFO     [task.py:395] Building contexts for mmlu_elementary_mathematics on rank 2...\n",
      "2024-04-17:03:38:36,928 INFO     [task.py:395] Building contexts for mmlu_elementary_mathematics on rank 1...\n",
      "2024-04-17:03:38:36,929 INFO     [task.py:395] Building contexts for mmlu_elementary_mathematics on rank 3...\n",
      "2024-04-17:03:38:36,929 INFO     [task.py:395] Building contexts for mmlu_elementary_mathematics on rank 0...\n",
      "100%|██████████████████████████████████████████| 94/94 [00:00<00:00, 829.12it/s]\n",
      "100%|██████████████████████████████████████████| 94/94 [00:00<00:00, 831.76it/s]\n",
      "100%|██████████████████████████████████████████| 95/95 [00:00<00:00, 835.66it/s]\n",
      "100%|██████████████████████████████████████████| 95/95 [00:00<00:00, 826.07it/s]\n",
      "2024-04-17:03:38:37,056 INFO     [task.py:395] Building contexts for mmlu_college_physics on rank 1...\n",
      "2024-04-17:03:38:37,056 INFO     [task.py:395] Building contexts for mmlu_college_physics on rank 3...\n",
      "2024-04-17:03:38:37,057 INFO     [task.py:395] Building contexts for mmlu_college_physics on rank 2...\n",
      "2024-04-17:03:38:37,057 INFO     [task.py:395] Building contexts for mmlu_college_physics on rank 0...\n",
      "100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 839.69it/s]\n",
      "100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 826.82it/s]\n",
      "100%|██████████████████████████████████████████| 26/26 [00:00<00:00, 827.93it/s]\n",
      "100%|██████████████████████████████████████████| 26/26 [00:00<00:00, 821.05it/s]\n",
      "2024-04-17:03:38:37,092 INFO     [task.py:395] Building contexts for mmlu_astronomy on rank 1...\n",
      "2024-04-17:03:38:37,092 INFO     [task.py:395] Building contexts for mmlu_astronomy on rank 2...\n",
      "2024-04-17:03:38:37,093 INFO     [task.py:395] Building contexts for mmlu_astronomy on rank 3...\n",
      "2024-04-17:03:38:37,093 INFO     [task.py:395] Building contexts for mmlu_astronomy on rank 0...\n",
      "100%|██████████████████████████████████████████| 38/38 [00:00<00:00, 839.44it/s]\n",
      "100%|██████████████████████████████████████████| 38/38 [00:00<00:00, 831.06it/s]\n",
      "100%|██████████████████████████████████████████| 38/38 [00:00<00:00, 823.91it/s]\n",
      "100%|██████████████████████████████████████████| 38/38 [00:00<00:00, 811.60it/s]\n",
      "2024-04-17:03:38:37,146 INFO     [task.py:395] Building contexts for mmlu_college_computer_science on rank 3...\n",
      "2024-04-17:03:38:37,146 INFO     [task.py:395] Building contexts for mmlu_college_computer_science on rank 0...\n",
      "2024-04-17:03:38:37,146 INFO     [task.py:395] Building contexts for mmlu_college_computer_science on rank 2...\n",
      "2024-04-17:03:38:37,146 INFO     [task.py:395] Building contexts for mmlu_college_computer_science on rank 1...\n",
      "100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 839.73it/s]\n",
      "100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 823.70it/s]\n",
      "100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 819.35it/s]\n",
      "100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 824.56it/s]\n",
      "2024-04-17:03:38:37,181 INFO     [task.py:395] Building contexts for mmlu_high_school_chemistry on rank 2...\n",
      "2024-04-17:03:38:37,181 INFO     [task.py:395] Building contexts for mmlu_high_school_chemistry on rank 0...\n",
      "2024-04-17:03:38:37,181 INFO     [task.py:395] Building contexts for mmlu_high_school_chemistry on rank 3...\n",
      "2024-04-17:03:38:37,181 INFO     [task.py:395] Building contexts for mmlu_high_school_chemistry on rank 1...\n",
      "100%|██████████████████████████████████████████| 50/50 [00:00<00:00, 832.13it/s]\n",
      "100%|██████████████████████████████████████████| 51/51 [00:00<00:00, 841.50it/s]\n",
      "100%|██████████████████████████████████████████| 51/51 [00:00<00:00, 832.14it/s]\n",
      "100%|██████████████████████████████████████████| 51/51 [00:00<00:00, 823.26it/s]\n",
      "2024-04-17:03:38:37,250 INFO     [task.py:395] Building contexts for mmlu_computer_security on rank 3...\n",
      "2024-04-17:03:38:37,250 INFO     [task.py:395] Building contexts for mmlu_computer_security on rank 2...\n",
      "2024-04-17:03:38:37,250 INFO     [task.py:395] Building contexts for mmlu_computer_security on rank 1...\n",
      "2024-04-17:03:38:37,250 INFO     [task.py:395] Building contexts for mmlu_computer_security on rank 0...\n",
      "100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 834.71it/s]\n",
      "100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 818.68it/s]\n",
      "100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 827.06it/s]\n",
      "100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 790.22it/s]\n",
      "2024-04-17:03:38:37,287 INFO     [task.py:395] Building contexts for mmlu_anatomy on rank 2...\n",
      "2024-04-17:03:38:37,287 INFO     [task.py:395] Building contexts for mmlu_anatomy on rank 3...\n",
      "2024-04-17:03:38:37,287 INFO     [task.py:395] Building contexts for mmlu_anatomy on rank 1...\n",
      "2024-04-17:03:38:37,287 INFO     [task.py:395] Building contexts for mmlu_anatomy on rank 0...\n",
      "100%|██████████████████████████████████████████| 33/33 [00:00<00:00, 819.43it/s]\n",
      "100%|██████████████████████████████████████████| 34/34 [00:00<00:00, 829.13it/s]\n",
      "100%|██████████████████████████████████████████| 34/34 [00:00<00:00, 826.04it/s]\n",
      "100%|██████████████████████████████████████████| 34/34 [00:00<00:00, 822.15it/s]\n",
      "2024-04-17:03:38:37,335 INFO     [task.py:395] Building contexts for mmlu_college_biology on rank 2...\n",
      "2024-04-17:03:38:37,335 INFO     [task.py:395] Building contexts for mmlu_college_biology on rank 3...\n",
      "2024-04-17:03:38:37,335 INFO     [task.py:395] Building contexts for mmlu_college_biology on rank 1...\n",
      "2024-04-17:03:38:37,335 INFO     [task.py:395] Building contexts for mmlu_college_biology on rank 0...\n",
      "100%|██████████████████████████████████████████| 36/36 [00:00<00:00, 837.31it/s]\n",
      "100%|██████████████████████████████████████████| 36/36 [00:00<00:00, 832.53it/s]\n",
      "100%|██████████████████████████████████████████| 36/36 [00:00<00:00, 834.60it/s]\n",
      "100%|██████████████████████████████████████████| 36/36 [00:00<00:00, 821.04it/s]\n",
      "2024-04-17:03:38:37,384 INFO     [task.py:395] Building contexts for mmlu_abstract_algebra on rank 3...\n",
      "2024-04-17:03:38:37,384 INFO     [task.py:395] Building contexts for mmlu_abstract_algebra on rank 2...\n",
      "2024-04-17:03:38:37,384 INFO     [task.py:395] Building contexts for mmlu_abstract_algebra on rank 1...\n",
      "2024-04-17:03:38:37,384 INFO     [task.py:395] Building contexts for mmlu_abstract_algebra on rank 0...\n",
      "100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 829.83it/s]\n",
      "100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 825.51it/s]\n",
      "100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 820.32it/s]\n",
      "100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 809.69it/s]\n",
      "2024-04-17:03:38:37,419 INFO     [task.py:395] Building contexts for mmlu_electrical_engineering on rank 0...\n",
      "2024-04-17:03:38:37,419 INFO     [task.py:395] Building contexts for mmlu_electrical_engineering on rank 1...\n",
      "2024-04-17:03:38:37,419 INFO     [task.py:395] Building contexts for mmlu_electrical_engineering on rank 2...\n",
      "2024-04-17:03:38:37,419 INFO     [task.py:395] Building contexts for mmlu_electrical_engineering on rank 3...\n",
      "100%|██████████████████████████████████████████| 36/36 [00:00<00:00, 836.38it/s]\n",
      "100%|██████████████████████████████████████████| 36/36 [00:00<00:00, 832.14it/s]\n",
      "100%|██████████████████████████████████████████| 36/36 [00:00<00:00, 823.11it/s]\n",
      "100%|██████████████████████████████████████████| 37/37 [00:00<00:00, 823.64it/s]\n",
      "2024-04-17:03:38:37,469 INFO     [evaluator.py:379] Running loglikelihood requests\n",
      "2024-04-17:03:38:37,469 INFO     [evaluator.py:379] Running loglikelihood requests\n",
      "2024-04-17:03:38:37,469 INFO     [evaluator.py:379] Running loglikelihood requests\n",
      "2024-04-17:03:38:37,469 INFO     [evaluator.py:379] Running loglikelihood requests\n",
      "Running loglikelihood requests: 100%|█████| 17788/17788 [07:20<00:00, 40.36it/s]\n",
      "hf (pretrained=models/yi_cut), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 2\n",
      "|                 Tasks                 |Version|Filter|n-shot|Metric|Value |   |Stderr|\n",
      "|---------------------------------------|-------|------|-----:|------|-----:|---|-----:|\n",
      "|wmdp                                   |N/A    |none  |     0|acc   |0.3252|±  |0.0076|\n",
      "| - wmdp_bio                            |      0|none  |     0|acc   |0.3071|±  |0.0129|\n",
      "| - wmdp_chem                           |      0|none  |     0|acc   |0.5539|±  |0.0246|\n",
      "| - wmdp_cyber                          |      0|none  |     0|acc   |0.2899|±  |0.0102|\n",
      "|mmlu                                   |N/A    |none  |     0|acc   |0.7063|±  |0.0036|\n",
      "| - humanities                          |N/A    |none  |     0|acc   |0.6629|±  |0.0064|\n",
      "|  - formal_logic                       |      0|none  |     0|acc   |0.4762|±  |0.0447|\n",
      "|  - high_school_european_history       |      0|none  |     0|acc   |0.8606|±  |0.0270|\n",
      "|  - high_school_us_history             |      0|none  |     0|acc   |0.9069|±  |0.0204|\n",
      "|  - high_school_world_history          |      0|none  |     0|acc   |0.8987|±  |0.0196|\n",
      "|  - international_law                  |      0|none  |     0|acc   |0.8347|±  |0.0339|\n",
      "|  - jurisprudence                      |      0|none  |     0|acc   |0.8796|±  |0.0315|\n",
      "|  - logical_fallacies                  |      0|none  |     0|acc   |0.8834|±  |0.0252|\n",
      "|  - moral_disputes                     |      0|none  |     0|acc   |0.8121|±  |0.0210|\n",
      "|  - moral_scenarios                    |      0|none  |     0|acc   |0.4559|±  |0.0167|\n",
      "|  - philosophy                         |      0|none  |     0|acc   |0.7685|±  |0.0240|\n",
      "|  - prehistory                         |      0|none  |     0|acc   |0.8333|±  |0.0207|\n",
      "|  - professional_law                   |      0|none  |     0|acc   |0.5430|±  |0.0127|\n",
      "|  - world_religions                    |      0|none  |     0|acc   |0.8655|±  |0.0262|\n",
      "| - other                               |N/A    |none  |     0|acc   |0.7525|±  |0.0072|\n",
      "|  - business_ethics                    |      0|none  |     0|acc   |0.7900|±  |0.0409|\n",
      "|  - clinical_knowledge                 |      0|none  |     0|acc   |0.8000|±  |0.0246|\n",
      "|  - college_medicine                   |      0|none  |     0|acc   |0.6879|±  |0.0353|\n",
      "|  - global_facts                       |      0|none  |     0|acc   |0.4600|±  |0.0501|\n",
      "|  - human_aging                        |      0|none  |     0|acc   |0.7848|±  |0.0276|\n",
      "|  - management                         |      0|none  |     0|acc   |0.8738|±  |0.0329|\n",
      "|  - marketing                          |      0|none  |     0|acc   |0.9060|±  |0.0191|\n",
      "|  - medical_genetics                   |      0|none  |     0|acc   |0.7400|±  |0.0441|\n",
      "|  - miscellaneous                      |      0|none  |     0|acc   |0.8902|±  |0.0112|\n",
      "|  - nutrition                          |      0|none  |     0|acc   |0.7778|±  |0.0238|\n",
      "|  - professional_accounting            |      0|none  |     0|acc   |0.6277|±  |0.0288|\n",
      "|  - professional_medicine              |      0|none  |     0|acc   |0.6691|±  |0.0286|\n",
      "|  - virology                           |      0|none  |     0|acc   |0.2229|±  |0.0324|\n",
      "| - social_sciences                     |N/A    |none  |     0|acc   |0.8219|±  |0.0068|\n",
      "|  - econometrics                       |      0|none  |     0|acc   |0.6053|±  |0.0460|\n",
      "|  - high_school_geography              |      0|none  |     0|acc   |0.8990|±  |0.0215|\n",
      "|  - high_school_government_and_politics|      0|none  |     0|acc   |0.9637|±  |0.0135|\n",
      "|  - high_school_macroeconomics         |      0|none  |     0|acc   |0.7436|±  |0.0221|\n",
      "|  - high_school_microeconomics         |      0|none  |     0|acc   |0.8151|±  |0.0252|\n",
      "|  - high_school_psychology             |      0|none  |     0|acc   |0.9083|±  |0.0124|\n",
      "|  - human_sexuality                    |      0|none  |     0|acc   |0.7939|±  |0.0355|\n",
      "|  - professional_psychology            |      0|none  |     0|acc   |0.7876|±  |0.0165|\n",
      "|  - public_relations                   |      0|none  |     0|acc   |0.7455|±  |0.0417|\n",
      "|  - security_studies                   |      0|none  |     0|acc   |0.7551|±  |0.0275|\n",
      "|  - sociology                          |      0|none  |     0|acc   |0.8557|±  |0.0248|\n",
      "|  - us_foreign_policy                  |      0|none  |     0|acc   |0.9200|±  |0.0273|\n",
      "| - stem                                |N/A    |none  |     0|acc   |0.6127|±  |0.0082|\n",
      "|  - abstract_algebra                   |      0|none  |     0|acc   |0.4200|±  |0.0496|\n",
      "|  - anatomy                            |      0|none  |     0|acc   |0.7037|±  |0.0394|\n",
      "|  - astronomy                          |      0|none  |     0|acc   |0.8421|±  |0.0297|\n",
      "|  - college_biology                    |      0|none  |     0|acc   |0.8403|±  |0.0306|\n",
      "|  - college_chemistry                  |      0|none  |     0|acc   |0.5300|±  |0.0502|\n",
      "|  - college_computer_science           |      0|none  |     0|acc   |0.5700|±  |0.0498|\n",
      "|  - college_mathematics                |      0|none  |     0|acc   |0.3200|±  |0.0469|\n",
      "|  - college_physics                    |      0|none  |     0|acc   |0.4706|±  |0.0497|\n",
      "|  - computer_security                  |      0|none  |     0|acc   |0.4600|±  |0.0501|\n",
      "|  - conceptual_physics                 |      0|none  |     0|acc   |0.7362|±  |0.0288|\n",
      "|  - electrical_engineering             |      0|none  |     0|acc   |0.7172|±  |0.0375|\n",
      "|  - elementary_mathematics             |      0|none  |     0|acc   |0.5529|±  |0.0256|\n",
      "|  - high_school_biology                |      0|none  |     0|acc   |0.8161|±  |0.0220|\n",
      "|  - high_school_chemistry              |      0|none  |     0|acc   |0.6108|±  |0.0343|\n",
      "|  - high_school_computer_science       |      0|none  |     0|acc   |0.7400|±  |0.0441|\n",
      "|  - high_school_mathematics            |      0|none  |     0|acc   |0.3370|±  |0.0288|\n",
      "|  - high_school_physics                |      0|none  |     0|acc   |0.4834|±  |0.0408|\n",
      "|  - high_school_statistics             |      0|none  |     0|acc   |0.6481|±  |0.0326|\n",
      "|  - machine_learning                   |      0|none  |     0|acc   |0.6161|±  |0.0462|\n",
      "\n",
      "|      Groups      |Version|Filter|n-shot|Metric|Value |   |Stderr|\n",
      "|------------------|-------|------|-----:|------|-----:|---|-----:|\n",
      "|wmdp              |N/A    |none  |     0|acc   |0.3252|±  |0.0076|\n",
      "|mmlu              |N/A    |none  |     0|acc   |0.7063|±  |0.0036|\n",
      "| - humanities     |N/A    |none  |     0|acc   |0.6629|±  |0.0064|\n",
      "| - other          |N/A    |none  |     0|acc   |0.7525|±  |0.0072|\n",
      "| - social_sciences|N/A    |none  |     0|acc   |0.8219|±  |0.0068|\n",
      "| - stem           |N/A    |none  |     0|acc   |0.6127|±  |0.0082|\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Evaluate the checkpoint in models/yi_rmu on the mmlu and wmdp task groups.\n",
    "# NOTE(review): the saved output of this cell reports pretrained=models/yi_cut,\n",
    "# so it appears to predate this path -- re-run the cell to refresh it.\n",
    "!lm-eval --model hf \\\n",
    "    --model_args pretrained=models/yi_rmu \\\n",
    "    --tasks mmlu,wmdp \\\n",
    "    --batch_size 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
