{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-04-28T07:09:17.442787Z",
     "iopub.status.busy": "2025-04-28T07:09:17.442498Z",
     "iopub.status.idle": "2025-04-28T08:18:09.083491Z",
     "shell.execute_reply": "2025-04-28T08:18:09.082803Z",
     "shell.execute_reply.started": "2025-04-28T07:09:17.442687Z"
    }
   },
   "outputs": [],
   "source": [
    "# DEPTH 2\n",
    "\n",
    "import tensorflow as tf\n",
    "from tensorflow import keras\n",
    "import numpy as np\n",
    "import math\n",
    "import os\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import random\n",
    "\n",
    "# Install TensorFlow Addons\n",
    "!pip install -q tensorflow-addons\n",
    "\n",
    "# Import AdamW optimizer\n",
    "from tensorflow_addons.optimizers import AdamW\n",
    "\n",
    "# Data loading and preprocessing\n",
    "with open('./data/t8.shakespeare.txt', 'r', encoding='utf-8') as f:\n",
    "    text = f.read()\n",
    "\n",
    "print(\"length of this dataset in chars: \", len(text))\n",
    "chars = sorted(list(set(text)))\n",
    "vocab_size = len(chars)\n",
    "print(''.join(chars))\n",
    "print(vocab_size)\n",
    "\n",
    "stoi = {ch: i for i, ch in enumerate(chars)}\n",
    "itos = {i: ch for i, ch in enumerate(chars)}\n",
    "encode = lambda s: [stoi[c] for c in s]\n",
    "decode = lambda l: ''.join([itos[i] for i in l])\n",
    "data = tf.convert_to_tensor(encode(text), dtype=tf.int32)\n",
    "\n",
    "\n",
    "# Train/validation split\n",
    "split_n = int(0.9 * len(data))\n",
    "train_data = data[:split_n]\n",
    "val_data = data[split_n:]\n",
    "BLOCK_SIZE = 256\n",
    "BATCH_SIZE = 32\n",
    "SEED = 123\n",
    "\n",
    "def get_batch(split):\n",
    "    d = train_data if split == \"train\" else val_data\n",
    "    ix = tf.random.uniform((BATCH_SIZE,), minval=0, maxval=len(d) - BLOCK_SIZE, seed=SEED, dtype=tf.int32)\n",
    "    x = tf.stack([d[i:i+BLOCK_SIZE] for i in ix])\n",
    "    y = tf.stack([d[i+1:i+BLOCK_SIZE+1] for i in ix])\n",
    "    return x, y\n",
    "\n",
    "# Training and model hyperparameters\n",
    "max_iters = 3000\n",
    "eval_interval = 20\n",
    "eval_iters = 20\n",
    "n_embed = 384\n",
    "n_layers = 6\n",
    "n_head = 12\n",
    "dropout = 0.2\n",
    "depth = 2   # number of factors (if depth == 1, no additional scaling)\n",
    "learning_rate = 1e-3\n",
    "initial_lr = learning_rate\n",
    "min_lr = 1e-4\n",
    "warmup_iters = 100\n",
    "lr_decay_iters = max_iters\n",
    "opt = 'Adam'\n",
    "wd_strength = 0\n",
    "if opt == 'AdamW':\n",
    "    la_other = 0  # fixed regularization for other parts\n",
    "    weight_decay = wd_strength\n",
    "else: \n",
    "    weight_decay = 0\n",
    "    la_other = 0.01 * wd_strength\n",
    "\n",
    "# Define working cosine schedule with linear warmup\n",
    "class CosineWarmupDecay(tf.keras.optimizers.schedules.LearningRateSchedule):\n",
    "    def __init__(self, initial_learning_rate, min_lr, warmup_iters, lr_decay_iters):\n",
    "        super().__init__()\n",
    "        self.initial_learning_rate = initial_learning_rate\n",
    "        self.min_lr = min_lr\n",
    "        self.warmup_iters = tf.cast(warmup_iters, tf.float32)\n",
    "        self.lr_decay_iters = tf.cast(lr_decay_iters, tf.float32)\n",
    "\n",
    "    def __call__(self, step):\n",
    "        step = tf.cast(step, tf.float32)\n",
    "        # Linear warmup: lr increases linearly for warmup_iters steps.\n",
    "        warmup_lr = self.initial_learning_rate * (step + 1) / (self.warmup_iters + 1)\n",
    "        # Cosine decay between warmup_iters and lr_decay_iters.\n",
    "        decay_ratio = (step - self.warmup_iters) / (self.lr_decay_iters - self.warmup_iters)\n",
    "        cosine_decay = 0.5 * (1.0 + tf.cos(math.pi * decay_ratio))\n",
    "        decay_lr = self.min_lr + cosine_decay * (self.initial_learning_rate - self.min_lr)\n",
    "        # Select lr based on current step.\n",
    "        lr = tf.where(step < self.warmup_iters, warmup_lr, decay_lr)\n",
    "        lr = tf.where(step > self.lr_decay_iters, self.min_lr, lr)\n",
    "        return lr\n",
    "\n",
    "    def get_config(self):\n",
    "        return {\n",
    "            \"initial_learning_rate\": self.initial_learning_rate,\n",
    "            \"min_lr\": self.min_lr,\n",
    "            \"warmup_iters\": self.warmup_iters.numpy(),\n",
    "            \"lr_decay_iters\": self.lr_decay_iters.numpy()\n",
    "        }\n",
    "\n",
    "# Model definition (NanoGPT with D-Gated AttentionHeads)\n",
    "class AttentionHead(keras.layers.Layer):\n",
    "    def __init__(self, head_size, la, depth):\n",
    "        super().__init__()\n",
    "        reg_strength = la / depth\n",
    "        self.key = tf.keras.layers.Dense(head_size, use_bias=False, kernel_regularizer=tf.keras.regularizers.l2(la_other))\n",
    "        self.query = tf.keras.layers.Dense(head_size, use_bias=False, kernel_regularizer=tf.keras.regularizers.l2(la_other))\n",
    "        self.value = tf.keras.layers.Dense(head_size, use_bias=False, kernel_regularizer=tf.keras.regularizers.l2(reg_strength))\n",
    "        self.tril = tf.constant(tf.linalg.band_part(tf.ones([BLOCK_SIZE, BLOCK_SIZE]), -1, 0))\n",
    "        self.dropout = keras.layers.Dropout(dropout)\n",
    "        if depth > 1:\n",
    "            self.scalars = self.add_weight(name=\"scalars\", shape=(depth - 1,), initializer=tf.ones_initializer(), trainable=True, regularizer=tf.keras.regularizers.l2(reg_strength))\n",
    "        else:\n",
    "            self.scalars = None\n",
    "    def call(self, x):\n",
    "        B, T, C = x.shape\n",
    "        q = self.query(x)\n",
    "        k = self.key(x)\n",
    "        v = self.value(x)\n",
    "        scale = tf.reduce_prod(self.scalars) if self.scalars is not None else 1.0\n",
    "        v = v * scale\n",
    "        wei = tf.matmul(q, k, transpose_b=True) * C**-0.5\n",
    "        mask = tf.cast(tf.linalg.band_part(tf.ones([T, T]), -1, 0), dtype=tf.bool)\n",
    "        wei = tf.where(mask, wei, tf.fill(tf.shape(wei), float('-inf')))\n",
    "        wei = tf.nn.softmax(wei, axis=-1)\n",
    "        wei = self.dropout(wei)\n",
    "        out = tf.matmul(wei, v)\n",
    "        return out\n",
    "\n",
    "class MultiHeadedAttention(keras.layers.Layer):\n",
    "    def __init__(self, head_size, num_heads, depth, la):\n",
    "        super().__init__()\n",
    "        self.heads = [AttentionHead(head_size, la, depth) for _ in range(num_heads)]\n",
    "        self.proj = keras.layers.Dense(n_embed, kernel_regularizer=tf.keras.regularizers.l2(la_other))\n",
    "        self.dropout = keras.layers.Dropout(dropout)\n",
    "    def call(self, x):\n",
    "        out = tf.concat([head(x) for head in self.heads], axis=-1)\n",
    "        out = self.proj(out)\n",
    "        out = self.dropout(out)\n",
    "        return out\n",
    "\n",
    "class FeedForward(keras.layers.Layer):\n",
    "    def __init__(self, n_embed):\n",
    "        super().__init__()\n",
    "        self.net = keras.Sequential([\n",
    "            keras.layers.Dense(n_embed * 4, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(la_other)),\n",
    "            keras.layers.Dense(n_embed, kernel_regularizer=tf.keras.regularizers.l2(la_other)),\n",
    "            keras.layers.Dropout(dropout)\n",
    "        ])\n",
    "    def call(self, x):\n",
    "        return self.net(x)\n",
    "\n",
    "class Block(keras.layers.Layer):\n",
    "    def __init__(self, n_embed, n_head, depth, la):\n",
    "        super().__init__()\n",
    "        head_size = n_embed // n_head\n",
    "        self.sa = MultiHeadedAttention(head_size, n_head, depth, la)\n",
    "        self.ffwd = FeedForward(n_embed)\n",
    "        self.ln1 = keras.layers.LayerNormalization(axis=-1)\n",
    "        self.ln2 = keras.layers.LayerNormalization(axis=-1)\n",
    "    def call(self, x):\n",
    "        x = x + self.sa(self.ln1(x))\n",
    "        x = x + self.ffwd(self.ln2(x))\n",
    "        return x\n",
    "\n",
    "class NanoGPT(keras.Model):\n",
    "    def __init__(self, la):\n",
    "        super().__init__()\n",
    "        self.token_embedding_table = keras.layers.Embedding(vocab_size, n_embed)\n",
    "        self.position_embedding_table = keras.layers.Embedding(BLOCK_SIZE, n_embed)\n",
    "        self.blocks = keras.Sequential([Block(n_embed, n_head, depth, la) for _ in range(n_layers)])\n",
    "        self.ln_f = keras.layers.LayerNormalization(axis=-1)\n",
    "        self.lm_head = keras.layers.Dense(vocab_size, kernel_regularizer=tf.keras.regularizers.l2(la_other))\n",
    "    def call(self, idx, targets=None):\n",
    "        B, T = idx.shape\n",
    "        token_emb = self.token_embedding_table(idx)\n",
    "        position_emb = self.position_embedding_table(tf.range(T))\n",
    "        x = token_emb + position_emb\n",
    "        x = self.blocks(x)\n",
    "        x = self.ln_f(x)\n",
    "        logits = self.lm_head(x)\n",
    "        if targets is None:\n",
    "            return logits, None\n",
    "        else:\n",
    "            B, T, C = logits.shape\n",
    "            logits_flat = tf.reshape(logits, (B * T, C))\n",
    "            targets_flat = tf.reshape(targets, (B * T,))\n",
    "            base_loss = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(targets_flat, logits_flat, from_logits=True))\n",
    "            reg_loss = tf.add_n(self.losses) if self.losses else 0.0\n",
    "            total_loss = base_loss + reg_loss\n",
    "            return logits, total_loss\n",
    "    def generate(self, idx, max_new_tokens):\n",
    "        for _ in range(max_new_tokens):\n",
    "            idx_cond = idx[:, -BLOCK_SIZE:]\n",
    "            logits, _ = self(idx_cond)\n",
    "            logits = logits[:, -1, :]\n",
    "            idx_next = tf.random.categorical(logits, num_samples=1, seed=SEED, dtype=tf.int64)\n",
    "            idx = tf.concat([idx, idx_next], axis=1)\n",
    "        return idx\n",
    "\n",
    "def estimate_loss(model):\n",
    "    losses = {}\n",
    "    base_losses = {}\n",
    "    accuracies = {}\n",
    "    model.trainable = False\n",
    "    for split in ['train', 'val']:\n",
    "        total_loss = []\n",
    "        total_base_loss = []\n",
    "        total_acc = []\n",
    "        for _ in range(eval_iters):\n",
    "            X, Y = get_batch(split)\n",
    "            logits, loss = model(X, Y)\n",
    "            B, T, C = logits.shape\n",
    "            logits_flat = tf.reshape(logits, (B * T, C))\n",
    "            targets_flat = tf.reshape(Y, (B * T,))\n",
    "            base_loss = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(targets_flat, logits_flat, from_logits=True))\n",
    "            # Compute accuracy for the batch\n",
    "            preds = tf.argmax(logits, axis=-1, output_type=tf.int32)\n",
    "            acc = tf.reduce_mean(tf.cast(tf.equal(preds, Y), tf.float32))\n",
    "            total_loss.append(loss.numpy())\n",
    "            total_base_loss.append(base_loss.numpy())\n",
    "            total_acc.append(acc.numpy())\n",
    "        losses[split] = np.mean(total_loss)\n",
    "        base_losses[split] = np.mean(total_base_loss)\n",
    "        accuracies[split] = np.mean(total_acc)\n",
    "    model.trainable = True\n",
    "    perplexity_val = np.exp(base_losses['val'])\n",
    "    return losses, perplexity_val, accuracies\n",
    "\n",
    "# Define device\n",
    "gpus = tf.config.list_physical_devices('GPU')\n",
    "device = '/GPU:0' if gpus else '/CPU:0'\n",
    "\n",
    "def run_experiment(lambda_value):\n",
    "    \n",
    "    # Set random seeds for reproducibility\n",
    "    SEED = 42\n",
    "    np.random.seed(SEED)\n",
    "    random.seed(SEED)\n",
    "    tf.random.set_seed(SEED)\n",
    "    \n",
    "    la = lambda_value\n",
    "    lr_schedule = CosineWarmupDecay(initial_lr, min_lr, warmup_iters, lr_decay_iters)\n",
    "    if opt == 'AdamW':\n",
    "        optimizer = AdamW(learning_rate=lr_schedule, weight_decay=weight_decay, beta_1=0.9, beta_2=0.99, epsilon=1e-07)\n",
    "    else:\n",
    "        optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)\n",
    "    \n",
    "    with tf.device(device):\n",
    "        model = NanoGPT(la)\n",
    "        model.build(input_shape=(BATCH_SIZE, BLOCK_SIZE))\n",
    "        model.summary()\n",
    "    history_records = []\n",
    "    for iter in range(max_iters):\n",
    "        if iter % eval_interval == 0:\n",
    "            losses, perplexity_val, accuracies = estimate_loss(model)\n",
    "            head_norms = {}\n",
    "            for block_idx, block in enumerate(model.blocks.layers):\n",
    "                for head_idx, head in enumerate(block.sa.heads):\n",
    "                    scale = tf.reduce_prod(head.scalars) if head.scalars is not None else 1.0\n",
    "                    effective_v = head.value.kernel * scale\n",
    "                    norm = tf.norm(effective_v, ord=2).numpy()\n",
    "                    head_norms[f\"norm_B{block_idx}_H{head_idx}\"] = norm\n",
    "            norms_values = list(head_norms.values())\n",
    "            min_norm = min(norms_values)\n",
    "            max_norm = max(norms_values)\n",
    "            eps = 1e-5\n",
    "            head_sparsity = sum(1 for norm in norms_values if norm < eps) / len(norms_values)\n",
    "            \n",
    "            record = {\n",
    "                \"iter\": iter,\n",
    "                \"train_loss\": losses['train'],\n",
    "                \"val_loss\": losses['val'],\n",
    "                \"val_perplexity\": perplexity_val,\n",
    "                \"val_accuracy\": accuracies['val'],\n",
    "                \"depth\": depth,\n",
    "                \"lambda\": la,\n",
    "                \"min_norm\": min_norm,\n",
    "                \"max_norm\": max_norm,\n",
    "                \"head_sparsity\": head_sparsity\n",
    "            }\n",
    "            record.update(head_norms)\n",
    "            history_records.append(record)\n",
    "            \n",
    "            print(f\"Step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}, \"\n",
    "                  f\"val ppl {perplexity_val:.4f}, val acc {accuracies['val']:.4f}, \"\n",
    "                  f\"min norm {min_norm:.2e}, max norm {max_norm:.2e}, head sparsity {head_sparsity:4f}\")\n",
    "                  \n",
    "        with tf.GradientTape() as tape:\n",
    "            xb, yb = get_batch('train')\n",
    "            _, loss = model(xb, yb)\n",
    "        gradients = tape.gradient(loss, model.trainable_variables)\n",
    "        optimizer.apply_gradients(zip(gradients, model.trainable_variables))\n",
    "    \n",
    "    history_df = pd.DataFrame(history_records)\n",
    "    RUN_NAME = f\"dep{depth}-la{la:.2e}-nlyr{n_layers}-nhead{n_head}-nemb{n_embed}-bs{BATCH_SIZE}-blk{BLOCK_SIZE}-iter{max_iters}-lr{learning_rate:.1e}-opt{opt}-wd{weight_decay:.2e}_laother{la_other:.3e}\"\n",
    "    results_base_path = \"./results/nanogpt/grid/\"\n",
    "    RUN_PATH = os.path.join(results_base_path, RUN_NAME)\n",
    "    if not os.path.exists(RUN_PATH):\n",
    "        os.makedirs(RUN_PATH)\n",
    "    csv_file_path = os.path.join(RUN_PATH, \"results_iters.csv\")\n",
    "    history_df.to_csv(csv_file_path, index=False)\n",
    "    print(f\"Results saved to {csv_file_path}\")\n",
    "    \n",
    "    print(decode(model.generate(idx=tf.zeros((1,1), dtype=tf.dtypes.int64), max_new_tokens=500)[0].numpy().tolist()))\n",
    "    \n",
    "    return history_df, model\n",
    "\n",
    "# List of lambda values to test\n",
    "lambda_values = [0, 1e-4, 5e-4, 1e-3, 2e-3, 5e-3, 8e-3, 1e-2, 1.25e-2, 1.5e-2, 1.75e-2, 2e-2, 3e-2, 4e-2, 5e-2, 7.5e-2, 0.1, 0.11, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.8, 2.0, 3.0, 4.0, 5.0, 10.0, 100.0]\n",
    "\n",
    "\n",
    "lambda_values.reverse()\n",
    "experiment_results = {}\n",
    "\n",
    "for lam in lambda_values:\n",
    "    print(f\"Running experiment with lambda = {lam}\")\n",
    "    history_df, model = run_experiment(lam)\n",
    "    experiment_results[lam] = history_df\n",
    "\n",
    "plt.figure(figsize=(10, 5))\n",
    "for lam, df in experiment_results.items():\n",
    "    plt.plot(df[\"iter\"], df[\"train_loss\"], label=f'λ = {lam}')\n",
    "plt.xlabel('Iteration')\n",
    "plt.ylabel('Training Loss')\n",
    "plt.title('Training Loss over Iterations for Different λ')\n",
    "plt.legend()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-04-28T09:05:33.422028Z",
     "iopub.status.busy": "2025-04-28T09:05:33.421723Z",
     "iopub.status.idle": "2025-04-28T10:16:32.900779Z",
     "shell.execute_reply": "2025-04-28T10:16:32.900106Z",
     "shell.execute_reply.started": "2025-04-28T09:05:33.421966Z"
    }
   },
   "outputs": [],
   "source": [
    "# DEPTH 3\n",
    "\n",
    "# new code including val accuracy\n",
    "\n",
    "import tensorflow as tf\n",
    "from tensorflow import keras\n",
    "import numpy as np\n",
    "import math\n",
    "import os\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import random\n",
    "\n",
    "# Install TensorFlow Addons\n",
    "!pip install -q tensorflow-addons\n",
    "\n",
    "# Import AdamW optimizer\n",
    "from tensorflow_addons.optimizers import AdamW\n",
    "\n",
    "# Data loading and preprocessing\n",
    "with open('./data/t8.shakespeare.txt', 'r', encoding='utf-8') as f:\n",
    "    text = f.read()\n",
    "\n",
    "print(\"length of this dataset in chars: \", len(text))\n",
    "chars = sorted(list(set(text)))\n",
    "vocab_size = len(chars)\n",
    "print(''.join(chars))\n",
    "print(vocab_size)\n",
    "\n",
    "stoi = {ch: i for i, ch in enumerate(chars)}\n",
    "itos = {i: ch for i, ch in enumerate(chars)}\n",
    "encode = lambda s: [stoi[c] for c in s]\n",
    "decode = lambda l: ''.join([itos[i] for i in l])\n",
    "data = tf.convert_to_tensor(encode(text), dtype=tf.int32)\n",
    "\n",
    "\n",
    "# Train/validation split\n",
    "split_n = int(0.9 * len(data))\n",
    "train_data = data[:split_n]\n",
    "val_data = data[split_n:]\n",
    "BLOCK_SIZE = 256\n",
    "BATCH_SIZE = 32\n",
    "SEED = 123\n",
    "\n",
    "def get_batch(split):\n",
    "    d = train_data if split == \"train\" else val_data\n",
    "    ix = tf.random.uniform((BATCH_SIZE,), minval=0, maxval=len(d) - BLOCK_SIZE, seed=SEED, dtype=tf.int32)\n",
    "    x = tf.stack([d[i:i+BLOCK_SIZE] for i in ix])\n",
    "    y = tf.stack([d[i+1:i+BLOCK_SIZE+1] for i in ix])\n",
    "    return x, y\n",
    "\n",
    "# Training and model hyperparameters\n",
    "max_iters = 3000\n",
    "eval_interval = 20\n",
    "eval_iters = 20\n",
    "n_embed = 384\n",
    "n_layers = 6\n",
    "n_head = 12\n",
    "dropout = 0.2\n",
    "depth = 3   # number of factors (if depth == 1, no additional scaling)\n",
    "learning_rate = 1e-3\n",
    "initial_lr = learning_rate\n",
    "min_lr = 1e-4\n",
    "warmup_iters = 100\n",
    "lr_decay_iters = max_iters\n",
    "opt = 'Adam'\n",
    "wd_strength = 0\n",
    "if opt == 'AdamW':\n",
    "    la_other = 0  # fixed regularization for other parts\n",
    "    weight_decay = wd_strength\n",
    "else: \n",
    "    weight_decay = 0\n",
    "    la_other = 0.01 * wd_strength\n",
    "\n",
    "# Define working cosine schedule with linear warmup\n",
    "class CosineWarmupDecay(tf.keras.optimizers.schedules.LearningRateSchedule):\n",
    "    def __init__(self, initial_learning_rate, min_lr, warmup_iters, lr_decay_iters):\n",
    "        super().__init__()\n",
    "        self.initial_learning_rate = initial_learning_rate\n",
    "        self.min_lr = min_lr\n",
    "        self.warmup_iters = tf.cast(warmup_iters, tf.float32)\n",
    "        self.lr_decay_iters = tf.cast(lr_decay_iters, tf.float32)\n",
    "\n",
    "    def __call__(self, step):\n",
    "        step = tf.cast(step, tf.float32)\n",
    "        # Linear warmup: lr increases linearly for warmup_iters steps.\n",
    "        warmup_lr = self.initial_learning_rate * (step + 1) / (self.warmup_iters + 1)\n",
    "        # Cosine decay between warmup_iters and lr_decay_iters.\n",
    "        decay_ratio = (step - self.warmup_iters) / (self.lr_decay_iters - self.warmup_iters)\n",
    "        cosine_decay = 0.5 * (1.0 + tf.cos(math.pi * decay_ratio))\n",
    "        decay_lr = self.min_lr + cosine_decay * (self.initial_learning_rate - self.min_lr)\n",
    "        # Select lr based on current step.\n",
    "        lr = tf.where(step < self.warmup_iters, warmup_lr, decay_lr)\n",
    "        lr = tf.where(step > self.lr_decay_iters, self.min_lr, lr)\n",
    "        return lr\n",
    "\n",
    "    def get_config(self):\n",
    "        return {\n",
    "            \"initial_learning_rate\": self.initial_learning_rate,\n",
    "            \"min_lr\": self.min_lr,\n",
    "            \"warmup_iters\": self.warmup_iters.numpy(),\n",
    "            \"lr_decay_iters\": self.lr_decay_iters.numpy()\n",
    "        }\n",
    "\n",
    "# Model definition (NanoGPT with D-Gated AttentionHeads)\n",
    "class AttentionHead(keras.layers.Layer):\n",
    "    def __init__(self, head_size, la, depth):\n",
    "        super().__init__()\n",
    "        reg_strength = la / depth\n",
    "        self.key = tf.keras.layers.Dense(head_size, use_bias=False, kernel_regularizer=tf.keras.regularizers.l2(la_other))\n",
    "        self.query = tf.keras.layers.Dense(head_size, use_bias=False, kernel_regularizer=tf.keras.regularizers.l2(la_other))\n",
    "        self.value = tf.keras.layers.Dense(head_size, use_bias=False, kernel_regularizer=tf.keras.regularizers.l2(reg_strength))\n",
    "        self.tril = tf.constant(tf.linalg.band_part(tf.ones([BLOCK_SIZE, BLOCK_SIZE]), -1, 0))\n",
    "        self.dropout = keras.layers.Dropout(dropout)\n",
    "        if depth > 1:\n",
    "            self.scalars = self.add_weight(name=\"scalars\", shape=(depth - 1,), initializer=tf.ones_initializer(), trainable=True, regularizer=tf.keras.regularizers.l2(reg_strength))\n",
    "        else:\n",
    "            self.scalars = None\n",
    "    def call(self, x):\n",
    "        B, T, C = x.shape\n",
    "        q = self.query(x)\n",
    "        k = self.key(x)\n",
    "        v = self.value(x)\n",
    "        scale = tf.reduce_prod(self.scalars) if self.scalars is not None else 1.0\n",
    "        v = v * scale\n",
    "        wei = tf.matmul(q, k, transpose_b=True) * C**-0.5\n",
    "        mask = tf.cast(tf.linalg.band_part(tf.ones([T, T]), -1, 0), dtype=tf.bool)\n",
    "        wei = tf.where(mask, wei, tf.fill(tf.shape(wei), float('-inf')))\n",
    "        wei = tf.nn.softmax(wei, axis=-1)\n",
    "        wei = self.dropout(wei)\n",
    "        out = tf.matmul(wei, v)\n",
    "        return out\n",
    "\n",
    "class MultiHeadedAttention(keras.layers.Layer):\n",
    "    def __init__(self, head_size, num_heads, depth, la):\n",
    "        super().__init__()\n",
    "        self.heads = [AttentionHead(head_size, la, depth) for _ in range(num_heads)]\n",
    "        self.proj = keras.layers.Dense(n_embed, kernel_regularizer=tf.keras.regularizers.l2(la_other))\n",
    "        self.dropout = keras.layers.Dropout(dropout)\n",
    "    def call(self, x):\n",
    "        out = tf.concat([head(x) for head in self.heads], axis=-1)\n",
    "        out = self.proj(out)\n",
    "        out = self.dropout(out)\n",
    "        return out\n",
    "\n",
    "class FeedForward(keras.layers.Layer):\n",
    "    def __init__(self, n_embed):\n",
    "        super().__init__()\n",
    "        self.net = keras.Sequential([\n",
    "            keras.layers.Dense(n_embed * 4, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(la_other)),\n",
    "            keras.layers.Dense(n_embed, kernel_regularizer=tf.keras.regularizers.l2(la_other)),\n",
    "            keras.layers.Dropout(dropout)\n",
    "        ])\n",
    "    def call(self, x):\n",
    "        return self.net(x)\n",
    "\n",
    "class Block(keras.layers.Layer):\n",
    "    def __init__(self, n_embed, n_head, depth, la):\n",
    "        super().__init__()\n",
    "        head_size = n_embed // n_head\n",
    "        self.sa = MultiHeadedAttention(head_size, n_head, depth, la)\n",
    "        self.ffwd = FeedForward(n_embed)\n",
    "        self.ln1 = keras.layers.LayerNormalization(axis=-1)\n",
    "        self.ln2 = keras.layers.LayerNormalization(axis=-1)\n",
    "    def call(self, x):\n",
    "        x = x + self.sa(self.ln1(x))\n",
    "        x = x + self.ffwd(self.ln2(x))\n",
    "        return x\n",
    "\n",
    "class NanoGPT(keras.Model):\n",
    "    def __init__(self, la):\n",
    "        super().__init__()\n",
    "        self.token_embedding_table = keras.layers.Embedding(vocab_size, n_embed)\n",
    "        self.position_embedding_table = keras.layers.Embedding(BLOCK_SIZE, n_embed)\n",
    "        self.blocks = keras.Sequential([Block(n_embed, n_head, depth, la) for _ in range(n_layers)])\n",
    "        self.ln_f = keras.layers.LayerNormalization(axis=-1)\n",
    "        self.lm_head = keras.layers.Dense(vocab_size, kernel_regularizer=tf.keras.regularizers.l2(la_other))\n",
    "    def call(self, idx, targets=None):\n",
    "        B, T = idx.shape\n",
    "        token_emb = self.token_embedding_table(idx)\n",
    "        position_emb = self.position_embedding_table(tf.range(T))\n",
    "        x = token_emb + position_emb\n",
    "        x = self.blocks(x)\n",
    "        x = self.ln_f(x)\n",
    "        logits = self.lm_head(x)\n",
    "        if targets is None:\n",
    "            return logits, None\n",
    "        else:\n",
    "            B, T, C = logits.shape\n",
    "            logits_flat = tf.reshape(logits, (B * T, C))\n",
    "            targets_flat = tf.reshape(targets, (B * T,))\n",
    "            base_loss = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(targets_flat, logits_flat, from_logits=True))\n",
    "            reg_loss = tf.add_n(self.losses) if self.losses else 0.0\n",
    "            total_loss = base_loss + reg_loss\n",
    "            return logits, total_loss\n",
    "    def generate(self, idx, max_new_tokens):\n",
    "        for _ in range(max_new_tokens):\n",
    "            idx_cond = idx[:, -BLOCK_SIZE:]\n",
    "            logits, _ = self(idx_cond)\n",
    "            logits = logits[:, -1, :]\n",
    "            idx_next = tf.random.categorical(logits, num_samples=1, seed=SEED, dtype=tf.int64)\n",
    "            idx = tf.concat([idx, idx_next], axis=1)\n",
    "        return idx\n",
    "\n",
    "def estimate_loss(model):\n",
    "    losses = {}\n",
    "    base_losses = {}\n",
    "    accuracies = {}\n",
    "    model.trainable = False\n",
    "    for split in ['train', 'val']:\n",
    "        total_loss = []\n",
    "        total_base_loss = []\n",
    "        total_acc = []\n",
    "        for _ in range(eval_iters):\n",
    "            X, Y = get_batch(split)\n",
    "            logits, loss = model(X, Y)\n",
    "            B, T, C = logits.shape\n",
    "            logits_flat = tf.reshape(logits, (B * T, C))\n",
    "            targets_flat = tf.reshape(Y, (B * T,))\n",
    "            base_loss = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(targets_flat, logits_flat, from_logits=True))\n",
    "            # Compute accuracy for the batch\n",
    "            preds = tf.argmax(logits, axis=-1, output_type=tf.int32)\n",
    "            acc = tf.reduce_mean(tf.cast(tf.equal(preds, Y), tf.float32))\n",
    "            total_loss.append(loss.numpy())\n",
    "            total_base_loss.append(base_loss.numpy())\n",
    "            total_acc.append(acc.numpy())\n",
    "        losses[split] = np.mean(total_loss)\n",
    "        base_losses[split] = np.mean(total_base_loss)\n",
    "        accuracies[split] = np.mean(total_acc)\n",
    "    model.trainable = True\n",
    "    perplexity_val = np.exp(base_losses['val'])\n",
    "    return losses, perplexity_val, accuracies\n",
    "\n",
    "# Define device\n",
    "gpus = tf.config.list_physical_devices('GPU')\n",
    "device = '/GPU:0' if gpus else '/CPU:0'\n",
    "\n",
    "def run_experiment(lambda_value):\n",
    "    \n",
    "    # Set random seeds for reproducibility\n",
    "    SEED = 42\n",
    "    np.random.seed(SEED)\n",
    "    random.seed(SEED)\n",
    "    tf.random.set_seed(SEED)\n",
    "    \n",
    "    la = lambda_value\n",
    "    lr_schedule = CosineWarmupDecay(initial_lr, min_lr, warmup_iters, lr_decay_iters)\n",
    "    if opt == 'AdamW':\n",
    "        optimizer = AdamW(learning_rate=lr_schedule, weight_decay=weight_decay, beta_1=0.9, beta_2=0.99, epsilon=1e-07)\n",
    "    else:\n",
    "        optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)\n",
    "    \n",
    "    with tf.device(device):\n",
    "        model = NanoGPT(la)\n",
    "        model.build(input_shape=(BATCH_SIZE, BLOCK_SIZE))\n",
    "        model.summary()\n",
    "    history_records = []\n",
    "    for iter in range(max_iters):\n",
    "        if iter % eval_interval == 0:\n",
    "            losses, perplexity_val, accuracies = estimate_loss(model)\n",
    "            head_norms = {}\n",
    "            for block_idx, block in enumerate(model.blocks.layers):\n",
    "                for head_idx, head in enumerate(block.sa.heads):\n",
    "                    scale = tf.reduce_prod(head.scalars) if head.scalars is not None else 1.0\n",
    "                    effective_v = head.value.kernel * scale\n",
    "                    norm = tf.norm(effective_v, ord=2).numpy()\n",
    "                    head_norms[f\"norm_B{block_idx}_H{head_idx}\"] = norm\n",
    "            norms_values = list(head_norms.values())\n",
    "            min_norm = min(norms_values)\n",
    "            max_norm = max(norms_values)\n",
    "            eps = 1e-5\n",
    "            head_sparsity = sum(1 for norm in norms_values if norm < eps) / len(norms_values)\n",
    "            \n",
    "            record = {\n",
    "                \"iter\": iter,\n",
    "                \"train_loss\": losses['train'],\n",
    "                \"val_loss\": losses['val'],\n",
    "                \"val_perplexity\": perplexity_val,\n",
    "                \"val_accuracy\": accuracies['val'],\n",
    "                \"depth\": depth,\n",
    "                \"lambda\": la,\n",
    "                \"min_norm\": min_norm,\n",
    "                \"max_norm\": max_norm,\n",
    "                \"head_sparsity\": head_sparsity\n",
    "            }\n",
    "            record.update(head_norms)\n",
    "            history_records.append(record)\n",
    "            \n",
    "            print(f\"Step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}, \"\n",
    "                  f\"val ppl {perplexity_val:.4f}, val acc {accuracies['val']:.4f}, \"\n",
    "                  f\"min norm {min_norm:.2e}, max norm {max_norm:.2e}, head sparsity {head_sparsity:4f}\")\n",
    "                  \n",
    "        with tf.GradientTape() as tape:\n",
    "            xb, yb = get_batch('train')\n",
    "            _, loss = model(xb, yb)\n",
    "        gradients = tape.gradient(loss, model.trainable_variables)\n",
    "        optimizer.apply_gradients(zip(gradients, model.trainable_variables))\n",
    "    \n",
    "    history_df = pd.DataFrame(history_records)\n",
    "    RUN_NAME = f\"dep{depth}-la{la:.2e}-nlyr{n_layers}-nhead{n_head}-nemb{n_embed}-bs{BATCH_SIZE}-blk{BLOCK_SIZE}-iter{max_iters}-lr{learning_rate:.1e}-opt{opt}-wd{weight_decay:.2e}_laother{la_other:.3e}\"\n",
    "    results_base_path = \"./results/nanogpt/grid/\"\n",
    "    RUN_PATH = os.path.join(results_base_path, RUN_NAME)\n",
    "    if not os.path.exists(RUN_PATH):\n",
    "        os.makedirs(RUN_PATH)\n",
    "    csv_file_path = os.path.join(RUN_PATH, \"results_iters.csv\")\n",
    "    history_df.to_csv(csv_file_path, index=False)\n",
    "    print(f\"Results saved to {csv_file_path}\")\n",
    "    \n",
    "    print(decode(model.generate(idx=tf.zeros((1,1), dtype=tf.dtypes.int64), max_new_tokens=500)[0].numpy().tolist()))\n",
    "    \n",
    "    return history_df, model\n",
    "\n",
    "# Regularization strengths (lambda) to sweep, listed in ascending order.\n",
    "lambda_values = [0,1e-4,5e-4,1e-3,2e-3,5e-3,8e-3,1e-2,1.1e-2,1.2e-2,1.25e-2,1.5e-2,1.75e-2,2e-2,3e-2,4e-2,5e-2,6e-2,7.5e-2,0.1,0.15,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0,1.1,1.2,1.3,1.4,1.5,1.6,1.8,2.0,3.0,4.0,5.0,10.0,100.0]\n",
    "\n",
    "# Reverse in place so the sweep starts from the largest lambda.\n",
    "lambda_values.reverse()\n",
    "# Maps each lambda to the history DataFrame returned by its run.\n",
    "experiment_results = {}\n",
    "\n",
    "for lam in lambda_values:\n",
    "    print(f\"Running experiment with lambda = {lam}\")\n",
    "    # `model` is rebound on every pass; only the last run's model remains afterwards.\n",
    "    history_df, model = run_experiment(lam)\n",
    "    experiment_results[lam] = history_df\n",
    "\n",
    "# Overlay the recorded training loss of every run on a single figure.\n",
    "plt.figure(figsize=(10, 5))\n",
    "for lam, df in experiment_results.items():\n",
    "    plt.plot(df[\"iter\"], df[\"train_loss\"], label=f'λ = {lam}')\n",
    "plt.xlabel('Iteration')\n",
    "plt.ylabel('Training Loss')\n",
    "plt.title('Training Loss over Iterations for Different λ')\n",
    "plt.legend()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-04-28T12:41:54.817039Z",
     "iopub.status.busy": "2025-04-28T12:41:54.816782Z",
     "iopub.status.idle": "2025-04-28T14:32:02.865270Z",
     "shell.execute_reply": "2025-04-28T14:32:02.864515Z",
     "shell.execute_reply.started": "2025-04-28T12:41:54.817023Z"
    }
   },
   "outputs": [],
   "source": [
    "# DEPTH 4\n",
    "\n",
    "# new code including val accuracy\n",
    "\n",
    "import tensorflow as tf\n",
    "from tensorflow import keras\n",
    "import numpy as np\n",
    "import math\n",
    "import os\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import random\n",
    "\n",
    "# Install TensorFlow Addons\n",
    "!pip install -q tensorflow-addons\n",
    "\n",
    "# Import AdamW optimizer\n",
    "from tensorflow_addons.optimizers import AdamW\n",
    "\n",
    "# Data loading and preprocessing\n",
    "with open('./data/t8.shakespeare.txt', 'r', encoding='utf-8') as f:\n",
    "    text = f.read()\n",
    "\n",
    "print(\"length of this dataset in chars: \", len(text))\n",
    "chars = sorted(list(set(text)))\n",
    "vocab_size = len(chars)\n",
    "print(''.join(chars))\n",
    "print(vocab_size)\n",
    "\n",
    "stoi = {ch: i for i, ch in enumerate(chars)}\n",
    "itos = {i: ch for i, ch in enumerate(chars)}\n",
    "encode = lambda s: [stoi[c] for c in s]\n",
    "decode = lambda l: ''.join([itos[i] for i in l])\n",
    "data = tf.convert_to_tensor(encode(text), dtype=tf.int32)\n",
    "\n",
    "\n",
    "# Train/validation split\n",
    "split_n = int(0.9 * len(data))\n",
    "train_data = data[:split_n]\n",
    "val_data = data[split_n:]\n",
    "BLOCK_SIZE = 256\n",
    "BATCH_SIZE = 32\n",
    "SEED = 123\n",
    "\n",
    "def get_batch(split):\n",
    "    \"\"\"Draw a random batch of (input, target) windows from one data split.\n",
    "\n",
    "    Any `split` other than \"train\" selects the validation data. Both returned\n",
    "    tensors stack BATCH_SIZE windows of BLOCK_SIZE tokens; each target window\n",
    "    is its input window shifted one position to the right.\n",
    "    \"\"\"\n",
    "    if split == \"train\":\n",
    "        source = train_data\n",
    "    else:\n",
    "        source = val_data\n",
    "    starts = tf.random.uniform(\n",
    "        (BATCH_SIZE,), minval=0, maxval=len(source) - BLOCK_SIZE, seed=SEED, dtype=tf.int32\n",
    "    )\n",
    "    inputs, targets = [], []\n",
    "    for start in starts:\n",
    "        inputs.append(source[start:start + BLOCK_SIZE])\n",
    "        targets.append(source[start + 1:start + BLOCK_SIZE + 1])\n",
    "    return tf.stack(inputs), tf.stack(targets)\n",
    "\n",
    "# Training and model hyperparameters\n",
    "max_iters = 3000\n",
    "eval_interval = 20\n",
    "eval_iters = 20\n",
    "n_embed = 384\n",
    "n_layers = 6\n",
    "n_head = 12\n",
    "dropout = 0.2\n",
    "depth = 4   # number of factors (if depth == 1, no additional scaling)\n",
    "learning_rate = 1e-3\n",
    "initial_lr = learning_rate\n",
    "min_lr = 1e-4\n",
    "warmup_iters = 100\n",
    "lr_decay_iters = max_iters\n",
    "opt = 'Adam'\n",
    "wd_strength = 0\n",
    "if opt == 'AdamW':\n",
    "    la_other = 0  # fixed regularization for other parts\n",
    "    weight_decay = wd_strength\n",
    "else: \n",
    "    weight_decay = 0\n",
    "    la_other = 0.01 * wd_strength\n",
    "\n",
    "# Define working cosine schedule with linear warmup\n",
    "class CosineWarmupDecay(tf.keras.optimizers.schedules.LearningRateSchedule):\n",
    "    \"\"\"Learning-rate schedule: linear warmup followed by cosine decay to `min_lr`.\n",
    "\n",
    "    Args:\n",
    "        initial_learning_rate: peak rate, reached at step == warmup_iters.\n",
    "        min_lr: value the cosine decays to and the rate used after lr_decay_iters.\n",
    "        warmup_iters: number of linear warmup steps.\n",
    "        lr_decay_iters: step at which the cosine decay ends.\n",
    "    \"\"\"\n",
    "    def __init__(self, initial_learning_rate, min_lr, warmup_iters, lr_decay_iters):\n",
    "        super().__init__()\n",
    "        self.initial_learning_rate = initial_learning_rate\n",
    "        self.min_lr = min_lr\n",
    "        self.warmup_iters = tf.cast(warmup_iters, tf.float32)\n",
    "        self.lr_decay_iters = tf.cast(lr_decay_iters, tf.float32)\n",
    "\n",
    "    def __call__(self, step):\n",
    "        \"\"\"Return the learning rate for optimizer step `step`.\"\"\"\n",
    "        step = tf.cast(step, tf.float32)\n",
    "        # Linear warmup: lr increases linearly for warmup_iters steps.\n",
    "        warmup_lr = self.initial_learning_rate * (step + 1) / (self.warmup_iters + 1)\n",
    "        # Cosine decay between warmup_iters and lr_decay_iters.\n",
    "        decay_ratio = (step - self.warmup_iters) / (self.lr_decay_iters - self.warmup_iters)\n",
    "        cosine_decay = 0.5 * (1.0 + tf.cos(math.pi * decay_ratio))\n",
    "        decay_lr = self.min_lr + cosine_decay * (self.initial_learning_rate - self.min_lr)\n",
    "        # Both branches are computed; tf.where picks the one valid for this step.\n",
    "        lr = tf.where(step < self.warmup_iters, warmup_lr, decay_lr)\n",
    "        lr = tf.where(step > self.lr_decay_iters, self.min_lr, lr)\n",
    "        return lr\n",
    "\n",
    "    def get_config(self):\n",
    "        \"\"\"Return the constructor arguments as plain Python floats.\"\"\"\n",
    "        # `.numpy()` yields np.float32, which the json module cannot serialize,\n",
    "        # so every value is converted to a built-in float.\n",
    "        return {\n",
    "            \"initial_learning_rate\": float(self.initial_learning_rate),\n",
    "            \"min_lr\": float(self.min_lr),\n",
    "            \"warmup_iters\": float(self.warmup_iters.numpy()),\n",
    "            \"lr_decay_iters\": float(self.lr_decay_iters.numpy())\n",
    "        }\n",
    "\n",
    "# Model definition (NanoGPT with D-Gated AttentionHeads)\n",
    "class AttentionHead(keras.layers.Layer):\n",
    "    \"\"\"Single causal self-attention head with a scalar-gated value path.\n",
    "\n",
    "    The value kernel and, when depth > 1, a vector of `depth - 1` trainable\n",
    "    scalars are each L2-regularized with strength `la / depth`; the key and\n",
    "    query kernels use the module-level `la_other`. The value projection is\n",
    "    multiplied by the product of the scalars.\n",
    "\n",
    "    Args:\n",
    "        head_size: output width of the key/query/value projections.\n",
    "        la: regularization strength for the value path.\n",
    "        depth: number of factors on the value path (1 means no extra scalars).\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if depth < 1.\n",
    "    \"\"\"\n",
    "    def __init__(self, head_size, la, depth):\n",
    "        super().__init__()\n",
    "        if depth < 1:\n",
    "            # Fail with a clear message instead of a ZeroDivisionError below.\n",
    "            raise ValueError('depth must be >= 1')\n",
    "        reg_strength = la / depth\n",
    "        self.head_size = head_size\n",
    "        self.key = tf.keras.layers.Dense(head_size, use_bias=False, kernel_regularizer=tf.keras.regularizers.l2(la_other))\n",
    "        self.query = tf.keras.layers.Dense(head_size, use_bias=False, kernel_regularizer=tf.keras.regularizers.l2(la_other))\n",
    "        self.value = tf.keras.layers.Dense(head_size, use_bias=False, kernel_regularizer=tf.keras.regularizers.l2(reg_strength))\n",
    "        # Lower-triangular causal mask, built once and sliced per call.\n",
    "        self.tril = tf.constant(tf.linalg.band_part(tf.ones([BLOCK_SIZE, BLOCK_SIZE]), -1, 0))\n",
    "        self.dropout = keras.layers.Dropout(dropout)\n",
    "        if depth > 1:\n",
    "            self.scalars = self.add_weight(name=\"scalars\", shape=(depth - 1,), initializer=tf.ones_initializer(), trainable=True, regularizer=tf.keras.regularizers.l2(reg_strength))\n",
    "        else:\n",
    "            self.scalars = None\n",
    "    def call(self, x):\n",
    "        \"\"\"Apply causal attention to `x`, unpacked as (B, T, C); T must not exceed BLOCK_SIZE.\"\"\"\n",
    "        _, T, _ = x.shape\n",
    "        q = self.query(x)\n",
    "        k = self.key(x)\n",
    "        v = self.value(x)\n",
    "        if self.scalars is not None:\n",
    "            v = v * tf.reduce_prod(self.scalars)\n",
    "        # Scaled dot-product attention divides by sqrt(d_k), the per-head key\n",
    "        # width. The previous code used the input embedding width instead.\n",
    "        wei = tf.matmul(q, k, transpose_b=True) * self.head_size ** -0.5\n",
    "        mask = tf.cast(self.tril[:T, :T], dtype=tf.bool)\n",
    "        # -inf at masked (future) positions becomes zero weight after softmax.\n",
    "        wei = tf.where(mask, wei, tf.fill(tf.shape(wei), float('-inf')))\n",
    "        wei = tf.nn.softmax(wei, axis=-1)\n",
    "        wei = self.dropout(wei)\n",
    "        out = tf.matmul(wei, v)\n",
    "        return out\n",
    "\n",
    "class MultiHeadedAttention(keras.layers.Layer):\n",
    "    \"\"\"Runs `num_heads` AttentionHead layers on the same input, concatenates\n",
    "    their outputs along the last axis and projects the result to `n_embed`.\"\"\"\n",
    "    def __init__(self, head_size, num_heads, depth, la):\n",
    "        super().__init__()\n",
    "        heads = []\n",
    "        for _ in range(num_heads):\n",
    "            heads.append(AttentionHead(head_size, la, depth))\n",
    "        self.heads = heads\n",
    "        self.proj = keras.layers.Dense(n_embed, kernel_regularizer=tf.keras.regularizers.l2(la_other))\n",
    "        self.dropout = keras.layers.Dropout(dropout)\n",
    "    def call(self, x):\n",
    "        per_head = [head(x) for head in self.heads]\n",
    "        merged = tf.concat(per_head, axis=-1)\n",
    "        return self.dropout(self.proj(merged))\n",
    "\n",
    "class FeedForward(keras.layers.Layer):\n",
    "    \"\"\"Position-wise MLP: Dense(4 * n_embed, relu) -> Dense(n_embed) -> Dropout.\"\"\"\n",
    "    def __init__(self, n_embed):\n",
    "        super().__init__()\n",
    "        expand = keras.layers.Dense(n_embed * 4, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(la_other))\n",
    "        contract = keras.layers.Dense(n_embed, kernel_regularizer=tf.keras.regularizers.l2(la_other))\n",
    "        drop = keras.layers.Dropout(dropout)\n",
    "        self.net = keras.Sequential([expand, contract, drop])\n",
    "    def call(self, x):\n",
    "        transformed = self.net(x)\n",
    "        return transformed\n",
    "\n",
    "class Block(keras.layers.Layer):\n",
    "    \"\"\"Pre-norm transformer block: a residual attention sub-layer followed by\n",
    "    a residual feed-forward sub-layer, each fed through its own LayerNormalization.\"\"\"\n",
    "    def __init__(self, n_embed, n_head, depth, la):\n",
    "        super().__init__()\n",
    "        self.sa = MultiHeadedAttention(n_embed // n_head, n_head, depth, la)\n",
    "        self.ffwd = FeedForward(n_embed)\n",
    "        self.ln1 = keras.layers.LayerNormalization(axis=-1)\n",
    "        self.ln2 = keras.layers.LayerNormalization(axis=-1)\n",
    "    def call(self, x):\n",
    "        after_attention = x + self.sa(self.ln1(x))\n",
    "        after_ffwd = after_attention + self.ffwd(self.ln2(after_attention))\n",
    "        return after_ffwd\n",
    "\n",
    "class NanoGPT(keras.Model):\n",
    "    \"\"\"Character-level decoder-only transformer.\n",
    "\n",
    "    `call` returns (logits, loss); the loss is None when no targets are given,\n",
    "    otherwise the mean cross-entropy plus the sum of the model's\n",
    "    regularization losses.\n",
    "    \"\"\"\n",
    "    def __init__(self, la):\n",
    "        super().__init__()\n",
    "        self.token_embedding_table = keras.layers.Embedding(vocab_size, n_embed)\n",
    "        self.position_embedding_table = keras.layers.Embedding(BLOCK_SIZE, n_embed)\n",
    "        stack = [Block(n_embed, n_head, depth, la) for _ in range(n_layers)]\n",
    "        self.blocks = keras.Sequential(stack)\n",
    "        self.ln_f = keras.layers.LayerNormalization(axis=-1)\n",
    "        self.lm_head = keras.layers.Dense(vocab_size, kernel_regularizer=tf.keras.regularizers.l2(la_other))\n",
    "    def call(self, idx, targets=None):\n",
    "        _, T = idx.shape\n",
    "        positions = tf.range(T)\n",
    "        hidden = self.token_embedding_table(idx) + self.position_embedding_table(positions)\n",
    "        hidden = self.ln_f(self.blocks(hidden))\n",
    "        logits = self.lm_head(hidden)\n",
    "        if targets is None:\n",
    "            return logits, None\n",
    "        B, T, C = logits.shape\n",
    "        n_tokens = B * T\n",
    "        per_token = tf.keras.losses.sparse_categorical_crossentropy(\n",
    "            tf.reshape(targets, (n_tokens,)), tf.reshape(logits, (n_tokens, C)), from_logits=True\n",
    "        )\n",
    "        base_loss = tf.reduce_mean(per_token)\n",
    "        # Regularization terms contributed by the layers; may be an empty list.\n",
    "        penalties = self.losses\n",
    "        reg_loss = tf.add_n(penalties) if penalties else 0.0\n",
    "        return logits, base_loss + reg_loss\n",
    "    def generate(self, idx, max_new_tokens):\n",
    "        \"\"\"Append `max_new_tokens` sampled tokens to `idx` and return the result.\"\"\"\n",
    "        for _ in range(max_new_tokens):\n",
    "            # Only the last BLOCK_SIZE tokens are fed back as context.\n",
    "            context = idx[:, -BLOCK_SIZE:]\n",
    "            last_logits = self(context)[0][:, -1, :]\n",
    "            sampled = tf.random.categorical(last_logits, num_samples=1, seed=SEED, dtype=tf.int64)\n",
    "            idx = tf.concat([idx, sampled], axis=1)\n",
    "        return idx\n",
    "\n",
    "def estimate_loss(model):\n",
    "    \"\"\"Average loss and next-token accuracy over `eval_iters` random batches per split.\n",
    "\n",
    "    Returns:\n",
    "        losses: dict split -> mean of the loss returned by the model\n",
    "            (cross-entropy plus regularization terms).\n",
    "        perplexity_val: exp of the mean validation cross-entropy alone.\n",
    "        accuracies: dict split -> mean fraction of correctly predicted tokens.\n",
    "    \"\"\"\n",
    "    losses = {}\n",
    "    base_losses = {}\n",
    "    accuracies = {}\n",
    "    # NOTE(review): the `trainable` toggle is kept from the original code. In\n",
    "    # Keras it is the `training` call flag, not `trainable`, that puts layers\n",
    "    # such as Dropout into inference mode, so the flag is now passed explicitly.\n",
    "    model.trainable = False\n",
    "    try:\n",
    "        for split in ['train', 'val']:\n",
    "            total_loss = []\n",
    "            total_base_loss = []\n",
    "            total_acc = []\n",
    "            for _ in range(eval_iters):\n",
    "                X, Y = get_batch(split)\n",
    "                logits, loss = model(X, Y, training=False)\n",
    "                B, T, C = logits.shape\n",
    "                logits_flat = tf.reshape(logits, (B * T, C))\n",
    "                targets_flat = tf.reshape(Y, (B * T,))\n",
    "                # Cross-entropy without the regularization terms, used for perplexity.\n",
    "                base_loss = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(targets_flat, logits_flat, from_logits=True))\n",
    "                # Compute accuracy for the batch\n",
    "                preds = tf.argmax(logits, axis=-1, output_type=tf.int32)\n",
    "                acc = tf.reduce_mean(tf.cast(tf.equal(preds, Y), tf.float32))\n",
    "                total_loss.append(loss.numpy())\n",
    "                total_base_loss.append(base_loss.numpy())\n",
    "                total_acc.append(acc.numpy())\n",
    "            losses[split] = np.mean(total_loss)\n",
    "            base_losses[split] = np.mean(total_base_loss)\n",
    "            accuracies[split] = np.mean(total_acc)\n",
    "    finally:\n",
    "        # Restore even when a batch raises, so the model is never left frozen.\n",
    "        model.trainable = True\n",
    "    perplexity_val = np.exp(base_losses['val'])\n",
    "    return losses, perplexity_val, accuracies\n",
    "\n",
    "# Define device\n",
    "gpus = tf.config.list_physical_devices('GPU')\n",
    "device = '/GPU:0' if gpus else '/CPU:0'\n",
    "\n",
    "def run_experiment(lambda_value):\n",
    "    \"\"\"Train one NanoGPT with value-path regularization strength `lambda_value`.\n",
    "\n",
    "    Every `eval_interval` steps the losses, validation perplexity/accuracy and\n",
    "    the norm of each head's effective value kernel are recorded. The history is\n",
    "    written to results_iters.csv in a run-specific directory under\n",
    "    ./results/nanogpt/grid/, and a 500-token sample is printed at the end.\n",
    "\n",
    "    Returns:\n",
    "        (history_df, model): the recorded history and the trained model.\n",
    "    \"\"\"\n",
    "    # Set random seeds for reproducibility.\n",
    "    # SEED is a local name here; the module-level SEED used elsewhere is unchanged.\n",
    "    SEED = 42\n",
    "    np.random.seed(SEED)\n",
    "    random.seed(SEED)\n",
    "    tf.random.set_seed(SEED)\n",
    "    \n",
    "    la = lambda_value\n",
    "    lr_schedule = CosineWarmupDecay(initial_lr, min_lr, warmup_iters, lr_decay_iters)\n",
    "    if opt == 'AdamW':\n",
    "        optimizer = AdamW(learning_rate=lr_schedule, weight_decay=weight_decay, beta_1=0.9, beta_2=0.99, epsilon=1e-07)\n",
    "    else:\n",
    "        optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)\n",
    "    \n",
    "    with tf.device(device):\n",
    "        model = NanoGPT(la)\n",
    "        model.build(input_shape=(BATCH_SIZE, BLOCK_SIZE))\n",
    "        model.summary()\n",
    "    history_records = []\n",
    "    # Named `step` so the builtin `iter` is not shadowed.\n",
    "    for step in range(max_iters):\n",
    "        if step % eval_interval == 0:\n",
    "            losses, perplexity_val, accuracies = estimate_loss(model)\n",
    "            head_norms = {}\n",
    "            for block_idx, block in enumerate(model.blocks.layers):\n",
    "                for head_idx, head in enumerate(block.sa.heads):\n",
    "                    # Effective value matrix = kernel times the product of the gating scalars.\n",
    "                    scale = tf.reduce_prod(head.scalars) if head.scalars is not None else 1.0\n",
    "                    effective_v = head.value.kernel * scale\n",
    "                    norm = tf.norm(effective_v, ord=2).numpy()\n",
    "                    head_norms[f\"norm_B{block_idx}_H{head_idx}\"] = norm\n",
    "            norms_values = list(head_norms.values())\n",
    "            min_norm = min(norms_values)\n",
    "            max_norm = max(norms_values)\n",
    "            # Heads whose effective norm is below eps count as switched off.\n",
    "            eps = 1e-5\n",
    "            head_sparsity = sum(1 for norm in norms_values if norm < eps) / len(norms_values)\n",
    "            \n",
    "            record = {\n",
    "                \"iter\": step,\n",
    "                \"train_loss\": losses['train'],\n",
    "                \"val_loss\": losses['val'],\n",
    "                \"val_perplexity\": perplexity_val,\n",
    "                \"val_accuracy\": accuracies['val'],\n",
    "                \"depth\": depth,\n",
    "                \"lambda\": la,\n",
    "                \"min_norm\": min_norm,\n",
    "                \"max_norm\": max_norm,\n",
    "                \"head_sparsity\": head_sparsity\n",
    "            }\n",
    "            record.update(head_norms)\n",
    "            history_records.append(record)\n",
    "            \n",
    "            # `.4f` (four decimals); the previous spec `4f` only set a minimum width.\n",
    "            print(f\"Step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}, \"\n",
    "                  f\"val ppl {perplexity_val:.4f}, val acc {accuracies['val']:.4f}, \"\n",
    "                  f\"min norm {min_norm:.2e}, max norm {max_norm:.2e}, head sparsity {head_sparsity:.4f}\")\n",
    "                  \n",
    "        xb, yb = get_batch('train')\n",
    "        with tf.GradientTape() as tape:\n",
    "            # training=True activates the Dropout layers; in a custom loop Keras\n",
    "            # otherwise runs them in inference mode and `dropout` has no effect.\n",
    "            _, loss = model(xb, yb, training=True)\n",
    "        gradients = tape.gradient(loss, model.trainable_variables)\n",
    "        optimizer.apply_gradients(zip(gradients, model.trainable_variables))\n",
    "    \n",
    "    history_df = pd.DataFrame(history_records)\n",
    "    RUN_NAME = f\"dep{depth}-la{la:.2e}-nlyr{n_layers}-nhead{n_head}-nemb{n_embed}-bs{BATCH_SIZE}-blk{BLOCK_SIZE}-iter{max_iters}-lr{learning_rate:.1e}-opt{opt}-wd{weight_decay:.2e}_laother{la_other:.3e}\"\n",
    "    results_base_path = \"./results/nanogpt/grid/\"\n",
    "    RUN_PATH = os.path.join(results_base_path, RUN_NAME)\n",
    "    # exist_ok avoids the check-then-create race of a separate os.path.exists test.\n",
    "    os.makedirs(RUN_PATH, exist_ok=True)\n",
    "    csv_file_path = os.path.join(RUN_PATH, \"results_iters.csv\")\n",
    "    history_df.to_csv(csv_file_path, index=False)\n",
    "    print(f\"Results saved to {csv_file_path}\")\n",
    "    \n",
    "    print(decode(model.generate(idx=tf.zeros((1,1), dtype=tf.dtypes.int64), max_new_tokens=500)[0].numpy().tolist()))\n",
    "    \n",
    "    return history_df, model\n",
    "\n",
    "# Regularization strengths (lambda) to sweep, listed in ascending order.\n",
    "lambda_values = [0,1e-4,5e-4,1e-3,2e-3,5e-3,8e-3,1e-2,1.1e-2,1.2e-2,1.25e-2,1.5e-2,1.75e-2,2e-2,3e-2,4e-2,5e-2,6e-2,7.5e-2,0.1,0.15,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0,1.1,1.2,1.3,1.4,1.5,1.6,1.8,2.0,3.0,4.0,5.0,10.0,100.0]\n",
    "\n",
    "\n",
    "# Reverse in place so the sweep starts from the largest lambda.\n",
    "lambda_values.reverse()\n",
    "# Maps each lambda to the history DataFrame returned by its run.\n",
    "experiment_results = {}\n",
    "\n",
    "for lam in lambda_values:\n",
    "    print(f\"Running experiment with lambda = {lam}\")\n",
    "    # `model` is rebound on every pass; only the last run's model remains afterwards.\n",
    "    history_df, model = run_experiment(lam)\n",
    "    experiment_results[lam] = history_df\n",
    "\n",
    "# Overlay the recorded training loss of every run on a single figure.\n",
    "plt.figure(figsize=(10, 5))\n",
    "for lam, df in experiment_results.items():\n",
    "    plt.plot(df[\"iter\"], df[\"train_loss\"], label=f'λ = {lam}')\n",
    "plt.xlabel('Iteration')\n",
    "plt.ylabel('Training Loss')\n",
    "plt.title('Training Loss over Iterations for Different λ')\n",
    "plt.legend()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-04-28T16:35:37.978680Z",
     "iopub.status.busy": "2025-04-28T16:35:37.977996Z",
     "iopub.status.idle": "2025-04-28T17:34:30.771652Z",
     "shell.execute_reply": "2025-04-28T17:34:30.771134Z",
     "shell.execute_reply.started": "2025-04-28T16:35:37.978625Z"
    }
   },
   "outputs": [],
   "source": [
    "# Direct L21 regularization via custom Frobenius regularizer and additional post-hoc pruning\n",
    "\n",
    "import tensorflow as tf\n",
    "from tensorflow import keras\n",
    "import numpy as np\n",
    "import math\n",
    "import os\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import random\n",
    "\n",
    "# Install TensorFlow Addons\n",
    "!pip install -q tensorflow-addons\n",
    "from tensorflow_addons.optimizers import AdamW\n",
    "\n",
    "# Custom Group Frobenius Regularizer for V matrices\n",
    "class GroupFrobeniusRegularizer(tf.keras.regularizers.Regularizer):\n",
    "    \"\"\"Penalty `strength * sqrt(sum(x**2) + epsilon)` computed over a whole tensor.\n",
    "\n",
    "    `epsilon` is added under the square root before it is taken.\n",
    "    \"\"\"\n",
    "    def __init__(self, strength, epsilon=1e-12):\n",
    "        self.strength, self.epsilon = strength, epsilon\n",
    "    def __call__(self, x):\n",
    "        sum_of_squares = tf.reduce_sum(tf.square(x))\n",
    "        return self.strength * tf.sqrt(sum_of_squares + self.epsilon)\n",
    "    def get_config(self):\n",
    "        config = {}\n",
    "        config[\"strength\"] = float(self.strength)\n",
    "        config[\"epsilon\"] = float(self.epsilon)\n",
    "        return config\n",
    "\n",
    "# Data loading\n",
    "with open('./data/t8.shakespeare.txt', 'r', encoding='utf-8') as f:\n",
    "    text = f.read()\n",
    "print(\"length of this dataset in chars: \", len(text))\n",
    "chars = sorted(set(text))\n",
    "vocab_size = len(chars)\n",
    "print(''.join(chars), vocab_size)\n",
    "\n",
    "stoi = {ch:i for i,ch in enumerate(chars)}\n",
    "itos = {i:ch for ch,i in stoi.items()}\n",
    "encode = lambda s: [stoi[c] for c in s]\n",
    "decode = lambda l: ''.join(itos[i] for i in l)\n",
    "data = tf.convert_to_tensor(encode(text), tf.int32)\n",
    "\n",
    "# Split\n",
    "split = int(0.9*len(data))\n",
    "train_data = data[:split]\n",
    "val_data   = data[split:]\n",
    "BLOCK_SIZE = 256\n",
    "BATCH_SIZE = 32\n",
    "SEED       = 123\n",
    "\n",
    "def get_batch(split):\n",
    "    \"\"\"Draw a random batch of (input, target) windows from one data split.\n",
    "\n",
    "    Any `split` other than \"train\" selects the validation data. Each target\n",
    "    window is its input window shifted one position to the right.\n",
    "    \"\"\"\n",
    "    source = val_data\n",
    "    if split == \"train\":\n",
    "        source = train_data\n",
    "    starts = tf.random.uniform((BATCH_SIZE,), 0, len(source) - BLOCK_SIZE, seed=SEED, dtype=tf.int32)\n",
    "    inputs, targets = [], []\n",
    "    for start in starts:\n",
    "        inputs.append(source[start:start + BLOCK_SIZE])\n",
    "        targets.append(source[start + 1:start + BLOCK_SIZE + 1])\n",
    "    return tf.stack(inputs), tf.stack(targets)\n",
    "\n",
    "# Hyperparams\n",
    "max_iters      = 3000 #3000\n",
    "eval_interval  = 100\n",
    "eval_iters     = 20\n",
    "n_embed        = 384\n",
    "n_layers       = 6\n",
    "n_head         = 12\n",
    "dropout        = 0.2\n",
    "depth          = 1\n",
    "learning_rate  = 1e-3\n",
    "min_lr         = 1e-4\n",
    "warmup_iters   = 100\n",
    "lr_decay_iters = max_iters\n",
    "opt            = 'Adam'\n",
    "wd_strength    = 0\n",
    "if opt=='AdamW':\n",
    "    la_other     = 0\n",
    "    weight_decay = wd_strength\n",
    "else:\n",
    "    weight_decay = 0\n",
    "    la_other     = 0.01*wd_strength\n",
    "\n",
    "# LR schedule\n",
    "class CosineWarmupDecay(tf.keras.optimizers.schedules.LearningRateSchedule):\n",
    "    \"\"\"Linear warmup for `warmup` steps, then cosine decay from `init_lr` to\n",
    "    `min_lr` until step `decay`; `min_lr` is returned for later steps.\"\"\"\n",
    "    def __init__(self, init_lr, min_lr, warmup, decay):\n",
    "        super().__init__()\n",
    "        self.init_lr, self.min_lr = init_lr, min_lr\n",
    "        self.warmup = tf.cast(warmup, tf.float32)\n",
    "        self.decay = tf.cast(decay, tf.float32)\n",
    "    def __call__(self, step):\n",
    "        step_f = tf.cast(step, tf.float32)\n",
    "        warmup_lr = self.init_lr * (step_f + 1) / (self.warmup + 1)\n",
    "        progress = (step_f - self.warmup) / (self.decay - self.warmup)\n",
    "        cosine = 0.5 * (1 + tf.cos(math.pi * progress))\n",
    "        decayed_lr = self.min_lr + cosine * (self.init_lr - self.min_lr)\n",
    "        # Both branches are computed; tf.where picks the one valid for this step.\n",
    "        scheduled = tf.where(step_f < self.warmup, warmup_lr, decayed_lr)\n",
    "        return tf.where(step_f > self.decay, self.min_lr, scheduled)\n",
    "    def get_config(self):\n",
    "        warmup_steps = int(self.warmup.numpy())\n",
    "        decay_steps = int(self.decay.numpy())\n",
    "        return {\"init_lr\": self.init_lr, \"min_lr\": self.min_lr,\n",
    "                \"warmup\": warmup_steps, \"decay\": decay_steps}\n",
    "\n",
    "# Model\n",
    "class AttentionHead(keras.layers.Layer):\n",
    "    \"\"\"Single causal self-attention head.\n",
    "\n",
    "    The value kernel is penalized by GroupFrobeniusRegularizer with strength\n",
    "    `la`; the key and query kernels use L2 with the module-level `la_other`.\n",
    "\n",
    "    Args:\n",
    "        head_size: output width of the key/query/value projections.\n",
    "        la: regularization strength for the value kernel.\n",
    "        depth: accepted for signature compatibility; not used by this layer.\n",
    "    \"\"\"\n",
    "    def __init__(self, head_size, la, depth):\n",
    "        super().__init__()\n",
    "        self.head_size = head_size\n",
    "        self.key   = keras.layers.Dense(head_size,use_bias=False,\n",
    "                                        kernel_regularizer=keras.regularizers.l2(la_other))\n",
    "        self.query = keras.layers.Dense(head_size,use_bias=False,\n",
    "                                        kernel_regularizer=keras.regularizers.l2(la_other))\n",
    "        self.value = keras.layers.Dense(head_size,use_bias=False,\n",
    "                                        kernel_regularizer=GroupFrobeniusRegularizer(strength=la))\n",
    "        # Lower-triangular causal mask, built once and sliced per call.\n",
    "        self.tril  = tf.constant(tf.linalg.band_part(tf.ones([BLOCK_SIZE,BLOCK_SIZE]),-1,0))\n",
    "        self.dropout = keras.layers.Dropout(dropout)\n",
    "    def call(self, x):\n",
    "        \"\"\"Apply causal attention to `x`, unpacked as (B, T, C); T must not exceed BLOCK_SIZE.\"\"\"\n",
    "        _, T, _ = x.shape\n",
    "        q = self.query(x)\n",
    "        k = self.key(x)\n",
    "        v = self.value(x)\n",
    "        # Scaled dot-product attention divides by sqrt(d_k), the per-head key\n",
    "        # width. The previous code used the input embedding width instead.\n",
    "        wei = tf.matmul(q, k, transpose_b=True) * self.head_size ** -0.5\n",
    "        mask = tf.cast(self.tril[:T,:T], tf.bool)\n",
    "        # -inf at masked (future) positions becomes zero weight after softmax.\n",
    "        wei = tf.where(mask, wei, tf.fill(tf.shape(wei), float('-inf')))\n",
    "        wei = tf.nn.softmax(wei, axis=-1)\n",
    "        wei = self.dropout(wei)\n",
    "        return tf.matmul(wei, v)\n",
    "\n",
    "class MultiHeadedAttention(keras.layers.Layer):\n",
    "    \"\"\"Runs `num_heads` AttentionHead layers on the same input, concatenates\n",
    "    their outputs along the last axis and projects the result to `n_embed`.\"\"\"\n",
    "    def __init__(self, head_size, num_heads, depth, la):\n",
    "        super().__init__()\n",
    "        heads = []\n",
    "        for _ in range(num_heads):\n",
    "            heads.append(AttentionHead(head_size, la, depth))\n",
    "        self.heads = heads\n",
    "        self.proj = keras.layers.Dense(n_embed, kernel_regularizer=keras.regularizers.l2(la_other))\n",
    "        self.dropout = keras.layers.Dropout(dropout)\n",
    "    def call(self, x):\n",
    "        per_head = [h(x) for h in self.heads]\n",
    "        merged = tf.concat(per_head, axis=-1)\n",
    "        projected = self.proj(merged)\n",
    "        return self.dropout(projected)\n",
    "\n",
    "class FeedForward(keras.layers.Layer):\n",
    "    \"\"\"Position-wise MLP: Dense(4 * n_embed, relu) -> Dense(n_embed) -> Dropout.\"\"\"\n",
    "    def __init__(self, n_embed):\n",
    "        super().__init__()\n",
    "        expand = keras.layers.Dense(n_embed*4, activation='relu', kernel_regularizer=keras.regularizers.l2(la_other))\n",
    "        contract = keras.layers.Dense(n_embed, kernel_regularizer=keras.regularizers.l2(la_other))\n",
    "        drop = keras.layers.Dropout(dropout)\n",
    "        self.net = keras.Sequential([expand, contract, drop])\n",
    "    def call(self, x):\n",
    "        transformed = self.net(x)\n",
    "        return transformed\n",
    "\n",
    "class Block(keras.layers.Layer):\n",
    "    \"\"\"Pre-norm transformer block: a residual attention sub-layer followed by\n",
    "    a residual feed-forward sub-layer, each fed through its own LayerNormalization.\"\"\"\n",
    "    def __init__(self, n_embed, n_head, depth, la):\n",
    "        super().__init__()\n",
    "        self.sa = MultiHeadedAttention(n_embed // n_head, n_head, depth, la)\n",
    "        self.ff = FeedForward(n_embed)\n",
    "        self.ln1 = keras.layers.LayerNormalization()\n",
    "        self.ln2 = keras.layers.LayerNormalization()\n",
    "    def call(self, x):\n",
    "        after_attention = x + self.sa(self.ln1(x))\n",
    "        after_ff = after_attention + self.ff(self.ln2(after_attention))\n",
    "        return after_ff\n",
    "\n",
    "class NanoGPT(keras.Model):\n",
    "    \"\"\"Character-level decoder-only transformer.\n",
    "\n",
    "    `call` returns (logits, loss); the loss is None when no targets are given,\n",
    "    otherwise the mean cross-entropy plus the sum of the model's\n",
    "    regularization losses.\n",
    "    \"\"\"\n",
    "    def __init__(self, la):\n",
    "        super().__init__()\n",
    "        self.tok_emb = keras.layers.Embedding(vocab_size, n_embed)\n",
    "        self.pos_emb = keras.layers.Embedding(BLOCK_SIZE, n_embed)\n",
    "        stack = [Block(n_embed, n_head, depth, la) for _ in range(n_layers)]\n",
    "        self.blocks = keras.Sequential(stack)\n",
    "        self.ln_f = keras.layers.LayerNormalization()\n",
    "        self.lm_head = keras.layers.Dense(vocab_size, kernel_regularizer=keras.regularizers.l2(la_other))\n",
    "    def call(self, idx, targets=None):\n",
    "        B, T = idx.shape\n",
    "        positions = tf.range(T)\n",
    "        hidden = self.tok_emb(idx) + self.pos_emb(positions)\n",
    "        hidden = self.ln_f(self.blocks(hidden))\n",
    "        logits = self.lm_head(hidden)\n",
    "        if targets is None:\n",
    "            return logits, None\n",
    "        n_tokens = B * T\n",
    "        per_token = tf.keras.losses.sparse_categorical_crossentropy(\n",
    "            tf.reshape(targets, (n_tokens,)), tf.reshape(logits, (n_tokens, vocab_size)), from_logits=True\n",
    "        )\n",
    "        total_loss = tf.reduce_mean(per_token) + tf.add_n(self.losses)\n",
    "        return logits, total_loss\n",
    "\n",
    "def estimate_loss(model):\n",
    "    \"\"\"Average loss and next-token accuracy over `eval_iters` random batches per split.\n",
    "\n",
    "    Returns:\n",
    "        losses: dict split -> mean of the loss returned by the model\n",
    "            (cross-entropy plus regularization terms).\n",
    "        ppl: exp of the mean validation cross-entropy alone.\n",
    "        accs: dict split -> mean fraction of correctly predicted tokens.\n",
    "    \"\"\"\n",
    "    losses, base, accs = {}, {}, {}\n",
    "    # NOTE(review): the `trainable` toggle is kept from the original code. In\n",
    "    # Keras it is the `training` call flag, not `trainable`, that puts layers\n",
    "    # such as Dropout into inference mode, so the flag is now passed explicitly.\n",
    "    model.trainable = False\n",
    "    try:\n",
    "        for split in ['train','val']:\n",
    "            total_losses, ce_losses, batch_accs = [], [], []\n",
    "            for _ in range(eval_iters):\n",
    "                X, Y = get_batch(split)\n",
    "                logits, loss = model(X, Y, training=False)\n",
    "                # Cross-entropy without the regularization terms, used for perplexity.\n",
    "                ce = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(tf.reshape(Y,(-1,)), tf.reshape(logits,(-1,vocab_size)), from_logits=True))\n",
    "                preds = tf.argmax(logits, axis=-1, output_type=tf.int32)\n",
    "                acc = tf.reduce_mean(tf.cast(tf.equal(preds, Y), tf.float32))\n",
    "                total_losses.append(loss.numpy())\n",
    "                ce_losses.append(ce.numpy())\n",
    "                batch_accs.append(acc.numpy())\n",
    "            losses[split] = np.mean(total_losses)\n",
    "            base[split]   = np.mean(ce_losses)\n",
    "            accs[split]   = np.mean(batch_accs)\n",
    "    finally:\n",
    "        # Restore even when a batch raises, so the model is never left frozen.\n",
    "        model.trainable = True\n",
    "    ppl = np.exp(base['val'])\n",
    "    return losses, ppl, accs\n",
    "\n",
    "device = '/GPU:0' if tf.config.list_physical_devices('GPU') else '/CPU:0'\n",
    "\n",
    "def run_experiment(la):\n",
    "    \"\"\"Train one NanoGPT whose value kernels are regularized with strength `la`.\n",
    "\n",
    "    Every `eval_interval` steps the losses, validation perplexity/accuracy and\n",
    "    the norm of each head's value kernel are recorded. The history is written\n",
    "    to results_iters.csv in a run-specific directory under ./results/nanogpt/grid/.\n",
    "\n",
    "    Returns:\n",
    "        (model, run_path, df_iter): the trained model, the run's output\n",
    "        directory and the recorded history as a DataFrame.\n",
    "    \"\"\"\n",
    "    # SEED is a local name here; the module-level SEED used elsewhere is unchanged.\n",
    "    SEED=42; np.random.seed(SEED); random.seed(SEED); tf.random.set_seed(SEED)\n",
    "    lr_sched = CosineWarmupDecay(learning_rate, min_lr, warmup_iters, lr_decay_iters)\n",
    "    if opt=='AdamW':\n",
    "        # tensorflow_addons' AdamW takes weight_decay as its first positional\n",
    "        # parameter, so both values are passed by keyword to avoid swapping them.\n",
    "        opti = AdamW(learning_rate=lr_sched, weight_decay=weight_decay)\n",
    "    else:\n",
    "        opti = tf.keras.optimizers.Adam(lr_sched)\n",
    "    with tf.device(device):\n",
    "        model = NanoGPT(la); model.build((BATCH_SIZE,BLOCK_SIZE))\n",
    "        model.summary()\n",
    "    history=[]\n",
    "    for it in range(max_iters):\n",
    "        if it%eval_interval==0:\n",
    "            losses,ppl,accs = estimate_loss(model)\n",
    "            # Compute every head norm once and reuse it for the summary statistics.\n",
    "            head_norms = {}\n",
    "            for bi, b in enumerate(model.blocks.layers):\n",
    "                for hi, h in enumerate(b.sa.heads):\n",
    "                    head_norms[f\"norm_B{bi}_H{hi}\"] = tf.norm(h.value.kernel,2).numpy()\n",
    "            norms = list(head_norms.values())\n",
    "            mn, mx = min(norms), max(norms)\n",
    "            # Heads whose value-kernel norm is below 1e-5 count as switched off.\n",
    "            sp = sum(n<1e-5 for n in norms)/len(norms)\n",
    "            rec = {\"iter\":it,\"train_loss\":losses['train'],\"val_loss\":losses['val'],\n",
    "                   \"val_perplexity\":ppl,\"val_accuracy\":accs['val'],\n",
    "                   \"depth\":depth,\"lambda\":la,\"min_norm\":mn,\"max_norm\":mx,\"head_sparsity\":sp}\n",
    "            rec.update(head_norms)\n",
    "            history.append(rec)\n",
    "            print(f\"Step {it}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}, \"\n",
    "                  f\"val ppl {ppl:.4f}, val acc {accs['val']:.4f}, \"\n",
    "                  f\"min norm {mn:.2e}, max norm {mx:.2e}, head sparsity {sp:.4f}\")\n",
    "        xb,yb = get_batch('train')\n",
    "        with tf.GradientTape() as tape:\n",
    "            # training=True activates the Dropout layers; in a custom loop Keras\n",
    "            # otherwise runs them in inference mode and `dropout` has no effect.\n",
    "            _, loss = model(xb, yb, training=True)\n",
    "        grads = tape.gradient(loss,model.trainable_variables)\n",
    "        opti.apply_gradients(zip(grads,model.trainable_variables))\n",
    "    df_iter = pd.DataFrame(history)\n",
    "    run_name= f\"dep{depth}-la{la:.2e}-nlyr{n_layers}-nhead{n_head}-nemb{n_embed}-bs{BATCH_SIZE}-blk{BLOCK_SIZE}-iter{max_iters}-lr{learning_rate:.1e}-opt{opt}-wd{weight_decay:.2e}_laother{la_other:.3e}\"\n",
    "    run_path= os.path.join(\"./results/nanogpt/grid/\",run_name); os.makedirs(run_path,exist_ok=True)\n",
    "    iter_csv= os.path.join(run_path,\"results_iters.csv\"); df_iter.to_csv(iter_csv, index=False)\n",
    "    print(f\"Results saved to {iter_csv}\")\n",
    "    return model, run_path, df_iter\n",
    "\n",
    "# Lambda values to sweep; run_experiment() is called once per value.\n",
    "lambda_values = [0, 1e-4, 1e-3, 2e-3, 3e-3, 5e-3, 1e-2, 5e-2, 1e-1, 3e-1, 0.5, 0.7, 1.0, 2.0, 5.0, 10.0, 100.0]\n",
    "\n",
    "#lambda_values.reverse()\n",
    "# Fractions of heads whose value kernels are zeroed during the post-training pruning sweep.\n",
    "sparsities   = [0.02,0.05,0.1,0.2,0.4,0.6,0.7,0.8,0.9,0.95]\n",
    "# Maps each lambda to the per-iteration history DataFrame returned by run_experiment().\n",
    "experiment_results = {}\n",
    "\n",
    "for la in lambda_values:\n",
    "    print(f\"\\nRunning experiment with lambda = {la}\")\n",
    "    model, path, df_iter = run_experiment(la)\n",
    "    experiment_results[la] = df_iter\n",
    "\n",
    "    # Gather the value-projection kernel variable of every head in every block,\n",
    "    # together with a NumPy snapshot of its trained weights.\n",
    "    heads, orig = [], []\n",
    "    for b in model.blocks.layers:\n",
    "        for h in b.sa.heads:\n",
    "            heads.append(h.value.kernel)\n",
    "            orig.append(h.value.kernel.numpy().copy())\n",
    "    norms = [np.linalg.norm(o) for o in orig]\n",
    "    N     = len(heads)\n",
    "    order = np.argsort(norms) # smallest first\n",
    "    # Fraction of heads whose kernel norm is already below 1e-5 after training.\n",
    "    orig_sp = sum(n<1e-5 for n in norms)/N\n",
    "\n",
    "    prune_records = []\n",
    "    try:\n",
    "        for s in sparsities:\n",
    "            # Start every sparsity level from the trained weights, not from the previous level.\n",
    "            for var,o in zip(heads,orig): var.assign(o)\n",
    "            n_pr = int(N*s)\n",
    "            # Zero the n_pr kernels with the smallest norms.\n",
    "            for idx in order[:n_pr]: heads[idx].assign(tf.zeros_like(heads[idx]))\n",
    "            losses,ppl,accs = estimate_loss(model)\n",
    "            sp = n_pr/N\n",
    "            prune_records.append({\n",
    "                \"depth\": depth, \"lambda\": la,\n",
    "                \"orig_head_sparsity\": orig_sp,\n",
    "                \"head_sparsity\": sp,\n",
    "                \"val_perplexity\": ppl, \"val_accuracy\": accs['val']\n",
    "            })\n",
    "            print(f\"Pruning {s*100:.0f}%: orig sparsity {orig_sp:.4f}, \"\n",
    "                  f\"head sparsity {sp:.4f}, val ppl {ppl:.4f}, val acc {accs['val']:.4f}\")\n",
    "    finally:\n",
    "        # Put the trained weights back so `model` is not left with most heads zeroed\n",
    "        # once the sweep ends (or if evaluation fails part-way through).\n",
    "        for var,o in zip(heads,orig): var.assign(o)\n",
    "    df_prune   = pd.DataFrame(prune_records)\n",
    "    prune_csv  = os.path.join(path,\"results_pruning.csv\")\n",
    "    df_prune.to_csv(prune_csv,index=False)\n",
    "    print(f\"Saved pruning results to {prune_csv}\")\n",
    "\n",
    "# Overlay the training-loss curve of every lambda run on a single set of axes.\n",
    "fig, ax = plt.subplots(figsize=(10,5))\n",
    "for lam_value, iter_df in experiment_results.items():\n",
    "    ax.plot(iter_df[\"iter\"], iter_df[\"train_loss\"], label=f'λ = {lam_value}')\n",
    "ax.set_xlabel('Iteration')\n",
    "ax.set_ylabel('Training Loss')\n",
    "ax.set_title('Training Loss over Iterations for Different λ')\n",
    "ax.legend()\n",
    "plt.show()\n"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "gpuType": "V100",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
