{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "0",
   "metadata": {},
   "source": [
    "# Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "from collections import defaultdict\n",
    "import random, numpy as np\n",
    "\n",
    "import clip\n",
    "import pandas as pd\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.optim as optim\n",
    "import torchvision.models as models\n",
    "import torchvision.transforms as transforms\n",
    "from PIL import Image\n",
    "from scipy.stats import wilcoxon\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.linear_model import SGDClassifier\n",
    "from sklearn.metrics import f1_score, accuracy_score\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import MultiLabelBinarizer\n",
    "from torch.utils.data import DataLoader, TensorDataset\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check: print every annotation (category + bounding box) of one image.\n",
    "json_path = \"/media/SSD_2TB/datasets/coco/coco2014/annotations/instances_train2014.json\"\n",
    "target_file = \"COCO_train2014_000000122688.jpg\"\n",
    "\n",
    "with open(json_path, \"r\") as f:\n",
    "    data = json.load(f)\n",
    "\n",
    "# Lookup tables: category id -> category name, file name -> image id.\n",
    "cat_id_to_name = {cat[\"id\"]: cat[\"name\"] for cat in data[\"categories\"]}\n",
    "file_to_image_id = {img[\"file_name\"]: img[\"id\"] for img in data[\"images\"]}\n",
    "image_id = file_to_image_id.get(target_file)\n",
    "\n",
    "if image_id is None:\n",
    "    # Raise instead of print + exit(): exit() asks the Jupyter kernel to shut down and\n",
    "    # discards all notebook state, whereas an exception only stops this cell / Run All.\n",
    "    raise ValueError(f\"Image {target_file} not found.\")\n",
    "\n",
    "image_annotations = [ann for ann in data[\"annotations\"] if ann[\"image_id\"] == image_id]\n",
    "print(f\"Annotations for image: {target_file}\")\n",
    "if not image_annotations:\n",
    "    print(\"No annotations found.\")\n",
    "# Iterating an empty list is a no-op, so no else-branch is needed.\n",
    "for ann in image_annotations:\n",
    "    cat_name = cat_id_to_name.get(ann[\"category_id\"], \"Unknown\")\n",
    "    bbox = ann[\"bbox\"]\n",
    "    print(f\" - Label: {cat_name} (ID: {ann['category_id']})\")\n",
    "    print(f\"   BBox: x={bbox[0]}, y={bbox[1]}, w={bbox[2]}, h={bbox[3]}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse the target image's annotations into a list of unique tag names.\n",
    "# json_path / target_file already come from the previous cell, so its parsed `data` and\n",
    "# lookup tables are reused here instead of parsing the annotation file a second time.\n",
    "image_id = file_to_image_id.get(target_file)\n",
    "\n",
    "if image_id is None:\n",
    "    # exit() would shut the notebook kernel down; raise so that only this cell fails.\n",
    "    raise ValueError(f\"Image {target_file} not found.\")\n",
    "\n",
    "annotations = [ann for ann in data[\"annotations\"] if ann[\"image_id\"] == image_id]\n",
    "unique_cat_ids = {ann[\"category_id\"] for ann in annotations}\n",
    "# Tags are ordered by category id, not alphabetically.\n",
    "tag_labels = [cat_id_to_name[cid] for cid in sorted(unique_cat_ids)]\n",
    "print(f\"Image: {target_file}\")\n",
    "print(\"Tags:\", tag_labels)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4",
   "metadata": {},
   "source": [
    "# Make a CSV with Images and Labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5",
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_coco_split(json_path, output_csv_raw, output_csv_encoded):\n",
    "    \"\"\"Build multi-label CSVs (raw label lists + one-hot matrix) from a COCO instances file.\n",
    "\n",
    "    Args:\n",
    "        json_path: Path to a COCO instances annotation JSON file.\n",
    "        output_csv_raw: Destination CSV with columns ``filename`` and ``labels``\n",
    "            (the category names present in the image).\n",
    "        output_csv_encoded: Destination CSV with ``filename`` followed by one\n",
    "            0/1 column per category name.\n",
    "\n",
    "    Returns:\n",
    "        ``MultiLabelBinarizer.classes_`` -- the label names in the column order\n",
    "        of the encoded CSV.\n",
    "\n",
    "    Note:\n",
    "        Only images referenced by at least one annotation are written; images\n",
    "        without annotations appear in neither CSV.\n",
    "    \"\"\"\n",
    "    with open(json_path, \"r\") as f:\n",
    "        coco = json.load(f)\n",
    "    image_id_to_file = {img[\"id\"]: img[\"file_name\"] for img in coco[\"images\"]}\n",
    "    cat_id_to_name = {cat[\"id\"]: cat[\"name\"] for cat in coco[\"categories\"]}\n",
    "\n",
    "    # image id -> set of category names (the set de-duplicates repeated instances).\n",
    "    image_labels = defaultdict(set)\n",
    "    for ann in coco[\"annotations\"]:\n",
    "        image_labels[ann[\"image_id\"]].add(cat_id_to_name[ann[\"category_id\"]])\n",
    "\n",
    "    # sorted() makes the stored label lists reproducible: iterating a set of\n",
    "    # strings has no stable order across interpreter runs.\n",
    "    rows = [\n",
    "        {\"filename\": image_id_to_file[image_id], \"labels\": sorted(labels)}\n",
    "        for image_id, labels in image_labels.items()\n",
    "    ]\n",
    "\n",
    "    # Explicit columns keep the frame well-formed even when there are no rows.\n",
    "    df = pd.DataFrame(rows, columns=[\"filename\", \"labels\"])\n",
    "    df.to_csv(output_csv_raw, index=False)\n",
    "\n",
    "    mlb = MultiLabelBinarizer()\n",
    "    Y = mlb.fit_transform(df[\"labels\"])\n",
    "    label_df = pd.DataFrame(Y, columns=mlb.classes_)\n",
    "    label_df.insert(0, \"filename\", df[\"filename\"])\n",
    "    label_df.to_csv(output_csv_encoded, index=False)\n",
    "\n",
    "    print(f\"Saved: {output_csv_raw}, {output_csv_encoded}\")\n",
    "    return mlb.classes_\n",
    "\n",
    "# Build the raw + one-hot CSVs for the train and val annotation files.\n",
    "train_json = \"/media/SSD_2TB/datasets/coco/coco2014/annotations/instances_train2014.json\"\n",
    "val_json = \"/media/SSD_2TB/datasets/coco/coco2014/annotations/instances_val2014.json\"\n",
    "\n",
    "train_classes = process_coco_split(\n",
    "    json_path=train_json,\n",
    "    output_csv_raw=\"coco_multilabel_train.csv\",\n",
    "    output_csv_encoded=\"coco_multilabel_train_encoded.csv\",\n",
    ")\n",
    "val_classes = process_coco_split(\n",
    "    json_path=val_json,\n",
    "    output_csv_raw=\"coco_multilabel_val.csv\",\n",
    "    output_csv_encoded=\"coco_multilabel_val_encoded.csv\",\n",
    ")\n",
    "\n",
    "# Both splits must expose exactly the same label vocabulary.\n",
    "assert set(train_classes) == set(val_classes), \"Train/val label mismatch!\"\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6",
   "metadata": {},
   "source": [
    "## Generate Embeddings"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7",
   "metadata": {},
   "source": [
    "## With CLIP"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Folders that hold the COCO 2014 train / val images.\n",
    "image_root_train = \"/media/SSD_2TB/datasets/coco/coco2014/train2014\"\n",
    "image_root_val = \"/media/SSD_2TB/datasets/coco/coco2014/val2014\"\n",
    "\n",
    "# Load CLIP ViT-B/32 together with its preprocessing transform, on the GPU when available.\n",
    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "model, preprocess = clip.load(\"ViT-B/32\", device=device)\n",
    "\n",
    "def get_clip_embeddings(df, image_root, output_name):\n",
    "    \"\"\"Encode every image listed in ``df[\"filename\"]`` with CLIP and save the result.\n",
    "\n",
    "    Writes ``{output_name}_embeddings.npy`` (one row per successfully encoded image)\n",
    "    and ``{output_name}_filenames.npy`` (the matching file names, in the same order).\n",
    "    Relies on the notebook-level ``model``, ``preprocess`` and ``device``.\n",
    "\n",
    "    Args:\n",
    "        df: DataFrame with a ``filename`` column.\n",
    "        image_root: Directory that contains the image files.\n",
    "        output_name: Prefix of the two output ``.npy`` files.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If not a single image could be encoded.\n",
    "    \"\"\"\n",
    "    embeddings = []\n",
    "    filenames = []\n",
    "\n",
    "    for fname in tqdm(df[\"filename\"]):\n",
    "        path = os.path.join(image_root, fname)\n",
    "        try:\n",
    "            # The context manager closes the file handle right away instead of\n",
    "            # leaving one open per image until garbage collection.\n",
    "            with Image.open(path) as img:\n",
    "                image = preprocess(img.convert(\"RGB\")).unsqueeze(0).to(device)\n",
    "            with torch.no_grad():\n",
    "                emb = model.encode_image(image).cpu().numpy()\n",
    "            embeddings.append(emb.squeeze())\n",
    "            filenames.append(fname)\n",
    "        except Exception as e:\n",
    "            # Best effort: report the failure and skip the image.\n",
    "            print(f\"❌ Failed: {fname} — {e}\")\n",
    "\n",
    "    if not embeddings:\n",
    "        # np.vstack([]) would fail with an unhelpful message.\n",
    "        raise ValueError(f\"No images could be encoded from {image_root}\")\n",
    "\n",
    "    skipped = len(df) - len(filenames)\n",
    "    if skipped:\n",
    "        # Downstream cells pair embedding rows with label rows by position.\n",
    "        print(f\"⚠️ Skipped {skipped} image(s): embedding rows no longer line up with df rows; \"\n",
    "              f\"re-align using {output_name}_filenames.npy\")\n",
    "\n",
    "    embeddings = np.vstack(embeddings)\n",
    "    np.save(f\"{output_name}_embeddings.npy\", embeddings)\n",
    "    np.save(f\"{output_name}_filenames.npy\", filenames)\n",
    "    print(f\"✅ Saved: {output_name}_embeddings.npy\")\n",
    "\n",
    "# df_train / df_val are otherwise first created in the baseline section further down,\n",
    "# so on a fresh kernel they would not exist yet. Load the CSVs written by\n",
    "# process_coco_split here (only the `filename` column is used).\n",
    "df_train = pd.read_csv(\"coco_multilabel_train_encoded.csv\")\n",
    "df_val = pd.read_csv(\"coco_multilabel_val_encoded.csv\")\n",
    "\n",
    "get_clip_embeddings(df_train, image_root_train, \"clip_train\")\n",
    "get_clip_embeddings(df_val, image_root_val, \"clip_val\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9",
   "metadata": {},
   "source": [
    "## With EfficientNet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "10",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pretrained EfficientNet-B0 in eval mode, used as a feature extractor.\n",
    "# NOTE(review): newer torchvision releases prefer `weights=` over `pretrained=True` -- confirm against the installed version.\n",
    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "efficientnet = models.efficientnet_b0(pretrained=True).to(device)\n",
    "efficientnet.eval()\n",
    "# Keep every top-level child module except the last one as the embedding network.\n",
    "efficientnet_features = nn.Sequential(*list(efficientnet.children())[:-1])\n",
    "\n",
    "# Resize -> center crop -> tensor -> per-channel normalisation.\n",
    "preprocess_effnet = transforms.Compose([\n",
    "    transforms.Resize(256),\n",
    "    transforms.CenterCrop(224),\n",
    "    transforms.ToTensor(),\n",
    "    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n",
    "])\n",
    "def get_effnet_embeddings(df, image_root, output_name):\n",
    "    \"\"\"Encode every image listed in ``df[\"filename\"]`` with EfficientNet and save the result.\n",
    "\n",
    "    Writes ``{output_name}_embeddings.npy`` (one row per successfully encoded image)\n",
    "    and ``{output_name}_filenames.npy`` (the matching file names, in the same order).\n",
    "    Relies on the notebook-level ``efficientnet_features``, ``preprocess_effnet``\n",
    "    and ``device``.\n",
    "\n",
    "    Args:\n",
    "        df: DataFrame with a ``filename`` column.\n",
    "        image_root: Directory that contains the image files.\n",
    "        output_name: Prefix of the two output ``.npy`` files.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If not a single image could be encoded.\n",
    "    \"\"\"\n",
    "    embeddings = []\n",
    "    filenames = []\n",
    "\n",
    "    for fname in tqdm(df[\"filename\"]):\n",
    "        path = os.path.join(image_root, fname)\n",
    "        try:\n",
    "            # The context manager closes the file handle right away instead of\n",
    "            # leaving one open per image until garbage collection.\n",
    "            with Image.open(path) as img:\n",
    "                image = preprocess_effnet(img.convert(\"RGB\")).unsqueeze(0).to(device)\n",
    "            with torch.no_grad():\n",
    "                features = efficientnet_features(image).squeeze().cpu().numpy()\n",
    "            embeddings.append(features)\n",
    "            filenames.append(fname)\n",
    "        except Exception as e:\n",
    "            # Best effort: report the failure and skip the image.\n",
    "            print(f\"❌ Failed: {fname} — {e}\")\n",
    "\n",
    "    if not embeddings:\n",
    "        # np.vstack([]) would fail with an unhelpful message.\n",
    "        raise ValueError(f\"No images could be encoded from {image_root}\")\n",
    "\n",
    "    skipped = len(df) - len(filenames)\n",
    "    if skipped:\n",
    "        # Downstream cells pair embedding rows with label rows by position.\n",
    "        print(f\"⚠️ Skipped {skipped} image(s): embedding rows no longer line up with df rows; \"\n",
    "              f\"re-align using {output_name}_filenames.npy\")\n",
    "\n",
    "    embeddings = np.vstack(embeddings)\n",
    "    np.save(f\"{output_name}_embeddings.npy\", embeddings)\n",
    "    np.save(f\"{output_name}_filenames.npy\", filenames)\n",
    "    print(f\"✅ Saved: {output_name}_embeddings.npy\")\n",
    "\n",
    "# df_train / df_val are otherwise first created in the baseline section further down,\n",
    "# so on a fresh kernel they would not exist yet. Load the CSVs written by\n",
    "# process_coco_split here (only the `filename` column is used).\n",
    "df_train = pd.read_csv(\"coco_multilabel_train_encoded.csv\")\n",
    "df_val = pd.read_csv(\"coco_multilabel_val_encoded.csv\")\n",
    "\n",
    "get_effnet_embeddings(df_train, image_root_train, \"effnet_train\")\n",
    "get_effnet_embeddings(df_val, image_root_val, \"effnet_val\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "11",
   "metadata": {},
   "source": [
    "# Baselines"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "12",
   "metadata": {},
   "source": [
    "## BR + LR"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "13",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Binary Relevance baseline: one independent logistic regression per label, on CLIP features.\n",
    "X_train = np.load(\"clip_train_embeddings.npy\")\n",
    "X_val = np.load(\"clip_val_embeddings.npy\")\n",
    "df_train = pd.read_csv(\"coco_multilabel_train_encoded.csv\")\n",
    "df_val = pd.read_csv(\"coco_multilabel_val_encoded.csv\")\n",
    "\n",
    "# Every column except the file name is a label indicator.\n",
    "label_columns = [name for name in df_train.columns if name != \"filename\"]\n",
    "Y_train = df_train[label_columns].values\n",
    "Y_val = df_val[label_columns].values\n",
    "\n",
    "print(\"🚀 Training Binary Relevance (LR + CLIP)...\")\n",
    "preds = []\n",
    "for i, label in enumerate(tqdm(label_columns)):\n",
    "    # Fit on label column i, then predict that label for every validation image.\n",
    "    clf = LogisticRegression(max_iter=1000).fit(X_train, Y_train[:, i])\n",
    "    preds.append(clf.predict(X_val))\n",
    "\n",
    "# The per-label prediction vectors become the columns of an (n_val, n_labels) matrix.\n",
    "Y_pred = np.column_stack(preds)\n",
    "macro_f1 = f1_score(Y_val, Y_pred, average=\"macro\")\n",
    "# Exact match: every label of an image must be predicted correctly.\n",
    "exact_match = np.all(Y_val == Y_pred, axis=1).mean()\n",
    "\n",
    "print(\"\\n📊 Evaluation (Binary Relevance + LR + CLIP):\")\n",
    "print(f\"Macro-F1 Score: {macro_f1:.4f}\")\n",
    "print(f\"Exact Match Accuracy: {exact_match:.4f}\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "14",
   "metadata": {},
   "source": [
    "## BR + LR (EfficientNet)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "15",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Binary Relevance baseline on EfficientNet features.\n",
    "# The EfficientNet embeddings get their own names: X_train / X_val hold the CLIP\n",
    "# embeddings at this point, and fitting on those would just repeat the CLIP baseline\n",
    "# under an EfficientNet label. Y_train / Y_val are the label matrices loaded above.\n",
    "X_train_effnet = np.load(\"effnet_train_embeddings.npy\")\n",
    "X_val_effnet = np.load(\"effnet_val_embeddings.npy\")\n",
    "\n",
    "print(\"🚀 Training Binary Relevance (LR + EfficientNet)...\")\n",
    "preds = []\n",
    "\n",
    "for i, label in enumerate(tqdm(label_columns)):\n",
    "    y_train_i = Y_train[:, i]\n",
    "    clf = LogisticRegression(max_iter=1000)\n",
    "    clf.fit(X_train_effnet, y_train_i)\n",
    "\n",
    "    y_pred_i = clf.predict(X_val_effnet)\n",
    "    preds.append(y_pred_i)\n",
    "\n",
    "# Stack per-label predictions into an (n_val, n_labels) matrix.\n",
    "Y_pred = np.array(preds).T\n",
    "macro_f1 = f1_score(Y_val, Y_pred, average=\"macro\")\n",
    "exact_match = (Y_val == Y_pred).all(axis=1).mean()\n",
    "\n",
    "print(\"\\n📊 Evaluation (Binary Relevance + LR + EfficientNet):\")\n",
    "print(f\"Macro-F1 Score: {macro_f1:.4f}\")\n",
    "print(f\"Exact Match Accuracy: {exact_match:.4f}\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "16",
   "metadata": {},
   "source": [
    "## CNN-RNN"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "17",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Wrap the feature / label matrices as float tensors for PyTorch.\n",
    "X_train_tensor, Y_train_tensor, X_val_tensor, Y_val_tensor = (\n",
    "    torch.FloatTensor(arr) for arr in (X_train, Y_train, X_val, Y_val)\n",
    ")\n",
    "\n",
    "# Shuffled mini-batches of 64 (features, labels) pairs.\n",
    "train_dataset = TensorDataset(X_train_tensor, Y_train_tensor)\n",
    "train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)\n",
    "\n",
    "class CLIP_RNN(nn.Module):\n",
    "    \"\"\"Single-step LSTM followed by a linear multi-label head with sigmoid outputs.\"\"\"\n",
    "\n",
    "    def __init__(self, input_dim, hidden_dim, num_labels):\n",
    "        super().__init__()\n",
    "        self.rnn = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, batch_first=True)\n",
    "        self.classifier = nn.Linear(in_features=hidden_dim, out_features=num_labels)\n",
    "\n",
    "    def forward(self, x):\n",
    "        \"\"\"Map a (batch, input_dim) tensor to (batch, num_labels) probabilities.\"\"\"\n",
    "        # Present each embedding to the LSTM as a sequence of length one.\n",
    "        sequence = x.unsqueeze(1)\n",
    "        hidden_states, _ = self.rnn(sequence)\n",
    "        # There is exactly one time step, so take it directly.\n",
    "        logits = self.classifier(hidden_states[:, 0])\n",
    "        return torch.sigmoid(logits)\n",
    "\n",
    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
    "\n",
    "# Model size follows the data: embedding width in, one output per label.\n",
    "input_dim, hidden_dim, num_labels = X_train.shape[1], 256, len(label_columns)\n",
    "\n",
    "model = CLIP_RNN(input_dim, hidden_dim, num_labels).to(device)\n",
    "criterion = nn.BCELoss()  # the model already applies the sigmoid\n",
    "optimizer = optim.Adam(model.parameters(), lr=1e-3)\n",
    "\n",
    "print(\"🚀 Training CLIP-RNN (Multilabel)...\")\n",
    "for epoch in range(10):\n",
    "    model.train()\n",
    "    epoch_loss = 0.0\n",
    "    progress = tqdm(train_loader, desc=f\"Epoch {epoch+1}\")\n",
    "    for batch_x, batch_y in progress:\n",
    "        batch_x = batch_x.to(device)\n",
    "        batch_y = batch_y.to(device)\n",
    "        optimizer.zero_grad()\n",
    "        outputs = model(batch_x)\n",
    "        loss = criterion(outputs, batch_y)\n",
    "        loss.backward()\n",
    "        optimizer.step()\n",
    "        epoch_loss += loss.item()\n",
    "    # Report the mean batch loss of the epoch.\n",
    "    print(f\"Epoch {epoch+1} Loss: {epoch_loss / len(train_loader):.4f}\")\n",
    "\n",
    "# Score the whole validation set in one forward pass and threshold at 0.5.\n",
    "model.eval()\n",
    "with torch.no_grad():\n",
    "    val_outputs = model(X_val_tensor.to(device))\n",
    "val_probs = val_outputs.cpu().numpy()\n",
    "Y_pred = (val_probs > 0.5).astype(int)\n",
    "\n",
    "macro_f1 = f1_score(Y_val, Y_pred, average=\"macro\")\n",
    "exact_match = np.all(Y_val == Y_pred, axis=1).mean()\n",
    "\n",
    "print(\"\\n📊 Evaluation (CLIP-RNN):\")\n",
    "print(f\"Macro-F1 Score: {macro_f1:.4f}\")\n",
    "print(f\"Exact Match Accuracy: {exact_match:.4f}\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "18",
   "metadata": {},
   "source": [
    "### Function for Statistical Significance "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "19",
   "metadata": {},
   "outputs": [],
   "source": [
    "def run_experiment(seed):\n",
    "    \"\"\"Train a fresh CLIP_RNN with the given seed and return its validation macro-F1.\n",
    "\n",
    "    Seeds Python, NumPy and PyTorch (CPU and, when available, CUDA), reloads the saved\n",
    "    CLIP embeddings and encoded labels from disk, trains for 10 epochs with Adam + BCE\n",
    "    and thresholds the validation probabilities at 0.5. Uses the notebook-level\n",
    "    ``device`` and ``CLIP_RNN``.\n",
    "    \"\"\"\n",
    "    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):\n",
    "        seed_fn(seed)\n",
    "    if torch.cuda.is_available():\n",
    "        torch.cuda.manual_seed_all(seed)\n",
    "\n",
    "    train_features = np.load(\"clip_train_embeddings.npy\")\n",
    "    val_features = np.load(\"clip_val_embeddings.npy\")\n",
    "    train_table = pd.read_csv(\"coco_multilabel_train_encoded.csv\")\n",
    "    val_table = pd.read_csv(\"coco_multilabel_val_encoded.csv\")\n",
    "\n",
    "    # Every column except the file name is a label indicator.\n",
    "    label_names = [col for col in train_table.columns if col != \"filename\"]\n",
    "    train_targets = train_table[label_names].values.astype(np.float32)\n",
    "    val_targets = val_table[label_names].values.astype(np.float32)\n",
    "\n",
    "    loader = DataLoader(\n",
    "        TensorDataset(torch.FloatTensor(train_features), torch.FloatTensor(train_targets)),\n",
    "        batch_size=64,\n",
    "        shuffle=True,\n",
    "    )\n",
    "    net = CLIP_RNN(\n",
    "        input_dim=train_features.shape[1],\n",
    "        hidden_dim=256,\n",
    "        num_labels=len(label_names),\n",
    "    ).to(device)\n",
    "    optimizer = optim.Adam(net.parameters(), lr=1e-3)\n",
    "    loss_fn = nn.BCELoss()\n",
    "\n",
    "    net.train()\n",
    "    for _ in range(10):\n",
    "        for features, targets in loader:\n",
    "            features, targets = features.to(device), targets.to(device)\n",
    "            optimizer.zero_grad()\n",
    "            batch_loss = loss_fn(net(features), targets)\n",
    "            batch_loss.backward()\n",
    "            optimizer.step()\n",
    "\n",
    "    net.eval()\n",
    "    with torch.no_grad():\n",
    "        val_probs = net(torch.FloatTensor(val_features).to(device)).cpu().numpy()\n",
    "    val_preds = (val_probs > 0.5).astype(int)\n",
    "    return f1_score(val_targets, val_preds, average=\"macro\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "20",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Repeat the CLIP-RNN experiment over several seeds and report mean ± sample std.\n",
    "seeds = [42, 100, 2021, 7, 999]\n",
    "f1_single = []\n",
    "f1_ensemble = []\n",
    "\n",
    "for seed in seeds:\n",
    "    f1 = run_experiment(seed)\n",
    "    f1_single += [f1]\n",
    "\n",
    "mean_f1 = np.asarray(f1_single).mean()\n",
    "std_f1 = np.asarray(f1_single).std(ddof=1)  # ddof=1 -> sample standard deviation\n",
    "print(f\"COCO Macro-F1 over {len(seeds)} seeds: {mean_f1:.3f} ± {std_f1:.3f}\")\n",
    "# TODO: once f1_ensemble is populated, compare the paired scores with\n",
    "# wilcoxon(f1_single, f1_ensemble) and report the p-value.\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "21",
   "metadata": {},
   "source": [
    "## Mondrian CP with Logistic Regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "22",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mondrian (label-conditional) conformal prediction on top of per-label logistic regression.\n",
    "alpha = 0.1\n",
    "\n",
    "# X / Y were only defined by a later cell, so this cell failed on a fresh kernel.\n",
    "# NOTE(review): CLIP embeddings are assumed because the X_val in scope at this point holds\n",
    "# the CLIP validation embeddings -- confirm this is the intended feature set.\n",
    "X = np.load(\"clip_train_embeddings.npy\")\n",
    "Y = df_train[label_columns].values\n",
    "\n",
    "# Hold out 30% of the training data for calibration.\n",
    "X_train, X_calib, Y_train, Y_calib = train_test_split(X, Y, test_size=0.3, random_state=42)\n",
    "print(\"🚀 Training Mondrian CP models...\")\n",
    "n_labels = Y.shape[1]\n",
    "thresholds = np.zeros(n_labels)\n",
    "Y_pred_sets = []\n",
    "\n",
    "for i in tqdm(range(n_labels)):\n",
    "    y_train_i = Y_train[:, i]\n",
    "    y_calib_i = Y_calib[:, i]\n",
    "\n",
    "    clf = LogisticRegression(max_iter=1000)\n",
    "    clf.fit(X_train, y_train_i)\n",
    "    calib_probs = clf.predict_proba(X_calib)[:, 1]\n",
    "    calib_nonconformity = 1 - calib_probs\n",
    "    positive_scores = calib_nonconformity[y_calib_i == 1]\n",
    "    # Without calibration positives the quantile is undefined; fall back to 1.0\n",
    "    # (label always included), as the CLIP-RNN Mondrian CP cell already does.\n",
    "    thresholds[i] = np.quantile(positive_scores, 1 - alpha) if positive_scores.size else 1.0\n",
    "    val_probs = clf.predict_proba(X_val)[:, 1]\n",
    "    val_nonconformity = 1 - val_probs\n",
    "    val_included = val_nonconformity <= thresholds[i]\n",
    "    Y_pred_sets.append(val_included.astype(int))\n",
    "\n",
    "# Per-label inclusion vectors -> (n_val, n_labels) matrix.\n",
    "Y_pred_sets = np.array(Y_pred_sets).T\n",
    "\n",
    "true_mask = Y_val == 1\n",
    "pred_mask = Y_pred_sets == 1\n",
    "hit_mask = true_mask & pred_mask\n",
    "\n",
    "# Per image: fraction of its true labels contained in the prediction set\n",
    "# (denominator clamped to 1 for images without any true label), and set size.\n",
    "coverages = hit_mask.sum(axis=1) / np.maximum(true_mask.sum(axis=1), 1)\n",
    "set_sizes = pred_mask.sum(axis=1)\n",
    "\n",
    "mean_coverage = np.mean(coverages)\n",
    "avg_set_size = np.mean(set_sizes)\n",
    "\n",
    "print(\"\\n📊 Evaluation (Mondrian CP + LR):\")\n",
    "print(f\"Target Coverage (1 - α): {1 - alpha:.2f}\")\n",
    "print(f\"Empirical Coverage:      {mean_coverage:.4f}\")\n",
    "print(f\"Average Set Size:        {avg_set_size:.2f}\")\n",
    "\n",
    "# Per label: fraction of its positive validation images that were covered;\n",
    "# NaN for labels that never occur in the validation set.\n",
    "positives_per_label = true_mask.sum(axis=0)\n",
    "marginal_coverages = np.where(\n",
    "    positives_per_label > 0,\n",
    "    hit_mask.sum(axis=0) / np.maximum(positives_per_label, 1),\n",
    "    np.nan,\n",
    ")\n",
    "marginal_coverage_mean = np.nanmean(marginal_coverages)\n",
    "print(f\"Marginal Coverage:       {marginal_coverage_mean:.4f}\")\n",
    "\n",
    "# Treat the prediction sets as hard multi-label predictions for the baseline metrics.\n",
    "Y_true = Y_val\n",
    "Y_pred = Y_pred_sets\n",
    "macro_f1 = f1_score(Y_true, Y_pred, average=\"macro\")\n",
    "exact_match = (Y_true == Y_pred).all(axis=1).mean()\n",
    "\n",
    "print(f\"Macro-F1 Score:           {macro_f1:.4f}\")\n",
    "print(f\"Exact Match Accuracy:     {exact_match:.4f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "23",
   "metadata": {},
   "source": [
    "### with CLIP embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "24",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mondrian (label-conditional) conformal prediction with per-label LR on CLIP embeddings.\n",
    "alpha = 0.1\n",
    "X = np.load(\"clip_train_embeddings.npy\")\n",
    "X_val = np.load(\"clip_val_embeddings.npy\")\n",
    "\n",
    "df_train = pd.read_csv(\"coco_multilabel_train_encoded.csv\")\n",
    "df_val = pd.read_csv(\"coco_multilabel_val_encoded.csv\")\n",
    "\n",
    "label_columns = [col for col in df_train.columns if col != \"filename\"]\n",
    "Y = df_train[label_columns].values\n",
    "Y_val = df_val[label_columns].values\n",
    "\n",
    "# Hold out 30% of the training data for calibration.\n",
    "X_train, X_calib, Y_train, Y_calib = train_test_split(X, Y, test_size=0.3, random_state=42)\n",
    "print(\"🚀 Training Mondrian CP models...\")\n",
    "n_labels = Y.shape[1]\n",
    "thresholds = np.zeros(n_labels)\n",
    "Y_pred_sets = []\n",
    "\n",
    "for i in tqdm(range(n_labels)):\n",
    "    y_train_i = Y_train[:, i]\n",
    "    y_calib_i = Y_calib[:, i]\n",
    "\n",
    "    clf = LogisticRegression(max_iter=1000)\n",
    "    clf.fit(X_train, y_train_i)\n",
    "    calib_probs = clf.predict_proba(X_calib)[:, 1]\n",
    "    calib_nonconformity = 1 - calib_probs\n",
    "    positive_scores = calib_nonconformity[y_calib_i == 1]\n",
    "    # Without calibration positives the quantile is undefined; fall back to 1.0\n",
    "    # (label always included), as the CLIP-RNN Mondrian CP cell already does.\n",
    "    thresholds[i] = np.quantile(positive_scores, 1 - alpha) if positive_scores.size else 1.0\n",
    "    val_probs = clf.predict_proba(X_val)[:, 1]\n",
    "    val_nonconformity = 1 - val_probs\n",
    "    val_included = val_nonconformity <= thresholds[i]\n",
    "    Y_pred_sets.append(val_included.astype(int))\n",
    "\n",
    "# Per-label inclusion vectors -> (n_val, n_labels) matrix.\n",
    "Y_pred_sets = np.array(Y_pred_sets).T\n",
    "\n",
    "true_mask = Y_val == 1\n",
    "pred_mask = Y_pred_sets == 1\n",
    "\n",
    "# Per image: fraction of its true labels contained in the prediction set\n",
    "# (denominator clamped to 1 for images without any true label), and set size.\n",
    "coverages = (true_mask & pred_mask).sum(axis=1) / np.maximum(true_mask.sum(axis=1), 1)\n",
    "set_sizes = pred_mask.sum(axis=1)\n",
    "\n",
    "mean_coverage = np.mean(coverages)\n",
    "avg_set_size = np.mean(set_sizes)\n",
    "\n",
    "print(\"\\n📊 Evaluation (Mondrian CP + LR):\")\n",
    "print(f\"Target Coverage (1 - α): {1 - alpha:.2f}\")\n",
    "print(f\"Empirical Coverage:      {mean_coverage:.4f}\")\n",
    "print(f\"Average Set Size:        {avg_set_size:.2f}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "25",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.model_selection import train_test_split\n",
    "from tqdm import tqdm\n",
    "\n",
    "# === CONFIG ===\n",
    "alpha = 0.1  # Target miscoverage rate → 90% coverage\n",
    "\n",
    "# === Load Data ===\n",
    "X = np.load(\"clip_train_embeddings.npy\")\n",
    "X_val = np.load(\"clip_val_embeddings.npy\")\n",
    "\n",
    "df_train = pd.read_csv(\"coco_multilabel_train_encoded.csv\")\n",
    "df_val = pd.read_csv(\"coco_multilabel_val_encoded.csv\")\n",
    "\n",
    "label_columns = [col for col in df_train.columns if col != \"filename\"]\n",
    "Y = df_train[label_columns].values\n",
    "Y_val = df_val[label_columns].values\n",
    "\n",
    "# === Split training data into proper training and calibration ===\n",
    "X_train, X_calib, Y_train, Y_calib = train_test_split(X, Y, test_size=0.3, random_state=42)\n",
    "\n",
    "# === Mondrian CP ===\n",
    "print(\"🚀 Training Mondrian CP models...\")\n",
    "n_labels = Y.shape[1]\n",
    "thresholds = np.zeros(n_labels)\n",
    "Y_pred_sets = []\n",
    "\n",
    "for i in tqdm(range(n_labels)):\n",
    "    y_train_i = Y_train[:, i]\n",
    "    y_calib_i = Y_calib[:, i]\n",
    "\n",
    "    clf = LogisticRegression(max_iter=1000)\n",
    "    clf.fit(X_train, y_train_i)\n",
    "\n",
    "    # Calibrate using probabilities\n",
    "    calib_probs = clf.predict_proba(X_calib)[:, 1]\n",
    "\n",
    "    # Mondrian CP: separate calibration for class 1\n",
    "    calib_nonconformity = 1 - calib_probs  # Lower prob → higher nonconformity\n",
    "    positive_scores = calib_nonconformity[y_calib_i == 1]\n",
    "    if positive_scores.size > 0:\n",
    "        thresholds[i] = np.quantile(positive_scores, 1 - alpha)\n",
    "    else:\n",
    "        # No calibration positives: the quantile is undefined, so always include the\n",
    "        # label (same fallback as the CLIP-RNN Mondrian CP cell).\n",
    "        thresholds[i] = 1.0\n",
    "\n",
    "    # Predict on validation\n",
    "    val_probs = clf.predict_proba(X_val)[:, 1]\n",
    "    val_nonconformity = 1 - val_probs\n",
    "    val_included = val_nonconformity <= thresholds[i]\n",
    "    Y_pred_sets.append(val_included.astype(int))\n",
    "\n",
    "# === Convert list to matrix ===\n",
    "Y_pred_sets = np.array(Y_pred_sets).T  # shape: (n_val, n_labels)\n",
    "\n",
    "# === Evaluate CP metrics ===\n",
    "coverages = []\n",
    "set_sizes = []\n",
    "\n",
    "for i in range(Y_val.shape[0]):\n",
    "    true_labels = set(np.where(Y_val[i] == 1)[0])\n",
    "    predicted_labels = set(np.where(Y_pred_sets[i] == 1)[0])\n",
    "\n",
    "    # Coverage: fraction of true labels captured (denominator clamped to 1)\n",
    "    covered = len(true_labels & predicted_labels) / max(len(true_labels), 1)\n",
    "    coverages.append(covered)\n",
    "\n",
    "    # Set size\n",
    "    set_sizes.append(len(predicted_labels))\n",
    "\n",
    "mean_coverage = np.mean(coverages)\n",
    "avg_set_size = np.mean(set_sizes)\n",
    "\n",
    "print(\"\\n📊 Evaluation (Mondrian CP + LR):\")\n",
    "print(f\"Target Coverage (1 - α): {1 - alpha:.2f}\")\n",
    "print(f\"Empirical Coverage:      {mean_coverage:.4f}\")\n",
    "print(f\"Average Set Size:        {avg_set_size:.2f}\")\n",
    "\n",
    "\n",
    "# === Compute Marginal Coverage ===\n",
    "# Per label: share of its positive validation images whose prediction set contains it.\n",
    "marginal_coverages = []\n",
    "\n",
    "for j in range(Y_val.shape[1]):\n",
    "    true_positives = (Y_val[:, j] == 1)\n",
    "    predicted_as_positive = (Y_pred_sets[:, j] == 1)\n",
    "    covered = np.logical_and(true_positives, predicted_as_positive)\n",
    "\n",
    "    if true_positives.sum() > 0:\n",
    "        marginal_coverages.append(covered.sum() / true_positives.sum())\n",
    "    else:\n",
    "        marginal_coverages.append(np.nan)  # Label not present in val set\n",
    "\n",
    "marginal_coverages = np.array(marginal_coverages)\n",
    "marginal_coverage_mean = np.nanmean(marginal_coverages)\n",
    "print(f\"Marginal Coverage:       {marginal_coverage_mean:.4f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "26",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Score the conformal prediction sets as if they were hard multi-label predictions.\n",
    "Y_true, Y_pred = Y_val, Y_pred_sets\n",
    "macro_f1 = f1_score(y_true=Y_true, y_pred=Y_pred, average=\"macro\")\n",
    "# Exact match: every label of an image must agree.\n",
    "exact_match = np.all(Y_true == Y_pred, axis=1).mean()\n",
    "\n",
    "print(f\"Macro-F1 Score:           {macro_f1:.4f}\")\n",
    "print(f\"Exact Match Accuracy:     {exact_match:.4f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "27",
   "metadata": {},
   "source": [
    "## Mondrian CP with RNN"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "28",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Target miscoverage rate, and a 70/30 split of (X, Y) into proper-training and calibration parts.\n",
    "alpha = 0.1\n",
    "X_train, X_calib, Y_train, Y_calib = train_test_split(\n",
    "    X, Y, test_size=0.3, random_state=42\n",
    ")\n",
    "class CLIP_RNN(nn.Module):\n",
    "    \"\"\"Single-step LSTM followed by a linear multi-label head with sigmoid outputs.\n",
    "\n",
    "    NOTE(review): a class with the same name is also defined in the CNN-RNN baseline\n",
    "    cell; this later definition shadows it.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, input_dim, hidden_dim, num_labels):\n",
    "        super().__init__()\n",
    "        self.rnn = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, batch_first=True)\n",
    "        self.classifier = nn.Linear(in_features=hidden_dim, out_features=num_labels)\n",
    "\n",
    "    def forward(self, x):\n",
    "        \"\"\"Map a (batch, input_dim) tensor to (batch, num_labels) probabilities.\"\"\"\n",
    "        # Present each embedding to the LSTM as a sequence of length one.\n",
    "        sequence = x.unsqueeze(1)\n",
    "        hidden_states, _ = self.rnn(sequence)\n",
    "        # There is exactly one time step, so take it directly.\n",
    "        logits = self.classifier(hidden_states[:, 0])\n",
    "        return torch.sigmoid(logits)\n",
    "\n",
    "# Model size follows the data: embedding width in, one output per label.\n",
    "input_dim, hidden_dim, num_labels = X.shape[1], 256, len(label_columns)\n",
    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
    "\n",
    "model = CLIP_RNN(input_dim, hidden_dim, num_labels).to(device)\n",
    "criterion = nn.BCELoss()  # the model already applies the sigmoid\n",
    "optimizer = optim.Adam(model.parameters(), lr=1e-3)\n",
    "train_dataset = TensorDataset(torch.FloatTensor(X_train), torch.FloatTensor(Y_train))\n",
    "train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)\n",
    "\n",
    "print(\"🚀 Training CLIP-RNN model...\")\n",
    "for epoch in range(10):\n",
    "    model.train()\n",
    "    total_loss = 0.0\n",
    "    progress = tqdm(train_loader, desc=f\"Epoch {epoch+1}\")\n",
    "    for batch_x, batch_y in progress:\n",
    "        batch_x = batch_x.to(device)\n",
    "        batch_y = batch_y.to(device)\n",
    "        optimizer.zero_grad()\n",
    "        outputs = model(batch_x)\n",
    "        loss = criterion(outputs, batch_y)\n",
    "        loss.backward()\n",
    "        optimizer.step()\n",
    "        total_loss += loss.item()\n",
    "    # Report the mean batch loss of the epoch.\n",
    "    print(f\"Epoch {epoch+1} Loss: {total_loss / len(train_loader):.4f}\")\n",
    "\n",
    "print(\"\\n⚙️ Applying Mondrian CP on CLIP-RNN...\")\n",
    "model.eval()\n",
    "with torch.no_grad():\n",
    "    calib_probs = model(torch.FloatTensor(X_calib).to(device)).cpu().numpy()\n",
    "    val_probs = model(torch.FloatTensor(X_val).to(device)).cpu().numpy()\n",
    "\n",
    "# One threshold per label: the (1 - alpha) quantile of the nonconformity scores of the\n",
    "# calibration positives; 1.0 (label always included) when a label has none.\n",
    "thresholds = np.zeros(num_labels)\n",
    "for i in range(num_labels):\n",
    "    calib_nc = 1 - calib_probs[:, i]\n",
    "    label_mask = Y_calib[:, i] == 1\n",
    "    thresholds[i] = np.quantile(calib_nc[label_mask], 1 - alpha) if label_mask.any() else 1.0\n",
    "\n",
    "# A label enters the prediction set when its nonconformity does not exceed its threshold.\n",
    "val_nc = 1 - val_probs\n",
    "Y_pred_sets = (val_nc <= thresholds).astype(int)\n",
    "\n",
    "true_mask = Y_val == 1\n",
    "pred_mask = Y_pred_sets == 1\n",
    "hit_mask = true_mask & pred_mask\n",
    "\n",
    "# Per image: fraction of true labels covered (denominator clamped to 1) and set size.\n",
    "coverages = hit_mask.sum(axis=1) / np.maximum(true_mask.sum(axis=1), 1)\n",
    "set_sizes = pred_mask.sum(axis=1)\n",
    "mean_coverage = np.mean(coverages)\n",
    "avg_set_size = np.mean(set_sizes)\n",
    "\n",
    "# Per label: fraction of positive validation images covered; NaN when the label is absent.\n",
    "positives_per_label = true_mask.sum(axis=0)\n",
    "marginal_coverages = np.where(\n",
    "    positives_per_label > 0,\n",
    "    hit_mask.sum(axis=0) / np.maximum(positives_per_label, 1),\n",
    "    np.nan,\n",
    ")\n",
    "marginal_coverage_mean = np.nanmean(marginal_coverages)\n",
    "\n",
    "macro_f1 = f1_score(Y_val, Y_pred_sets, average=\"macro\")\n",
    "exact_match = np.all(Y_val == Y_pred_sets, axis=1).mean()\n",
    "\n",
    "print(\"\\n📊 Evaluation (CLIP-RNN + Mondrian CP):\")\n",
    "print(f\"Target Coverage (1 - α): {1 - alpha:.2f}\")\n",
    "print(f\"Empirical Coverage:       {mean_coverage:.4f}\")\n",
    "print(f\"Marginal Coverage:        {marginal_coverage_mean:.4f}\")\n",
    "print(f\"Average Prediction Set:   {avg_set_size:.2f}\")\n",
    "print(f\"Macro-F1 Score:           {macro_f1:.4f}\")\n",
    "print(f\"Exact Match Accuracy:     {exact_match:.4f}\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "29",
   "metadata": {},
   "source": [
    "### Function for Statistical Significance "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "30",
   "metadata": {},
   "outputs": [],
   "source": [
    "def run_cp_experiment(seed, alpha=0.1):\n",
    "    random.seed(seed)\n",
    "    np.random.seed(seed)\n",
    "    torch.manual_seed(seed)\n",
    "    if torch.cuda.is_available():\n",
    "        torch.cuda.manual_seed_all(seed)\n",
    "    X = np.load(\"clip_train_embeddings.npy\")\n",
    "    Y = pd.read_csv(\"coco_multilabel_train_encoded.csv\").drop(\"filename\", axis=1).values.astype(np.float32)\n",
    "    X_val = np.load(\"clip_val_embeddings.npy\")\n",
    "    Y_val = pd.read_csv(\"coco_multilabel_val_encoded.csv\").drop(\"filename\", axis=1).values.astype(np.float32)\n",
    "\n",
    "    X_train, X_calib, Y_train, Y_calib = train_test_split(\n",
    "        X, Y, test_size=0.3, random_state=seed\n",
    "    )\n",
    "\n",
    "    train_ds = TensorDataset(torch.FloatTensor(X_train), torch.FloatTensor(Y_train))\n",
    "    train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)\n",
    "    model = CLIP_RNN(input_dim=X.shape[1], hidden_dim=256, num_labels=Y.shape[1]).to(device)\n",
    "    optimizer = optim.Adam(model.parameters(), lr=1e-3)\n",
    "    criterion = nn.BCELoss()\n",
    "    model.train()\n",
    "    for _ in range(10):\n",
    "        for xb, yb in train_loader:\n",
    "            xb, yb = xb.to(device), yb.to(device)\n",
    "            optimizer.zero_grad()\n",
    "            loss = criterion(model(xb), yb)\n",
    "            loss.backward()\n",
    "            optimizer.step()\n",
    "\n",
    "    model.eval()\n",
    "    with torch.no_grad():\n",
    "        calib_probs = model(torch.FloatTensor(X_calib).to(device)).cpu().numpy()\n",
    "        val_probs   = model(torch.FloatTensor(X_val).to(device)).cpu().numpy()\n",
    "\n",
    "    thresholds = np.zeros(Y.shape[1])\n",
    "    for i in range(Y.shape[1]):\n",
    "        nc = 1 - calib_probs[:, i]\n",
    "        mask = Y_calib[:, i] == 1\n",
    "        thresholds[i] = np.quantile(nc[mask], 1 - alpha) if mask.sum() else 1.0\n",
    "\n",
    "    val_nc = 1 - val_probs\n",
    "    Y_pred_sets = (val_nc <= thresholds).astype(int)\n",
    "    macro_f1 = f1_score(Y_val, Y_pred_sets, average=\"macro\")\n",
    "    return macro_f1\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "31",
   "metadata": {},
   "outputs": [],
   "source": [
    "seeds = [42, 100, 2021, 7, 999]\n",
    "f1_single = []\n",
    "\n",
    "for s in seeds:\n",
    "    f1_single.append(run_cp_experiment(s, alpha=0.1))\n",
    "\n",
    "mean_f1 = np.mean(f1_single)\n",
    "std_f1  = np.std(f1_single, ddof=1)\n",
    "print(f\"CP Macro-F1 over {len(seeds)} seeds: {mean_f1:.3f} ± {std_f1:.3f}\")\n",
    "# stat, p = wilcoxon(f1_single, f1_ensemble)\n",
    "# print(f\"Wilcoxon p-value: {p:.3f}\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "32",
   "metadata": {},
   "source": [
    "## Mondrian CP with MLP"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "33",
   "metadata": {},
   "outputs": [],
   "source": [
    "alpha = 0.1\n",
    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
    "X = np.load(\"clip_train_embeddings.npy\")\n",
    "X_val = np.load(\"clip_val_embeddings.npy\")\n",
    "\n",
    "df_train = pd.read_csv(\"coco_multilabel_train_encoded.csv\")\n",
    "df_val = pd.read_csv(\"coco_multilabel_val_encoded.csv\")\n",
    "label_columns = [col for col in df_train.columns if col != \"filename\"]\n",
    "\n",
    "Y = df_train[label_columns].values\n",
    "Y_val = df_val[label_columns].values\n",
    "X_train, X_calib, Y_train, Y_calib = train_test_split(X, Y, test_size=0.3, random_state=42)\n",
    "class SimpleMLP(nn.Module):\n",
    "    def __init__(self, input_dim):\n",
    "        super().__init__()\n",
    "        self.net = nn.Sequential(\n",
    "            nn.Linear(input_dim, 256),\n",
    "            nn.ReLU(),\n",
    "            nn.Linear(256, 1),\n",
    "            nn.Sigmoid()\n",
    "        )\n",
    "\n",
    "    def forward(self, x):\n",
    "        return self.net(x)\n",
    "\n",
    "def train_mlp(X, y, input_dim):\n",
    "    model = SimpleMLP(input_dim).to(device)\n",
    "    X_tensor = torch.tensor(X, dtype=torch.float32).to(device)\n",
    "    y_tensor = torch.tensor(y, dtype=torch.float32).unsqueeze(1).to(device)\n",
    "\n",
    "    loader = DataLoader(TensorDataset(X_tensor, y_tensor), batch_size=128, shuffle=True)\n",
    "    opt = torch.optim.Adam(model.parameters(), lr=1e-3)\n",
    "    loss_fn = nn.BCELoss()\n",
    "\n",
    "    model.train()\n",
    "    for epoch in range(10):\n",
    "        for xb, yb in loader:\n",
    "            pred = model(xb)\n",
    "            loss = loss_fn(pred, yb)\n",
    "            opt.zero_grad()\n",
    "            loss.backward()\n",
    "            opt.step()\n",
    "    return model\n",
    "\n",
    "print(\"🚀 Training Mondrian CP + MLP...\")\n",
    "thresholds = np.zeros(len(label_columns))\n",
    "Y_pred_sets = []\n",
    "\n",
    "for i, label in enumerate(tqdm(label_columns)):\n",
    "    y_train_i = Y_train[:, i]\n",
    "    y_calib_i = Y_calib[:, i]\n",
    "\n",
    "    model = train_mlp(X_train, y_train_i, X.shape[1])\n",
    "\n",
    "    with torch.no_grad():\n",
    "        X_calib_tensor = torch.tensor(X_calib, dtype=torch.float32).to(device)\n",
    "        probs_calib = model(X_calib_tensor).cpu().numpy().squeeze()\n",
    "\n",
    "    nc_scores = 1 - probs_calib\n",
    "    thresholds[i] = np.quantile(nc_scores[y_calib_i == 1], 1 - alpha)\n",
    "\n",
    "    with torch.no_grad():\n",
    "        X_val_tensor = torch.tensor(X_val, dtype=torch.float32).to(device)\n",
    "        probs_val = model(X_val_tensor).cpu().numpy().squeeze()\n",
    "\n",
    "    Y_pred = (1 - probs_val <= thresholds[i]).astype(int)\n",
    "    Y_pred_sets.append(Y_pred)\n",
    "\n",
    "Y_pred_sets = np.array(Y_pred_sets).T \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "34",
   "metadata": {},
   "outputs": [],
   "source": [
    "coverages = []\n",
    "set_sizes = []\n",
    "\n",
    "for i in range(Y_val.shape[0]):\n",
    "    true = set(np.where(Y_val[i] == 1)[0])\n",
    "    pred = set(np.where(Y_pred_sets[i] == 1)[0])\n",
    "    coverages.append(len(true & pred) / max(len(true), 1))\n",
    "    set_sizes.append(len(pred))\n",
    "\n",
    "coverage = np.mean(coverages)\n",
    "avg_size = np.mean(set_sizes)\n",
    "macro_f1 = f1_score(Y_val, Y_pred_sets, average=\"macro\")\n",
    "exact_match = (Y_val == Y_pred_sets).all(axis=1).mean()\n",
    "\n",
    "print(\"\\n📊 Mondrian CP + MLP Results:\")\n",
    "print(f\"Empirical Coverage:        {coverage:.4f}\")\n",
    "print(f\"Average Set Size:          {avg_size:.2f}\")\n",
    "print(f\"Macro-F1 Score:            {macro_f1:.4f}\")\n",
    "print(f\"Exact Match Accuracy:      {exact_match:.4f}\")\n",
    "marginal_coverages = []\n",
    "\n",
    "for j in range(Y_val.shape[1]):\n",
    "    true_positives = (Y_val[:, j] == 1)\n",
    "    predicted_as_positive = (Y_pred_sets[:, j] == 1)\n",
    "    covered = np.logical_and(true_positives, predicted_as_positive)\n",
    "\n",
    "    if true_positives.sum() > 0:\n",
    "        marginal_coverages.append(covered.sum() / true_positives.sum())\n",
    "    else:\n",
    "        marginal_coverages.append(np.nan)\n",
    "\n",
    "marginal_coverages = np.array(marginal_coverages)\n",
    "marginal_coverage = np.nanmean(marginal_coverages)\n",
    "print(\"\\n📊 Mondrian CP + MLP Results:\")\n",
    "print(f\"Empirical Coverage:        {coverage:.4f}\")\n",
    "print(f\"Marginal Coverage:         {marginal_coverage:.4f}\")\n",
    "print(f\"Average Set Size:          {avg_size:.2f}\")\n",
    "print(f\"Macro-F1 Score:            {macro_f1:.4f}\")\n",
    "print(f\"Exact Match Accuracy:      {exact_match:.4f}\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "35",
   "metadata": {},
   "source": [
    "## Modrian CP with SGD"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "36",
   "metadata": {},
   "outputs": [],
   "source": [
    "alpha = 0.1\n",
    "X = np.load(\"clip_train_embeddings.npy\")\n",
    "X_val = np.load(\"clip_val_embeddings.npy\")\n",
    "\n",
    "df_train = pd.read_csv(\"coco_multilabel_train_encoded.csv\")\n",
    "df_val = pd.read_csv(\"coco_multilabel_val_encoded.csv\")\n",
    "label_columns = [col for col in df_train.columns if col != \"filename\"]\n",
    "\n",
    "Y = df_train[label_columns].values\n",
    "Y_val = df_val[label_columns].values\n",
    "n_val = Y_val.shape[0]\n",
    "Y_pred = np.zeros_like(Y_val)\n",
    "\n",
    "print(\"🚀 Running Mondrian CP with SGDClassifier per label...\")\n",
    "for j, label in enumerate(tqdm(label_columns)):\n",
    "    y = Y[:, j]\n",
    "    X_train, X_calib, y_train, y_calib = train_test_split(X, y, test_size=0.3, random_state=42)\n",
    "    clf = SGDClassifier(loss=\"log_loss\", max_iter=1000, random_state=0)\n",
    "    clf.fit(X_train, y_train)\n",
    "    calib_probs = clf.predict_proba(X_calib)[:, 1]\n",
    "    calib_nc = 1 - calib_probs\n",
    "    q = np.quantile(calib_nc[y_calib == 1], 1 - alpha)\n",
    "    val_probs = clf.predict_proba(X_val)[:, 1]\n",
    "    val_nc = 1 - val_probs\n",
    "    Y_pred[:, j] = (val_nc <= q).astype(int)\n",
    "coverages = []\n",
    "set_sizes = []\n",
    "\n",
    "for i in range(n_val):\n",
    "    true = set(np.where(Y_val[i] == 1)[0])\n",
    "    pred = set(np.where(Y_pred[i] == 1)[0])\n",
    "    covered = len(true & pred) / max(len(true), 1)\n",
    "    coverages.append(covered)\n",
    "    set_sizes.append(len(pred))\n",
    "\n",
    "coverage = np.mean(coverages)\n",
    "avg_size = np.mean(set_sizes)\n",
    "macro_f1 = f1_score(Y_val, Y_pred, average=\"macro\")\n",
    "exact_match = (Y_val == Y_pred).all(axis=1).mean()\n",
    "\n",
    "print(\"\\n📊 Single-model CP (SGD) Results:\")\n",
    "print(f\"Empirical Coverage:        {coverage:.4f}\")\n",
    "print(f\"Average Set Size:          {avg_size:.2f}\")\n",
    "print(f\"Macro-F1 Score:            {macro_f1:.4f}\")\n",
    "print(f\"Exact Match Accuracy:      {exact_match:.4f}\")\n",
    "marginal_coverages = []\n",
    "\n",
    "for j in range(Y_val.shape[1]):\n",
    "    true_positives = (Y_val[:, j] == 1)\n",
    "    predicted_as_positive = (Y_pred[:, j] == 1)\n",
    "    covered = np.logical_and(true_positives, predicted_as_positive)\n",
    "\n",
    "    if true_positives.sum() > 0:\n",
    "        marginal_coverages.append(covered.sum() / true_positives.sum())\n",
    "    else:\n",
    "        marginal_coverages.append(np.nan)  \n",
    "\n",
    "marginal_coverages = np.array(marginal_coverages)\n",
    "marginal_coverage = np.nanmean(marginal_coverages)\n",
    "\n",
    "print(\"\\n📊 Single-model CP (SGD) Results:\")\n",
    "print(f\"Empirical Coverage:        {coverage:.4f}\")\n",
    "print(f\"Marginal Coverage:         {marginal_coverage:.4f}\")\n",
    "print(f\"Average Set Size:          {avg_size:.2f}\")\n",
    "print(f\"Macro-F1 Score:            {macro_f1:.4f}\")\n",
    "print(f\"Exact Match Accuracy:      {exact_match:.4f}\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "37",
   "metadata": {},
   "source": [
    "## Homogenous Ensemble 5 Logistic Regression "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "38",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.utils import resample\n",
    "n_models = 5\n",
    "threshold = 0.5\n",
    "X_train = np.load(\"clip_train_embeddings.npy\")\n",
    "X_val = np.load(\"clip_val_embeddings.npy\")\n",
    "\n",
    "df_train = pd.read_csv(\"coco_multilabel_train_encoded.csv\")\n",
    "df_val = pd.read_csv(\"coco_multilabel_val_encoded.csv\")\n",
    "label_columns = [col for col in df_train.columns if col != \"filename\"]\n",
    "\n",
    "Y_train = df_train[label_columns].values\n",
    "Y_val = df_val[label_columns].values\n",
    "print(f\"🚀 Training {n_models}x Logistic Regression Ensemble per label...\")\n",
    "ensemble_preds = np.zeros((Y_val.shape[0], len(label_columns)))\n",
    "\n",
    "for i, label in enumerate(tqdm(label_columns)):\n",
    "    val_probs_accum = np.zeros(Y_val.shape[0])\n",
    "\n",
    "    for m in range(n_models):\n",
    "        X_boot, y_boot = resample(X_train, Y_train[:, i], replace=True, random_state=m)\n",
    "        clf = LogisticRegression(max_iter=1000)\n",
    "        clf.fit(X_boot, y_boot)\n",
    "        val_probs = clf.predict_proba(X_val)[:, 1]\n",
    "        val_probs_accum += val_probs\n",
    "\n",
    "    val_probs_mean = val_probs_accum / n_models\n",
    "    ensemble_preds[:, i] = (val_probs_mean >= threshold).astype(int)\n",
    "\n",
    "macro_f1 = f1_score(Y_val, ensemble_preds, average=\"macro\")\n",
    "exact_match = (Y_val == ensemble_preds).all(axis=1).mean()\n",
    "\n",
    "print(\"\\n📊 Standard Ensemble (LR, Majority Voting):\")\n",
    "print(f\"Macro-F1 Score:            {macro_f1:.4f}\")\n",
    "print(f\"Exact Match Accuracy:      {exact_match:.4f}\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "39",
   "metadata": {},
   "source": [
    "## Label Bagging with LR"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "40",
   "metadata": {},
   "outputs": [],
   "source": [
    "n_models = 10\n",
    "label_subset_size = 10\n",
    "threshold = 0.5\n",
    "random.seed(42)\n",
    "\n",
    "X_train = np.load(\"clip_train_embeddings.npy\")\n",
    "X_val = np.load(\"clip_val_embeddings.npy\")\n",
    "\n",
    "df_train = pd.read_csv(\"coco_multilabel_train_encoded.csv\")\n",
    "df_val = pd.read_csv(\"coco_multilabel_val_encoded.csv\")\n",
    "label_columns = [col for col in df_train.columns if col != \"filename\"]\n",
    "\n",
    "Y_train = df_train[label_columns].values\n",
    "Y_val = df_val[label_columns].values\n",
    "n_labels = len(label_columns)\n",
    "val_label_votes = defaultdict(list)\n",
    "\n",
    "print(f\"🚀 Training {n_models} label-bagging models...\")\n",
    "for m in tqdm(range(n_models)):\n",
    "    selected_labels = random.sample(label_columns, label_subset_size)\n",
    "    selected_indices = [label_columns.index(lbl) for lbl in selected_labels]\n",
    "    X_boot, Y_boot = resample(X_train, Y_train, replace=True, random_state=m)\n",
    "    models = {}\n",
    "    for i, label in zip(selected_indices, selected_labels):\n",
    "        y_boot_i = Y_boot[:, i]\n",
    "        clf = LogisticRegression(max_iter=1000)\n",
    "        clf.fit(X_boot, y_boot_i)\n",
    "        models[label] = clf\n",
    "\n",
    "    for label in selected_labels:\n",
    "        i = label_columns.index(label)\n",
    "        clf = models[label]\n",
    "        probs = clf.predict_proba(X_val)[:, 1]\n",
    "        val_label_votes[label].append(probs)\n",
    "\n",
    "Y_pred = np.zeros_like(Y_val)\n",
    "\n",
    "for j, label in enumerate(label_columns):\n",
    "    if label in val_label_votes:\n",
    "        probs = np.mean(val_label_votes[label], axis=0)\n",
    "        Y_pred[:, j] = (probs >= threshold).astype(int)\n",
    "\n",
    "macro_f1 = f1_score(Y_val, Y_pred, average=\"macro\")\n",
    "exact_match = (Y_val == Y_pred).all(axis=1).mean()\n",
    "\n",
    "print(\"\\n📊 Label Bagging (LR, no CP):\")\n",
    "print(f\"Macro-F1 Score:        {macro_f1:.4f}\")\n",
    "print(f\"Exact Match Accuracy:  {exact_match:.4f}\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "41",
   "metadata": {},
   "source": [
    "## Ensemble and Afterwards conformal"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "42",
   "metadata": {},
   "source": [
    "### One vs Rest Way"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "43",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.neural_network import MLPClassifier\n",
    "from sklearn.calibration import CalibratedClassifierCV\n",
    "from sklearn.multiclass import OneVsRestClassifier\n",
    "X = np.load(\"clip_train_embeddings.npy\")\n",
    "X_val = np.load(\"clip_val_embeddings.npy\")\n",
    "\n",
    "df_train = pd.read_csv(\"coco_multilabel_train_encoded.csv\")\n",
    "df_val = pd.read_csv(\"coco_multilabel_val_encoded.csv\")\n",
    "label_columns = [col for col in df_train.columns if col != \"filename\"]\n",
    "\n",
    "Y = df_train[label_columns].values\n",
    "Y_val = df_val[label_columns].values\n",
    "n_labels = Y.shape[1]\n",
    "alpha = 0.1\n",
    "models = [\n",
    "    (\"LR\", OneVsRestClassifier(LogisticRegression(max_iter=1000))),\n",
    "    (\"MLP\", OneVsRestClassifier(MLPClassifier(max_iter=1000))),\n",
    "    (\"SGD\", OneVsRestClassifier(CalibratedClassifierCV(SGDClassifier(loss=\"log_loss\", max_iter=1000), cv=3)))\n",
    "]\n",
    "\n",
    "X_train, X_calib, Y_train, Y_calib = train_test_split(X, Y, test_size=0.3, random_state=42)\n",
    "\n",
    "val_probs_ensemble = np.zeros((X_val.shape[0], n_labels, len(models)))\n",
    "calib_probs_ensemble = np.zeros((X_calib.shape[0], n_labels, len(models)))\n",
    "\n",
    "print(\"🚀 Training ensemble models...\")\n",
    "for idx, (name, clf) in enumerate(models):\n",
    "    print(f\"Training {name}...\")\n",
    "    clf.fit(X_train, Y_train)\n",
    "    calib_probs_ensemble[:, :, idx] = clf.predict_proba(X_calib)\n",
    "    val_probs_ensemble[:, :, idx] = clf.predict_proba(X_val)\n",
    "\n",
    "val_probs_avg = np.mean(val_probs_ensemble, axis=2)\n",
    "calib_probs_avg = np.mean(calib_probs_ensemble, axis=2)\n",
    "\n",
    "# === Step 3: Apply conformal prediction to averaged probabilities ===\n",
    "thresholds = np.zeros(n_labels)\n",
    "for j in range(n_labels):\n",
    "    nonconformity = 1 - calib_probs_avg[:, j]\n",
    "    mask = Y_calib[:, j] == 1\n",
    "    if np.sum(mask) == 0:\n",
    "        thresholds[j] = 1.0\n",
    "    else:\n",
    "        q = np.quantile(nonconformity[mask], 1 - alpha)\n",
    "        thresholds[j] = q\n",
    "\n",
    "nonconformity_val = 1 - val_probs_avg\n",
    "Y_pred = (nonconformity_val <= thresholds).astype(int)\n",
    "macro_f1 = f1_score(Y_val, Y_pred, average=\"macro\")\n",
    "exact_match = (Y_val == Y_pred).all(axis=1).mean()\n",
    "coverage = np.mean([\n",
    "    bool(set(np.where(Y_val[i])[0]) & set(np.where(Y_pred[i])[0]))\n",
    "    for i in range(Y_val.shape[0])\n",
    "])\n",
    "avg_set_size = np.mean([np.sum(Y_pred[i]) for i in range(Y_val.shape[0])])\n",
    "print(\"\\n📊 Baseline: Ensemble → Then CP (Averaged Output)\")\n",
    "print(f\"Macro F1:           {macro_f1:.4f}\")\n",
    "print(f\"Exact Match Acc:    {exact_match:.4f}\")\n",
    "print(f\"Marginal Coverage:  {coverage:.4f}\")\n",
    "print(f\"Avg. Prediction Set Size: {avg_set_size:.2f}\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "44",
   "metadata": {},
   "source": [
    "### Normal"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "45",
   "metadata": {},
   "outputs": [],
   "source": [
    "alpha = 0.1\n",
    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
    "# device = torch.device(\"cpu\")\n",
    "X = np.load(\"clip_train_embeddings.npy\")\n",
    "X_val = np.load(\"clip_val_embeddings.npy\")\n",
    "\n",
    "df_train = pd.read_csv(\"coco_multilabel_train_encoded.csv\")\n",
    "df_val = pd.read_csv(\"coco_multilabel_val_encoded.csv\")\n",
    "label_columns = [col for col in df_train.columns if col != \"filename\"]\n",
    "\n",
    "Y = df_train[label_columns].values\n",
    "Y_val = df_val[label_columns].values\n",
    "n_labels = Y.shape[1]\n",
    "class MLP(nn.Module):\n",
    "    def __init__(self, input_dim):\n",
    "        super().__init__()\n",
    "        self.net = nn.Sequential(\n",
    "            nn.Linear(input_dim, 256),\n",
    "            nn.ReLU(),\n",
    "            nn.Linear(256, 1),\n",
    "            nn.Sigmoid()\n",
    "        )\n",
    "    def forward(self, x): return self.net(x)\n",
    "\n",
    "def train_mlp(X, y):\n",
    "    model = MLP(X.shape[1]).to(device)\n",
    "    X_t = torch.tensor(X, dtype=torch.float32).to(device)\n",
    "    y_t = torch.tensor(y, dtype=torch.float32).unsqueeze(1).to(device)\n",
    "    loader = DataLoader(TensorDataset(X_t, y_t), batch_size=128, shuffle=True)\n",
    "    opt = torch.optim.Adam(model.parameters(), lr=1e-3)\n",
    "    loss_fn = nn.BCELoss()\n",
    "    model.train()\n",
    "    for _ in range(10):\n",
    "        for xb, yb in loader:\n",
    "            opt.zero_grad()\n",
    "            loss_fn(model(xb), yb).backward()\n",
    "            opt.step()\n",
    "    return model\n",
    "X_train, X_calib, Y_train, Y_calib = train_test_split(X, Y, test_size=0.3, random_state=42)\n",
    "val_probs_models = np.zeros((X_val.shape[0], n_labels, 3))\n",
    "calib_probs_models = np.zeros((X_calib.shape[0], n_labels, 3))\n",
    "\n",
    "print(\"🚀 Training LR, MLP, SGD per label with consistent logic...\")\n",
    "for label_idx in tqdm(range(n_labels)):\n",
    "    y_train_bin = Y_train[:, label_idx]\n",
    "    y_calib_bin = Y_calib[:, label_idx]\n",
    "\n",
    "    # === LR ===\n",
    "    lr = LogisticRegression(max_iter=1000)\n",
    "    lr.fit(X_train, y_train_bin)\n",
    "    calib_probs_models[:, label_idx, 0] = lr.predict_proba(X_calib)[:, 1]\n",
    "    val_probs_models[:, label_idx, 0] = lr.predict_proba(X_val)[:, 1]\n",
    "\n",
    "    # === MLP ===\n",
    "    mlp = train_mlp(X_train, y_train_bin)\n",
    "    with torch.no_grad():\n",
    "        calib_probs_models[:, label_idx, 1] = mlp(torch.tensor(X_calib, dtype=torch.float32).to(device)).cpu().numpy().squeeze()\n",
    "        val_probs_models[:, label_idx, 1] = mlp(torch.tensor(X_val, dtype=torch.float32).to(device)).cpu().numpy().squeeze()\n",
    "\n",
    "    # === SGD ===\n",
    "    sgd = CalibratedClassifierCV(SGDClassifier(loss=\"log_loss\", max_iter=1000), cv=3)\n",
    "    sgd.fit(X_train, y_train_bin)\n",
    "    calib_probs_models[:, label_idx, 2] = sgd.predict_proba(X_calib)[:, 1]\n",
    "    val_probs_models[:, label_idx, 2] = sgd.predict_proba(X_val)[:, 1]\n",
    "val_probs_avg = np.mean(val_probs_models, axis=2)\n",
    "calib_probs_avg = np.mean(calib_probs_models, axis=2)\n",
    "thresholds = np.zeros(n_labels)\n",
    "print(\"📏 Calibrating thresholds...\")\n",
    "for j in tqdm(range(n_labels)):\n",
    "    nonconformity = 1 - calib_probs_avg[:, j]\n",
    "    mask = Y_calib[:, j] == 1\n",
    "    if np.sum(mask) == 0:\n",
    "        thresholds[j] = 1.0\n",
    "    else:\n",
    "        thresholds[j] = np.quantile(nonconformity[mask], 1 - alpha)\n",
    "\n",
    "print(\"🔍 Predicting...\")\n",
    "Y_pred = np.zeros_like(Y_val)\n",
    "for j in tqdm(range(n_labels)):\n",
    "    nonconformity_val = 1 - val_probs_avg[:, j]\n",
    "    Y_pred[:, j] = (nonconformity_val <= thresholds[j]).astype(int)\n",
    "\n",
    "macro_f1 = f1_score(Y_val, Y_pred, average=\"macro\")\n",
    "exact_match = (Y_val == Y_pred).all(axis=1).mean()\n",
    "coverage = np.mean([\n",
    "    bool(set(np.where(Y_val[i])[0]) & set(np.where(Y_pred[i])[0]))\n",
    "    for i in range(Y_val.shape[0])\n",
    "])\n",
    "avg_set_size = np.mean([np.sum(Y_pred[i]) for i in range(Y_val.shape[0])])\n",
    "print(\"\\n📊 Per-label Ensemble → Then CP (MLP = PyTorch)\")\n",
    "print(f\"Macro F1:                  {macro_f1:.4f}\")\n",
    "print(f\"Exact Match Accuracy:      {exact_match:.4f}\")\n",
    "print(f\"Marginal Coverage:         {coverage:.4f}\")\n",
    "print(f\"Average Prediction Set Size: {avg_set_size:.2f}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "46",
   "metadata": {},
   "outputs": [],
   "source": [
    "coverages = []\n",
    "set_sizes = []\n",
    "\n",
    "for i in range(Y_val.shape[0]):\n",
    "    true = set(np.where(Y_val[i] == 1)[0])\n",
    "    pred = set(np.where(Y_pred[i] == 1)[0])\n",
    "    covered = len(true & pred) / max(len(true), 1) \n",
    "    coverages.append(covered)\n",
    "    set_sizes.append(len(pred))\n",
    "\n",
    "coverage = np.mean(coverages)\n",
    "avg_size = np.mean(set_sizes)\n",
    "\n",
    "# === Final Report ===\n",
    "print(\"\\n📊 Per-label Ensemble → Then CP (MLP = PyTorch)\")\n",
    "print(f\"Empirical Coverage:        {coverage:.4f}\")\n",
    "print(f\"Average Prediction Set Size: {avg_size:.2f}\")\n",
    "print(f\"Macro-F1 Score:            {macro_f1:.4f}\")\n",
    "print(f\"Exact Match Accuracy:      {exact_match:.4f}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "47",
   "metadata": {},
   "outputs": [],
   "source": [
    "# === CP-style Empirical Coverage Metrics ===\n",
    "coverages = []\n",
    "set_sizes = []\n",
    "\n",
    "for i in range(Y_val.shape[0]):\n",
    "    true = set(np.where(Y_val[i] == 1)[0])\n",
    "    pred = set(np.where(Y_pred[i] == 1)[0])\n",
    "    covered = len(true & pred) / max(len(true), 1)  # per-sample recall\n",
    "    coverages.append(covered)\n",
    "    set_sizes.append(len(pred))\n",
    "\n",
    "coverage = np.mean(coverages)\n",
    "avg_size = np.mean(set_sizes)\n",
    "\n",
    "# === Final Report ===\n",
    "print(\"\\n📊 Per-label Ensemble → Then CP (MLP = PyTorch)\")\n",
    "print(f\"Empirical Coverage:        {coverage:.4f}\")\n",
    "print(f\"Average Prediction Set Size: {avg_size:.2f}\")\n",
    "print(f\"Macro-F1 Score:            {macro_f1:.4f}\")\n",
    "print(f\"Exact Match Accuracy:      {exact_match:.4f}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "48",
   "metadata": {},
   "outputs": [],
   "source": [
    "# === CP-style Empirical Coverage Metrics ===\n",
    "coverages = []\n",
    "set_sizes = []\n",
    "\n",
    "for i in range(Y_val.shape[0]):\n",
    "    true = set(np.where(Y_val[i] == 1)[0])\n",
    "    pred = set(np.where(Y_pred[i] == 1)[0])\n",
    "    covered = len(true & pred) / max(len(true), 1)  # per-sample recall\n",
    "    coverages.append(covered)\n",
    "    set_sizes.append(len(pred))\n",
    "\n",
    "coverage = np.mean(coverages)\n",
    "avg_size = np.mean(set_sizes)\n",
    "\n",
    "# === Final Report ===\n",
    "print(\"\\n📊 Per-label Ensemble → Then CP (MLP = PyTorch)\")\n",
    "print(f\"Empirical Coverage:        {coverage:.4f}\")\n",
    "print(f\"Average Prediction Set Size: {avg_size:.2f}\")\n",
    "print(f\"Macro-F1 Score:            {macro_f1:.4f}\")\n",
    "print(f\"Exact Match Accuracy:      {exact_match:.4f}\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "49",
   "metadata": {},
   "source": [
    "# Proposed Models"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "50",
   "metadata": {},
   "source": [
    "## Homogeneous"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "51",
   "metadata": {},
   "source": [
    "### CP Ensemble with LR"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "52",
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "n_models = 5\n",
    "alpha = 0.1\n",
    "threshold = 0.5\n",
    "X = np.load(\"clip_train_embeddings.npy\")\n",
    "X_val = np.load(\"clip_val_embeddings.npy\")\n",
    "\n",
    "df_train = pd.read_csv(\"coco_multilabel_train_encoded.csv\")\n",
    "df_val = pd.read_csv(\"coco_multilabel_val_encoded.csv\")\n",
    "label_columns = [col for col in df_train.columns if col != \"filename\"]\n",
    "\n",
    "Y = df_train[label_columns].values\n",
    "Y_val = df_val[label_columns].values\n",
    "\n",
    "n_labels = len(label_columns)\n",
    "n_val = Y_val.shape[0]\n",
    "ensemble_votes = np.zeros((n_val, n_labels, n_models), dtype=int)\n",
    "\n",
    "print(f\"🚀 Training Homogeneous CP Ensemble (LR × {n_models}) per label...\")\n",
    "\n",
    "for label_idx, label in enumerate(tqdm(label_columns)):\n",
    "    for m in range(n_models):\n",
    "        idxs = np.random.choice(len(X), len(X), replace=True)\n",
    "        X_boot = X[idxs]\n",
    "        y_boot = Y[idxs, label_idx]\n",
    "        X_train, X_calib, y_train, y_calib = train_test_split(X_boot, y_boot, test_size=0.3, random_state=m)\n",
    "        clf = LogisticRegression(max_iter=1000)\n",
    "        clf.fit(X_train, y_train)\n",
    "        calib_probs = clf.predict_proba(X_calib)[:, 1]\n",
    "        calib_nc = 1 - calib_probs\n",
    "        threshold_q = np.quantile(calib_nc[y_calib == 1], 1 - alpha)\n",
    "        val_probs = clf.predict_proba(X_val)[:, 1]\n",
    "        val_nc = 1 - val_probs\n",
    "        val_included = (val_nc <= threshold_q).astype(int)\n",
    "        ensemble_votes[:, label_idx, m] = val_included\n",
    "\n",
    "Y_pred = (np.sum(ensemble_votes, axis=2) >= ((n_models + 1) // 2)).astype(int)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "53",
   "metadata": {},
   "outputs": [],
   "source": [
    "coverages = []\n",
    "set_sizes = []\n",
    "\n",
    "for i in range(n_val):\n",
    "    true = set(np.where(Y_val[i] == 1)[0])\n",
    "    pred = set(np.where(Y_pred[i] == 1)[0])\n",
    "    covered = len(true & pred) / max(len(true), 1)\n",
    "    coverages.append(covered)\n",
    "    set_sizes.append(len(pred))\n",
    "\n",
    "coverage = np.mean(coverages)\n",
    "avg_size = np.mean(set_sizes)\n",
    "macro_f1 = f1_score(Y_val, Y_pred, average=\"macro\")\n",
    "exact_match = (Y_val == Y_pred).all(axis=1).mean()\n",
    "\n",
    "print(\"\\n📊 Homogeneous CP Ensemble Results (LR):\")\n",
    "print(f\"Empirical Coverage:        {coverage:.4f}\")\n",
    "print(f\"Average Set Size:          {avg_size:.2f}\")\n",
    "print(f\"Macro-F1 Score:            {macro_f1:.4f}\")\n",
    "print(f\"Exact Match Accuracy:      {exact_match:.4f}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "54",
   "metadata": {},
   "outputs": [],
   "source": [
    "n_models = 5\n",
    "alpha = 0.1\n",
    "threshold = 0.5\n",
    "X = np.load(\"clip_train_embeddings.npy\")\n",
    "X_val = np.load(\"clip_val_embeddings.npy\")\n",
    "\n",
    "df_train = pd.read_csv(\"coco_multilabel_train_encoded.csv\")\n",
    "df_val = pd.read_csv(\"coco_multilabel_val_encoded.csv\")\n",
    "label_columns = [col for col in df_train.columns if col != \"filename\"]\n",
    "\n",
    "Y = df_train[label_columns].values\n",
    "Y_val = df_val[label_columns].values\n",
    "\n",
    "n_labels = len(label_columns)\n",
    "n_val = Y_val.shape[0]\n",
    "ensemble_votes = np.zeros((n_val, n_labels, n_models), dtype=int)\n",
    "\n",
    "print(f\"🚀 Training Homogeneous CP Ensemble (LR × {n_models}) per label...\")\n",
    "\n",
    "for label_idx, label in enumerate(tqdm(label_columns)):\n",
    "    for m in range(n_models):\n",
    "        idxs = np.random.choice(len(X), len(X), replace=True)\n",
    "        X_boot = X[idxs]\n",
    "        y_boot = Y[idxs, label_idx]\n",
    "        X_train, X_calib, y_train, y_calib = train_test_split(X_boot, y_boot, test_size=0.3, random_state=m)\n",
    "        clf = LogisticRegression(max_iter=1000)\n",
    "        clf.fit(X_train, y_train)\n",
    "        calib_probs = clf.predict_proba(X_calib)[:, 1]\n",
    "        calib_nc = 1 - calib_probs\n",
    "        threshold_q = np.quantile(calib_nc[y_calib == 1], 1 - alpha)\n",
    "        val_probs = clf.predict_proba(X_val)[:, 1]\n",
    "        val_nc = 1 - val_probs\n",
    "        val_included = (val_nc <= threshold_q).astype(int)\n",
    "        ensemble_votes[:, label_idx, m] = val_included\n",
    "\n",
    "Y_pred = (np.sum(ensemble_votes, axis=2) >= ((n_models + 1) // 2)).astype(int)\n",
    "coverages = []\n",
    "set_sizes = []\n",
    "\n",
    "for i in range(n_val):\n",
    "    true = set(np.where(Y_val[i] == 1)[0])\n",
    "    pred = set(np.where(Y_pred[i] == 1)[0])\n",
    "    covered = len(true & pred) / max(len(true), 1)\n",
    "    coverages.append(covered)\n",
    "    set_sizes.append(len(pred))\n",
    "\n",
    "coverage = np.mean(coverages)\n",
    "avg_size = np.mean(set_sizes)\n",
    "macro_f1 = f1_score(Y_val, Y_pred, average=\"macro\")\n",
    "exact_match = (Y_val == Y_pred).all(axis=1).mean()\n",
    "\n",
    "print(\"\\n📊 Homogeneous CP Ensemble Results (LR):\")\n",
    "print(f\"Empirical Coverage:        {coverage:.4f}\")\n",
    "print(f\"Average Set Size:          {avg_size:.2f}\")\n",
    "print(f\"Macro-F1 Score:            {macro_f1:.4f}\")\n",
    "print(f\"Exact Match Accuracy:      {exact_match:.4f}\")\n",
    "marginal_coverages = []\n",
    "\n",
    "for j in range(n_labels):\n",
    "    true_positives = (Y_val[:, j] == 1)\n",
    "    predicted_positives = (Y_pred[:, j] == 1)\n",
    "    covered = np.logical_and(true_positives, predicted_positives)\n",
    "\n",
    "    if true_positives.sum() > 0:\n",
    "        marginal_coverages.append(covered.sum() / true_positives.sum())\n",
    "    else:\n",
    "        marginal_coverages.append(np.nan)\n",
    "\n",
    "marginal_coverage = np.nanmean(marginal_coverages)\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "55",
   "metadata": {},
   "source": [
    "## Average-weighted conformal ensemble with logistic regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "56",
   "metadata": {},
   "outputs": [],
   "source": [
    "n_models = 5\n",
    "alpha = 0.1\n",
    "X = np.load(\"clip_train_embeddings.npy\")\n",
    "X_val = np.load(\"clip_val_embeddings.npy\")\n",
    "\n",
    "df_train = pd.read_csv(\"coco_multilabel_train_encoded.csv\")\n",
    "df_val = pd.read_csv(\"coco_multilabel_val_encoded.csv\")\n",
    "label_columns = [col for col in df_train.columns if col != \"filename\"]\n",
    "\n",
    "Y = df_train[label_columns].values\n",
    "Y_val = df_val[label_columns].values\n",
    "\n",
    "n_labels = len(label_columns)\n",
    "n_val = Y_val.shape[0]\n",
    "val_probs_all = np.zeros((n_val, n_labels, n_models))\n",
    "calib_scores = {label_idx: [] for label_idx in range(n_labels)}\n",
    "\n",
    "print(f\"🚀 Training Homogeneous CP Ensemble (LR × {n_models}) with Averaged Probabilities...\")\n",
    "\n",
    "for label_idx, label in enumerate(tqdm(label_columns)):\n",
    "    for m in range(n_models):\n",
    "        idxs = np.random.choice(len(X), len(X), replace=True)\n",
    "        X_boot = X[idxs]\n",
    "        y_boot = Y[idxs, label_idx]\n",
    "        X_train, X_calib, y_train, y_calib = train_test_split(\n",
    "            X_boot, y_boot, test_size=0.3, random_state=m\n",
    "        )\n",
    "        clf = LogisticRegression(max_iter=1000)\n",
    "        clf.fit(X_train, y_train)\n",
    "        calib_probs = clf.predict_proba(X_calib)[:, 1]\n",
    "        nc = 1 - calib_probs\n",
    "        nc_pos = nc[y_calib == 1]\n",
    "        calib_scores[label_idx].extend(nc_pos.tolist())\n",
    "        val_probs = clf.predict_proba(X_val)[:, 1]\n",
    "        val_probs_all[:, label_idx, m] = val_probs\n",
    "thresholds = {}\n",
    "for label_idx in range(n_labels):\n",
    "    scores = calib_scores[label_idx]\n",
    "    if len(scores) > 0:\n",
    "        thresholds[label_idx] = np.quantile(scores, 1 - alpha)\n",
    "    else:\n",
    "        thresholds[label_idx] = 1.0 \n",
    "\n",
    "Y_pred = np.zeros((n_val, n_labels), dtype=int)\n",
    "for label_idx in range(n_labels):\n",
    "    avg_probs = np.mean(val_probs_all[:, label_idx, :], axis=1)\n",
    "    avg_nc = 1 - avg_probs\n",
    "    Y_pred[:, label_idx] = (avg_nc <= thresholds[label_idx]).astype(int)\n",
    "\n",
    "def coverage(y_true, y_pred):\n",
    "    return np.mean([\n",
    "        any(y_pred[i, j] and y_true[i, j] for j in range(y_true.shape[1]))\n",
    "        for i in range(y_true.shape[0])\n",
    "    ])\n",
    "\n",
    "def avg_set_size(y_pred):\n",
    "    return np.mean(np.sum(y_pred, axis=1))\n",
    "\n",
    "macro_f1 = f1_score(Y_val, Y_pred, average=\"macro\", zero_division=0)\n",
    "macro_prec = precision_score(Y_val, Y_pred, average=\"macro\", zero_division=0)\n",
    "macro_rec = recall_score(Y_val, Y_pred, average=\"macro\", zero_division=0)\n",
    "subset_acc = accuracy_score(Y_val, Y_pred)\n",
    "cov = coverage(Y_val, Y_pred)\n",
    "set_sz = avg_set_size(Y_pred)\n",
    "\n",
    "print(\"\\n=== Averaged CP Ensemble Results ===\")\n",
    "print(f\"Macro-F1        : {macro_f1:.4f}\")\n",
    "print(f\"Macro-Precision : {macro_prec:.4f}\")\n",
    "print(f\"Macro-Recall    : {macro_rec:.4f}\")\n",
    "print(f\"Subset Accuracy : {subset_acc:.4f}\")\n",
    "print(f\"Coverage        : {cov:.4f}\")\n",
    "print(f\"Avg Set Size    : {set_sz:.2f}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "57",
   "metadata": {},
   "outputs": [],
   "source": [
    "coverages = []\n",
    "set_sizes = []\n",
    "\n",
    "for i in range(n_val):\n",
    "    true = set(np.where(Y_val[i] == 1)[0])\n",
    "    pred = set(np.where(Y_pred[i] == 1)[0])\n",
    "    covered = len(true & pred) / max(len(true), 1)\n",
    "    coverages.append(covered)\n",
    "    set_sizes.append(len(pred))\n",
    "\n",
    "coverage = np.mean(coverages)\n",
    "avg_size = np.mean(set_sizes)\n",
    "macro_f1 = f1_score(Y_val, Y_pred, average=\"macro\")\n",
    "exact_match = (Y_val == Y_pred).all(axis=1).mean()\n",
    "\n",
    "print(\"\\n📊 Homogeneous CP Ensemble Results (LR):\")\n",
    "print(f\"Empirical Coverage:        {coverage:.4f}\")\n",
    "print(f\"Average Set Size:          {avg_size:.2f}\")\n",
    "print(f\"Macro-F1 Score:            {macro_f1:.4f}\")\n",
    "print(f\"Exact Match Accuracy:      {exact_match:.4f}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "58",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score\n",
    "n_models = 5\n",
    "alpha = 0.1\n",
    "X = np.load(\"clip_train_embeddings.npy\")\n",
    "X_val = np.load(\"clip_val_embeddings.npy\")\n",
    "\n",
    "df_train = pd.read_csv(\"coco_multilabel_train_encoded.csv\")\n",
    "df_val = pd.read_csv(\"coco_multilabel_val_encoded.csv\")\n",
    "label_columns = [col for col in df_train.columns if col != \"filename\"]\n",
    "\n",
    "Y = df_train[label_columns].values\n",
    "Y_val = df_val[label_columns].values\n",
    "\n",
    "n_labels = len(label_columns)\n",
    "n_val = Y_val.shape[0]\n",
    "val_probs_all = np.zeros((n_val, n_labels, n_models))\n",
    "calib_scores = {label_idx: [] for label_idx in range(n_labels)}\n",
    "\n",
    "print(f\"🚀 Training Homogeneous CP Ensemble (LR × {n_models}) with Averaged Probabilities...\")\n",
    "\n",
    "for label_idx, label in enumerate(tqdm(label_columns)):\n",
    "    for m in range(n_models):\n",
    "        idxs = np.random.choice(len(X), len(X), replace=True)\n",
    "        X_boot = X[idxs]\n",
    "        y_boot = Y[idxs, label_idx]\n",
    "        X_train, X_calib, y_train, y_calib = train_test_split(\n",
    "            X_boot, y_boot, test_size=0.3, random_state=m\n",
    "        )\n",
    "        clf = LogisticRegression(max_iter=1000)\n",
    "        clf.fit(X_train, y_train)\n",
    "        calib_probs = clf.predict_proba(X_calib)[:, 1]\n",
    "        nc = 1 - calib_probs\n",
    "        nc_pos = nc[y_calib == 1]\n",
    "        calib_scores[label_idx].extend(nc_pos.tolist())\n",
    "        val_probs = clf.predict_proba(X_val)[:, 1]\n",
    "        val_probs_all[:, label_idx, m] = val_probs\n",
    "\n",
    "thresholds = {}\n",
    "for label_idx in range(n_labels):\n",
    "    scores = calib_scores[label_idx]\n",
    "    if len(scores) > 0:\n",
    "        thresholds[label_idx] = np.quantile(scores, 1 - alpha)\n",
    "    else:\n",
    "        thresholds[label_idx] = 1.0 \n",
    "Y_pred = np.zeros((n_val, n_labels), dtype=int)\n",
    "for label_idx in range(n_labels):\n",
    "    avg_probs = np.mean(val_probs_all[:, label_idx, :], axis=1)\n",
    "    avg_nc = 1 - avg_probs\n",
    "    Y_pred[:, label_idx] = (avg_nc <= thresholds[label_idx]).astype(int)\n",
    "\n",
    "def coverage(y_true, y_pred):\n",
    "    return np.mean([\n",
    "        any(y_pred[i, j] and y_true[i, j] for j in range(y_true.shape[1]))\n",
    "        for i in range(y_true.shape[0])\n",
    "    ])\n",
    "\n",
    "def avg_set_size(y_pred):\n",
    "    return np.mean(np.sum(y_pred, axis=1))\n",
    "\n",
    "macro_f1 = f1_score(Y_val, Y_pred, average=\"macro\", zero_division=0)\n",
    "macro_prec = precision_score(Y_val, Y_pred, average=\"macro\", zero_division=0)\n",
    "macro_rec = recall_score(Y_val, Y_pred, average=\"macro\", zero_division=0)\n",
    "subset_acc = accuracy_score(Y_val, Y_pred)\n",
    "cov = coverage(Y_val, Y_pred)\n",
    "set_sz = avg_set_size(Y_pred)\n",
    "\n",
    "print(\"\\n=== Averaged CP Ensemble Results ===\")\n",
    "print(f\"Macro-F1        : {macro_f1:.4f}\")\n",
    "print(f\"Macro-Precision : {macro_prec:.4f}\")\n",
    "print(f\"Macro-Recall    : {macro_rec:.4f}\")\n",
    "print(f\"Subset Accuracy : {subset_acc:.4f}\")\n",
    "print(f\"Coverage        : {cov:.4f}\")\n",
    "print(f\"Avg Set Size    : {set_sz:.2f}\")\n",
    "coverages = []\n",
    "set_sizes = []\n",
    "\n",
    "for i in range(n_val):\n",
    "    true = set(np.where(Y_val[i] == 1)[0])\n",
    "    pred = set(np.where(Y_pred[i] == 1)[0])\n",
    "    covered = len(true & pred) / max(len(true), 1)\n",
    "    coverages.append(covered)\n",
    "    set_sizes.append(len(pred))\n",
    "\n",
    "coverage = np.mean(coverages)\n",
    "avg_size = np.mean(set_sizes)\n",
    "\n",
    "macro_f1 = f1_score(Y_val, Y_pred, average=\"macro\")\n",
    "exact_match = (Y_val == Y_pred).all(axis=1).mean()\n",
    "\n",
    "print(\"\\n📊 Homogeneous CP Ensemble Results (LR):\")\n",
    "print(f\"Empirical Coverage:        {coverage:.4f}\")\n",
    "print(f\"Average Set Size:          {avg_size:.2f}\")\n",
    "print(f\"Macro-F1 Score:            {macro_f1:.4f}\")\n",
    "print(f\"Exact Match Accuracy:      {exact_match:.4f}\")\n",
    "marginal_coverages = []\n",
    "for j in range(n_labels):\n",
    "    true_positives = (Y_val[:, j] == 1)\n",
    "    predicted_as_positive = (Y_pred[:, j] == 1)\n",
    "    covered = np.logical_and(true_positives, predicted_as_positive)\n",
    "    \n",
    "    if true_positives.sum() > 0:\n",
    "        marginal_coverages.append(covered.sum() / true_positives.sum())\n",
    "    else:\n",
    "        marginal_coverages.append(np.nan) \n",
    "\n",
    "marginal_coverage = np.nanmean(marginal_coverages)\n",
    "print(f\"Marginal Coverage:         {marginal_coverage:.4f}\")\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "59",
   "metadata": {},
   "source": [
    "## Heterogeneous"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "60",
   "metadata": {},
   "source": [
    "## exactly 3 models one MLP one LR and one SGD"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "61",
   "metadata": {},
   "outputs": [],
   "source": [
    "alpha = 0.1\n",
    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
    "X = np.load(\"clip_train_embeddings.npy\")\n",
    "X_val = np.load(\"clip_val_embeddings.npy\")\n",
    "\n",
    "df_train = pd.read_csv(\"coco_multilabel_train_encoded.csv\")\n",
    "df_val = pd.read_csv(\"coco_multilabel_val_encoded.csv\")\n",
    "label_columns = [col for col in df_train.columns if col != \"filename\"]\n",
    "\n",
    "Y = df_train[label_columns].values\n",
    "Y_val = df_val[label_columns].values\n",
    "\n",
    "n_labels = len(label_columns)\n",
    "n_val = Y_val.shape[0]\n",
    "class MLP(nn.Module):\n",
    "    def __init__(self, input_dim):\n",
    "        super().__init__()\n",
    "        self.net = nn.Sequential(\n",
    "            nn.Linear(input_dim, 256),\n",
    "            nn.ReLU(),\n",
    "            nn.Linear(256, 1),\n",
    "            nn.Sigmoid()\n",
    "        )\n",
    "    def forward(self, x):\n",
    "        return self.net(x)\n",
    "\n",
    "def train_mlp(X, y):\n",
    "    model = MLP(X.shape[1]).to(device)\n",
    "    X_t = torch.tensor(X, dtype=torch.float32).to(device)\n",
    "    y_t = torch.tensor(y, dtype=torch.float32).unsqueeze(1).to(device)\n",
    "    loader = DataLoader(TensorDataset(X_t, y_t), batch_size=128, shuffle=True)\n",
    "    opt = torch.optim.Adam(model.parameters(), lr=1e-3)\n",
    "    loss_fn = nn.BCELoss()\n",
    "    model.train()\n",
    "    for _ in range(10):\n",
    "        for xb, yb in loader:\n",
    "            opt.zero_grad()\n",
    "            loss_fn(model(xb), yb).backward()\n",
    "            opt.step()\n",
    "    return model\n",
    "print(f\"🚀 Training Fixed 3-Model CP Ensemble (1×LR, 1×SGD, 1×MLP) per label...\")\n",
    "votes = np.zeros((n_val, n_labels, 3), dtype=int)\n",
    "model_types = [\"lr\", \"sgd\", \"mlp\"]\n",
    "\n",
    "for j, label in enumerate(tqdm(label_columns)):\n",
    "    for m, model_type in enumerate(model_types):\n",
    "        idxs = np.random.choice(len(X), len(X), replace=True)\n",
    "        X_boot = X[idxs]\n",
    "        y_boot = Y[idxs, j]\n",
    "        X_train, X_calib, y_train, y_calib = train_test_split(X_boot, y_boot, test_size=0.3, random_state=m)\n",
    "\n",
    "        if model_type == \"lr\":\n",
    "            model = LogisticRegression(max_iter=1000)\n",
    "            model.fit(X_train, y_train)\n",
    "            calib_probs = model.predict_proba(X_calib)[:, 1]\n",
    "            val_probs = model.predict_proba(X_val)[:, 1]\n",
    "\n",
    "        elif model_type == \"sgd\":\n",
    "            model = SGDClassifier(loss=\"log_loss\", max_iter=1000)\n",
    "            model.fit(X_train, y_train)\n",
    "            calib_probs = model.predict_proba(X_calib)[:, 1]\n",
    "            val_probs = model.predict_proba(X_val)[:, 1]\n",
    "\n",
    "        elif model_type == \"mlp\":\n",
    "            model = train_mlp(X_train, y_train)\n",
    "            with torch.no_grad():\n",
    "                calib_probs = model(torch.tensor(X_calib, dtype=torch.float32).to(device)).cpu().numpy().squeeze()\n",
    "                val_probs = model(torch.tensor(X_val, dtype=torch.float32).to(device)).cpu().numpy().squeeze()\n",
    "        calib_nc = 1 - calib_probs\n",
    "        threshold_q = np.quantile(calib_nc[y_calib == 1], 1 - alpha)\n",
    "        val_nc = 1 - val_probs\n",
    "        votes[:, j, m] = (val_nc <= threshold_q).astype(int)\n",
    "\n",
    "Y_pred = (votes.sum(axis=2) >= 2).astype(int) \n",
    "coverages = []\n",
    "set_sizes = []\n",
    "\n",
    "for i in range(n_val):\n",
    "    true = set(np.where(Y_val[i] == 1)[0])\n",
    "    pred = set(np.where(Y_pred[i] == 1)[0])\n",
    "    covered = len(true & pred) / max(len(true), 1)\n",
    "    coverages.append(covered)\n",
    "    set_sizes.append(len(pred))\n",
    "\n",
    "coverage = np.mean(coverages)\n",
    "avg_size = np.mean(set_sizes)\n",
    "macro_f1 = f1_score(Y_val, Y_pred, average=\"macro\")\n",
    "exact_match = (Y_val == Y_pred).all(axis=1).mean()\n",
    "print(\"\\n📊 COCO: 3-Model Heterogeneous CP Ensemble\")\n",
    "print(f\"Empirical Coverage:           {coverage:.4f}\")\n",
    "print(f\"Average Prediction Set Size:  {avg_size:.2f}\")\n",
    "print(f\"Macro-F1 Score:               {macro_f1:.4f}\")\n",
    "print(f\"Exact Match Accuracy:         {exact_match:.4f}\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "62",
   "metadata": {},
   "source": [
    "## Statistic Significance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "63",
   "metadata": {},
   "outputs": [],
   "source": [
    "class MLP(nn.Module):\n",
    "    def __init__(self, input_dim):\n",
    "        super().__init__()\n",
    "        self.net = nn.Sequential(\n",
    "            nn.Linear(input_dim, 256),\n",
    "            nn.ReLU(),\n",
    "            nn.Linear(256, 1),\n",
    "            nn.Sigmoid()\n",
    "        )\n",
    "    def forward(self, x): return self.net(x)\n",
    "\n",
    "def train_mlp(X, y, device):\n",
    "    model = MLP(X.shape[1]).to(device)\n",
    "    loader = DataLoader(\n",
    "        TensorDataset(torch.FloatTensor(X).to(device),\n",
    "                      torch.FloatTensor(y).unsqueeze(1).to(device)),\n",
    "        batch_size=128, shuffle=True\n",
    "    )\n",
    "    opt = torch.optim.Adam(model.parameters(), lr=1e-3)\n",
    "    loss_fn = nn.BCELoss()\n",
    "    model.train()\n",
    "    for _ in range(10):\n",
    "        for xb, yb in loader:\n",
    "            opt.zero_grad()\n",
    "            loss_fn(model(xb), yb).backward()\n",
    "            opt.step()\n",
    "    return model\n",
    "\n",
    "def run_hetero_cp(seed, alpha=0.1):\n",
    "    random.seed(seed)\n",
    "    np.random.seed(seed)\n",
    "    torch.manual_seed(seed)\n",
    "    if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)\n",
    "    X      = np.load(\"clip_train_embeddings.npy\")\n",
    "    X_val  = np.load(\"clip_val_embeddings.npy\")\n",
    "    df_tr  = pd.read_csv(\"coco_multilabel_train_encoded.csv\")\n",
    "    df_val = pd.read_csv(\"coco_multilabel_val_encoded.csv\")\n",
    "    labels = [c for c in df_tr.columns if c!=\"filename\"]\n",
    "    Y      = df_tr[labels].values\n",
    "    Y_val  = df_val[labels].values\n",
    "\n",
    "    n_labels = Y.shape[1]\n",
    "    n_val    = Y_val.shape[0]\n",
    "    device   = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
    "    votes = np.zeros((n_val, n_labels, 3), dtype=int)\n",
    "    model_types = [\"lr\",\"sgd\",\"mlp\"]\n",
    "    for j in range(n_labels):\n",
    "        for m, mtype in enumerate(model_types):\n",
    "            idxs   = np.random.choice(len(X), len(X), replace=True)\n",
    "            X_boot = X[idxs]; y_boot = Y[idxs,j]\n",
    "            Xt, Xc, yt, yc = train_test_split(X_boot, y_boot, test_size=0.3, random_state=seed+m)\n",
    "            if mtype==\"lr\":\n",
    "                mdl = LogisticRegression(max_iter=1000).fit(Xt,yt)\n",
    "                calib_p = mdl.predict_proba(Xc)[:,1]\n",
    "                val_p   = mdl.predict_proba(X_val)[:,1]\n",
    "            elif mtype==\"sgd\":\n",
    "                mdl = SGDClassifier(loss=\"log_loss\", max_iter=1000).fit(Xt,yt)\n",
    "                calib_p = mdl.predict_proba(Xc)[:,1]\n",
    "                val_p   = mdl.predict_proba(X_val)[:,1]\n",
    "            else: \n",
    "                mdl = train_mlp(Xt, yt, device)\n",
    "                with torch.no_grad():\n",
    "                    calib_p = mdl(torch.FloatTensor(Xc).to(device)).cpu().numpy().squeeze()\n",
    "                    val_p   = mdl(torch.FloatTensor(X_val).to(device)).cpu().numpy().squeeze()\n",
    "            nc_c = 1 - calib_p\n",
    "            thresh = np.quantile(nc_c[yc==1], 1-alpha) if (yc==1).sum()>0 else 1.0\n",
    "            votes[:,j,m] = ((1 - val_p) <= thresh).astype(int)\n",
    "    Y_pred = (votes.sum(axis=2) >= 2).astype(int)\n",
    "    return f1_score(Y_val, Y_pred, average=\"macro\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "64",
   "metadata": {},
   "outputs": [],
   "source": [
    "f1_ensemble"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "65",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Single   CP: {:.3f} ± {:.3f}\".format(np.mean(f1_single),   np.std(f1_single,   ddof=1)))\n",
    "print(\"Ensemble CP: {:.3f} ± {:.3f}\".format(np.mean(f1_ensemble), np.std(f1_ensemble, ddof=1)))\n",
    "stat, p = wilcoxon(f1_single, f1_ensemble)\n",
    "print(f\"Wilcoxon p-value = {p:.3f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "66",
   "metadata": {},
   "outputs": [],
   "source": [
    "alphas = [0.01, 0.05, 0.1, 0.2]\n",
    "results = []\n",
    "\n",
    "for alpha in alphas:\n",
    "    print(f\"\\n🔁 Running for alpha = {alpha}\")\n",
    "    nc_scores = np.zeros((n_val, n_labels, n_models))\n",
    "\n",
    "    for j, label in enumerate(tqdm(label_columns)):\n",
    "        for m in range(n_models):\n",
    "            idxs = np.random.choice(len(X), len(X), replace=True)\n",
    "            X_boot = X[idxs]\n",
    "            y_boot = Y[idxs, j]\n",
    "            X_train, X_calib, y_train, y_calib = train_test_split(X_boot, y_boot, test_size=0.3, random_state=m)\n",
    "            model_type = random.choice(model_choices)\n",
    "\n",
    "            if model_type == \"lr\":\n",
    "                model = LogisticRegression(max_iter=1000)\n",
    "                model.fit(X_train, y_train)\n",
    "                calib_probs = model.predict_proba(X_calib)[:, 1]\n",
    "                val_probs = model.predict_proba(X_val)[:, 1]\n",
    "            elif model_type == \"sgd\":\n",
    "                model = SGDClassifier(loss=\"log_loss\", max_iter=1000)\n",
    "                model.fit(X_train, y_train)\n",
    "                calib_probs = model.predict_proba(X_calib)[:, 1]\n",
    "                val_probs = model.predict_proba(X_val)[:, 1]\n",
    "            elif model_type == \"mlp\":\n",
    "                model = train_mlp(X_train, y_train)\n",
    "                with torch.no_grad():\n",
    "                    calib_probs = model(torch.tensor(X_calib, dtype=torch.float32).to(device)).cpu().numpy().squeeze()\n",
    "                    val_probs = model(torch.tensor(X_val, dtype=torch.float32).to(device)).cpu().numpy().squeeze()\n",
    "\n",
    "            calib_nc = 1 - calib_probs\n",
    "            q = np.quantile(calib_nc[y_calib == 1], 1 - alpha)\n",
    "            val_nc = 1 - val_probs\n",
    "            nc_scores[:, j, m] = val_nc <= q \n",
    "\n",
    "    Y_pred = (nc_scores.sum(axis=2) >= (n_models // 2 + 1)).astype(int)\n",
    "    coverages = []\n",
    "    for i in range(n_val):\n",
    "        true = set(np.where(Y_val[i] == 1)[0])\n",
    "        pred = set(np.where(Y_pred[i] == 1)[0])\n",
    "        if len(true) > 0:\n",
    "            coverages.append(len(true & pred) / len(true))\n",
    "        else:\n",
    "            coverages.append(1.0)\n",
    "    coverage = np.mean(coverages)\n",
    "\n",
    "    macro_f1 = f1_score(Y_val, Y_pred, average=\"macro\")\n",
    "\n",
    "    results.append({\n",
    "        \"alpha\": alpha,\n",
    "        \"target_coverage\": 1 - alpha,\n",
    "        \"empirical_coverage\": coverage,\n",
    "        \"macro_f1\": macro_f1\n",
    "    })\n",
    "\n",
    "df_ablation = pd.DataFrame(results)\n",
    "print(df_ablation.to_string(index=False))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "67",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
    "plt.plot(df_ablation[\"alpha\"], df_ablation[\"empirical_coverage\"], label=\"Empirical Coverage\")\n",
    "plt.plot(df_ablation[\"alpha\"], df_ablation[\"macro_f1\"], label=\"Macro-F1\")\n",
    "plt.xlabel(\"Alpha (Significance Level)\")\n",
    "plt.ylabel(\"Metric Value\")\n",
    "plt.title(\"Alpha vs Coverage and F1\")\n",
    "plt.legend()\n",
    "plt.grid(True)\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "68",
   "metadata": {},
   "outputs": [],
   "source": [
    "n_models = 5\n",
    "alpha = 0.1\n",
    "top_k = 3\n",
    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
    "X = np.load(\"clip_train_embeddings.npy\")\n",
    "X_val = np.load(\"clip_val_embeddings.npy\")\n",
    "\n",
    "df_train = pd.read_csv(\"coco_multilabel_train_encoded.csv\")\n",
    "df_val = pd.read_csv(\"coco_multilabel_val_encoded.csv\")\n",
    "label_columns = [col for col in df_train.columns if col != \"filename\"]\n",
    "\n",
    "Y = df_train[label_columns].values\n",
    "Y_val = df_val[label_columns].values\n",
    "n_val = Y_val.shape[0]\n",
    "n_labels = len(label_columns)\n",
    "class MLP(nn.Module):\n",
    "    def __init__(self, input_dim):\n",
    "        super().__init__()\n",
    "        self.net = nn.Sequential(\n",
    "            nn.Linear(input_dim, 256),\n",
    "            nn.ReLU(),\n",
    "            nn.Linear(256, 1),\n",
    "            nn.Sigmoid()\n",
    "        )\n",
    "    def forward(self, x): return self.net(x)\n",
    "\n",
    "def train_mlp(X, y):\n",
    "    model = MLP(X.shape[1]).to(device)\n",
    "    X_t = torch.tensor(X, dtype=torch.float32).to(device)\n",
    "    y_t = torch.tensor(y, dtype=torch.float32).unsqueeze(1).to(device)\n",
    "    loader = DataLoader(TensorDataset(X_t, y_t), batch_size=128, shuffle=True)\n",
    "    opt = torch.optim.Adam(model.parameters(), lr=1e-3)\n",
    "    loss_fn = nn.BCELoss()\n",
    "    model.train()\n",
    "    for _ in range(10):\n",
    "        for xb, yb in loader:\n",
    "            opt.zero_grad()\n",
    "            loss = loss_fn(model(xb), yb)\n",
    "            loss.backward()\n",
    "            opt.step()\n",
    "    return model\n",
    "\n",
    "nc_scores = np.zeros((n_val, n_labels, n_models))\n",
    "model_choices = [\"lr\", \"sgd\", \"mlp\"]\n",
    "\n",
    "alpha_values = [0.25, 0.3]\n",
    "ablation_results = []\n",
    "\n",
    "for alpha in alpha_values:\n",
    "    print(f\"\\n🔁 Running Heterogeneous CP Ensemble for α = {alpha}\")\n",
    "    prediction_sets = np.zeros((n_val, n_labels, n_models))\n",
    "\n",
    "    for label_idx, label in enumerate(tqdm(label_columns)):\n",
    "        for model_idx in range(n_models):\n",
    "            idxs = np.random.choice(len(X), len(X), replace=True)\n",
    "            X_boot = X[idxs]\n",
    "            y_boot = Y[idxs, label_idx]\n",
    "\n",
    "            X_train, X_calib, y_train, y_calib = train_test_split(\n",
    "                X_boot, y_boot, test_size=0.3, random_state=model_idx\n",
    "            )\n",
    "\n",
    "            model_type = random.choice(model_choices)\n",
    "\n",
    "            if model_type == \"lr\":\n",
    "                model = LogisticRegression(max_iter=1000)\n",
    "                model.fit(X_train, y_train)\n",
    "                calib_probs = model.predict_proba(X_calib)[:, 1]\n",
    "                val_probs = model.predict_proba(X_val)[:, 1]\n",
    "\n",
    "            elif model_type == \"sgd\":\n",
    "                model = SGDClassifier(loss=\"log_loss\", max_iter=1000)\n",
    "                model.fit(X_train, y_train)\n",
    "                calib_probs = model.predict_proba(X_calib)[:, 1]\n",
    "                val_probs = model.predict_proba(X_val)[:, 1]\n",
    "\n",
    "            elif model_type == \"mlp\":\n",
    "                model = train_mlp(X_train, y_train)\n",
    "                with torch.no_grad():\n",
    "                    calib_probs = model(torch.tensor(X_calib, dtype=torch.float32).to(device)).cpu().numpy().squeeze()\n",
    "                    val_probs = model(torch.tensor(X_val, dtype=torch.float32).to(device)).cpu().numpy().squeeze()\n",
    "\n",
    "            calib_nc = 1 - calib_probs\n",
    "            q_hat = np.quantile(calib_nc[y_calib == 1], 1 - alpha)\n",
    "            val_nc = 1 - val_probs\n",
    "            prediction_sets[:, label_idx, model_idx] = (val_nc <= q_hat)\n",
    "    Y_pred = (prediction_sets.sum(axis=2) >= (n_models // 2 + 1)).astype(int)\n",
    "    image_coverages = []\n",
    "    for i in range(n_val):\n",
    "        true_labels = set(np.where(Y_val[i] == 1)[0])\n",
    "        pred_labels = set(np.where(Y_pred[i] == 1)[0])\n",
    "        if len(true_labels) > 0:\n",
    "            image_coverages.append(len(true_labels & pred_labels) / len(true_labels))\n",
    "        else:\n",
    "            image_coverages.append(1.0)\n",
    "\n",
    "    empirical_coverage = np.mean(image_coverages)\n",
    "    macro_f1 = f1_score(Y_val, Y_pred, average=\"macro\")\n",
    "\n",
    "    ablation_results.append({\n",
    "        \"alpha\": alpha,\n",
    "        \"target_coverage\": 1 - alpha,\n",
    "        \"empirical_coverage\": empirical_coverage,\n",
    "        \"macro_f1\": macro_f1\n",
    "    })\n",
    "    df_ablation_2 = pd.DataFrame(ablation_results)\n",
    "    print(df_ablation_2.to_string(index=False))\n",
    "\n",
    "df_ablation_2 = pd.DataFrame(ablation_results)\n",
    "print(df_ablation_2.to_string(index=False))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "69",
   "metadata": {},
   "source": [
    "## Heterogeneous with SOTA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "70",
   "metadata": {},
   "outputs": [],
   "source": [
    "alpha = 0.1\n",
    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
    "X = np.load(\"clip_train_embeddings.npy\")\n",
    "X_val = np.load(\"clip_val_embeddings.npy\")\n",
    "df_train = pd.read_csv(\"coco_multilabel_train_encoded.csv\")\n",
    "df_val = pd.read_csv(\"coco_multilabel_val_encoded.csv\")\n",
    "\n",
    "label_columns = [col for col in df_train.columns if col != \"filename\"]\n",
    "Y = df_train[label_columns].values.astype(np.float32)\n",
    "Y_val = df_val[label_columns].values.astype(np.float32)\n",
    "\n",
    "n_labels = len(label_columns)\n",
    "n_val = Y_val.shape[0]\n",
    "\n",
    "# 1. RNN\n",
    "class RNN(nn.Module):\n",
    "    def __init__(self, input_dim, hidden_dim=128, num_labels=1):\n",
    "        super().__init__()\n",
    "        self.rnn = nn.LSTM(input_dim, hidden_dim, batch_first=True)\n",
    "        self.classifier = nn.Linear(hidden_dim, num_labels)\n",
    "    def forward(self, x):\n",
    "        x = x.unsqueeze(1)\n",
    "        h, _ = self.rnn(x)\n",
    "        return torch.sigmoid(self.classifier(h[:, -1]))\n",
    "\n",
    "# 2. Transformer-based\n",
    "class Transformer(nn.Module):\n",
    "    def __init__(self, input_dim, num_labels=1, n_heads=4):\n",
    "        super().__init__()\n",
    "        self.encoder_layer = nn.TransformerEncoderLayer(d_model=input_dim, nhead=n_heads)\n",
    "        self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=2)\n",
    "        self.classifier = nn.Linear(input_dim, num_labels)\n",
    "    def forward(self, x):\n",
    "        x = x.unsqueeze(1)  \n",
    "        x = self.encoder(x) \n",
    "        return torch.sigmoid(self.classifier(x.squeeze(1))) \n",
    "\n",
    "class MLPMixer(nn.Module):\n",
    "    def __init__(self, input_dim, hidden_dim=256):\n",
    "        super().__init__()\n",
    "        self.model = nn.Sequential(\n",
    "            nn.LayerNorm(input_dim),\n",
    "            nn.Linear(input_dim, hidden_dim),\n",
    "            nn.GELU(),\n",
    "            nn.Linear(hidden_dim, 1),\n",
    "            nn.Sigmoid()\n",
    "        )\n",
    "    def forward(self, x):\n",
    "        return self.model(x)\n",
    "\n",
    "\n",
    "def train_model(model, X, y, num_labels=1):\n",
    "    model = model.to(device)\n",
    "    X_t = torch.tensor(X, dtype=torch.float32).to(device)\n",
    "    y_t = torch.tensor(y, dtype=torch.float32).to(device)\n",
    "    if y.ndim == 1:\n",
    "        y_t = y_t.unsqueeze(1)\n",
    "    loader = DataLoader(TensorDataset(X_t, y_t), batch_size=128, shuffle=True)\n",
    "    opt = torch.optim.Adam(model.parameters(), lr=1e-3)\n",
    "    loss_fn = nn.BCELoss()\n",
    "    model.train()\n",
    "    for _ in range(10):\n",
    "        for xb, yb in loader:\n",
    "            opt.zero_grad()\n",
    "            loss = loss_fn(model(xb), yb)\n",
    "            loss.backward()\n",
    "            opt.step()\n",
    "    return model\n",
    "\n",
    "model_types = [\"rnn\", \"transformer\", \"mlpmixer\"]\n",
    "n_models = len(model_types)\n",
    "votes = np.zeros((n_val, n_labels, n_models))\n",
    "f1_scores = np.zeros((n_labels, n_models))\n",
    "\n",
    "print(\"🚀 Training CP Ensemble (RNN + Transformer + ML-GCN)\")\n",
    "\n",
    "for j in tqdm(range(n_labels)):\n",
    "    for m_idx, m_type in enumerate(model_types):\n",
    "        idxs = np.random.choice(len(X), len(X), replace=True)\n",
    "        X_boot = X[idxs]\n",
    "        y_boot = Y[idxs, j]\n",
    "        X_train, X_calib, y_train, y_calib = train_test_split(X_boot, y_boot, test_size=0.3, random_state=m_idx)\n",
    "        if m_type == \"rnn\":\n",
    "            model = RNN(X.shape[1])\n",
    "        elif m_type == \"transformer\":\n",
    "            model = Transformer(X.shape[1])\n",
    "        elif m_type == \"mlpmixer\":\n",
    "            model = MLPMixer(X.shape[1])\n",
    "\n",
    "        else:\n",
    "            continue\n",
    "\n",
    "        model = train_model(model, X_train, y_train)\n",
    "        with torch.no_grad():\n",
    "            val_probs = model(torch.tensor(X_val, dtype=torch.float32).to(device)).cpu().numpy().squeeze()\n",
    "            calib_probs = model(torch.tensor(X_calib, dtype=torch.float32).to(device)).cpu().numpy().squeeze()\n",
    "        calib_nc = 1 - calib_probs\n",
    "        threshold_q = np.quantile(calib_nc[y_calib == 1], 1 - alpha)\n",
    "        val_nc = 1 - val_probs\n",
    "        pred = (val_nc <= threshold_q).astype(int)\n",
    "        votes[:, j, m_idx] = pred\n",
    "        f1_scores[j, m_idx] = f1_score(Y_val[:, j], pred, zero_division=0)\n",
    "weights = f1_scores / (np.sum(f1_scores, axis=1, keepdims=True) + 1e-6)\n",
    "Y_pred = (np.sum(votes * weights[None, :, :], axis=2) >= 0.5).astype(int)\n",
    "coverages, set_sizes = [], []\n",
    "\n",
    "for i in range(n_val):\n",
    "    true = set(np.where(Y_val[i] == 1)[0])\n",
    "    pred = set(np.where(Y_pred[i] == 1)[0])\n",
    "    coverages.append(len(true & pred) / max(len(true), 1))\n",
    "    set_sizes.append(len(pred))\n",
    "\n",
    "macro_f1 = f1_score(Y_val, Y_pred, average=\"macro\")\n",
    "exact_match = (Y_val == Y_pred).all(axis=1).mean()\n",
    "coverage = np.mean(coverages)\n",
    "avg_size = np.mean(set_sizes)\n",
    "marginal_coverages = []\n",
    "for j in range(n_labels):\n",
    "    true_pos = Y_val[:, j] == 1\n",
    "    predicted = Y_pred[:, j] == 1\n",
    "    if np.sum(true_pos) > 0:\n",
    "        marginal_coverages.append(np.sum(true_pos & predicted) / np.sum(true_pos))\n",
    "    else:\n",
    "        marginal_coverages.append(np.nan)\n",
    "marginal_coverage = np.nanmean(marginal_coverages)\n",
    "print(\"\\n📊 CP Ensemble (RNN + Transformer + MLP mixer)\")\n",
    "print(f\"Empirical Coverage:           {coverage:.4f}\")\n",
    "print(f\"Marginal Coverage:            {marginal_coverage:.4f}\")\n",
    "print(f\"Average Prediction Set Size:  {avg_size:.2f}\")\n",
    "print(f\"Macro-F1 Score:               {macro_f1:.4f}\")\n",
    "print(f\"Exact Match Accuracy:         {exact_match:.4f}\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "71",
   "metadata": {},
   "source": [
    "## Ablation Study"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "72",
   "metadata": {},
   "outputs": [],
   "source": [
    "X = np.load(\"clip_train_embeddings.npy\")\n",
    "X_val = np.load(\"clip_val_embeddings.npy\")\n",
    "\n",
    "df_train = pd.read_csv(\"coco_multilabel_train_encoded.csv\")\n",
    "df_val = pd.read_csv(\"coco_multilabel_val_encoded.csv\")\n",
    "label_columns = [col for col in df_train.columns if col != \"filename\"]\n",
    "\n",
    "Y = df_train[label_columns].values\n",
    "Y_val = df_val[label_columns].values\n",
    "\n",
    "n_labels = len(label_columns)\n",
    "n_val = Y_val.shape[0]\n",
    "\n",
    "max_models = 10\n",
    "alpha = 0.1\n",
    "\n",
    "results = []\n",
    "\n",
    "for n_models in range(1, max_models + 1):\n",
    "    print(f\"\\n🚀 Running for M = {n_models} models...\")\n",
    "    ensemble_votes = np.zeros((n_val, n_labels, n_models), dtype=int)\n",
    "\n",
    "    for label_idx, label in enumerate(tqdm(label_columns, desc=f\"M={n_models}\")):\n",
    "        for m in range(n_models):\n",
    "            idxs = np.random.choice(len(X), len(X), replace=True)\n",
    "            X_boot = X[idxs]\n",
    "            y_boot = Y[idxs, label_idx]\n",
    "            X_train, X_calib, y_train, y_calib = train_test_split(\n",
    "                X_boot, y_boot, test_size=0.3, random_state=m\n",
    "            )\n",
    "            clf = LogisticRegression(max_iter=1000)\n",
    "            clf.fit(X_train, y_train)\n",
    "            calib_probs = clf.predict_proba(X_calib)[:, 1]\n",
    "            calib_nc = 1 - calib_probs\n",
    "            threshold_q = np.quantile(calib_nc[y_calib == 1], 1 - alpha)\n",
    "            val_probs = clf.predict_proba(X_val)[:, 1]\n",
    "            val_nc = 1 - val_probs\n",
    "            val_included = (val_nc <= threshold_q).astype(int)\n",
    "            ensemble_votes[:, label_idx, m] = val_included\n",
    "    Y_pred = (np.sum(ensemble_votes, axis=2) >= ((n_models + 1) // 2)).astype(int)\n",
    "    coverages = []\n",
    "    set_sizes = []\n",
    "\n",
    "    for i in range(n_val):\n",
    "        true = set(np.where(Y_val[i] == 1)[0])\n",
    "        pred = set(np.where(Y_pred[i] == 1)[0])\n",
    "        covered = len(true & pred) / max(len(true), 1)\n",
    "        coverages.append(covered)\n",
    "        set_sizes.append(len(pred))\n",
    "\n",
    "    coverage = np.mean(coverages)\n",
    "    avg_size = np.mean(set_sizes)\n",
    "    marginal_coverages = []\n",
    "    for j in range(Y_val.shape[1]):\n",
    "        true_positives = (Y_val[:, j] == 1)\n",
    "        predicted_as_positive = (Y_pred[:, j] == 1)\n",
    "        covered = np.logical_and(true_positives, predicted_as_positive)\n",
    "        \n",
    "        if true_positives.sum() > 0:\n",
    "            marginal_coverages.append(covered.sum() / true_positives.sum())\n",
    "        else:\n",
    "            marginal_coverages.append(np.nan)\n",
    "\n",
    "    marginal_coverages = np.array(marginal_coverages)\n",
    "    marginal_coverage_mean = np.nanmean(marginal_coverages)\n",
    "    macro_f1 = f1_score(Y_val, Y_pred, average=\"macro\")\n",
    "    exact_match = (Y_val == Y_pred).all(axis=1).mean()\n",
    "    results.append({\n",
    "        'n_models': n_models,\n",
    "        'empirical_coverage': coverage,\n",
    "        'avg_set_size': avg_size,\n",
    "        'macro_f1': macro_f1,\n",
    "        'exact_match': exact_match,\n",
    "        'marginal_coverage': marginal_coverage_mean\n",
    "    })\n",
    "\n",
    "    print(f\"\\n📊 Results for M = {n_models}:\")\n",
    "    print(f\"Empirical Coverage:        {coverage:.4f}\")\n",
    "    print(f\"Average Set Size:          {avg_size:.2f}\")\n",
    "    print(f\"Macro-F1 Score:            {macro_f1:.4f}\")\n",
    "    print(f\"Exact Match Accuracy:      {exact_match:.4f}\")\n",
    "    print(f\"Marginal Coverage:         {marginal_coverage_mean:.4f}\")\n",
    "print(\"\\n\\n📈 Final Results Summary:\")\n",
    "print(\"M\\tEmp.Cov\\tAvg.Size\\tMacro-F1\\tExactMatch\\tMarg.Cov\")\n",
    "for res in results:\n",
    "    print(f\"{res['n_models']}\\t{res['empirical_coverage']:.4f}\\t{res['avg_set_size']:.2f}\\t\\t\"\n",
    "          f\"{res['macro_f1']:.4f}\\t\\t{res['exact_match']:.4f}\\t\\t{res['marginal_coverage']:.4f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "73",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "sns.set(style=\"whitegrid\", context=\"talk\")\n",
    "\n",
    "ensemble_sizes = [1,2,3,4,5,6,7,8,9,10]\n",
    "empirical_coverage = [0.8909, 0.9168, 0.8957, 0.9085, 0.8961, 0.9039, 0.8962, 0.9029, 0.8973, 0.9011]\n",
    "avg_set_size = [7.39, 8.96, 7.25, 8.02, 7.23, 7.66, 7.15, 7.51, 7.18, 7.39]\n",
    "macro_f1 = [0.5458, 0.5145, 0.5630, 0.5428, 0.5677, 0.5556, 0.5697, 0.5593, 0.5701, 0.5643]\n",
    "marginal_coverage = [0.8645, 0.9013, 0.8733, 0.8915, 0.8725, 0.8832, 0.8736, 0.8835, 0.8753, 0.8812]\n",
    "fig, axes = plt.subplots(1, 2, figsize=(13, 5), sharex=True)\n",
    "axes[0].plot(ensemble_sizes, empirical_coverage, '-o', label='Empirical Coverage', linewidth=2.5)\n",
    "axes[0].plot(ensemble_sizes, marginal_coverage, '-s', label='Marginal Coverage', linewidth=2.5)\n",
    "axes[0].set_ylabel('Coverage', fontsize=13)\n",
    "axes[0].set_ylim(0.85, 0.93)\n",
    "axes[0].set_xlabel('Ensemble Size (M)', fontsize=13)\n",
    "axes[0].legend(fontsize=11)\n",
    "axes[0].set_title('Coverage vs Ensemble Size', fontsize=13)\n",
    "axes[0].tick_params(labelsize=11)\n",
    "ax_f1 = axes[1]\n",
    "ax_set = ax_f1.twinx()\n",
    "\n",
    "ln1 = ax_f1.plot(ensemble_sizes, macro_f1, '-^', label='Macro-F1', color='green', linewidth=2.5)\n",
    "ax_f1.set_ylabel('Macro-F1', fontsize=13, color='green')\n",
    "ax_f1.tick_params(axis='y', labelcolor='green', labelsize=11)\n",
    "ax_f1.set_ylim(0.50, 0.58)\n",
    "\n",
    "ln2 = ax_set.plot(ensemble_sizes, avg_set_size, '-d', label='Avg. Set Size', color='red', linewidth=2.5)\n",
    "ax_set.set_ylabel('Avg. Set Size', fontsize=13, color='red')\n",
    "ax_set.tick_params(axis='y', labelcolor='red', labelsize=11)\n",
    "ax_set.set_ylim(6.5, 9.5)\n",
    "\n",
    "ax_f1.set_xlabel('Ensemble Size (M)', fontsize=13)\n",
    "ax_f1.set_title('Macro-F1 and Set Size vs Ensemble Size', fontsize=13)\n",
    "lines = ln1 + ln2\n",
    "labels = [line.get_label() for line in lines]\n",
    "ax_f1.legend(lines, labels, loc='upper left', fontsize=11)\n",
    "\n",
    "plt.suptitle('Impact of Ensemble Size on Multilabel Prediction', fontsize=15, y=1.02)\n",
    "plt.tight_layout()\n",
    "plt.savefig('ensemble_size_dual_right_yaxis.png', dpi=300)\n",
    "plt.show()\n"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
