{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4c3d6797",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "import re\n",
    "\n",
    "# Locate the per-model result CSVs under ~/datasets.\n",
    "home = os.path.expanduser(\"~\")\n",
    "dataset_dir = os.path.join(home, \"datasets\")\n",
    "\n",
    "# os.listdir() returns names in arbitrary order, so sort them: the order of\n",
    "# csv_files fixes the insertion order of csv_dfs, which is iterated later.\n",
    "csv_files = sorted(\n",
    "    f for f in os.listdir(dataset_dir)\n",
    "    if f.endswith(\".csv\") and \"mmlu\" in f.lower() and \"new\" in f.lower()\n",
    ")\n",
    "\n",
    "# file name -> DataFrame with the raw contents of that file\n",
    "csv_dfs = {}\n",
    "for f in csv_files:\n",
    "    print(f)\n",
    "    csv_dfs[f] = pd.read_csv(os.path.join(dataset_dir, f), encoding='utf-8')\n",
    "\n",
    "# file name -> index labels of the rows whose last column holds no single A-D answer\n",
    "problem_rows_dict = {}\n",
    "\n",
    "valid_choices = {\"A\", \"B\", \"C\", \"D\"}\n",
    "# A capital A-D that starts at a word boundary and is not followed by a letter or digit.\n",
    "pattern = re.compile(r'\\b([A-D])(?![A-Za-z0-9])')\n",
    "\n",
    "for csv_file in csv_files:\n",
    "    df = csv_dfs[csv_file]\n",
    "    last_col = df.columns[-1]\n",
    "    invalid_indices = []\n",
    "\n",
    "    # Snapshot the column first because the frame is updated inside the loop.\n",
    "    for idx, raw_value in list(df[last_col].items()):\n",
    "        uniq = set(pattern.findall(str(raw_value).strip()))\n",
    "\n",
    "        if len(uniq) == 1 and next(iter(uniq)) in valid_choices:\n",
    "            # Exactly one distinct letter was found: store just that letter.\n",
    "            df.loc[idx, last_col] = next(iter(uniq))\n",
    "        else:\n",
    "            # No letter, or several different letters: the answer is ambiguous.\n",
    "            invalid_indices.append(idx)\n",
    "\n",
    "    problem_rows_dict[csv_file] = invalid_indices\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2d3e99d2",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# For every file, collect the row labels that passed answer validation, then keep\n",
    "# only the labels that are valid in all files.\n",
    "all_indices_sets = []\n",
    "for csv_file, problem_indices in problem_rows_dict.items():\n",
    "    # Use the frame that is already loaded in csv_dfs instead of re-reading the CSV:\n",
    "    # the problem indices were computed against exactly that frame, and a failed\n",
    "    # re-read can no longer silently drop a file from the intersection.\n",
    "    total_indices = set(csv_dfs[csv_file].index.tolist())\n",
    "    all_indices_sets.append(total_indices - set(problem_indices))\n",
    "\n",
    "if all_indices_sets:\n",
    "    common_valid_indices = set.intersection(*all_indices_sets)\n",
    "else:\n",
    "    # No files at all: there is nothing to intersect.\n",
    "    common_valid_indices = set()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "1a60fb02",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import accuracy_score\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3bbb0201",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import re\n",
    "\n",
    "# row label -> record holding the ground truth and one answer per model\n",
    "data_dict = {}\n",
    "\n",
    "# A set has no guaranteed iteration order; sort the labels so that the row order\n",
    "# of training_df is the same on every run.\n",
    "valid_row_labels = sorted(common_valid_indices)\n",
    "\n",
    "for csv_file, df in csv_dfs.items():\n",
    "    # Model name = file name without the extension and without the 'mmlu' / 'new'\n",
    "    # tags, with leftover '_' / '-' separators trimmed from both ends.\n",
    "    model_name = csv_file.replace(\".csv\", \"\")\n",
    "    model_name = re.sub(r\"(mmlu|new)\", \"\", model_name, flags=re.IGNORECASE).strip(\"_-\")\n",
    "\n",
    "    df_valid = df.loc[valid_row_labels].copy()\n",
    "    has_answer_col = \"answer\" in df.columns  # same for every row, so checked once per file\n",
    "\n",
    "    for idx, row in df_valid.iterrows():\n",
    "        # Ground truth: a missing column or a value outside A-D falls back to \"A\".\n",
    "        correct_answer = str(row[\"answer\"]).strip() if has_answer_col else \"A\"\n",
    "        if correct_answer not in {\"A\", \"B\", \"C\", \"D\"}:\n",
    "            correct_answer = \"A\"\n",
    "\n",
    "        # The model's answer is read from the last column, with the same fallback.\n",
    "        model_answer = str(row.iloc[-1]).strip()\n",
    "        if model_answer not in {\"A\", \"B\", \"C\", \"D\"}:\n",
    "            model_answer = \"A\"\n",
    "\n",
    "        # The first file that contains a row creates its record and fixes its ground truth.\n",
    "        record = data_dict.setdefault(idx, {\"idx\": idx, \"correct_answer\": correct_answer})\n",
    "        record[model_name] = model_answer\n",
    "\n",
    "training_df = pd.DataFrame(list(data_dict.values()))\n",
    "\n",
    "print(training_df.head())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0372a653",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Write the assembled table into the dataset folder, then summarise one model column.\n",
    "training_csv_path = os.path.join(dataset_dir, \"mmlu_training_data.csv\")\n",
    "training_df.to_csv(training_csv_path, index=False)\n",
    "training_df['qwen3-4B'].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c662ca41",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "\n",
    "# The table is written to ~/datasets/mmlu_training_data.csv, so load it from that\n",
    "# same location; a bare relative path only resolves when the kernel's working\n",
    "# directory happens to be that folder.\n",
    "training_csv_path = os.path.join(os.path.expanduser(\"~\"), \"datasets\", \"mmlu_training_data.csv\")\n",
    "data = pd.read_csv(training_csv_path)\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0d0f8792",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# 60% of the rows are sampled for training; random_state makes the draw reproducible.\n",
    "train_df = data.sample(frac=0.6, random_state=42)\n",
    "# Every row that was not sampled forms the test split.\n",
    "test_df = data.drop(index=train_df.index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3d6c3eb1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Answer columns (one per model) that are scored below.\n",
    "model_cols = [\n",
    "    'llama3B', 'qwen25-3B',\n",
    "    'qwen3-14B', 'qwen3-0_6B',\n",
    "    'qwen3-8B', 'llama8B', 'with_gpt35', 'qwen25-1_5B', 'qwen3-4B', 'phi4',\n",
    "    'qwen25-14B', 'qwen25-7B', 'llama1B', 'phi4-mini-instruct',\n",
    "    'with_gpt4o', 'qwen3-1_7B', 'qwen25-0_5B',\n",
    "]\n",
    "\n",
    "# model -> fraction of training rows where its answer equals correct_answer\n",
    "truth_train = train_df['correct_answer']\n",
    "acc_dict = {}\n",
    "for model in model_cols:\n",
    "    acc_dict[model] = train_df[model].eq(truth_train).mean()\n",
    "acc_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "a786e2f7",
   "metadata": {},
   "outputs": [],
   "source": [
    "import math\n",
    "def sigma_inv(x, num_classes=4):\n",
    "    \"\"\"\n",
    "    Log-odds weight of an accuracy value for a ``num_classes``-way choice.\n",
    "\n",
    "    Computes ``log((num_classes - 1) * x / (1 - x))``. With the default of 4 classes\n",
    "    this is ``log(3 * x / (1 - x))``, i.e. the logit of ``x`` shifted by ``log(3)``.\n",
    "    It is therefore NOT the plain inverse sigmoid: the result is 0 when\n",
    "    ``x == 1 / num_classes``, positive above that value and negative below it.\n",
    "\n",
    "    Parameters:\n",
    "    x (float): Value strictly between 0 and 1.\n",
    "    num_classes (int): Number of answer options (at least 2). Defaults to 4,\n",
    "        which reproduces the original hard-coded factor of 3.\n",
    "\n",
    "    Returns:\n",
    "    float: ``log((num_classes - 1) * x / (1 - x))``.\n",
    "\n",
    "    Raises:\n",
    "    ValueError: If ``x`` is not strictly inside (0, 1) (this includes NaN) or if\n",
    "        ``num_classes`` is smaller than 2.\n",
    "    \"\"\"\n",
    "    if num_classes < 2:\n",
    "        raise ValueError(\"num_classes must be at least 2.\")\n",
    "    # Written as a chained comparison so that NaN is rejected too.\n",
    "    if not 0 < x < 1:\n",
    "        raise ValueError(\"Input must be in the range (0, 1) exclusive.\")\n",
    "\n",
    "    return math.log((num_classes - 1) * x / (1 - x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2c930f12",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "classes = np.array([\"A\", \"B\", \"C\", \"D\"])\n",
    "\n",
    "selected = ['llama8B', 'qwen25-14B', 'qwen3-14B', 'phi4', 'with_gpt4o']\n",
    "\n",
    "# One voting weight per selected model, computed from its entry in acc_dict.\n",
    "weights = np.array([sigma_inv(acc_dict[m]) for m in selected], dtype=float)\n",
    "\n",
    "# Normalise every answer cell (string, stripped, upper case). Series.map is applied\n",
    "# per column because DataFrame.applymap is deprecated since pandas 2.1.\n",
    "X = test_df[selected].apply(lambda col: col.map(lambda s: str(s).strip().upper()))\n",
    "\n",
    "# scores[r, c] = sum of the weights of the models that answered classes[c] in row r\n",
    "scores = np.column_stack([\n",
    "    (X.eq(c)).to_numpy().astype(int) @ weights\n",
    "    for c in classes\n",
    "])\n",
    "\n",
    "# Weighted vote: the class with the highest score (argmax takes the first on ties).\n",
    "pred_idx = scores.argmax(axis=1)\n",
    "pred = classes[pred_idx]\n",
    "\n",
    "y_true = test_df['correct_answer'].astype(str).str.upper().str.strip().to_numpy()\n",
    "test_acc = (pred == y_true).mean()\n",
    "\n",
    "# Accuracy of each selected model on its own, on the same rows.\n",
    "test_acc_dict = {\n",
    "    m: (test_df[m].astype(str).str.upper().str.strip() == y_true).mean()\n",
    "    for m in selected\n",
    "}\n",
    "\n",
    "best_model = max(test_acc_dict, key=test_acc_dict.get)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "930126f9",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import itertools\n",
    "\n",
    "# Candidate answer columns per model family, taken from the test split's columns.\n",
    "cols = list(test_df.columns)\n",
    "qwen25_cands = [c for c in cols if c.startswith('qwen25-')]\n",
    "qwen3_cands  = [c for c in cols if c.startswith('qwen3-')]\n",
    "gpt_cands    = ['with_gpt35', 'with_gpt4o']\n",
    "llama_cands  = [c for c in cols if c.startswith('llama')]\n",
    "phi_cands    = [c for c in cols if c.startswith('phi')]\n",
    "\n",
    "classes = np.array([\"A\", \"B\", \"C\", \"D\"])\n",
    "\n",
    "y_true = test_df['correct_answer'].astype(str).str.upper().str.strip().to_numpy()\n",
    "\n",
    "# Work that does not depend on the combination is done once, before the loop.\n",
    "all_cands = list(dict.fromkeys(qwen25_cands + qwen3_cands + gpt_cands + llama_cands + phi_cands))\n",
    "# Series.map per column replaces DataFrame.applymap (deprecated since pandas 2.1).\n",
    "answers_norm = test_df[all_cands].apply(lambda col: col.map(lambda s: str(s).strip().upper()))\n",
    "single_test_acc = {\n",
    "    m: (test_df[m].astype(str).str.upper().str.strip() == y_true).mean()\n",
    "    for m in all_cands\n",
    "}\n",
    "\n",
    "results = []\n",
    "\n",
    "# Try every combination that takes exactly one model from each family.\n",
    "for selected in itertools.product(qwen25_cands, qwen3_cands, gpt_cands, llama_cands, phi_cands):\n",
    "    selected = list(selected)\n",
    "\n",
    "    weights = np.array([sigma_inv(acc_dict[m]) for m in selected], dtype=float)\n",
    "\n",
    "    X = answers_norm[selected]\n",
    "    # indicators[c][r, m] is 1 when model m answered classes[c] in row r\n",
    "    indicators = [(X.eq(c)).to_numpy().astype(int) for c in classes]\n",
    "\n",
    "    # Weighted vote.\n",
    "    scores = np.column_stack([ind @ weights for ind in indicators])\n",
    "    pred = classes[scores.argmax(axis=1)]\n",
    "    ensemble_acc = (pred == y_true).mean()\n",
    "\n",
    "    test_acc_dict = {m: single_test_acc[m] for m in selected}\n",
    "\n",
    "    # Best member judged on these same rows.\n",
    "    best_single_model = max(test_acc_dict, key=test_acc_dict.get)\n",
    "    best_single_acc = test_acc_dict[best_single_model]\n",
    "\n",
    "    # Member with the highest acc_dict value, scored on these rows.\n",
    "    best_model_train = max(selected, key=acc_dict.get)\n",
    "    best_model_train_acc = test_acc_dict[best_model_train]\n",
    "\n",
    "    # Unweighted majority vote: every member has weight 1.\n",
    "    mv_scores = np.column_stack([\n",
    "        ind @ np.ones(len(selected), dtype=float)\n",
    "        for ind in indicators\n",
    "    ])\n",
    "    mv_pred = classes[mv_scores.argmax(axis=1)]\n",
    "    mv_acc = (mv_pred == y_true).mean()\n",
    "\n",
    "    results.append({\n",
    "        'qwen25': selected[0],\n",
    "        'qwen3':  selected[1],\n",
    "        'gpt':    selected[2],\n",
    "        'llama':  selected[3],\n",
    "        'phi':    selected[4],\n",
    "        'ensemble_acc': ensemble_acc,\n",
    "        'mv_acc': mv_acc,\n",
    "        'best_model_train': best_model_train,\n",
    "        'best_model_train_acc': best_model_train_acc,\n",
    "        'best_single_model': best_single_model,\n",
    "        'best_single_acc': best_single_acc,\n",
    "        **{f'acc::{m}': test_acc_dict[m] for m in selected},\n",
    "    })\n",
    "\n",
    "res_df = pd.DataFrame(results).sort_values('ensemble_acc', ascending=False).reset_index(drop=True)\n",
    "res_df\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1dd6eff7",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Rows of res_df whose ensemble_acc is lower than mv_acc.\n",
    "combo_cols = ['qwen25', 'qwen3', 'gpt', 'llama', 'phi']\n",
    "mask = res_df['ensemble_acc'].lt(res_df['mv_acc'])\n",
    "bad_rows = res_df.loc[mask, combo_cols]\n",
    "\n",
    "print(bad_rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2e2b67c0",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Rows of res_df whose ensemble_acc is lower than best_single_acc.\n",
    "combo_cols = ['qwen25', 'qwen3', 'gpt', 'llama', 'phi']\n",
    "mask = res_df['ensemble_acc'].lt(res_df['best_single_acc'])\n",
    "bad_rows = res_df.loc[mask, combo_cols]\n",
    "\n",
    "print(bad_rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "451b4343",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Rows of res_df whose ensemble_acc is lower than best_model_train_acc.\n",
    "combo_cols = ['qwen25', 'qwen3', 'gpt', 'llama', 'phi']\n",
    "mask = res_df['ensemble_acc'].lt(res_df['best_model_train_acc'])\n",
    "bad_rows = res_df.loc[mask, combo_cols]\n",
    "\n",
    "print(bad_rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "071aeed9",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Rows of res_df whose ensemble_acc is at least best_single_acc.\n",
    "combo_cols = ['qwen25', 'qwen3', 'gpt', 'llama', 'phi']\n",
    "mask = res_df['ensemble_acc'].ge(res_df['best_single_acc'])\n",
    "good_rows = res_df.loc[mask, combo_cols]\n",
    "\n",
    "good_rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "632eaa2b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "import torch.nn as nn\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import itertools\n",
    "\n",
    "CLASSES = [\"A\", \"B\", \"C\", \"D\"]\n",
    "VALID = set(CLASSES)\n",
    "\n",
    "def _empirical_conditionals_4(col_1, col_2):\n",
    "    \"\"\"\n",
    "    Empirical frequencies of ``col_1`` labels given each ``col_2`` label.\n",
    "\n",
    "    Both columns are paired row by row; pairs with a missing value are dropped,\n",
    "    the rest is upper-cased and stripped, and only pairs where both labels are\n",
    "    in CLASSES are kept.\n",
    "\n",
    "    Returns a list of ``(l1, l2, frequency)`` tuples, where ``frequency`` is the\n",
    "    share of rows with ``col_2 == l2`` that have ``col_1 == l1``. Labels ``l2``\n",
    "    that never occur in ``col_2`` produce no tuples. The list is empty when no\n",
    "    valid pair remains.\n",
    "    \"\"\"\n",
    "    pairs = pd.DataFrame({\"c1\": col_1, \"c2\": col_2}).dropna()\n",
    "    for name in (\"c1\", \"c2\"):\n",
    "        pairs[name] = pairs[name].astype(str).str.upper().str.strip()\n",
    "    pairs = pairs[pairs[\"c1\"].isin(VALID) & pairs[\"c2\"].isin(VALID)]\n",
    "    if pairs.empty:\n",
    "        return []\n",
    "\n",
    "    triples = []\n",
    "    for given in CLASSES:\n",
    "        is_given = pairs[\"c2\"] == given\n",
    "        total = is_given.sum()\n",
    "        if total == 0:\n",
    "            # This label never occurs in col_2, so there is nothing to condition on.\n",
    "            continue\n",
    "        answers = pairs.loc[is_given, \"c1\"]\n",
    "        triples.extend(\n",
    "            (label, given, (answers == label).sum() / total)\n",
    "            for label in CLASSES\n",
    "        )\n",
    "    return triples\n",
    "\n",
    "def estimate_x(data: pd.DataFrame, selected, steps=4000, lr=1e-2, seed=0, verbose=False, init_from_answer=False):\n",
    "    \"\"\"\n",
    "    Fit one value x in (0, 1) per selected column from pairwise answer statistics.\n",
    "\n",
    "    For every pair (i, j) of selected columns the empirical conditionals returned by\n",
    "    ``_empirical_conditionals_4(data[col_i], data[col_j])`` are used as targets. They\n",
    "    are fitted by summed squared error with\n",
    "\n",
    "        same label:       x_i * x_j + (1 - x_i) * (1 - x_j) / 3\n",
    "        different labels: x_i * (1 - x_j) / 3 + x_j * (1 - x_i) / 3\n",
    "                          + 2 * (1 - x_i) * (1 - x_j) / 9\n",
    "\n",
    "    where ``x = sigmoid(z)`` and ``z`` is optimised with AdamW.\n",
    "\n",
    "    Parameters:\n",
    "    data (pd.DataFrame): Table holding the answer columns.\n",
    "    selected (iterable of str): Names of the columns to fit.\n",
    "    steps (int): Number of optimisation steps.\n",
    "    lr (float): Learning rate passed to AdamW.\n",
    "    seed (int): Value passed to ``torch.manual_seed``.\n",
    "    verbose (bool): Print the loss every 1000 steps and at the last step.\n",
    "    init_from_answer (bool): If True and ``data`` has a ``correct_answer`` column,\n",
    "        start each x at that column's match rate with ``correct_answer`` (0.7 when\n",
    "        the rate is NaN, 0 or 1); otherwise every x starts at 0.7.\n",
    "\n",
    "    Returns:\n",
    "    dict: column name -> fitted x as a Python float. When there is nothing to fit\n",
    "        (fewer than two columns, or no valid answer pairs) the starting values are\n",
    "        returned instead of failing.\n",
    "    \"\"\"\n",
    "    torch.manual_seed(seed)\n",
    "    cols = list(selected)\n",
    "    k = len(cols)\n",
    "\n",
    "    # Flatten all observations into parallel lists so the loss can be vectorised.\n",
    "    pair_i, pair_j, same_label, targets = [], [], [], []\n",
    "    for i, j in itertools.combinations(range(k), 2):\n",
    "        for l1, l2, p in _empirical_conditionals_4(data[cols[i]], data[cols[j]]):\n",
    "            pair_i.append(i)\n",
    "            pair_j.append(j)\n",
    "            same_label.append(l1 == l2)\n",
    "            targets.append(float(p))\n",
    "\n",
    "    if init_from_answer and (\"correct_answer\" in data.columns):\n",
    "        y = data[\"correct_answer\"].astype(str).str.upper().str.strip()\n",
    "        init_x = []\n",
    "        for c in cols:\n",
    "            v = (data[c].astype(str).str.upper().str.strip() == y).mean()\n",
    "            if np.isnan(v) or v <= 0.0 or v >= 1.0:\n",
    "                v = 0.7\n",
    "            init_x.append(v)\n",
    "        init_x = np.array(init_x, dtype=np.float32)\n",
    "    else:\n",
    "        init_x = np.full(k, 0.7, dtype=np.float32)\n",
    "\n",
    "    # Optimise in logit space so that x = sigmoid(z) always stays inside (0, 1).\n",
    "    z0 = np.log(init_x + 1e-6) - np.log(1 - init_x + 1e-6)\n",
    "    z = nn.Parameter(torch.tensor(z0, dtype=torch.float32))\n",
    "\n",
    "    if targets:\n",
    "        # NOTE(review): AdamW applies weight decay to z by default -- confirm that\n",
    "        # this pull of z toward 0 is intended.\n",
    "        opt = torch.optim.AdamW([z], lr=lr)\n",
    "\n",
    "        idx_i = torch.tensor(pair_i, dtype=torch.long)\n",
    "        idx_j = torch.tensor(pair_j, dtype=torch.long)\n",
    "        same = torch.tensor(same_label, dtype=torch.bool)\n",
    "        target = torch.tensor(targets, dtype=torch.float32)\n",
    "\n",
    "        for t in range(steps):\n",
    "            opt.zero_grad()\n",
    "            x = torch.sigmoid(z)  # shape: [k]\n",
    "            x1, x2 = x[idx_i], x[idx_j]\n",
    "            agree = x1 * x2 + (1 - x1) * (1 - x2) / 3.0\n",
    "            differ = (x1 * (1 - x2) / 3.0) + (x2 * (1 - x1) / 3.0) + (2.0 * (1 - x1) * (1 - x2) / 9.0)\n",
    "            yhat = torch.where(same, agree, differ)\n",
    "            loss = ((yhat - target) ** 2).sum()\n",
    "            loss.backward()\n",
    "            opt.step()\n",
    "            if verbose and (t % 1000 == 0 or t == steps - 1):\n",
    "                print(f\"step {t}, loss={loss.item():.6f}\")\n",
    "    # else: no observations, so z keeps its starting value (stacking an empty list\n",
    "    # of losses would raise).\n",
    "\n",
    "    x_hat = torch.sigmoid(z).detach().cpu().numpy()\n",
    "    return {cols[i]: float(x_hat[i]) for i in range(k)}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "331e0b47",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "selected = ['llama8B', 'qwen25-14B', 'qwen3-14B', 'phi4', 'with_gpt4o']\n",
    "x_hat = estimate_x(data, selected, steps=10000, lr=1e-2, verbose=True)\n",
    "print(x_hat)\n",
    "\n",
    "# The estimates are used directly instead of rebinding acc_dict, so the accuracies\n",
    "# stored in acc_dict by the earlier cell are not silently replaced.\n",
    "# sigma_inv raises outside (0, 1); the clip only moves a value that sits on (or\n",
    "# within 1e-12 of) 0 or 1.\n",
    "EPS = 1e-12\n",
    "weights = np.array(\n",
    "    [sigma_inv(float(np.clip(x_hat[m], EPS, 1 - EPS))) for m in selected],\n",
    "    dtype=float,\n",
    ")\n",
    "\n",
    "classes = np.array([\"A\", \"B\", \"C\", \"D\"])\n",
    "# Series.map per column replaces DataFrame.applymap (deprecated since pandas 2.1).\n",
    "X = data[selected].apply(lambda col: col.map(lambda s: str(s).strip().upper()))\n",
    "\n",
    "# scores[r, c] = sum of the weights of the models that answered classes[c] in row r\n",
    "scores = np.column_stack([\n",
    "    (X.eq(c)).to_numpy().astype(int) @ weights\n",
    "    for c in classes\n",
    "])\n",
    "\n",
    "pred_idx = scores.argmax(axis=1)\n",
    "pred = classes[pred_idx]\n",
    "\n",
    "y_true = data['correct_answer'].astype(str).str.upper().str.strip().to_numpy()\n",
    "ensemble_acc = (pred == y_true).mean()\n",
    "\n",
    "# Accuracy of each selected model on its own, on the same rows.\n",
    "acc_dict_test = {\n",
    "    m: (data[m].astype(str).str.upper().str.strip() == y_true).mean()\n",
    "    for m in selected\n",
    "}\n",
    "best_model = max(acc_dict_test, key=acc_dict_test.get)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "27425374",
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import itertools\n",
    "\n",
    "\n",
    "# Candidate answer columns per model family; [:1] keeps only the first match.\n",
    "cols = list(data.columns)\n",
    "qwen25_cands = [c for c in cols if c.startswith('qwen25-')][:1]\n",
    "qwen3_cands  = [c for c in cols if c.startswith('qwen3-')][:1]\n",
    "gpt_cands    = ['with_gpt35', 'with_gpt4o']\n",
    "llama_cands  = [c for c in cols if c.startswith('llama')][:1]\n",
    "phi_cands    = [c for c in cols if c.startswith('phi')][:1]\n",
    "\n",
    "classes = np.array([\"A\", \"B\", \"C\", \"D\"])\n",
    "\n",
    "y_true = data['correct_answer'].astype(str).str.upper().str.strip().to_numpy()\n",
    "\n",
    "# Work that does not depend on the combination is done once, before the loop.\n",
    "all_cands = list(dict.fromkeys(qwen25_cands + qwen3_cands + gpt_cands + llama_cands + phi_cands))\n",
    "# Series.map per column replaces DataFrame.applymap (deprecated since pandas 2.1).\n",
    "answers_norm = data[all_cands].apply(lambda col: col.map(lambda s: str(s).strip().upper()))\n",
    "single_acc = {\n",
    "    m: (data[m].astype(str).str.upper().str.strip() == y_true).mean()\n",
    "    for m in all_cands\n",
    "}\n",
    "\n",
    "# sigma_inv raises outside (0, 1); the clip only moves a value that sits on (or\n",
    "# within 1e-12 of) 0 or 1.\n",
    "EPS = 1e-12\n",
    "\n",
    "results = []\n",
    "\n",
    "# A list (rather than the bare iterator) lets tqdm know the total.\n",
    "combos = list(itertools.product(qwen25_cands, qwen3_cands, gpt_cands, llama_cands, phi_cands))\n",
    "\n",
    "for selected in tqdm(combos):\n",
    "    selected = list(selected)\n",
    "\n",
    "    # The estimates are used directly instead of rebinding acc_dict, so the\n",
    "    # accuracies stored in acc_dict by the earlier cell are not replaced.\n",
    "    x_hat = estimate_x(data, selected, steps=10000, lr=1e-2, verbose=False)\n",
    "    weights = np.array(\n",
    "        [sigma_inv(float(np.clip(x_hat[m], EPS, 1 - EPS))) for m in selected],\n",
    "        dtype=float,\n",
    "    )\n",
    "\n",
    "    X = answers_norm[selected]\n",
    "    # indicators[c][r, m] is 1 when model m answered classes[c] in row r\n",
    "    indicators = [(X.eq(c)).to_numpy().astype(int) for c in classes]\n",
    "\n",
    "    # Weighted vote.\n",
    "    scores = np.column_stack([ind @ weights for ind in indicators])\n",
    "    pred = classes[scores.argmax(axis=1)]\n",
    "    ensemble_acc = (pred == y_true).mean()\n",
    "\n",
    "    acc_dict_test = {m: single_acc[m] for m in selected}\n",
    "    best_single_model = max(acc_dict_test, key=acc_dict_test.get)\n",
    "\n",
    "    # Member with the largest weight, scored on the same rows.\n",
    "    best_model_train_idx = int(weights.argmax())\n",
    "    best_model_train = selected[best_model_train_idx]\n",
    "    best_model_train_acc = acc_dict_test[best_model_train]\n",
    "\n",
    "    # Unweighted majority vote: every member has weight 1.\n",
    "    mv_scores = np.column_stack([\n",
    "        ind @ np.ones(len(selected), dtype=float)\n",
    "        for ind in indicators\n",
    "    ])\n",
    "    mv_pred = classes[mv_scores.argmax(axis=1)]\n",
    "    mv_acc = (mv_pred == y_true).mean()\n",
    "\n",
    "    results.append({\n",
    "        'qwen25': selected[0],\n",
    "        'qwen3':  selected[1],\n",
    "        'gpt':    selected[2],\n",
    "        'llama':  selected[3],\n",
    "        'phi':    selected[4],\n",
    "        'ensemble_acc': ensemble_acc,\n",
    "        'mv_acc': mv_acc,\n",
    "        'best_model_train': best_model_train,\n",
    "        'best_model_train_acc': best_model_train_acc,\n",
    "        'best_single_model': best_single_model,\n",
    "        'best_single_acc': acc_dict_test[best_single_model],\n",
    "        **{f'acc::{m}': acc_dict_test[m] for m in selected},\n",
    "    })\n",
    "\n",
    "res_df = pd.DataFrame(results).sort_values('ensemble_acc', ascending=False).reset_index(drop=True)\n",
    "res_df\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "82e4d909",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Rows of res_df whose ensemble_acc is lower than mv_acc, followed by their count.\n",
    "combo_cols = ['qwen25', 'qwen3', 'gpt', 'llama', 'phi']\n",
    "mask = res_df['ensemble_acc'].lt(res_df['mv_acc'])\n",
    "bad_rows = res_df.loc[mask, combo_cols]\n",
    "\n",
    "print(bad_rows)\n",
    "len(bad_rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c8b25ab1",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Rows of res_df whose ensemble_acc is lower than best_single_acc, followed by their count.\n",
    "combo_cols = ['qwen25', 'qwen3', 'gpt', 'llama', 'phi']\n",
    "mask = res_df['ensemble_acc'].lt(res_df['best_single_acc'])\n",
    "bad_rows = res_df.loc[mask, combo_cols]\n",
    "\n",
    "print(bad_rows)\n",
    "len(bad_rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2f4352ef",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Rows of res_df whose ensemble_acc is lower than best_model_train_acc, followed by their count.\n",
    "combo_cols = ['qwen25', 'qwen3', 'gpt', 'llama', 'phi']\n",
    "mask = res_df['ensemble_acc'].lt(res_df['best_model_train_acc'])\n",
    "bad_rows = res_df.loc[mask, combo_cols]\n",
    "\n",
    "print(bad_rows)\n",
    "len(bad_rows)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
