{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d6b28d22",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "import os\n",
    "import pandas as pd\n",
    "import re\n",
    "\n",
    "home = os.path.expanduser(\"~\")\n",
    "dataset_dir = os.path.join(home, \"datasets\")\n",
    "\n",
    "csv_files = [f for f in os.listdir(dataset_dir) if f.startswith(\"mmlu_\") and f.endswith(\"new.csv\")]\n",
    "\n",
    "csv_dfs = {}\n",
    "for f in csv_files:\n",
    "    csv_dfs[f] = pd.read_csv(os.path.join(dataset_dir, f))\n",
    "\n",
    "problem_rows_dict = {}\n",
    "\n",
    "for csv_file in csv_files:\n",
    "    df = csv_dfs[csv_file]\n",
    "    row_count = len(df)\n",
    "\n",
    "    last_col = df.columns[-1]\n",
    "    invalid_indices = []\n",
    "    \n",
    "    for idx, row in df.iterrows():\n",
    "        value = str(row[last_col]).strip()\n",
    "    \n",
    "        a_matches = re.findall(r'\\bA(?![A-Za-z0-9])', value)\n",
    "        b_matches = re.findall(r'\\bB(?![A-Za-z0-9])', value)\n",
    "        c_matches = re.findall(r'\\bC(?![A-Za-z0-9])', value)\n",
    "        d_matches = re.findall(r'\\bD(?![A-Za-z0-9])', value)\n",
    "\n",
    "        a_count = len(a_matches)\n",
    "        b_count = len(b_matches)\n",
    "        c_count = len(c_matches)\n",
    "        d_count = len(d_matches)\n",
    "        \n",
    "        if a_count >= 1 and b_count==0 and c_count ==0 and d_count ==0:\n",
    "            df.loc[idx, last_col] = \"A\"\n",
    "        elif a_count == 0 and b_count >= 1 and c_count ==0 and d_count ==0:\n",
    "            df.loc[idx, last_col] = \"B\"\n",
    "        elif a_count == 0 and b_count == 0 and c_count >= 1 and d_count ==0:\n",
    "            df.loc[idx, last_col] = \"C\"\n",
    "        elif a_count == 0 and b_count == 0 and c_count == 0 and d_count >= 1:\n",
    "            df.loc[idx, last_col] = \"D\"\n",
    "        else:\n",
    "            invalid_indices.append(idx)\n",
    "    \n",
    "                \n",
    "    invalid_count = len(invalid_indices)\n",
    "    problem_rows_dict[csv_file] = invalid_indices\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "468dd4f2",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "all_indices_sets = []\n",
    "for csv_file in problem_rows_dict:\n",
    "    file_path = os.path.join(dataset_dir, csv_file)\n",
    "    try:\n",
    "        df = pd.read_csv(file_path)\n",
    "        total_indices = set(df.index.tolist())\n",
    "        problem_indices = set(problem_rows_dict[csv_file])\n",
    "        valid_indices = total_indices - problem_indices\n",
    "        all_indices_sets.append(valid_indices)\n",
    "    except Exception as e:\n",
    "        print(f\"error\")\n",
    " \n",
    "if all_indices_sets:\n",
    "    common_valid_indices = set.intersection(*all_indices_sets)\n",
    "   "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1faa6005",
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "df = csv_dfs['mmlu_llama8B_new.csv']\n",
    "a_count=0\n",
    "b_count=0\n",
    "other_count=0\n",
    "for idx in tqdm(sorted(list(common_valid_indices))):\n",
    "    value = str(df.loc[idx, 'answer']).strip().upper()[0]\n",
    "    if value == 'A':\n",
    "        a_count += 1\n",
    "    elif value == 'B':\n",
    "        b_count += 1\n",
    "    elif value == 'C':\n",
    "        c_count += 1\n",
    "    elif value == 'D':\n",
    "        d_count += 1\n",
    "    else:\n",
    "        other_count += 1\n",
    "print(f\"answer_col: A: {a_count}, B: {b_count}, C: {c_count}, D: {d_count}, Other: {other_count}\")\n",
    "\n",
    "for csv_file in csv_files:\n",
    "    df = csv_dfs[csv_file]\n",
    "    last_col = df.columns[-1]\n",
    "    a_count=0\n",
    "    b_count=0\n",
    "    c_count=0\n",
    "    d_count=0\n",
    "    other_count=0\n",
    "    for idx in tqdm(sorted(list(common_valid_indices))):\n",
    "        value = str(df.loc[idx, last_col]).strip().upper()[0]\n",
    "        if value == 'A':\n",
    "            a_count += 1\n",
    "        elif value == 'B':\n",
    "            b_count += 1\n",
    "        elif value == 'C':\n",
    "            c_count += 1\n",
    "        elif value == 'D':\n",
    "            d_count += 1\n",
    "        else:\n",
    "            other_count += 1\n",
    "    print(f\"csv_file: {csv_file}, A: {a_count}, B: {b_count}, C: {c_count}, D: {d_count}, Other: {other_count}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "efcea355",
   "metadata": {},
   "outputs": [],
   "source": [
    "all_csv_files = csv_files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c7216273",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "model_accuracy = {}\n",
    "\n",
    "answer_col = None\n",
    "for f in all_csv_files:\n",
    "    if 'answer' in csv_dfs[f].columns:\n",
    "        answer_col = f\n",
    "        break\n",
    "answer_df = csv_dfs[answer_col]\n",
    "\n",
    "for csv_file in all_csv_files:\n",
    "    df = csv_dfs[csv_file]\n",
    "    last_col = df.columns[-1]\n",
    "    correct = 0\n",
    "    total = 0\n",
    "    for idx in sorted(list(common_valid_indices)):\n",
    "        pred = str(df.loc[idx, last_col]).strip().upper()\n",
    "        pred = pred[0] if len(pred) > 0 else ''\n",
    "        answer_val = str(answer_df.loc[idx, 'answer']).strip().upper()[0]\n",
    "        if pred == answer_val:\n",
    "            correct += 1\n",
    "        total += 1\n",
    "    acc = correct / total if total > 0 else 0\n",
    "    model_accuracy[csv_file] = acc\n",
    "\n",
    "sorted_models = sorted(model_accuracy.items(), key=lambda x: x[1], reverse=True)\n",
    "\n",
    "for i, (model, accuracy) in enumerate(sorted_models, 1):\n",
    "    model_name = model.replace('mmlu_', '').replace('_new.csv', '')\n",
    "    print(f\"{i:2d}. {model_name:<20} {accuracy:.4f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "985a72e5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "import re\n",
    "import itertools\n",
    "llama_models = [f for f in all_csv_files if 'llama' in f]\n",
    "gpt_models = [f for f in all_csv_files if 'with_gpt' in f]\n",
    "qwen25_models = [f for f in all_csv_files if 'qwen25-' in f]\n",
    "phi_models = [f for f in all_csv_files if 'phi' in f]\n",
    "\n",
    "all_combinations = list(itertools.product(llama_models, gpt_models, qwen25_models, phi_models))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "19e88f22",
   "metadata": {},
   "outputs": [],
   "source": [
    "import math\n",
    "def sigma_inv(x):\n",
    "    \"\"\"\n",
    "    Inverse of the sigmoid function.\n",
    "    \n",
    "    Parameters:\n",
    "    x (float): The input value for which to compute the inverse sigmoid.\n",
    "    \n",
    "    Returns:\n",
    "    float: The inverse sigmoid of the input value.\n",
    "    \"\"\"\n",
    "    if x <= 0 or x >= 1:\n",
    "        raise ValueError(\"Input must be in the range (0, 1) exclusive.\")\n",
    "    \n",
    "    return math.log(x / (1 - x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "50a0a494",
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "import numpy as np\n",
    "results = []\n",
    "\n",
    "for i, combination in enumerate(all_combinations):\n",
    "    print(f\"\\n{'='*80}\")\n",
    "\n",
    "    csv_files = list(combination)\n",
    "    \n",
    "\n",
    "    sample_expected_actual = {}\n",
    "    for sample_idx in sorted(list(common_valid_indices)):\n",
    "        sample_expected_actual[sample_idx] = {\n",
    "        'expected_inverse': {'A': 0.0, 'B': 0.0, 'C': 0.0, 'D': 0.0},\n",
    "        'actual_inverse': {'A': 0.0, 'B': 0.0, 'C': 0.0, 'D': 0.0},\n",
    "        'baseline_count': 0\n",
    "        }\n",
    "\n",
    "    for baseline_model_file in tqdm(csv_files, desc=\"Processing baselines\"):\n",
    "        baseline_df = csv_dfs[baseline_model_file]\n",
    "        baseline_last_col = baseline_df.columns[-1]\n",
    "        correlation_stats = {}\n",
    "        \n",
    "        for other_file in csv_files:\n",
    "            if other_file == baseline_model_file:\n",
    "                continue\n",
    "            other_df = csv_dfs[other_file]\n",
    "            other_last_col = other_df.columns[-1]\n",
    "    \n",
    "            correlation_matrix = {\n",
    "            'A': {'A': 0, 'B': 0, 'C': 0, 'D': 0, 'total': 0},\n",
    "            'B': {'A': 0, 'B': 0, 'C': 0, 'D': 0, 'total': 0},\n",
    "            'C': {'A': 0, 'B': 0, 'C': 0, 'D': 0, 'total': 0},\n",
    "            'D': {'A': 0, 'B': 0, 'C': 0, 'D': 0, 'total': 0}\n",
    "            }\n",
    "    \n",
    "        \n",
    "            for idx in sorted(list(common_valid_indices)):\n",
    "                baseline_val = str(baseline_df.loc[idx, baseline_last_col]).strip().upper()[0]\n",
    "                other_val = str(other_df.loc[idx, other_last_col]).strip().upper()[0]\n",
    "      \n",
    "                if baseline_val in ['A', 'B', 'C', 'D'] and other_val in ['A', 'B', 'C', 'D']:\n",
    "                    correlation_matrix[baseline_val][other_val] += 1\n",
    "                    correlation_matrix[baseline_val]['total'] += 1\n",
    "                else:\n",
    "                    print(f\"idx: {idx}, baseline_val: {baseline_val}, other_val: {other_val}\")\n",
    "    \n",
    "            prob_matrix = {}\n",
    "            for baseline_ans in ['A', 'B', 'C', 'D']:\n",
    "                prob_matrix[baseline_ans] = {}\n",
    "                total = correlation_matrix[baseline_ans]['total']\n",
    "                for other_ans in ['A', 'B', 'C', 'D']:\n",
    "                    if total > 0:\n",
    "                        prob_matrix[baseline_ans][other_ans] = correlation_matrix[baseline_ans][other_ans] / total\n",
    "                    else:\n",
    "                        prob_matrix[baseline_ans][other_ans] = None\n",
    "    \n",
    "            correlation_stats[other_file] = prob_matrix  \n",
    "\n",
    "\n",
    "        for idx in sorted(list(common_valid_indices)):\n",
    "            baseline_val = str(baseline_df.loc[idx, baseline_last_col]).strip().upper()[0]\n",
    "            current_expected_inverse = {'A': 0, 'B': 0, 'C': 0, 'D': 0}\n",
    "            current_actual_inverse = {'A': 0, 'B': 0, 'C': 0, 'D': 0}\n",
    "\n",
    "            for other_file in csv_files:\n",
    "                if other_file == baseline_model_file:\n",
    "                    continue\n",
    "                other_df = csv_dfs[other_file]\n",
    "                other_last_col = other_df.columns[-1]\n",
    "                other_val = str(other_df.loc[idx, other_last_col]).strip().upper()[0]\n",
    "                if other_val in ['A', 'B', 'C', 'D']:\n",
    "                    current_actual_inverse[other_val] += 1\n",
    "                else:\n",
    "                    print(f\"idx: {idx}, other_val: {other_val}\")\n",
    "            \n",
    "            if baseline_val == 'A':\n",
    "                for other_file in csv_files:\n",
    "                    if other_file == baseline_model_file:\n",
    "                        continue\n",
    "                    prob_matrix = correlation_stats[other_file]\n",
    "                    for baseline_ans in ['B', 'C', 'D']:\n",
    "                        for ans in ['A', 'B', 'C', 'D']:\n",
    "                            if prob_matrix[baseline_ans][ans] is not None:\n",
    "                                current_expected_inverse[ans] += prob_matrix[baseline_ans][ans] / 3\n",
    "\n",
    "            elif baseline_val == 'B':\n",
    "                for other_file in csv_files:\n",
    "                    if other_file == baseline_model_file:\n",
    "                        continue\n",
    "                    prob_matrix = correlation_stats[other_file]\n",
    "                    for baseline_ans in ['A', 'C', 'D']:\n",
    "                        for ans in ['A', 'B', 'C', 'D']:\n",
    "                            if prob_matrix[baseline_ans][ans] is not None:\n",
    "                                current_expected_inverse[ans] += prob_matrix[baseline_ans][ans] / 3\n",
    "\n",
    "            elif baseline_val == 'C':\n",
    "                for other_file in csv_files:\n",
    "                    if other_file == baseline_model_file:\n",
    "                        continue\n",
    "                    prob_matrix = correlation_stats[other_file]\n",
    "                    for baseline_ans in ['A', 'B', 'D']:\n",
    "                        for ans in ['A', 'B', 'C', 'D']:\n",
    "                            if prob_matrix[baseline_ans][ans] is not None:\n",
    "                                current_expected_inverse[ans] += prob_matrix[baseline_ans][ans] / 3\n",
    "\n",
    "            elif baseline_val == 'D':\n",
    "                for other_file in csv_files:\n",
    "                    if other_file == baseline_model_file:\n",
    "                        continue\n",
    "                    prob_matrix = correlation_stats[other_file]\n",
    "                    for baseline_ans in ['A', 'B', 'C']:\n",
    "                        for ans in ['A', 'B', 'C', 'D']:\n",
    "                            if prob_matrix[baseline_ans][ans] is not None:\n",
    "                                current_expected_inverse[ans] += prob_matrix[baseline_ans][ans] / 3\n",
    "\n",
    "            else:\n",
    "                print(f\"idx: {idx}, baseline_val: {baseline_val}\")\n",
    "\n",
    "            for ans in ['A', 'B', 'C', 'D']:\n",
    "                sample_expected_actual[idx]['expected_inverse'][ans] += current_expected_inverse[ans]\n",
    "                sample_expected_actual[idx]['actual_inverse'][ans] += current_actual_inverse[ans]\n",
    "            sample_expected_actual[idx]['baseline_count'] += 1\n",
    "\n",
    "    final_predictions_inverse = []\n",
    "\n",
    "    for sample_idx in sorted(list(common_valid_indices)):\n",
    "        expected_inverse = sample_expected_actual[sample_idx]['expected_inverse']\n",
    "        actual_inverse = sample_expected_actual[sample_idx]['actual_inverse']\n",
    "\n",
    "        diff_inverse = {ans: actual_inverse[ans] - expected_inverse[ans] for ans in ['A', 'B', 'C', 'D']}\n",
    "    \n",
    "        max_diff_inverse_ans = max(diff_inverse, key=diff_inverse.get)\n",
    "    \n",
    "        max_diff_inverse_val = diff_inverse[max_diff_inverse_ans]\n",
    "        max_diff_inverse_answers = [ans for ans, val in diff_inverse.items() if val == max_diff_inverse_val]\n",
    "        final_ans_inverse = random.choice(max_diff_inverse_answers)\n",
    "        \n",
    "        final_predictions_inverse.append(final_ans_inverse)\n",
    "\n",
    "    estimated_accuracy = {}\n",
    "\n",
    "    inverse_preds = np.array(final_predictions_inverse)\n",
    "\n",
    "    for csv_file in combination:\n",
    "        df = csv_dfs[csv_file]\n",
    "        last_col = df.columns[-1]\n",
    "        predictions = df.loc[sorted(list(common_valid_indices)), last_col].astype(str).str.strip().str.upper().str[0]\n",
    "        predictions = predictions.values\n",
    "        correct = np.sum(predictions == inverse_preds)\n",
    "        total = len(predictions)\n",
    "        acc = correct / total if total > 0 else 0\n",
    "        estimated_accuracy[csv_file] = acc\n",
    "\n",
    "    print(\"estimated accuracy:\", \" \".join([f\"{f.replace('mmlu_', '').replace('_new.csv', '')}:{estimated_accuracy[f]:.3f}\" for f in combination]))\n",
    "    best_estimated_model = max(estimated_accuracy, key=estimated_accuracy.get)\n",
    "    best_actual_model = max(combination, key=lambda f: model_accuracy[f])\n",
    "    print(\"actual accuracy:\", \" \".join([f\"{f.replace('mmlu_', '').replace('_new.csv', '')}:{model_accuracy[f]:.3f}\" for f in combination]))\n",
    "    print(f\"estimated accuracy best model: {best_estimated_model.replace('mmlu_', '').replace('_new.csv', '')} ({model_accuracy[best_estimated_model]:.4f})\")\n",
    "    print(f\"actual accuracy best model: {best_actual_model.replace('mmlu_', '').replace('_new.csv', '')} ({model_accuracy[best_actual_model]:.4f})\")\n",
    "    \n",
    "    weighted_predictions = []\n",
    "    weighted_correct = 0\n",
    "    weighted_total = 0\n",
    "    \n",
    "    for idx in sorted(list(common_valid_indices)):\n",
    "        a_weight = 0.0\n",
    "        b_weight = 0.0\n",
    "        c_weight = 0.0\n",
    "        d_weight = 0.0\n",
    "        \n",
    "        for csv_file in combination:\n",
    "            df = csv_dfs[csv_file]\n",
    "            last_col = df.columns[-1]\n",
    "            pred = str(df.loc[idx, last_col]).strip().upper()[0]\n",
    "            weight = sigma_inv(estimated_accuracy[csv_file])\n",
    "            \n",
    "            if pred == 'A':\n",
    "                a_weight += weight\n",
    "            elif pred == 'B':\n",
    "                b_weight += weight\n",
    "            elif pred == 'C':\n",
    "                c_weight += weight\n",
    "            elif pred == 'D':\n",
    "                d_weight += weight\n",
    "            else:\n",
    "                print(f\"{pred}: {csv_file}\")\n",
    "        \n",
    "\n",
    "        weighted_pred = random.choice([ans for ans, weight in {'A': a_weight, 'B': b_weight, 'C': c_weight, 'D': d_weight}.items() if weight == max(a_weight, b_weight, c_weight, d_weight)])\n",
    "        weighted_predictions.append(weighted_pred)\n",
    "        \n",
    "        answer_val = str(answer_df.loc[idx, 'answer']).strip().upper()[0]\n",
    "        weighted_total += 1\n",
    "        if weighted_pred == answer_val:\n",
    "            weighted_correct += 1\n",
    "    \n",
    "    weighted_accuracy = weighted_correct / weighted_total if weighted_total > 0 else 0\n",
    "    print(f\"weighted accuracy: {weighted_accuracy:.4f} ({weighted_correct}/{weighted_total})\")\n",
    "\n",
    "    \n",
    "    # Save results\n",
    "    result = {\n",
    "        'combination': combination,\n",
    "        'weighted_accuracy': weighted_accuracy,\n",
    "        'best_estimated_model': best_estimated_model,\n",
    "        'best_estimated_model_actual_accuracy': model_accuracy[best_estimated_model],\n",
    "        'best_actual_model': best_actual_model,\n",
    "        'best_actual_model_actual_accuracy': model_accuracy[best_actual_model],\n",
    "        'estimated_accuracies': estimated_accuracy\n",
    "        }\n",
    "    results.append(result)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9167f295",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create results DataFrame\n",
    "csv_data = []\n",
    "\n",
    "for result in results:\n",
    "    row = {}\n",
    "    \n",
    "    combination = result['combination']\n",
    "    row['qwen25'] = combination[2].replace('mmlu_', '').replace('_new.csv', '')\n",
    "    row['gpt'] = combination[1].replace('mmlu_', '').replace('_new.csv', '')\n",
    "    row['llama'] = combination[0].replace('mmlu_', '').replace('_new.csv', '')\n",
    "    row['phi'] = combination[3].replace('mmlu_', '').replace('_new.csv', '')\n",
    "    \n",
    "    row['weighted_accuracy'] = result['weighted_accuracy']\n",
    "    \n",
    "    row['best_estimated_model'] = result['best_estimated_model'].replace('mmlu_', '').replace('_new.csv', '')\n",
    "    row['best_estimated_model_accuracy'] = result['best_estimated_model_actual_accuracy']\n",
    "    row['best_actual_model'] = result['best_actual_model'].replace('mmlu_', '').replace('_new.csv', '')\n",
    "    row['best_actual_model_accuracy'] = result['best_actual_model_actual_accuracy']\n",
    " \n",
    "    for model_file in all_csv_files:\n",
    "        model_name = model_file.replace('mmlu_', '').replace('_new.csv', '')\n",
    "        if model_file in combination:\n",
    "            row[model_name] = model_accuracy[model_file]\n",
    "        else:\n",
    "            row[model_name] = None  \n",
    "    \n",
    "    csv_data.append(row)\n",
    "\n",
    "df_results = pd.DataFrame(csv_data)\n",
    "\n",
    "output_filename = 'four_mmlu_M1+M2.csv'\n",
    "df_results.to_csv(output_filename, index=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bb401799",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "mask = df_results['weighted_accuracy'] - df_results['best_estimated_model_accuracy'] < 0\n",
    "# mask = df_results['inverse_second_method_accuracy'] - df_results['best_single_model_accuracy'] < -0.0001\n",
    "# mask = df_results['inverse_second_method_accuracy'] - df_results['best_single_model_accuracy'] < -0.001\n",
    "# mask = df_results['inverse_second_method_accuracy'] - df_results['best_single_model_accuracy'] < 0.0001\n",
    "\n",
    "bad_rows = df_results.loc[mask, ['qwen25', 'gpt', 'llama', 'phi']]\n",
    "\n",
    "print(bad_rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c6b756af",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "mask = df_results['weighted_accuracy'] - df_results['best_actual_model_accuracy'] < 0\n",
    "# mask = df_results['inverse_second_method_accuracy'] - df_results['best_single_model_accuracy'] < -0.0001\n",
    "# mask = df_results['inverse_second_method_accuracy'] - df_results['best_single_model_accuracy'] < -0.001\n",
    "# mask = df_results['inverse_second_method_accuracy'] - df_results['best_single_model_accuracy'] < 0.0001\n",
    "\n",
    "bad_rows = df_results.loc[mask, ['qwen25', 'gpt', 'llama', 'phi']]\n",
    "\n",
    "print(bad_rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "310707c6",
   "metadata": {},
   "outputs": [],
   "source": [
    "mask = df_results['best_actual_model']!=df_results['best_estimated_model']\n",
    "# mask = df_results['inverse_second_method_accuracy'] - df_results['best_single_model_accuracy'] < -0.0001\n",
    "# mask = df_results['inverse_second_method_accuracy'] - df_results['best_single_model_accuracy'] < -0.001\n",
    "# mask = df_results['inverse_second_method_accuracy'] - df_results['best_single_model_accuracy'] < 0.0001\n",
    "\n",
    "bad_rows = df_results.loc[mask, ['qwen25', 'gpt', 'llama', 'phi']]\n",
    "\n",
    "print(bad_rows)"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
