{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eb35ee77",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "import re\n",
    "\n",
    "home = os.path.expanduser(\"~\")\n",
    "dataset_dir = os.path.join(home, \"datasets\")\n",
    "\n",
    "# os.listdir returns names in arbitrary order; sort them so that every\n",
    "# ordering derived from this list is the same on each run.\n",
    "csv_files = sorted(\n",
    "    f for f in os.listdir(dataset_dir)\n",
    "    if f.startswith(\"mmlu_\") and f.endswith(\"new.csv\")\n",
    ")\n",
    "\n",
    "# File name -> DataFrame. The last column of each frame is rewritten in place below.\n",
    "csv_dfs = {f: pd.read_csv(os.path.join(dataset_dir, f)) for f in csv_files}\n",
    "\n",
    "# File name -> row labels whose last column does not name exactly one of A/B/C/D.\n",
    "problem_rows_dict = {}\n",
    "\n",
    "for csv_file in csv_files:\n",
    "    df = csv_dfs[csv_file]\n",
    "    last_col = df.columns[-1]\n",
    "    invalid_indices = []\n",
    "\n",
    "    for idx, row in df.iterrows():\n",
    "        value = str(row[last_col]).strip()\n",
    "\n",
    "        # Occurrences of each option letter that start at a word boundary and\n",
    "        # are not followed by another letter or digit.\n",
    "        a_count = len(re.findall(r'\\bA(?![A-Za-z0-9])', value))\n",
    "        b_count = len(re.findall(r'\\bB(?![A-Za-z0-9])', value))\n",
    "        c_count = len(re.findall(r'\\bC(?![A-Za-z0-9])', value))\n",
    "        d_count = len(re.findall(r'\\bD(?![A-Za-z0-9])', value))\n",
    "\n",
    "        mentioned = [\n",
    "            letter\n",
    "            for letter, hits in zip(\"ABCD\", (a_count, b_count, c_count, d_count))\n",
    "            if hits >= 1\n",
    "        ]\n",
    "        if len(mentioned) == 1:\n",
    "            # Exactly one option is mentioned: collapse the cell to that letter.\n",
    "            df.loc[idx, last_col] = mentioned[0]\n",
    "        else:\n",
    "            # No option, or several different options: flag the row.\n",
    "            invalid_indices.append(idx)\n",
    "\n",
    "    problem_rows_dict[csv_file] = invalid_indices\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "66a7ae25",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One set per file: the row labels that were not flagged as problem rows.\n",
    "all_indices_sets = []\n",
    "for csv_file, problem_rows in problem_rows_dict.items():\n",
    "    # csv_dfs already holds this file, so its index is used directly instead\n",
    "    # of reading the CSV from disk a second time.\n",
    "    all_rows = set(csv_dfs[csv_file].index.tolist())\n",
    "    all_indices_sets.append(all_rows - set(problem_rows))\n",
    "\n",
    "# Row labels usable for every file. Always defined, so that later cells see\n",
    "# an empty set rather than a NameError when no file was loaded.\n",
    "common_valid_indices = set.intersection(*all_indices_sets) if all_indices_sets else set()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "93020320",
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "\n",
    "def count_choices(frame, column, indices):\n",
    "    \"\"\"Tally the first character of ``frame[column]`` over ``indices``.\n",
    "\n",
    "    Each cell is converted with ``str``, stripped and upper-cased; its first\n",
    "    character is counted under 'A', 'B', 'C' or 'D', and anything else\n",
    "    (including a blank cell) under 'Other'. Every counter starts at zero.\n",
    "    \"\"\"\n",
    "    tally = {'A': 0, 'B': 0, 'C': 0, 'D': 0, 'Other': 0}\n",
    "    for idx in tqdm(indices):\n",
    "        letter = str(frame.loc[idx, column]).strip().upper()[:1]\n",
    "        tally[letter if letter in ('A', 'B', 'C', 'D') else 'Other'] += 1\n",
    "    return tally\n",
    "\n",
    "\n",
    "sorted_valid_indices = sorted(common_valid_indices)\n",
    "\n",
    "# Distribution of the 'answer' column, read from one fixed file.\n",
    "counts = count_choices(csv_dfs['mmlu_llama8B_new.csv'], 'answer', sorted_valid_indices)\n",
    "print(f\"answer_col: A: {counts['A']}, B: {counts['B']}, C: {counts['C']}, D: {counts['D']}, Other: {counts['Other']}\")\n",
    "\n",
    "# Distribution of the last column of every loaded file.\n",
    "for csv_file in csv_files:\n",
    "    df = csv_dfs[csv_file]\n",
    "    counts = count_choices(df, df.columns[-1], sorted_valid_indices)\n",
    "    print(f\"csv_file: {csv_file}, A: {counts['A']}, B: {counts['B']}, C: {counts['C']}, D: {counts['D']}, Other: {counts['Other']}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e51b30a4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Independent snapshot of the full file list, so that later changes to\n",
    "# `csv_files` cannot alter what the following cells iterate over.\n",
    "all_csv_files = list(csv_files)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b48eca5a",
   "metadata": {},
   "outputs": [],
   "source": [
    "model_accuracy = {}\n",
    "\n",
    "# First file that has an 'answer' column (despite its name, `answer_col`\n",
    "# holds a file name, not a column name).\n",
    "answer_col = next((f for f in all_csv_files if 'answer' in csv_dfs[f].columns), None)\n",
    "if answer_col is None:\n",
    "    raise ValueError(\"no loaded CSV file has an 'answer' column\")\n",
    "answer_df = csv_dfs[answer_col]\n",
    "\n",
    "eval_indices = sorted(common_valid_indices)\n",
    "\n",
    "# First character of each reference answer, looked up once; '' for a blank cell.\n",
    "gold_letters = {\n",
    "    idx: str(answer_df.loc[idx, 'answer']).strip().upper()[:1]\n",
    "    for idx in eval_indices\n",
    "}\n",
    "\n",
    "for csv_file in all_csv_files:\n",
    "    df = csv_dfs[csv_file]\n",
    "    last_col = df.columns[-1]\n",
    "    correct = 0\n",
    "    for idx in eval_indices:\n",
    "        pred = str(df.loc[idx, last_col]).strip().upper()[:1]\n",
    "        # A blank prediction never counts as a match, even against a blank answer.\n",
    "        if pred and pred == gold_letters[idx]:\n",
    "            correct += 1\n",
    "    total = len(eval_indices)\n",
    "    model_accuracy[csv_file] = correct / total if total > 0 else 0\n",
    "\n",
    "# Rank the files by accuracy, best first.\n",
    "sorted_models = sorted(model_accuracy.items(), key=lambda item: item[1], reverse=True)\n",
    "\n",
    "for i, (model, accuracy) in enumerate(sorted_models, 1):\n",
    "    model_name = model.replace('mmlu_', '').replace('_new.csv', '')\n",
    "    print(f\"{i:2d}. {model_name:<20} {accuracy:.4f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e8f6e6b0",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "import re\n",
    "import itertools\n",
    "\n",
    "# Split the result files into model families by a substring of the file name.\n",
    "llama_models = list(filter(lambda name: 'llama' in name, all_csv_files))\n",
    "gpt_models = list(filter(lambda name: 'with_gpt' in name, all_csv_files))\n",
    "qwen25_models = list(filter(lambda name: 'qwen25-' in name, all_csv_files))\n",
    "phi_models = list(filter(lambda name: 'phi' in name, all_csv_files))\n",
    "\n",
    "# Every way of choosing one file per family, ordered (llama, gpt, qwen25, phi).\n",
    "family_groups = (llama_models, gpt_models, qwen25_models, phi_models)\n",
    "all_combinations = list(itertools.product(*family_groups))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fecbc799",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "import random\n",
    "\n",
    "# Ties are broken with random.choice; seed the generator so that re-running\n",
    "# this cell reproduces the same accuracies.\n",
    "RANDOM_SEED = 0\n",
    "random.seed(RANDOM_SEED)\n",
    "\n",
    "CHOICES = ['A', 'B', 'C', 'D']\n",
    "sample_indices = sorted(common_valid_indices)\n",
    "\n",
    "\n",
    "def first_letter(frame, idx, column):\n",
    "    \"\"\"Return the upper-cased first character of ``frame.loc[idx, column]``.\n",
    "\n",
    "    The cell is converted with ``str`` and stripped first; a blank cell\n",
    "    yields '' instead of raising IndexError.\n",
    "    \"\"\"\n",
    "    return str(frame.loc[idx, column]).strip().upper()[:1]\n",
    "\n",
    "\n",
    "# Read every file's last column once, instead of once per combination and\n",
    "# baseline: file name -> {row label -> first letter}.\n",
    "files_in_use = sorted({f for combo in all_combinations for f in combo})\n",
    "predictions = {\n",
    "    f: {idx: first_letter(csv_dfs[f], idx, csv_dfs[f].columns[-1]) for idx in sample_indices}\n",
    "    for f in files_in_use\n",
    "}\n",
    "\n",
    "results = []\n",
    "\n",
    "for combination in all_combinations:\n",
    "    print(f\"\\n{'='*80}\")\n",
    "\n",
    "    # A dedicated name is used on purpose: rebinding the notebook-level\n",
    "    # `csv_files` here would change what earlier cells do when re-run.\n",
    "    combo_files = list(combination)\n",
    "\n",
    "    # First file of the combination that has an 'answer' column.\n",
    "    answer_file = next((f for f in combo_files if 'answer' in csv_dfs[f].columns), None)\n",
    "    if answer_file is None:\n",
    "        continue\n",
    "\n",
    "    answer_df = csv_dfs[answer_file]\n",
    "    answers = {idx: first_letter(answer_df, idx, 'answer') for idx in sample_indices}\n",
    "\n",
    "    # ===== Majority Voting =====\n",
    "    majority_correct = 0\n",
    "    majority_total = 0\n",
    "    majority_answers = []\n",
    "\n",
    "    for idx in tqdm(sample_indices, desc=\"Majority Voting\"):\n",
    "        votes = dict.fromkeys(CHOICES, 0)\n",
    "        for csv_file in combo_files:\n",
    "            val = predictions[csv_file][idx]\n",
    "            if val in votes:\n",
    "                votes[val] += 1\n",
    "            else:\n",
    "                print(f\"idx: {idx}, val: {val}\")\n",
    "\n",
    "        # Most frequent option; ties are broken at random.\n",
    "        max_count = max(votes.values())\n",
    "        maj_ans = random.choice([ans for ans in CHOICES if votes[ans] == max_count])\n",
    "\n",
    "        majority_answers.append(maj_ans)\n",
    "        majority_total += 1\n",
    "        if maj_ans == answers[idx]:\n",
    "            majority_correct += 1\n",
    "\n",
    "    majority_accuracy = majority_correct / majority_total if majority_total > 0 else 0\n",
    "    print(f\"Majority Voting: {majority_accuracy:.4f} ({majority_correct}/{majority_total})\")\n",
    "\n",
    "    # ===== 2nd =====\n",
    "    # Per sample, accumulated over every choice of baseline file:\n",
    "    #   actual / actual_inverse - how many of the other files chose each option\n",
    "    #   expected                - summed conditional frequency of each option given\n",
    "    #                             the option the baseline chose\n",
    "    #   expected_inverse        - the same, averaged over the options the baseline\n",
    "    #                             did not choose\n",
    "    sample_expected_actual = {\n",
    "        idx: {\n",
    "            'expected_inverse': dict.fromkeys(CHOICES, 0.0),\n",
    "            'actual_inverse': dict.fromkeys(CHOICES, 0.0),\n",
    "            'expected': dict.fromkeys(CHOICES, 0.0),\n",
    "            'actual': dict.fromkeys(CHOICES, 0.0),\n",
    "            'baseline_count': 0,\n",
    "        }\n",
    "        for idx in sample_indices\n",
    "    }\n",
    "\n",
    "    for baseline_model_file in tqdm(combo_files, desc=\"Processing baselines\"):\n",
    "        baseline_preds = predictions[baseline_model_file]\n",
    "        other_files = [f for f in combo_files if f != baseline_model_file]\n",
    "\n",
    "        # other file -> {baseline option -> {other option -> conditional frequency}}.\n",
    "        # A row holds None when no counted sample has that baseline option.\n",
    "        correlation_stats = {}\n",
    "        for other_file in other_files:\n",
    "            other_preds = predictions[other_file]\n",
    "            pair_counts = {b: dict.fromkeys(CHOICES, 0) for b in CHOICES}\n",
    "\n",
    "            for idx in sample_indices:\n",
    "                baseline_val = baseline_preds[idx]\n",
    "                other_val = other_preds[idx]\n",
    "                if baseline_val in pair_counts and other_val in pair_counts:\n",
    "                    pair_counts[baseline_val][other_val] += 1\n",
    "                else:\n",
    "                    print(f\"idx: {idx}, baseline_val: {baseline_val}, other_val: {other_val}\")\n",
    "\n",
    "            prob_matrix = {}\n",
    "            for baseline_ans in CHOICES:\n",
    "                total = sum(pair_counts[baseline_ans].values())\n",
    "                prob_matrix[baseline_ans] = {\n",
    "                    other_ans: (pair_counts[baseline_ans][other_ans] / total if total > 0 else None)\n",
    "                    for other_ans in CHOICES\n",
    "                }\n",
    "\n",
    "            correlation_stats[other_file] = prob_matrix\n",
    "\n",
    "        for idx in sample_indices:\n",
    "            baseline_val = baseline_preds[idx]\n",
    "            current_expected = dict.fromkeys(CHOICES, 0)\n",
    "            current_expected_inverse = dict.fromkeys(CHOICES, 0)\n",
    "            current_actual = dict.fromkeys(CHOICES, 0)\n",
    "\n",
    "            for other_file in other_files:\n",
    "                other_val = predictions[other_file][idx]\n",
    "                if other_val in current_actual:\n",
    "                    current_actual[other_val] += 1\n",
    "                else:\n",
    "                    print(f\"idx: {idx}, other_val: {other_val}\")\n",
    "\n",
    "            if baseline_val in CHOICES:\n",
    "                # The options the baseline did not choose (always three of them).\n",
    "                alternatives = [ans for ans in CHOICES if ans != baseline_val]\n",
    "                for other_file in other_files:\n",
    "                    prob_matrix = correlation_stats[other_file]\n",
    "                    for ans in CHOICES:\n",
    "                        if prob_matrix[baseline_val][ans] is not None:\n",
    "                            current_expected[ans] += prob_matrix[baseline_val][ans]\n",
    "                    for baseline_ans in alternatives:\n",
    "                        for ans in CHOICES:\n",
    "                            if prob_matrix[baseline_ans][ans] is not None:\n",
    "                                current_expected_inverse[ans] += prob_matrix[baseline_ans][ans] / len(alternatives)\n",
    "            else:\n",
    "                print(f\"idx: {idx}, baseline_val: {baseline_val}\")\n",
    "\n",
    "            sample_stats = sample_expected_actual[idx]\n",
    "            for ans in CHOICES:\n",
    "                sample_stats['expected_inverse'][ans] += current_expected_inverse[ans]\n",
    "                # The observed vote counts are the same for both variants.\n",
    "                sample_stats['actual_inverse'][ans] += current_actual[ans]\n",
    "                sample_stats['expected'][ans] += current_expected[ans]\n",
    "                sample_stats['actual'][ans] += current_actual[ans]\n",
    "            sample_stats['baseline_count'] += 1\n",
    "\n",
    "    final_predictions = []\n",
    "    final_predictions_inverse = []\n",
    "    final_correct_inverse = 0\n",
    "    final_total_inverse = 0\n",
    "    final_correct = 0\n",
    "    final_total = 0\n",
    "\n",
    "    for sample_idx in sample_indices:\n",
    "        sample_stats = sample_expected_actual[sample_idx]\n",
    "        diff = {\n",
    "            ans: sample_stats['actual'][ans] - sample_stats['expected'][ans]\n",
    "            for ans in CHOICES\n",
    "        }\n",
    "        diff_inverse = {\n",
    "            ans: sample_stats['actual_inverse'][ans] - sample_stats['expected_inverse'][ans]\n",
    "            for ans in CHOICES\n",
    "        }\n",
    "\n",
    "        # Choose the option whose observed votes exceed the expectation the\n",
    "        # most; exact ties are broken at random.\n",
    "        max_diff_val = max(diff.values())\n",
    "        max_diff_inverse_val = max(diff_inverse.values())\n",
    "        final_ans = random.choice([ans for ans in CHOICES if diff[ans] == max_diff_val])\n",
    "        final_ans_inverse = random.choice(\n",
    "            [ans for ans in CHOICES if diff_inverse[ans] == max_diff_inverse_val]\n",
    "        )\n",
    "\n",
    "        final_predictions.append(final_ans)\n",
    "        final_predictions_inverse.append(final_ans_inverse)\n",
    "        answer_val = answers[sample_idx]\n",
    "        final_total += 1\n",
    "        final_total_inverse += 1\n",
    "        if final_ans == answer_val:\n",
    "            final_correct += 1\n",
    "        if final_ans_inverse == answer_val:\n",
    "            final_correct_inverse += 1\n",
    "\n",
    "    final_accuracy = final_correct / final_total if final_total > 0 else 0\n",
    "    final_accuracy_inverse = final_correct_inverse / final_total_inverse if final_total_inverse > 0 else 0\n",
    "\n",
    "    results.append({\n",
    "        'combination': combination,\n",
    "        'majority_accuracy': majority_accuracy,\n",
    "        'inverse_second_method_accuracy': final_accuracy_inverse,\n",
    "        'second_method_accuracy': final_accuracy,\n",
    "        'best_single_model_accuracy': max(model_accuracy[f] for f in combination),\n",
    "    })\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "47b7cc5c",
   "metadata": {},
   "outputs": [],
   "source": [
    "def short_name(file_name):\n",
    "    \"\"\"Remove every 'mmlu_' and '_new.csv' substring from a file name.\"\"\"\n",
    "    return file_name.replace('mmlu_', '').replace('_new.csv', '')\n",
    "\n",
    "\n",
    "# Output column -> position of that family inside a combination tuple.\n",
    "family_positions = [('qwen25', 2), ('gpt', 1), ('llama', 0), ('phi', 3)]\n",
    "metric_columns = [\n",
    "    'inverse_second_method_accuracy',\n",
    "    'second_method_accuracy',\n",
    "    'majority_accuracy',\n",
    "    'best_single_model_accuracy',\n",
    "]\n",
    "\n",
    "csv_data = []\n",
    "for result in results:\n",
    "    combination = result['combination']\n",
    "    row = {family: short_name(combination[pos]) for family, pos in family_positions}\n",
    "    for metric in metric_columns:\n",
    "        row[metric] = result[metric]\n",
    "\n",
    "    # One column per known file: its single-model accuracy when the file is\n",
    "    # part of this combination, otherwise None.\n",
    "    for model_file in all_csv_files:\n",
    "        in_combo = model_file in combination\n",
    "        row[short_name(model_file)] = model_accuracy[model_file] if in_combo else None\n",
    "\n",
    "    csv_data.append(row)\n",
    "\n",
    "df_results = pd.DataFrame(csv_data)\n",
    "\n",
    "output_filename = 'four_mmlu_M2.csv'\n",
    "df_results.to_csv(output_filename, index=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bd35b4a9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flag combinations whose gain of the inverse second method over majority\n",
    "# voting is below MIN_GAIN. Setting MIN_GAIN to 0 flags only strict losses;\n",
    "# -0.0001 and -0.001 are stricter tolerances.\n",
    "MIN_GAIN = 0.0001\n",
    "gain_over_majority = df_results['inverse_second_method_accuracy'] - df_results['majority_accuracy']\n",
    "mask = gain_over_majority < MIN_GAIN\n",
    "\n",
    "bad_rows = df_results.loc[mask, ['qwen25', 'gpt', 'llama', 'phi']]\n",
    "\n",
    "print(bad_rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8da38137",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Combinations where the inverse second method scores strictly below the\n",
    "# best single model of the combination.\n",
    "inverse_acc = df_results['inverse_second_method_accuracy']\n",
    "best_single_acc = df_results['best_single_model_accuracy']\n",
    "mask = inverse_acc.sub(best_single_acc).lt(0)\n",
    "\n",
    "family_columns = ['qwen25', 'gpt', 'llama', 'phi']\n",
    "bad_rows = df_results.loc[mask, family_columns]\n",
    "\n",
    "print(bad_rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ae762299",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Combinations where the inverse variant scores strictly below the plain\n",
    "# second method.\n",
    "inverse_minus_plain = df_results['inverse_second_method_accuracy'].sub(\n",
    "    df_results['second_method_accuracy']\n",
    ")\n",
    "mask = inverse_minus_plain.lt(0)\n",
    "\n",
    "bad_rows = df_results.loc[mask, ['qwen25', 'gpt', 'llama', 'phi']]\n",
    "\n",
    "print(bad_rows)"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
