{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "73ae176b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import pandas as pd\n",
    "from openai import OpenAI\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c001abf3",
   "metadata": {},
   "outputs": [],
   "source": [
    "results_base_dir = \"Results\"\n",
    "n_shots = [1, 3]\n",
    "\n",
    "model_names = [\n",
    "    \"InternVL3.5-4B\",\n",
    "    \"InternVL3.5-8B\",\n",
    "    \"InternVL3.5-30B\",\n",
    "    \"InstructBlip-vicuna-7B\"\n",
    "    \"llava-1.5-7B-hf\",\n",
    "    \"Qwen2.5-VL-3B\",\n",
    "    \"Qwen2.5-VL-7B\",\n",
    "    \"Qwen2.5-VL-32B\",\n",
    "    \"Qwen3-VL-4B\",\n",
    "    \"Qwen3-VL-8B\"\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "a415f015",
   "metadata": {},
   "outputs": [],
   "source": [
    "def all_to_lower_case(results_base_dir):\n",
    "    for root, dirs, files in os.walk(results_base_dir):\n",
    "        for file in files:\n",
    "            os.rename(os.path.join(root, file), os.path.join(root, file.lower()))\n",
    "        for dir in dirs:\n",
    "            os.rename(os.path.join(root, dir), os.path.join(root, dir.lower()))\n",
    "\n",
    "def rename_csv_files(results_base_dir):\n",
    "    for root, dirs, files in os.walk(results_base_dir):\n",
    "        for file in files:\n",
    "            if file.endswith(\".csv\"):\n",
    "                os.rename(os.path.join(root, file), os.path.join(root, \"results.csv\"))\n",
    "\n",
    "\n",
    "all_to_lower_case(results_base_dir)\n",
    "rename_csv_files(results_base_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7a8103a9",
   "metadata": {},
   "outputs": [],
   "source": [
    "client = OpenAI(\n",
    "    api_key='...',\n",
    "    base_url=\"...\",\n",
    ")\n",
    "\n",
    "def llm_judge(ground_truth: str, prediction: str) -> float:\n",
    "\n",
    "    prompt = f\"\"\"\n",
    "    You are an expert evaluator for Rebus puzzles. \n",
    "    Your task is to compare a 'Ground Truth' answer with a 'Predicted' answer.\n",
    "    \n",
    "    Ground Truth: \"{ground_truth}\"\n",
    "    Predicted: \"{prediction}\"\n",
    "    \n",
    "    Scoring Criteria:\n",
    "    - Score 1.0: Perfect match or semantically identical (e.g., \"Middle-aged\" vs \"middle aged\", \"Apple\" vs \"Apples\").\n",
    "    - Score 0.0: Completely unrelated.\n",
    "    - Otherwise: Based on the level of capturing the core concept in ground truth and partially correctness.\n",
    "    \n",
    "    Return ONLY a single numerical float between 0.0 and 1.0. \n",
    "    No explanations, no text.\n",
    "    \"\"\"\n",
    "\n",
    "    try:\n",
    "        response = client.chat.completions.create(\n",
    "            model=\"gpt-4o\",\n",
    "            messages=[\n",
    "                {\"role\": \"system\", \"content\": \"You are a precise grading assistant.\"},\n",
    "                {\"role\": \"user\", \"content\": prompt}\n",
    "            ],\n",
    "            temperature=0\n",
    "        )\n",
    "        score_str = response.choices[0].message.content.strip()\n",
    "        return float(score_str)\n",
    "        \n",
    "    except Exception as e:\n",
    "        print(f\"Error during API call: {e}\")\n",
    "        return 0.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "30d9790e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_scores(model_name, n_shot):\n",
    "    print(f\"Shot: {n_shot}\")\n",
    "    print(f\"Model Name: {model_name}\")\n",
    "    result_dir = f\"{results_base_dir}/Result {n_shot} shot/{model_name}/results.csv\"\n",
    "    results = pd.read_csv(result_dir)\n",
    "    ground_truth_list = results[\"gt_answer\"].tolist()\n",
    "    predicted_answer_list = results[\"pred_answer\"].tolist()\n",
    "    puzzle_id_list = results[\"puzzle_id\"].tolist()\n",
    "    llm_scores = []\n",
    "    em_scores = []\n",
    "    for i in tqdm(range(len(puzzle_id_list))):\n",
    "        llm_scores.append(llm_judge(ground_truth_list[i], predicted_answer_list[i]))\n",
    "        em_scores.append(1 if ground_truth_list[i] == predicted_answer_list[i] else 0)\n",
    "    print(f\"N Samples: {len(puzzle_id_list)}\")\n",
    "    print(f\"LLM Score: {sum(llm_scores)/len(llm_scores)}\")\n",
    "    print(f\"EM Score: {sum(em_scores)/len(em_scores)}\")\n",
    "    results_df = pd.DataFrame(data={\"puzzle_id\": puzzle_id_list,\n",
    "                                    \"gt\": ground_truth_list,\n",
    "                                    \"predicted\": predicted_answer_list,\n",
    "                                    \"llm_score\": llm_scores,\n",
    "                                    \"em_score\": em_scores})\n",
    "    os.makedirs(f\"evaluation_results/{n_shot}/{model_name}\", exist_ok=True)\n",
    "    results_df.to_csv(f\"judge_results/{n_shot}/{model_name}/llm_scores.csv\")\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9feea079",
   "metadata": {},
   "outputs": [],
   "source": [
    "for n_shot in n_shots:\n",
    "    for model_name in model_names:\n",
    "            get_scores(model_name, n_shot)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
