{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import json \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Amy Van Dyken\n",
      "Billie Jean King\n",
      "Evonne Goolagong Cawley\n",
      "Babe Didrikson Zaharias\n",
      "Michael Johnson\n",
      "Rafer Johnson\n",
      "Billie Jean King\n",
      "O. J. Simpson\n",
      "Florence Griffith Joyner\n",
      "Ben Johnson\n",
      "Mary Lou Retton\n",
      "Valerie Briscoe Hooks\n",
      "Nawal El Moutawakel\n",
      "Robert de Castella\n",
      "Billie Jean King\n",
      "Evonne Goolagong Cawley\n",
      "Helen Willis Moody\n",
      "Babe Didrikson Zaharias\n",
      "Evonne Goolagong Cawley\n",
      "Billie Jean King\n",
      "Helen Willis Moody\n",
      "Babe Didrikson Zaharias\n",
      "Billie Jean King\n",
      "Evonne Goolagong Cawley\n",
      "Billie Jean King\n",
      "Babe Didrikson Zaharias\n"
     ]
    }
   ],
   "source": [
    "def check(lastname):\n",
    "    \"\"\"Return True when `lastname` is listed in nobel_names.csv.\n",
    "\n",
    "    The file is read again on every call. Each line is stripped and\n",
    "    lower-cased, and `lastname` is lower-cased before the lookup, so the\n",
    "    match is case-insensitive.\n",
    "    \"\"\"\n",
    "    with open(\"nobel_names.csv\", \"r\") as handle:\n",
    "        known = {line.strip().lower() for line in handle}\n",
    "    return lastname.lower() in known\n",
    "    \n",
    "def get_chemical(name):\n",
    "    \"\"\"Return the symbol stored in elements.csv for the element called `name`.\n",
    "\n",
    "    `name` is lower-cased and stripped, then compared with the lower-cased\n",
    "    \"Element\" column; the \"Symbol\" value of the first matching row is returned.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if no row matches (the message carries the normalized name).\n",
    "    \"\"\"\n",
    "    name = name.lower().strip()\n",
    "    with open(\"elements.csv\", \"r\") as handle:\n",
    "        table = pd.read_csv(handle)\n",
    "    matches = table[table[\"Element\"].str.lower() == name]\n",
    "    if matches.empty:\n",
    "        raise ValueError(f\"Element {name} not found\")\n",
    "    return matches[\"Symbol\"].values[0]\n",
    "    \n",
    "\n",
    "# CoverageQA.csv lists several answers for each question (the same question appears on several rows),\n",
    "# so get_questions() below groups the rows by question into a dictionary of the form\n",
    "#   {question: {\"answers\": [[spelling1, spelling2, ...], ...], \"category\": category}}\n",
    "# where each inner list holds the accepted spellings of one answer.\n",
    "df = pd.read_csv(\"CoverageQA.csv\")\n",
    "def _is_any_of(*names):\n",
    "    \"\"\"Return a predicate that is true when a name equals one of `names`.\"\"\"\n",
    "    return lambda name: name in names\n",
    "\n",
    "\n",
    "def _contains(fragment):\n",
    "    \"\"\"Return a predicate that is true when `fragment` occurs in a name.\"\"\"\n",
    "    return lambda name: fragment in name\n",
    "\n",
    "\n",
    "# Ordered (predicate, spellings) rules tried on every raw answer, whatever the\n",
    "# question category; the first matching rule replaces the answer.\n",
    "_COUNTRY_RULES = (\n",
    "    (_contains(\"Palestine\"), [\"State of Palestine\", \"Palestinian territories\", \"Palestine\"]),\n",
    "    (_is_any_of(\"United States of America\"), [\"United States\", \"USA\", \"America\"]),\n",
    "    (_is_any_of(\"United Kingdom\"), [\"UK\", \"England\", \"Great Britain\", \"Britain\"]),\n",
    "    (_contains(\"Ireland\"), [\"Ireland\", \"Republic of Ireland\", \"Irish Republic\"]),\n",
    "    (_is_any_of(\"South Korea\"), [\"South Korea\", \"Republic of Korea\"]),\n",
    "    (_contains(\"China\"), [\"China\", \"People's Republic of China\"]),\n",
    "    (_is_any_of(\"United Arab Emirates\"), [\"United Arab Emirates\", \"UAE\", \"Emirates\"]),\n",
    "    (_is_any_of(\"Laos\"), [\"Laos\", \"Lao People's Democratic Republic\"]),\n",
    ")\n",
    "\n",
    "# Multi-park site that is split into its individual parks.\n",
    "_WRANGELL_SITE = \"Wrangell-St.Elias/Glacier Bay/Tatshenshini-Alsek/Kluane National Parks and Reserves\"\n",
    "_WRANGELL_PARKS = [\"Wrangell-St. Elias National Park\", \"Glacier Bay National Park\", \"Tatshenshini-Alsek National Park\", \"Kluane National Park\"]\n",
    "\n",
    "# Ordered (predicate, spellings) rules for physicists whose accepted spellings\n",
    "# are written by hand; the first matching rule wins.\n",
    "_PHYSICIST_RULES = (\n",
    "    (lambda name: name.startswith(\"Philip\") and name.endswith(\"Anderson\"),\n",
    "     [\"Philip W. Anderson\", \"Philip Anderson\", \"Philip Warren Anderson\", \"Philip W Anderson\"]),\n",
    "    (_is_any_of(\"Tomonaga Shin'ichiro\", \"Sin-Itiro Tomonaga\"),\n",
    "     [\"Tomonaga Shin'ichiro\", \"Shinichiro Tomonaga\", \"Shin'ichiro Tomonaga\", \"Tomonaga\", \"Sin-Itiro Tomonaga\", \"Tomonaga Shinichiro\"]),\n",
    "    (_is_any_of(\"John Strutt\"),\n",
    "     [\"John William Strutt, 3rd Baron Rayleigh\", \"John William Strutt\", \"Strutt\", \"John Strutt\", \"Lord Rayleigh\", \"Rayleigh\"]),\n",
    "    (_is_any_of(\"Michael Kosterlitz\"),\n",
    "     [\"Michael Kosterlitz\"]),\n",
    "    (_is_any_of(\"John M. Kosterlitz\"),\n",
    "     [\"John M. Kosterlitz\", \"John Kosterlitz\", \"John M Kosterlitz\"]),\n",
    "    (_is_any_of(\"William Henry Bragg\"),\n",
    "     [\"William Henry Bragg\", \"William H. Bragg\", \"William H Bragg\", \"W. H. Bragg\", \"W. H Bragg\"]),\n",
    "    (_is_any_of(\"William Lawrence Bragg\"),\n",
    "     [\"William Lawrence Bragg\", \"William L. Bragg\", \"William L Bragg\", \"W. L. Bragg\", \"W. L Bragg\"]),\n",
    "    (_is_any_of(\"J. J. Thomson\"),\n",
    "     [\"J. J. Thomson\", \"Joseph John Thomson\", \"J.J. Thomson\", \"John Thomson\"]),\n",
    "    (_contains(\"Aage\"),\n",
    "     [\"Aage N Bohr\", \"Aage N. Bohr\", \"Aage Niels Bohr\", \"Aage Bohr\"]),\n",
    "    (_contains(\"Vleck\"),\n",
    "     [\"John Hasbrouck Van Vleck\", \"John H. Van Vleck\", \"John Van Vleck\", \"Van Vleck\"]),\n",
    "    (_contains(\"Cecil Frank Powell\"),\n",
    "     [\"Cecil Frank Powell\", \"Cecil Powell\", \"C. F. Powell\", \"Powell\"]),\n",
    "    (_contains(\"Simon van der Meer\"),\n",
    "     [\"van der Meer\", \"Simon Van Der Meer\"]),\n",
    "    (_contains(\"J. Hans D. Jensen\"),\n",
    "     [\"Johannes Hans Daniel Jensen\", \"Jensen\", \"J. Hans D. Jensen\"]),\n",
    "    (_contains(\"Charles Thomson Rees Wilson\"),\n",
    "     [\"Charles Thomson Rees Wilson\", \"C. T. R. Wilson\", \"CTR Wilson\", \"C T R Wilson\", \"Charles Wilson\"]),\n",
    "    (_contains(\"Norman Foster Ramsey Jr\"),\n",
    "     [\"Norman Foster Ramsey Jr\", \"Norman Ramsey\", \"Ramsey\", \"Norman F Ramsey Jr\", \"Norman F. Ramsey Jr\", \"Norman F. Ramsey\", \"Norman F. Ramsey Jr.\", \"Norman Ramsey Jr.\", \"Ramsey Jr.\", \"Ramsey Jr\"]),\n",
    "    (_contains(\"Johannes Diderik van der Waals\"),\n",
    "     [\"Johannes Diderik van der Waals\", \"Johannes van der Waals\", \"van der Waals\"]),\n",
    "    (_contains(\"Martinus J. G. Veltman\"),\n",
    "     [\"Martinus J. G. Veltman\", \"Martinus Veltman\", \"Veltman\", \"Martinus Justinus Godefriedus Veltman\"]),\n",
    "    (_contains(\"Dirac\"),\n",
    "     [\"Paul Dirac\", \"Dirac\", \"Paul Adrien Maurice Dirac\"]),\n",
    "    (_contains(\"Ting\"),\n",
    "     [\"Samuel C. C. Ting\", \"Samuel Ting\", \"Ting\", \"Chao Chung Ting\", \"Ting Chao Chung\", \"Samuel C.C. Ting\", \"Samuel C C Ting\"]),\n",
    "    (_contains(\"J. Robert Oppenheimer\"),\n",
    "     [\"J. Robert Oppenheimer\", \"Robert Oppenheimer\", \"Oppenheimer\", \"J. R. Oppenheimer\"]),\n",
    "    (_contains(\"Hooft\"),\n",
    "     [\"Gerard 't Hooft\", \"Gerard t Hooft\", \"Gerard 't Hooft\", \"Gerard t. Hooft\", \"Hooft\", \"Gerard Hooft\", \"Gerardus 't Hooft\", \"Gerardus t Hooft\", \"Gerardus Hooft\"]),\n",
    "    (_contains(\"Tsung-Dao Lee\"),\n",
    "     [\"Tsung-Dao Lee\", \"Tsung Dao Lee\", \"Lee\", \"TD Lee\", \"T. D. Lee\"]),\n",
    "    (_contains(\"Leggett\"),\n",
    "     [\"Anthony Leggett\", \"Leggett\", \"Anthony J. Leggett\", \"Anthony James Leggett\", \"Tony Leggett\"]),\n",
    "    (_contains(\"Glashow\"),\n",
    "     [\"Sheldon Glashow\", \"Glashow\", \"Sheldon Lee Glashow\"]),\n",
    "    (_contains(\"Carl David Anderson\"),\n",
    "     [\"Carl David Anderson\", \"Carl Anderson\", \"Carl D. Anderson\", \"Carl D Anderson\"]),\n",
    ")\n",
    "\n",
    "\n",
    "def _first_matching_spellings(rules, name):\n",
    "    \"\"\"Return a copy of the spellings of the first rule matching `name`, else None.\"\"\"\n",
    "    for matches, spellings in rules:\n",
    "        if matches(name):\n",
    "            return list(spellings)\n",
    "    return None\n",
    "\n",
    "\n",
    "def _expand_country(name):\n",
    "    \"\"\"Return the country spellings for `name`, or [name] when no rule matches.\"\"\"\n",
    "    spellings = _first_matching_spellings(_COUNTRY_RULES, name)\n",
    "    return [name] if spellings is None else spellings\n",
    "\n",
    "\n",
    "def _expand_park(answer):\n",
    "    \"\"\"Add the short form (without \"National Park\") of every park name in `answer`.\"\"\"\n",
    "    if \"National Park\" not in answer[0]:\n",
    "        return answer\n",
    "    if answer[0] == _WRANGELL_SITE:\n",
    "        answer = list(_WRANGELL_PARKS)\n",
    "    answer = answer + [name.replace(\"National Park\", \"\").strip() for name in answer]\n",
    "    # NOTE(review): this only runs for names containing both \"National Park\" and\n",
    "    # \"National Monument\"; the nesting is kept as found - confirm it is intended.\n",
    "    if \"National Monument\" in answer[0]:\n",
    "        answer.append(answer[0].replace(\"National Monument\", \"\").strip())\n",
    "    return answer\n",
    "\n",
    "\n",
    "def _categorize(question):\n",
    "    \"\"\"Infer the category of `question` from keywords in its text.\"\"\"\n",
    "    if \"country\" in question:\n",
    "        return \"country\"\n",
    "    if \"music\" in question:\n",
    "        return \"instrument\"\n",
    "    if \"chemical\" in question:\n",
    "        return \"chemical\"\n",
    "    if \"Associated Press Athlete of the Year\" in question:\n",
    "        return \"athlete\"\n",
    "    lowered = question.lower()\n",
    "    if \"physic\" in lowered:\n",
    "        return \"physic\"\n",
    "    if \"park\" in lowered:\n",
    "        return \"park\"\n",
    "    raise NotImplementedError(f\"Cannot infer a category for question: {question!r}\")\n",
    "\n",
    "\n",
    "def _expand_chemical(answer):\n",
    "    \"\"\"Add the alternative spelling (cesium/tungsten) and the element symbol.\"\"\"\n",
    "    element = answer[0].lower()\n",
    "    if element == \"cesium\":\n",
    "        answer = [\"cesium\", \"caesium\"]\n",
    "    elif element == \"tungsten\":\n",
    "        answer = [\"wolfram\", \"tungsten\"]\n",
    "    # Try every spelling in order: the first one stays authoritative, the others\n",
    "    # are a fallback when get_chemical only knows the element under another name.\n",
    "    first_error = None\n",
    "    for spelling in answer:\n",
    "        try:\n",
    "            return answer + [get_chemical(spelling)]\n",
    "        except ValueError as error:\n",
    "            if first_error is None:\n",
    "                first_error = error\n",
    "    raise first_error\n",
    "\n",
    "\n",
    "def _expand_athlete(answer):\n",
    "    \"\"\"Accept the bare last name for two-word names, except for the Johnsons.\"\"\"\n",
    "    full_name = answer[0]\n",
    "    parts = full_name.split(\" \")\n",
    "    if len(parts) == 2 and \"Johnson\" not in full_name:\n",
    "        return [parts[-1], full_name]\n",
    "    # Report the names that are kept as a single spelling.\n",
    "    print (full_name)\n",
    "    return answer\n",
    "\n",
    "\n",
    "def _expand_physicist(answer):\n",
    "    \"\"\"Return the accepted spellings of a physicist's name.\n",
    "\n",
    "    Hand-written rules are tried first. Otherwise the last name is checked with\n",
    "    check() (a miss is only reported) and variants are derived for two- and\n",
    "    three-word names; other names are returned unchanged.\n",
    "    \"\"\"\n",
    "    full_name = answer[0]\n",
    "    spellings = _first_matching_spellings(_PHYSICIST_RULES, full_name)\n",
    "    if spellings is not None:\n",
    "        return spellings\n",
    "\n",
    "    parts = full_name.split(\" \")\n",
    "    if not check(parts[-1]):\n",
    "        print (f\"Name {full_name} not in file\")\n",
    "\n",
    "    if len(parts) == 2:\n",
    "        return [parts[-1], full_name]\n",
    "    if len(parts) == 3:\n",
    "        first, middle, last = parts\n",
    "        if middle in (\"von\", \"de\", \"van\"):\n",
    "            return [f\"{first} {last}\", full_name, last, f\"{middle} {last}\"]\n",
    "        initial = middle[0]\n",
    "        # The initial forms are added whether the middle name is spelled out or\n",
    "        # already an initial; repeated spellings are removed by the caller.\n",
    "        return [f\"{first} {last}\", last, full_name, f\"{first} {initial}. {last}\", f\"{first} {initial.upper()} {last}\"]\n",
    "    return answer\n",
    "\n",
    "\n",
    "def get_questions(df):\n",
    "    \"\"\"Group the (question, answer) rows of `df` by question.\n",
    "\n",
    "    Every answer is expanded into a list of accepted spellings: country and\n",
    "    national-park rules are applied to all rows, then category-specific rules\n",
    "    run for chemical elements (alternative spelling plus the symbol from\n",
    "    get_chemical), athletes (last name for two-word names) and physicists\n",
    "    (hand-written table or variants derived from the full name).\n",
    "\n",
    "    Args:\n",
    "        df: DataFrame with a \"question\" and an \"answer\" column.\n",
    "\n",
    "    Returns:\n",
    "        dict mapping each question to\n",
    "        {\"answers\": [[spelling, ...], ...], \"category\": category}.\n",
    "        Spellings are de-duplicated in a stable order, so repeated runs\n",
    "        produce the same result.\n",
    "\n",
    "    Raises:\n",
    "        NotImplementedError: if no category can be inferred from a question.\n",
    "        ValueError: if the same raw answer is listed twice for one question,\n",
    "            or if get_chemical knows none of the spellings of an element.\n",
    "    \"\"\"\n",
    "    questions = {}\n",
    "    raw_seen = {}  # question -> raw answers already recorded for it\n",
    "    for _, row in df.iterrows():\n",
    "        question = row[\"question\"]\n",
    "        raw_answer = row[\"answer\"]\n",
    "\n",
    "        answer = _expand_park(_expand_country(raw_answer))\n",
    "        category = _categorize(question)\n",
    "        if category == \"chemical\":\n",
    "            answer = _expand_chemical(answer)\n",
    "        elif category == \"athlete\":\n",
    "            answer = _expand_athlete(answer)\n",
    "        elif category == \"physic\":\n",
    "            answer = _expand_physicist(answer)\n",
    "\n",
    "        # Duplicates are detected on the raw value from the data, per question;\n",
    "        # comparing a spelling with the stored lists of spellings never matches.\n",
    "        recorded = raw_seen.setdefault(question, set())\n",
    "        if raw_answer in recorded:\n",
    "            raise ValueError(f\"Answer {answer} is a duplicate\")\n",
    "        recorded.add(raw_answer)\n",
    "\n",
    "        entry = questions.setdefault(question, {\"answers\": [], \"category\": category})\n",
    "        # dict.fromkeys drops repeated spellings but, unlike set(), keeps a\n",
    "        # deterministic order.\n",
    "        entry[\"answers\"].append(list(dict.fromkeys(answer)))\n",
    "    return questions\n",
    "\n",
    "qa = get_questions(df)\n",
    "\n",
    "# Write the grouped questions; the context manager closes the file even if\n",
    "# json.dump fails, instead of leaving the handle open.\n",
    "with open(\"CoverageQA.json\", \"w\") as out_file:\n",
    "    json.dump(qa, out_file, indent=4)\n",
    "\n",
    "\n",
    "# Sanity check: within each athlete question, the shortest spelling of every\n",
    "# answer should be unique; collisions are printed.\n",
    "# NOTE: the variable keeps its existing name `physics` although it holds the\n",
    "# questions of the \"athlete\" category.\n",
    "physics = { k: v for k, v in qa.items() if v[\"category\"] == \"athlete\" }\n",
    "\n",
    "for v in physics.values():\n",
    "    # shortest accepted spelling of each answer (first one on ties)\n",
    "    answers = v['answers']\n",
    "    shortest = [min(a, key=len) for a in answers]\n",
    "    if len(shortest) != len(set(shortest)):\n",
    "        # collect the spellings that occur more than once\n",
    "        seen = set()\n",
    "        duplicates = set()\n",
    "        for x in shortest:\n",
    "            if x in seen:\n",
    "                duplicates.add(x)\n",
    "            else:\n",
    "                seen.add(x)\n",
    "        print (duplicates)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export the prompts (the keys of `qa`) as a single-column CSV.\n",
    "new_df = pd.DataFrame({\"prompt\": list(qa.keys())})\n",
    "new_df.to_csv(\"./coverageQA/prompts_final.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "openai_env",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
