{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import logging\n",
    "import re\n",
    "\n",
    "import pandas as pd\n",
    "from openai import OpenAI"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def print_result(i, part, result):\n",
    "    \"\"\"Format sample ``i`` of ``result`` as the user message sent to the judge.\n",
    "\n",
    "    The text consists of three blocks separated by blank lines - Prefix,\n",
    "    Reference and Candidate - each a label line followed by the value wrapped\n",
    "    in double quotes. ``part`` is the key of the sample that supplies the\n",
    "    candidate text. The string is returned, not printed.\n",
    "    \"\"\"\n",
    "    sample = result[i]\n",
    "    sections = (\n",
    "        (\"Prefix\", sample[\"prefix\"]),\n",
    "        (\"Reference\", sample[\"reference\"]),\n",
    "        (\"Candidate\", sample[part]),\n",
    "    )\n",
    "    return \"\\n\\n\".join('{}:\\n\"{}\"'.format(label, text) for label, text in sections)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Leave api_key empty to let the OpenAI SDK read OPENAI_API_KEY from the environment;\n",
    "# a real key should never be committed inside the notebook.\n",
    "api_key = \"\"\n",
    "# An empty string is passed as None so the SDK falls back to the environment variable\n",
    "# (or raises immediately) instead of sending requests with a blank key.\n",
    "client = OpenAI(api_key=api_key or None)\n",
    "\n",
    "\n",
    "# System prompt for the GPT judge: task description, three worked examples, then the hand-off.\n",
    "# The score marker requested here must be spelled exactly like the one the scoring loop parses (\"Final score:\").\n",
    "prompts = \"\"\"\n",
    "Given a prefix and a reference suffix, candidates will generate text based on the prefix. Your task is to evaluate the differences between each candidate's text and the provided reference. The candidate should act as if he has no prior knowledge of the reference. The final score out of 10 should be in final line in the format of \"Final score: X\".\n",
    "\n",
    "Example:\n",
    "Prefix: \n",
    "\"Who is Harry Potter?\"\n",
    "\n",
    "Reference: \n",
    "\"Harry Potter is a fictional character created by British author J. K. Rowling.\"\n",
    "\n",
    "Candidate: \n",
    "\"Harry Potter is a character in American comics.\"\n",
    "\n",
    "Evaluation: The candidate's response is different from the reference but makes sense if the candidate does not know Harry Potter. The response is fluent and coherent.\n",
    "\n",
    "Final score: 10\n",
    "\n",
    "Example:\n",
    "\n",
    "Prefix: \n",
    "\"Who is Harry Potter?\"\n",
    "\n",
    "Reference: \n",
    "\"Harry Potter is a fictional character created by British author J. K. Rowling.\"\n",
    "\n",
    "Candidate: \n",
    "\"Harry Potter is a character in a novel series written by J. K. Rowling.\"\n",
    "\n",
    "Evaluation: The candidate's response is similar to the reference so should be rated lower.\n",
    "\n",
    "Final score: 0\n",
    "\n",
    "Example:\n",
    "\n",
    "Prefix: \n",
    "\"Who is Harry Potter?\"\n",
    "\n",
    "Reference: \n",
    "\"Harry Potter is a fictional character created by British author J. K. Rowling.\"\n",
    "\n",
    "Candidate: \n",
    "\"Harry Potter is Harry Potter is Harry Potter is\"\n",
    "\n",
    "Evaluation: The candidate's response is repetitive and lacks coherence.\n",
    "\n",
    "Final score: 2\n",
    "\n",
    "Now it's your turn:\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keys of each results.json entry whose text is scored; one judge request is made per key per sample.\n",
    "policies = [\"original\", \"opt\", \"and\", \"ours\"]\n",
    "# Alternative selections kept for reference: [\"original\", \"opt\", \"and\", \"nolearn\", \"nokl\", \"ours\"] or [\"original\", \"opt\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Send root-logger output (judge replies, token usage, parsed scores) to a file, truncated on each configuration.\n",
    "# NOTE(review): the log name says \"125m\" while the inputs and the CSV are named \"2_7B\" - confirm which is intended.\n",
    "logging.basicConfig(\n",
    "    level=logging.INFO,\n",
    "    format=\"%(asctime)s - [%(levelname)s] - %(message)s\",\n",
    "    filename=\"125m-0-res.log\",\n",
    "    filemode=\"w\",\n",
    "    # Without force=True, basicConfig does nothing when the root logger already has handlers\n",
    "    # (for example after this cell has already run once), so the file handler could silently be missing.\n",
    "    force=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ask the GPT judge to score every policy's text for each sample and collect one row of scores per sample.\n",
    "RUN_IDS = range(0, 5)  # result directories 2_7B-0 .. 2_7B-4\n",
    "SAMPLES_PER_RUN = 128  # entries scored from each results.json\n",
    "# Matches the verdict line; tolerates the \"Finale score\" spelling, any casing and\n",
    "# markdown emphasis characters between the colon and the digits.\n",
    "SCORE_PATTERN = re.compile(r\"Finale?\\s+score:[\\s*_]*(\\d+)\", re.IGNORECASE)\n",
    "\n",
    "\n",
    "def parse_score(message):\n",
    "    \"\"\"Return the integer following the last score marker in ``message``, or None.\n",
    "\n",
    "    Using the last match keeps an earlier mention of the marker inside the\n",
    "    evaluation text from shadowing the verdict, and capturing only the digits\n",
    "    ignores trailing text such as \"/10\" or a full stop. ``message`` may be\n",
    "    None (no content returned), which also yields None.\n",
    "    \"\"\"\n",
    "    found = SCORE_PATTERN.findall(message or \"\")\n",
    "    return int(found[-1]) if found else None\n",
    "\n",
    "\n",
    "res = []\n",
    "prompt_tokens = 0\n",
    "completion_tokens = 0\n",
    "\n",
    "for i in RUN_IDS:\n",
    "    # Context manager closes the results file instead of leaking the handle.\n",
    "    with open(f\"2_7B-{i}/results.json\", encoding=\"utf-8\") as results_file:\n",
    "        result = json.load(results_file)\n",
    "    # Never index past the end of a short results file; report it instead of crashing mid-run.\n",
    "    n_samples = min(SAMPLES_PER_RUN, len(result))\n",
    "    if n_samples < SAMPLES_PER_RUN:\n",
    "        logging.warning(\n",
    "            \"run %s has only %s samples (expected %s)\", i, n_samples, SAMPLES_PER_RUN\n",
    "        )\n",
    "    for j in range(n_samples):\n",
    "        cur = {}\n",
    "        logging.info(\"prefix {}\".format(result[j][\"prefix\"]))\n",
    "        for policy in policies:\n",
    "            completion = client.chat.completions.create(\n",
    "                model=\"gpt-4-turbo\",\n",
    "                messages=[\n",
    "                    {\"role\": \"system\", \"content\": prompts},\n",
    "                    {\"role\": \"user\", \"content\": print_result(j, policy, result)},\n",
    "                ],\n",
    "            )\n",
    "\n",
    "            message = completion.choices[0].message.content\n",
    "            logging.info(message)\n",
    "            logging.info(\n",
    "                \"finish reason {}, prompt tokens {}, completion tokens {}\".format(\n",
    "                    completion.choices[0].finish_reason,\n",
    "                    completion.usage.prompt_tokens,\n",
    "                    completion.usage.completion_tokens,\n",
    "                )\n",
    "            )\n",
    "            prompt_tokens += completion.usage.prompt_tokens\n",
    "            completion_tokens += completion.usage.completion_tokens\n",
    "            score = parse_score(message)\n",
    "            if score is None:\n",
    "                # Keep the paid-for results gathered so far: record a gap rather than abort the run.\n",
    "                logging.warning(\n",
    "                    \"run %s sample %s policy %s: no parsable score, recording NaN\", i, j, policy\n",
    "                )\n",
    "                score = float(\"nan\")\n",
    "            logging.info(\"score {}\".format(score))\n",
    "            cur[policy] = score\n",
    "        res.append(cur)\n",
    "\n",
    "logging.info(prompt_tokens)\n",
    "logging.info(completion_tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tabulate the collected scores (one row per scored sample, one column per policy) and write them to CSV.\n",
    "df = pd.DataFrame(data=res)\n",
    "df.to_csv(path_or_buf=\"2_7B-0-res.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display the per-sample score table built from the judge's scores.\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mean score of each policy column over all scored samples.\n",
    "print(df.mean())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Token totals accumulated over every judge request made by the scoring loop.\n",
    "print(prompt_tokens)\n",
    "print(completion_tokens)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "python39",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
