{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "xLEHHEBbLzIv"
      },
      "outputs": [],
      "source": [
        "from huggingface_hub import hf_hub_download\n",
        "from google import genai\n",
        "from google.genai import types\n",
        "from google.colab import userdata\n",
        "import pandas as pd\n",
        "import requests\n",
        "import json\n",
        "import os\n",
        "import time\n",
        "import numpy as np\n",
        "import pickle\n",
        "\n",
        "# Gemini API key, fetched through google.colab's userdata helper instead of being hard-coded.\n",
        "GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')\n",
        "client = genai.Client(api_key=GOOGLE_API_KEY)\n",
        "\n",
        "# System instruction sent with every judge request (see chat below). The prompts being judged\n",
        "# come from the NECTAR dataset, which carries GPT-4 rankings of the same prompt/answer pairs.\n",
        "context = (\n",
        "    \"You are an impartial judge. \"\n",
        "    \"Given an input conversation and 6 assistant responses, rank the responses from best to worst according to this rubric: \"\n",
        "    \"relevance to the request, factual accuracy, creativity when appropriate (or analytical correctness when expected), and sufficient detail. \"\n",
        "    \"Break ties randomly. \"\n",
        "    \"Avoid bias from response order, length, or assistant names. \"\n",
        "    \"Output only the final rankings of models 0 to 5 in the form of a list.\"\n",
        ")\n",
        "\n",
        "def chat(model, prompt):\n",
        "    \"\"\"Ask `model` to rank the candidate responses contained in `prompt`.\n",
        "\n",
        "    The request uses the module-level `context` as the system instruction and\n",
        "    constrains the reply to a JSON array of integers.\n",
        "\n",
        "    Args:\n",
        "        model: Gemini model name, e.g. \"gemini-2.5-flash\".\n",
        "        prompt: Text holding the input conversation and the responses to rank.\n",
        "\n",
        "    Returns:\n",
        "        The response object returned by `client.models.generate_content`;\n",
        "        callers read the ranking from its `.parsed` attribute.\n",
        "    \"\"\"\n",
        "    ranking_schema = {\"type\": \"ARRAY\", \"items\": {\"type\": \"INTEGER\"}}\n",
        "    # Use the SDK's snake_case field names consistently (response_schema, not the camelCase alias).\n",
        "    config = types.GenerateContentConfig(\n",
        "        system_instruction=context,\n",
        "        response_mime_type=\"application/json\",\n",
        "        response_schema=ranking_schema,\n",
        "    )\n",
        "    return client.models.generate_content(model=model, contents=prompt, config=config)"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Loading pre-processed files"
      ],
      "metadata": {
        "id": "J3Rj6U60mob-"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "rrsg80y3VO5q"
      },
      "outputs": [],
      "source": [
        "file_name = \"prompt_list.jsonl\"\n",
        "\n",
        "# Every line of the file is a JSON object; the prompt text is stored under its 'text' key.\n",
        "with open(file_name, \"r\") as file:\n",
        "    loaded_strings = [record['text'] for record in map(json.loads, file)]\n",
        "\n",
        "print(\"Strings successfully loaded with a list comprehension:\")\n",
        "prompt_list = loaded_strings"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ptgLun1TU0Ew"
      },
      "outputs": [],
      "source": [
        "file_name = \"GPT_rankings.jsonl\" # or \"string_data.jsonl\"\n",
        "\n",
        "# Accumulates one parsed JSON value per line of the file.\n",
        "loaded_data = []\n",
        "\n",
        "with open(file_name, \"r\") as file:\n",
        "    # json.loads turns each line back into the Python list or dictionary it was dumped from.\n",
        "    loaded_data.extend(json.loads(line) for line in file)\n",
        "\n",
        "print(\"Data successfully loaded from the JSONL file:\")\n",
        "ranking_GPT_list = loaded_data"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Generate LLM ranks"
      ],
      "metadata": {
        "id": "WbVBnOT9mxe9"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import random\n",
        "\n",
        "NUM_QUERIES = 5000  # upper bound on the number of prompts sent to the judge models\n",
        "SAMPLE_SEED = 0  # fixed seed so the same prompts are drawn on every run\n",
        "\n",
        "# random.sample raises ValueError when asked for more items than the population holds,\n",
        "# so cap the request at the number of available prompts.\n",
        "query_list = random.Random(SAMPLE_SEED).sample(\n",
        "    range(len(prompt_list)), min(NUM_QUERIES, len(prompt_list))\n",
        ")\n",
        "len(query_list)"
      ],
      "metadata": {
        "id": "kbHIU8IhzEAm"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "n54PRrTASm2W"
      },
      "outputs": [],
      "source": [
        "from google.colab import files\n",
        "\n",
        "T = 4800  # number of prompts that must be ranked successfully by both judges\n",
        "NUM_RESPONSES = 6  # candidate responses per prompt, labelled 0..5 in the system instruction\n",
        "CHECKPOINT_EVERY = 100  # write partial results after this many newly accepted prompts\n",
        "PICKLE_PATH = 'coupled_LLMrankings.pickle'\n",
        "INPUT_LIST_PATH = \"input_list.jsonl\"\n",
        "\n",
        "\n",
        "def is_valid_ranking(ranking):\n",
        "    \"\"\"Return True if `ranking` is a permutation of 0..NUM_RESPONSES-1.\n",
        "\n",
        "    Anything else (None, wrong length, duplicates, out-of-range labels) would\n",
        "    either raise IndexError below or leave zero entries in `new_dataset`.\n",
        "    \"\"\"\n",
        "    return (\n",
        "        isinstance(ranking, (list, tuple))\n",
        "        and len(ranking) == NUM_RESPONSES\n",
        "        and all(isinstance(label, int) for label in ranking)\n",
        "        and set(ranking) == set(range(NUM_RESPONSES))\n",
        "    )\n",
        "\n",
        "\n",
        "def save_progress():\n",
        "    \"\"\"Write the rankings collected so far and the matching prompt indices to disk.\n",
        "\n",
        "    Both files are written together so their rows always correspond.\n",
        "    \"\"\"\n",
        "    with open(PICKLE_PATH, 'wb') as handle:\n",
        "        pickle.dump(new_dataset[:count, :, :], handle)\n",
        "    with open(INPUT_LIST_PATH, \"w\") as file:\n",
        "        file.write(json.dumps(input_list))\n",
        "\n",
        "\n",
        "# new_dataset[k, m, j] is the 1-based position of response m in judge j's list for the\n",
        "# k-th accepted prompt (j=0: gemini-2.5-flash, j=1: gemini-1.5-flash).\n",
        "new_dataset = np.zeros([T, NUM_RESPONSES, 2])\n",
        "input_list = []  # indices into prompt_list, row-aligned with new_dataset\n",
        "count = 0  # prompts for which both judges returned a usable ranking\n",
        "tried = 0  # prompts sent to the judges so far\n",
        "\n",
        "try:\n",
        "    for query_index in query_list:\n",
        "        tried += 1\n",
        "        prompt = prompt_list[query_index]\n",
        "        response1 = chat(\"gemini-2.5-flash\", prompt)\n",
        "        parsed_response1 = response1.parsed\n",
        "        response2 = chat(\"gemini-1.5-flash\", prompt)\n",
        "        parsed_response2 = response2.parsed\n",
        "        print(parsed_response1, parsed_response2)\n",
        "        if is_valid_ranking(parsed_response1) and is_valid_ranking(parsed_response2):\n",
        "            for r in range(NUM_RESPONSES):\n",
        "                new_dataset[count, parsed_response1[r], 0] = r + 1\n",
        "                new_dataset[count, parsed_response2[r], 1] = r + 1\n",
        "            count += 1\n",
        "            input_list.append(query_index)\n",
        "            # Checkpoint only when count has just advanced, not on every failed attempt.\n",
        "            if count % CHECKPOINT_EVERY == 0:\n",
        "                save_progress()\n",
        "\n",
        "        print(tried, count)\n",
        "        if count == T:\n",
        "            break\n",
        "finally:\n",
        "    # Always persist what was collected, including rows after the last checkpoint and\n",
        "    # runs cut short by an API error or a manual interrupt.\n",
        "    save_progress()\n",
        "\n",
        "file_name = INPUT_LIST_PATH\n",
        "print(f\"Data saved to {file_name}\")\n",
        "\n",
        "files.download(PICKLE_PATH)\n",
        "files.download(INPUT_LIST_PATH)\n",
        "print(\"Downloaded files\")"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Correlate the two judges' rank positions for each of the 6 response slots.\n",
        "# Only the first `count` rows of new_dataset were filled; the rest are zero padding\n",
        "# that would bias the coefficient, so they are excluded.\n",
        "for i in range(6):\n",
        "  print(np.corrcoef(new_dataset[:count,i,0], new_dataset[:count,i,1]))"
      ],
      "metadata": {
        "id": "D4CrO5SwYHIf"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Data Preprocessing code"
      ],
      "metadata": {
        "id": "HR_7ELUPWffx"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "zw-NyqfWO-v0"
      },
      "outputs": [],
      "source": [
        "# To ensure anonymity for the double-blind review process, the specific source URL is masked.\n",
        "# The data is sourced from a publicly available LLM ranking benchmark dataset.\n",
        "# The reviewer can run this by locally downloading the Nectar Dataset mentioned in the paper and load it as 'rlaif_data.parquet'\n",
        "try:\n",
        "    dataset = pd.read_parquet(\"rlaif_data.parquet\")\n",
        "except Exception as err:\n",
        "    # Catch Exception rather than using a bare except so KeyboardInterrupt/SystemExit still\n",
        "    # propagate, and report the cause (missing file, missing parquet engine, ...).\n",
        "    print(\"Could not load 'rlaif_data.parquet'.\", repr(err))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "10trZtGyPKn4"
      },
      "outputs": [],
      "source": [
        "# Models whose answers must all be present in a row; the position in this list is the\n",
        "# label (0..5) under which the model's answer is shown to the judge.\n",
        "model_list = ['gpt-4-0613', 'gpt-4', 'gpt-3.5-turbo-instruct', 'anthropic', 'mistral-7b-instruct-v0.1', 'gpt-3.5-turbo']\n",
        "# Derived from model_list so the two can never disagree.\n",
        "model_indices = {name: label for label, name in enumerate(model_list)}\n",
        "MAX_PROMPT_CHARS = 3000  # only keep short prompts to save time\n",
        "\n",
        "N = len(dataset)\n",
        "prompt_list = []\n",
        "ranking_GPT_list = []\n",
        "# iterate through data\n",
        "for row in range(N):\n",
        "    data_row = dataset.iloc[row]\n",
        "    answers = data_row['answers']\n",
        "    models = [answer['model'] for answer in answers]\n",
        "    # ensure datapoint contains all models that we want, otherwise we skip\n",
        "    if not all(m in models for m in model_list):\n",
        "        continue\n",
        "\n",
        "    prompt = data_row['prompt']\n",
        "    # Position in `answers` of the first answer from each wanted model. Iterating over\n",
        "    # every answer (instead of a fixed 7) avoids an IndexError on shorter rows.\n",
        "    first_position = {}\n",
        "    # Wanted-model labels in the order their answers appear in the row.\n",
        "    # NOTE(review): presumably `answers` is ordered by GPT-4 rank - confirm against the dataset.\n",
        "    gpt_ranking = []\n",
        "    for position, m in enumerate(models):\n",
        "        if m in model_indices and m not in first_position:\n",
        "            first_position[m] = position\n",
        "            gpt_ranking.append(model_indices[m])\n",
        "\n",
        "    # build prompt and answers for LLM query\n",
        "    answers_strings = [answers[first_position[m]]['answer'] for m in model_list]\n",
        "    only_ans = ''.join(\n",
        "        \" [MODEL \" + str(i) + \" RESPONSE START]: \" + answer + \"[MODEL \" + str(i) + \" RESPONSE END], \"\n",
        "        for i, answer in enumerate(answers_strings)\n",
        "    )\n",
        "    full_prompt = \"[INPUT CONVERSATION: \" + prompt + \"'],\" + only_ans\n",
        "\n",
        "    if len(full_prompt) < MAX_PROMPT_CHARS:\n",
        "        prompt_list.append(full_prompt)\n",
        "        ranking_GPT_list.append(gpt_ranking)\n",
        "\n",
        "file_name = \"prompt_list.jsonl\"\n",
        "\n",
        "# Save the prompts, one JSON object per line, with the text under the 'text' key.\n",
        "with open(file_name, \"w\") as file:\n",
        "    for text_string in prompt_list:\n",
        "        file.write(json.dumps({\"text\": text_string}) + \"\\n\")\n",
        "\n",
        "print(f\"Data saved to {file_name}\")\n",
        "\n",
        "file_name = \"GPT_rankings.jsonl\"\n",
        "\n",
        "# Save the rankings, one JSON list per line, row-aligned with prompt_list.jsonl.\n",
        "with open(file_name, \"w\") as file:\n",
        "    for inner_list in ranking_GPT_list:\n",
        "        file.write(json.dumps(inner_list) + \"\\n\")\n",
        "\n",
        "print(f\"Data saved to {file_name}\")"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}