{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "05c9e512-e4f8-46fe-8e18-d906f6fd4d95",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "import os\n",
    "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"1\"\n",
    "\n",
    "os.environ[\"TRANSFORMERS_CACHE\"] = \"/data/../llm_cache\"\n",
    "from huggingface_hub import login\n",
    "\n",
    "login(\"..\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "f1cb495c-3db5-4e0e-a43d-b057ad4328c3",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "import math\n",
    "import torch\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import argparse\n",
    "import textwrap\n",
    "import transformers\n",
    "from peft import PeftModel\n",
    "from transformers import GenerationConfig, TextStreamer, BitsAndBytesConfig\n",
    "from llama_attn_replace import replace_llama_attn\n",
    "\n",
    "\n",
    "from dataclasses import dataclass, field\n",
    "from typing import Dict, Optional\n",
    "\n",
    "from accelerate import Accelerator\n",
    "import datasets\n",
    "from datasets import Dataset, load_dataset\n",
    "from peft import LoraConfig\n",
    "from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser, TrainingArguments, set_seed\n",
    "\n",
    "from trl import DPOTrainer\n",
    "\n",
    "\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "import transformers\n",
    "from torch.utils.data import Dataset\n",
    "from transformers import Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig\n",
    "from llama_attn_replace_sft import replace_llama_attn\n",
    "from gptneox_attn_replace import replace_gpt_neox_attn\n",
    "from peft import LoraConfig, get_peft_model\n",
    "from torch.distributed import barrier\n",
    "from transformers import TrainerCallback, TrainingArguments, TrainerState, TrainerControl\n",
    "\n",
    "import re\n",
    "import os\n",
    "import json\n",
    "import string\n",
    "from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction\n",
    "from rouge_score import rouge_scorer\n",
    "from evaluate import load\n",
    "\n",
    "from metrics.drop_answer_em_f1 import DropAnswerEmAndF1\n",
    "from metrics.support_em_f1 import SupportEmF1Metric\n",
    "from metrics.answer_support_recall import AnswerSupportRecallMetric\n",
    "from metrics.squad_answer_em_f1 import SquadAnswerEmF1Metric\n",
    "\n",
    "\n",
    "from copy import deepcopy\n",
    "\n",
    "import os\n",
    "import json\n",
    "import jsonlines\n",
    "os.environ['JAVA_HOME'] = \"/usr/lib/jvm/java-11-openjdk-amd64\"\n",
    "# from shared_utils.indexing_utils import SparseIndexer, DocumentCollection\n",
    "from pyserini.search.lucene import LuceneSearcher\n",
    "\n",
    "bm25_k1 = 0.9\n",
    "bm25_b = 0.4\n",
    "\n",
    "index_dir_path = \"/data2/../nlp_data/topiocqa/indexes/bm25\"\n",
    "searcher = LuceneSearcher(index_dir_path)\n",
    "searcher.set_bm25(bm25_k1, bm25_b)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "7423f9ce-20de-47e8-95cd-953146ed0868",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def get_docs(run_file):\n",
    "    with open(run_file, 'r' )as f:\n",
    "        run_data = f.readlines()\n",
    "\n",
    "    runs = {} \n",
    "    for line in run_data:\n",
    "        original_line = deepcopy(line)\n",
    "        try:\n",
    "            line = line.split(\"\\t\")\n",
    "            query = line[0]\n",
    "            passage = line[2]\n",
    "            rel = int(line[4])\n",
    "        except IndexError:\n",
    "            line = original_line.split(\" \")\n",
    "            query = line[0]\n",
    "            passage = line[2]\n",
    "            rel = int(line[4])\n",
    "\n",
    "        if query not in runs:\n",
    "            runs[query] = {}\n",
    "        runs[query][passage] = rel\n",
    "    return runs\n"
   ]
  },
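  {
   "cell_type": "markdown",
   "id": "b6f1a2c3-0001-4a1b-9c2d-111111111101",
   "metadata": {},
   "source": [
    "A minimal sanity check for `get_docs`, using hypothetical sample rows rather than the real run files: standard TREC run lines have the form `qid Q0 docid rank score tag`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b6f1a2c3-0002-4a1b-9c2d-111111111102",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hypothetical two-line run file; checks the parsed {qid: {docid: score}} shape.\n",
    "import tempfile\n",
    "\n",
    "_sample = \"q1 Q0 doc42 1 3 ours\\nq1 Q0 doc7 2 1 ours\\n\"\n",
    "with tempfile.NamedTemporaryFile(\"w\", suffix=\".trec\", delete=False) as _tmp:\n",
    "    _tmp.write(_sample)\n",
    "\n",
    "assert get_docs(_tmp.name) == {\"q1\": {\"doc42\": 3, \"doc7\": 1}}\n"
   ]
  },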
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "3c555feb-0469-45fd-b297-885c8f828b8e",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# ours\n",
    "runs = get_docs(\"../topiocqa/test_2R_SFT101_DPO500_b5_finefeedback_dpo_3R_refself_beta0.5.trec\")\n",
    "# baselines\n",
    "runs_retpo = get_docs(\"../topiocqa/retpo.trec\" ,)\n",
    "runs_hyde = get_docs(\"../topiocqa/hyde.trec\",)\n",
    "runs_convgqr = get_docs(\"../topiocqa/convgqr.trec\",)\n",
    "runs_t5qr = get_docs(\"../topiocqa/t5qr.trec\",)\n",
    "runs_llmcs = get_docs(\"../topiocqa/llmcs_cot.trec\",)\n",
    "runs_infocqr = get_docs(\"../topiocqa/infocqr.trec\",)\n",
    "runs_hydellm = get_docs(\"../topiocqa/hyde_llm.trec\",)\n",
    "\n",
    "# conv data\n",
    "with open(\"/data2/../nlp_data/topiocqa/qa.jsonl\", \"r\") as f:\n",
    "    data = f.readlines()\n",
    "    data_ids = [json.loads(data[i])['id'] for i in range(len(data))]\n",
    "    n = len(set(data_ids))\n",
    "\n",
    "data_dict = [json.loads(data[i]) for i in range(len(data))]\n",
    "data_dict = {l['id']:l for l in data_dict}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "2fe5cc71-85a6-456c-866f-c5965309c10c",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def format_prompt(query, retrieved_documents):\n",
    "    PROMPT = f\"\"\"\n",
    "    Based on the given reference documents, answer the following question.\n",
    "    When answering, do not repeat the question, and only provide the correct answer.\n",
    "    Provide the answer only in JSON format as {{\"Answer\":\"Your answer\"}}.\n",
    "\n",
    "    Reference Documents:\n",
    "    ---------------------\n",
    "    {retrieved_documents}\n",
    "    ——————————\n",
    "    Question: {query}\n",
    "    Answer: \n",
    "    \"\"\"\n",
    "    return PROMPT\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "50493901-405c-4e18-8ea7-f6c98fbd2c52",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def generate(formatted_prompt):\n",
    "    messages = [\n",
    "        {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
    "        {\"role\": \"user\", \"content\": formatted_prompt}  \n",
    "    ]\n",
    "    input_ids = tokenizer.apply_chat_template(\n",
    "        messages,\n",
    "        add_generation_prompt=True,\n",
    "        return_tensors=\"pt\"\n",
    "    ).to(model.device)\n",
    "    \n",
    "    # Create attention mask (1s for all tokens)\n",
    "    attention_mask = torch.ones_like(input_ids)\n",
    "    \n",
    "    generated_ids = model.generate(\n",
    "        input_ids,\n",
    "        attention_mask=attention_mask,\n",
    "        max_new_tokens=128,\n",
    "        temperature=1e-10,\n",
    "        pad_token_id=tokenizer.eos_token_id\n",
    "    )\n",
    "    generated_ids = [\n",
    "        output_ids[len(input_ids):] for input_ids, output_ids in zip(input_ids, generated_ids)\n",
    "    ]\n",
    "    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "3130d560-4e35-47b7-8aa5-587c3f80ed63",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "model_id = \"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n",
    "\n",
    "model = AutoModelForCausalLM.from_pretrained(\n",
    "    model_id,\n",
    "    torch_dtype=\"auto\",\n",
    "    device_map='auto'\n",
    ")\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
    "\n",
    "model.generation_config.pad_token_id = tokenizer.pad_token_id\n"
   ]
  },
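  {
   "cell_type": "markdown",
   "id": "b6f1a2c3-0003-4a1b-9c2d-111111111103",
   "metadata": {},
   "source": [
    "A quick smoke test of the prompt-and-generate path above, on a hypothetical toy document (assumes the model and tokenizer loaded successfully)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b6f1a2c3-0004-4a1b-9c2d-111111111104",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Toy reference document, not from the TopiOCQA index.\n",
    "_doc = \"Paris is the capital and largest city of France.\"\n",
    "print(generate(format_prompt(\"What is the capital of France?\", _doc)))\n",
    "# Expected output shape: {\"Answer\": \"Paris\"}\n"
   ]
  },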
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "89785f81-bd6d-4d0f-89c1-80310ecaf928",
   "metadata": {},
   "outputs": [],
   "source": [
    "import openai\n",
    "from tqdm import tqdm\n",
    "import numpy as np\n",
    "import os\n",
    "import random\n",
    "\n",
    "def format_query_prompt(query, gt, pd):\n",
    "    prefix =  [{'role': 'system',\n",
    "             'content': \"You are an evaluation tool. Just answer by {Yes} or {No}.\"}]\n",
    "    prefix.extend([{'role': 'user',\n",
    "             'content': f\" Here is a question , a golden answer and an AI - generated answer . Judge whether the AI - generated answer is correct according to the question and golden answer , answer with {{ Yes }} or {{ No }}.\\ nQuestion : { query }.\\ nGolden answer : { gt }\\ nGenerated answer : { pd } Response : {{\"}\n",
    "             ]\n",
    "             )\n",
    "    return prefix\n",
    "\n",
    "\n",
    "def run_llm(client, model_name, messages):\n",
    "    response = client.chat.completions.create(messages=messages, model=model_name)\n",
    "    return response.choices[0].message.content # , cost\n",
    "\n",
    "\n",
    "def extract_score_from_text(text):\n",
    "  \n",
    "    score = 1 if \"yes\" in text.lower() else 0   \n",
    "    return score"
   ]
  },
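  {
   "cell_type": "markdown",
   "id": "b6f1a2c3-0005-4a1b-9c2d-111111111105",
   "metadata": {},
   "source": [
    "An offline check of the judge helpers: build the evaluation prompt for a made-up example and verify that `extract_score_from_text` maps {Yes}/{No} verdicts to 1/0. No API call is made here."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b6f1a2c3-0006-4a1b-9c2d-111111111106",
   "metadata": {},
   "outputs": [],
   "source": [
    "_msgs = format_query_prompt(\"Who wrote Hamlet?\", [\"William Shakespeare\"], [\"Shakespeare\"])\n",
    "print(_msgs[1]['content'])\n",
    "\n",
    "assert extract_score_from_text(\"{Yes}\") == 1\n",
    "assert extract_score_from_text(\"{No}\") == 0\n"
   ]
  },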
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "795c37c6-a7d7-49cc-bc34-4db930b916d7",
   "metadata": {},
   "outputs": [],
   "source": [
    "bertscore = load(\"bertscore\", model_type=\"bert-base-cased\")\n",
    "\n",
    "def normalize_answer(s):\n",
    "    \"\"\"Lower text and remove punctuation, articles and extra whitespace.\"\"\"\n",
    "\n",
    "    def remove_articles(text):\n",
    "        regex = re.compile(r\"\\b(a|an|the)\\b\", re.UNICODE)\n",
    "        return re.sub(regex, \" \", text)\n",
    "\n",
    "    def white_space_fix(text):\n",
    "        return \" \".join(text.split())\n",
    "\n",
    "    def remove_punc(text):\n",
    "        exclude = set(string.punctuation)\n",
    "        return \"\".join(ch for ch in text if ch not in exclude)\n",
    "\n",
    "    def lower(text):\n",
    "        return text.lower()\n",
    "\n",
    "    return white_space_fix(remove_articles(remove_punc(lower(s))))\n",
    "\n",
    "def answer_extractor(potentially_cot: str) -> str:\n",
    "    if potentially_cot.startswith('\"') and potentially_cot.endswith('\"'):\n",
    "        potentially_cot = potentially_cot[1:-1]\n",
    "\n",
    "    cot_regex = re.compile(\".* answer is:? (.*)\\\\.?\")\n",
    "    match = cot_regex.match(potentially_cot)\n",
    "    if match:\n",
    "        output = match.group(1)\n",
    "        if output.endswith(\".\"):\n",
    "            output = output[:-1]\n",
    "    else:\n",
    "        output = potentially_cot\n",
    "\n",
    "    return output\n",
    "\n",
    "def calculate_acc(prediction, ground_truth):\n",
    "    for gt in ground_truth:\n",
    "        if gt in prediction:\n",
    "            return 1\n",
    "    return 0\n",
    "\n",
    "def calculate_bleu(reference, hypothesis):\n",
    "    smoothie = SmoothingFunction().method4\n",
    "    return sentence_bleu([reference], hypothesis, smoothing_function=smoothie)\n",
    "\n",
    "def calculate_rouge(reference, hypothesis):\n",
    "    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)\n",
    "    scores = scorer.score(reference, hypothesis)\n",
    "    return scores\n",
    "\n",
    "\n",
    "def evaluate_by_dicts(input_path, output_path):\n",
    "    metrics = [SquadAnswerEmF1Metric()]\n",
    "     \n",
    "    total_acc = 0\n",
    "    total_lines = 0\n",
    "    total_rouge1 = 0\n",
    "    total_rougel = 0\n",
    "    total_bert = 0\n",
    "\n",
    "    with open(input_path, 'r', encoding='utf-8') as file:\n",
    "        for line in file:\n",
    "            data = json.loads(line.strip())\n",
    "\n",
    "            ground_truth, prediction = data['answers'], data['generated_answer']\n",
    "            bert_score = bertscore.compute(references = ground_truth, predictions = prediction,lang = 'en')\n",
    "\n",
    "            prediction = [prediction]            \n",
    "\n",
    "            assert isinstance(prediction, (str, list))\n",
    "            if isinstance(prediction, str):\n",
    "                if prediction.strip().startswith(\"[\") or prediction.strip().endswith(\"]\"):\n",
    "                    prediction = [e for e in prediction.replace('\"', \"\").replace(\"[\", \"\").replace(\"]\", \"\").split(\",\")]\n",
    "                else:\n",
    "                    prediction = [prediction]\n",
    "\n",
    "            assert isinstance(prediction, (list, tuple))\n",
    "            prediction = [str(e) for e in prediction]\n",
    "            prediction = [answer_extractor(_prediction) for _prediction in prediction]\n",
    "\n",
    "            normalized_prediction = normalize_answer(prediction[0])\n",
    "            normalized_ground_truth = [normalize_answer(i) for i in ground_truth]\n",
    "\n",
    "            acc = calculate_acc(normalized_prediction, normalized_ground_truth)\n",
    "            total_acc += acc\n",
    "            # print(normalized_prediction, normalized_ground_truth)\n",
    "\n",
    "            # bleu_score = calculate_bleu(normalized_ground_truth, normalized_prediction)\n",
    "            # total_bleu += bleu_score\n",
    "\n",
    "            rouge_scores = calculate_rouge(\" \".join(normalized_ground_truth), normalized_prediction)\n",
    "            total_rouge1 += rouge_scores['rouge1'].fmeasure\n",
    "            total_rougel += rouge_scores['rougeL'].fmeasure\n",
    "\n",
    "            total_bert += bert_score['f1'][0]\n",
    "\n",
    "            total_lines += 1\n",
    "            try : \n",
    "                metrics[0](prediction, ground_truth)\n",
    "            except : \n",
    "                pass\n",
    "            \n",
    "        total_acc = total_acc / total_lines\n",
    "        # total_bleu = total_bleu / total_lines\n",
    "        total_rouge1 = total_rouge1 / total_lines\n",
    "        total_rougel = total_rougel / total_lines\n",
    "        total_bert = total_bert / total_lines\n",
    "        \n",
    "        evaluation_results = metrics[0].get_metric()\n",
    "        evaluation_results['acc'] = total_acc\n",
    "        # evaluation_results['bleu'] = total_bleu\n",
    "        evaluation_results['rouge1'] = total_rouge1\n",
    "        evaluation_results['rougel'] = total_rougel\n",
    "        evaluation_results['bert_f1'] = total_bert\n",
    "        \n",
    "    save_results(evaluation_results, output_path)\n",
    "    \n",
    "def save_results(results_dict, output_path):\n",
    "    with open(output_path, \"w\") as file:\n",
    "        json.dump(results_dict, file, indent=4)\n"
   ]
  },
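  {
   "cell_type": "markdown",
   "id": "b6f1a2c3-0007-4a1b-9c2d-111111111107",
   "metadata": {},
   "source": [
    "Toy examples (made up here, not drawn from the dataset) showing what the normalization and answer-extraction helpers do."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b6f1a2c3-0008-4a1b-9c2d-111111111108",
   "metadata": {},
   "outputs": [],
   "source": [
    "assert normalize_answer(\"The Eiffel Tower!\") == \"eiffel tower\"\n",
    "assert answer_extractor(\"So the answer is: Paris.\") == \"Paris\"\n",
    "\n",
    "# calculate_acc checks whether any gold answer is a substring of the prediction.\n",
    "assert calculate_acc(\"eiffel tower\", [\"eiffel tower\", \"paris\"]) == 1\n"
   ]
  },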
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "de44e3cb-febb-4a23-bf30-77d2c2ad90e7",
   "metadata": {},
   "outputs": [],
   "source": [
    "def clean_output(output):\n",
    "    \"\"\"Returns LLM's answer from its output\n",
    "    Assumes output is in JSON format of {'Answer':answer}.\n",
    "    If output is not in expected format, returns string signifying error\n",
    "    \"\"\"\n",
    "    end_index = output.find('}')\n",
    "    if end_index == -1:\n",
    "        end_index = len(output)\n",
    "    json_text = output[:end_index+1].strip()\n",
    "    try:\n",
    "        d = json.loads(json_text)\n",
    "        return str(d['Answer'])\n",
    "    except:\n",
    "        return f'JSON parse error({json_text})'"
   ]
  },
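  {
   "cell_type": "markdown",
   "id": "b6f1a2c3-0009-4a1b-9c2d-111111111109",
   "metadata": {},
   "source": [
    "Hypothetical model outputs illustrating `clean_output`: it parses up to the first `}` and falls back to an error string when the JSON is malformed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b6f1a2c3-0010-4a1b-9c2d-111111111110",
   "metadata": {},
   "outputs": [],
   "source": [
    "assert clean_output('{\"Answer\": \"Paris\"} extra text') == \"Paris\"\n",
    "print(clean_output(\"not json at all\"))  # -> JSON parse error(not json at all)\n"
   ]
  },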
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "06d5e0ce-6428-42b6-a5bc-20d290758907",
   "metadata": {},
   "outputs": [],
   "source": [
    "from random import sample\n",
    "import random\n",
    "random.seed(0)\n",
    "def generate_eval(top_docs, out_path, eval_path, conv=data_dict, topk=16, n_gen = -1):\n",
    "    \n",
    "    result = []\n",
    "\n",
    "    qids = list(top_docs.keys())\n",
    "    # qids = sample(qids, min(n_gen, len(qids)))\n",
    "    # # qids = qids[:min(n_gen, len(qids))]\n",
    "    for i, qid in tqdm(enumerate(qids)):\n",
    "        topk_docs_ours = list(top_docs[qid].keys())[:topk]\n",
    "        try:\n",
    "            topk_docs_ours = [json.loads(searcher.doc(qrels_prefix+d).raw())['contents'] for d in topk_docs_ours]\n",
    "        except:\n",
    "            print(topk_docs_ours)\n",
    "\n",
    "        query = conv[qid]['query']\n",
    "        ans = conv[qid]['answer']\n",
    "\n",
    "        context = '\\n'.join(topk_docs_ours)\n",
    "        formatted_prompt = format_prompt(query, context)\n",
    "        output = generate(formatted_prompt)\n",
    "        output = list(map(clean_output, [output]))\n",
    "\n",
    "\n",
    "        result.append({'question':query, 'answers':[ans], 'generated_answer':output})\n",
    "\n",
    "    result_str = list(map(lambda x:json.dumps(x, ensure_ascii=False), result))\n",
    "    with open(out_path, 'w', encoding='utf-8') as f:\n",
    "        f.write('\\n'.join(result_str))\n",
    "    evaluate_by_dicts(out_path, eval_path)\n",
    "    with open(eval_path, 'r', encoding='utf-8') as f:\n",
    "        evaluations = json.load(f)\n",
    "    return evaluations # retpo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "0ea0cf6c-e579-4057-bac1-c0c6dd5db1ee",
   "metadata": {},
   "outputs": [],
   "source": [
    "client = openai.OpenAI(api_key = \"\",)\n",
    "model_name = \"chatgpt-4o-latest\" # \"chatgpt-4o-latest\"#  \"gpt-4o-mini-2024-07-18\" # \"chatgpt-4o-latest\"# \"gpt-4o-mini-2024-07-18\"# \"chatgpt-4o-latest\"# \"gpt-4o-mini-2024-07-18\"\n",
    "\n",
    "def llm_eval(input_path):\n",
    "    scores = []\n",
    "\n",
    "    with open(input_path, 'r', encoding='utf-8') as eval_file:\n",
    "        for idx, line in tqdm(enumerate(eval_file)):\n",
    "            dic = json.loads(line)\n",
    "            question = dic['question']\n",
    "            groundtruth = dic['answers']\n",
    "            prediction = dic['generated_answer']\n",
    "            prompt = format_query_prompt(question, groundtruth, prediction)\n",
    "            generated_eval = run_llm(client, model_name, prompt)\n",
    "            score = extract_score_from_text(generated_eval)\n",
    "            scores += [score]\n",
    "            \n",
    "    return scores # np.mean(scores)* 100\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "86b8792e-3f72-48d3-baed-9a14b66b327f",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "path = \"/data2/../nlp_data/generation/topiocqa\"\n",
    "generate_eval(top_docs = runs, \n",
    "              out_path = path+\"test_ours.json\",\n",
    "              eval_path = path+\"test_ours_out.json\",\n",
    "              topk=4, \n",
    "             )\n",
    "\n",
    "llmcs_eval = generate_eval(top_docs = runs_llmcs, \n",
    "              out_path = path+\"test_llmcs.json\",\n",
    "              eval_path = path+\"test_llmcs_out.json\",\n",
    "                          topk=4, )\n",
    "print(llmcs_eval)\n",
    "\n",
    "path = \"/data2/../nlp_data/generation/topiocqa\"\n",
    "generate_eval(top_docs = runs_retpo, \n",
    "              out_path = path+\"test_retpo.json\",\n",
    "              eval_path = path+\"test_retpo_out.json\",\n",
    "             topk=4, )\n",
    "\n",
    "hyde_eval = generate_eval(top_docs = runs_hyde, \n",
    "              out_path = path+\"test_hyde.json\",\n",
    "              eval_path = path+\"test_hyde_out.json\",\n",
    "              topk=4, )\n",
    "print(hyde_eval)\n",
    "\n",
    "convgqr_eval = generate_eval(top_docs = runs_convgqr, \n",
    "              out_path = path+\"test_convgqr.json\",\n",
    "              eval_path = path+\"test_convgqr_out.json\",\n",
    "              topk=4, )\n",
    "print(convgqr_eval)\n",
    "\n",
    "\n",
    "t5qr_eval = generate_eval(top_docs = runs_t5qr, \n",
    "              out_path = path+\"test_t5qr.json\",\n",
    "              eval_path = path+\"test_t5qr_out.json\",\n",
    "                         topk=4, )\n",
    "print(t5qr_eval)\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "infocqr_eval = generate_eval(top_docs = runs_infocqr, \n",
    "              out_path = path+\"test_infocqr.json\",\n",
    "              eval_path = path+\"test_infocqr_out.json\", \n",
    "                            topk=4, )\n",
    "print(infocqr_eval)\n",
    "\n",
    "hydellm_eval = generate_eval(top_docs = runs_hydellm, \n",
    "              out_path = path+\"test_hydellm.json\",\n",
    "              eval_path = path+\"test_hydellm_out.json\",\n",
    "                            topk=4, )\n",
    "print(hydellm_eval)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "c7399044-21a5-4ee7-8110-d14f3aa27bbd",
   "metadata": {},
   "outputs": [],
   "source": [
    "llm_ours = llm_eval(input_path=path+\"test_ours.json\")\n",
    "print(np.mean(llm_ours)*100)\n",
    "\n",
    "\n",
    "llm_llmcs = llm_eval(input_path=path+\"test_llmcs.json\")\n",
    "print(np.mean(llm_llmcs)*100)\n",
    "\n",
    "\n",
    "llm_retpo = llm_eval(input_path=path+\"test_retpo.json\")\n",
    "print(np.mean(llm_retpo)*100)\n",
    "\n",
    "\n",
    "llm_hyde = llm_eval(input_path=path+\"test_hyde.json\")\n",
    "print(np.mean(llm_hyde)*100)\n",
    "llm_convgqr = llm_eval(input_path=path+\"test_convgqr.json\")\n",
    "print(np.mean(llm_convgqr)*100)\n",
    "llm_t5qr = llm_eval(input_path=path+\"test_t5qr.json\")\n",
    "print(np.mean(llm_t5qr)*100)\n",
    "\n",
    "llm_infocqr = llm_eval(input_path=path+\"test_infocqr.json\")\n",
    "print(np.mean(llm_infocqr)*100)\n",
    "llm_hydellm = llm_eval(input_path=path+\"test_hydellm.json\")\n",
    "print(np.mean(llm_hydellm)*100)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "llama3",
   "language": "python",
   "name": "llama3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
