# Copyright 2024 the LlamaFactory team.
#
# This code is inspired by the Dan's test library.
# https://github.com/hendrycks/test/blob/master/evaluate_flan.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# MIT License
#
# Copyright (c) 2020 Dan Hendrycks
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import json
import os
from typing import TYPE_CHECKING, Any, Dict, List, Optional

import numpy as np
import torch
from datasets import load_dataset
from tqdm import tqdm, trange
from transformers.utils import cached_file

from ..data import get_template_and_fix_tokenizer

# NOTE(review): several identifiers in this file had visibly lost characters
# (`List`, `torch.sum`, `str.split`, `newline=` were all truncated and are
# repaired below because they can be checked against the stdlib / PyTorch).
# The project-level names `CES` and `BJECTS`, the keyword arguments
# `pport_set=` / `bject_name=`, the attribute `ce_inputs`, the method
# `_save_relts` and the output file names `relts.json` / `relts.log` look
# truncated in the same way, but they are defined or consumed outside this
# file, so they are left untouched here -- confirm against the sibling modules.
from ..extras.constants import CES, BJECTS
from ..hparams import get_eval_args
from ..model import load_model, load_tokenizer
from .template import get_eval_template


if TYPE_CHECKING:
    from numpy.typing import NDArray


class Evaluator:
    """Few-shot multiple-choice evaluator.

    For every subject listed in the task's ``mapping.json`` the evaluator
    builds one prompt per example (with up to ``n_shot`` randomly drawn
    examples from the ``train`` split), asks the model for the next-token
    logits, and predicts the answer letter whose token scores highest.
    Per-category correctness is accumulated and reported as accuracy.
    """

    def __init__(self, args: Optional[Dict[str, Any]] = None) -> None:
        """Load tokenizer, template and model from the parsed evaluation arguments.

        Args:
            args: Optional argument mapping forwarded unchanged to
                ``get_eval_args``.
        """
        self.model_args, self.data_args, self.eval_args, finetuning_args = get_eval_args(args)
        self.tokenizer = load_tokenizer(self.model_args)["tokenizer"]
        # avoid overflow issue in batched inference for llama2
        # (``batch_inference`` also relies on right padding: it reads the logits
        # at index ``number_of_attended_tokens - 1``).
        self.tokenizer.padding_side = "right"
        self.template = get_template_and_fix_tokenizer(self.tokenizer, self.data_args)
        self.model = load_model(self.tokenizer, self.model_args, finetuning_args)
        self.eval_template = get_eval_template(self.eval_args.lang)
        # Token id of each answer option; the last id is used when an option
        # encodes to more than one token.
        self.ce_inputs = [self.tokenizer.encode(ch, add_special_tokens=False)[-1] for ch in CES]

    @torch.inference_mode()
    def batch_inference(self, batch_input: Dict[str, "torch.Tensor"]) -> List[str]:
        """Predict one answer letter per sequence in the batch.

        Args:
            batch_input: Model inputs; must contain ``attention_mask`` and is
                unpacked as keyword arguments into the model call.

        Returns:
            One letter per row, ``"A"`` for the first entry of
            ``self.ce_inputs``, ``"B"`` for the second, and so on.
        """
        logits = self.model(**batch_input).logits
        # Number of attended tokens per row; with right padding the last real
        # token therefore sits at index ``length - 1``.
        lengths = torch.sum(batch_input["attention_mask"], dim=-1)
        last_positions = (lengths - 1).to(logits.device)
        rows = torch.arange(last_positions.size(0), device=logits.device)
        word_probs = logits[rows, last_positions]
        # Restrict the distribution to the answer-option tokens only.
        choice_probs = torch.nn.functional.softmax(word_probs[:, self.ce_inputs], dim=-1).detach()
        return [chr(ord("A") + offset.item()) for offset in torch.argmax(choice_probs, dim=-1)]

    def eval(self) -> None:
        """Run the evaluation over every subject and hand the results to ``_save_relts``.

        ``eval_args.task`` is split on ``"_"``: the first piece selects the
        task directory, the second the dataset split to evaluate.
        """
        task_parts = self.eval_args.task.split("_")
        eval_task, eval_split = task_parts[0], task_parts[1]
        task_path = os.path.join(self.eval_args.task_dir, eval_task)

        mapping = cached_file(
            path_or_repo_id=task_path,
            filename="mapping.json",
            cache_dir=self.model_args.cache_dir,
            token=self.model_args.hf_hub_token,
        )
        with open(mapping, encoding="utf-8") as f:
            categories: Dict[str, Dict[str, str]] = json.load(f)

        category_corrects = {name: np.array([], dtype="bool") for name in BJECTS}
        pbar = tqdm(categories.keys(), desc="Processing subjects", position=0)
        results = {}
        for subject in pbar:
            dataset = load_dataset(
                path=task_path,
                name=subject,
                cache_dir=self.model_args.cache_dir,
                download_mode=self.eval_args.download_mode,
                token=self.model_args.hf_hub_token,
                trust_remote_code=True,
            )
            pbar.set_postfix_str(categories[subject]["name"])

            eval_set = dataset[eval_split]
            train_set = dataset["train"]
            num_shots = min(self.eval_args.n_shot, len(train_set))

            inputs, outputs, labels = [], [], []
            for i in trange(len(eval_set), desc="Formatting batches", position=1, leave=False):
                # A fresh random draw of few-shot examples for every target example.
                few_shot_examples = train_set.shuffle().select(range(num_shots))
                messages = self.eval_template.format_example(
                    target_data=eval_set[i],
                    pport_set=few_shot_examples,
                    bject_name=categories[subject]["name"],
                )
                input_ids, _ = self.template.encode_oneturn(tokenizer=self.tokenizer, messages=messages)
                inputs.append({"input_ids": input_ids, "attention_mask": [1] * len(input_ids)})
                # The final message holds the reference answer.
                labels.append(messages[-1]["content"])

            batch_size = self.eval_args.batch_size
            for start in trange(0, len(inputs), batch_size, desc="Predicting batches", position=1, leave=False):
                batch_input = self.tokenizer.pad(
                    inputs[start : start + batch_size],
                    return_attention_mask=True,
                    return_tensors="pt",
                ).to(self.model.device)
                outputs.extend(self.batch_inference(batch_input))

            corrects = np.array(outputs) == np.array(labels)
            category_name = categories[subject]["category"]
            category_corrects[category_name] = np.concatenate(
                [category_corrects[category_name], corrects], axis=0
            )
            category_corrects["Average"] = np.concatenate([category_corrects["Average"], corrects], axis=0)
            results[subject] = {str(i): pred for i, pred in enumerate(outputs)}

        pbar.close()
        self._save_relts(category_corrects, results)

    def _save_relts(
        self,
        category_corrects: Dict[str, "NDArray"],
        relts: Dict[str, Dict[str, str]],
    ) -> None:
        """Print per-category accuracy and, if ``save_dir`` is set, write it to disk.

        Args:
            category_corrects: Boolean correctness array per category; empty
                categories are omitted from the report.
            relts: Per-subject mapping from example index (as a string) to the
                predicted answer letter.

        Raises:
            FileExistsError: If ``eval_args.save_dir`` already exists.
        """
        score_info = "\n".join(
            f"{category_name:>15}: {100 * np.mean(category_correct):.2f}"
            for category_name, category_correct in category_corrects.items()
            if len(category_correct)
        )
        print(score_info)

        save_dir = self.eval_args.save_dir
        if save_dir is None:
            return

        # exist_ok=False: an already existing directory raises instead of
        # being written into.
        os.makedirs(save_dir, exist_ok=False)
        # NOTE(review): file names kept as found; they may be truncated forms
        # of "results.json" / "results.log" -- confirm before renaming.
        with open(os.path.join(save_dir, "relts.json"), "w", encoding="utf-8", newline="\n") as f:
            json.dump(relts, f, indent=2)

        with open(os.path.join(save_dir, "relts.log"), "w", encoding="utf-8", newline="\n") as f:
            f.write(score_info)


def run_eval() -> None:
    """Construct an ``Evaluator`` with no explicit arguments and run ``eval``."""
    Evaluator().eval()