{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from Evaluation import (\n",
    "    get_all_metrics,\n",
    "    get_calibration_curve,\n",
    "    get_histogram_of_confidences,\n",
    "    get_ideal_calibration_curve,\n",
    ")\n",
    "from util.EvaluationMetrics import Metric\n",
    "\n",
    "# --- Run configuration read by the evaluation cells below ---\n",
    "# Placeholder path: point this at the model directory to evaluate.\n",
    "model_dir = \"path/to/model\"\n",
    "dataset = \"triviaqa\"\n",
    "split = \"validation\"\n",
    "\n",
    "# Forwarded as the max_confidence / metric / threshold keyword arguments.\n",
    "# NOTE(review): presumably max_confidence is the top of the confidence scale and\n",
    "# threshold is the metric cut-off for counting an answer as correct -- confirm in Evaluation.\n",
    "max_confidence = 10\n",
    "metric = Metric.F1\n",
    "threshold = 0.5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# One entry per evaluated run; append further get_all_metrics results to compare runs.\n",
    "gathered_metrics = []\n",
    "\n",
    "metrics = get_all_metrics(\n",
    "    model_dir,\n",
    "    dataset,\n",
    "    split,\n",
    "    metric=metric,\n",
    "    max_confidence=max_confidence,\n",
    "    threshold=threshold,\n",
    "    n_bins=11,\n",
    "    is_unsloth=True,\n",
    ")\n",
    "gathered_metrics.append(metrics)\n",
    "\n",
    "# gathered_metrics is a list, so use the DataFrame constructor directly;\n",
    "# DataFrame.from_dict is documented for dict input only.\n",
    "df = pd.DataFrame(gathered_metrics)\n",
    "# to_markdown() needs the optional 'tabulate' package installed.\n",
    "print(df.to_markdown())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calls the reference-curve helper first, then the model's curve with the shared config.\n",
    "# NOTE(review): presumably both draw onto the same figure -- confirm in Evaluation.\n",
    "get_ideal_calibration_curve()\n",
    "get_calibration_curve(\n",
    "    model_dir,\n",
    "    dataset,\n",
    "    split,\n",
    "    metric=metric,\n",
    "    max_confidence=max_confidence,\n",
    "    threshold=threshold,\n",
    "    is_unsloth=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pass the max_confidence setting from the configuration cell rather than a literal,\n",
    "# so this call uses the same value as the metrics and calibration-curve cells.\n",
    "get_histogram_of_confidences(\n",
    "    model_dir,\n",
    "    dataset,\n",
    "    split,\n",
    "    max_confidence=max_confidence,\n",
    "    is_unsloth=True,\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "unsloth_sft2",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
