{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "972fd99f",
   "metadata": {},
   "source": [
    "# Cost-of-Pass: An Economic Framework for Evaluating Language Models"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "49f87a96",
   "metadata": {},
   "source": [
    "This notebook provides the code to fully reproduce our results! Each section below Setup corresponds to an experiment in the paper."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "10595d18",
   "metadata": {},
   "source": [
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "27deb58c",
   "metadata": {},
   "outputs": [],
   "source": [
    "from cost_of_pass import *\n",
    "from cost_of_pass.models.litellm import make_litellm_client\n",
    "from collections import defaultdict\n",
    "from datetime import datetime\n",
    "from cost_of_pass.evaluation.utils import expected_value"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "41da95ac",
   "metadata": {},
   "source": [
    "### Tasks & Metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8e22f9d1",
   "metadata": {},
   "outputs": [],
   "source": [
    "class AnalysisTask:\n",
    "    \"\"\"Pair a benchmark task with its evaluation metric and a baseline cost.\n",
    "\n",
    "    Args:\n",
    "        task_name: Name handed to `get_task` to resolve the task object.\n",
    "        n_samples: Forwarded to `get_task` as `n_samples`.\n",
    "        metric_name: Name handed to `get_metric` to resolve the metric object.\n",
    "        human_baseline_cost: Baseline cost; exposed as both `human_baseline_cost` and `baseline_cost`.\n",
    "        task_type: Category label for the task (e.g. \"Basic Quantitative\").\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, task_name, n_samples, metric_name, human_baseline_cost, task_type):\n",
    "        # Keep the raw configuration values on the instance.\n",
    "        self.task_name, self.n_samples = task_name, n_samples\n",
    "        self.metric_name, self.task_type = metric_name, task_type\n",
    "        # `baseline_cost` is a second name for the same value.\n",
    "        self.human_baseline_cost = self.baseline_cost = human_baseline_cost\n",
    "\n",
    "        # Resolve the task and metric objects from their names.\n",
    "        self.task = get_task(task_name, n_samples=n_samples)\n",
    "        self.metric = get_metric(metric_name)\n",
    "        \n",
    "\n",
    "# Tasks are grouped by category; every task is built with n_samples=128.\n",
    "basic_quantitative_tasks = [\n",
    "    AnalysisTask(\n",
    "        task_name=\"TwoDigitAddition\",\n",
    "        n_samples=128,\n",
    "        metric_name=\"MathExpressionMatch\",\n",
    "        human_baseline_cost=0.02,\n",
    "        task_type=\"Basic Quantitative\",\n",
    "    ),\n",
    "    AnalysisTask(\n",
    "        task_name=\"GSM8K\",\n",
    "        n_samples=128,\n",
    "        metric_name=\"MathExpressionMatch\",\n",
    "        human_baseline_cost=3.50,\n",
    "        task_type=\"Basic Quantitative\",\n",
    "    ),\n",
    "]\n",
    "\n",
    "knowledge_based_tasks = [\n",
    "    AnalysisTask(\n",
    "        task_name=\"BBQ\",\n",
    "        n_samples=128,\n",
    "        metric_name=\"MultipleChoiceMatch\",\n",
    "        human_baseline_cost=0.10,\n",
    "        task_type=\"Knowledge Based\",\n",
    "    ),\n",
    "    AnalysisTask(\n",
    "        task_name=\"GPQA_Diamond\",\n",
    "        n_samples=128,\n",
    "        metric_name=\"MultipleChoiceMatch\",\n",
    "        human_baseline_cost=58.0,\n",
    "        task_type=\"Knowledge Based\",\n",
    "    ),\n",
    "]\n",
    "\n",
    "complex_quantitative_tasks = [\n",
    "    AnalysisTask(\n",
    "        task_name=\"MATH500\",\n",
    "        n_samples=128,\n",
    "        metric_name=\"MathExpressionMatch\",\n",
    "        human_baseline_cost=12.0,\n",
    "        task_type=\"Complex Quantitative\",\n",
    "    ),\n",
    "    AnalysisTask(\n",
    "        task_name=\"AIME_2024\",\n",
    "        n_samples=128,\n",
    "        metric_name=\"MathExpressionMatch\",\n",
    "        human_baseline_cost=20.0,\n",
    "        task_type=\"Complex Quantitative\",\n",
    "    ),\n",
    "]\n",
    "\n",
    "# One entry per category, in the order the experiment cells iterate over them.\n",
    "all_tasks = [basic_quantitative_tasks, knowledge_based_tasks, complex_quantitative_tasks]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8eceaf11",
   "metadata": {},
   "source": [
    "### Models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "56fc7322",
   "metadata": {},
   "outputs": [],
   "source": [
    "class AnalysisModel:\n",
    "    \"\"\"A model under evaluation together with the metadata used to group and order results.\n",
    "\n",
    "    Args:\n",
    "        model_name: Identifier passed to `get_client` (and to `make_litellm_client` when registering).\n",
    "        model_type: Category label for the model (e.g. \"Lightweight\").\n",
    "        release_date: Optional release date string such as \"2024-07-23\".\n",
    "        should_register: When true, register a LiteLLM client under `model_name` before it is looked up.\n",
    "        runtime_kwargs: Optional dict of extra runtime arguments stored with the model.\n",
    "            It is copied, so neither the caller's dict nor a shared default is ever mutated.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, model_name, model_type, release_date=None, should_register=False, runtime_kwargs=None):\n",
    "        self.model_name = model_name\n",
    "        self.model_type = model_type\n",
    "        self.release_date = release_date\n",
    "        self.should_register = should_register\n",
    "\n",
    "        # A `{}` default would be one dict shared by every instance; copy into a fresh dict instead.\n",
    "        self.runtime_kwargs = dict(runtime_kwargs) if runtime_kwargs else {}\n",
    "        # How many times to retry querying the API if it fails; an explicit caller value is respected.\n",
    "        self.runtime_kwargs.setdefault(\"max_attempts\", 5)\n",
    "\n",
    "        if should_register:\n",
    "            register_client(model_name)(make_litellm_client(model_name))\n",
    "        self.client = get_client(model_name)\n",
    "\n",
    "# NOTE: The Together AI models are registered manually (should_register=True) because they are not\n",
    "#       directly retrievable through LiteLLM, even though they are available there.\n",
    "# NOTE: The OpenAI reasoning models are given explicit sampling args, as they require.\n",
    "\n",
    "lightweight_models = [\n",
    "    AnalysisModel(\n",
    "        model_name=\"together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo\",\n",
    "        model_type=\"Lightweight\",\n",
    "        release_date=\"2024-07-23\",\n",
    "        should_register=True,\n",
    "    ),\n",
    "    AnalysisModel(\n",
    "        model_name=\"gpt-4o-mini-2024-07-18\",\n",
    "        model_type=\"Lightweight\",\n",
    "        release_date=\"2024-07-18\",\n",
    "    ),\n",
    "    AnalysisModel(\n",
    "        model_name=\"together_ai/meta-llama/Llama-3.3-70B-Instruct-Turbo\",\n",
    "        model_type=\"Lightweight\",\n",
    "        release_date=\"2024-12-06\",\n",
    "        should_register=True,\n",
    "    ),\n",
    "]\n",
    "\n",
    "large_models = [\n",
    "    AnalysisModel(\n",
    "        model_name=\"together_ai/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo\",\n",
    "        model_type=\"Large\",\n",
    "        release_date=\"2024-07-23\",\n",
    "        should_register=True,\n",
    "    ),\n",
    "    AnalysisModel(\n",
    "        model_name=\"claude-3-5-sonnet-20240620\",\n",
    "        model_type=\"Large\",\n",
    "        release_date=\"2024-06-20\",\n",
    "    ),\n",
    "    AnalysisModel(\n",
    "        model_name=\"gpt-4o-2024-05-13\",\n",
    "        model_type=\"Large\",\n",
    "        release_date=\"2024-05-13\",\n",
    "    ),\n",
    "]\n",
    "\n",
    "reasoning_models = [\n",
    "    AnalysisModel(\n",
    "        model_name=\"o1-mini-2024-09-12\",\n",
    "        model_type=\"Reasoning\",\n",
    "        release_date=\"2024-09-12\",\n",
    "        runtime_kwargs=dict(sampling_args=SamplingArgs(top_p=None, temperature=1.0)),\n",
    "    ),\n",
    "    AnalysisModel(\n",
    "        model_name=\"o1-2024-12-17\",\n",
    "        model_type=\"Reasoning\",\n",
    "        release_date=\"2024-12-05\",\n",
    "        runtime_kwargs=dict(sampling_args=SamplingArgs(top_p=None, temperature=1.0)),\n",
    "    ),\n",
    "    AnalysisModel(\n",
    "        model_name=\"together_ai/deepseek-ai/DeepSeek-R1\",\n",
    "        model_type=\"Reasoning\",\n",
    "        release_date=\"2025-01-20\",\n",
    "        should_register=True,\n",
    "    ),\n",
    "    AnalysisModel(\n",
    "        model_name=\"o3-mini-2025-01-31\",\n",
    "        model_type=\"Reasoning\",\n",
    "        release_date=\"2025-01-31\",\n",
    "        runtime_kwargs=dict(sampling_args=SamplingArgs(top_p=None, temperature=1.0)),\n",
    "    ),\n",
    "]\n",
    "\n",
    "# Category order matters: later cells use all_models[:-1] to select the non-reasoning models.\n",
    "all_models = [lightweight_models, large_models, reasoning_models]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0eec0b29",
   "metadata": {},
   "source": [
    "### Inference Time Techniques"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "e8543964",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Method used for every plain (non-augmented) model entry in the experiments below.\n",
    "vanilla_prompting = get_test_time_method(\"VanillaPromptMethod\")\n",
    "\n",
    "# Techniques studied in section 3.5: self-refinement, then majority voting with 3 and 4 votes.\n",
    "inference_time_techniques = [\n",
    "    get_test_time_method(\"SelfRefinementMethod\"),\n",
    "    *[get_test_time_method(\"MajorityVotingMethod\", n_votes=votes) for votes in (3, 4)],\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4ed2cb2e",
   "metadata": {},
   "source": [
    "## 3.2. Frontier Cost-of-Pass with a Single Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "84efe698",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Frontier cost-of-pass of each model evaluated on its own, for every task.\n",
    "single_model_fcop = defaultdict(float)\n",
    "\n",
    "for task in (t for group in all_tasks for t in group):\n",
    "    estimator = FrontierCostofPass(task=task.task, baseline_cop=task.baseline_cost, metric=task.metric)\n",
    "    for model in (m for group in all_models for m in group):\n",
    "        # A family holding exactly one (model, vanilla prompting) entry.\n",
    "        solo_family = ModelFamily(f\"Single Model: {model.model_name}\").add_model(\n",
    "            model.client, vanilla_prompting, model.runtime_kwargs\n",
    "        )\n",
    "        result_key = (task.task_type, model.model_type, task.task.task_name, model.model_name)\n",
    "        single_model_fcop[result_key] = estimator.estimate_task(solo_family, n_runs=8)\n",
    "\n",
    "single_model_fcop"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b9a9116b",
   "metadata": {},
   "source": [
    "## 3.3. Tracking Frontier Cost-of-Pass with New Releases"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "91dc8d5a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Frontier cost-of-pass as models are added in release order, normalized by the empty-family estimate.\n",
    "track_norm_fcop = defaultdict(float)\n",
    "\n",
    "# Flatten the categories, then order chronologically (the sort is stable for equal dates).\n",
    "models_sorted_by_release_date = [m for group in all_models for m in group]\n",
    "models_sorted_by_release_date.sort(key=lambda m: datetime.strptime(m.release_date, \"%Y-%m-%d\"))\n",
    "\n",
    "for task in (t for group in all_tasks for t in group):\n",
    "    estimator = FrontierCostofPass(task=task.task, baseline_cop=task.baseline_cost, metric=task.metric)\n",
    "\n",
    "    # Estimate with no models in the family; stored un-normalized under the \"Human Expert Baseline\" key.\n",
    "    family_over_time = ModelFamily(\"Models over time\")\n",
    "    base_frontier_cop = estimator.estimate_task(family_over_time, n_runs=8)\n",
    "    track_norm_fcop[(task.task_type, task.task.task_name, \"-\", \"Human Expert Baseline\")] = base_frontier_cop\n",
    "\n",
    "    for model in models_sorted_by_release_date:\n",
    "        # The family is cumulative: each release is added on top of all earlier ones.\n",
    "        family_over_time = family_over_time.add_model(model.client, vanilla_prompting, model.runtime_kwargs)\n",
    "        frontier_cop = estimator.estimate_task(family_over_time, n_runs=8)\n",
    "        result_key = (task.task_type, task.task.task_name, model.release_date, model.model_name)\n",
    "        track_norm_fcop[result_key] = frontier_cop / base_frontier_cop\n",
    "\n",
    "track_norm_fcop"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "238e951d",
   "metadata": {},
   "source": [
    "## 3.4. Essentialness of Model Families: Counterfactual Frontier Cost-of-Pass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "09cea52f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Relative increase in frontier cost-of-pass when a whole model category is removed.\n",
    "essentialness = defaultdict(float)\n",
    "\n",
    "# Full family: every model from every category, each with vanilla prompting.\n",
    "model_family_T = ModelFamily(\"All Models\")\n",
    "for model_category in all_models:\n",
    "    for model in model_category:\n",
    "        model_family_T = model_family_T.add_model(model.client, vanilla_prompting, model.runtime_kwargs)\n",
    "\n",
    "\n",
    "for task_category in all_tasks:\n",
    "    for task in task_category:\n",
    "        frontier_cop_estimator = FrontierCostofPass(\n",
    "            task=task.task,\n",
    "            baseline_cop=task.baseline_cost,\n",
    "            metric=task.metric,\n",
    "        )\n",
    "        frontier_cop = frontier_cop_estimator.estimate_task(model_family_T, n_runs=8)\n",
    "        for model_category in all_models:\n",
    "            # Take the label from the category itself, not from whatever the inner loop\n",
    "            # variable held last, so the family name and the result key always agree.\n",
    "            category_type = model_category[0].model_type\n",
    "            model_family_g_rem = ModelFamily(f\"Counterfactual of {category_type}\", model_family_T)\n",
    "            for model in model_category:\n",
    "                model_family_g_rem = model_family_g_rem.del_model(model.client, vanilla_prompting)\n",
    "            frontier_cop_g_rem = frontier_cop_estimator.estimate_task(model_family_g_rem, n_runs=8)\n",
    "            essentialness[(task.task_type, task.task.task_name, category_type)] = (frontier_cop_g_rem - frontier_cop) / frontier_cop_g_rem\n",
    "\n",
    "essentialness"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b3d17361",
   "metadata": {},
   "source": [
    "## 3.5. Impact of Inference Time Techniques on Frontier Cost-of-Pass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2f43268c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Relative reduction in frontier cost-of-pass from adding each inference-time technique.\n",
    "impact_of_inf_tech = defaultdict(float)\n",
    "\n",
    "# Reference family: every model except the last category of all_models, with vanilla prompting.\n",
    "model_family_L = ModelFamily(\"All non-reasoning models\")\n",
    "for model in (m for group in all_models[:-1] for m in group):\n",
    "    model_family_L = model_family_L.add_model(model.client, vanilla_prompting, model.runtime_kwargs)\n",
    "\n",
    "for task in (t for group in all_tasks for t in group):\n",
    "    estimator = FrontierCostofPass(task=task.task, baseline_cop=task.baseline_cost, metric=task.metric)\n",
    "    frontier_cop_L = estimator.estimate_task(model_family_L, n_runs=8)\n",
    "\n",
    "    for inf_tech in inference_time_techniques:\n",
    "        # Start from the reference family and add the same models again under this technique.\n",
    "        augmented_family = ModelFamily(f\"Models augmented with {inf_tech.method_name}\", model_family_L)\n",
    "        for model in (m for group in all_models[:-1] for m in group):\n",
    "            augmented_family = augmented_family.add_model(model.client, inf_tech, model.runtime_kwargs)\n",
    "        frontier_cop_Ls = estimator.estimate_task(augmented_family, n_runs=8)\n",
    "\n",
    "        result_key = (task.task_type, task.task.task_name, inf_tech.method_name)\n",
    "        impact_of_inf_tech[result_key] = (frontier_cop_L - frontier_cop_Ls) / frontier_cop_L\n",
    "\n",
    "impact_of_inf_tech"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "559ab86e",
   "metadata": {},
   "source": [
    "## C.1. Expected Accuracy and Inference Costs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dd358fc4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Expected performance and expected cost of every (task, model) pair under vanilla prompting.\n",
    "exp_acc = defaultdict(float)\n",
    "exp_cost = defaultdict(float)\n",
    "\n",
    "for task in (t for group in all_tasks for t in group):\n",
    "    for model in (m for group in all_models for m in group):\n",
    "        records = Evaluator(\n",
    "            model=model.client,\n",
    "            task=task.task,\n",
    "            tt_method=vanilla_prompting,\n",
    "        ).evaluate(task.metric, n_runs=8)\n",
    "\n",
    "        # Pull the two per-record fields out separately, performance first.\n",
    "        performances = [rec.performance for rec in records]\n",
    "        costs = [rec.cost for rec in records]\n",
    "\n",
    "        result_key = (task.task_type, task.task.task_name, model.model_type, model.model_name)\n",
    "        exp_acc[result_key] = expected_value(performances)\n",
    "        exp_cost[result_key] = expected_value(costs)\n",
    "\n",
    "exp_acc, exp_cost"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c1b7041e",
   "metadata": {},
   "source": [
    "## C.2. Relative Gain per Model Release"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "13f1db25",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Relative drop in frontier cost-of-pass contributed by each release, in chronological order.\n",
    "rel_gain_per_model = defaultdict(float)\n",
    "\n",
    "# Flatten the categories, then order chronologically (the sort is stable for equal dates).\n",
    "models_sorted_by_release_date = [m for group in all_models for m in group]\n",
    "models_sorted_by_release_date.sort(key=lambda m: datetime.strptime(m.release_date, \"%Y-%m-%d\"))\n",
    "\n",
    "for task in (t for group in all_tasks for t in group):\n",
    "    estimator = FrontierCostofPass(task=task.task, baseline_cop=task.baseline_cost, metric=task.metric)\n",
    "\n",
    "    # The first comparison point is the estimate for a family with no models in it.\n",
    "    family_over_time = ModelFamily(\"Models over time\")\n",
    "    prev_frontier_cop = estimator.estimate_task(family_over_time, n_runs=8)\n",
    "\n",
    "    for model in models_sorted_by_release_date:\n",
    "        family_over_time = family_over_time.add_model(model.client, vanilla_prompting, model.runtime_kwargs)\n",
    "        frontier_cop = estimator.estimate_task(family_over_time, n_runs=8)\n",
    "\n",
    "        result_key = (task.task_type, task.task.task_name, model.release_date, model.model_name)\n",
    "        rel_gain_per_model[result_key] = (prev_frontier_cop - frontier_cop) / prev_frontier_cop\n",
    "\n",
    "        # The next release is measured against the frontier that includes this one.\n",
    "        prev_frontier_cop = frontier_cop\n",
    "\n",
    "rel_gain_per_model"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7594fdb8",
   "metadata": {},
   "source": [
    "## C.3. Counterfactual Frontier Cost-of-Pass in the Absence of a Single Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6368e26e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Relative increase in frontier cost-of-pass when one single model is removed from the full family.\n",
    "counter_model = defaultdict(float)\n",
    "\n",
    "# Full family: every model from every category, each with vanilla prompting.\n",
    "model_family_T = ModelFamily(\"All Models\")\n",
    "for model_category in all_models:\n",
    "    for model in model_category:\n",
    "        model_family_T = model_family_T.add_model(model.client, vanilla_prompting, model.runtime_kwargs)\n",
    "\n",
    "\n",
    "for task_category in all_tasks:\n",
    "    for task in task_category:\n",
    "        frontier_cop_estimator = FrontierCostofPass(\n",
    "            task=task.task,\n",
    "            baseline_cop=task.baseline_cost,\n",
    "            metric=task.metric,\n",
    "        )\n",
    "        frontier_cop = frontier_cop_estimator.estimate_task(model_family_T, n_runs=8)\n",
    "        for model_category in all_models:\n",
    "            for model in model_category:\n",
    "                # Name the family after the one model being removed; naming it after the\n",
    "                # category type would give several distinct families the same name.\n",
    "                model_family_m_rem = ModelFamily(f\"Counterfactual of {model.model_name}\", model_family_T)\n",
    "                model_family_m_rem = model_family_m_rem.del_model(model.client, vanilla_prompting)\n",
    "                frontier_cop_m_rem = frontier_cop_estimator.estimate_task(model_family_m_rem, n_runs=8)\n",
    "                counter_model[(task.task_type, task.task.task_name, model.model_type, model.model_name)] = (frontier_cop_m_rem - frontier_cop) / frontier_cop_m_rem\n",
    "\n",
    "counter_model"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "cost_of_pass",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
