{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "3315d3f4",
   "metadata": {},
   "source": [
    "# Super Simple Autometrics Tutorial\n",
    "\n",
    "This tutorial shows the absolute basics of using autometrics.\n",
    "Just load a dataset, run the pipeline, and get your metrics!"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c772926e",
   "metadata": {},
   "source": [
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "202b54a7",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from getpass import getpass\n",
    "\n",
    "import dspy\n",
    "from autometrics.autometrics import Autometrics\n",
    "from autometrics.dataset.datasets.simplification.simplification import SimpDA\n",
    "from autometrics.aggregator.regression.ElasticNet import ElasticNet\n",
    "\n",
    "# Provide the OpenAI API key without writing it into the notebook.\n",
    "# A key already exported as OPENAI_API_KEY is kept as-is (never overwritten);\n",
    "# otherwise it is requested interactively and stored only in this process's environment.\n",
    "if not os.environ.get(\"OPENAI_API_KEY\"):\n",
    "    os.environ[\"OPENAI_API_KEY\"] = getpass(\"OpenAI API key: \")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7dafd1b3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# SimpDA is the text-simplification dataset; `target_measure` names the\n",
    "# human-score column we want to predict.\n",
    "target_measure = \"simplicity\"\n",
    "dataset = SimpDA()\n",
    "\n",
    "print(\n",
    "    f\"Dataset: {dataset.get_name()}\",\n",
    "    f\"Size: {len(dataset.get_dataframe())} examples\",\n",
    "    f\"Target measure: {target_measure}\",\n",
    "    sep=\"\\n\",\n",
    ")\n",
    "\n",
    "# Configure LLMs: GPT-4o-mini is used for both generation and judging\n",
    "# (two separate LM instances of the same model).\n",
    "generator_llm, judge_llm = (\n",
    "    dspy.LM(\"openai/gpt-4o-mini\"),\n",
    "    dspy.LM(\"openai/gpt-4o-mini\"),\n",
    ")\n",
    "\n",
    "print(\"LLMs configured!\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "01d124b9",
   "metadata": {},
   "source": [
    "## Autometrics Pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d56e08aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pipeline configuration for this tutorial:\n",
    "#   * the LLM-judge generator produces just 1 metric per trial\n",
    "#   * ElasticNet is used as the regression strategy instead of the default Lasso\n",
    "#   * a fixed seed is set for reproducibility\n",
    "#   * generated metrics get a directory unique to this tutorial\n",
    "# How many metrics to retrieve (10) and to keep (top 5) is passed later to `autometrics.run`.\n",
    "autometrics = Autometrics(\n",
    "    seed=42,\n",
    "    regression_strategy=ElasticNet,\n",
    "    generated_metrics_dir=\"tutorial_metrics\",\n",
    "    metric_generation_configs={\n",
    "        \"llm_judge\": {\n",
    "            \"metrics_per_trial\": 1,\n",
    "        },\n",
    "    },\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "91f3a58b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Outline the pipeline stages before starting the run.\n",
    "print(\n",
    "    \"Running autometrics pipeline...\",\n",
    "    \"This will:\",\n",
    "    \"1. Generate 1 LLM judge metric\",\n",
    "    \"2. Retrieve 10 relevant metrics from the bank\",\n",
    "    \"3. Evaluate all metrics on your dataset\",\n",
    "    \"4. Select top 5 using ElasticNet regression\",\n",
    "    \"5. Create a final aggregated metric\",\n",
    "    sep=\"\\n\",\n",
    ")\n",
    "\n",
    "# Retrieve 10 candidate metrics and select the top 5.\n",
    "results = autometrics.run(\n",
    "    dataset=dataset,\n",
    "    target_measure=target_measure,\n",
    "    judge_llm=judge_llm,\n",
    "    generator_llm=generator_llm,\n",
    "    num_to_regress=5,\n",
    "    num_to_retrieve=10,\n",
    ")\n",
    "\n",
    "print(\"Pipeline complete! 🎉\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "80f2b91b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Section banner.\n",
    "print(\"\\n\" + \"=\" * 50, \"RESULTS\", \"=\" * 50, sep=\"\\n\")\n",
    "\n",
    "# Metrics produced by the generation step.\n",
    "print(f\"\\nGenerated metrics: {len(results['all_generated_metrics'])}\")\n",
    "for position, metric in enumerate(results['all_generated_metrics'], start=1):\n",
    "    print(f\"  {position}. {metric.__name__}\")\n",
    "\n",
    "# Metrics retrieved from the bank; only the first 3 are listed.\n",
    "print(f\"\\nRetrieved metrics: {len(results['retrieved_metrics'])}\")\n",
    "for position, metric in enumerate(results['retrieved_metrics'][:3], start=1):\n",
    "    print(f\"  {position}. {metric.__name__}\")\n",
    "\n",
    "# Metrics that made the final selection.\n",
    "print(f\"\\nTop selected metrics: {len(results['top_metrics'])}\")\n",
    "for position, metric in enumerate(results['top_metrics'], start=1):\n",
    "    print(f\"  {position}. {metric.get_name()}\")\n",
    "\n",
    "# Name and description of the final regression metric.\n",
    "print(\n",
    "    f\"\\nFinal regression metric: {results['regression_metric'].get_name()}\",\n",
    "    f\"Description: {results['regression_metric'].get_description()}\",\n",
    "    sep=\"\\n\",\n",
    ")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6b297b57",
   "metadata": {},
   "source": [
    "## Use Your Metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9b4a5d3c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Imports for this cell are kept at its top so it runs top-to-bottom.\n",
    "import numpy as np\n",
    "from scipy.stats import pearsonr\n",
    "\n",
    "print(\"\\n\" + \"=\"*50)\n",
    "print(\"USING YOUR METRICS\")\n",
    "print(\"=\"*50)\n",
    "\n",
    "# Get predictions from your final metric, plus the human scores to compare against\n",
    "final_scores = results['regression_metric'].predict(dataset)\n",
    "human_scores = dataset.get_dataframe()[target_measure]\n",
    "\n",
    "# Work on plain NumPy arrays so every lookup below is positional, whatever\n",
    "# container `predict` returns and whatever index labels the DataFrame carries.\n",
    "predicted_values = np.asarray(final_scores)\n",
    "human_values = human_scores.to_numpy()\n",
    "\n",
    "print(\"\\nPredicted vs Human scores for first 5 examples:\")\n",
    "print(\"Example | Predicted | Human | Pred Rank | Human Rank\")\n",
    "print(\"-\" * 55)\n",
    "\n",
    "# Get first 5 examples (fewer if the dataset is smaller)\n",
    "first_5_pred = predicted_values[:5]\n",
    "first_5_human = human_values[:5]\n",
    "\n",
    "for i, (predicted, human) in enumerate(zip(first_5_pred, first_5_human)):\n",
    "    # Rank within these 5 examples: rank 1 = highest score, ties share a rank\n",
    "    pred_rank = (first_5_pred > predicted).sum() + 1\n",
    "    human_rank = (first_5_human > human).sum() + 1\n",
    "\n",
    "    print(f\"  {i+1}     | {predicted:.3f}    | {human:.3f} | {pred_rank:>9} | {human_rank:>10}\")\n",
    "\n",
    "# Check correlation with human scores over the whole dataset.\n",
    "# The p-value uses 3 significant digits so very small values are not shown as 0.000.\n",
    "correlation, p_value = pearsonr(human_values, predicted_values)\n",
    "print(f\"\\nCorrelation with human scores: {correlation:.3f} (p={p_value:.3g})\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ae427c0f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Report card stored in the pipeline results.\n",
    "print(\"\\n\" + \"=\" * 50, \"REPORT CARD\", \"=\" * 50, sep=\"\\n\")\n",
    "print(results['report_card'])\n",
    "\n",
    "# Closing summary of what the tutorial produced.\n",
    "print(\n",
    "    \"\\n\" + \"=\" * 50,\n",
    "    \"TUTORIAL COMPLETE!\",\n",
    "    \"=\" * 50,\n",
    "    \"You now have:\",\n",
    "    \"✅ A custom metric for your task\",\n",
    "    \"✅ Top 5 most relevant metrics\",\n",
    "    \"✅ A final aggregated metric\",\n",
    "    \"✅ Correlation with human judgments\",\n",
    "    \"\\nYou can use these metrics on new data!\",\n",
    "    sep=\"\\n\",\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "autometrics",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
