{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import tiktoken # for token counting\n",
    "import numpy as np\n",
    "from collections import defaultdict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Num examples: 9000\n",
      "First example:\n",
      "{'role': 'system', 'content': \"You are an expert in mathematics who is precise and rigorous. You will be given a mathematical expression to integrate.\\nYour task is to integrate this expression step by step and at the end output a one line Python code that uses SymPy's sp.parse_expr('') to get the final answer as a sympy expression. Use symbol E (capital E) for Euler's number, symbol I for the imaginary unit, and symbol pi for the constant pi. Use exp for exponential function and log for natural logarithm. You can omit the absolute value operation in the result of integrals. Make sure not to use sp. prefix for the final result. For example, you can write tan(x) instead of sp.tan(x). Make sure sp.parse_expr() is used only once in the response and includes the final result. Do your best to make sure final result is easily parsable. Do not include constant of integration C in the final result. Do not use floating point numbers, instead keep things as simplified fractions. Even if you can't solve it, try your best and give a valid final answer as described above. For example, final result of (x**2) should look like 'sp.parse_expr('(x**3)/3')'.\\n\"}\n",
      "{'role': 'user', 'content': 'Expression to integrate: Integral(x + (4 - 5*x)/(x*sqrt(cos(E))), x)'}\n",
      "{'role': 'assistant', 'content': 'Here is a step-by-step solution to compute the integral: Integral(x + (4 - 5*x)/(x*sqrt(cos(E))), x)\\n\\nStep 1: \\nWe transform Integral(x + (4 - 5*x)/(x*sqrt(cos(E))), x) by breaking it into its additive parts. This results in Integral(x, x) + Integral((4 - 5*x)/(x*sqrt(cos(E))), x).\\n\\nStep 2: \\nWe transform Integral(x, x) by applying the power rule for integration. This results in x**2/2 + Integral((4 - 5*x)/(x*sqrt(cos(E))), x).\\n\\nStep 3: \\nWe transform (4 - 5*x)/(x*sqrt(cos(E))) by expanding the expression. This results in x**2/2 + Integral(-5/sqrt(cos(E)) + 4/(x*sqrt(cos(E))), x).\\n\\nStep 4: \\nWe transform Integral(-5/sqrt(cos(E)) + 4/(x*sqrt(cos(E))), x) by breaking it into its additive parts. This results in x**2/2 + Integral(4/(x*sqrt(cos(E))), x) + Integral(-5/sqrt(cos(E)), x).\\n\\nStep 5: \\nWe transform Integral(-5/sqrt(cos(E)), x) by integrating the constant term. This results in x**2/2 - 5*x/sqrt(cos(E)) + Integral(4/(x*sqrt(cos(E))), x).\\n\\nStep 6: \\nWe transform Integral(4/(x*sqrt(cos(E))), x) by taking out the constant factor. This results in x**2/2 - 5*x/sqrt(cos(E)) + 4*Integral(1/x, x)/sqrt(cos(E)).\\n\\nStep 7: \\nWe transform Integral(1/x, x) by using the fact that integral of 1/x is ln(x). This results in x**2/2 - 5*x/sqrt(cos(E)) + 4*log(x)/sqrt(cos(E)).\\n\\nFinal Answer: `sp.parse_expr(x**2/2 - 5*x/sqrt(cos(E)) + 4*log(x)/sqrt(cos(E)))`'}\n"
     ]
    }
   ],
   "source": [
    "data_path = \"integral_train_10000.jsonl\"\n",
    "\n",
    "# Load the dataset\n",
    "with open(data_path, 'r', encoding='utf-8') as f:\n",
    "    dataset = [json.loads(line) for line in f]\n",
    "\n",
    "# Initial dataset stats\n",
    "print(\"Num examples:\", len(dataset))\n",
    "print(\"First example:\")\n",
    "for message in dataset[0][\"messages\"]:\n",
    "    print(message)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "No errors found\n"
     ]
    }
   ],
   "source": [
    "# Format error checks\n",
    "format_errors = defaultdict(int)\n",
    "\n",
    "for ex in dataset:\n",
    "    if not isinstance(ex, dict):\n",
    "        format_errors[\"data_type\"] += 1\n",
    "        continue\n",
    "        \n",
    "    messages = ex.get(\"messages\", None)\n",
    "    if not messages:\n",
    "        format_errors[\"missing_messages_list\"] += 1\n",
    "        continue\n",
    "        \n",
    "    for message in messages:\n",
    "        if \"role\" not in message or \"content\" not in message:\n",
    "            format_errors[\"message_missing_key\"] += 1\n",
    "        \n",
    "        if any(k not in (\"role\", \"content\", \"name\", \"function_call\", \"weight\") for k in message):\n",
    "            format_errors[\"message_unrecognized_key\"] += 1\n",
    "        \n",
    "        if message.get(\"role\", None) not in (\"system\", \"user\", \"assistant\", \"function\"):\n",
    "            format_errors[\"unrecognized_role\"] += 1\n",
    "            \n",
    "        content = message.get(\"content\", None)\n",
    "        function_call = message.get(\"function_call\", None)\n",
    "        \n",
    "        if (not content and not function_call) or not isinstance(content, str):\n",
    "            format_errors[\"missing_content\"] += 1\n",
    "    \n",
    "    if not any(message.get(\"role\", None) == \"assistant\" for message in messages):\n",
    "        format_errors[\"example_missing_assistant_message\"] += 1\n",
    "\n",
    "if format_errors:\n",
    "    print(\"Found errors:\")\n",
    "    for k, v in format_errors.items():\n",
    "        print(f\"{k}: {v}\")\n",
    "else:\n",
    "    print(\"No errors found\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "encoding = tiktoken.get_encoding(\"cl100k_base\")\n",
    "\n",
    "# not exact!\n",
    "# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb\n",
    "def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):\n",
    "    num_tokens = 0\n",
    "    for message in messages:\n",
    "        num_tokens += tokens_per_message\n",
    "        for key, value in message.items():\n",
    "            num_tokens += len(encoding.encode(value))\n",
    "            if key == \"name\":\n",
    "                num_tokens += tokens_per_name\n",
    "    num_tokens += 3\n",
    "    return num_tokens\n",
    "\n",
    "def num_assistant_tokens_from_messages(messages):\n",
    "    num_tokens = 0\n",
    "    for message in messages:\n",
    "        if message[\"role\"] == \"assistant\":\n",
    "            num_tokens += len(encoding.encode(message[\"content\"]))\n",
    "    return num_tokens\n",
    "\n",
    "def print_distribution(values, name):\n",
    "    print(f\"\\n#### Distribution of {name}:\")\n",
    "    print(f\"min / max: {min(values)}, {max(values)}\")\n",
    "    print(f\"mean / median: {np.mean(values)}, {np.median(values)}\")\n",
    "    print(f\"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Num examples missing system message: 0\n",
      "Num examples missing user message: 0\n",
      "\n",
      "#### Distribution of num_messages_per_example:\n",
      "min / max: 3, 3\n",
      "mean / median: 3.0, 3.0\n",
      "p5 / p95: 3.0, 3.0\n",
      "\n",
      "#### Distribution of num_total_tokens_per_example:\n",
      "min / max: 363, 69538\n",
      "mean / median: 1005.3103333333333, 763.0\n",
      "p5 / p95: 482.0, 1519.1000000000004\n",
      "\n",
      "#### Distribution of num_assistant_tokens_per_example:\n",
      "min / max: 81, 69243\n",
      "mean / median: 716.8616666666667, 475.0\n",
      "p5 / p95: 198.0, 1227.0\n",
      "\n",
      "10 examples may be over the 16,385 token limit, they will be truncated during fine-tuning\n"
     ]
    }
   ],
   "source": [
    "# Warnings and tokens counts\n",
    "n_missing_system = 0\n",
    "n_missing_user = 0\n",
    "n_messages = []\n",
    "convo_lens = []\n",
    "assistant_message_lens = []\n",
    "\n",
    "for ex in dataset:\n",
    "    messages = ex[\"messages\"]\n",
    "    if not any(message[\"role\"] == \"system\" for message in messages):\n",
    "        n_missing_system += 1\n",
    "    if not any(message[\"role\"] == \"user\" for message in messages):\n",
    "        n_missing_user += 1\n",
    "    n_messages.append(len(messages))\n",
    "    convo_lens.append(num_tokens_from_messages(messages))\n",
    "    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))\n",
    "    \n",
    "print(\"Num examples missing system message:\", n_missing_system)\n",
    "print(\"Num examples missing user message:\", n_missing_user)\n",
    "print_distribution(n_messages, \"num_messages_per_example\")\n",
    "print_distribution(convo_lens, \"num_total_tokens_per_example\")\n",
    "print_distribution(assistant_message_lens, \"num_assistant_tokens_per_example\")\n",
    "n_too_long = sum(1 for l in convo_lens if l > 16385)\n",
    "print(f\"\\n{n_too_long} examples may be over the 16,385 token limit, they will be truncated during fine-tuning\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset has ~8962502 tokens that will be charged for during training\n",
      "By default, you'll train for 2 epochs on this dataset\n",
      "By default, you'll be charged for ~17925004 tokens\n"
     ]
    }
   ],
   "source": [
    "# Pricing and default n_epochs estimate\n",
    "MAX_TOKENS_PER_EXAMPLE = 16385\n",
    "\n",
    "TARGET_EPOCHS = 3\n",
    "MIN_TARGET_EXAMPLES = 100\n",
    "MAX_TARGET_EXAMPLES = 25000\n",
    "MIN_DEFAULT_EPOCHS = 1\n",
    "MAX_DEFAULT_EPOCHS = 25\n",
    "\n",
    "n_epochs = TARGET_EPOCHS\n",
    "n_train_examples = len(dataset)\n",
    "if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:\n",
    "    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)\n",
    "elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:\n",
    "    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)\n",
    "\n",
    "n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)\n",
    "print(f\"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training\")\n",
    "print(f\"By default, you'll train for {n_epochs} epochs on this dataset\")\n",
    "print(f\"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "formalmath",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
