{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ewTYsqbSadVO"
      },
      "outputs": [],
      "source": [
        "import numpy as np\n",
        "import matplotlib.pyplot as plt\n",
        "import scipy as sp\n",
        "from scipy import integrate\n",
        "import sympy as sy\n",
        "import random\n",
        "import os\n",
        "import pandas as pd"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "uFI6BEz6gcoO"
      },
      "source": [
        "### Generating \"traditional\" integrals"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "_qs_VDUwatKh"
      },
      "outputs": [],
      "source": [
        "def generate_poly(num_terms):\n",
        "    poly = np.zeros(10) \n",
        "    degrees = np.zeros(10)\n",
        "\n",
        "    for i in range(1, num_terms):\n",
        "        poly[i] = np.random.randint(0, 10)  \n",
        "        degrees[i] = np.random.randint(1, 20)  \n",
        "\n",
        "    return poly, degrees\n",
        "\n",
        "def integral(degree):\n",
        "    \"\"\"Draw a random problem and plot I(eps) = int_0^a dx / (eps + P(x)).\n",
        "\n",
        "    Args:\n",
        "        degree: forwarded to generate_poly as its ``num_terms`` argument.\n",
        "\n",
        "    Returns:\n",
        "        (poly, degrees, a): coefficient and exponent arrays of P(x) and the\n",
        "        upper integration limit ``a``, drawn uniformly from [0, 100).\n",
        "    \"\"\"\n",
        "    poly, degrees = generate_poly(degree)\n",
        "\n",
        "    a = np.random.uniform(0, 100)\n",
        "\n",
        "    # 100 values of epsilon, log-spaced between 1e-6 and 1e10.\n",
        "    eps_list = 10.**np.linspace(-6, 10, 100)\n",
        "\n",
        "    def integrand(x, eps):\n",
        "        return 1 / (eps + sum(poly[i] * x**degrees[i] for i in range(len(poly))))\n",
        "\n",
        "    I_eps = [integrate.quad(integrand, 0, a, args=(eps,))[0] for eps in eps_list]\n",
        "\n",
        "    # Plotting\n",
        "    plt.rcParams.update({'font.size': 22})\n",
        "    plt.figure(figsize=(15, 10))\n",
        "    plt.loglog(eps_list, np.abs(I_eps), 'o', mfc='none', markersize=10, label='numerical')\n",
        "    # Raw string: a backslash followed by 'e' is not a valid escape sequence\n",
        "    # in an ordinary string literal.\n",
        "    plt.xlabel(r'$\\epsilon$')\n",
        "    plt.ylabel('Integral')\n",
        "    plt.legend(loc='best')\n",
        "\n",
        "    return poly, degrees, a"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 805
        },
        "id": "SqceODgebMps",
        "outputId": "e5720b82-fd66-4fd4-ea73-f6b7c319e627"
      },
      "outputs": [],
      "source": [
        "# Draw one random problem and plot its numerically integrated I(epsilon);\n",
        "# poly, degrees and a are reused by later cells.\n",
        "poly, degrees, a = integral(7)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "-nugmu1Hhuj6",
        "outputId": "29e1a9d4-4d33-4612-80d7-ee8f4c875d72"
      },
      "outputs": [],
      "source": [
        "# Symbolic form of the integrand's denominator, epsilon + P(x).\n",
        "# Raw string: a backslash followed by 'e' is not a valid escape sequence.\n",
        "ep = sy.symbols(r'\\epsilon')\n",
        "x = sy.symbols('x')\n",
        "print(degrees)\n",
        "# Sum slots 1 .. len(poly) - 1 instead of spelling out each index by hand.\n",
        "eq = ep + sum(poly[i] * x**degrees[i] for i in range(1, len(poly)))"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "p-A-Pr1xPVmD"
      },
      "source": [
        "### Format LaTeX"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Ap3mjkbvOjsO"
      },
      "outputs": [],
      "source": [
        "def format_integral_question(poly, degrees, a):\n",
        "    \"\"\"Render the problem statement for I(epsilon) as a LaTeX-flavoured string.\n",
        "\n",
        "    Args:\n",
        "        poly: coefficients of P(x), one per term slot.\n",
        "        degrees: exponents of P(x), aligned with ``poly``.\n",
        "        a: upper limit of integration (printed with two decimals).\n",
        "\n",
        "    Returns:\n",
        "        The question text containing the integral of 1 / (epsilon + P(x))\n",
        "        over [0, a].\n",
        "    \"\"\"\n",
        "    var = sy.symbols('x')\n",
        "    denominator_poly = sum(coef * var**degrees[k] for k, coef in enumerate(poly))\n",
        "\n",
        "    pieces = [\n",
        "        r'Consider the integral $I(\\epsilon) = \\int_0^{',\n",
        "        f'{a:.2f}',\n",
        "        r'} \\frac{1}{\\epsilon + ',\n",
        "        sy.latex(denominator_poly),\n",
        "        r'} dx$. Develop analytical formulas that approximate $I(\\epsilon)$ for different regimes of $\\epsilon$.',\n",
        "    ]\n",
        "    return ''.join(pieces)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "zBpK3fmeOneg",
        "outputId": "a6bf5087-8623-406a-fd26-225f8f42e3e4"
      },
      "outputs": [],
      "source": [
        "latex_question = format_integral_question(poly, degrees, a)\n",
        "print(latex_question)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "98-F4orPiDcE"
      },
      "source": [
        "### Deriving approximate solutions"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Ht7ili31bvCi"
      },
      "source": [
        "### Small $\\epsilon$"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "SPvNlJ8ivFP7"
      },
      "outputs": [],
      "source": [
        "def find_smallest(array):\n",
        "    tempval = []\n",
        "    tempindex = []\n",
        "\n",
        "    for i, value in enumerate(array):\n",
        "        if value != 0:\n",
        "            tempval.append(value)  \n",
        "            tempindex.append(i)  \n",
        "\n",
        "    # Find the minimum among the non-zero values\n",
        "    min_value = np.min(tempval)\n",
        "    min_index = np.argmin(tempval)\n",
        "    original_index = tempindex[min_index]\n",
        "\n",
        "    return min_value, original_index"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "9CQ8eKAxb3is"
      },
      "source": [
        "### Large $\\epsilon$"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "zN5crIQqcAgR"
      },
      "outputs": [],
      "source": [
        "def find_largest(array):\n",
        "    tempval = []\n",
        "    tempindex = []\n",
        "\n",
        "    for i, value in enumerate(array):\n",
        "        if value != 0:\n",
        "            tempval.append(value)   \n",
        "            tempindex.append(i)      \n",
        "\n",
        "    # Find the maximum among the non-zero values\n",
        "    max_value = np.max(tempval)\n",
        "    max_index = np.argmax(tempval)\n",
        "    original_index = tempindex[max_index]\n",
        "\n",
        "    return max_value, original_index"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "02SyXIgxjHhj"
      },
      "outputs": [],
      "source": [
        "def solve_simple_integral(degree):\n",
        "    \"\"\"Generate one random integral problem together with its worked solution.\n",
        "\n",
        "    The problem is I(epsilon) = int_0^a dx / (epsilon + P(x)). The integral is\n",
        "    evaluated numerically on a grid of epsilon values and compared with\n",
        "    height-times-width estimates built from the lowest-degree term of P\n",
        "    (small epsilon), the highest-degree term (large epsilon) and the full\n",
        "    integration range (very large epsilon).\n",
        "\n",
        "    Args:\n",
        "        degree: forwarded to generate_poly as its ``num_terms`` argument.\n",
        "\n",
        "    Returns:\n",
        "        A 12-tuple: (latex_question, explanation, extracted_solution,\n",
        "        small / large / very large numerical values,\n",
        "        small / large / very large approximate values, poly, degrees, a).\n",
        "    \"\"\"\n",
        "    poly, degrees = generate_poly(degree)\n",
        "\n",
        "    # Upper integration limit: an integer in [0, 99].\n",
        "    a = np.random.randint(0, 100)\n",
        "\n",
        "    latex_question = format_integral_question(poly, degrees, a)\n",
        "\n",
        "    # eps = 1e-3 is prepended so that index 0 is the small-epsilon evaluation point.\n",
        "    eps_list = np.logspace(-6, 40, 100)\n",
        "    eps_list = np.concatenate(([1e-3], eps_list))\n",
        "\n",
        "    I_eps = [integrate.quad(lambda x: 1/(eps + sum(poly[i] * x**degrees[i] for i in range(len(poly)))), 0, a)[0] for eps in eps_list]\n",
        "\n",
        "    deg1, ind1 = find_smallest(degrees)\n",
        "    deg, ind = find_largest(degrees)\n",
        "\n",
        "    # Height 1/eps times width (eps / c)**(1/p), with (c, p) taken from the\n",
        "    # lowest-degree term (small eps) or the highest-degree term (large eps).\n",
        "    smalleps = [1/eps * (1/poly[ind1]*eps)**(1/deg1) for eps in eps_list]\n",
        "    largeeps = [1/eps * (1/poly[ind]*eps)**(1/deg) for eps in eps_list]\n",
        "    verylargeeps = [a/eps for eps in eps_list]\n",
        "\n",
        "    epsilon = sy.Symbol('epsilon')\n",
        "\n",
        "    # Symbolic counterparts of the lists above. Each formula pairs a\n",
        "    # coefficient with the degree of the same term: (ind1, deg1) for small\n",
        "    # epsilon and (ind, deg) for large epsilon.\n",
        "    smalleps_expr = round((1/poly[ind1]) ** (1/deg1), 2) * 1/epsilon * epsilon ** round((1/deg1), 2)\n",
        "    largeeps_expr = round((1/poly[ind]) ** (1/deg), 2) * 1/epsilon * epsilon ** round((1/deg), 2)\n",
        "    verylargeeps_expr = a/epsilon\n",
        "\n",
        "    smalleps_latex = sy.latex(smalleps_expr)\n",
        "    largeeps_latex = sy.latex(largeeps_expr)\n",
        "    verylargeeps_latex = sy.latex(verylargeeps_expr)\n",
        "\n",
        "    # Creating the LaTeX explanation. The widths are written as\n",
        "    # (epsilon / c)^(1/p), matching the expressions evaluated above.\n",
        "    explanation = r\"\"\"\n",
        "    The integral is of the form $I(\\epsilon) = \\int_0^{%s} \\frac{1}{\\epsilon + P(x)} dx$ where $P(x)$ is a polynomial. Thus, its value can be estimated as the product between a height and a width.\n",
        "    Since the integrand is maximized at $x = 0$, the height can be set to $\\frac{1}{\\epsilon}$.\n",
        "\n",
        "    For small $\\epsilon$,\n",
        "    we define the width as the point where the integrand becomes half of its maximum height.\n",
        "    This corresponds to solving for $x$ given $P(x) = \\epsilon$.\n",
        "    Applying dominant balance, considering the term in $P(x)$ with the smallest degree, the width is approximated as $ \\left( \\frac{\\epsilon}{%s} \\right)^{1/%s} $.\n",
        "    Therefore, the analytical approximation of the integral for small $\\epsilon$ is $\\boxed{I(\\epsilon) = %s}$.\n",
        "\n",
        "    For an intermediate regime where $\\epsilon$ is large,\n",
        "    we also define the width based on the term with the largest degree.\n",
        "    The width is approximated as \\( \\left( \\frac{\\epsilon}{%s} \\right)^{1/%s} \\).\n",
        "    Therefore, the analytical approximation of the integral for large $\\epsilon$ is $\\boxed{I(\\epsilon) = %s}$.\n",
        "\n",
        "    If the width of the integral exceeds the range of integration, we consider one more regime for very large $\\epsilon$.\n",
        "    The width is then just the range of integration, so in this regime, the integral can be approximated as $\\frac{L}{\\epsilon}$.\n",
        "    Therefore, the analytical approximation of the integral for very large $\\epsilon$ is $\\boxed{I(\\epsilon) = %s}$.\n",
        "\n",
        "    Altogether, the solutions at small, large, and very large $\\epsilon$ are $\\boxed{%s, %s, %s}$.\n",
        "    \"\"\" % (a, poly[ind1], deg1, smalleps_latex, poly[ind], deg, largeeps_latex,\n",
        "        verylargeeps_latex, smalleps_latex, largeeps_latex, verylargeeps_latex)\n",
        "\n",
        "    extracted_solution = r\"\"\"\n",
        "    $$\\boxed{[%s, %s]}$$\n",
        "    \"\"\" % (smalleps_latex, largeeps_latex)\n",
        "\n",
        "    # Evaluation points: index 0 (eps = 1e-3) for the small-epsilon regime,\n",
        "    # one third of the way through the grid for the large-epsilon regime and\n",
        "    # the last grid point for the very-large-epsilon regime.\n",
        "    large_index = int(len(I_eps)/3)\n",
        "\n",
        "    # Numerical values at small, large and very large epsilon\n",
        "    small_x_numerical_eval = np.abs(I_eps[0])\n",
        "    large_x_numerical_eval = np.abs(I_eps[large_index])\n",
        "    verylarge_x_numerical_eval = np.abs(I_eps[-1])\n",
        "\n",
        "    # Approximate values at the same three points\n",
        "    small_x_approx_eval = smalleps[0]\n",
        "    large_x_approx_eval = largeeps[large_index]\n",
        "    verylarge_x_approx_eval = verylargeeps[-1]\n",
        "\n",
        "    return (latex_question, explanation, extracted_solution,\n",
        "            small_x_numerical_eval, large_x_numerical_eval, verylarge_x_numerical_eval,\n",
        "            small_x_approx_eval, large_x_approx_eval, verylarge_x_approx_eval,\n",
        "            poly, degrees, a)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "### Verifier of solution accuracy"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "def verifier(small_eps_numerical, large_eps_numerical, verylarge_eps_numerical,\\\n",
        "    small_eps_sol, large_eps_sol, verylarge_eps_sol, error_limit):\n",
        "\n",
        "    place_holder_0 = 0\n",
        "\n",
        "    if (np.abs(small_eps_sol) == np.Inf) or (small_eps_sol == place_holder_0) or (small_eps_sol == np.NAN):\n",
        "        return True\n",
        "    if (np.abs(large_eps_sol) == np.Inf) or (large_eps_sol == place_holder_0) or (large_eps_sol == np.NAN):\n",
        "        return True\n",
        "    if (np.abs(verylarge_eps_sol) == np.Inf) or (verylarge_eps_sol == place_holder_0) or (verylarge_eps_sol == np.NAN):\n",
        "        return True \n",
        "\n",
        "    if (np.abs(small_eps_numerical) == np.Inf) or (small_eps_numerical == place_holder_0) or (small_eps_numerical == np.NAN):\n",
        "        return True\n",
        "    if (np.abs(large_eps_numerical) == np.Inf) or (large_eps_numerical == place_holder_0) or (large_eps_numerical == np.NAN):\n",
        "        return True\n",
        "    if (np.abs(verylarge_eps_numerical) == np.Inf) or (verylarge_eps_numerical == place_holder_0) or (verylarge_eps_numerical == np.NAN):\n",
        "        return True\n",
        "\n",
        "    if np.abs((large_eps_sol - large_eps_numerical) / (large_eps_numerical)) * 100 > error_limit:\n",
        "        return True\n",
        "    if np.abs((small_eps_sol - small_eps_numerical) / (small_eps_numerical)) * 100 > error_limit:\n",
        "        return True\n",
        "    if np.abs((verylarge_eps_sol - verylarge_eps_numerical) / (verylarge_eps_numerical)) * 100 > error_limit:\n",
        "        return True\n",
        "    \n",
        "    return False"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "### Plotting the solutions for visual verification"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "def plot_solution(poly, degrees, a):\n",
        "    \"\"\"Plot the numerical integral against its three asymptotic estimates.\n",
        "\n",
        "    Args:\n",
        "        poly: coefficients of P(x), one per term slot.\n",
        "        degrees: exponents of P(x), aligned with ``poly``.\n",
        "        a: upper limit of integration; used as given.\n",
        "    \"\"\"\n",
        "    eps_list = np.logspace(-6, 40, 100)\n",
        "\n",
        "    I_eps = [integrate.quad(lambda x: 1/(eps + sum(poly[i] * x**degrees[i] for i in range(len(poly)))), 0, a)[0] for eps in eps_list]\n",
        "\n",
        "    deg1, ind1 = find_smallest(degrees)\n",
        "    deg, ind = find_largest(degrees)\n",
        "\n",
        "    # Height 1/eps times width (eps / c)**(1/p) for the lowest- and\n",
        "    # highest-degree terms, and a/eps once the width is the whole range.\n",
        "    smalleps = [1/eps * (1/poly[ind1]*eps)**(1/deg1) for eps in eps_list]\n",
        "    largeeps = [1/eps * (1/poly[ind]*eps)**(1/deg) for eps in eps_list]\n",
        "    verylargeeps = [a/eps for eps in eps_list]\n",
        "\n",
        "    plt.rcParams.update({'font.size': 22})\n",
        "    plt.figure(figsize=(15, 10))\n",
        "    plt.loglog(eps_list, np.abs(I_eps), '-', mfc='none', markersize=10, label='Numerical Integration')\n",
        "    # Raw strings: a backslash followed by 'e' is not a valid escape sequence.\n",
        "    plt.loglog(eps_list, smalleps, '-', label=r'Approximation for small $\\epsilon$')\n",
        "    plt.loglog(eps_list, largeeps, 'r+', label=r'Approximation for large $\\epsilon$')\n",
        "    plt.loglog(eps_list, verylargeeps, 'o', label=r'Approximation for very large $\\epsilon$')\n",
        "\n",
        "    plt.xlabel(r'$\\epsilon$')\n",
        "    plt.ylabel('Integral')\n",
        "    plt.legend(loc='best')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Plot the numerical integral together with the three asymptotic approximations,\n",
        "# passing the poly, degrees and a returned by the integral(7) cell.\n",
        "plot_solution(poly, degrees, a)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "### Generating the dataset"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "def generate_dataset(n, filename, error_limit=10):\n",
        "  \"\"\"Generate ``n`` verified integral problems and write them to a CSV file.\n",
        "\n",
        "  Problems are drawn with solve_simple_integral until ``n`` of them are\n",
        "  unique and accepted by verifier; every failed, duplicate or rejected draw\n",
        "  is reported on stdout and skipped.\n",
        "\n",
        "  Args:\n",
        "    n: number of accepted problems to collect.\n",
        "    filename: CSV file name, joined onto the current working directory.\n",
        "    error_limit: maximum relative error in percent, forwarded to verifier.\n",
        "\n",
        "  Returns:\n",
        "    pandas.DataFrame with one row per accepted problem (also written to disk).\n",
        "  \"\"\"\n",
        "  current_path = os.getcwd()\n",
        "\n",
        "  columns = [\"question\", \"solution\", \"question type\", \"answer type\",\n",
        "             \"extracted answer\",\n",
        "             \"small_eval_point\", \"small_analytical\", \"small_numerical\",\n",
        "             \"large_eval_point\", \"large_analytical\", \"large_numerical\"]\n",
        "  problem_type = \"Integral\"\n",
        "  solution_type = \"list\"\n",
        "\n",
        "  rows = []\n",
        "  seen_questions = set()\n",
        "\n",
        "  num_complete_problems = 0\n",
        "  counter = 0\n",
        "\n",
        "  # Same epsilon grid as the one built inside solve_simple_integral; the\n",
        "  # evaluation points stored with each row are looked up by index in this copy.\n",
        "  eps_list = np.logspace(-6, 40, 100)\n",
        "  eps_list = np.concatenate(([1e-3], eps_list))\n",
        "  small_eval_point = eps_list[0]\n",
        "  large_eval_point = eps_list[int(len(eps_list)/3)]\n",
        "\n",
        "  while num_complete_problems < n:\n",
        "    counter += 1\n",
        "    try:\n",
        "      degree = random.randint(2, 7)\n",
        "      (question, solution, extracted_solution,\n",
        "       small_x_numerical_eval, large_x_numerical_eval, verylarge_x_numerical_eval,\n",
        "       small_x_approx_eval, large_x_approx_eval, verylarge_x_approx_eval,\n",
        "       _poly, _degrees, _a) = solve_simple_integral(degree)\n",
        "\n",
        "      if question in seen_questions:\n",
        "        raise ValueError(\"Duplicate problem detected.\")\n",
        "\n",
        "      if verifier(small_x_numerical_eval, large_x_numerical_eval, verylarge_x_numerical_eval,\n",
        "                  small_x_approx_eval, large_x_approx_eval, verylarge_x_approx_eval, error_limit):\n",
        "        raise Exception(\"Percent error exceeded.\")\n",
        "\n",
        "    except Exception as e:\n",
        "      # Best effort by design: any failure while drawing or checking a\n",
        "      # problem is reported and the draw is simply discarded.\n",
        "      print(f\"Error in iteration {counter}: {e}. {num_complete_problems} correct; moving to the next iteration.\")\n",
        "      continue\n",
        "\n",
        "    seen_questions.add(question)\n",
        "    rows.append([question, solution, problem_type, solution_type,\n",
        "                 extracted_solution,\n",
        "                 small_eval_point, small_x_approx_eval, small_x_numerical_eval,\n",
        "                 large_eval_point, large_x_approx_eval, large_x_numerical_eval])\n",
        "\n",
        "    num_complete_problems += 1\n",
        "    print(f\"Num complete: {num_complete_problems}\")\n",
        "\n",
        "  print(f\"Num total iterations: {counter}\")\n",
        "\n",
        "  # Build the frame straight from the row lists so numeric columns keep\n",
        "  # their numeric dtype and an empty result (n == 0) still yields a frame\n",
        "  # with the expected columns.\n",
        "  data_df = pd.DataFrame(rows, columns=columns)\n",
        "\n",
        "  full_path = os.path.join(current_path, filename)\n",
        "  data_df.to_csv(full_path, index=False)\n",
        "\n",
        "  print(f\"Number of complete problems: {num_complete_problems} / {counter}\")\n",
        "  return data_df"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Collect 150 accepted problems and write them to integral_train_dataset.csv\n",
        "# in the current working directory.\n",
        "n = 150\n",
        "data = generate_dataset(n, \"integral_train_dataset.csv\")"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.11.3"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
