{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "00e7fdc8",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import os\n",
    "from pathlib import Path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "7c28f810",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "🔍 Auto-discovered latest enriched dataset: mbpp_with_complexity_20250622_230039.parquet\n",
      "📅 Last modified: 2025-06-22 23:00:39\n"
     ]
    }
   ],
   "source": [
    "# Paste the parquet file path here, or leave empty to auto-find latest\n",
    "file_path = \"\"  # Replace with your parquet file path, or leave empty for auto-discovery\n",
    "\n",
    "# Auto-discovery of latest dataset if file_path is empty\n",
    "if not file_path.strip():\n",
    "    import glob\n",
    "    \n",
    "    # Search for all MBPP enriched dataset parquet files\n",
    "    datasets_dir = \"../data/phase0/\"\n",
    "    pattern = os.path.join(datasets_dir, \"mbpp_with_complexity_*.parquet\")\n",
    "    matching_files = glob.glob(pattern)\n",
    "    \n",
    "    if matching_files:\n",
    "        # Sort by modification time to get the latest file\n",
    "        latest_file = max(matching_files, key=os.path.getmtime)\n",
    "        file_path = latest_file\n",
    "        print(f\"🔍 Auto-discovered latest enriched dataset: {Path(file_path).name}\")\n",
    "        print(f\"📅 Last modified: {pd.Timestamp.fromtimestamp(os.path.getmtime(file_path)).strftime('%Y-%m-%d %H:%M:%S')}\")\n",
    "    else:\n",
    "        raise FileNotFoundError(f\"No MBPP enriched dataset files found in {datasets_dir}\")\n",
    "else:\n",
    "    print(f\"📁 Using specified file: {Path(file_path).name}\")\n",
    "\n",
    "# Verify file exists\n",
    "if not os.path.exists(file_path):\n",
    "    raise FileNotFoundError(f\"File not found: {file_path}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "85cdf668",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Enriched MBPP Dataset Information:\n",
      "Number of records: 974\n",
      "Number of columns: 5\n",
      "\n",
      "Columns:\n",
      "  - task_id\n",
      "  - text\n",
      "  - code\n",
      "  - test_list\n",
      "  - cyclomatic_complexity\n",
      "\n",
      "File size: 0.20 MB\n"
     ]
    }
   ],
   "source": [
    "# Load parquet file and display basic info\n",
    "df = pd.read_parquet(file_path)\n",
    "\n",
    "print(f\"Enriched MBPP Dataset Information:\")\n",
    "print(f\"Number of records: {len(df):,}\")\n",
    "print(f\"Number of columns: {len(df.columns)}\")\n",
    "print(f\"\\nColumns:\")\n",
    "for col in df.columns:\n",
    "    print(f\"  - {col}\")\n",
    "print(f\"\\nFile size: {os.path.getsize(file_path) / (1024**2):.2f} MB\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "7f8142e3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Column Information:\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 974 entries, 0 to 973\n",
      "Data columns (total 5 columns):\n",
      " #   Column                 Non-Null Count  Dtype \n",
      "---  ------                 --------------  ----- \n",
      " 0   task_id                974 non-null    int64 \n",
      " 1   text                   974 non-null    object\n",
      " 2   code                   974 non-null    object\n",
      " 3   test_list              974 non-null    object\n",
      " 4   cyclomatic_complexity  974 non-null    int64 \n",
      "dtypes: int64(2), object(3)\n",
      "memory usage: 38.2+ KB\n",
      "None\n",
      "\n",
      "Data types:\n",
      "task_id                   int64\n",
      "text                     object\n",
      "code                     object\n",
      "test_list                object\n",
      "cyclomatic_complexity     int64\n",
      "dtype: object\n"
     ]
    }
   ],
   "source": [
    "# Display column types and basic statistics\n",
    "print(\"Column Information:\")\n",
    "print(df.info())\n",
    "print(\"\\nData types:\")\n",
    "print(df.dtypes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "qzc5vp76te",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "First 10 records (complete table):\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>task_id</th>\n",
       "      <th>text</th>\n",
       "      <th>code</th>\n",
       "      <th>test_list</th>\n",
       "      <th>cyclomatic_complexity</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Write a function to find the minimum cost path to reach (m, n) from (0, 0) for the given cost matrix cost[][] and a position (m, n) in cost[][].</td>\n",
       "      <td>R = 3\\r\\nC = 3\\r\\ndef min_cost(cost, m, n): \\r\\n\\ttc = [[0 for x in range(C)] for x in range(R)] \\r\\n\\ttc[0][0] = cost[0][0] \\r\\n\\tfor i in range(1, m+1): \\r\\n\\t\\ttc[i][0] = tc[i-1][0] + cost[i][0] \\r\\n\\tfor j in range(1, n+1): \\r\\n\\t\\ttc[0][j] = tc[0][j-1] + cost[0][j] \\r\\n\\tfor i in range(1, m+1): \\r\\n\\t\\tfor j in range(1, n+1): \\r\\n\\t\\t\\ttc[i][j] = min(tc[i-1][j-1], tc[i-1][j], tc[i][j-1]) + cost[i][j] \\r\\n\\treturn tc[m][n]</td>\n",
       "      <td>[assert min_cost([[1, 2, 3], [4, 8, 2], [1, 5, 3]], 2, 2) == 8, assert min_cost([[2, 3, 4], [5, 9, 3], [2, 6, 4]], 2, 2) == 12, assert min_cost([[3, 4, 5], [6, 10, 4], [3, 7, 5]], 2, 2) == 16]</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>Write a function to find the similar elements from the given two tuple lists.</td>\n",
       "      <td>def similar_elements(test_tup1, test_tup2):\\r\\n  res = tuple(set(test_tup1) &amp; set(test_tup2))\\r\\n  return (res)</td>\n",
       "      <td>[assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5), assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4), assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14)]</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>Write a python function to identify non-prime numbers.</td>\n",
       "      <td>import math\\r\\ndef is_not_prime(n):\\r\\n    result = False\\r\\n    for i in range(2,int(math.sqrt(n)) + 1):\\r\\n        if n % i == 0:\\r\\n            result = True\\r\\n    return result</td>\n",
       "      <td>[assert is_not_prime(2) == False, assert is_not_prime(10) == True, assert is_not_prime(35) == True]</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   task_id  \\\n",
       "0        1   \n",
       "1        2   \n",
       "2        3   \n",
       "\n",
       "                                                                                                                                               text  \\\n",
       "0  Write a function to find the minimum cost path to reach (m, n) from (0, 0) for the given cost matrix cost[][] and a position (m, n) in cost[][].   \n",
       "1                                                                     Write a function to find the similar elements from the given two tuple lists.   \n",
       "2                                                                                            Write a python function to identify non-prime numbers.   \n",
       "\n",
       "                                                                                                                                                                                                                                                                                                                                                                                                                                             code  \\\n",
       "0  R = 3\\r\\nC = 3\\r\\ndef min_cost(cost, m, n): \\r\\n\\ttc = [[0 for x in range(C)] for x in range(R)] \\r\\n\\ttc[0][0] = cost[0][0] \\r\\n\\tfor i in range(1, m+1): \\r\\n\\t\\ttc[i][0] = tc[i-1][0] + cost[i][0] \\r\\n\\tfor j in range(1, n+1): \\r\\n\\t\\ttc[0][j] = tc[0][j-1] + cost[0][j] \\r\\n\\tfor i in range(1, m+1): \\r\\n\\t\\tfor j in range(1, n+1): \\r\\n\\t\\t\\ttc[i][j] = min(tc[i-1][j-1], tc[i-1][j], tc[i][j-1]) + cost[i][j] \\r\\n\\treturn tc[m][n]   \n",
       "1                                                                                                                                                                                                                                                                                                                                def similar_elements(test_tup1, test_tup2):\\r\\n  res = tuple(set(test_tup1) & set(test_tup2))\\r\\n  return (res)    \n",
       "2                                                                                                                                                                                                                                                           import math\\r\\ndef is_not_prime(n):\\r\\n    result = False\\r\\n    for i in range(2,int(math.sqrt(n)) + 1):\\r\\n        if n % i == 0:\\r\\n            result = True\\r\\n    return result   \n",
       "\n",
       "                                                                                                                                                                                               test_list  \\\n",
       "0       [assert min_cost([[1, 2, 3], [4, 8, 2], [1, 5, 3]], 2, 2) == 8, assert min_cost([[2, 3, 4], [5, 9, 3], [2, 6, 4]], 2, 2) == 12, assert min_cost([[3, 4, 5], [6, 10, 4], [3, 7, 5]], 2, 2) == 16]   \n",
       "1  [assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5), assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4), assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14)]   \n",
       "2                                                                                                    [assert is_not_prime(2) == False, assert is_not_prime(10) == True, assert is_not_prime(35) == True]   \n",
       "\n",
       "   cyclomatic_complexity  \n",
       "0                      7  \n",
       "1                      1  \n",
       "2                      3  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display first 10 records - full table view\n",
    "# Set pandas display options to show full content\n",
    "pd.set_option('display.max_columns', None)  # Show all columns\n",
    "pd.set_option('display.max_rows', None)     # Show all rows (for head(10))\n",
    "pd.set_option('display.max_colwidth', None) # Show full textin each cell\n",
    "pd.set_option('display.width', None)        # Don't wrap to terminal width\n",
    "print(\"First 10 records (complete table):\")\n",
    "df.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "2fc941a1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "============================================================\n",
      "MBPP ENRICHED DATASET SUMMARY\n",
      "============================================================\n",
      "\n",
      "📊 DATASET OVERVIEW\n",
      "Total problems analyzed: 974\n",
      "Data source: mbpp_with_complexity_20250622_230039.parquet\n",
      "Analysis timestamp: 2025-08-25 13:12:19\n",
      "\n",
      "📦 ENRICHED DATASET CONTENTS\n",
      "This dataset contains the complete MBPP test set with:\n",
      "  • task_id: Unique identifier for each problem\n",
      "  • text: Problem description\n",
      "  • code: Reference solution\n",
      "  • test_list: Test cases for validation\n",
      "  • cyclomatic_complexity: Computed difficulty metric\n",
      "\n",
      "🧮 COMPLEXITY STATISTICS\n",
      "Minimum complexity: 1\n",
      "Maximum complexity: 16\n",
      "Mean complexity: 2.84\n",
      "Median complexity: 2.0\n",
      "Standard deviation: 2.02\n",
      "\n",
      "📈 PERCENTILE DISTRIBUTION\n",
      "25th percentile: 1.0\n",
      "75th percentile: 4.0\n",
      "90th percentile: 5.0\n",
      "\n",
      "📋 COMPLEXITY DISTRIBUTION\n",
      "Complexity Level | Count | Percentage\n",
      "----------------------------------------\n",
      "            1 |   268 |    27.5%\n",
      "            2 |   250 |    25.7%\n",
      "            3 |   193 |    19.8%\n",
      "            4 |   121 |    12.4%\n",
      "            5 |    67 |     6.9%\n",
      "            6 |    22 |     2.3%\n",
      "            7 |    16 |     1.6%\n",
      "            8 |    13 |     1.3%\n",
      "            9 |    10 |     1.0%\n",
      "           10 |     4 |     0.4%\n",
      "... and 4 more complexity levels\n",
      "\n",
      "📝 METHODOLOGY NOTE\n",
      "This enriched dataset serves as the single source of truth for all\n",
      "downstream phases. It combines the original MBPP data with computed\n",
      "cyclomatic complexity metrics, enabling consistent difficulty-based\n",
      "sampling and analysis throughout the PVA-SAE pipeline.\n"
     ]
    }
   ],
   "source": [
    "# Comprehensive Enriched Dataset Analysis\n",
    "import numpy as np\n",
    "from datetime import datetime\n",
    "\n",
    "print(\"=\" * 60)\n",
    "print(\"MBPP ENRICHED DATASET SUMMARY\")\n",
    "print(\"=\" * 60)\n",
    "\n",
    "# Basic statistics\n",
    "total_problems = len(df)\n",
    "complexity_scores = df['cyclomatic_complexity'].values\n",
    "\n",
    "print(f\"\\n📊 DATASET OVERVIEW\")\n",
    "print(f\"Total problems analyzed: {total_problems:,}\")\n",
    "print(f\"Data source: {Path(file_path).name}\")\n",
    "print(f\"Analysis timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\")\n",
    "\n",
    "print(f\"\\n📦 ENRICHED DATASET CONTENTS\")\n",
    "print(\"This dataset contains the complete MBPP test set with:\")\n",
    "print(\"  • task_id: Unique identifier for each problem\")\n",
    "print(\"  • text: Problem description\")\n",
    "print(\"  • code: Reference solution\")\n",
    "print(\"  • test_list: Test cases for validation\")\n",
    "print(\"  • cyclomatic_complexity: Computed difficulty metric\")\n",
    "\n",
    "print(f\"\\n🧮 COMPLEXITY STATISTICS\")\n",
    "print(f\"Minimum complexity: {complexity_scores.min()}\")\n",
    "print(f\"Maximum complexity: {complexity_scores.max()}\")\n",
    "print(f\"Mean complexity: {complexity_scores.mean():.2f}\")\n",
    "print(f\"Median complexity: {np.median(complexity_scores):.1f}\")\n",
    "print(f\"Standard deviation: {complexity_scores.std():.2f}\")\n",
    "\n",
    "print(f\"\\n📈 PERCENTILE DISTRIBUTION\")\n",
    "print(f\"25th percentile: {np.percentile(complexity_scores, 25):.1f}\")\n",
    "print(f\"75th percentile: {np.percentile(complexity_scores, 75):.1f}\")\n",
    "print(f\"90th percentile: {np.percentile(complexity_scores, 90):.1f}\")\n",
    "\n",
    "print(f\"\\n📋 COMPLEXITY DISTRIBUTION\")\n",
    "# Show distribution by complexity levels\n",
    "complexity_counts = df['cyclomatic_complexity'].value_counts().sort_index()\n",
    "print(\"Complexity Level | Count | Percentage\")\n",
    "print(\"-\" * 40)\n",
    "for complexity, count in complexity_counts.head(10).items():\n",
    "    percentage = (count / total_problems) * 100\n",
    "    print(f\"{complexity:>13} | {count:>5} | {percentage:>7.1f}%\")\n",
    "\n",
    "if len(complexity_counts) > 10:\n",
    "    remaining = len(complexity_counts) - 10\n",
    "    print(f\"... and {remaining} more complexity levels\")\n",
    "\n",
    "print(f\"\\n📝 METHODOLOGY NOTE\")\n",
    "print(\"This enriched dataset serves as the single source of truth for all\")\n",
    "print(\"downstream phases. It combines the original MBPP data with computed\")\n",
    "print(\"cyclomatic complexity metrics, enabling consistent difficulty-based\")\n",
    "print(\"sampling and analysis throughout the PVA-SAE pipeline.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "5ba09fbb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "💡 Example Usage - Accessing Complete Problem Information:\n",
      "============================================================\n",
      "\n",
      "Problem 1:\n",
      "Description: Write a function to find the minimum cost path to reach (m, n) from (0, 0) for the given cost matrix cost[][] and a position (m, n) in cost[][].\n",
      "\n",
      "Reference Solution:\n",
      "R = 3\n",
      "C = 3\n",
      "def min_cost(cost, m, n): \n",
      "\ttc = [[0 for x in range(C)] for x in range(R)] \n",
      "\ttc[0][0] = cost[0][0] \n",
      "\tfor i in range(1, m+1): \n",
      "\t\ttc[i][0] = tc[i-1][0] + cost[i][0] \n",
      "\tfor j in range(1, n+1): \n",
      "\t\ttc[0][j] = tc[0][j-1] + cost[0][j] \n",
      "\tfor i in range(1, m+1): \n",
      "\t\tfor j in range(1, n+1): \n",
      "\t\t\ttc[i][j] = min(tc[i-1][j-1], tc[i-1][j], tc[i][j-1]) + cost[i][j] \n",
      "\treturn tc[m][n]\n",
      "\n",
      "Test Cases (3 total):\n",
      "  1. assert min_cost([[1, 2, 3], [4, 8, 2], [1, 5, 3]], 2, 2) == 8\n",
      "  2. assert min_cost([[2, 3, 4], [5, 9, 3], [2, 6, 4]], 2, 2) == 12\n",
      "  3. assert min_cost([[3, 4, 5], [6, 10, 4], [3, 7, 5]], 2, 2) == 16\n",
      "\n",
      "Complexity Score: 7\n",
      "\n",
      "This enriched format provides everything needed for downstream phases!\n"
     ]
    }
   ],
   "source": [
    "# Example: Accessing full MBPP data from the enriched dataset\n",
    "print(\"💡 Example Usage - Accessing Complete Problem Information:\")\n",
    "print(\"=\" * 60)\n",
    "\n",
    "# Show a complete example record\n",
    "example_idx = 0\n",
    "example = df.iloc[example_idx]\n",
    "\n",
    "print(f\"\\nProblem {example['task_id']}:\")\n",
    "print(f\"Description: {example['text']}\")\n",
    "print(f\"\\nReference Solution:\")\n",
    "print(example['code'])\n",
    "print(f\"\\nTest Cases ({len(example['test_list'])} total):\")\n",
    "for i, test in enumerate(example['test_list'][:3]):  # Show first 3 tests\n",
    "    print(f\"  {i+1}. {test}\")\n",
    "if len(example['test_list']) > 3:\n",
    "    print(f\"  ... and {len(example['test_list']) - 3} more tests\")\n",
    "print(f\"\\nComplexity Score: {example['cyclomatic_complexity']}\")\n",
    "print(f\"\\nThis enriched format provides everything needed for downstream phases!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "65a4c2a7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "=== Testing Prompt Builder with Multiple Examples ===\n",
      "\n",
      "Example 1:\n",
      "Task ID: 1\n",
      "Problem: Write a function to find the minimum cost path to reach (m, n) from (0, 0) for the given cost matrix...\n",
      "\n",
      "Prompt Generated:\n",
      "```\n",
      "Write a function to find the minimum cost path to reach (m, n) from (0, 0) for the given cost matrix cost[][] and a position (m, n) in cost[][].\n",
      "\n",
      "assert min_cost([[1, 2, 3], [4, 8, 2], [1, 5, 3]], 2, 2) == 8\n",
      "assert min_cost([[2, 3, 4], [5, 9, 3], [2, 6, 4]], 2, 2) == 12\n",
      "assert min_cost([[3, 4, 5], [6, 10, 4], [3, 7, 5]], 2, 2) == 16\n",
      "\n",
      "# Solution:\n",
      "```\n",
      "\n",
      "======================================================================\n",
      "\n",
      "Example 2:\n",
      "Task ID: 2\n",
      "Write a function to find the similar elements from the given two tuple lists.\n",
      "\n",
      "Prompt Generated:\n",
      "```\n",
      "Write a function to find the similar elements from the given two tuple lists.\n",
      "\n",
      "assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n",
      "assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4)\n",
      "assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14)\n",
      "\n",
      "# Solution:\n",
      "```\n",
      "\n",
      "======================================================================\n",
      "\n",
      "Example 3:\n",
      "Task ID: 3\n",
      "Write a python function to identify non-prime numbers.\n",
      "\n",
      "Prompt Generated:\n",
      "```\n",
      "Write a python function to identify non-prime numbers.\n",
      "\n",
      "assert is_not_prime(2) == False\n",
      "assert is_not_prime(10) == True\n",
      "assert is_not_prime(35) == True\n",
      "\n",
      "# Solution:\n",
      "```\n",
      "\n",
      "======================================================================\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Test prompt builder with multiple examples\n",
    "import sys\n",
    "sys.path.append('..')  # Add parent directory to path\n",
    "from common.prompt_utils import PromptBuilder\n",
    "import numpy as np\n",
    "\n",
    "print(\"=== Testing Prompt Builder with Multiple Examples ===\\n\")\n",
    "\n",
    "for i in range(min(3, len(df))):  # Test first 3 problems\n",
    "    sample = df.iloc[i]\n",
    "    \n",
    "    # Get test cases\n",
    "    test_cases = sample.get('test_list', [])\n",
    "    \n",
    "    # Handle if test_cases is a numpy array or list\n",
    "    if isinstance(test_cases, np.ndarray):\n",
    "        test_cases = test_cases.tolist()\n",
    "    \n",
    "    if test_cases and len(test_cases) > 0:\n",
    "        test_cases_str = '\\n'.join(test_cases)\n",
    "    else:\n",
    "        test_cases_str = \"# No test cases provided\"\n",
    "    \n",
    "    # Build prompt\n",
    "    prompt = PromptBuilder.build_prompt(\n",
    "        problem_description=sample['text'],\n",
    "        test_cases=test_cases_str\n",
    "    )\n",
    "    \n",
    "    print(f\"Example {i+1}:\")\n",
    "    print(f\"Task ID: {sample['task_id']}\")\n",
    "    print(f\"Problem: {sample['text'][:100]}...\" if len(sample['text']) > 100 else sample['text'])\n",
    "    print(f\"\\nPrompt Generated:\")\n",
    "    print(\"```\")\n",
    "    print(prompt)\n",
    "    print(\"```\")\n",
    "    print(\"\\n\" + \"=\"*70 + \"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "da0a521d",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "pva_sae",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
