{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "JSONL file created at: data\\aime.jsonl\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import os\n",
    "from csv import DictReader\n",
    "\n",
    "from utils.utils import write_jsonl\n",
    "\n",
    "# Convert the AIME CSV into a JSONL file: one JSON object per CSV row, with\n",
    "# lowercased column names, 'answer' renamed to 'ground_truth', and a running 'idx'.\n",
    "\n",
    "# Replace with your file paths\n",
    "csv_file_path = os.path.join('data', 'AIME_Dataset_1983_2024.csv')\n",
    "jsonl_file_path = os.path.join('data', 'aime.jsonl')\n",
    "\n",
    "# newline='' is what the csv module asks for, so that line breaks inside\n",
    "# quoted fields are handed to the parser untouched.\n",
    "with open(csv_file_path, 'r', encoding='utf-8', newline='') as csv_file:\n",
    "    csv_reader = DictReader(csv_file)\n",
    "\n",
    "    with open(jsonl_file_path, 'w', encoding='utf-8') as jsonl_file:\n",
    "        for i, row in enumerate(csv_reader):\n",
    "            # Lowercase every column name. The value is taken from the pair being\n",
    "            # iterated, so the rename below works whatever casing the header uses.\n",
    "            row_modified = {k.lower(): v for k, v in row.items()}\n",
    "            # Rename 'answer' to 'ground_truth' (it ends up after the other columns).\n",
    "            if 'answer' in row_modified:\n",
    "                row_modified['ground_truth'] = row_modified.pop('answer')\n",
    "            # Zero-based position of the row within the CSV.\n",
    "            row_modified['idx'] = i\n",
    "            # One JSON object per line; non-ASCII characters are written as-is.\n",
    "            json_line = json.dumps(row_modified, ensure_ascii=False)\n",
    "            jsonl_file.write(json_line + '\\n')\n",
    "\n",
    "print(f\"JSONL file created at: {jsonl_file_path}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "from csv import DictReader\n",
    "\n",
    "from utils.utils import write_jsonl\n",
    "\n",
    "# NOTE(review): this cell repeats the AIME conversion from the previous cell and\n",
    "# writes to the same output path, so running both simply rewrites the file.\n",
    "\n",
    "# Replace with your file paths\n",
    "csv_file_path = os.path.join('data', 'AIME_Dataset_1983_2024.csv')\n",
    "jsonl_file_path = os.path.join('data', 'aime.jsonl')\n",
    "\n",
    "# Open the CSV with newline='' as the csv module requires, so that quoted\n",
    "# fields containing line breaks are parsed correctly.\n",
    "with open(csv_file_path, 'r', encoding='utf-8', newline='') as csv_file:\n",
    "    csv_reader = DictReader(csv_file)\n",
    "\n",
    "    # Stream rows straight into the JSONL file, one JSON object per line.\n",
    "    with open(jsonl_file_path, 'w', encoding='utf-8') as jsonl_file:\n",
    "        for i, row in enumerate(csv_reader):\n",
    "            # Lowercase the column names without assuming how the header is cased.\n",
    "            row_modified = {k.lower(): v for k, v in row.items()}\n",
    "            # Expose the answer column under the name 'ground_truth'.\n",
    "            if 'answer' in row_modified:\n",
    "                row_modified['ground_truth'] = row_modified.pop('answer')\n",
    "            # Zero-based row index.\n",
    "            row_modified['idx'] = i\n",
    "            # ensure_ascii=False keeps non-ASCII characters readable in the output.\n",
    "            json_line = json.dumps(row_modified, ensure_ascii=False)\n",
    "            jsonl_file.write(json_line + '\\n')\n",
    "\n",
    "print(f\"JSONL file created at: {jsonl_file_path}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "JSONL file created at: data\\train\\algebra.jsonl\n",
      "JSONL file created at: data\\train\\counting & probability.jsonl\n",
      "JSONL file created at: data\\train\\geometry.jsonl\n",
      "JSONL file created at: data\\train\\intermediate algebra.jsonl\n",
      "JSONL file created at: data\\train\\number theory.jsonl\n",
      "JSONL file created at: data\\train\\prealgebra.jsonl\n",
      "JSONL file created at: data\\train\\precalculus.jsonl\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import os\n",
    "from csv import DictReader\n",
    "\n",
    "from utils.utils import write_jsonl, read_jsonl\n",
    "\n",
    "# Split data/math/train.jsonl into one JSONL file per problem type, renaming\n",
    "# 'problem' to 'question' and 'solution' to 'answer' along the way.\n",
    "\n",
    "# Replace with your file paths\n",
    "# (the variable keeps its earlier name although the input here is JSONL, not CSV)\n",
    "csv_file_path = os.path.join('data', 'math', 'train.jsonl')\n",
    "data = read_jsonl(csv_file_path)\n",
    "\n",
    "# Maps lowercased problem type -> list of reformatted rows of that type.\n",
    "formatted_data = {}\n",
    "\n",
    "for row in data:\n",
    "    # Lowercase all keys.\n",
    "    row_modified = {k.lower(): v for k, v in row.items()}\n",
    "    # Rename 'problem' to 'question' and 'solution' to 'answer'.\n",
    "    row_modified['question'] = row_modified.pop('problem')\n",
    "    row_modified['answer'] = row_modified.pop('solution')\n",
    "\n",
    "    # Group by type, read from the lowercased keys so the header casing does not matter.\n",
    "    rows_of_type = formatted_data.setdefault(row_modified['type'].lower(), [])\n",
    "    # idx is the zero-based position of the row within its own type group.\n",
    "    row_modified['idx'] = len(rows_of_type)\n",
    "    rows_of_type.append(row_modified)\n",
    "\n",
    "# write_jsonl is defined elsewhere; creating the directory here is harmless\n",
    "# if it already takes care of that.\n",
    "output_dir = os.path.join('data', 'train')\n",
    "os.makedirs(output_dir, exist_ok=True)\n",
    "\n",
    "for key, value in formatted_data.items():\n",
    "    jsonl_file_path = os.path.join(output_dir, f'{key}.jsonl')\n",
    "    write_jsonl(value, jsonl_file_path)\n",
    "    # Report only after the write has returned.\n",
    "    print(f\"JSONL file created at: {jsonl_file_path}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "textgen",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
