{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 200,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import re\n",
    "import numpy as np\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 201,
   "metadata": {},
   "outputs": [],
   "source": [
    "def parse_latex_table(table_str):\n",
    "    # Correctly split the input text into lines at the LaTeX row ending '\\\\'\n",
    "    rows = table_str.strip().split('\\\\\\\\\\n')\n",
    "    \n",
    "    # Initialize an empty list to store our processed data\n",
    "    data = []\n",
    "\n",
    "    # Loop over each row\n",
    "    for row in rows:\n",
    "        # Remove the LaTeX formatting completely from the rows\n",
    "        clean_row = re.sub(r\"\\\\multirow{1}{\\*}{\\\\bfseries\\s*}\", \"\", row)\n",
    "        clean_row = re.sub(r\"\\\\bfseries\\s*\", \"\", clean_row)\n",
    "        clean_row = clean_row.strip()\n",
    "\n",
    "        # Split the row into cells based on '&' delimiter\n",
    "        cells = clean_row.split('&')\n",
    "\n",
    "        # Remove leading and trailing braces around model names\n",
    "        cells[0] = re.sub(r\"[{}]\", \"\", cells[0]).strip()\n",
    "        \n",
    "        # Append the cleaned cells to the data list\n",
    "        data.append(cells)\n",
    "\n",
    "    # Assuming the first column corresponds to model names\n",
    "    model_names = [row[0] for row in data]\n",
    "    \n",
    "    # Assuming the remaining columns are data points\n",
    "    data_points = [row[1:] for row in data]\n",
    "    \n",
    "    # Now create DataFrame with model names as the index and numeric columns\n",
    "    df = pd.DataFrame(data_points, index=model_names)\n",
    "    \n",
    "    # Convert all the non-numeric values to float and keep 'NR' as it is\n",
    "    df = df.apply(lambda x: pd.to_numeric(x, errors='ignore'))\n",
    "\n",
    "    return df\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 202,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Helper function to determine color based on the difference value\n",
    "def color_diff(value):\n",
    "    if value == 'NR' or pd.isna(value):\n",
    "        return \"NR\"\n",
    "    if abs(value) < 0.10:\n",
    "        # black if similar\n",
    "        return \"{:.3f}\".format(value)\n",
    "    elif value > 0:\n",
    "        # green if greater\n",
    "        return r\"\\textcolor{green}{%.3f}\" % value\n",
    "    else:\n",
    "        # red if lower\n",
    "        return r\"\\textcolor{red}{%.3f}\" % value\n",
    "\n",
    "# Convert 'NR' and calculate the differences, store them in the DataFrame\n",
    "def process_data(parsed_data):\n",
    "    df_parsed_table_data = parsed_data\n",
    "    for index, row in df_parsed_table_data.iterrows():\n",
    "        for i in range(0, len(row) - 2, 3):\n",
    "            # Ensure both values are not 'NR' and are numeric before subtracting\n",
    "            if row[i] != 'NR' and row[i+1] != 'NR':\n",
    "                try:\n",
    "                    # Attempt to convert both values to float and then subtract\n",
    "                    diff = float(row[i+1]) - float(row[i])\n",
    "                    df_parsed_table_data.at[index, row.index[i+2]] = diff\n",
    "                except ValueError:\n",
    "                    # Handle the case where conversion to float fails\n",
    "                    df_parsed_table_data.at[index, row.index[i+2]] = 'NR'\n",
    "            else:\n",
    "                # If one or both values are 'NR', set the result of the third cell to 'NR'\n",
    "                df_parsed_table_data.at[index, row.index[i+2]] = 'NR'\n",
    "\n",
    "    for index, row in df_parsed_table_data.iterrows():\n",
    "        for i in range(0, len(row) - 2, 3):\n",
    "            if row[i] != 'NR' and row[i+1] != 'NR':\n",
    "    # Convert both values to float before subtracting\n",
    "                value1 = float(row[i])\n",
    "                value2 = float(row[i+1])\n",
    "                df_parsed_table_data.at[index, row.index[i+2]] = value2 - value1\n",
    "            else:\n",
    "                df_parsed_table_data.at[index, row.index[i+2]] = 'NR'\n",
    "\n",
    "\n",
    "    # Convert DataFrame to float where possible for max value comparison\n",
    "    df_numeric = df_parsed_table_data.apply(pd.to_numeric, errors='ignore')\n",
    "\n",
    "    # Find max values per column\n",
    "    max_values = df_numeric.max()\n",
    "\n",
    "    # Initialize list to store formatted LaTeX rows\n",
    "    formatted_rows = []\n",
    "\n",
    "    # Iterate over DataFrame rows to create the LaTeX formatted string\n",
    "    for index, row in df_parsed_table_data.iterrows():\n",
    "        formatted_cells = [index]  # Start with the model name\n",
    "        for i, cell in enumerate(row):\n",
    "            if pd.notna(cell):\n",
    "                # Format cell as a float or leave as 'NR'\n",
    "                cell_value = f\"{cell:.3f}\" if isinstance(cell, (int, float)) and not pd.isna(cell) else cell\n",
    "            else:\n",
    "                cell_value = \"NR\"\n",
    "            \n",
    "            if i % 3 == 2:  # Color code the 3rd cell in each group of three\n",
    "                cell_value = color_diff(cell)\n",
    "            \n",
    "            # Boldface the max value in the column if it's not 'NR' or NaN\n",
    "            if cell == max_values[i] and pd.notna(cell):\n",
    "                cell_value = r\"\\textbf{%s}\" % cell_value\n",
    "            \n",
    "            # Add the formatted cell to the list of cells\n",
    "            formatted_cells.append(cell_value)\n",
    "        \n",
    "        # Create a formatted row string\n",
    "        formatted_row = r\"\\multirow{1}{*}{\\bfseries %s} & \" % formatted_cells[0] + \" & \".join(formatted_cells[1:]) + r\"\\\\\"\n",
    "        \n",
    "        # Add the formatted row to the list of rows\n",
    "        formatted_rows.append(formatted_row)\n",
    "\n",
    "    # Join all formatted rows into one string with line breaks\n",
    "    formatted_table = \"\\n\".join(formatted_rows)\n",
    "\n",
    "    # Print the formatted table\n",
    "    return(formatted_table)\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 203,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processed LaTeX table from wikipedia.txt has been saved to processed_tables\\processed_wikipedia.txt\n",
      "Processed LaTeX table from reddit.txt has been saved to processed_tables\\processed_reddit.txt\n",
      "Processed LaTeX table from synthetic_V1.txt has been saved to processed_tables\\processed_synthetic_V1.txt\n",
      "Processed LaTeX table from synthetic_V2.txt has been saved to processed_tables\\processed_synthetic_V2.txt\n"
     ]
    }
   ],
   "source": [
    "file_names = ['wikipedia', 'reddit', 'synthetic_V1', 'synthetic_V2'] \n",
    "\n",
    "# Folder for the processed tables\n",
    "output_folder = 'processed_tables'\n",
    "if not os.path.exists(output_folder):\n",
    "    os.makedirs(output_folder)\n",
    "\n",
    "# Loop over each file name\n",
    "for base_name in file_names:\n",
    "    input_file_path = f'{base_name}.txt'\n",
    "    output_file_path = os.path.join(output_folder, f'processed_{base_name}.txt')\n",
    "    \n",
    "    # Read the LaTeX table from the input file\n",
    "    with open(input_file_path, 'r') as file:\n",
    "        table_str = file.read()\n",
    "\n",
    "    # Parse the LaTeX table into a DataFrame\n",
    "    df_parsed_table_data = parse_latex_table(table_str)\n",
    "\n",
    "    # Process the data and get the LaTeX-formatted table\n",
    "    latex_table = process_data(df_parsed_table_data)\n",
    "\n",
    "    # Save the LaTeX table to a text file with the corresponding name in the 'processed_tables' folder\n",
    "    with open(output_file_path, 'w') as output_file:\n",
    "        output_file.write(latex_table)\n",
    "    \n",
    "    print(f\"Processed LaTeX table from {input_file_path} has been saved to {output_file_path}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "MLjesse",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
