{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Change Column Structure"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Dialect_Word</th>\n",
       "      <th>SAE_Word</th>\n",
       "      <th>Dialect_Prompt</th>\n",
       "      <th>SAE_Prompt</th>\n",
       "      <th>person_in_prompt</th>\n",
       "      <th>dialect_word_count</th>\n",
       "      <th>polysemic</th>\n",
       "      <th>Polysemy_Prompt</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>math</td>\n",
       "      <td>monastery</td>\n",
       "      <td>an old math</td>\n",
       "      <td>an old monastery</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>a student solving math problems on a chalkboard</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>math</td>\n",
       "      <td>monastery</td>\n",
       "      <td>a math on a hill</td>\n",
       "      <td>a monastery on a hill</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>a professor exploring complex math equations</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>math</td>\n",
       "      <td>monastery</td>\n",
       "      <td>a small math</td>\n",
       "      <td>a small monastery</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>a teacher explaining math concepts to a class</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>math</td>\n",
       "      <td>monastery</td>\n",
       "      <td>a painting of a math</td>\n",
       "      <td>a painting of a monastery</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>a pile of math textbooks on a desk</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>math</td>\n",
       "      <td>monastery</td>\n",
       "      <td>a busy math</td>\n",
       "      <td>a busy monastery</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>a shirt with math equations</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  Dialect_Word   SAE_Word        Dialect_Prompt                 SAE_Prompt  \\\n",
       "0         math  monastery           an old math           an old monastery   \n",
       "1         math  monastery      a math on a hill      a monastery on a hill   \n",
       "2         math  monastery          a small math          a small monastery   \n",
       "3         math  monastery  a painting of a math  a painting of a monastery   \n",
       "4         math  monastery           a busy math           a busy monastery   \n",
       "\n",
       "   person_in_prompt  dialect_word_count  polysemic  \\\n",
       "0                 0                   1          1   \n",
       "1                 0                   1          1   \n",
       "2                 0                   1          1   \n",
       "3                 0                   1          1   \n",
       "4                 1                   1          1   \n",
       "\n",
       "                                   Polysemy_Prompt  \n",
       "0  a student solving math problems on a chalkboard  \n",
       "1     a professor exploring complex math equations  \n",
       "2    a teacher explaining math concepts to a class  \n",
       "3               a pile of math textbooks on a desk  \n",
       "4                      a shirt with math equations  "
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Input CSV, plus a separate destination so the original file is left untouched\n",
    "file_path = \"./Dialect/multimodal-dialectal-bias/data/text/basic/ine.csv\"\n",
    "output_file_path = \"./Dialect/multimodal-dialectal-bias/data/text/basic/ine_reordered.csv\"\n",
    "\n",
    "# Target column layout for the reordered file\n",
    "new_order = [\n",
    "    \"Dialect_Word\", \"SAE_Word\",\n",
    "    \"Dialect_Prompt\", \"SAE_Prompt\",\n",
    "    \"person_in_prompt\", \"dialect_word_count\",\n",
    "    \"polysemic\", \"Polysemy_Prompt\",\n",
    "]\n",
    "\n",
    "# Load the data and select the columns in the target order\n",
    "# (a KeyError is raised here if any listed column is absent)\n",
    "df = pd.read_csv(file_path).loc[:, new_order]\n",
    "\n",
    "# Save the reordered copy next to the original.\n",
    "# To overwrite the original instead, pass file_path here.\n",
    "df.to_csv(output_file_path, index=False)\n",
    "\n",
    "# Preview the first rows of the reordered DataFrame\n",
    "df.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Dialect_Word</th>\n",
       "      <th>SAE_Word</th>\n",
       "      <th>Dialect_Prompt</th>\n",
       "      <th>SAE_Prompt</th>\n",
       "      <th>person_in_prompt</th>\n",
       "      <th>dialect_word_count</th>\n",
       "      <th>polysemic</th>\n",
       "      <th>Polysemy_Prompt</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>teh tarik</td>\n",
       "      <td>milk tea</td>\n",
       "      <td>a painting of teh tarik</td>\n",
       "      <td>a painting of milk tea</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>teh tarik</td>\n",
       "      <td>milk tea</td>\n",
       "      <td>a photo of teh tarik</td>\n",
       "      <td>a photo of milk tea</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>teh tarik</td>\n",
       "      <td>milk tea</td>\n",
       "      <td>a shop selling teh tarik</td>\n",
       "      <td>a shop selling milk tea</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>teh tarik</td>\n",
       "      <td>milk tea</td>\n",
       "      <td>a large teh tarik</td>\n",
       "      <td>a large milk tea</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>teh tarik</td>\n",
       "      <td>milk tea</td>\n",
       "      <td>a small teh tarik</td>\n",
       "      <td>a small milk tea</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  Dialect_Word  SAE_Word            Dialect_Prompt               SAE_Prompt  \\\n",
       "0    teh tarik  milk tea   a painting of teh tarik   a painting of milk tea   \n",
       "1    teh tarik  milk tea      a photo of teh tarik      a photo of milk tea   \n",
       "2    teh tarik  milk tea  a shop selling teh tarik  a shop selling milk tea   \n",
       "3    teh tarik  milk tea         a large teh tarik         a large milk tea   \n",
       "4    teh tarik  milk tea         a small teh tarik         a small milk tea   \n",
       "\n",
       "   person_in_prompt  dialect_word_count  polysemic Polysemy_Prompt  \n",
       "0                 0                   1          0                  \n",
       "1                 0                   1          0                  \n",
       "2                 0                   1          0                  \n",
       "3                 1                   1          0                  \n",
       "4                 1                   1          0                  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Path to the CSV file (it is rewritten in place at the end of this cell)\n",
    "file_path = \"./Dialect/multimodal-dialectal-bias/data/text/basic/sge.csv\"\n",
    "\n",
    "# Read the CSV file into a DataFrame\n",
    "df = pd.read_csv(file_path)\n",
    "\n",
    "# Add the columns only when they are actually missing, so that re-running\n",
    "# this cell on an already-updated file never clobbers existing values:\n",
    "# - dialect_word_count defaults to 1\n",
    "# - Polysemy_Prompt defaults to an empty string\n",
    "if \"dialect_word_count\" not in df.columns:\n",
    "    df[\"dialect_word_count\"] = 1\n",
    "if \"Polysemy_Prompt\" not in df.columns:\n",
    "    df[\"Polysemy_Prompt\"] = \"\"\n",
    "\n",
    "# Empty CSV fields are read back as NaN; show them as empty strings again.\n",
    "# The written CSV is the same either way (NaN is saved as an empty field).\n",
    "df[\"Polysemy_Prompt\"] = df[\"Polysemy_Prompt\"].fillna(\"\")\n",
    "\n",
    "# Define the new column order\n",
    "new_order = [\n",
    "    \"Dialect_Word\",\n",
    "    \"SAE_Word\",\n",
    "    \"Dialect_Prompt\",\n",
    "    \"SAE_Prompt\",\n",
    "    \"person_in_prompt\",\n",
    "    \"dialect_word_count\",\n",
    "    \"polysemic\",\n",
    "    \"Polysemy_Prompt\"\n",
    "]\n",
    "\n",
    "# Reorder the DataFrame columns (raises KeyError before anything is\n",
    "# written if a listed column is absent)\n",
    "df = df[new_order]\n",
    "\n",
    "# Overwrite the original CSV file with the updated DataFrame\n",
    "df.to_csv(file_path, index=False)\n",
    "\n",
    "# Display the first few rows of the updated DataFrame for verification\n",
    "df.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  Dialect_Word  SAE_Word                 Dialect_Prompt  \\\n",
      "0    teh tarik  milk tea   a plastic cup with teh tarik   \n",
      "1    teh tarik  milk tea  a cup of teh tarik on a table   \n",
      "2    teh tarik  milk tea       a shop selling teh tarik   \n",
      "3    teh tarik  milk tea       a man drinking teh tarik   \n",
      "4    teh tarik  milk tea     a woman drinking teh tarik   \n",
      "\n",
      "                      SAE_Prompt  person_in_prompt  dialect_word_count  \\\n",
      "0    a plastic cup with milk tea                 0                   1   \n",
      "1  a cup of teh tarik on a table                 0                   1   \n",
      "2        a shop selling milk tea                 0                   1   \n",
      "3        a man drinking milk tea                 1                   1   \n",
      "4      a woman drinking milk tea                 1                   1   \n",
      "\n",
      "   polysemic Polysemy_Prompt  \n",
      "0          0                  \n",
      "1          0                  \n",
      "2          0                  \n",
      "3          0                  \n",
      "4          0                  \n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Path to the CSV file (it is rewritten in place at the end of this cell)\n",
    "file_path = \"./Dialect/multimodal-dialectal-bias/data/text/complex/sge.csv\"\n",
    "\n",
    "# Read the CSV file into a DataFrame\n",
    "df = pd.read_csv(file_path)\n",
    "\n",
    "# Add the columns only when they are actually missing, so that re-running\n",
    "# this cell never resets labels that were filled in afterwards:\n",
    "# - 'polysemic' defaults to 0\n",
    "# - 'Polysemy_Prompt' defaults to an empty string\n",
    "if \"polysemic\" not in df.columns:\n",
    "    df[\"polysemic\"] = 0\n",
    "if \"Polysemy_Prompt\" not in df.columns:\n",
    "    df[\"Polysemy_Prompt\"] = \"\"\n",
    "\n",
    "# Empty CSV fields are read back as NaN; show them as empty strings again.\n",
    "# The written CSV is the same either way (NaN is saved as an empty field).\n",
    "df[\"Polysemy_Prompt\"] = df[\"Polysemy_Prompt\"].fillna(\"\")\n",
    "\n",
    "# Define the new column order\n",
    "new_order = [\n",
    "    \"Dialect_Word\",\n",
    "    \"SAE_Word\",\n",
    "    \"Dialect_Prompt\",\n",
    "    \"SAE_Prompt\",\n",
    "    \"person_in_prompt\",\n",
    "    \"dialect_word_count\",\n",
    "    \"polysemic\",\n",
    "    \"Polysemy_Prompt\"\n",
    "]\n",
    "\n",
    "# Reorder the DataFrame columns (raises KeyError before anything is\n",
    "# written if a listed column is absent)\n",
    "df = df[new_order]\n",
    "\n",
    "# Overwrite the original CSV file with the updated DataFrame\n",
    "df.to_csv(file_path, index=False)\n",
    "\n",
    "# Print the first few rows for verification\n",
    "print(df.head())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Both files have the same number of rows: 216\n",
      "The 'Dialect_Word' column matches exactly in both files.\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Define the file paths\n",
    "dialect = \"ine\"\n",
    "basic_file = f\"./Dialect/multimodal-dialectal-bias/data/text/basic/{dialect}.csv\"\n",
    "complex_file = f\"./Dialect/multimodal-dialectal-bias/data/text/complex/{dialect}.csv\"\n",
    "\n",
    "# Read the CSV files into DataFrames\n",
    "df_basic = pd.read_csv(basic_file)\n",
    "df_complex = pd.read_csv(complex_file)\n",
    "\n",
    "# Check that both files have the same number of rows\n",
    "same_length = len(df_basic) == len(df_complex)\n",
    "if not same_length:\n",
    "    print(f\"Row count mismatch: basic file has {len(df_basic)} rows while complex file has {len(df_complex)} rows.\")\n",
    "else:\n",
    "    print(f\"Both files have the same number of rows: {len(df_basic)}\")\n",
    "\n",
    "# Compare the 'Dialect_Word' column element-wise\n",
    "# (assuming the rows are in the same order).\n",
    "# Only the rows present in both files are compared: comparing two Series\n",
    "# of different lengths raises a ValueError instead of returning a mask.\n",
    "n_common = min(len(df_basic), len(df_complex))\n",
    "basic_words = df_basic[\"Dialect_Word\"].iloc[:n_common].reset_index(drop=True)\n",
    "complex_words = df_complex[\"Dialect_Word\"].iloc[:n_common].reset_index(drop=True)\n",
    "\n",
    "# NaN != NaN evaluates to True, so rows where the word is missing in both\n",
    "# files must be excluded explicitly or they would be reported as mismatches.\n",
    "both_missing = basic_words.isna() & complex_words.isna()\n",
    "mismatch_mask = (basic_words != complex_words) & ~both_missing\n",
    "\n",
    "if mismatch_mask.any():\n",
    "    print(\"Found mismatches in the 'Dialect_Word' column at the following rows:\")\n",
    "    mismatches = pd.DataFrame({\n",
    "        \"Row\": basic_words.index[mismatch_mask],\n",
    "        \"Basic_Dialect_Word\": basic_words[mismatch_mask],\n",
    "        \"Complex_Dialect_Word\": complex_words[mismatch_mask]\n",
    "    })\n",
    "    print(mismatches)\n",
    "elif not same_length:\n",
    "    # Do not claim an exact match when one file has extra rows\n",
    "    print(f\"The 'Dialect_Word' column matches for the first {n_common} rows, but the row counts differ.\")\n",
    "else:\n",
    "    print(\"The 'Dialect_Word' column matches exactly in both files.\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  Dialect_Word SAE_Word                                     Dialect_Prompt  \\\n",
      "0       torque    truck  a large black torque driving on a neighborhood...   \n",
      "1       torque    truck           a woman driving a green torque on a farm   \n",
      "2       torque    truck                    a torque with a dog in the back   \n",
      "3       torque    truck              a man driving a torque on the highway   \n",
      "4       torque    truck           two kids sitting on the back of a torque   \n",
      "\n",
      "                                          SAE_Prompt  person_in_prompt  \\\n",
      "0  a large black truck driving on a neighborhood ...                 0   \n",
      "1            a woman driving a green truck on a farm                 1   \n",
      "2                     a truck with a dog in the back                 0   \n",
      "3               a man driving a truck on the highway                 1   \n",
      "4            two kids sitting on the back of a truck                 1   \n",
      "\n",
      "   dialect_word_count  polysemic  Polysemy_Prompt  \n",
      "0                   1          1              NaN  \n",
      "1                   1          1              NaN  \n",
      "2                   1          1              NaN  \n",
      "3                   1          1              NaN  \n",
      "4                   1          1              NaN  \n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Define the file paths\n",
    "dialect = \"che\"\n",
    "basic_file = f\"./Dialect/multimodal-dialectal-bias/data/text/basic/{dialect}.csv\"\n",
    "complex_file = f\"./Dialect/multimodal-dialectal-bias/data/text/complex/{dialect}.csv\"\n",
    "\n",
    "# Read the CSV files into DataFrames\n",
    "basic_df = pd.read_csv(basic_file)\n",
    "complex_df = pd.read_csv(complex_file)\n",
    "\n",
    "# The copy below pairs rows by index label, and the complex file is then\n",
    "# overwritten in place. Refuse to continue unless both files describe the\n",
    "# same rows in the same order; otherwise labels would be silently dropped,\n",
    "# turned into NaN, or attached to the wrong prompts.\n",
    "if len(basic_df) != len(complex_df):\n",
    "    raise ValueError(\n",
    "        f\"Row count mismatch for '{dialect}': basic file has {len(basic_df)} rows \"\n",
    "        f\"while complex file has {len(complex_df)} rows.\"\n",
    "    )\n",
    "# Series.equals treats NaN values in the same position as equal\n",
    "if not basic_df[\"Dialect_Word\"].equals(complex_df[\"Dialect_Word\"]):\n",
    "    raise ValueError(\n",
    "        f\"'Dialect_Word' differs between the basic and complex files for '{dialect}'; \"\n",
    "        \"the rows are not aligned, so 'polysemic' was not copied.\"\n",
    "    )\n",
    "\n",
    "# Replace the 'polysemic' column in the complex file with the one from the basic file\n",
    "complex_df['polysemic'] = basic_df['polysemic']\n",
    "\n",
    "# Overwrite the complex file with the updated DataFrame\n",
    "complex_df.to_csv(complex_file, index=False)\n",
    "\n",
    "# Print the first few rows for verification\n",
    "print(complex_df.head())\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
