{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Split Data and Save in train/val/test csvs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processed file './Dialect/multimodal-dialectal-bias/data/text/rewrite_concise/aae.csv' and saved splits to './Dialect/multimodal-dialectal-bias/data/text/train_val_test/4-1-1/rewrite_concise/aae'\n",
      "Processed file './Dialect/multimodal-dialectal-bias/data/text/rewrite_concise/bre.csv' and saved splits to './Dialect/multimodal-dialectal-bias/data/text/train_val_test/4-1-1/rewrite_concise/bre'\n",
      "Processed file './Dialect/multimodal-dialectal-bias/data/text/rewrite_concise/che.csv' and saved splits to './Dialect/multimodal-dialectal-bias/data/text/train_val_test/4-1-1/rewrite_concise/che'\n",
      "Processed file './Dialect/multimodal-dialectal-bias/data/text/rewrite_concise/ine.csv' and saved splits to './Dialect/multimodal-dialectal-bias/data/text/train_val_test/4-1-1/rewrite_concise/ine'\n",
      "Processed file './Dialect/multimodal-dialectal-bias/data/text/rewrite_concise/sge.csv' and saved splits to './Dialect/multimodal-dialectal-bias/data/text/train_val_test/4-1-1/rewrite_concise/sge'\n",
      "Processed file './Dialect/multimodal-dialectal-bias/data/text/rewrite_detailed/aae.csv' and saved splits to './Dialect/multimodal-dialectal-bias/data/text/train_val_test/4-1-1/rewrite_detailed/aae'\n",
      "Processed file './Dialect/multimodal-dialectal-bias/data/text/rewrite_detailed/bre.csv' and saved splits to './Dialect/multimodal-dialectal-bias/data/text/train_val_test/4-1-1/rewrite_detailed/bre'\n",
      "Processed file './Dialect/multimodal-dialectal-bias/data/text/rewrite_detailed/che.csv' and saved splits to './Dialect/multimodal-dialectal-bias/data/text/train_val_test/4-1-1/rewrite_detailed/che'\n",
      "Processed file './Dialect/multimodal-dialectal-bias/data/text/rewrite_detailed/ine.csv' and saved splits to './Dialect/multimodal-dialectal-bias/data/text/train_val_test/4-1-1/rewrite_detailed/ine'\n",
      "Processed file './Dialect/multimodal-dialectal-bias/data/text/rewrite_detailed/sge.csv' and saved splits to './Dialect/multimodal-dialectal-bias/data/text/train_val_test/4-1-1/rewrite_detailed/sge'\n",
      "Processed file './Dialect/multimodal-dialectal-bias/data/text/translate_concise/aae.csv' and saved splits to './Dialect/multimodal-dialectal-bias/data/text/train_val_test/4-1-1/translate_concise/aae'\n",
      "Processed file './Dialect/multimodal-dialectal-bias/data/text/translate_concise/bre.csv' and saved splits to './Dialect/multimodal-dialectal-bias/data/text/train_val_test/4-1-1/translate_concise/bre'\n",
      "Processed file './Dialect/multimodal-dialectal-bias/data/text/translate_concise/che.csv' and saved splits to './Dialect/multimodal-dialectal-bias/data/text/train_val_test/4-1-1/translate_concise/che'\n",
      "Processed file './Dialect/multimodal-dialectal-bias/data/text/translate_concise/ine.csv' and saved splits to './Dialect/multimodal-dialectal-bias/data/text/train_val_test/4-1-1/translate_concise/ine'\n",
      "Processed file './Dialect/multimodal-dialectal-bias/data/text/translate_concise/sge.csv' and saved splits to './Dialect/multimodal-dialectal-bias/data/text/train_val_test/4-1-1/translate_concise/sge'\n",
      "Processed file './Dialect/multimodal-dialectal-bias/data/text/translate_detailed/aae.csv' and saved splits to './Dialect/multimodal-dialectal-bias/data/text/train_val_test/4-1-1/translate_detailed/aae'\n",
      "Processed file './Dialect/multimodal-dialectal-bias/data/text/translate_detailed/bre.csv' and saved splits to './Dialect/multimodal-dialectal-bias/data/text/train_val_test/4-1-1/translate_detailed/bre'\n",
      "Processed file './Dialect/multimodal-dialectal-bias/data/text/translate_detailed/che.csv' and saved splits to './Dialect/multimodal-dialectal-bias/data/text/train_val_test/4-1-1/translate_detailed/che'\n",
      "Processed file './Dialect/multimodal-dialectal-bias/data/text/translate_detailed/ine.csv' and saved splits to './Dialect/multimodal-dialectal-bias/data/text/train_val_test/4-1-1/translate_detailed/ine'\n",
      "Processed file './Dialect/multimodal-dialectal-bias/data/text/translate_detailed/sge.csv' and saved splits to './Dialect/multimodal-dialectal-bias/data/text/train_val_test/4-1-1/translate_detailed/sge'\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import glob\n",
    "import pandas as pd\n",
    "\n",
    "MODES = [\"concise\", \"detailed\", \"rewrite_concise\", \"rewrite_detailed\", \"translate_concise\", \"translate_detailed\"]\n",
    "\n",
    "for MODE in MODES:\n",
    "    # Folder containing the CSV files to process\n",
    "    INPUT_FOLDER = rf\"./Dialect/multimodal-dialectal-bias/data/text/{MODE}/\"\n",
    "\n",
    "    # Train, validation, and test split numbers (e.g., 2 for train, 2 for validation, 2 for test)\n",
    "    SPLIT_NUMS = [4, 1, 1]  # This means each Dialect_Word must have 6 rows in total\n",
    "\n",
    "    # Folder to save the split results\n",
    "    OUTPUT_FOLDER = rf\"./Dialect/multimodal-dialectal-bias/data/text/train_val_test/4-1-1/{MODE}/\"\n",
    "    \n",
    "    \n",
    "    # Expected first four column titles\n",
    "    EXPECTED_COLS = ['Dialect_Word', 'SAE_Word', 'Dialect_Prompt', 'SAE_Prompt']\n",
    "\n",
    "    # Find all CSV files in the input folder\n",
    "    csv_files = glob.glob(os.path.join(INPUT_FOLDER, \"*.csv\"))\n",
    "    if not csv_files:\n",
    "        raise FileNotFoundError(f\"No CSV files found in {INPUT_FOLDER}\")\n",
    "\n",
    "    # Total number of rows required per Dialect_Word (sum of split numbers)\n",
    "    required_count = sum(SPLIT_NUMS)\n",
    "\n",
    "    for csv_file in csv_files:\n",
    "        # Read CSV file\n",
    "        df = pd.read_csv(csv_file)\n",
    "        \n",
    "        # Check that the first four columns match the expected names\n",
    "        if list(df.columns[:4]) != EXPECTED_COLS:\n",
    "            raise ValueError(f\"File {csv_file} does not have the required first four columns: {EXPECTED_COLS}\")\n",
    "        \n",
    "        # Group rows by Dialect_Word (ignoring the header row)\n",
    "        groups = df.groupby('Dialect_Word', sort=False)\n",
    "        \n",
    "        # Check that every Dialect_Word has exactly the required number of rows\n",
    "        for word, group in groups:\n",
    "            if len(group) != required_count:\n",
    "                raise ValueError(f\"Dialect_Word '{word}' in file {csv_file} has {len(group)} rows, expected {required_count}\")\n",
    "        \n",
    "        # Prepare lists to collect split dataframes\n",
    "        train_list = []\n",
    "        val_list = []\n",
    "        test_list = []\n",
    "        \n",
    "        # Process each group to split rows based on SPLIT_NUMS\n",
    "        for word, group in groups:\n",
    "            # Ensure we maintain the original row order\n",
    "            group = group.sort_index()\n",
    "            n_train, n_val, n_test = SPLIT_NUMS\n",
    "            # Split the group rows into train, val, and test sets\n",
    "            train_list.append(group.iloc[:n_train])\n",
    "            val_list.append(group.iloc[n_train:n_train+n_val])\n",
    "            test_list.append(group.iloc[n_train+n_val:])\n",
    "        \n",
    "        # Concatenate each list of dataframes\n",
    "        train_df = pd.concat(train_list)\n",
    "        val_df = pd.concat(val_list)\n",
    "        test_df = pd.concat(test_list)\n",
    "        \n",
    "        # Create a new output folder named after the original CSV file (without extension)\n",
    "        base_name = os.path.splitext(os.path.basename(csv_file))[0]\n",
    "        out_dir = os.path.join(OUTPUT_FOLDER, base_name)\n",
    "        os.makedirs(out_dir, exist_ok=True)\n",
    "        \n",
    "        # Write the split dataframes to CSV files (keeping the header row)\n",
    "        train_df.to_csv(os.path.join(out_dir, \"train.csv\"), index=False)\n",
    "        val_df.to_csv(os.path.join(out_dir, \"val.csv\"), index=False)\n",
    "        test_df.to_csv(os.path.join(out_dir, \"test.csv\"), index=False)\n",
    "        \n",
    "        print(f\"Processed file '{csv_file}' and saved splits to '{out_dir}'\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processed file './Dialect/multimodal-dialectal-bias/data/text/detailed/aae.csv' and saved splits to './Dialect/multimodal-dialectal-bias/data/text/train_val_test/4-1-1/detailed/aae'\n",
      "Processed file './Dialect/multimodal-dialectal-bias/data/text/detailed/bre.csv' and saved splits to './Dialect/multimodal-dialectal-bias/data/text/train_val_test/4-1-1/detailed/bre'\n",
      "Processed file './Dialect/multimodal-dialectal-bias/data/text/detailed/che.csv' and saved splits to './Dialect/multimodal-dialectal-bias/data/text/train_val_test/4-1-1/detailed/che'\n",
      "Processed file './Dialect/multimodal-dialectal-bias/data/text/detailed/ine.csv' and saved splits to './Dialect/multimodal-dialectal-bias/data/text/train_val_test/4-1-1/detailed/ine'\n",
      "Processed file './Dialect/multimodal-dialectal-bias/data/text/detailed/sge.csv' and saved splits to './Dialect/multimodal-dialectal-bias/data/text/train_val_test/4-1-1/detailed/sge'\n"
     ]
    }
   ],
   "source": [
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Clean Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "def remove_trailing_spaces(folder_path: str) -> None:\n",
    "    \"\"\"\n",
    "    Recursively traverses the directory at folder_path.\n",
    "    If any sub-folder name ends with a space, removes that space\n",
    "    and prints the old and new folder paths.\n",
    "    \"\"\"\n",
    "    # Walk the directory tree from the bottom up\n",
    "    for root, dirs, _ in os.walk(folder_path, topdown=False):\n",
    "        for dir_name in dirs:\n",
    "            if dir_name.endswith(\" \"):\n",
    "                new_dir_name = dir_name.rstrip(\" \")\n",
    "                old_path = os.path.join(root, dir_name)\n",
    "                new_path = os.path.join(root, new_dir_name)\n",
    "                try:\n",
    "                    os.rename(old_path, new_path)\n",
    "                    print(f\"Renamed: '{old_path}' -> '{new_path}'\")\n",
    "                except Exception as e:\n",
    "                    print(f\"Error renaming '{old_path}': {e}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "remove_trailing_spaces(\"./Dialect/multimodal-dialectal-bias/data/image\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "dialect",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
