{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Copy the polysemic rows of the concise and detailed data into separate folders"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import glob\n",
    "import pandas as pd\n",
    "\n",
    "# Source folder with the full CSVs and destination folder for the polysemic subset.\n",
    "input_folder = './Dialect/multimodal-dialectal-bias/data/text/concise'\n",
    "output_folder = './Dialect/multimodal-dialectal-bias/data/text/polysemy_concise'\n",
    "\n",
    "# Make sure the destination exists before anything is written.\n",
    "os.makedirs(output_folder, exist_ok=True)\n",
    "\n",
    "# Visit every CSV that sits directly in the source folder.\n",
    "for csv_path in glob.glob(os.path.join(input_folder, '*.csv')):\n",
    "    csv_name = os.path.basename(csv_path)\n",
    "    table = pd.read_csv(csv_path)\n",
    "\n",
    "    # Stop if the 'polysemic' column holds anything other than 0 or 1.\n",
    "    polysemic_flag = table['polysemic']\n",
    "    if (~polysemic_flag.isin([0, 1])).any():\n",
    "        raise ValueError(f\"File {csv_name} has invalid 'polysemic' values.\")\n",
    "\n",
    "    # Keep only the rows flagged 1 and write them out under the same file name.\n",
    "    polysemic_rows = table.loc[polysemic_flag == 1]\n",
    "    polysemic_rows.to_csv(os.path.join(output_folder, csv_name), index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import glob\n",
    "import pandas as pd\n",
    "\n",
    "# Source folder with the full CSVs and destination folder for the polysemic subset.\n",
    "input_folder = './Dialect/multimodal-dialectal-bias/data/text/detailed'\n",
    "output_folder = './Dialect/multimodal-dialectal-bias/data/text/polysemy_detailed'\n",
    "\n",
    "# Make sure the destination exists before anything is written.\n",
    "os.makedirs(output_folder, exist_ok=True)\n",
    "\n",
    "# Visit every CSV that sits directly in the source folder.\n",
    "for csv_path in glob.glob(os.path.join(input_folder, '*.csv')):\n",
    "    csv_name = os.path.basename(csv_path)\n",
    "    table = pd.read_csv(csv_path)\n",
    "\n",
    "    # Stop if the 'polysemic' column holds anything other than 0 or 1.\n",
    "    polysemic_flag = table['polysemic']\n",
    "    if (~polysemic_flag.isin([0, 1])).any():\n",
    "        raise ValueError(f\"File {csv_name} has invalid 'polysemic' values.\")\n",
    "\n",
    "    # Keep only the rows flagged 1 and write them out under the same file name.\n",
    "    polysemic_rows = table.loc[polysemic_flag == 1]\n",
    "    polysemic_rows.to_csv(os.path.join(output_folder, csv_name), index=False)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Write the polysemy_detailed rows back into the detailed data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processing file: aae.csv\n",
      "Processing file: bre.csv\n",
      "Processing file: che.csv\n",
      "Processing file: ine.csv\n",
      "Processing file: sge.csv\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import glob\n",
    "import pandas as pd\n",
    "\n",
    "# Folder holding the polysemic subset (input) and the folder whose CSVs are updated in place (output).\n",
    "input_folder = './Dialect/multimodal-dialectal-bias/data/text/polysemy_detailed'\n",
    "output_folder = './Dialect/multimodal-dialectal-bias/data/text/detailed'\n",
    "\n",
    "# Number of rows each Dialect_Word must have in both files for the replacement to go ahead.\n",
    "ROWS_PER_WORD = 6\n",
    "\n",
    "# Iterate through all CSV files in the input folder\n",
    "for input_file in glob.glob(os.path.join(input_folder, \"*.csv\")):\n",
    "    filename = os.path.basename(input_file)\n",
    "    output_file = os.path.join(output_folder, filename)\n",
    "    print(f\"Processing file: {filename}\")\n",
    "\n",
    "    # Load the input CSV and the same-named output CSV that will be overwritten.\n",
    "    input_df = pd.read_csv(input_file)\n",
    "    output_df = pd.read_csv(output_file)\n",
    "\n",
    "    # Rows are copied below as a raw value array, i.e. by column position.\n",
    "    # Require the same set of columns and reorder the input to the output's order,\n",
    "    # so a differing column order cannot silently put values into the wrong column.\n",
    "    if set(input_df.columns) != set(output_df.columns):\n",
    "        raise ValueError(f\"In file '{filename}', the input and output CSVs do not have the same columns.\")\n",
    "    input_df = input_df[output_df.columns]\n",
    "\n",
    "    # For each unique Dialect_Word in the input, check counts and replace the corresponding rows.\n",
    "    for word in input_df[\"Dialect_Word\"].unique():\n",
    "        input_rows = input_df[input_df[\"Dialect_Word\"] == word]\n",
    "        output_rows = output_df[output_df[\"Dialect_Word\"] == word]\n",
    "\n",
    "        if len(input_rows) != ROWS_PER_WORD or len(output_rows) != ROWS_PER_WORD:\n",
    "            raise ValueError(f\"In file '{filename}', for Dialect_Word '{word}', expected exactly {ROWS_PER_WORD} rows in both input and output CSVs. Got {len(input_rows)} and {len(output_rows)} respectively.\")\n",
    "\n",
    "        # Overwrite the matching output rows at their existing index labels,\n",
    "        # which keeps the overall row order of the output CSV unchanged.\n",
    "        output_df.loc[output_rows.index] = input_rows.to_numpy()\n",
    "\n",
    "    # Save the updated output CSV over the original file.\n",
    "    output_df.to_csv(output_file, index=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "dialect",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
