{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "c9a1fd8d",
   "metadata": {},
   "source": [
    "# Extract dialect prompt columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "0a82dcc6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ aae.csv → detailed_translate/aae.csv\n",
      "✅ bre.csv → detailed_translate/bre.csv\n",
      "✅ che.csv → detailed_translate/che.csv\n",
      "✅ ine.csv → detailed_translate/ine.csv\n",
      "✅ sge.csv → detailed_translate/sge.csv\n"
     ]
    }
   ],
   "source": [
    "# Copy only the Dialect_Prompt column of every CSV in SRC_DIR into DEST_DIR.\n",
    "from pathlib import Path\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "# ----------------- configurable folder paths -----------------\n",
    "SRC_DIR  = Path(\"./Dialect/multimodal-dialectal-bias/data/text/detailed\")\n",
    "DEST_DIR = Path(\"./Dialect/multimodal-dialectal-bias/data/text/detailed_translate\")\n",
    "PROMPT_COLUMN = \"Dialect_Prompt\"\n",
    "# -------------------------------------------------------------\n",
    "\n",
    "# Create the destination folder (and any missing parents) if it doesn't exist\n",
    "DEST_DIR.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "# glob() yields files in filesystem order; sorting keeps the processing order\n",
    "# (and therefore the printed log) stable from run to run.\n",
    "for csv_path in sorted(SRC_DIR.glob(\"*.csv\")):\n",
    "    # dtype=str together with keep_default_na=False keeps every cell as the\n",
    "    # literal text from the file instead of converting NA-like strings to NaN.\n",
    "    df = pd.read_csv(csv_path, dtype=str, keep_default_na=False)\n",
    "\n",
    "    # Fail with a message that names the offending file rather than a bare KeyError\n",
    "    if PROMPT_COLUMN not in df.columns:\n",
    "        raise KeyError(f\"{csv_path} has no '{PROMPT_COLUMN}' column; found {list(df.columns)}\")\n",
    "    dialect_only = df[[PROMPT_COLUMN]]\n",
    "\n",
    "    # Save under the same filename inside DEST_DIR\n",
    "    out_path = DEST_DIR / csv_path.name\n",
    "    dialect_only.to_csv(out_path, index=False)\n",
    "\n",
    "    print(f\"✅ {csv_path.name} → {out_path.relative_to(DEST_DIR.parent)}\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e48587b7",
   "metadata": {},
   "source": [
    "# Check for Overlap and Duplicates"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "cb5c024c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from collections import defaultdict\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "# Define paths (you can modify these as needed)\n",
    "CONCISE_DIR = \"./Dialect/multimodal-dialectal-bias/data/text/concise\"\n",
    "DETAILED_DIR = \"./Dialect/multimodal-dialectal-bias/data/text/detailed\"\n",
    "\n",
    "# Maps CSV filename -> list of (prompt, row index in the concise file)\n",
    "overlap_results = defaultdict(list)\n",
    "\n",
    "# List all CSV files in the concise folder; sorted because os.listdir()\n",
    "# returns entries in arbitrary order and the report should be reproducible.\n",
    "csv_filenames = sorted(f for f in os.listdir(CONCISE_DIR) if f.endswith(\".csv\"))\n",
    "\n",
    "for filename in csv_filenames:\n",
    "    detailed_path = os.path.join(DETAILED_DIR, filename)\n",
    "    if not os.path.exists(detailed_path):\n",
    "        # Without a same-named detailed file there is nothing to compare against;\n",
    "        # report it and keep checking the remaining files instead of aborting.\n",
    "        print(f\"⚠️ Skipping {filename}: no matching file in {DETAILED_DIR}\")\n",
    "        continue\n",
    "\n",
    "    # Read the two corresponding CSV files\n",
    "    concise_df = pd.read_csv(os.path.join(CONCISE_DIR, filename))\n",
    "    detailed_df = pd.read_csv(detailed_path)\n",
    "\n",
    "    # Vectorised membership test instead of a Python-level iterrows() loop.\n",
    "    # dropna() keeps missing prompts on the detailed side from counting as matches.\n",
    "    concise_prompts = concise_df[\"Dialect_Prompt\"]\n",
    "    is_overlap = concise_prompts.isin(detailed_df[\"Dialect_Prompt\"].dropna())\n",
    "\n",
    "    # Store each overlapping prompt together with its row index\n",
    "    for idx, prompt in concise_prompts[is_overlap].items():\n",
    "        overlap_results[filename].append((prompt, idx))\n",
    "\n",
    "# Print results aggregated by file name\n",
    "if overlap_results:\n",
    "    for fname, matches in overlap_results.items():\n",
    "        print(f\"--- Overlaps in file: {fname} ---\")\n",
    "        for prompt, idx in matches:\n",
    "            print(f\"[Row {idx}] {prompt}\")\n",
    "        print()\n",
    "else:\n",
    "    # An empty output would look the same as a cell that was never run\n",
    "    print(\"✅ No overlapping Dialect_Prompt entries found between concise and detailed files.\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "0953804b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ No duplicate Dialect_Prompt entries found in any CSV file.\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "from collections import defaultdict\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "# --- CONFIGURATION ---\n",
    "INPUT_DIR = \"./Dialect/multimodal-dialectal-bias/data/text/concise\"\n",
    "\n",
    "# --- SCRIPT ---\n",
    "# filename -> distinct Dialect_Prompt values that appear on more than one row\n",
    "duplicate_prompts_by_file = defaultdict(list)\n",
    "\n",
    "for filename in os.listdir(INPUT_DIR):\n",
    "    # Only CSV files are inspected\n",
    "    if not filename.endswith(\".csv\"):\n",
    "        continue\n",
    "\n",
    "    df = pd.read_csv(os.path.join(INPUT_DIR, filename))\n",
    "\n",
    "    # Files that lack the prompt column are left out of the report\n",
    "    if \"Dialect_Prompt\" not in df.columns:\n",
    "        continue\n",
    "\n",
    "    # keep=False marks every row of a repeated prompt, not only the later copies\n",
    "    repeated_rows = df.duplicated(subset=[\"Dialect_Prompt\"], keep=False)\n",
    "    if repeated_rows.any():\n",
    "        duplicate_prompts_by_file[filename] = df.loc[repeated_rows, \"Dialect_Prompt\"].unique().tolist()\n",
    "\n",
    "# --- OUTPUT ---\n",
    "if not duplicate_prompts_by_file:\n",
    "    print(\"✅ No duplicate Dialect_Prompt entries found in any CSV file.\")\n",
    "else:\n",
    "    for file, prompts in duplicate_prompts_by_file.items():\n",
    "        print(f\"📁 {file} has {len(prompts)} duplicate Dialect_Prompt entries:\")\n",
    "        for prompt in prompts:\n",
    "            print(f\"   - {prompt}\")\n",
    "        print()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "fcac8739",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ No duplicate Dialect_Prompt entries found in any CSV file.\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "from collections import defaultdict\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "# --- CONFIGURATION ---\n",
    "INPUT_DIR = \"./Dialect/multimodal-dialectal-bias/data/text/detailed\"\n",
    "\n",
    "# --- SCRIPT ---\n",
    "# filename -> distinct Dialect_Prompt values that appear on more than one row\n",
    "duplicate_prompts_by_file = defaultdict(list)\n",
    "\n",
    "for filename in os.listdir(INPUT_DIR):\n",
    "    # Only CSV files are inspected\n",
    "    if not filename.endswith(\".csv\"):\n",
    "        continue\n",
    "\n",
    "    df = pd.read_csv(os.path.join(INPUT_DIR, filename))\n",
    "\n",
    "    # Files that lack the prompt column are left out of the report\n",
    "    if \"Dialect_Prompt\" not in df.columns:\n",
    "        continue\n",
    "\n",
    "    # keep=False marks every row of a repeated prompt, not only the later copies\n",
    "    repeated_rows = df.duplicated(subset=[\"Dialect_Prompt\"], keep=False)\n",
    "    if repeated_rows.any():\n",
    "        duplicate_prompts_by_file[filename] = df.loc[repeated_rows, \"Dialect_Prompt\"].unique().tolist()\n",
    "\n",
    "# --- OUTPUT ---\n",
    "if not duplicate_prompts_by_file:\n",
    "    print(\"✅ No duplicate Dialect_Prompt entries found in any CSV file.\")\n",
    "else:\n",
    "    for file, prompts in duplicate_prompts_by_file.items():\n",
    "        print(f\"📁 {file} has {len(prompts)} duplicate Dialect_Prompt entries:\")\n",
    "        for prompt in prompts:\n",
    "            print(f\"   - {prompt}\")\n",
    "        print()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f679ee31",
   "metadata": {},
   "source": [
    "# Gather Rewritten Dialect Prompt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "c8ae2ad5",
   "metadata": {},
   "outputs": [
    {
     "ename": "OSError",
     "evalue": "[Errno 36] File name too long: './Dialect/multimodal-dialectal-bias/data/image/concise/aae/dalle3/dialect_imgs/A little baby of Hispanic descent, with rosy cheeks, sparkling hazel eyes and curly black hair, swaddled in a vibrant yellow blanket. The baby is laying on a soft, plush, azure baby mat, with colorful toys scattered around. An environment filled with warmth, safety, and joy./revised_prompt.txt'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mOSError\u001b[0m                                   Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[3], line 20\u001b[0m\n\u001b[1;32m     16\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m original_prompt \u001b[38;5;129;01min\u001b[39;00m df[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDialect_Prompt\u001b[39m\u001b[38;5;124m\"\u001b[39m]:\n\u001b[1;32m     17\u001b[0m     prompt_path \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m     18\u001b[0m         prompt_base \u001b[38;5;241m/\u001b[39m dialect_name \u001b[38;5;241m/\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdalle3\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m/\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdialect_imgs\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m/\u001b[39m original_prompt \u001b[38;5;241m/\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrevised_prompt.txt\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m     19\u001b[0m     )\n\u001b[0;32m---> 20\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[43mprompt_path\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexists\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m     21\u001b[0m         new_prompt \u001b[38;5;241m=\u001b[39m prompt_path\u001b[38;5;241m.\u001b[39mread_text(encoding\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mutf-8\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39mstrip()\n\u001b[1;32m     22\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n",
      "File \u001b[0;32m./miniconda3/envs/dialect/lib/python3.8/pathlib.py:1407\u001b[0m, in \u001b[0;36mPath.exists\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m   1403\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m   1404\u001b[0m \u001b[38;5;124;03mWhether this path exists.\u001b[39;00m\n\u001b[1;32m   1405\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m   1406\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1407\u001b[0m     \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstat\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1408\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m   1409\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m _ignore_error(e):\n",
      "File \u001b[0;32m./miniconda3/envs/dialect/lib/python3.8/pathlib.py:1198\u001b[0m, in \u001b[0;36mPath.stat\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m   1193\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mstat\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m   1194\u001b[0m \u001b[38;5;250m    \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m   1195\u001b[0m \u001b[38;5;124;03m    Return the result of the stat() system call on this path, like\u001b[39;00m\n\u001b[1;32m   1196\u001b[0m \u001b[38;5;124;03m    os.stat() does.\u001b[39;00m\n\u001b[1;32m   1197\u001b[0m \u001b[38;5;124;03m    \"\"\"\u001b[39;00m\n\u001b[0;32m-> 1198\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_accessor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstat\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m)\u001b[49m\n",
      "\u001b[0;31mOSError\u001b[0m: [Errno 36] File name too long: './Dialect/multimodal-dialectal-bias/data/image/concise/aae/dalle3/dialect_imgs/A little baby of Hispanic descent, with rosy cheeks, sparkling hazel eyes and curly black hair, swaddled in a vibrant yellow blanket. The baby is laying on a soft, plush, azure baby mat, with colorful toys scattered around. An environment filled with warmth, safety, and joy./revised_prompt.txt'"
     ]
    }
   ],
   "source": [
    "import errno\n",
    "from pathlib import Path\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "# Define base paths\n",
    "csv_dir = Path(\"./Dialect/multimodal-dialectal-bias/data/text/rewrite_concise\")\n",
    "prompt_base = Path(\"./Dialect/multimodal-dialectal-bias/data/image/concise\")\n",
    "\n",
    "# Get all CSV files\n",
    "csv_files = sorted(csv_dir.glob(\"*.csv\"))\n",
    "\n",
    "for csv_file in csv_files:\n",
    "    dialect_name = csv_file.stem  # e.g., \"aae\", \"sge\"\n",
    "    df = pd.read_csv(csv_file)\n",
    "\n",
    "    # Revised prompts are looked up in a folder named after the prompt text\n",
    "    revised_root = prompt_base / dialect_name / \"dalle3\" / \"dialect_imgs\"\n",
    "\n",
    "    updated_prompts = []\n",
    "    for original_prompt in df[\"Dialect_Prompt\"]:\n",
    "        new_prompt = original_prompt  # fallback to original if no revised prompt is found\n",
    "\n",
    "        # Empty cells are loaded as NaN (a float), which cannot be joined onto a Path;\n",
    "        # they are passed through unchanged.\n",
    "        if isinstance(original_prompt, str):\n",
    "            prompt_path = revised_root / original_prompt / \"revised_prompt.txt\"\n",
    "            try:\n",
    "                has_revision = prompt_path.exists()\n",
    "            except OSError as exc:\n",
    "                # Path.exists() only swallows \"not found\"-style errors. A prompt that is\n",
    "                # too long to be a path component raises ENAMETOOLONG instead, so the\n",
    "                # lookup is treated as a miss; any other OS error is re-raised.\n",
    "                if exc.errno != errno.ENAMETOOLONG:\n",
    "                    raise\n",
    "                has_revision = False\n",
    "\n",
    "            if has_revision:\n",
    "                new_prompt = prompt_path.read_text(encoding=\"utf-8\").strip()\n",
    "\n",
    "        updated_prompts.append(new_prompt)\n",
    "\n",
    "    # The CSV is overwritten in place with the revised prompts\n",
    "    df[\"Dialect_Prompt\"] = updated_prompts\n",
    "    df.to_csv(csv_file, index=False)\n",
    "\n",
    "\"Done updating all CSVs with revised prompts.\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c8bc187c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ All CSV files updated with cleaned revised prompts (no quotes).\n"
     ]
    }
   ],
   "source": [
    "import errno\n",
    "from pathlib import Path\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "# Define base paths\n",
    "csv_dir = Path(\"./Dialect/multimodal-dialectal-bias/data/text/rewrite_detailed\")\n",
    "prompt_base = Path(\"./Dialect/multimodal-dialectal-bias/data/image/detailed\")\n",
    "\n",
    "# Get all CSV files\n",
    "csv_files = sorted(csv_dir.glob(\"*.csv\"))\n",
    "\n",
    "for csv_file in csv_files:\n",
    "    dialect_name = csv_file.stem  # e.g., \"aae\", \"sge\"\n",
    "    df = pd.read_csv(csv_file)\n",
    "\n",
    "    # Revised prompts are looked up in a folder named after the prompt text\n",
    "    revised_root = prompt_base / dialect_name / \"dalle3\" / \"dialect_imgs\"\n",
    "\n",
    "    updated_prompts = []\n",
    "    for original_prompt in df[\"Dialect_Prompt\"]:\n",
    "        new_prompt = original_prompt  # fallback to original if no revised prompt is found\n",
    "\n",
    "        # Empty cells are loaded as NaN (a float), which cannot be joined onto a Path;\n",
    "        # they are passed through unchanged.\n",
    "        if isinstance(original_prompt, str):\n",
    "            prompt_path = revised_root / original_prompt / \"revised_prompt.txt\"\n",
    "            try:\n",
    "                has_revision = prompt_path.exists()\n",
    "            except OSError as exc:\n",
    "                # Path.exists() only swallows \"not found\"-style errors. A prompt that is\n",
    "                # too long to be a path component raises ENAMETOOLONG instead, so the\n",
    "                # lookup is treated as a miss; any other OS error is re-raised.\n",
    "                if exc.errno != errno.ENAMETOOLONG:\n",
    "                    raise\n",
    "                has_revision = False\n",
    "\n",
    "            if has_revision:\n",
    "                new_prompt = prompt_path.read_text(encoding=\"utf-8\").strip()\n",
    "\n",
    "        updated_prompts.append(new_prompt)\n",
    "\n",
    "    # The CSV is overwritten in place with the revised prompts\n",
    "    df[\"Dialect_Prompt\"] = updated_prompts\n",
    "    df.to_csv(csv_file, index=False)\n",
    "\n",
    "\"Done updating all CSVs with revised prompts.\"\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c1840e42",
   "metadata": {},
   "source": [
    "# Gather Rewritten SAE Prompt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7ed6ec5b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Done updating all CSVs with revised prompts.'"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import errno\n",
    "from pathlib import Path\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "# Define base paths\n",
    "csv_dir = Path(\"./Dialect/multimodal-dialectal-bias/data/text/rewrite_concise\")\n",
    "prompt_base = Path(\"./Dialect/multimodal-dialectal-bias/data/image/concise\")\n",
    "\n",
    "# Get all CSV files\n",
    "csv_files = sorted(csv_dir.glob(\"*.csv\"))\n",
    "\n",
    "for csv_file in csv_files:\n",
    "    dialect_name = csv_file.stem  # e.g., \"aae\", \"sge\"\n",
    "    df = pd.read_csv(csv_file)\n",
    "\n",
    "    # Revised prompts are looked up in a folder named after the prompt text\n",
    "    revised_root = prompt_base / dialect_name / \"dalle3\" / \"sae_imgs\"\n",
    "\n",
    "    updated_prompts = []\n",
    "    for original_prompt in df[\"SAE_Prompt\"]:\n",
    "        new_prompt = original_prompt  # fallback to original if no revised prompt is found\n",
    "\n",
    "        # Empty cells are loaded as NaN (a float), which cannot be joined onto a Path;\n",
    "        # they are passed through unchanged.\n",
    "        if isinstance(original_prompt, str):\n",
    "            prompt_path = revised_root / original_prompt / \"revised_prompt.txt\"\n",
    "            try:\n",
    "                has_revision = prompt_path.exists()\n",
    "            except OSError as exc:\n",
    "                # Path.exists() only swallows \"not found\"-style errors. A prompt that is\n",
    "                # too long to be a path component raises ENAMETOOLONG instead, so the\n",
    "                # lookup is treated as a miss; any other OS error is re-raised.\n",
    "                if exc.errno != errno.ENAMETOOLONG:\n",
    "                    raise\n",
    "                has_revision = False\n",
    "\n",
    "            if has_revision:\n",
    "                new_prompt = prompt_path.read_text(encoding=\"utf-8\").strip()\n",
    "\n",
    "        updated_prompts.append(new_prompt)\n",
    "\n",
    "    # The CSV is overwritten in place with the revised prompts\n",
    "    df[\"SAE_Prompt\"] = updated_prompts\n",
    "    df.to_csv(csv_file, index=False)\n",
    "\n",
    "\"Done updating all CSVs with revised prompts.\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "815a8dc2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import errno\n",
    "from pathlib import Path\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "# Define base paths\n",
    "csv_dir = Path(\"./Dialect/multimodal-dialectal-bias/data/text/rewrite_detailed\")\n",
    "prompt_base = Path(\"./Dialect/multimodal-dialectal-bias/data/image/detailed\")\n",
    "\n",
    "# Get all CSV files\n",
    "csv_files = sorted(csv_dir.glob(\"*.csv\"))\n",
    "\n",
    "for csv_file in csv_files:\n",
    "    dialect_name = csv_file.stem  # e.g., \"aae\", \"sge\"\n",
    "    df = pd.read_csv(csv_file)\n",
    "\n",
    "    # Revised prompts are looked up in a folder named after the prompt text\n",
    "    revised_root = prompt_base / dialect_name / \"dalle3\" / \"sae_imgs\"\n",
    "\n",
    "    updated_prompts = []\n",
    "    for original_prompt in df[\"SAE_Prompt\"]:\n",
    "        new_prompt = original_prompt  # fallback to original if no revised prompt is found\n",
    "\n",
    "        # Empty cells are loaded as NaN (a float), which cannot be joined onto a Path;\n",
    "        # they are passed through unchanged.\n",
    "        if isinstance(original_prompt, str):\n",
    "            prompt_path = revised_root / original_prompt / \"revised_prompt.txt\"\n",
    "            try:\n",
    "                has_revision = prompt_path.exists()\n",
    "            except OSError as exc:\n",
    "                # Path.exists() only swallows \"not found\"-style errors. A prompt that is\n",
    "                # too long to be a path component raises ENAMETOOLONG instead, so the\n",
    "                # lookup is treated as a miss; any other OS error is re-raised.\n",
    "                if exc.errno != errno.ENAMETOOLONG:\n",
    "                    raise\n",
    "                has_revision = False\n",
    "\n",
    "            if has_revision:\n",
    "                new_prompt = prompt_path.read_text(encoding=\"utf-8\").strip()\n",
    "\n",
    "        updated_prompts.append(new_prompt)\n",
    "\n",
    "    # The CSV is overwritten in place with the revised prompts\n",
    "    df[\"SAE_Prompt\"] = updated_prompts\n",
    "    df.to_csv(csv_file, index=False)\n",
    "\n",
    "\"Done updating all CSVs with revised prompts.\"\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "dialect",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
