{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "22029"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "noun = pd.read_csv('noun_phrases.csv')\n",
    "len(noun)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>axis</th>\n",
       "      <th>bucket</th>\n",
       "      <th>descriptor</th>\n",
       "      <th>descriptor_gender</th>\n",
       "      <th>descriptor_preference</th>\n",
       "      <th>noun</th>\n",
       "      <th>plural_noun</th>\n",
       "      <th>noun_gender</th>\n",
       "      <th>noun_phrase</th>\n",
       "      <th>plural_noun_phrase</th>\n",
       "      <th>noun_phrase_type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>ability</td>\n",
       "      <td>(none)</td>\n",
       "      <td>a stutterer</td>\n",
       "      <td>(none)</td>\n",
       "      <td>dispreferred</td>\n",
       "      <td>(none)</td>\n",
       "      <td>(none)</td>\n",
       "      <td>neutral</td>\n",
       "      <td>a stutterer</td>\n",
       "      <td>stutterers</td>\n",
       "      <td>fixed_phrase</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>ability</td>\n",
       "      <td>(none)</td>\n",
       "      <td>a wheelchair user</td>\n",
       "      <td>(none)</td>\n",
       "      <td>reviewed</td>\n",
       "      <td>(none)</td>\n",
       "      <td>(none)</td>\n",
       "      <td>neutral</td>\n",
       "      <td>a wheelchair user</td>\n",
       "      <td>wheelchair users</td>\n",
       "      <td>fixed_phrase</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>ability</td>\n",
       "      <td>(none)</td>\n",
       "      <td>an amputee</td>\n",
       "      <td>(none)</td>\n",
       "      <td>reviewed</td>\n",
       "      <td>(none)</td>\n",
       "      <td>(none)</td>\n",
       "      <td>neutral</td>\n",
       "      <td>an amputee</td>\n",
       "      <td>amputees</td>\n",
       "      <td>fixed_phrase</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>ability</td>\n",
       "      <td>(none)</td>\n",
       "      <td>on the spectrum</td>\n",
       "      <td>(none)</td>\n",
       "      <td>dispreferred</td>\n",
       "      <td>boy</td>\n",
       "      <td>boys</td>\n",
       "      <td>male</td>\n",
       "      <td>a boy on the spectrum</td>\n",
       "      <td>boys on the spectrum</td>\n",
       "      <td>noun_descriptor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>ability</td>\n",
       "      <td>(none)</td>\n",
       "      <td>on the spectrum</td>\n",
       "      <td>(none)</td>\n",
       "      <td>dispreferred</td>\n",
       "      <td>bro</td>\n",
       "      <td>bros</td>\n",
       "      <td>male</td>\n",
       "      <td>a bro on the spectrum</td>\n",
       "      <td>bros on the spectrum</td>\n",
       "      <td>noun_descriptor</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      axis  bucket         descriptor descriptor_gender descriptor_preference  \\\n",
       "0  ability  (none)        a stutterer            (none)          dispreferred   \n",
       "1  ability  (none)  a wheelchair user            (none)              reviewed   \n",
       "2  ability  (none)         an amputee            (none)              reviewed   \n",
       "3  ability  (none)    on the spectrum            (none)          dispreferred   \n",
       "4  ability  (none)    on the spectrum            (none)          dispreferred   \n",
       "\n",
       "     noun plural_noun noun_gender            noun_phrase  \\\n",
       "0  (none)      (none)     neutral            a stutterer   \n",
       "1  (none)      (none)     neutral      a wheelchair user   \n",
       "2  (none)      (none)     neutral             an amputee   \n",
       "3     boy        boys        male  a boy on the spectrum   \n",
       "4     bro        bros        male  a bro on the spectrum   \n",
       "\n",
       "     plural_noun_phrase noun_phrase_type  \n",
       "0            stutterers     fixed_phrase  \n",
       "1      wheelchair users     fixed_phrase  \n",
       "2              amputees     fixed_phrase  \n",
       "3  boys on the spectrum  noun_descriptor  \n",
       "4  bros on the spectrum  noun_descriptor  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "noun.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['ability', 'age', 'body_type', 'characteristics', 'cultural',\n",
       "       'gender_and_sex', 'nationality', 'nonce', nan,\n",
       "       'political_ideologies', 'race_ethnicity', 'religion',\n",
       "       'sexual_orientation', 'socioeconomic_class'], dtype=object)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nouns = noun['axis'].unique()\n",
    "nouns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "214460"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "prompts_df = pd.read_json('prompts.jsonl', lines=True)\n",
    "len(prompts_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['respect', 'occupation'], dtype=object)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "prompts_df['context'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "17700"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dispreferred_prompts = prompts_df[prompts_df['descriptor_preference'] == 'dispreferred']\n",
    "# dispreferred_prompts.to_json('dispreferred_prompts.jsonl', orient='records', lines=True)\n",
    "len(dispreferred_prompts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['ability' 'body_type' 'characteristics' 'cultural' 'gender_and_sex'\n",
      " 'race_ethnicity' 'religion' 'sexual_orientation']\n",
      "['ability' 'age' 'body_type' 'characteristics' 'cultural' 'gender_and_sex'\n",
      " 'nationality' 'nonce' nan 'political_ideologies' 'race_ethnicity'\n",
      " 'religion' 'sexual_orientation' 'socioeconomic_class']\n"
     ]
    }
   ],
   "source": [
    "axis_list = dispreferred_prompts['axis'].unique()\n",
    "nouns = noun['axis'].unique()\n",
    "\n",
    "print(axis_list)\n",
    "print(nouns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['no_data'], dtype=object)"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sc = noun[noun['axis'] == 'socioeconomic_class']\n",
    "sc['descriptor_preference'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_prompts(axis):\n",
    "    axis_prompts = dispreferred_prompts[dispreferred_prompts['axis'] == axis]\n",
    "    print(len(axis_prompts))\n",
    "    return axis_prompts.sample(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "12810\n",
      "2880\n",
      "350\n",
      "650\n",
      "640\n",
      "320\n",
      "30\n",
      "20\n"
     ]
    }
   ],
   "source": [
    "sample_df = pd.DataFrame()\n",
    "for axis in axis_list:\n",
    "    df = get_prompts(axis)\n",
    "    sample_df = pd.concat([sample_df, df])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "the length of prompt list in ability:  12810\n",
      "the length of prompt list in body_type:  2880\n",
      "the length of prompt list in characteristics:  350\n",
      "the length of prompt list in cultural:  650\n",
      "the length of prompt list in gender_and_sex:  640\n",
      "the length of prompt list in race_ethnicity:  320\n",
      "the length of prompt list in religion:  30\n",
      "the length of prompt list in sexual_orientation:  20\n"
     ]
    }
   ],
   "source": [
    "for axis in axis_list:\n",
    "    axis_prompts = dispreferred_prompts[dispreferred_prompts['axis'] == axis]\n",
    "    print(f\"the length of prompt list in {axis}: \", len(axis_prompts))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "the length of sample_df:  160\n"
     ]
    }
   ],
   "source": [
    "print(f\"the length of sample_df: \", len(sample_df))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "sample_df.to_json('sample_prompts.jsonl', orient='records', lines=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "analysis",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
