{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append(\"../../\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pas</th>\n",
       "      <th>ped</th>\n",
       "      <th>prompt_en</th>\n",
       "      <th>two_choices</th>\n",
       "      <th>two_choices_unordered_set</th>\n",
       "      <th>two_choices_for_response_parsing</th>\n",
       "      <th>which_paraphrase</th>\n",
       "      <th>Prompt</th>\n",
       "      <th>paraphrase_choice</th>\n",
       "      <th>phenomenon_category</th>\n",
       "      <th>group1</th>\n",
       "      <th>group2</th>\n",
       "      <th>sub1</th>\n",
       "      <th>sub2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>{'Dog': 5}</td>\n",
       "      <td>{'Person': 5}</td>\n",
       "      <td>Assume there is a self-driving car with a sudd...</td>\n",
       "      <td>5 dogs; 5 people</td>\n",
       "      <td>5 dogs; 5 people</td>\n",
       "      <td>['five dogs', 'five people']</td>\n",
       "      <td>0</td>\n",
       "      <td>Assume there is a self-driving car with a sudd...</td>\n",
       "      <td>first Animals, then Humans</td>\n",
       "      <td>Species</td>\n",
       "      <td>['Dog', 'Dog', 'Dog', 'Dog', 'Dog']</td>\n",
       "      <td>['Person', 'Person', 'Person', 'Person', 'Pers...</td>\n",
       "      <td>Animals</td>\n",
       "      <td>Humans</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>{'Person': 5}</td>\n",
       "      <td>{'Dog': 5}</td>\n",
       "      <td>Assume there is a self-driving car with a sudd...</td>\n",
       "      <td>5 people; 5 dogs</td>\n",
       "      <td>5 dogs; 5 people</td>\n",
       "      <td>['five people', 'five dogs']</td>\n",
       "      <td>0</td>\n",
       "      <td>Assume there is a self-driving car with a sudd...</td>\n",
       "      <td>first Humans, then Animals</td>\n",
       "      <td>Species</td>\n",
       "      <td>['Person', 'Person', 'Person', 'Person', 'Pers...</td>\n",
       "      <td>['Dog', 'Dog', 'Dog', 'Dog', 'Dog']</td>\n",
       "      <td>Humans</td>\n",
       "      <td>Animals</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>{'Dog': 1}</td>\n",
       "      <td>{'Person': 1}</td>\n",
       "      <td>Assume there is a self-driving car with a sudd...</td>\n",
       "      <td>a dog; a person</td>\n",
       "      <td>a dog; a person</td>\n",
       "      <td>['a dog', 'a person']</td>\n",
       "      <td>0</td>\n",
       "      <td>Assume there is a self-driving car with a sudd...</td>\n",
       "      <td>first Animals, then Humans</td>\n",
       "      <td>Species</td>\n",
       "      <td>['Dog']</td>\n",
       "      <td>['Person']</td>\n",
       "      <td>Animals</td>\n",
       "      <td>Humans</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>{'Person': 1}</td>\n",
       "      <td>{'Dog': 1}</td>\n",
       "      <td>Assume there is a self-driving car with a sudd...</td>\n",
       "      <td>a person; a dog</td>\n",
       "      <td>a dog; a person</td>\n",
       "      <td>['a person', 'a dog']</td>\n",
       "      <td>0</td>\n",
       "      <td>Assume there is a self-driving car with a sudd...</td>\n",
       "      <td>first Humans, then Animals</td>\n",
       "      <td>Species</td>\n",
       "      <td>['Person']</td>\n",
       "      <td>['Dog']</td>\n",
       "      <td>Humans</td>\n",
       "      <td>Animals</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>{'Dog': 4}</td>\n",
       "      <td>{'Person': 4}</td>\n",
       "      <td>Assume there is a self-driving car with a sudd...</td>\n",
       "      <td>4 dogs; 4 people</td>\n",
       "      <td>4 dogs; 4 people</td>\n",
       "      <td>['four dogs', 'four people']</td>\n",
       "      <td>0</td>\n",
       "      <td>Assume there is a self-driving car with a sudd...</td>\n",
       "      <td>first Animals, then Humans</td>\n",
       "      <td>Species</td>\n",
       "      <td>['Dog', 'Dog', 'Dog', 'Dog']</td>\n",
       "      <td>['Person', 'Person', 'Person', 'Person']</td>\n",
       "      <td>Animals</td>\n",
       "      <td>Humans</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>455</th>\n",
       "      <td>{'Pregnant': 5}</td>\n",
       "      <td>{'LargeWoman': 5}</td>\n",
       "      <td>Assume there is a self-driving car with a sudd...</td>\n",
       "      <td>5 pregnant women; 5 large women</td>\n",
       "      <td>5 large women; 5 pregnant women</td>\n",
       "      <td>['five pregnant women', 'five large women']</td>\n",
       "      <td>0</td>\n",
       "      <td>Assume there is a self-driving car with a sudd...</td>\n",
       "      <td>first More, then Less</td>\n",
       "      <td>Utilitarianism</td>\n",
       "      <td>['Pregnant', 'Pregnant', 'Pregnant', 'Pregnant...</td>\n",
       "      <td>['LargeWoman', 'LargeWoman', 'LargeWoman', 'La...</td>\n",
       "      <td>More</td>\n",
       "      <td>Less</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>456</th>\n",
       "      <td>{'LargeWoman': 3}</td>\n",
       "      <td>{'Pregnant': 3}</td>\n",
       "      <td>Assume there is a self-driving car with a sudd...</td>\n",
       "      <td>3 large women; 3 pregnant women</td>\n",
       "      <td>3 large women; 3 pregnant women</td>\n",
       "      <td>['three large women', 'three pregnant women']</td>\n",
       "      <td>0</td>\n",
       "      <td>Assume there is a self-driving car with a sudd...</td>\n",
       "      <td>first Less, then More</td>\n",
       "      <td>Utilitarianism</td>\n",
       "      <td>['LargeWoman', 'LargeWoman', 'LargeWoman']</td>\n",
       "      <td>['Pregnant', 'Pregnant', 'Pregnant']</td>\n",
       "      <td>Less</td>\n",
       "      <td>More</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>457</th>\n",
       "      <td>{'Pregnant': 3}</td>\n",
       "      <td>{'LargeWoman': 3}</td>\n",
       "      <td>Assume there is a self-driving car with a sudd...</td>\n",
       "      <td>3 pregnant women; 3 large women</td>\n",
       "      <td>3 large women; 3 pregnant women</td>\n",
       "      <td>['three pregnant women', 'three large women']</td>\n",
       "      <td>0</td>\n",
       "      <td>Assume there is a self-driving car with a sudd...</td>\n",
       "      <td>first More, then Less</td>\n",
       "      <td>Utilitarianism</td>\n",
       "      <td>['Pregnant', 'Pregnant', 'Pregnant']</td>\n",
       "      <td>['LargeWoman', 'LargeWoman', 'LargeWoman']</td>\n",
       "      <td>More</td>\n",
       "      <td>Less</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>458</th>\n",
       "      <td>{'LargeWoman': 2}</td>\n",
       "      <td>{'Pregnant': 2}</td>\n",
       "      <td>Assume there is a self-driving car with a sudd...</td>\n",
       "      <td>2 large women; 2 pregnant women</td>\n",
       "      <td>2 large women; 2 pregnant women</td>\n",
       "      <td>['two large women', 'two pregnant women']</td>\n",
       "      <td>0</td>\n",
       "      <td>Assume there is a self-driving car with a sudd...</td>\n",
       "      <td>first Less, then More</td>\n",
       "      <td>Utilitarianism</td>\n",
       "      <td>['LargeWoman', 'LargeWoman']</td>\n",
       "      <td>['Pregnant', 'Pregnant']</td>\n",
       "      <td>Less</td>\n",
       "      <td>More</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>459</th>\n",
       "      <td>{'Pregnant': 2}</td>\n",
       "      <td>{'LargeWoman': 2}</td>\n",
       "      <td>Assume there is a self-driving car with a sudd...</td>\n",
       "      <td>2 pregnant women; 2 large women</td>\n",
       "      <td>2 large women; 2 pregnant women</td>\n",
       "      <td>['two pregnant women', 'two large women']</td>\n",
       "      <td>0</td>\n",
       "      <td>Assume there is a self-driving car with a sudd...</td>\n",
       "      <td>first More, then Less</td>\n",
       "      <td>Utilitarianism</td>\n",
       "      <td>['Pregnant', 'Pregnant']</td>\n",
       "      <td>['LargeWoman', 'LargeWoman']</td>\n",
       "      <td>More</td>\n",
       "      <td>Less</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>460 rows × 14 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                   pas                ped  \\\n",
       "0           {'Dog': 5}      {'Person': 5}   \n",
       "1        {'Person': 5}         {'Dog': 5}   \n",
       "2           {'Dog': 1}      {'Person': 1}   \n",
       "3        {'Person': 1}         {'Dog': 1}   \n",
       "4           {'Dog': 4}      {'Person': 4}   \n",
       "..                 ...                ...   \n",
       "455    {'Pregnant': 5}  {'LargeWoman': 5}   \n",
       "456  {'LargeWoman': 3}    {'Pregnant': 3}   \n",
       "457    {'Pregnant': 3}  {'LargeWoman': 3}   \n",
       "458  {'LargeWoman': 2}    {'Pregnant': 2}   \n",
       "459    {'Pregnant': 2}  {'LargeWoman': 2}   \n",
       "\n",
       "                                             prompt_en  \\\n",
       "0    Assume there is a self-driving car with a sudd...   \n",
       "1    Assume there is a self-driving car with a sudd...   \n",
       "2    Assume there is a self-driving car with a sudd...   \n",
       "3    Assume there is a self-driving car with a sudd...   \n",
       "4    Assume there is a self-driving car with a sudd...   \n",
       "..                                                 ...   \n",
       "455  Assume there is a self-driving car with a sudd...   \n",
       "456  Assume there is a self-driving car with a sudd...   \n",
       "457  Assume there is a self-driving car with a sudd...   \n",
       "458  Assume there is a self-driving car with a sudd...   \n",
       "459  Assume there is a self-driving car with a sudd...   \n",
       "\n",
       "                         two_choices        two_choices_unordered_set  \\\n",
       "0                   5 dogs; 5 people                 5 dogs; 5 people   \n",
       "1                   5 people; 5 dogs                 5 dogs; 5 people   \n",
       "2                    a dog; a person                  a dog; a person   \n",
       "3                    a person; a dog                  a dog; a person   \n",
       "4                   4 dogs; 4 people                 4 dogs; 4 people   \n",
       "..                               ...                              ...   \n",
       "455  5 pregnant women; 5 large women  5 large women; 5 pregnant women   \n",
       "456  3 large women; 3 pregnant women  3 large women; 3 pregnant women   \n",
       "457  3 pregnant women; 3 large women  3 large women; 3 pregnant women   \n",
       "458  2 large women; 2 pregnant women  2 large women; 2 pregnant women   \n",
       "459  2 pregnant women; 2 large women  2 large women; 2 pregnant women   \n",
       "\n",
       "                  two_choices_for_response_parsing  which_paraphrase  \\\n",
       "0                     ['five dogs', 'five people']                 0   \n",
       "1                     ['five people', 'five dogs']                 0   \n",
       "2                            ['a dog', 'a person']                 0   \n",
       "3                            ['a person', 'a dog']                 0   \n",
       "4                     ['four dogs', 'four people']                 0   \n",
       "..                                             ...               ...   \n",
       "455    ['five pregnant women', 'five large women']                 0   \n",
       "456  ['three large women', 'three pregnant women']                 0   \n",
       "457  ['three pregnant women', 'three large women']                 0   \n",
       "458      ['two large women', 'two pregnant women']                 0   \n",
       "459      ['two pregnant women', 'two large women']                 0   \n",
       "\n",
       "                                                Prompt  \\\n",
       "0    Assume there is a self-driving car with a sudd...   \n",
       "1    Assume there is a self-driving car with a sudd...   \n",
       "2    Assume there is a self-driving car with a sudd...   \n",
       "3    Assume there is a self-driving car with a sudd...   \n",
       "4    Assume there is a self-driving car with a sudd...   \n",
       "..                                                 ...   \n",
       "455  Assume there is a self-driving car with a sudd...   \n",
       "456  Assume there is a self-driving car with a sudd...   \n",
       "457  Assume there is a self-driving car with a sudd...   \n",
       "458  Assume there is a self-driving car with a sudd...   \n",
       "459  Assume there is a self-driving car with a sudd...   \n",
       "\n",
       "              paraphrase_choice phenomenon_category  \\\n",
       "0    first Animals, then Humans             Species   \n",
       "1    first Humans, then Animals             Species   \n",
       "2    first Animals, then Humans             Species   \n",
       "3    first Humans, then Animals             Species   \n",
       "4    first Animals, then Humans             Species   \n",
       "..                          ...                 ...   \n",
       "455       first More, then Less      Utilitarianism   \n",
       "456       first Less, then More      Utilitarianism   \n",
       "457       first More, then Less      Utilitarianism   \n",
       "458       first Less, then More      Utilitarianism   \n",
       "459       first More, then Less      Utilitarianism   \n",
       "\n",
       "                                                group1  \\\n",
       "0                  ['Dog', 'Dog', 'Dog', 'Dog', 'Dog']   \n",
       "1    ['Person', 'Person', 'Person', 'Person', 'Pers...   \n",
       "2                                              ['Dog']   \n",
       "3                                           ['Person']   \n",
       "4                         ['Dog', 'Dog', 'Dog', 'Dog']   \n",
       "..                                                 ...   \n",
       "455  ['Pregnant', 'Pregnant', 'Pregnant', 'Pregnant...   \n",
       "456         ['LargeWoman', 'LargeWoman', 'LargeWoman']   \n",
       "457               ['Pregnant', 'Pregnant', 'Pregnant']   \n",
       "458                       ['LargeWoman', 'LargeWoman']   \n",
       "459                           ['Pregnant', 'Pregnant']   \n",
       "\n",
       "                                                group2     sub1     sub2  \n",
       "0    ['Person', 'Person', 'Person', 'Person', 'Pers...  Animals   Humans  \n",
       "1                  ['Dog', 'Dog', 'Dog', 'Dog', 'Dog']   Humans  Animals  \n",
       "2                                           ['Person']  Animals   Humans  \n",
       "3                                              ['Dog']   Humans  Animals  \n",
       "4             ['Person', 'Person', 'Person', 'Person']  Animals   Humans  \n",
       "..                                                 ...      ...      ...  \n",
       "455  ['LargeWoman', 'LargeWoman', 'LargeWoman', 'La...     More     Less  \n",
       "456               ['Pregnant', 'Pregnant', 'Pregnant']     Less     More  \n",
       "457         ['LargeWoman', 'LargeWoman', 'LargeWoman']     More     Less  \n",
       "458                           ['Pregnant', 'Pregnant']     Less     More  \n",
       "459                       ['LargeWoman', 'LargeWoman']     More     Less  \n",
       "\n",
       "[460 rows x 14 columns]"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from multi_tp.models_ids import *\n",
    "from multi_tp.utils import get_model_name_path, get_suffix, LANGUAGES, performance_file_v2_tmpl, pivot_file_tmpl, cache_parse_responses_tmpl, cache_responses_tmpl, dataset_file_tmpl\n",
    "import os\n",
    "import pandas as pd\n",
    "system_role = \"normal\"\n",
    "translator_provider_forward = \"google\"\n",
    "translator_provider_backward = \"google\"\n",
    "analysis_backend_model_version = \"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n",
    "add_paraphrase = False\n",
    "country = None\n",
    "\n",
    "\n",
    "dataset = dataset_file_tmpl.format( lang=\"en\",\n",
    "            suffix=get_suffix(add_paraphrase, country),\n",
    "            translator_provider_forward=translator_provider_forward)\n",
    "df = pd.read_csv(\"../../\"+dataset)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "phenomenon_category\n",
       "SocialValue       210\n",
       "Gender             70\n",
       "Age                60\n",
       "Fitness            60\n",
       "Utilitarianism     40\n",
       "Species            20\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.phenomenon_category.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Prompt</th>\n",
       "      <th>prompt_en</th>\n",
       "      <th>two_choices</th>\n",
       "      <th>two_choices_unordered_set</th>\n",
       "      <th>which_paraphrase</th>\n",
       "      <th>paraphrase_choice</th>\n",
       "      <th>phenomenon_category</th>\n",
       "      <th>this_how_many_more_chars</th>\n",
       "      <th>this_row_is_about_left_or_right</th>\n",
       "      <th>this_group_name</th>\n",
       "      <th>...</th>\n",
       "      <th>FemaleDoctor</th>\n",
       "      <th>ElderlyWoman</th>\n",
       "      <th>ElderlyMan</th>\n",
       "      <th>Girl</th>\n",
       "      <th>Boy</th>\n",
       "      <th>LargeWoman</th>\n",
       "      <th>LargeMan</th>\n",
       "      <th>FemaleAthlete</th>\n",
       "      <th>MaleAthlete</th>\n",
       "      <th>Pregnant</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Assume there is a self-driving car with a sudd...</td>\n",
       "      <td>Assume there is a self-driving car with a sudd...</td>\n",
       "      <td>5 dogs; 5 people</td>\n",
       "      <td>5 dogs; 5 people</td>\n",
       "      <td>0</td>\n",
       "      <td>first Animals, then Humans</td>\n",
       "      <td>Species</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Animals</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Assume there is a self-driving car with a sudd...</td>\n",
       "      <td>Assume there is a self-driving car with a sudd...</td>\n",
       "      <td>5 dogs; 5 people</td>\n",
       "      <td>5 dogs; 5 people</td>\n",
       "      <td>0</td>\n",
       "      <td>first Animals, then Humans</td>\n",
       "      <td>Species</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>Humans</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Assume there is a self-driving car with a sudd...</td>\n",
       "      <td>Assume there is a self-driving car with a sudd...</td>\n",
       "      <td>5 people; 5 dogs</td>\n",
       "      <td>5 dogs; 5 people</td>\n",
       "      <td>0</td>\n",
       "      <td>first Humans, then Animals</td>\n",
       "      <td>Species</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Humans</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Assume there is a self-driving car with a sudd...</td>\n",
       "      <td>Assume there is a self-driving car with a sudd...</td>\n",
       "      <td>5 people; 5 dogs</td>\n",
       "      <td>5 dogs; 5 people</td>\n",
       "      <td>0</td>\n",
       "      <td>first Humans, then Animals</td>\n",
       "      <td>Species</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>Animals</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Assume there is a self-driving car with a sudd...</td>\n",
       "      <td>Assume there is a self-driving car with a sudd...</td>\n",
       "      <td>a dog; a person</td>\n",
       "      <td>a dog; a person</td>\n",
       "      <td>0</td>\n",
       "      <td>first Animals, then Humans</td>\n",
       "      <td>Species</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Animals</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>915</th>\n",
       "      <td>Assume there is a self-driving car with a sudd...</td>\n",
       "      <td>Assume there is a self-driving car with a sudd...</td>\n",
       "      <td>3 pregnant women; 3 large women</td>\n",
       "      <td>3 large women; 3 pregnant women</td>\n",
       "      <td>0</td>\n",
       "      <td>first More, then Less</td>\n",
       "      <td>Utilitarianism</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>Less</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>916</th>\n",
       "      <td>Assume there is a self-driving car with a sudd...</td>\n",
       "      <td>Assume there is a self-driving car with a sudd...</td>\n",
       "      <td>2 large women; 2 pregnant women</td>\n",
       "      <td>2 large women; 2 pregnant women</td>\n",
       "      <td>0</td>\n",
       "      <td>first Less, then More</td>\n",
       "      <td>Utilitarianism</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Less</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>917</th>\n",
       "      <td>Assume there is a self-driving car with a sudd...</td>\n",
       "      <td>Assume there is a self-driving car with a sudd...</td>\n",
       "      <td>2 large women; 2 pregnant women</td>\n",
       "      <td>2 large women; 2 pregnant women</td>\n",
       "      <td>0</td>\n",
       "      <td>first Less, then More</td>\n",
       "      <td>Utilitarianism</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>More</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>918</th>\n",
       "      <td>Assume there is a self-driving car with a sudd...</td>\n",
       "      <td>Assume there is a self-driving car with a sudd...</td>\n",
       "      <td>2 pregnant women; 2 large women</td>\n",
       "      <td>2 large women; 2 pregnant women</td>\n",
       "      <td>0</td>\n",
       "      <td>first More, then Less</td>\n",
       "      <td>Utilitarianism</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>More</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>919</th>\n",
       "      <td>Assume there is a self-driving car with a sudd...</td>\n",
       "      <td>Assume there is a self-driving car with a sudd...</td>\n",
       "      <td>2 pregnant women; 2 large women</td>\n",
       "      <td>2 large women; 2 pregnant women</td>\n",
       "      <td>0</td>\n",
       "      <td>first More, then Less</td>\n",
       "      <td>Utilitarianism</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>Less</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>920 rows × 34 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                Prompt  \\\n",
       "0    Assume there is a self-driving car with a sudd...   \n",
       "1    Assume there is a self-driving car with a sudd...   \n",
       "2    Assume there is a self-driving car with a sudd...   \n",
       "3    Assume there is a self-driving car with a sudd...   \n",
       "4    Assume there is a self-driving car with a sudd...   \n",
       "..                                                 ...   \n",
       "915  Assume there is a self-driving car with a sudd...   \n",
       "916  Assume there is a self-driving car with a sudd...   \n",
       "917  Assume there is a self-driving car with a sudd...   \n",
       "918  Assume there is a self-driving car with a sudd...   \n",
       "919  Assume there is a self-driving car with a sudd...   \n",
       "\n",
       "                                             prompt_en  \\\n",
       "0    Assume there is a self-driving car with a sudd...   \n",
       "1    Assume there is a self-driving car with a sudd...   \n",
       "2    Assume there is a self-driving car with a sudd...   \n",
       "3    Assume there is a self-driving car with a sudd...   \n",
       "4    Assume there is a self-driving car with a sudd...   \n",
       "..                                                 ...   \n",
       "915  Assume there is a self-driving car with a sudd...   \n",
       "916  Assume there is a self-driving car with a sudd...   \n",
       "917  Assume there is a self-driving car with a sudd...   \n",
       "918  Assume there is a self-driving car with a sudd...   \n",
       "919  Assume there is a self-driving car with a sudd...   \n",
       "\n",
       "                         two_choices        two_choices_unordered_set  \\\n",
       "0                   5 dogs; 5 people                 5 dogs; 5 people   \n",
       "1                   5 dogs; 5 people                 5 dogs; 5 people   \n",
       "2                   5 people; 5 dogs                 5 dogs; 5 people   \n",
       "3                   5 people; 5 dogs                 5 dogs; 5 people   \n",
       "4                    a dog; a person                  a dog; a person   \n",
       "..                               ...                              ...   \n",
       "915  3 pregnant women; 3 large women  3 large women; 3 pregnant women   \n",
       "916  2 large women; 2 pregnant women  2 large women; 2 pregnant women   \n",
       "917  2 large women; 2 pregnant women  2 large women; 2 pregnant women   \n",
       "918  2 pregnant women; 2 large women  2 large women; 2 pregnant women   \n",
       "919  2 pregnant women; 2 large women  2 large women; 2 pregnant women   \n",
       "\n",
       "     which_paraphrase           paraphrase_choice phenomenon_category  \\\n",
       "0                   0  first Animals, then Humans             Species   \n",
       "1                   0  first Animals, then Humans             Species   \n",
       "2                   0  first Humans, then Animals             Species   \n",
       "3                   0  first Humans, then Animals             Species   \n",
       "4                   0  first Animals, then Humans             Species   \n",
       "..                ...                         ...                 ...   \n",
       "915                 0       first More, then Less      Utilitarianism   \n",
       "916                 0       first Less, then More      Utilitarianism   \n",
       "917                 0       first Less, then More      Utilitarianism   \n",
       "918                 0       first More, then Less      Utilitarianism   \n",
       "919                 0       first More, then Less      Utilitarianism   \n",
       "\n",
       "     this_how_many_more_chars  this_row_is_about_left_or_right  \\\n",
       "0                           0                                0   \n",
       "1                           0                                1   \n",
       "2                           0                                0   \n",
       "3                           0                                1   \n",
       "4                           0                                0   \n",
       "..                        ...                              ...   \n",
       "915                         0                                1   \n",
       "916                         0                                0   \n",
       "917                         0                                1   \n",
       "918                         0                                0   \n",
       "919                         0                                1   \n",
       "\n",
       "    this_group_name  ...  FemaleDoctor ElderlyWoman ElderlyMan  Girl  Boy  \\\n",
       "0           Animals  ...           NaN          NaN        NaN   NaN  NaN   \n",
       "1            Humans  ...           NaN          NaN        NaN   NaN  NaN   \n",
       "2            Humans  ...           NaN          NaN        NaN   NaN  NaN   \n",
       "3           Animals  ...           NaN          NaN        NaN   NaN  NaN   \n",
       "4           Animals  ...           NaN          NaN        NaN   NaN  NaN   \n",
       "..              ...  ...           ...          ...        ...   ...  ...   \n",
       "915            Less  ...           NaN          NaN        NaN   NaN  NaN   \n",
       "916            Less  ...           NaN          NaN        NaN   NaN  NaN   \n",
       "917            More  ...           NaN          NaN        NaN   NaN  NaN   \n",
       "918            More  ...           NaN          NaN        NaN   NaN  NaN   \n",
       "919            Less  ...           NaN          NaN        NaN   NaN  NaN   \n",
       "\n",
       "     LargeWoman  LargeMan  FemaleAthlete  MaleAthlete  Pregnant  \n",
       "0           NaN       NaN            NaN          NaN       NaN  \n",
       "1           NaN       NaN            NaN          NaN       NaN  \n",
       "2           NaN       NaN            NaN          NaN       NaN  \n",
       "3           NaN       NaN            NaN          NaN       NaN  \n",
       "4           NaN       NaN            NaN          NaN       NaN  \n",
       "..          ...       ...            ...          ...       ...  \n",
       "915         3.0       NaN            NaN          NaN       NaN  \n",
       "916         2.0       NaN            NaN          NaN       NaN  \n",
       "917         NaN       NaN            NaN          NaN       2.0  \n",
       "918         NaN       NaN            NaN          NaN       2.0  \n",
       "919         2.0       NaN            NaN          NaN       NaN  \n",
       "\n",
       "[920 rows x 34 columns]"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fill the cache-path template with the settings for the English (\"en\") responses.\n",
    "template_fields = dict(\n",
    "    model_version=get_model_name_path(LLAMA_3_1_8B),\n",
    "    system_role=system_role,\n",
    "    lang=\"en\",\n",
    "    suffix=get_suffix(add_paraphrase, country),\n",
    "    translator_provider_forward=translator_provider_forward,\n",
    "    translator_provider_backward=translator_provider_backward,\n",
    "    analysis_backend_model_version=get_model_name_path(analysis_backend_model_version),\n",
    ")\n",
    "in_path = cache_parse_responses_tmpl.format(**template_fields)\n",
    "\n",
    "# The templated path is read relative to \"../../\" from this notebook.\n",
    "df = pd.read_csv(\"../../\" + in_path)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\\begin{tabular}{lrrrrr}\n",
      "\\toprule\n",
      " & Overall Dataset & en & de & zh-cn & sw \\\\\n",
      "\\midrule\n",
      "# Samples & 98440.0000 & 920.0000 & 920.0000 & 920.0000 & 920.0000 \\\\\n",
      "# Sentences/Sample & 1.0300 & 1.0000 & 1.0000 & 1.0000 & 1.0000 \\\\\n",
      "# Words/Sample & 46.8200 & 46.9800 & 41.6500 & 6.0000 & 38.4600 \\\\\n",
      "# Unique Words & 6625.0000 & 61.0000 & 71.0000 & 109.0000 & 70.0000 \\\\\n",
      "Type-Token Ratio & 0.0014 & 0.0014 & 0.0019 & 0.0197 & 0.0020 \\\\\n",
      "\\bottomrule\n",
      "\\end{tabular}\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from collections import Counter\n",
    "import re\n",
    "import os\n",
    "\n",
    "def calculate_statistics(df, column_name):\n",
    "    \"\"\"Summarise the texts stored in ``df[column_name]``.\n",
    "\n",
    "    Sentences are the non-blank pieces left after splitting on runs of\n",
    "    '.', '!' and '?'. Words are runs of word characters in the lower-cased\n",
    "    text.\n",
    "\n",
    "    Args:\n",
    "        df: DataFrame with one text sample per row.\n",
    "        column_name: Name of the column holding the text.\n",
    "\n",
    "    Returns:\n",
    "        dict with \"# Samples\", \"# Sentences/Sample\" and \"# Words/Sample\"\n",
    "        (means rounded to 2 decimals), \"# Unique Words\" and the unrounded\n",
    "        \"Type-Token Ratio\" (unique words / total words, 0 if there are no\n",
    "        words).\n",
    "    \"\"\"\n",
    "    def split_sentences(text):\n",
    "        # re.split leaves an empty piece after a trailing terminator\n",
    "        # (\"A. B.\" -> [\"A\", \" B\", \"\"]); drop blank pieces so it is not\n",
    "        # counted as an extra sentence.\n",
    "        return [piece for piece in re.split(r'[.!?]+', text) if piece.strip()]\n",
    "\n",
    "    def split_words(text):\n",
    "        return re.findall(r'\\w+', text.lower())\n",
    "\n",
    "    texts = df[column_name]\n",
    "    num_samples = len(df)\n",
    "    sentences_per_sample = texts.map(lambda text: len(split_sentences(text))).mean()\n",
    "\n",
    "    # Tokenise every text once and reuse the token lists for both the\n",
    "    # per-sample mean and the corpus-wide vocabulary.\n",
    "    word_lists = texts.map(split_words)\n",
    "    words_per_sample = word_lists.map(len).mean()\n",
    "\n",
    "    all_words = [word for words in word_lists for word in words]\n",
    "    unique_words = len(set(all_words))\n",
    "    total_words = len(all_words)\n",
    "    type_token_ratio = unique_words / total_words if total_words > 0 else 0\n",
    "\n",
    "    return {\n",
    "        \"# Samples\": num_samples,\n",
    "        \"# Sentences/Sample\": round(sentences_per_sample, 2),\n",
    "        \"# Words/Sample\": round(words_per_sample, 2),\n",
    "        \"# Unique Words\": unique_words,\n",
    "        \"Type-Token Ratio\": type_token_ratio\n",
    "    }\n",
    "\n",
    "\n",
    "# Configuration: values substituted into cache_parse_responses_tmpl further down in this cell.\n",
    "# base_path is joined in front of every templated cache path before it is read.\n",
    "base_path = \"../..\"\n",
    "system_role = \"normal\"\n",
    "# Forward / backward translation provider fields of the cache path.\n",
    "translator_provider_forward = \"google\"\n",
    "translator_provider_backward = \"google\"\n",
    "# Passed through get_model_name_path() before being placed in the cache path.\n",
    "analysis_backend_model_version = \"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n",
    "# Arguments for get_suffix(); the path construction in this cell passes suffix=\"\" directly instead.\n",
    "add_paraphrase = False\n",
    "country = None\n",
    "\n",
    "def get_suffix(add_paraphrase, country):\n",
    "    \"\"\"Return the cache-file suffix for the given options.\n",
    "\n",
    "    Placeholder: both arguments are ignored and the suffix is always the\n",
    "    empty string. Replace with the project's actual implementation.\n",
    "    \"\"\"\n",
    "    placeholder_suffix = \"\"\n",
    "    return placeholder_suffix\n",
    "from multi_tp.utils import LANGUAGES\n",
    "\n",
    "\n",
    "def load_parsed_responses(lang):\n",
    "    \"\"\"Read the cached parsed-responses CSV for one language code.\n",
    "\n",
    "    The relative path is cache_parse_responses_tmpl filled with the\n",
    "    configuration values of this cell (empty suffix); it is joined onto\n",
    "    base_path before reading.\n",
    "    \"\"\"\n",
    "    relative_path = cache_parse_responses_tmpl.format(\n",
    "        model_version=get_model_name_path(LLAMA_3_1_8B),\n",
    "        system_role=system_role,\n",
    "        lang=lang,\n",
    "        suffix=\"\",\n",
    "        translator_provider_forward=translator_provider_forward,\n",
    "        translator_provider_backward=translator_provider_backward,\n",
    "        analysis_backend_model_version=get_model_name_path(analysis_backend_model_version)\n",
    "    )\n",
    "    return pd.read_csv(os.path.join(base_path, relative_path))\n",
    "\n",
    "\n",
    "results = {}\n",
    "\n",
    "# Read every language file exactly once; the frames are reused for both the\n",
    "# pooled statistics and the per-language columns below.\n",
    "frames_by_lang = {lang: load_parsed_responses(lang) for lang in LANGUAGES}\n",
    "\n",
    "# Calculate overall statistics\n",
    "all_data = pd.concat([frames_by_lang[lang] for lang in LANGUAGES])\n",
    "\n",
    "overall_stats = calculate_statistics(all_data, \"Prompt\")\n",
    "results[\"Overall Dataset\"] = overall_stats\n",
    "\n",
    "# Calculate statistics for each language\n",
    "languages = [\"en\", \"de\", \"zh-cn\", \"sw\"]\n",
    "\n",
    "for lang in languages:\n",
    "    # Fall back to a fresh read if a reported language is not part of LANGUAGES.\n",
    "    df = frames_by_lang[lang] if lang in frames_by_lang else load_parsed_responses(lang)\n",
    "    results[lang] = calculate_statistics(df, \"Prompt\")\n",
    "\n",
    "# Create a DataFrame from the results (one column per dataset, one row per statistic)\n",
    "result_df = pd.DataFrame(results)\n",
    "\n",
    "# Print the results in a format suitable for LaTeX table\n",
    "print(result_df.to_latex(float_format=\"%.4f\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\\begin{tabular}{lrrrrr}\n",
      "\\toprule\n",
      " & Overall Dataset & en & de & zh-cn & sw \\\\\n",
      "\\midrule\n",
      "# Samples & 98440.0000 & 920.0000 & 920.0000 & 920.0000 & 920.0000 \\\\\n",
      "# Sentences/Sample & 1.0000 & 1.0000 & 1.0000 & 1.0000 & 1.0000 \\\\\n",
      "# Words/Sample & 51.0700 & 46.9800 & 41.6500 & 78.3100 & 38.4600 \\\\\n",
      "# Unique Words & 6492.0000 & 61.0000 & 71.0000 & 87.0000 & 70.0000 \\\\\n",
      "Type-Token Ratio & 0.0013 & 0.0014 & 0.0019 & 0.0012 & 0.0020 \\\\\n",
      "\\bottomrule\n",
      "\\end{tabular}\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import regex\n",
    "import os\n",
    "import unicodedata\n",
    "from collections import Counter\n",
    "\n",
    "# Language codes whose cached response files are processed below.\n",
    "LANGUAGES = [\n",
    "    \"af\", \"am\", \"ar\", \"az\", \"be\", \"bg\", \"bn\", \"bs\", \"ca\", \"ceb\",\n",
    "    \"co\", \"cs\", \"cy\", \"da\", \"de\", \"el\", \"en\", \"eo\", \"es\", \"et\",\n",
    "    \"eu\", \"fa\", \"fi\", \"fr\", \"fy\", \"ga\", \"gd\", \"gl\", \"gu\", \"ha\",\n",
    "    \"haw\", \"he\", \"hi\", \"hmn\", \"hr\", \"ht\", \"hu\", \"hy\", \"id\", \"ig\",\n",
    "    \"is\", \"it\", \"iw\", \"ja\", \"jw\", \"ka\", \"kk\", \"km\", \"kn\", \"ko\",\n",
    "    \"ku\", \"ky\", \"la\", \"lb\", \"lo\", \"lt\", \"lv\", \"mg\", \"mi\", \"mk\",\n",
    "    \"ml\", \"mn\", \"mr\", \"ms\", \"mt\", \"my\", \"ne\", \"nl\", \"no\", \"ny\",\n",
    "    \"or\", \"pa\", \"pl\", \"ps\", \"pt\", \"ro\", \"ru\", \"sd\", \"si\", \"sk\",\n",
    "    \"sl\", \"sm\", \"sn\", \"so\", \"sq\", \"sr\", \"st\", \"su\", \"sv\", \"sw\",\n",
    "    \"ta\", \"te\", \"tg\", \"th\", \"tl\", \"tr\", \"ug\", \"uk\", \"ur\", \"uz\",\n",
    "    \"vi\", \"xh\", \"yi\", \"yo\", \"zh-cn\", \"zh-tw\", \"zu\",\n",
    "]\n",
    "\n",
    "def split_sentences(text):\n",
    "    \"\"\"Break ``text`` into a list of sentences.\n",
    "\n",
    "    A boundary is a run of whitespace that directly follows '.', '!' or '?'\n",
    "    and is directly followed by an uppercase letter; the whitespace itself\n",
    "    is dropped from the result.\n",
    "\n",
    "    NOTE(review): text in scripts without letter case, or using other\n",
    "    sentence terminators, presumably comes back as a single sentence -\n",
    "    confirm this is acceptable for the non-Latin languages.\n",
    "    \"\"\"\n",
    "    boundary = r'(?<=[.!?])\\s+(?=[A-Z\\p{Lu}])'\n",
    "    return regex.split(boundary, text)\n",
    "\n",
    "def split_words(text, lang):\n",
    "    \"\"\"Split ``text`` into word tokens for language code ``lang``.\n",
    "\n",
    "    For zh-cn, zh-tw, ja, ko and th every letter character is its own token;\n",
    "    for all other languages a token is a maximal run of letters. Whitespace,\n",
    "    digits and punctuation never produce tokens in either mode.\n",
    "\n",
    "    Args:\n",
    "        text: The string to tokenise.\n",
    "        lang: Language code selecting the tokenisation mode.\n",
    "\n",
    "    Returns:\n",
    "        list of token strings in order of appearance.\n",
    "    \"\"\"\n",
    "    if lang in ['zh-cn', 'zh-tw', 'ja', 'ko', 'th']:\n",
    "        # For languages without clear word boundaries, treat each character as a word.\n",
    "        # Keep letters only (str.isalpha() is true for the Unicode letter categories)\n",
    "        # so punctuation and digits are not counted as words here while being\n",
    "        # ignored by the letter-run pattern in the branch below.\n",
    "        return [char for char in text if char.isalpha()]\n",
    "    else:\n",
    "        # For other languages, use a regex that works for most writing systems\n",
    "        return regex.findall(r'\\p{L}+', text)\n",
    "\n",
    "def calculate_statistics(df, column_name, lang):\n",
    "    \"\"\"Summarise the texts in ``df[column_name]`` for one language.\n",
    "\n",
    "    Args:\n",
    "        df: DataFrame with one text sample per row.\n",
    "        column_name: Name of the column holding the text.\n",
    "        lang: Language code forwarded to ``split_words``.\n",
    "\n",
    "    Returns:\n",
    "        dict with \"# Samples\", \"# Sentences/Sample\" and \"# Words/Sample\"\n",
    "        (means rounded to 2 decimals), \"# Unique Words\" and\n",
    "        \"Type-Token Ratio\" (unique words / total words rounded to 4\n",
    "        decimals, 0 if there are no words).\n",
    "    \"\"\"\n",
    "    texts = df[column_name]\n",
    "    num_samples = len(df)\n",
    "    sentences_per_sample = texts.map(lambda text: len(split_sentences(text))).mean()\n",
    "\n",
    "    # Tokenise every text once and reuse the token lists for both the\n",
    "    # per-sample mean and the corpus-wide vocabulary.\n",
    "    word_lists = texts.map(lambda text: split_words(text, lang))\n",
    "    words_per_sample = word_lists.map(len).mean()\n",
    "\n",
    "    all_words = [word for words in word_lists for word in words]\n",
    "    unique_words = len(set(all_words))\n",
    "    total_words = len(all_words)\n",
    "    type_token_ratio = unique_words / total_words if total_words > 0 else 0\n",
    "\n",
    "    return {\n",
    "        \"# Samples\": num_samples,\n",
    "        \"# Sentences/Sample\": round(sentences_per_sample, 2),\n",
    "        \"# Words/Sample\": round(words_per_sample, 2),\n",
    "        \"# Unique Words\": unique_words,\n",
    "        \"Type-Token Ratio\": round(type_token_ratio, 4)\n",
    "    }\n",
    "\n",
    "\n",
    "\n",
    "def get_suffix(add_paraphrase, country):\n",
    "    \"\"\"Return the cache-file suffix for the given options.\n",
    "\n",
    "    Placeholder: both arguments are ignored and the suffix is always the\n",
    "    empty string. Replace with the project's actual implementation.\n",
    "    \"\"\"\n",
    "    placeholder_suffix = \"\"\n",
    "    return placeholder_suffix\n",
    "\n",
    "# Calculate statistics for each language\n",
    "results = {}\n",
    "# Running totals over all languages; \"# Unique Words\" collects the pooled vocabulary.\n",
    "all_data_stats = {\n",
    "    \"# Samples\": 0,\n",
    "    \"# Sentences\": 0,\n",
    "    \"# Words\": 0,\n",
    "    \"# Unique Words\": set()\n",
    "}\n",
    "\n",
    "for lang in LANGUAGES:\n",
    "    file_path = os.path.join(base_path, cache_parse_responses_tmpl.format(\n",
    "        model_version=get_model_name_path(LLAMA_3_1_8B),\n",
    "        system_role=system_role,\n",
    "        lang=lang,\n",
    "        suffix=\"\",\n",
    "        translator_provider_forward=translator_provider_forward,\n",
    "        translator_provider_backward=translator_provider_backward,\n",
    "        analysis_backend_model_version=get_model_name_path(analysis_backend_model_version)\n",
    "    ))\n",
    "    df = pd.read_csv(file_path)\n",
    "    results[lang] = calculate_statistics(df, \"Prompt\", lang)\n",
    "\n",
    "    # Accumulate exact counts for the overall row. Multiplying the per-language\n",
    "    # means back by the sample count would carry their 2-decimal rounding into\n",
    "    # the totals.\n",
    "    token_lists = [split_words(text, lang) for text in df[\"Prompt\"]]\n",
    "    all_data_stats[\"# Samples\"] += len(df)\n",
    "    all_data_stats[\"# Sentences\"] += sum(len(split_sentences(text)) for text in df[\"Prompt\"])\n",
    "    all_data_stats[\"# Words\"] += sum(len(tokens) for tokens in token_lists)\n",
    "    all_data_stats[\"# Unique Words\"].update(token for tokens in token_lists for token in tokens)\n",
    "\n",
    "# Calculate overall statistics\n",
    "total_samples = all_data_stats[\"# Samples\"]\n",
    "total_words = all_data_stats[\"# Words\"]\n",
    "vocabulary_size = len(all_data_stats[\"# Unique Words\"])\n",
    "# Guard the divisions so an empty corpus gives NaN means and a ratio of 0\n",
    "# (the convention calculate_statistics uses) instead of ZeroDivisionError.\n",
    "overall_stats = {\n",
    "    \"# Samples\": total_samples,\n",
    "    \"# Sentences/Sample\": round(all_data_stats[\"# Sentences\"] / total_samples, 2) if total_samples else float(\"nan\"),\n",
    "    \"# Words/Sample\": round(total_words / total_samples, 2) if total_samples else float(\"nan\"),\n",
    "    \"# Unique Words\": vocabulary_size,\n",
    "    \"Type-Token Ratio\": round(vocabulary_size / total_words, 4) if total_words > 0 else 0\n",
    "}\n",
    "\n",
    "results[\"Overall Dataset\"] = overall_stats\n",
    "\n",
    "# Create a DataFrame from the results\n",
    "result_df = pd.DataFrame(results).T\n",
    "# Keep only the reported rows, then transpose back so each dataset is a column.\n",
    "result_df = result_df.loc[[\"Overall Dataset\", \"en\", \"de\", \"zh-cn\", \"sw\"]].T\n",
    "\n",
    "# Print the results in a format suitable for LaTeX table\n",
    "print(result_df.to_latex(float_format=\"%.4f\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th># Samples</th>\n",
       "      <th># Sentences/Sample</th>\n",
       "      <th># Words/Sample</th>\n",
       "      <th># Unique Words</th>\n",
       "      <th>Type-Token Ratio</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Overall Dataset</th>\n",
       "      <td>98440.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>51.07</td>\n",
       "      <td>6492.0</td>\n",
       "      <td>0.0013</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>en</th>\n",
       "      <td>920.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>46.98</td>\n",
       "      <td>61.0</td>\n",
       "      <td>0.0014</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>de</th>\n",
       "      <td>920.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>41.65</td>\n",
       "      <td>71.0</td>\n",
       "      <td>0.0019</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>zh-cn</th>\n",
       "      <td>920.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>78.31</td>\n",
       "      <td>87.0</td>\n",
       "      <td>0.0012</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sw</th>\n",
       "      <td>920.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>38.46</td>\n",
       "      <td>70.0</td>\n",
       "      <td>0.0020</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 # Samples  # Sentences/Sample  # Words/Sample  \\\n",
       "Overall Dataset    98440.0                 1.0           51.07   \n",
       "en                   920.0                 1.0           46.98   \n",
       "de                   920.0                 1.0           41.65   \n",
       "zh-cn                920.0                 1.0           78.31   \n",
       "sw                   920.0                 1.0           38.46   \n",
       "\n",
       "                 # Unique Words  Type-Token Ratio  \n",
       "Overall Dataset          6492.0            0.0013  \n",
       "en                         61.0            0.0014  \n",
       "de                         71.0            0.0019  \n",
       "zh-cn                      87.0            0.0012  \n",
       "sw                         70.0            0.0020  "
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display the statistics table held in result_df as rich notebook output.\n",
    "result_df"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
