{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append(\"../../\")\n",
    "\n",
    "from multi_tp.utils import LANGUAGES\n",
    "from multi_tp.translation import get_translator\n",
    "translator = get_translator(\"google\")\n",
    "from efficiency.log import  fread\n",
    "from tqdm import tqdm   \n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/gio/anaconda3/envs/TrolleyClean/lib/python3.11/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:11: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from tqdm.autonotebook import tqdm, trange\n"
     ]
    }
   ],
   "source": [
    "from sentence_transformers import SentenceTransformer\n",
    "\n",
    "# Sentence-embedding model; later cells call model.encode(...) on prompt text.\n",
    "model = SentenceTransformer(\n",
    "    \"sentence-transformers/paraphrase-MiniLM-L6-v2\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(384,)\n"
     ]
    }
   ],
   "source": [
    "# Smoke test: embed two sample sentences and print the shape of the first embedding.\n",
    "sentences = [\n",
    "    \"This is an example sentence\",\n",
    "    \"Each sentence is converted\",\n",
    "]\n",
    "embeddings = model.encode(sentences)\n",
    "print(embeddings[0].shape)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "\n",
    "def cosine_similarity(a, b):\n",
    "    \"\"\"Return the cosine similarity of two vectors.\n",
    "\n",
    "    Args:\n",
    "        a: First vector (array-like).\n",
    "        b: Second vector, same length as ``a``.\n",
    "\n",
    "    Returns:\n",
    "        ``np.dot(a, b) / (norm(a) * norm(b))``, or ``nan`` when either vector\n",
    "        has zero norm, because the similarity is undefined in that case.\n",
    "    \"\"\"\n",
    "    denominator = np.linalg.norm(a) * np.linalg.norm(b)\n",
    "    if denominator == 0:\n",
    "        # Return nan explicitly rather than relying on 0/0, which yields the\n",
    "        # same value but emits a RuntimeWarning.\n",
    "        return np.nan\n",
    "    return np.dot(a, b) / denominator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  9%|▉         | 10/107 [00:26<04:14,  2.62s/it]\n"
     ]
    },
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[6], line 13\u001b[0m\n\u001b[1;32m     11\u001b[0m     \u001b[38;5;28;01mfor\u001b[39;00m row \u001b[38;5;129;01min\u001b[39;00m fread(dataset_path, verbose\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m):\n\u001b[1;32m     12\u001b[0m         emb_back_translated \u001b[38;5;241m=\u001b[39m model\u001b[38;5;241m.\u001b[39mencode(row[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mprompt_en_back_translated\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[0;32m---> 13\u001b[0m         emb_original \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencode\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrow\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mprompt_en\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     14\u001b[0m         acc\u001b[38;5;241m.\u001b[39mappend({\n\u001b[1;32m     15\u001b[0m             \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlang\u001b[39m\u001b[38;5;124m\"\u001b[39m: lang,\n\u001b[1;32m     16\u001b[0m             \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msimilarity\u001b[39m\u001b[38;5;124m\"\u001b[39m: cosine_similarity(emb_back_translated, emb_original)\n\u001b[1;32m     17\u001b[0m         })\n\u001b[1;32m     18\u001b[0m df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame(acc)\n",
      "File \u001b[0;32m~/anaconda3/envs/TrolleyClean/lib/python3.11/site-packages/sentence_transformers/SentenceTransformer.py:477\u001b[0m, in \u001b[0;36mSentenceTransformer.encode\u001b[0;34m(self, sentences, prompt_name, prompt, batch_size, show_progress_bar, output_value, precision, convert_to_numpy, convert_to_tensor, device, normalize_embeddings)\u001b[0m\n\u001b[1;32m    474\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m device \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m    475\u001b[0m     device \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdevice\n\u001b[0;32m--> 477\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mto\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdevice\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    479\u001b[0m all_embeddings \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m    480\u001b[0m length_sorted_idx \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39margsort([\u001b[38;5;241m-\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_text_length(sen) \u001b[38;5;28;01mfor\u001b[39;00m sen \u001b[38;5;129;01min\u001b[39;00m sentences])\n",
      "File \u001b[0;32m~/anaconda3/envs/TrolleyClean/lib/python3.11/site-packages/torch/nn/modules/module.py:1174\u001b[0m, in \u001b[0;36mModule.to\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1171\u001b[0m         \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m   1172\u001b[0m             \u001b[38;5;28;01mraise\u001b[39;00m\n\u001b[0;32m-> 1174\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_apply\u001b[49m\u001b[43m(\u001b[49m\u001b[43mconvert\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/anaconda3/envs/TrolleyClean/lib/python3.11/site-packages/torch/nn/modules/module.py:780\u001b[0m, in \u001b[0;36mModule._apply\u001b[0;34m(self, fn, recurse)\u001b[0m\n\u001b[1;32m    778\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m recurse:\n\u001b[1;32m    779\u001b[0m     \u001b[38;5;28;01mfor\u001b[39;00m module \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mchildren():\n\u001b[0;32m--> 780\u001b[0m         \u001b[43mmodule\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_apply\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfn\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    782\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcompute_should_use_set_data\u001b[39m(tensor, tensor_applied):\n\u001b[1;32m    783\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m torch\u001b[38;5;241m.\u001b[39m_has_compatible_shallow_copy_type(tensor, tensor_applied):\n\u001b[1;32m    784\u001b[0m         \u001b[38;5;66;03m# If the new tensor has compatible tensor type as the existing tensor,\u001b[39;00m\n\u001b[1;32m    785\u001b[0m         \u001b[38;5;66;03m# the current behavior is to change the tensor in-place using `.data =`,\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    790\u001b[0m         \u001b[38;5;66;03m# global flag to let the user control whether they want the future\u001b[39;00m\n\u001b[1;32m    791\u001b[0m         \u001b[38;5;66;03m# behavior of overwriting the existing tensor or not.\u001b[39;00m\n",
      "File \u001b[0;32m~/anaconda3/envs/TrolleyClean/lib/python3.11/site-packages/torch/nn/modules/module.py:780\u001b[0m, in \u001b[0;36mModule._apply\u001b[0;34m(self, fn, recurse)\u001b[0m\n\u001b[1;32m    778\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m recurse:\n\u001b[1;32m    779\u001b[0m     \u001b[38;5;28;01mfor\u001b[39;00m module \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mchildren():\n\u001b[0;32m--> 780\u001b[0m         \u001b[43mmodule\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_apply\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfn\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    782\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcompute_should_use_set_data\u001b[39m(tensor, tensor_applied):\n\u001b[1;32m    783\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m torch\u001b[38;5;241m.\u001b[39m_has_compatible_shallow_copy_type(tensor, tensor_applied):\n\u001b[1;32m    784\u001b[0m         \u001b[38;5;66;03m# If the new tensor has compatible tensor type as the existing tensor,\u001b[39;00m\n\u001b[1;32m    785\u001b[0m         \u001b[38;5;66;03m# the current behavior is to change the tensor in-place using `.data =`,\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    790\u001b[0m         \u001b[38;5;66;03m# global flag to let the user control whether they want the future\u001b[39;00m\n\u001b[1;32m    791\u001b[0m         \u001b[38;5;66;03m# behavior of overwriting the existing tensor or not.\u001b[39;00m\n",
      "    \u001b[0;31m[... skipping similar frames: Module._apply at line 780 (4 times)]\u001b[0m\n",
      "File \u001b[0;32m~/anaconda3/envs/TrolleyClean/lib/python3.11/site-packages/torch/nn/modules/module.py:780\u001b[0m, in \u001b[0;36mModule._apply\u001b[0;34m(self, fn, recurse)\u001b[0m\n\u001b[1;32m    778\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m recurse:\n\u001b[1;32m    779\u001b[0m     \u001b[38;5;28;01mfor\u001b[39;00m module \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mchildren():\n\u001b[0;32m--> 780\u001b[0m         \u001b[43mmodule\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_apply\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfn\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    782\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcompute_should_use_set_data\u001b[39m(tensor, tensor_applied):\n\u001b[1;32m    783\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m torch\u001b[38;5;241m.\u001b[39m_has_compatible_shallow_copy_type(tensor, tensor_applied):\n\u001b[1;32m    784\u001b[0m         \u001b[38;5;66;03m# If the new tensor has compatible tensor type as the existing tensor,\u001b[39;00m\n\u001b[1;32m    785\u001b[0m         \u001b[38;5;66;03m# the current behavior is to change the tensor in-place using `.data =`,\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    790\u001b[0m         \u001b[38;5;66;03m# global flag to let the user control whether they want the future\u001b[39;00m\n\u001b[1;32m    791\u001b[0m         \u001b[38;5;66;03m# behavior of overwriting the existing tensor or not.\u001b[39;00m\n",
      "File \u001b[0;32m~/anaconda3/envs/TrolleyClean/lib/python3.11/site-packages/torch/nn/modules/module.py:779\u001b[0m, in \u001b[0;36mModule._apply\u001b[0;34m(self, fn, recurse)\u001b[0m\n\u001b[1;32m    777\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_apply\u001b[39m(\u001b[38;5;28mself\u001b[39m, fn, recurse\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m):\n\u001b[1;32m    778\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m recurse:\n\u001b[0;32m--> 779\u001b[0m \u001b[43m        \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mmodule\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mchildren\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[1;32m    780\u001b[0m \u001b[43m            \u001b[49m\u001b[43mmodule\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_apply\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfn\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    782\u001b[0m     \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcompute_should_use_set_data\u001b[39m(tensor, tensor_applied):\n",
      "File \u001b[0;32m~/anaconda3/envs/TrolleyClean/lib/python3.11/site-packages/torch/nn/modules/module.py:2339\u001b[0m, in \u001b[0;36mModule.children\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m   2334\u001b[0m     gen \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_named_members(\n\u001b[1;32m   2335\u001b[0m         \u001b[38;5;28;01mlambda\u001b[39;00m module: module\u001b[38;5;241m.\u001b[39m_buffers\u001b[38;5;241m.\u001b[39mitems(),\n\u001b[1;32m   2336\u001b[0m         prefix\u001b[38;5;241m=\u001b[39mprefix, recurse\u001b[38;5;241m=\u001b[39mrecurse, remove_duplicate\u001b[38;5;241m=\u001b[39mremove_duplicate)\n\u001b[1;32m   2337\u001b[0m     \u001b[38;5;28;01myield from\u001b[39;00m gen\n\u001b[0;32m-> 2339\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mchildren\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Iterator[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mModule\u001b[39m\u001b[38;5;124m'\u001b[39m]:\n\u001b[1;32m   2340\u001b[0m \u001b[38;5;250m    \u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124;03m\"\"\"Return an iterator over immediate children modules.\u001b[39;00m\n\u001b[1;32m   2341\u001b[0m \n\u001b[1;32m   2342\u001b[0m \u001b[38;5;124;03m    Yields:\u001b[39;00m\n\u001b[1;32m   2343\u001b[0m \u001b[38;5;124;03m        Module: a child module\u001b[39;00m\n\u001b[1;32m   2344\u001b[0m \u001b[38;5;124;03m    \"\"\"\u001b[39;00m\n\u001b[1;32m   2345\u001b[0m     \u001b[38;5;28;01mfor\u001b[39;00m name, module \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnamed_children():\n",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "# For every language, embed the `prompt_en` and `prompt_en_back_translated`\n",
    "# columns of its dataset and record the cosine similarity of each pair.\n",
    "# The path template does not depend on the loop variable, so it is built once.\n",
    "dataset_back_translated_file_tmpl = (\n",
    "    \"../../data/datasets/dataset_{lang}+{translator_provider_forward}{suffix}_back_translated.csv\"\n",
    ")\n",
    "\n",
    "acc = []\n",
    "for lang in tqdm(LANGUAGES):\n",
    "    dataset_path = dataset_back_translated_file_tmpl.format(\n",
    "        lang=lang,\n",
    "        translator_provider_forward=\"google\",\n",
    "        suffix=\"\"\n",
    "    )\n",
    "    rows = list(fread(dataset_path, verbose=False))\n",
    "    if not rows:\n",
    "        continue  # no rows for this language, nothing to embed\n",
    "    # Encode each column with one batched call per language instead of one call\n",
    "    # per row: every encode() call has a fixed cost (it moves the model to its\n",
    "    # device before encoding), which dominated the per-row version.\n",
    "    embs_back_translated = model.encode([row[\"prompt_en_back_translated\"] for row in rows])\n",
    "    embs_original = model.encode([row[\"prompt_en\"] for row in rows])\n",
    "    for emb_back_translated, emb_original in zip(embs_back_translated, embs_original):\n",
    "        acc.append({\n",
    "            \"lang\": lang,\n",
    "            \"similarity\": cosine_similarity(emb_back_translated, emb_original)\n",
    "        })\n",
    "df = pd.DataFrame(acc)\n",
    "df.to_csv(\"analysis_translation_dataset.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAh8AAAGdCAYAAACyzRGfAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAbK0lEQVR4nO3de5DVZf3A8Q+33SVhl0DdhVwCL4n3RkrcsjLaXIlRTCqczMgxm2p1kq0MKkXrN0I1g5aBNo3KOKNSjoqjGGZb4FSLl9UuZjJeMCnctSx2EWMh9vn90bDTBuqeZc+zl16vmfPHfs/3POc5n77Bm7Nn12EppRQAAJkM7+8NAAD/W8QHAJCV+AAAshIfAEBW4gMAyEp8AABZiQ8AICvxAQBkNbK/N/DfOjs7Y8uWLTF27NgYNmxYf28HAOiBlFJs27YtJk2aFMOHv/57GwMuPrZs2RLV1dX9vQ0AoBc2b94chxxyyOueM+DiY+zYsRHx782Xl5f3824AgJ5ob2+P6urqrr/HX8+Ai48932opLy8XHwAwyPTkIxM+cAoAZCU+AICsxAcAkJX4AACyEh8AQFbiAwDISnwAAFmJDwAgK/EBAGQlPgCArMQHAJCV+AAAshIfAEBW4gMAyGpkf28AgOKYsnBNf2+hYM8vnd3fWyAD73wAAFmJDwAgK/EBAGQlPgCArMQHAJCV+AAAshIfAEBW4gMAyEp8AABZiQ8AICvxAQBkJT4AgKzEBwCQlfgAALISHwBAVuIDAMhKfAAAWYkPACAr8QEAZCU+AICsxAcAkJX4AACyEh8AQFbiAwDISnwAAFmJDwAgK/EBAGS1X/GxdOnSGDZsWFxyySVdx3bs2BH19fUxYcKEGDNmTMydOzdaW1v3d58AwBDR6/h45JFH4gc/+EEcf/zx3Y4vWLAg7rnnnrj99ttj/fr1sWXLljj77LP3e6MAwNDQq/h45ZVX4txzz40f/vCH8eY3v7nreFtbW9xwww2xbNmymDlzZkyfPj1uuumm+PWvfx0bNmzos00DAINXr+Kjvr4+Zs+eHbW1td2ONzc3x65du7odnzZtWkyePDmampr2uVZHR0e0t7d3uwEAQ9fIQh+watWqeOyxx+KRRx7Z676WlpYoKSmJcePGdTteWVkZLS0t+1xvyZIlceWVVxa6DQBgkCronY/NmzfHF77whbjllluirKysTzawaNGiaGtr67pt3ry5T9YFAAamguKjubk5XnrppTjxxBNj5MiRMXLkyFi/fn1873vfi5EjR0ZlZWXs3Lkztm7d2u1xra2tUVVVtc81S0tLo7y8vNsNABi6Cvq2ywc+8IH4/e9/3+3Y+eefH9OmTYuvfOUrUV1dHaNGjYrGxsaYO3duRERs3LgxXnjhhaipqem7XQMAg1ZB8TF27Ng49thjux074IADYsKECV3HL7jggmhoaIjx48dHeXl5XHzxxVFTUxMnn3xy3+0aABi0Cv7A6Ru5+uqrY/jw4TF37tzo6OiIurq6WLFiRV8/DQAwSA1LKaX+3sR/am9vj4qKimhra/P5D4D9MGXhmv7eQsGeXzq7v7dALxXy97f/tgsAkJX4AACyEh8AQFbiAwDISnwAAFmJDwAgK/EBAGQlPgCArMQHAJCV+AAAshIfAEBW4gMAyEp8AABZiQ8AICvxAQBkJT4AgKzEBwCQlfgAALISHwBAVuIDAMhKfAAAWYkPACAr8QEAZCU+AICsxAcAkJX4AACyEh8AQFbiAwDISnwAAFmJDwAgK/EBAGQlPgCArMQHAJCV+AAAshIfAEBW4gMAyEp8AABZiQ8AICvxAQBkJT4AgKzEBwCQlfgAALISHwBAVuIDAMhKfAAAWYkPACAr8QEAZCU+AICsxAcAkJX4AACyEh8AQFbiAwDISnwAAFmJDwAgK/EBAGQlPgCArMQHAJCV+AAAshIfAEBW4gMAyEp8AABZiQ8AICvxAQBkJT4AgKzE
BwCQlfgAALISHwBAVuIDAMhKfAAAWRUUH9ddd10cf/zxUV5eHuXl5VFTUxM/+clPuu7fsWNH1NfXx4QJE2LMmDExd+7caG1t7fNNAwCDV0Hxccghh8TSpUujubk5Hn300Zg5c2bMmTMn/vCHP0RExIIFC+Kee+6J22+/PdavXx9btmyJs88+uygbBwAGp2EppbQ/C4wfPz6+853vxEc+8pE46KCD4tZbb42PfOQjERHx1FNPxVFHHRVNTU1x8skn92i99vb2qKioiLa2tigvL9+frQH8T5uycE1/b6Fgzy+d3d9boJcK+fu715/52L17d6xatSq2b98eNTU10dzcHLt27Yra2tquc6ZNmxaTJ0+Opqam11yno6Mj2tvbu90AgKGr4Pj4/e9/H2PGjInS0tL47Gc/G3fddVccffTR0dLSEiUlJTFu3Lhu51dWVkZLS8trrrdkyZKoqKjoulVXVxf8IgCAwaPg+DjyyCPjN7/5TTz00EPxuc99LubPnx9PPvlkrzewaNGiaGtr67pt3ry512sBAAPfyEIfUFJSEocffnhEREyfPj0eeeSR+O53vxvz5s2LnTt3xtatW7u9+9Ha2hpVVVWvuV5paWmUlpYWvnMAYFDa79/z0dnZGR0dHTF9+vQYNWpUNDY2dt23cePGeOGFF6KmpmZ/nwYAGCIKeudj0aJFMWvWrJg8eXJs27Ytbr311li3bl3cf//9UVFRERdccEE0NDTE+PHjo7y8PC6++OKoqanp8U+6AABDX0Hx8dJLL8UnP/nJePHFF6OioiKOP/74uP/+++ODH/xgRERcffXVMXz48Jg7d250dHREXV1drFixoigbBwAGp/3+PR99ze/5AOgbfs8HOWX5PR8AAL0hPgCArMQHAJCV+AAAshIfAEBW4gMAyEp8AABZiQ8AICvxAQBkJT4AgKzEBwCQlfgAALISHwBAVuIDAMhKfAAAWYkPACAr8QEAZDWyvzcAAHtMWbimv7dQsOeXzu7vLQw63vkAALISHwBAVuIDAMhKfAAAWYkPACAr8QEAZCU+AICsxAcAkJX4AACyEh8AQFbiAwDISnwAAFmJDwAgK/EBAGQlPgCArMQHAJCV+AAAshIfAEBW4gMAyEp8AABZiQ8AICvxAQBkJT4AgKzEBwCQlfgAALISHwBAVuIDAMhKfAAAWYkPACAr8QEAZCU+AICsxAcAkJX4AACyEh8AQFbiAwDISnwAAFmJDwAgK/EBAGQlPgCArMQHAJCV+AAAshIfAEBW4gMAyEp8AABZiQ8AICvxAQBkJT4AgKzEBwCQlfgAALISHwBAVuIDAMiqoPhYsmRJvPOd74yxY8fGwQcfHGeddVZs3Lix2zk7duyI+vr6mDBhQowZMybmzp0bra2tfbppAGDwKig+1q9fH/X19bFhw4Z44IEHYteuXXHaaafF9u3bu85ZsGBB3HPPPXH77bfH+vXrY8uWLXH22Wf3+cYBgMFpZCEnr127ttvXK1eujIMPPjiam5vjve99b7S1tcUNN9wQt956a8ycOTMiIm666aY46qijYsOGDXHyySf33c4BgEFpvz7z0dbWFhER48ePj4iI5ubm2LVrV9TW1nadM23atJg8eXI0NTXtc42Ojo5ob2/vdgMAhq5ex0dnZ2dccskl8e53vzuOPfbYiIhoaWmJkpKSGDduXLdzKysro6WlZZ/rLFmyJCoqKrpu1dXVvd0SADAI9Do+6uvr44knnohVq1bt1wYWLVoUbW1tXbfNmzfv13oAwMBW0Gc+9rjooovi3nvvjQcffDAOOeSQruNVVVWxc+fO2Lp1a7d3P1pbW6Oqqmqfa5WWlkZpaWlvtgEADEIFvfORUoqLLroo7rrrrvj5z38eU6dO7Xb/9OnTY9SoUdHY2Nh1bOPGjfHCCy9ETU1N3+wYABjUCnrno76+Pm699da4++67Y+zYsV2f46ioqIjRo0dHRUVFXHDBBdHQ0BDjx4+P8vLyuPjii6Om
psZPugAAEVFgfFx33XUREXHqqad2O37TTTfFpz71qYiIuPrqq2P48OExd+7c6OjoiLq6ulixYkWfbBYAGPwKio+U0hueU1ZWFsuXL4/ly5f3elMAwNDlv+0CAGQlPgCArMQHAJCV+AAAshIfAEBW4gMAyEp8AABZiQ8AICvxAQBkJT4AgKzEBwCQlfgAALISHwBAVuIDAMhKfAAAWYkPACAr8QEAZCU+AICsxAcAkJX4AACyEh8AQFbiAwDISnwAAFmJDwAgK/EBAGQlPgCArMQHAJCV+AAAshIfAEBW4gMAyEp8AABZiQ8AICvxAQBkJT4AgKzEBwCQlfgAALISHwBAVuIDAMhKfAAAWYkPACAr8QEAZCU+AICsxAcAkJX4AACyEh8AQFbiAwDISnwAAFmJDwAgK/EBAGQ1sr83ADDQTVm4pr+3AEOKdz4AgKzEBwCQlfgAALISHwBAVuIDAMhKfAAAWYkPACAr8QEAZCU+AICsxAcAkJX4AACyEh8AQFbiAwDISnwAAFmJDwAgK/EBAGQlPgCArMQHAJCV+AAAsio4Ph588ME444wzYtKkSTFs2LBYvXp1t/tTSnH55ZfHxIkTY/To0VFbWxtPP/10X+0XABjkCo6P7du3xwknnBDLly/f5/3f/va343vf+15cf/318dBDD8UBBxwQdXV1sWPHjv3eLAAw+I0s9AGzZs2KWbNm7fO+lFJcc8018fWvfz3mzJkTERE333xzVFZWxurVq+Occ87Zv90CAINen37mY9OmTdHS0hK1tbVdxyoqKmLGjBnR1NS0z8d0dHREe3t7txsAMHT1aXy0tLRERERlZWW345WVlV33/bclS5ZERUVF1626urovtwQADDD9/tMuixYtira2tq7b5s2b+3tLAEAR9Wl8VFVVRUREa2trt+Otra1d9/230tLSKC8v73YDAIauPo2PqVOnRlVVVTQ2NnYda29vj4ceeihqamr68qkAgEGq4J92eeWVV+KZZ57p+nrTpk3xm9/8JsaPHx+TJ0+OSy65JP7v//4vjjjiiJg6dWpcdtllMWnSpDjrrLP6ct8AwCBVcHw8+uij8f73v7/r64aGhoiImD9/fqxcuTIuvfTS2L59e3zmM5+JrVu3ximnnBJr166NsrKyvts1ADBoDUsppf7exH9qb2+PioqKaGtr8/kPYECYsnBNf2+BAez5pbP7ewsDQiF/f/f7T7sAAP9bCv62CzBwDMZ/kftXIuCdDwAgK/EBAGQlPgCArMQHAJCV+AAAshIfAEBW4gMAyMrv+QCyGoy/mwToW975AACyEh8AQFbiAwDISnwAAFmJDwAgK/EBAGQlPgCArMQHAJCV+AAAshIfAEBW4gMAyEp8AABZiQ8AICvxAQBkJT4AgKzEBwCQlfgAALISHwBAVuIDAMhKfAAAWYkPACAr8QEAZCU+AICsxAcAkJX4AACyEh8AQFbiAwDISnwAAFmJDwAgK/EBAGQlPgCArMQHAJCV+AAAshIfAEBW4gMAyEp8AABZiQ8AICvxAQBkJT4AgKxG9vcGAGAwm7JwTX9voWDPL53dr8/vnQ8AICvxAQBkJT4AgKzEBwCQlfgAALISHwBAVuIDAMhKfAAAWYkPACAr8QEAZCU+AICsxAcAkJX4AACyEh8AQFbiAwDIamR/byC3KQvX9PcW/ic8v3R2f2+hYK4NgDy88wEAZCU+AICsxAcAkFXR4mP58uUxZcqUKCsrixkzZsTDDz9crKcCAAaRosTHj370o2hoaIjFixfHY489FieccELU1dXFSy+9VIynAwAGkaLEx7Jly+LCCy+M888/P44++ui4/vrr401velPceOONxXg6AGAQ6fMftd25c2c0NzfHokWLuo4NHz48amtro6mpaa/zOzo6oqOjo+vrtra2iIhob2/v661FRERnx6tFWZfuivW/XzG5NoD/FcX4M3rPmimlNzy3z+Pjb3/7W+zevTsqKyu7Ha+srIynnnpq
r/OXLFkSV1555V7Hq6ur+3prZFRxTX/vAIDXUsw/o7dt2xYVFRWve06//5KxRYsWRUNDQ9fXnZ2d8fe//z0mTJgQw4YNK/rzt7e3R3V1dWzevDnKy8uL/nyDlTn1nFn1jDn1jDn1nFn1TLHmlFKKbdu2xaRJk97w3D6PjwMPPDBGjBgRra2t3Y63trZGVVXVXueXlpZGaWlpt2Pjxo3r6229ofLychdrD5hTz5lVz5hTz5hTz5lVzxRjTm/0jsceff6B05KSkpg+fXo0NjZ2Hevs7IzGxsaoqanp66cDAAaZonzbpaGhIebPnx/veMc74qSTToprrrkmtm/fHueff34xng4AGESKEh/z5s2Lv/71r3H55ZdHS0tLvP3tb4+1a9fu9SHUgaC0tDQWL16817d+6M6ces6sesacesaces6semYgzGlY6snPxAAA9BH/bRcAICvxAQBkJT4AgKzEBwCQ1ZCMj+XLl8eUKVOirKwsZsyYEQ8//PBrnnvqqafGsGHD9rrNnj2765yUUlx++eUxceLEGD16dNTW1sbTTz+d46UUVV/P6VOf+tRe959++uk5XkpRFTKniIhrrrkmjjzyyBg9enRUV1fHggULYseOHfu15mDQ13O64oor9rqepk2bVuyXkUUhs9q1a1d84xvfiMMOOyzKysrihBNOiLVr1+7XmoNFX89pKF5TDz74YJxxxhkxadKkGDZsWKxevfoNH7Nu3bo48cQTo7S0NA4//PBYuXLlXucU/XpKQ8yqVatSSUlJuvHGG9Mf/vCHdOGFF6Zx48al1tbWfZ7/8ssvpxdffLHr9sQTT6QRI0akm266qeucpUuXpoqKirR69er029/+Np155plp6tSp6Z///GemV9X3ijGn+fPnp9NPP73beX//+98zvaLiKHROt9xySyotLU233HJL2rRpU7r//vvTxIkT04IFC3q95mBQjDktXrw4HXPMMd2up7/+9a+5XlLRFDqrSy+9NE2aNCmtWbMmPfvss2nFihWprKwsPfbYY71eczAoxpyG4jV13333pa997WvpzjvvTBGR7rrrrtc9/7nnnktvetObUkNDQ3ryySfTtddem0aMGJHWrl3bdU6O62nIxcdJJ52U6uvru77evXt3mjRpUlqyZEmPHn/11VensWPHpldeeSWllFJnZ2eqqqpK3/nOd7rO2bp1ayotLU233XZb324+o76eU0r/jo85c+b09Vb7VaFzqq+vTzNnzux2rKGhIb373e/u9ZqDQTHmtHjx4nTCCScUZb/9qdBZTZw4MX3/+9/vduzss89O5557bq/XHAyKMaehek3t0ZP4uPTSS9MxxxzT7di8efNSXV1d19c5rqch9W2XnTt3RnNzc9TW1nYdGz58eNTW1kZTU1OP1rjhhhvinHPOiQMOOCAiIjZt2hQtLS3d1qyoqIgZM2b0eM2Bphhz2mPdunVx8MEHx5FHHhmf+9zn4uWXX+7TvefUmzm9613viubm5q63KJ977rm477774kMf+lCv1xzoijGnPZ5++umYNGlSHHrooXHuuefGCy+8ULwXkkFvZtXR0RFlZWXdjo0ePTp++ctf9nrNga4Yc9pjqF1ThWpqauo214iIurq6rrnmup6GVHz87W9/i927d+/1m1QrKyujpaXlDR//8MMPxxNPPBGf/vSnu47teVxv1xyIijGniIjTTz89br755mhsbIxvfetbsX79+pg1a1bs3r27T/efS2/m9PGPfzy+8Y1vxCmnnBKjRo2Kww47LE499dT46le/2us1B7pizCkiYsaMGbFy5cpYu3ZtXHfddbFp06Z4z3veE9u2bSvq6ymm3syqrq4uli1bFk8//XR0dnbGAw88EHfeeWe8+OKLvV5zoCvGnCKG5jVVqJaWln3Otb29Pf75z39mu56GVHzsrxtuuCGOO+64OOmkk/p7KwPaa83pnHPOiTPPPDOOO+64OOuss+Lee++NRx55JNatW9c/G+0H69at
i6uuuipWrFgRjz32WNx5552xZs2a+OY3v9nfWxtQejKnWbNmxUc/+tE4/vjjo66uLu67777YunVr/PjHP+7Hnef33e9+N4444oiYNm1alJSUxEUXXRTnn39+DB/uj+//1JM5uaYGjiF19R544IExYsSIaG1t7Xa8tbU1qqqqXvex27dvj1WrVsUFF1zQ7fiex/VmzYGqGHPal0MPPTQOPPDAeOaZZ/Zrv/2lN3O67LLL4rzzzotPf/rTcdxxx8WHP/zhuOqqq2LJkiXR2dm5X7MfqIoxp30ZN25cvO1tbxu011NE72Z10EEHxerVq2P79u3xpz/9KZ566qkYM2ZMHHroob1ec6Arxpz2ZShcU4Wqqqra51zLy8tj9OjR2a6nIRUfJSUlMX369GhsbOw61tnZGY2NjVFTU/O6j7399tujo6MjPvGJT3Q7PnXq1Kiqquq2Znt7ezz00ENvuOZAVYw57cuf//znePnll2PixIn7vef+0Js5vfrqq3v9i3TEiBER8e8f2d6f2Q9UxZjTvrzyyivx7LPPDtrrKWL//r9XVlYWb3nLW+Jf//pX3HHHHTFnzpz9XnOgKsac9mUoXFOFqqmp6TbXiIgHHniga67Zrqc+++jqALFq1apUWlqaVq5cmZ588sn0mc98Jo0bNy61tLSklFI677zz0sKFC/d63CmnnJLmzZu3zzWXLl2axo0bl+6+++70u9/9Ls2ZM2dI/KhtX85p27Zt6Utf+lJqampKmzZtSj/72c/SiSeemI444oi0Y8eOor+eYil0TosXL05jx45Nt912W3ruuefST3/603TYYYelj33sYz1eczAqxpy++MUvpnXr1qVNmzalX/3qV6m2tjYdeOCB6aWXXsr++vpSobPasGFDuuOOO9Kzzz6bHnzwwTRz5sw0derU9I9//KPHaw5GxZjTULymtm3blh5//PH0+OOPp4hIy5YtS48//nj605/+lFJKaeHChem8887rOn/Pj9p++ctfTn/84x/T8uXL9/mjtsW+noZcfKSU0rXXXpsmT56cSkpK0kknnZQ2bNjQdd/73ve+NH/+/G7nP/XUUyki0k9/+tN9rtfZ2Zkuu+yyVFlZmUpLS9MHPvCBtHHjxmK+hCz6ck6vvvpqOu2009JBBx2URo0ald761remCy+8cFD/4bdHIXPatWtXuuKKK9Jhhx2WysrKUnV1dfr85z/f7Q/AN1pzsOrrOc2bNy9NnDgxlZSUpLe85S1p3rx56Zlnnsn4ioqnkFmtW7cuHXXUUam0tDRNmDAhnXfeeekvf/lLQWsOVn09p6F4Tf3iF79IEbHXbc9s5s+fn973vvft9Zi3v/3tqaSkJB166KHdfl/THsW+noal9BrvcQIAFMGQ+swHADDwiQ8AICvxAQBkJT4AgKzEBwCQlfgAALISHwBAVuIDAMhKfAAAWYkPACAr8QEAZCU+AICs/h8LULMcTrxQCwAAAABJRU5ErkJggg==",
      "text/plain": [
       "<Figure size 640x480 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# Average the per-row similarities within each language. The column is selected\n",
    "# explicitly so extra columns added to `df` later cannot leak into the mean;\n",
    "# `df_lang` stays a DataFrame indexed by language with a `similarity` column.\n",
    "df_lang = df.groupby(\"lang\")[[\"similarity\"]].mean()\n",
    "\n",
    "# Histogram (10 bins) of the per-language mean similarity.\n",
    "fig, ax = plt.subplots()\n",
    "ax.hist(df_lang[\"similarity\"], bins=10)\n",
    "ax.set_xlabel(\"Mean cosine similarity (prompt_en vs. prompt_en_back_translated)\")\n",
    "ax.set_ylabel(\"Number of languages\")\n",
    "ax.set_title(\"Back-translation similarity per language\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'df_lang' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[1], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mdf_lang\u001b[49m[df_lang[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msimilarity\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m0.9\u001b[39m]\n",
      "\u001b[0;31mNameError\u001b[0m: name 'df_lang' is not defined"
     ]
    }
   ],
   "source": [
    "# Languages whose mean similarity is below 0.9.\n",
    "# TODO: correlate with number of speakers and with alignment (to show it is not influenced by them).\n",
    "df_lang.loc[df_lang[\"similarity\"] < 0.9]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "TrolleyClean",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
