{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f47f6552",
   "metadata": {
    "vscode": {
     "languageId": "plaintext"
    }
   },
   "outputs": [],
   "source": [
    "from transformers import AutoProcessor, SeamlessM4Tv2ForTextToText\n",
    "import torch\n",
    "\n",
    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "\n",
    "model_name = \"facebook/seamless-m4t-v2-large\"\n",
    "processor = AutoProcessor.from_pretrained(model_name)\n",
    "tr_model = SeamlessM4Tv2ForTextToText.from_pretrained(model_name).to(device)\n",
    "\n",
    "lang_code_map = {\n",
    "    'chinese': 'cmn',\n",
    "    'hindi': 'hin',\n",
    "    'urdu': 'urd',\n",
    "    'german': 'deu',\n",
    "    'spanish': 'spa',\n",
    "    'english': 'eng'\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4b944731",
   "metadata": {
    "vscode": {
     "languageId": "plaintext"
    }
   },
   "outputs": [],
   "source": [
    "def translate_from_english_seamless(text, target_lang):\n",
    "    src_lang = lang_code_map['english']\n",
    "    tgt_lang = lang_code_map[target_lang]\n",
    "    \n",
    "    inputs = processor(text=text, src_lang=src_lang, tgt_lang=tgt_lang, return_tensors=\"pt\").to(device)\n",
    "    \n",
    "    with torch.no_grad():\n",
    "        generated = tr_model.generate(**inputs, tgt_lang=tgt_lang)\n",
    "        decoded = processor.batch_decode(generated, skip_special_tokens=True)\n",
    "    \n",
    "    return decoded\n",
    "\n",
    "def translate_to_english_seamless(text, source_lang): \n",
    "    src_lang = lang_code_map[source_lang]\n",
    "    tgt_lang = lang_code_map['english']\n",
    "    \n",
    "    inputs = processor(text=text, src_lang=src_lang, tgt_lang=tgt_lang, return_tensors=\"pt\").to(device)\n",
    "    \n",
    "    with torch.no_grad():\n",
    "        generated = tr_model.generate(**inputs, tgt_lang=tgt_lang)\n",
    "        decoded = processor.batch_decode(generated, skip_special_tokens=True)\n",
    "    \n",
    "    return decoded"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8099f1d2",
   "metadata": {
    "vscode": {
     "languageId": "plaintext"
    }
   },
   "outputs": [],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "emfd_df = pd.read_csv(\"/topic_modelling/eMFD_wordlist.csv\")\n",
    "emfd_df.fillna(0, inplace=True)\n",
    "\n",
    "emfd_dict_english = {}\n",
    "for _, row in emfd_df.iterrows():\n",
    "    word = row['word']\n",
    "    emfd_dict_english[word] = row.drop('word').to_dict()\n",
    "\n",
    "emfd_dict_chinese, emfd_dict_german, emfd_dict_hindi, emfd_dict_spanish, emfd_dict_urdu = {}, {}, {}, {}, {}\n",
    "\n",
    "for key,value in tqdm(emfd_dict_english.items()):\n",
    "    tr_key = translate_from_english_seamless(key, target_lang=\"chinese\")[0]\n",
    "    emfd_dict_chinese[tr_key] = value\n",
    "\n",
    "    tr_key = translate_from_english_seamless(key, target_lang=\"german\")[0]\n",
    "    emfd_dict_german[tr_key] = value\n",
    "\n",
    "    tr_key = translate_from_english_seamless(key, target_lang=\"hindi\")[0]\n",
    "    emfd_dict_hindi[tr_key] = value\n",
    "\n",
    "    tr_key = translate_from_english_seamless(key, target_lang=\"spanish\")[0]\n",
    "    emfd_dict_spanish[tr_key] = value\n",
    "\n",
    "    tr_key = translate_from_english_seamless(key, target_lang=\"urdu\")[0]\n",
    "    emfd_dict_urdu[tr_key] = value\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cab287ce",
   "metadata": {
    "vscode": {
     "languageId": "plaintext"
    }
   },
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
    "with open(\"MoralDicts/emfd_dict_english_seamless.pkl\", \"wb\") as f:\n",
    "    pickle.dump(emfd_dict_english, f)\n",
    "with open(\"MoralDicts/emfd_dict_chinese_seamless.pkl\", \"wb\") as f:\n",
    "    pickle.dump(emfd_dict_chinese, f)\n",
    "with open(\"MoralDicts/emfd_dict_german_seamless.pkl\", \"wb\") as f:\n",
    "    pickle.dump(emfd_dict_german, f)\n",
    "with open(\"MoralDicts/emfd_dict_hindi_seamless.pkl\", \"wb\") as f:\n",
    "    pickle.dump(emfd_dict_hindi, f)\n",
    "with open(\"MoralDicts/emfd_dict_spanish_seamless.pkl\", \"wb\") as f:\n",
    "    pickle.dump(emfd_dict_spanish, f)\n",
    "with open(\"MoralDicts/emfd_dict_urdu_seamless.pkl\", \"wb\") as f:\n",
    "    pickle.dump(emfd_dict_urdu, f)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fd7e2eff",
   "metadata": {},
   "source": [
    "### Verifying the translations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f0286fac",
   "metadata": {
    "vscode": {
     "languageId": "plaintext"
    }
   },
   "outputs": [],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "emfd_df = pd.read_csv(\"/topic_modelling/eMFD_wordlist.csv\")\n",
    "emfd_df.fillna(0, inplace=True)\n",
    "\n",
    "emfd_dict_english = {}\n",
    "for _, row in emfd_df.iterrows():\n",
    "    word = row['word']\n",
    "    emfd_dict_english[word] = row.drop('word').to_dict()\n",
    "\n",
    "eng_keys = list(emfd_dict_english.keys())\n",
    "batch_size = 8\n",
    "\n",
    "import torch\n",
    "\n",
    "chinese_keys = []\n",
    "for i in tqdm(range(0, len(eng_keys), batch_size)):\n",
    "    batch = eng_keys[i:i+batch_size]\n",
    "    chinese_keys.extend(translate_from_english_seamless(batch, target_lang=\"chinese\"))\n",
    "torch.cuda.empty_cache()\n",
    "\n",
    "german_keys = []\n",
    "for i in tqdm(range(0, len(eng_keys), batch_size)):\n",
    "    batch = eng_keys[i:i+batch_size]\n",
    "    german_keys.extend(translate_from_english_seamless(batch, target_lang=\"german\"))\n",
    "torch.cuda.empty_cache()\n",
    "\n",
    "hindi_keys = []\n",
    "for i in tqdm(range(0, len(eng_keys), batch_size)):\n",
    "    batch = eng_keys[i:i+batch_size]\n",
    "    hindi_keys.extend(translate_from_english_seamless(batch, target_lang=\"hindi\"))\n",
    "torch.cuda.empty_cache()\n",
    "\n",
    "spanish_keys = []\n",
    "for i in tqdm(range(0, len(eng_keys), batch_size)):\n",
    "    batch = eng_keys[i:i+batch_size]\n",
    "    spanish_keys.extend(translate_from_english_seamless(batch, target_lang=\"spanish\"))\n",
    "torch.cuda.empty_cache()\n",
    "\n",
    "urdu_keys = []\n",
    "for i in tqdm(range(0, len(eng_keys), batch_size)):\n",
    "    batch = eng_keys[i:i+batch_size]\n",
    "    urdu_keys.extend(translate_from_english_seamless(batch, target_lang=\"urdu\"))\n",
    "torch.cuda.empty_cache()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b6e30801",
   "metadata": {
    "vscode": {
     "languageId": "plaintext"
    }
   },
   "outputs": [],
   "source": [
    "batch_size = 8\n",
    "\n",
    "chinese_keys_back = []\n",
    "for i in tqdm(range(0, len(chinese_keys), batch_size)):\n",
    "    batch = chinese_keys[i:i+batch_size]\n",
    "    chinese_keys_back.extend(translate_to_english_seamless(batch, source_lang=\"chinese\"))\n",
    "torch.cuda.empty_cache()\n",
    "\n",
    "german_keys_back = []\n",
    "for i in tqdm(range(0, len(german_keys), batch_size)):\n",
    "    batch = german_keys[i:i+batch_size]\n",
    "    german_keys_back.extend(translate_to_english_seamless(batch, source_lang=\"german\"))\n",
    "torch.cuda.empty_cache()\n",
    "\n",
    "hindi_keys_back = []\n",
    "for i in tqdm(range(0, len(hindi_keys), batch_size)):\n",
    "    batch = hindi_keys[i:i+batch_size]\n",
    "    hindi_keys_back.extend(translate_to_english_seamless(batch, source_lang=\"hindi\"))\n",
    "torch.cuda.empty_cache()\n",
    "\n",
    "spanish_keys_back = []\n",
    "for i in tqdm(range(0, len(spanish_keys), batch_size)):\n",
    "    batch = spanish_keys[i:i+batch_size]\n",
    "    spanish_keys_back.extend(translate_to_english_seamless(batch, source_lang=\"spanish\"))\n",
    "torch.cuda.empty_cache()\n",
    "\n",
    "urdu_keys_back = []\n",
    "for i in tqdm(range(0, len(urdu_keys), batch_size)):\n",
    "    batch = urdu_keys[i:i+batch_size]\n",
    "    urdu_keys_back.extend(translate_to_english_seamless(batch, source_lang=\"urdu\"))\n",
    "torch.cuda.empty_cache()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "52e01621",
   "metadata": {
    "vscode": {
     "languageId": "plaintext"
    }
   },
   "outputs": [],
   "source": [
    "from sentence_transformers import SentenceTransformer, util\n",
    "import pandas as pd\n",
    "\n",
    "model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')\n",
    "\n",
    "results = []\n",
    "for eng_key, lang_key, back_key in zip(eng_keys, chinese_keys, chinese_keys_back):\n",
    "    emb1 = model.encode(eng_key, convert_to_tensor=True)\n",
    "    emb2 = model.encode(back_key, convert_to_tensor=True)\n",
    "    similarity = util.cos_sim(emb1, emb2).item()\n",
    "    results.append((eng_key, lang_key, back_key, similarity))\n",
    "df = pd.DataFrame(results, columns=[\n",
    "    'Original English', \n",
    "    'Chinese Translation', \n",
    "    'Back-translated English', \n",
    "    'Cosine Similarity'\n",
    "])\n",
    "df_sorted = df.sort_values(by='Cosine Similarity')\n",
    "print(df_sorted.head(5))\n",
    "print(\"Chinese average: \", sum(df_sorted['Cosine Similarity']) / len(df))\n",
    "\n",
    "results = []\n",
    "for eng_key, lang_key, back_key in zip(eng_keys, german_keys, german_keys_back):\n",
    "    emb1 = model.encode(eng_key, convert_to_tensor=True)\n",
    "    emb2 = model.encode(back_key, convert_to_tensor=True)\n",
    "    similarity = util.cos_sim(emb1, emb2).item()\n",
    "    results.append((eng_key, lang_key, back_key, similarity))\n",
    "df = pd.DataFrame(results, columns=[\n",
    "    'Original English', \n",
    "    'German Translation', \n",
    "    'Back-translated English', \n",
    "    'Cosine Similarity'\n",
    "])\n",
    "df_sorted = df.sort_values(by='Cosine Similarity')\n",
    "print(df_sorted.head(5))\n",
    "print(\"German average: \", sum(df_sorted['Cosine Similarity']) / len(df))\n",
    "\n",
    "results = []\n",
    "for eng_key, lang_key, back_key in zip(eng_keys, hindi_keys, hindi_keys_back):\n",
    "    emb1 = model.encode(eng_key, convert_to_tensor=True)\n",
    "    emb2 = model.encode(back_key, convert_to_tensor=True)\n",
    "    similarity = util.cos_sim(emb1, emb2).item()\n",
    "    results.append((eng_key, lang_key, back_key, similarity))\n",
    "df = pd.DataFrame(results, columns=[\n",
    "    'Original English', \n",
    "    'Hindi Translation', \n",
    "    'Back-translated English', \n",
    "    'Cosine Similarity'\n",
    "])\n",
    "df_sorted = df.sort_values(by='Cosine Similarity')\n",
    "print(df_sorted.head(5))\n",
    "print(\"Hindi average: \", sum(df_sorted['Cosine Similarity']) / len(df))\n",
    "\n",
    "results = []\n",
    "for eng_key, lang_key, back_key in zip(eng_keys, spanish_keys, spanish_keys_back):\n",
    "    emb1 = model.encode(eng_key, convert_to_tensor=True)\n",
    "    emb2 = model.encode(back_key, convert_to_tensor=True)\n",
    "    similarity = util.cos_sim(emb1, emb2).item()\n",
    "    results.append((eng_key, lang_key, back_key, similarity))\n",
    "df = pd.DataFrame(results, columns=[\n",
    "    'Original English', \n",
    "    'Spanish Translation', \n",
    "    'Back-translated English', \n",
    "    'Cosine Similarity'\n",
    "])\n",
    "df_sorted = df.sort_values(by='Cosine Similarity')\n",
    "print(df_sorted.head(5))\n",
    "print(\"Spanish average: \", sum(df_sorted['Cosine Similarity']) / len(df))\n",
    "\n",
    "results = []\n",
    "for eng_key, lang_key, back_key in zip(eng_keys, urdu_keys, urdu_keys_back):\n",
    "    emb1 = model.encode(eng_key, convert_to_tensor=True)\n",
    "    emb2 = model.encode(back_key, convert_to_tensor=True)\n",
    "    similarity = util.cos_sim(emb1, emb2).item()\n",
    "    results.append((eng_key, lang_key, back_key, similarity))\n",
    "df = pd.DataFrame(results, columns=[\n",
    "    'Original English', \n",
    "    'Urdu Translation', \n",
    "    'Back-translated English', \n",
    "    'Cosine Similarity'\n",
    "])\n",
    "df_sorted = df.sort_values(by='Cosine Similarity')\n",
    "print(df_sorted.head(5))\n",
    "print(\"Urdu average: \", sum(df_sorted['Cosine Similarity']) / len(df))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2bcf5ec4",
   "metadata": {
    "vscode": {
     "languageId": "plaintext"
    }
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
