{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import xml.etree.ElementTree as ET\n",
    "\n",
    "import pandas as pd\n",
    "from datasets import Dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Source file (ELRC-SHARE, Parallel Global Voices English-French, processed): https://elrc-share.eu/repository/browse/parallel-global-voices-english-french-processed/7cbda8bc522d11e9a7e100155d026706614fcb572b9841ee8215cf4dd75eb54d/#"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse the TMX export and collect the aligned English/French segments.\n",
    "# The parse call has to be live: with it commented out, `tree` is undefined on a\n",
    "# fresh kernel and this cell fails with a NameError under Restart & Run All.\n",
    "tree = ET.parse(\"output_ILSP-FC_eng-fra.tmx\")\n",
    "root = tree.getroot()\n",
    "\n",
    "en_text = []\n",
    "fr_text = []\n",
    "\n",
    "# `xml:lang` belongs to the reserved XML namespace, so the `xml` prefix is mapped\n",
    "# explicitly for ElementTree to resolve it inside the XPath predicates below.\n",
    "xml_ns = {'xml': 'http://www.w3.org/XML/1998/namespace'}\n",
    "\n",
    "for tu in root.findall(\".//tu\"):\n",
    "    en_segment = tu.find(\".//tuv[@xml:lang='en']/seg\", namespaces=xml_ns)\n",
    "    fr_segment = tu.find(\".//tuv[@xml:lang='fr']/seg\", namespaces=xml_ns)\n",
    "\n",
    "    # Keep a pair only when both languages are present, so the two lists stay\n",
    "    # index-aligned (en_text[i] is the translation of fr_text[i]).\n",
    "    if en_segment is not None and fr_segment is not None:\n",
    "        en_text.append(en_segment.text)\n",
    "        fr_text.append(fr_segment.text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "342060"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Pair the English and French segment lists as two columns, then show the row count.\n",
    "df = pd.DataFrame(\n",
    "    {'en': en_text, 'fr': fr_text}\n",
    ")\n",
    "len(df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Push the dataset to the Hugging Face Hub"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Creating parquet from Arrow format: 100%|██████████| 343/343 [00:01<00:00, 339.22ba/s]\n",
      "Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [01:06<00:00, 66.25s/it]\n",
      "Downloading metadata: 100%|██████████| 27.0/27.0 [00:00<00:00, 95.1kB/s]\n"
     ]
    }
   ],
   "source": [
    "# Convert the DataFrame into a Hugging Face Dataset and upload it to the Hub.\n",
    "# The upload needs Hub credentials with write access to the target repository.\n",
    "dataset = Dataset.from_pandas(df)\n",
    "dataset.push_to_hub(\"Nicolas-BZRD/Parallel_Global_Voices_English_French\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "web_scraped",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
