{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import math\n",
    "import pandas as pd\n",
    "from langdetect import detect\n",
    "from tqdm import tqdm\n",
    "\n",
    "df = pd.read_csv(\"lyrics_dataframe.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.rename(columns={\"en\": \"orig\"}, inplace=True)\n",
    "df[\"lang\"] = \"nan\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>artist_name</th>\n",
       "      <th>album_name</th>\n",
       "      <th>year</th>\n",
       "      <th>title</th>\n",
       "      <th>number</th>\n",
       "      <th>orig</th>\n",
       "      <th>fr</th>\n",
       "      <th>lang</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>The Beatles</td>\n",
       "      <td>Beatles For Sale</td>\n",
       "      <td>1964.0</td>\n",
       "      <td>Rock and Roll Music</td>\n",
       "      <td>4.0</td>\n",
       "      <td>chorus\\nJust let me hear some of that rock and...</td>\n",
       "      <td>Laisse moi juste écouter un peu de cette musiq...</td>\n",
       "      <td>nan</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Imagine Dragons</td>\n",
       "      <td>Infinity Blade II [OST]</td>\n",
       "      <td>2011.0</td>\n",
       "      <td>Monster</td>\n",
       "      <td>3.0</td>\n",
       "      <td>Ever since I could remember\\nEverything inside...</td>\n",
       "      <td>Aussi longtemps que je m'en souvienne\\nTout ce...</td>\n",
       "      <td>nan</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>The Beatles</td>\n",
       "      <td>Let It Be</td>\n",
       "      <td>1970.0</td>\n",
       "      <td>I Me Mine</td>\n",
       "      <td>4.0</td>\n",
       "      <td>All through the day I me mine, I me mine, I me...</td>\n",
       "      <td>Tout le jour : je, moi, à moi, je, moi, à moi,...</td>\n",
       "      <td>nan</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       artist_name               album_name    year                title  \\\n",
       "0      The Beatles         Beatles For Sale  1964.0  Rock and Roll Music   \n",
       "1  Imagine Dragons  Infinity Blade II [OST]  2011.0              Monster   \n",
       "2      The Beatles                Let It Be  1970.0            I Me Mine   \n",
       "\n",
       "   number                                               orig  \\\n",
       "0     4.0  chorus\\nJust let me hear some of that rock and...   \n",
       "1     3.0  Ever since I could remember\\nEverything inside...   \n",
       "2     4.0  All through the day I me mine, I me mine, I me...   \n",
       "\n",
       "                                                  fr lang  \n",
       "0  Laisse moi juste écouter un peu de cette musiq...  nan  \n",
       "1  Aussi longtemps que je m'en souvienne\\nTout ce...  nan  \n",
       "2  Tout le jour : je, moi, à moi, je, moi, à moi,...  nan  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Remove small strings (not lyrics)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def replace_short_with_nan(value):\n",
    "    if isinstance(value, str) and len(value) < 100:\n",
    "        return math.nan\n",
    "    else:\n",
    "        return value\n",
    "\n",
    "df['orig'] = df['orig'].apply(replace_short_with_nan)\n",
    "df['fr'] = df['fr'].apply(replace_short_with_nan)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Remove nan rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.dropna(subset=['orig', 'fr'], how='all', inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Put French texts only in the fr column, get the original lang and delete the row if there is no French version at all."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 101508/101508 [17:39<00:00, 95.78it/s] \n"
     ]
    }
   ],
   "source": [
    "fr_fr_idx = []\n",
    "not_fr_idx = []\n",
    "\n",
    "for idx in tqdm(df.index):\n",
    "    fr_lang = \"nan\"\n",
    "    orig_lang = \"nan\"\n",
    "\n",
    "    if isinstance(df['fr'][idx], str):\n",
    "        try:fr_lang = detect(df['fr'][idx])\n",
    "        except:pass\n",
    "    if isinstance(df['orig'][idx], str):\n",
    "        try:orig_lang = detect(df['orig'][idx])\n",
    "        except:pass\n",
    "\n",
    "    if fr_lang != 'fr' and orig_lang == \"fr\":\n",
    "        df.loc[idx, ['fr', 'orig']] = df.loc[idx, ['orig', 'fr']].values\n",
    "        df.loc[idx, 'lang'] = fr_lang\n",
    "    elif fr_lang == 'fr' and orig_lang != \"fr\":\n",
    "        df.loc[idx, 'lang'] = orig_lang\n",
    "    elif fr_lang == \"fr\" and orig_lang == \"fr\":\n",
    "        fr_fr_idx.append(idx)\n",
    "    elif fr_lang != \"fr\" and orig_lang != \"fr\":\n",
    "        not_fr_idx.append(idx)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1655\n",
      "564\n"
     ]
    }
   ],
   "source": [
    "print(len(fr_fr_idx))\n",
    "print(len(not_fr_idx))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df.drop(fr_fr_idx)\n",
    "df = df.drop(not_fr_idx)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>artist_name</th>\n",
       "      <th>album_name</th>\n",
       "      <th>year</th>\n",
       "      <th>title</th>\n",
       "      <th>number</th>\n",
       "      <th>orig</th>\n",
       "      <th>fr</th>\n",
       "      <th>lang</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>The Beatles</td>\n",
       "      <td>Beatles For Sale</td>\n",
       "      <td>1964.0</td>\n",
       "      <td>Rock and Roll Music</td>\n",
       "      <td>4.0</td>\n",
       "      <td>chorus\\nJust let me hear some of that rock and...</td>\n",
       "      <td>Laisse moi juste écouter un peu de cette musiq...</td>\n",
       "      <td>en</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Imagine Dragons</td>\n",
       "      <td>Infinity Blade II [OST]</td>\n",
       "      <td>2011.0</td>\n",
       "      <td>Monster</td>\n",
       "      <td>3.0</td>\n",
       "      <td>Ever since I could remember\\nEverything inside...</td>\n",
       "      <td>Aussi longtemps que je m'en souvienne\\nTout ce...</td>\n",
       "      <td>en</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>The Beatles</td>\n",
       "      <td>Let It Be</td>\n",
       "      <td>1970.0</td>\n",
       "      <td>I Me Mine</td>\n",
       "      <td>4.0</td>\n",
       "      <td>All through the day I me mine, I me mine, I me...</td>\n",
       "      <td>Tout le jour : je, moi, à moi, je, moi, à moi,...</td>\n",
       "      <td>en</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>The Beatles</td>\n",
       "      <td>Paperback Writer [Single]</td>\n",
       "      <td>1966.0</td>\n",
       "      <td>Rain</td>\n",
       "      <td>2.0</td>\n",
       "      <td>If the rain comes they run and hide their head...</td>\n",
       "      <td>Quand la pluie arrive, ils courent et protègen...</td>\n",
       "      <td>en</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Imagine Dragons</td>\n",
       "      <td>Divergente 2 - L'insurrection (The Divergent S...</td>\n",
       "      <td>2015.0</td>\n",
       "      <td>Warriors</td>\n",
       "      <td>7.0</td>\n",
       "      <td>Warriors\\nAs a child, you would wait\\nAnd watc...</td>\n",
       "      <td>(Les guerriers\\nEnfant, tu attendais\\nEt garda...</td>\n",
       "      <td>en</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       artist_name                                         album_name    year  \\\n",
       "0      The Beatles                                   Beatles For Sale  1964.0   \n",
       "1  Imagine Dragons                            Infinity Blade II [OST]  2011.0   \n",
       "2      The Beatles                                          Let It Be  1970.0   \n",
       "3      The Beatles                         Paperback Writer [Single]   1966.0   \n",
       "4  Imagine Dragons  Divergente 2 - L'insurrection (The Divergent S...  2015.0   \n",
       "\n",
       "                 title  number  \\\n",
       "0  Rock and Roll Music     4.0   \n",
       "1              Monster     3.0   \n",
       "2            I Me Mine     4.0   \n",
       "3                 Rain     2.0   \n",
       "4             Warriors     7.0   \n",
       "\n",
       "                                                orig  \\\n",
       "0  chorus\\nJust let me hear some of that rock and...   \n",
       "1  Ever since I could remember\\nEverything inside...   \n",
       "2  All through the day I me mine, I me mine, I me...   \n",
       "3  If the rain comes they run and hide their head...   \n",
       "4  Warriors\\nAs a child, you would wait\\nAnd watc...   \n",
       "\n",
       "                                                  fr lang  \n",
       "0  Laisse moi juste écouter un peu de cette musiq...   en  \n",
       "1  Aussi longtemps que je m'en souvienne\\nTout ce...   en  \n",
       "2  Tout le jour : je, moi, à moi, je, moi, à moi,...   en  \n",
       "3  Quand la pluie arrive, ils courent et protègen...   en  \n",
       "4  (Les guerriers\\nEnfant, tu attendais\\nEt garda...   en  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_csv(\"lyrics_dataframe_processed.csv\", index = False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "lyrics",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
