{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator\n",
    "from collections import defaultdict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import matplotlib.font_manager\n",
    "import matplotlib.image as mpimg\n",
    "\n",
    "# List the font families matplotlib can resolve, so the 'font.family' choice below can be checked.\n",
    "# A set collapses family names that occur more than once in ttflist.\n",
    "font_names = sorted({f.name for f in matplotlib.font_manager.fontManager.ttflist})\n",
    "print(f\"available fonts: {font_names}\")\n",
    "\n",
    "# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later releases dropped\n",
    "# the old names, so pick whichever spelling this installation actually provides.\n",
    "for style_name in (\"seaborn-v0_8-muted\", \"seaborn-muted\"):\n",
    "    if style_name in plt.style.available:\n",
    "        plt.style.use(style_name)\n",
    "        break\n",
    "else:\n",
    "    print(\"seaborn 'muted' style not found; keeping the current matplotlib style\")\n",
    "\n",
    "# Apply all notebook-wide plotting defaults in one place.\n",
    "plt.rcParams.update(\n",
    "    {\n",
    "        # Resolution, file format and margins for displayed / saved figures.\n",
    "        \"figure.dpi\": 150,\n",
    "        \"savefig.dpi\": 300,\n",
    "        \"savefig.format\": \"pdf\",\n",
    "        \"savefig.bbox\": \"tight\",\n",
    "        \"savefig.pad_inches\": 0.1,\n",
    "        # Font family and base text sizes.\n",
    "        \"figure.titlesize\": 18,\n",
    "        \"axes.titlesize\": 18,\n",
    "        \"font.family\": \"Helvetica\",\n",
    "        \"font.size\": 18,\n",
    "        # Line widths, axis labels, ticks and legend text.\n",
    "        \"lines.linewidth\": 2,\n",
    "        \"axes.labelsize\": 16,\n",
    "        \"axes.labelweight\": \"bold\",\n",
    "        \"xtick.labelsize\": 16,\n",
    "        \"ytick.labelsize\": 16,\n",
    "        \"legend.fontsize\": 16,\n",
    "        \"axes.linewidth\": 2,\n",
    "        \"axes.titlepad\": 6,\n",
    "        # Math text fonts, no line markers, legends drawn without a frame.\n",
    "        \"mathtext.fontset\": \"dejavuserif\",\n",
    "        \"mathtext.it\": \"serif:italic\",\n",
    "        \"lines.marker\": \"\",\n",
    "        \"legend.frameon\": False,\n",
    "    }\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the steps file; every line becomes one entry with surrounding whitespace removed.\n",
    "with open(\"/data/vtt/wikihow/wikihow_steps.txt\") as f:\n",
    "    steps = [line.strip() for line in f]\n",
    "print(f\"Total steps: {len(steps)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split every step on whitespace once, then derive both aggregates from that result.\n",
    "step_tokens = [step.split() for step in steps]\n",
    "\n",
    "# Flat list of every token across all steps.\n",
    "words_all = [token for tokens in step_tokens for token in tokens]\n",
    "\n",
    "# Number of whitespace-separated tokens per step, in the same order as `steps`.\n",
    "length = [len(tokens) for tokens in step_tokens]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Join every token into one lowercase string to feed the word cloud.\n",
    "text = \" \".join(words_all).lower()\n",
    "\n",
    "# Build a 1000x500 cloud on a white background, using the stop-word list shipped with wordcloud.\n",
    "wordcloud = WordCloud(\n",
    "    width=1000,\n",
    "    height=500,\n",
    "    stopwords=STOPWORDS,\n",
    "    collocations=True,\n",
    "    background_color=\"white\",\n",
    ").generate(text)\n",
    "\n",
    "# Draw on a dedicated figure so this image never shares axes with plots made in other cells.\n",
    "fig, ax = plt.subplots()\n",
    "ax.imshow(wordcloud, interpolation=\"bilinear\")\n",
    "ax.axis(\"off\")\n",
    "# The explicit .png name and dpi are passed here rather than relying on the savefig defaults.\n",
    "fig.savefig(\"wikihow_wordcloud.png\", dpi=300)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def list2count(_list):\n",
    "    \"\"\"Count how often each distinct value occurs in ``_list``.\n",
    "\n",
    "    Args:\n",
    "        _list: Iterable of hashable values that can be ordered against each other.\n",
    "\n",
    "    Returns:\n",
    "        A plain ``dict`` mapping each value to its number of occurrences,\n",
    "        with keys inserted in ascending order.\n",
    "    \"\"\"\n",
    "    tally = defaultdict(int)\n",
    "    for item in _list:\n",
    "        tally[item] += 1\n",
    "    # Rebuild as a regular dict so iteration follows ascending key order.\n",
    "    return dict(sorted(tally.items()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# `length` is in file order, so it must be sorted before an index can be read as a quantile;\n",
    "# indexing the unsorted list would just report the length of an arbitrary step.\n",
    "sorted_length = sorted(length)\n",
    "for ratio in [0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 0.999, 0.9999]:\n",
    "    # Index of the element sitting at this fraction of the sorted list.\n",
    "    pos = int(len(sorted_length) * ratio)\n",
    "    print(f\"{ratio * 100}% ({pos}) of the steps have {sorted_length[pos]} words or less\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print the full histogram of step lengths as \"<words per step>: <number of steps>\", shortest first.\n",
    "step_length_counts = list2count(length)\n",
    "for n_words, n_steps in step_length_counts.items():\n",
    "    print(f\"{n_words}: {n_steps}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop the longest 1% of steps so the long tail does not stretch the x-axis.\n",
    "# `length` is in file order, so sort before slicing; slicing the unsorted list would\n",
    "# discard the last 1% of lines in the file instead of the longest ones.\n",
    "cutoff = int(len(length) * 0.99)\n",
    "length_count = list2count(sorted(length)[:cutoff])\n",
    "\n",
    "# Use a dedicated figure so the curve is not drawn on top of an earlier plot.\n",
    "fig, ax = plt.subplots()\n",
    "ax.plot(list(length_count.keys()), list(length_count.values()))\n",
    "ax.set_xlabel(\"length\")\n",
    "ax.set_ylabel(\"Count\")\n",
    "fig.savefig(\"wikihow_length_dist.pdf\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13 (default, Mar 28 2022, 11:38:47) \n[GCC 7.5.0]"
  },
  "vscode": {
   "interpreter": {
    "hash": "949777d72b0d2535278d3dc13498b2535136f6dfe0678499012e853ee9abcab1"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
