{
 "cells": [
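  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "4kg2vMPSmfOM"
   },
   "outputs": [],
   "source": [
    "# Setup for a fresh runtime (e.g. Colab): uncomment the next line to install\n",
    "# the Hugging Face `datasets` library into the kernel's environment.\n",
    "# %pip install -q datasets"
   ]
  },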
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "lF4E9e0kmoRl"
   },
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "from datasets import load_dataset, Dataset, DatasetDict\n",
    "from huggingface_hub.hf_api import HfFolder\n",
    "\n",
    "# Never paste the access token into the notebook: read it from the environment\n",
    "# instead (export HF_TOKEN=... before starting the kernel).\n",
    "hf_token = os.environ.get('HF_TOKEN', '').strip()\n",
    "if hf_token:\n",
    "    HfFolder.save_token(hf_token)\n",
    "else:\n",
    "    # Do not call save_token with an empty value: that would replace any\n",
    "    # previously saved token with an empty one.\n",
    "    print('HF_TOKEN is not set; relying on any previously saved Hugging Face token.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "_TrCv4z-myUO",
    "outputId": "35d58d6a-9d25-400d-f197-9a608646cd6c"
   },
   "outputs": [],
   "source": [
    "# Hub repo ids of the two source corpora, e.g. 'user/dataset-name'.\n",
    "# They are blank in this copy of the notebook and must be filled in before running.\n",
    "WIKI_REPO_ID = ''\n",
    "VANHOC_REPO_ID = ''\n",
    "\n",
    "# Fail fast with an actionable message instead of failing inside the loader.\n",
    "if not (WIKI_REPO_ID and VANHOC_REPO_ID):\n",
    "    raise ValueError('Set WIKI_REPO_ID and VANHOC_REPO_ID to the source dataset repo ids.')\n",
    "\n",
    "wiki_ds = load_dataset(WIKI_REPO_ID)\n",
    "vanhoc_ds = load_dataset(VANHOC_REPO_ID)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "qotrxHj5nYi8",
    "outputId": "3b41600a-6931-49b1-f9b6-0b00f7ff9e8f"
   },
   "outputs": [],
   "source": [
    "# Display the loaded wiki dataset object.\n",
    "wiki_ds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "uSc0f-6DnZT-",
    "outputId": "791a4ce8-a676-4f02-bf6b-37a40343b1bb"
   },
   "outputs": [],
   "source": [
    "# Display the loaded vanhoc dataset object.\n",
    "vanhoc_ds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "ds8qdayynaDd",
    "outputId": "ccbfd0dd-b9f6-4289-ab3e-193f19eecbeb"
   },
   "outputs": [],
   "source": [
    "SEED = 42  # fixed seed so the shuffle below gives the same row order on every run\n",
    "\n",
    "# Take the 'text' column of each corpus' train split.\n",
    "wiki_text = wiki_ds['train']['text']\n",
    "vanhoc_text = vanhoc_ds['train']['text']\n",
    "\n",
    "# One frame per corpus, every row tagged with the domain it came from.\n",
    "df_1 = pd.DataFrame({\n",
    "    'text': wiki_text,\n",
    "    'domain': ['wiki'] * len(wiki_text)\n",
    "})\n",
    "\n",
    "df_2 = pd.DataFrame({\n",
    "    'text': vanhoc_text,\n",
    "    'domain': ['vanhoc'] * len(vanhoc_text)\n",
    "})\n",
    "\n",
    "# Stack both corpora, shuffle all rows reproducibly, then renumber the index.\n",
    "df = (\n",
    "    pd.concat([df_1, df_2], ignore_index=True)\n",
    "    .sample(frac=1, random_state=SEED)\n",
    "    .reset_index(drop=True)\n",
    ")\n",
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 178
    },
    "id": "t6QVPOx6oawj",
    "outputId": "78276684-b5df-449b-a0c8-eb8286136020"
   },
   "outputs": [],
   "source": [
    "# Number of rows per domain label in the combined frame.\n",
    "df['domain'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 206
    },
    "id": "BIjEnsoloehC",
    "outputId": "6df19a91-5947-4107-a3d4-ef15f30631a8"
   },
   "outputs": [],
   "source": [
    "# Preview the first rows of the combined frame.\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "zuS5O6mgpCB-"
   },
   "outputs": [],
   "source": [
    "# Drop the names of the source datasets and intermediate objects; only `df` is used from here on.\n",
    "# NOTE: this makes the earlier cells that reference these names fail if re-run without reloading.\n",
    "del wiki_ds, vanhoc_ds, wiki_text, vanhoc_text, df_1, df_2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "5MNil6Mkoe_Q"
   },
   "outputs": [],
   "source": [
    "# Hub repo id the merged dataset is pushed to, e.g. 'user/dataset-name'.\n",
    "# It is blank in this copy of the notebook and must be filled in before running.\n",
    "TARGET_REPO_ID = ''\n",
    "\n",
    "# Validate before the frame is converted and deleted, so a missing id is caught up front.\n",
    "if not TARGET_REPO_ID:\n",
    "    raise ValueError('Set TARGET_REPO_ID to the destination dataset repo id.')\n",
    "\n",
    "# preserve_index=False: never store the pandas index as an extra dataset column.\n",
    "ds = Dataset.from_pandas(df, preserve_index=False)\n",
    "ds_dict = DatasetDict({\n",
    "    'train': ds\n",
    "})\n",
    "\n",
    "# ds_dict keeps its own reference to the dataset; drop the frame and the extra name before uploading.\n",
    "del df, ds\n",
    "\n",
    "ds_dict.push_to_hub(TARGET_REPO_ID)"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
