{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Main script for running pretrain scripts for BERT\n",
    "\n",
    "Note that for corrupted and non-corrupted pretraining, the only difference is the training/validation data and nothing else! These scripts are directly adapted from HuggingFace public repo with modifications."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# coding=utf-8\n",
    "# Copyright 2020 The HuggingFace Team All rights reserved.\n",
    "#\n",
    "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
    "# you may not use this file except in compliance with the License.\n",
    "# You may obtain a copy of the License at\n",
    "#\n",
    "#     http://www.apache.org/licenses/LICENSE-2.0\n",
    "#\n",
    "# Unless required by applicable law or agreed to in writing, software\n",
    "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
    "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
    "# See the License for the specific language governing permissions and\n",
    "# limitations under the License.\n",
    "\"\"\"\n",
    "Fine-tuning the library models for masked language modeling (BERT, ALBERT, RoBERTa...) on a text file or a dataset.\n",
    "Here is the full list of checkpoints on the hub that can be fine-tuned by this script:\n",
    "https://huggingface.co/models?filter=masked-lm\n",
    "\"\"\"\n",
    "# You can also adapt this script on your own masked language modeling task. Pointers for this are left as comments.\n",
    "\n",
    "import logging\n",
    "import math\n",
    "import os\n",
    "import sys\n",
    "from dataclasses import dataclass, field\n",
    "from typing import Optional\n",
    "\n",
    "from datasets import load_dataset\n",
    "from datasets import DatasetDict\n",
    "\n",
    "import transformers\n",
    "from transformers import (\n",
    "    CONFIG_MAPPING,\n",
    "    MODEL_FOR_MASKED_LM_MAPPING,\n",
    "    AutoConfig,\n",
    "    AutoModelForMaskedLM,\n",
    "    AutoTokenizer,\n",
    "    DataCollatorForLanguageModeling,\n",
    "    HfArgumentParser,\n",
    "    Trainer,\n",
    "    TrainingArguments,\n",
    "    set_seed,\n",
    ")\n",
    "from transformers.trainer_utils import is_main_process\n",
    "\n",
    "\n",
    "logger = logging.getLogger(__name__)\n",
    "MODEL_CONFIG_CLASSES = list(MODEL_FOR_MASKED_LM_MAPPING.keys())\n",
    "MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "@dataclass\n",
    "class ModelArguments:\n",
    "    \"\"\"\n",
    "    Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.\n",
    "    \"\"\"\n",
    "\n",
    "    model_name_or_path: Optional[str] = field(\n",
    "        default=None,\n",
    "        metadata={\n",
    "            \"help\": \"The model checkpoint for weights initialization.\"\n",
    "            \"Don't set if you want to train a model from scratch.\"\n",
    "        },\n",
    "    )\n",
    "    model_type: Optional[str] = field(\n",
    "        default=None,\n",
    "        metadata={\"help\": \"If training from scratch, pass a model type from the list: \" + \", \".join(MODEL_TYPES)},\n",
    "    )\n",
    "    config_name: Optional[str] = field(\n",
    "        default=None, metadata={\"help\": \"Pretrained config name or path if not the same as model_name\"}\n",
    "    )\n",
    "    tokenizer_name: Optional[str] = field(\n",
    "        default=None, metadata={\"help\": \"Pretrained tokenizer name or path if not the same as model_name\"}\n",
    "    )\n",
    "    cache_dir: Optional[str] = field(\n",
    "        default=None,\n",
    "        metadata={\"help\": \"Where do you want to store the pretrained models downloaded from huggingface.co\"},\n",
    "    )\n",
    "    use_fast_tokenizer: bool = field(\n",
    "        default=True,\n",
    "        metadata={\"help\": \"Whether to use one of the fast tokenizer (backed by the tokenizers library) or not.\"},\n",
    "    )\n",
    "    model_revision: str = field(\n",
    "        default=\"main\",\n",
    "        metadata={\"help\": \"The specific model version to use (can be a branch name, tag name or commit id).\"},\n",
    "    )\n",
    "    use_auth_token: bool = field(\n",
    "        default=False,\n",
    "        metadata={\n",
    "            \"help\": \"Will use the token generated when running `transformers-cli login` (necessary to use this script \"\n",
    "            \"with private models).\"\n",
    "        },\n",
    "    )\n",
    "\n",
    "\n",
    "@dataclass\n",
    "class DataTrainingArguments:\n",
    "    \"\"\"\n",
    "    Arguments pertaining to what data we are going to input our model for training and eval.\n",
    "    \"\"\"\n",
    "\n",
    "    dataset_name: Optional[str] = field(\n",
    "        default=None, metadata={\"help\": \"The name of the dataset to use (via the datasets library).\"}\n",
    "    )\n",
    "    dataset_config_name: Optional[str] = field(\n",
    "        default=None, metadata={\"help\": \"The configuration name of the dataset to use (via the datasets library).\"}\n",
    "    )\n",
    "    train_file: Optional[str] = field(default=None, metadata={\"help\": \"The input training data file (a text file).\"})\n",
    "    validation_file: Optional[str] = field(\n",
    "        default=None,\n",
    "        metadata={\"help\": \"An optional input evaluation data file to evaluate the perplexity on (a text file).\"},\n",
    "    )\n",
    "    overwrite_cache: bool = field(\n",
    "        default=False, metadata={\"help\": \"Overwrite the cached training and evaluation sets\"}\n",
    "    )\n",
    "    validation_split_percentage: Optional[int] = field(\n",
    "        default=5,\n",
    "        metadata={\n",
    "            \"help\": \"The percentage of the train set used as validation set in case there's no validation split\"\n",
    "        },\n",
    "    )\n",
    "    max_seq_length: Optional[int] = field(\n",
    "        default=None,\n",
    "        metadata={\n",
    "            \"help\": \"The maximum total input sequence length after tokenization. Sequences longer \"\n",
    "            \"than this will be truncated.\"\n",
    "        },\n",
    "    )\n",
    "    preprocessing_num_workers: Optional[int] = field(\n",
    "        default=None,\n",
    "        metadata={\"help\": \"The number of processes to use for the preprocessing.\"},\n",
    "    )\n",
    "    mlm_probability: float = field(\n",
    "        default=0.15, metadata={\"help\": \"Ratio of tokens to mask for masked language modeling loss\"}\n",
    "    )\n",
    "    line_by_line: bool = field(\n",
    "        default=False,\n",
    "        metadata={\"help\": \"Whether distinct lines of text in the dataset are to be handled as distinct sequences.\"},\n",
    "    )\n",
    "    pad_to_max_length: bool = field(\n",
    "        default=False,\n",
    "        metadata={\n",
    "            \"help\": \"Whether to pad all samples to `max_seq_length`. \"\n",
    "            \"If False, will pad the samples dynamically when batching to the maximum length in the batch.\"\n",
    "        },\n",
    "    )\n",
    "\n",
    "    def __post_init__(self):\n",
    "        if self.dataset_name is None and self.train_file is None and self.validation_file is None:\n",
    "            raise ValueError(\"Need either a dataset name or a training/validation file.\")\n",
    "        else:\n",
    "            pass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def main():\n",
    "    # See all possible arguments in src/transformers/training_args.py\n",
    "    # or by passing the --help flag to this script.\n",
    "    # We now keep distinct sets of args, for a cleaner separation of concerns.\n",
    "\n",
    "    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))\n",
    "    \n",
    "    if len(sys.argv) == 2 and sys.argv[1].endswith(\".json\"):\n",
    "        # If we pass only one argument to the script and it's the path to a json file,\n",
    "        # let's parse it to get our arguments.\n",
    "        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))\n",
    "    else:\n",
    "        model_args, data_args, training_args = parser.parse_args_into_dataclasses()\n",
    "\n",
    "    if (\n",
    "        os.path.exists(training_args.output_dir)\n",
    "        and os.listdir(training_args.output_dir)\n",
    "        and training_args.do_train\n",
    "        and not training_args.overwrite_output_dir\n",
    "    ):\n",
    "        raise ValueError(\n",
    "            f\"Output directory ({training_args.output_dir}) already exists and is not empty.\"\n",
    "            \"Use --overwrite_output_dir to overcome.\"\n",
    "        )\n",
    "\n",
    "    # Setup logging\n",
    "    logging.basicConfig(\n",
    "        format=\"%(asctime)s - %(levelname)s - %(name)s -   %(message)s\",\n",
    "        datefmt=\"%m/%d/%Y %H:%M:%S\",\n",
    "        level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,\n",
    "    )\n",
    "\n",
    "    # Log on each process the small summary:\n",
    "    logger.warning(\n",
    "        f\"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}\"\n",
    "        + f\"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}\"\n",
    "    )\n",
    "    # Set the verbosity to info of the Transformers logger (on main process only):\n",
    "    if is_main_process(training_args.local_rank):\n",
    "        transformers.utils.logging.set_verbosity_info()\n",
    "        transformers.utils.logging.enable_default_handler()\n",
    "        transformers.utils.logging.enable_explicit_format()\n",
    "    logger.info(\"Training/evaluation parameters %s\", training_args)\n",
    "\n",
    "    # Set seed before initializing model.\n",
    "    set_seed(training_args.seed)\n",
    "\n",
    "    # load from pre-processed wikitext data files\n",
    "    if data_args.train_file is None:\n",
    "        raise ValueError(\"This code requires a training/validation file.\")\n",
    "    if \"corrupted\" in data_args.train_file:\n",
    "        print(\"****************************************\")\n",
    "        print(\"*                                      *\")\n",
    "        print(\"* you are working with corrupted BERT! *\")\n",
    "        print(\"*                                      *\")\n",
    "        print(\"****************************************\")\n",
    "    datasets = DatasetDict.load_from_disk(data_args.train_file)\n",
    "\n",
    "    # Load pretrained model and tokenizer\n",
    "    #\n",
    "    # Distributed training:\n",
    "    # The .from_pretrained methods guarantee that only one local process can concurrently\n",
    "    # download model & vocab.\n",
    "    config_kwargs = {\n",
    "        \"cache_dir\": model_args.cache_dir,\n",
    "        \"revision\": model_args.model_revision,\n",
    "        \"use_auth_token\": True if model_args.use_auth_token else None,\n",
    "    }\n",
    "    if model_args.config_name:\n",
    "        config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs)\n",
    "    elif model_args.model_name_or_path:\n",
    "        config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs)\n",
    "    else:\n",
    "        config = CONFIG_MAPPING[model_args.model_type]()\n",
    "        logger.warning(\"You are instantiating a new config instance from scratch.\")\n",
    "\n",
    "    # this is our own tokenizer\n",
    "    TOKENIZER_MAPPING = dict()\n",
    "    TOKENIZER_MAPPING[\"bert\"] = 'bert-base-uncased'\n",
    "    tokenizer = AutoTokenizer.from_pretrained(\n",
    "        TOKENIZER_MAPPING[model_args.model_type],\n",
    "        use_fast=False,\n",
    "        cache_dir=model_args.cache_dir\n",
    "    )\n",
    "\n",
    "    if model_args.model_name_or_path:\n",
    "        model = AutoModelForMaskedLM.from_pretrained(\n",
    "            model_args.model_name_or_path,\n",
    "            from_tf=bool(\".ckpt\" in model_args.model_name_or_path),\n",
    "            config=config,\n",
    "            cache_dir=model_args.cache_dir,\n",
    "            revision=model_args.model_revision,\n",
    "            use_auth_token=True if model_args.use_auth_token else None,\n",
    "        )\n",
    "    else:\n",
    "        logger.info(\"Training new model from scratch\")\n",
    "        model = AutoModelForMaskedLM.from_config(config)\n",
    "\n",
    "    model.resize_token_embeddings(len(tokenizer))\n",
    "\n",
    "    # Preprocessing the datasets.\n",
    "    # First we tokenize all the texts.\n",
    "    if training_args.do_train:\n",
    "        column_names = datasets[\"train\"].column_names\n",
    "    else:\n",
    "        column_names = datasets[\"validation\"].column_names\n",
    "    text_column_name = \"text\" if \"text\" in column_names else column_names[0]\n",
    "\n",
    "    if data_args.line_by_line:\n",
    "        # When using line_by_line, we just tokenize each nonempty line.\n",
    "        padding = \"max_length\" if data_args.pad_to_max_length else False\n",
    "\n",
    "        print(\"****************************************\")\n",
    "        print(\"*                                      *\")\n",
    "        print(\"*  you are parsing line by line here!  *\")\n",
    "        print(\"*                                      *\")\n",
    "        print(\"****************************************\")\n",
    "        \n",
    "        def tokenize_function(examples):\n",
    "            # Remove empty lines\n",
    "            examples[\"text\"] = [line for line in examples[\"text\"] if len(line) > 0 and not line.isspace()]\n",
    "            return tokenizer(\n",
    "                examples[\"text\"],\n",
    "                padding=padding,\n",
    "                truncation=True,\n",
    "                max_length=data_args.max_seq_length,\n",
    "                # We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it\n",
    "                # receives the `special_tokens_mask`.\n",
    "                return_special_tokens_mask=True,\n",
    "            )\n",
    "\n",
    "        tokenized_datasets = datasets.map(\n",
    "            tokenize_function,\n",
    "            batched=True,\n",
    "            num_proc=data_args.preprocessing_num_workers,\n",
    "            remove_columns=[text_column_name],\n",
    "            load_from_cache_file=not data_args.overwrite_cache,\n",
    "        )\n",
    "    else:\n",
    "        raise ValueError(\"I think parsing it line by line will be preferred. Please use --line_by_line.\")\n",
    "\n",
    "    # Data collator\n",
    "    # This one will take care of randomly masking the tokens.\n",
    "    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)\n",
    "\n",
    "    # Initialize our Trainer\n",
    "    trainer = Trainer(\n",
    "        model=model,\n",
    "        args=training_args,\n",
    "        train_dataset=tokenized_datasets[\"train\"] if training_args.do_train else None,\n",
    "        eval_dataset=tokenized_datasets[\"validation\"] if training_args.do_eval else None,\n",
    "        tokenizer=tokenizer,\n",
    "        data_collator=data_collator,\n",
    "    )\n",
    "\n",
    "    # Training\n",
    "    if training_args.do_train:\n",
    "        model_path = (\n",
    "            model_args.model_name_or_path\n",
    "            if (model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path))\n",
    "            else None\n",
    "        )\n",
    "        train_result = trainer.train(model_path=model_path)\n",
    "        trainer.save_model()  # Saves the tokenizer too for easy upload\n",
    "\n",
    "        output_train_file = os.path.join(training_args.output_dir, \"train_results.txt\")\n",
    "        if trainer.is_world_process_zero():\n",
    "            with open(output_train_file, \"w\") as writer:\n",
    "                logger.info(\"***** Train results *****\")\n",
    "                for key, value in sorted(train_result.metrics.items()):\n",
    "                    logger.info(f\"  {key} = {value}\")\n",
    "                    writer.write(f\"{key} = {value}\\n\")\n",
    "\n",
    "            # Need to save the state, since Trainer.save_model saves only the tokenizer with the model\n",
    "            trainer.state.save_to_json(os.path.join(training_args.output_dir, \"trainer_state.json\"))\n",
    "\n",
    "    # Evaluation\n",
    "    results = {}\n",
    "    if training_args.do_eval:\n",
    "        logger.info(\"*** Evaluate ***\")\n",
    "\n",
    "        eval_output = trainer.evaluate()\n",
    "\n",
    "        perplexity = math.exp(eval_output[\"eval_loss\"])\n",
    "        results[\"perplexity\"] = perplexity\n",
    "\n",
    "        output_eval_file = os.path.join(training_args.output_dir, \"eval_results_mlm.txt\")\n",
    "        if trainer.is_world_process_zero():\n",
    "            with open(output_eval_file, \"w\") as writer:\n",
    "                logger.info(\"***** Eval results *****\")\n",
    "                for key, value in sorted(results.items()):\n",
    "                    logger.info(f\"  {key} = {value}\")\n",
    "                    writer.write(f\"{key} = {value}\\n\")\n",
    "\n",
    "    return results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if __name__ == \"__main__\":\n",
    "    main()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# datasets = load_dataset(\"wikitext\", \"wikitext-103-v1\", cache_dir=\"../.huggingface_cache/\")\n",
    "# print(len(datasets[\"train\"]))\n",
    "# datasets[\"train\"] = datasets[\"train\"].select(range(int(len(datasets[\"train\"])*0.15)))\n",
    "# datasets.save_to_disk(\"../data_files/wikitext-15M\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
