{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sagemaker\n",
    "\n",
    "# Session used for the bucket and role lookups below.\n",
    "sagemaker_session = sagemaker.Session()\n",
    "\n",
    "# Default S3 bucket of that session; the estimator cell builds its checkpoint\n",
    "# prefix from it.\n",
    "bucket = sagemaker_session.default_bucket()\n",
    "\n",
    "# Resolve the execution role through the session created above instead of\n",
    "# letting get_execution_role() construct a second, implicit Session.\n",
    "role = sagemaker.get_execution_role(sagemaker_session)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display the installed SageMaker SDK version.\n",
    "# NOTE(review): the estimator cell uses `train_*` keyword names (e.g.\n",
    "# train_instance_count), which look like the SDK v1 spelling - confirm the\n",
    "# version shown here accepts them.\n",
    "sagemaker.__version__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sagemaker.pytorch import PyTorch\n",
    "from sagemaker.tuner import HyperparameterTuner\n",
    "from sagemaker.parameter import IntegerParameter, CategoricalParameter\n",
    "\n",
    "import os\n",
    "import uuid\n",
    "\n",
    "# Metric definitions: each regex's capture group is the value reported for that metric.\n",
    "# Raw strings keep the backslashes literal instead of relying on Python passing\n",
    "# unknown escapes such as '\\s' through (a DeprecationWarning since Python 3.6).\n",
    "metrics = [\n",
    "    {'Name': 'top1-accuracy', 'Regex': r'\\*\\sAcc@1\\s(.*)\\sAcc@5'},\n",
    "    {'Name': 'top5-accuracy', 'Regex': r'\\*\\sAcc@1\\s.*\\sAcc@5\\s(.*)'},\n",
    "]\n",
    "\n",
    "instances = 1\n",
    "processes = 8\n",
    "\n",
    "one_day_in_seconds = 86400\n",
    "max_run = one_day_in_seconds * 5  # used for both train_max_run and train_max_wait\n",
    "\n",
    "base_job_name = 'rigl'\n",
    "uu = uuid.uuid1()  # unique id that namespaces this run's checkpoint prefix\n",
    "print('using uuid:', uu)\n",
    "\n",
    "# NOTE(review): every training job the tuner launches is built from this one\n",
    "# estimator, so they presumably all share this checkpoint prefix - confirm the\n",
    "# training script cannot resume from another trial's checkpoint.\n",
    "checkpoint_s3_uri = 's3://%s/checkpoints/%s/%s/' % (bucket, base_job_name, uu)\n",
    "\n",
    "estimator = PyTorch(\n",
    "    entry_point='train_imagenet_rigl.py',\n",
    "    source_dir='..',\n",
    "    role=role,\n",
    "    framework_version='1.4.0',\n",
    "    py_version='py3',\n",
    "    checkpoint_s3_uri=checkpoint_s3_uri,\n",
    "    base_job_name=base_job_name,\n",
    "\n",
    "    # Instances Setup\n",
    "    train_instance_count=instances,\n",
    "    train_instance_type='ml.p3.16xlarge',\n",
    "    train_use_spot_instances=True,\n",
    "    train_max_wait=max_run,\n",
    "    train_max_run=max_run,\n",
    "    train_volume_size=300,\n",
    "\n",
    "    # Static hyperparameters. 'alpha', 'delta' and 'grad-accumulation-n' stay\n",
    "    # commented out here because the tuner below supplies them as search ranges.\n",
    "    hyperparameters={\n",
    "        # 'multiprocessing-distributed': 1, # separate process per GPU\n",
    "        'data': '/opt/ml/input/data/training',\n",
    "        'run-extract-script': 1, # if you are using the 1000 .tar file s3 bucket for imagenet, use this = 1. otherwise, use = 0\n",
    "        'output-dir': '/opt/ml/model',\n",
    "        # 'checkpoint-dir': '/opt/ml/checkpoints/',\n",
    "        'arch': 'resnet50',\n",
    "        'workers': 40,\n",
    "        'dense-allocation': 0.1,\n",
    "        'static-topo': 0,\n",
    "        'T-end-epochs': 100, # simulate epochs = 100 for T_end\n",
    "        # 'alpha': 0.3,\n",
    "        # 'delta': 400,\n",
    "        # 'grad-accumulation-n': 4, # if using a smaller batch size, this may be useful\n",
    "        'batch-size': 1024 // instances, # batch size per instance\n",
    "        'eval-batch-size': 1024,\n",
    "        'lr': 0.1,\n",
    "        # 'lr-warmup-end': 5,\n",
    "        'lr-scaling-stop': 91,\n",
    "        'epochs': 3,\n",
    "    },\n",
    "    metric_definitions=metrics,\n",
    ")\n",
    "\n",
    "# Search over delta / alpha / grad-accumulation-n, maximizing 'top1-accuracy'\n",
    "# as extracted by the first entry of `metrics`.\n",
    "tuner = HyperparameterTuner(\n",
    "    estimator,\n",
    "    base_tuning_job_name='rigl-tuning',\n",
    "    # estimator_name='%s-tuning' % base_job_name,\n",
    "    objective_metric_name='top1-accuracy',\n",
    "    objective_type='Maximize',\n",
    "    max_jobs=100,\n",
    "    hyperparameter_ranges={\n",
    "        'delta': IntegerParameter(100, 1000),\n",
    "        'alpha': CategoricalParameter(['0.3', '0.5']),\n",
    "        'grad-accumulation-n': IntegerParameter(1, 95),\n",
    "    },\n",
    "    max_parallel_jobs=8,\n",
    "    early_stopping_type='Auto',\n",
    "    metric_definitions=metrics,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Start the tuning job, passing a single S3 URI as the training input.\n",
    "# NOTE(review): presumably this is exposed to the container as the 'training'\n",
    "# channel, matching the 'data' hyperparameter (/opt/ml/input/data/training) - confirm.\n",
    "tuner.fit('s3://imagenet-compressed-oregon') # use imagenet s3 bucket"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_pytorch_p36",
   "language": "python",
   "name": "conda_pytorch_p36"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  },
  "notice": "Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.  Licensed under the Apache License, Version 2.0 (the \"License\"). You may not use this file except in compliance with the License. A copy of the License is located at http://aws.amazon.com/apache2.0/ or in the \"license\" file accompanying this file. This file is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License."
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
