{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "3ca7cebb",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# Directory prefix prepended to every video name in the generated label files\n",
    "root_path = \"rgb\"\n",
    "\n",
    "\n",
    "def write_label_file(annotation_path, output_path, root=root_path):\n",
    "    \"\"\"Convert an annotation JSON file into a `<path> <label>` text file.\n",
    "\n",
    "    Args:\n",
    "        annotation_path: JSON file holding an iterable of records, each with\n",
    "            a `vid_name` and a `label` entry.\n",
    "        output_path: Text file to (over)write; one line per record.\n",
    "        root: Prefix joined to `vid_name` with a `/`.\n",
    "    \"\"\"\n",
    "    with open(annotation_path, 'r') as src:\n",
    "        records = json.load(src)\n",
    "    with open(output_path, 'w') as dst:\n",
    "        dst.writelines(\n",
    "            f\"{root}/{record['vid_name']} {record['label']}\\n\" for record in records\n",
    "        )\n",
    "\n",
    "\n",
    "# Training split\n",
    "write_label_file('/svl/data/kinetics-400/diving48/Diving48_V2_train.json',\n",
    "                 'diving48_video_train_labels.txt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "a2484813",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The Diving48 test annotations are written out as the validation label file\n",
    "write_label_file('/svl/data/kinetics-400/diving48/Diving48_V2_test.json',\n",
    "                 'diving48_video_val_labels.txt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "7fd69215",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n"
     ]
    }
   ],
   "source": [
    "# NOTE(review): `partial` is not referenced by any cell visible in this notebook\n",
    "from functools import partial\n",
    "# autoreload mode 2 re-imports edited modules before each cell runs, so changes to local project code are picked up without a kernel restart\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "76d640d4",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "# Put the parent directory on the import path; presumably that is where the local `diving48` and `datasets` modules live (verify)\n",
    "sys.path.append(\"../\")\n",
    "import diving48\n",
    "from datasets import get_dataloaders, get_toy_dataloader\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "d9e1330c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "_StoreAction(option_strings=['--classification_layer_dropout'], dest='classification_layer_dropout', nargs=None, const=None, default=0.5, type=<class 'float'>, choices=None, required=False, help=None, metavar=None)"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from argparse import ArgumentParser\n",
    "\n",
    "\n",
    "def str2bool(value):\n",
    "    \"\"\"Parse a flag value: only the text 'true' (any letter case) yields True.\"\"\"\n",
    "    return str(value).lower() == 'true'\n",
    "\n",
    "\n",
    "# Command-line style hyper-parameter parser; the resulting namespace configures the data loaders.\n",
    "# Boolean options take an explicit value (e.g. `--backbone_freeze True`) parsed by `str2bool`.\n",
    "parser = ArgumentParser()\n",
    "# training arguments\n",
    "parser.add_argument('--batch_size', default=512, type=int)\n",
    "parser.add_argument(\"--criterion_name\", type=str, default=\"binary_crossentropy\", choices=[\"binary_crossentropy\"])\n",
    "parser.add_argument('--balance_classes', default=False, type=str2bool)\n",
    "parser.add_argument('--epochs', default=40, type=int)\n",
    "parser.add_argument('--gradient_clip_val', default=1, type=float)\n",
    "parser.add_argument('--gpus', default=1, type=int)\n",
    "parser.add_argument('--num_workers', default=0, type=int)\n",
    "parser.add_argument('--seed', default=None, type=int)\n",
    "parser.add_argument('--checkpoint_every_n_epochs', type=int, default=5)\n",
    "parser.add_argument('--wandb_group', type=str, default=\"latest\")\n",
    "parser.add_argument('--toy_dataloader', default=False, type=str2bool)\n",
    "parser.add_argument('--validate_before_train', default=False, type=str2bool)\n",
    "parser.add_argument('--mode', type=str, default=\"train\", choices=[\"train\", \"eval\"])\n",
    "parser.add_argument('--checkpoint_path', type=str, default=None)\n",
    "parser.add_argument('--included_classes_path', type=str, default=None)\n",
    "parser.add_argument('--log_per_class_acc', default=False, type=str2bool)\n",
    "\n",
    "# Optimizer arguments\n",
    "parser.add_argument('--lr', default=5e-4, type=float)\n",
    "parser.add_argument('--min_lr', type=float, default=1e-6, metavar='LR',\n",
    "                    help='lower lr bound for cyclic schedulers that hit 0 (default: 1e-6)')\n",
    "parser.add_argument('--warmup_epochs', type=int, default=5, metavar='N',\n",
    "                    help='epochs to warmup LR, if scheduler supports')\n",
    "parser.add_argument('--warmup_steps', type=int, default=-1, metavar='N',\n",
    "                    help='num of steps to warmup LR, will overload warmup_epochs if set > 0')\n",
    "parser.add_argument('--wd', default=0.05, type=float,\n",
    "                    help=\"Weight decay (will use Adam if set to 0, AdamW otherwise).\")\n",
    "parser.add_argument('--gradient_accumulation_steps', default=1, type=int)\n",
    "parser.add_argument('--adam_betas', nargs='+', type=float, default=(0.9, 0.999), help='Adam betas')\n",
    "parser.add_argument('--adam_eps', type=float, default=1e-8, help='Adam epsilon')\n",
    "parser.add_argument('--backbone_lr', type=float, default=-1, help='backbone learning rate (if -1 uses model lr)')\n",
    "parser.add_argument('--min_backbone_lr', type=float, default=-1, help='backbone min learning rate (if -1 uses model min lr)')\n",
    "\n",
    "\n",
    "# Augmentation params\n",
    "parser.add_argument('--smoothing', type=float, default=0.1, help='Label smoothing (default: 0.1) - only works if mixup is enabled')\n",
    "parser.add_argument('--num_aug_sample', type=int, default=2,\n",
    "                    help='Repeated_aug (default: 2)')\n",
    "parser.add_argument('--aa', type=str, default='rand-m7-n4-mstd0.5-inc1', metavar='NAME',\n",
    "                    help='Use AutoAugment policy. \"v0\" or \"original\". (default: rand-m7-n4-mstd0.5-inc1). Set to \"None\" to disable.')\n",
    "parser.add_argument('--train_interpolation', type=str, default='bicubic',\n",
    "                    help='Training interpolation (random, bilinear, bicubic default: \"bicubic\")')\n",
    "\n",
    "# Random Erase params\n",
    "parser.add_argument('--reprob', type=float, default=0, metavar='PCT',\n",
    "                    help='Random erase prob (default: 0)')\n",
    "parser.add_argument('--remode', type=str, default='pixel',\n",
    "                    help='Random erase mode (default: \"pixel\")')\n",
    "parser.add_argument('--recount', type=int, default=1,\n",
    "                    help='Random erase count (default: 1)')\n",
    "\n",
    "# Mixup params\n",
    "parser.add_argument('--mixup', type=float, default=0,\n",
    "                    help='mixup alpha, mixup enabled if > 0.')\n",
    "parser.add_argument('--cutmix', type=float, default=0,\n",
    "                    help='cutmix alpha, cutmix enabled if > 0.')\n",
    "parser.add_argument('--cutmix_minmax', type=float, nargs='+', default=None,\n",
    "                    help='cutmix min/max ratio, overrides alpha and enables cutmix if set (default: None)')\n",
    "parser.add_argument('--mixup_prob', type=float, default=1.0,\n",
    "                    help='Probability of performing mixup or cutmix when either/both is enabled')\n",
    "parser.add_argument('--mixup_switch_prob', type=float, default=0.5,\n",
    "                    help='Probability of switching to cutmix when both mixup and cutmix enabled')\n",
    "parser.add_argument('--mixup_mode', type=str, default='batch',\n",
    "                    help='How to apply mixup/cutmix params. Per \"batch\", \"pair\", or \"elem\"')\n",
    "\n",
    "# dataset arguments\n",
    "parser.add_argument('--task_name', type=str, required=True)\n",
    "parser.add_argument('--data_path', type=str, required=True)\n",
    "parser.add_argument('--label_path', type=str, required=True)\n",
    "parser.add_argument('--n_frames', default=32, type=int)\n",
    "parser.add_argument('--test_temporal_views', default=1, type=int)\n",
    "parser.add_argument('--test_spatial_views', default=3, type=int)\n",
    "parser.add_argument('--frame_sample_rate', default=4, type=int)\n",
    "parser.add_argument('--load_from', type=str, default=\"video\", choices=[\"video\", \"rgb\"])\n",
    "\n",
    "# model structure arguments\n",
    "parser.add_argument(\"--model_name\", type=str, default=\"encode_pool_classify\", choices=[\"encode_pool_classify\", \"text4vis\"])\n",
    "\n",
    "# backbone arguments\n",
    "parser.add_argument(\"--backbone_name\", type=str, default=\"clip_ViT-B/32\")\n",
    "parser.add_argument('--backbone_freeze', default=False, type=str2bool)\n",
    "parser.add_argument('--backbone_unfreeze_layer_norm', default=False, type=str2bool)\n",
    "parser.add_argument('--backbone_drop_path_rate', default=0.1, type=float,\n",
    "                    help=\"Drop path rate (Stochastic Depth) (default: 0.1). Currently only implemented for non default backbones\")\n",
    "parser.add_argument('--backbone_proj_after', default=True, type=str2bool, help=\"Whether to apply projection to output of backbone (currently not supported for default clip backbone)\")\n",
    "parser.add_argument('--temporal_pool_backbone', default=False, type=str2bool, help=\"Whether to apply mean \\\n",
    "                        temporal pooling in backbone instead of after. Automatically sets temporal pooling name to None. \\\n",
    "                      \"\"  (currently not supported for default clip backbone)\")\n",
    "\n",
    "# AIM + QRNN Shared\n",
    "parser.add_argument('--adapter_upsample_zero_init', default=False, type=str2bool)\n",
    "\n",
    "## AIM Backbone arguments\n",
    "parser.add_argument('--adapter_settings', type=str, default=\"i\", help=\"Adapter settings if using Adapter backbone. \\\n",
    "                    Choices of i, t, s, m (i.e. to use input adapter + temporal adapter (which implies temporal attention), you would input \\\n",
    "                    'it', to use all 4 adapters, do `itsm`. Order doesn't matter)\" )\n",
    "parser.add_argument('--adapter_checkpoint', type=str, default=None, help=\"Optional checkpoint to use when using adapter backbone - used for loading pretrained AIM backbones\")\n",
    "\n",
    "## QRNN-Adapter arguments\n",
    "parser.add_argument(\"--backbone_qrnn_bidirectional\", default=False, type=str2bool)\n",
    "parser.add_argument('--num_qrnn_adapters', type=int, default=1, help=\"Number of QRNN adapters to use if using QRNN Adapter backbone\")\n",
    "parser.add_argument('--vanilla_adapter', default=False, type=str2bool, help=\"Whether to use vanilla adapter (i.e. no QRNN)\")\n",
    "parser.add_argument('--downsample_qrnn_adapter', default=False, type=str2bool, help=\"Whether to use downsample QRNN adapter (downsample before QRNN)\")\n",
    "parser.add_argument('--num_qrnn_layers', type=int, default=1, help=\"Number of QRNN layers to use if using QRNN Adapter backbone\")\n",
    "parser.add_argument('--qrnn_lookback', type=int, default=1, help=\"Number of previous frames to look at if using QRNN Adapter backbone\")\n",
    "parser.add_argument('--qrnn_lookahead', type=int, default=0, help=\"Number of future frames to look at if using QRNN Adapter backbone\")\n",
    "parser.add_argument('--adapter_downsample_ratio', type=float, default=0.25, help=\"Ratio to downsample in adapter\")\n",
    "\n",
    "# encoder arguments\n",
    "parser.add_argument(\"--temporal_pooling_name\", type=str, default=\"mean\", choices=[\"mean\", \"transformer\", \"identity\", \"last\"])\n",
    "# transformer specific arguments used if `temporal_pooling_name` is `transformer`\n",
    "parser.add_argument('--temporal_pooling_transformer_depth', default=3, type=int)\n",
    "parser.add_argument('--temporal_pooling_transformer_heads', default=4, type=int)\n",
    "parser.add_argument('--temporal_pooling_transformer_dim', default=512, type=int)\n",
    "parser.add_argument('--temporal_pooling_transformer_ff_dim', default=512, type=int)\n",
    "parser.add_argument('--temporal_pooling_transformer_input_dim', default=512, type=int)\n",
    "parser.add_argument('--temporal_pooling_transformer_emb_dropout', default=0.1, type=float)\n",
    "\n",
    "# classifier arguments\n",
    "parser.add_argument(\"--classification_layer_name\", type=str, default=\"linear\", choices=[\"linear\"])\n",
    "parser.add_argument(\"--classification_input_dim\", type=int, default=512)\n",
    "parser.add_argument(\"--num_classes\", type=int, required=True)\n",
    "parser.add_argument(\"--classification_layer_dropout\", type=float, default=0.5)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "8827d779",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Explicit token list instead of one backslash-continued string passed through `.split()`.\n",
    "# The original string passed `--downsample_qrnn_adapter` twice (False, then True); argparse keeps\n",
    "# the last occurrence, so only the effective `True` is retained here.\n",
    "cli_args = [\n",
    "    \"--task_name\", \"diving48\",\n",
    "    \"--data_path\", \"/svl/data/kinetics-400/diving48\",\n",
    "    \"--label_path\", \"/vision/u/eatang/leaky_video/datasets/diving48\",\n",
    "    \"--n_frames\", \"8\",\n",
    "    # Diving48 has 48 classes; the previous value 174 did not match this dataset.\n",
    "    # NOTE(review): confirm nothing downstream relied on 174.\n",
    "    \"--num_classes\", \"48\",\n",
    "    \"--num_aug_sample\", \"1\",\n",
    "    \"--aa\", \"rand-m9-n2-mstd0.5-inc1\",\n",
    "    \"--backbone_name\", \"qrnn_adapter_clip_ViT-B/16\",\n",
    "    \"--temporal_pooling_name\", \"last\",\n",
    "    \"--backbone_freeze\", \"True\",\n",
    "    \"--backbone_unfreeze_layer_norm\", \"False\",\n",
    "    \"--backbone_qrnn_bidirectional\", \"False\",\n",
    "    \"--num_qrnn_adapters\", \"2\",\n",
    "    \"--adapter_upsample_zero_init\", \"True\",\n",
    "    \"--backbone_proj_after\", \"False\",\n",
    "    \"--classification_input_dim\", \"768\",\n",
    "    \"--num_qrnn_layers\", \"1\",\n",
    "    \"--temporal_pool_backbone\", \"True\",\n",
    "    \"--qrnn_lookahead\", \"0\",\n",
    "    \"--qrnn_lookback\", \"1\",\n",
    "    \"--adapter_downsample_ratio\", \"0.282\",\n",
    "    \"--downsample_qrnn_adapter\", \"True\",\n",
    "]\n",
    "hparams = parser.parse_args(cli_args)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "090a3c13",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the loaders from the parsed hyper-parameters; `val_loaders` is indexed with [0] later, so it is presumably a sequence of loaders (verify against `get_dataloaders`)\n",
    "train_loader, val_loaders = get_dataloaders(hparams)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "36180f73",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pull only the first batch from the first validation loader and stop; it stays available in `batch` for inspection\n",
    "for i, batch in enumerate(val_loaders[0]):\n",
    "    break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "11c22726",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<torch.utils.data.dataloader.DataLoader at 0x7fa6b5859180>"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
    "mode = \"val\"\n",
    "n_frames = 8  # frames requested per clip; also the number of images plotted below\n",
    "\n",
    "# NOTE(review): the original passed an undefined name `args` here (NameError on a fresh kernel).\n",
    "# `hparams` is the only parsed namespace in this notebook, so it is presumably what Diving48 expects -- confirm.\n",
    "dataset = diving48.Diving48(\"/svl/data/kinetics-400/diving48/\",\n",
    "            \"/vision/u/eatang/leaky_video/datasets/diving48\",\n",
    "            n_frames, mode=mode,\n",
    "            crop_size=224,\n",
    "            short_side_size=224,\n",
    "            num_aug_sample=1,\n",
    "            args=hparams,\n",
    "            test_num_segment=1,\n",
    "            test_num_crop=1, load_from=\"rgb\")\n",
    "\n",
    "# Inspect a single sample\n",
    "j = 0\n",
    "data = dataset[j]\n",
    "frames = data[\"video_features\"]\n",
    "\n",
    "# `frames` is indexed as [:, frame, :, :] and each slice is transposed (1, 2, 0) for imshow,\n",
    "# i.e. the tensor is assumed to be (channel, frame, height, width) -- TODO confirm.\n",
    "columns = 4\n",
    "rows = -(-n_frames // columns)  # ceiling division so every frame gets a panel\n",
    "fig, axes = plt.subplots(rows, columns, figsize=(8, 2 * rows), squeeze=False)\n",
    "for frame_idx, ax in enumerate(axes.flat):\n",
    "    ax.axis(\"off\")\n",
    "    if frame_idx < n_frames:\n",
    "        ax.imshow(frames[:, frame_idx, :, :].numpy().transpose((1, 2, 0)))\n",
    "        ax.set_title(f\"frame {frame_idx}\")\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8db03b1d",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:root] *",
   "language": "python",
   "name": "conda-root-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
