{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.6/dist-packages/pl_bolts/utils/warnings.py:32: UserWarning: You want to use `wandb` which is not installed yet, install it with `pip install wandb`.\n",
      "  f' install it with `pip install {pypi_name}`.' + extra_text\n",
      "/usr/local/lib/python3.6/dist-packages/pl_bolts/utils/warnings.py:32: UserWarning: You want to use `gym` which is not installed yet, install it with `pip install gym`.\n",
      "  f' install it with `pip install {pypi_name}`.' + extra_text\n",
      "Global seed set to 42\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "Namespace(accelerator='ddp', accumulate_grad_batches=1, amp_backend='native', amp_level='O2', auto_lr_find=False, auto_scale_batch_size=False, auto_select_gpus=False, automatic_optimization=None, batch_size=64, benchmark=False, check_val_every_n_epoch=1, checkpoint_callback=None, dataset='cifar10', default_root_dir='./logs/vit_cifar10_lbfgs_test', deterministic=True, distributed_backend=None, enable_pl_optimizer=None, fast_dev_run=False, flush_logs_every_n_steps=100, gpus=4, grad_clip=1.0, grad_clip_fix_scaling=True, gradient_clip_val=0, history_size=10, lbfgs_damping=0.01, limit_predict_batches=1.0, limit_test_batches=1.0, limit_train_batches=1.0, limit_val_batches=1.0, log_every_n_steps=50, log_gpu_memory=None, logger=None, lr=0.025, lr_decay_at=[55], lr_gamma=0.1, lr_num_warmup_epochs=5, max_epochs=65, max_steps=None, min_epochs=None, min_steps=None, momentum=0.9, move_metrics_to_cpu=False, multiple_trainloader_mode='max_size_cycle', num_nodes=1, num_processes=1, num_sanity_val_steps=2, optimizer='lbfgs', overfit_batches=0.0, plugins=None, precision=32, prepare_data_per_node=True, process_position=0, profiler=None, progress_bar_refresh_rate=None, reload_dataloaders_every_epoch=False, replace_sampler_ddp=True, resume_from_checkpoint=None, seed=42, stat_decay_grad=0.99, stat_decay_param=0.99, stochastic_weight_avg=False, sync_batchnorm=False, terminate_on_nan=False, tpu_cores=<function _gpus_arg_default at 0x7fb6fe2030d0>, track_grad_norm=-1, truncated_bptt_steps=None, update_freq=100, val_check_interval=1.0, vit_depth=6, vit_dim=512, vit_dropout=0.1, vit_emb_dropout=0.1, vit_heads=8, vit_mlp_dim=512, vit_patch_size=4, weight_decay=0.0001, weights_save_path=None, weights_summary='top')\n",
      "Files already downloaded and verified\n",
      "Files already downloaded and verified\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n",
      "/usr/local/lib/python3.6/dist-packages/pl_bolts/utils/warnings.py:32: UserWarning: You want to use `wandb` which is not installed yet, install it with `pip install wandb`.\n",
      "  f' install it with `pip install {pypi_name}`.' + extra_text\n",
      "/usr/local/lib/python3.6/dist-packages/pl_bolts/utils/warnings.py:32: UserWarning: You want to use `gym` which is not installed yet, install it with `pip install gym`.\n",
      "  f' install it with `pip install {pypi_name}`.' + extra_text\n",
      "Global seed set to 42\n",
      "Namespace(accelerator='ddp', accumulate_grad_batches=1, amp_backend='native', amp_level='O2', auto_lr_find=False, auto_scale_batch_size=False, auto_select_gpus=False, automatic_optimization=None, batch_size=64, benchmark=False, check_val_every_n_epoch=1, checkpoint_callback=None, dataset='cifar10', default_root_dir='./logs/vit_cifar10_lbfgs_test', deterministic=True, distributed_backend=None, enable_pl_optimizer=None, fast_dev_run=False, flush_logs_every_n_steps=100, gpus=4, grad_clip=1.0, grad_clip_fix_scaling=True, gradient_clip_val=0, history_size=10, lbfgs_damping=0.01, limit_predict_batches=1.0, limit_test_batches=1.0, limit_train_batches=1.0, limit_val_batches=1.0, log_every_n_steps=50, log_gpu_memory=None, logger=None, lr=0.025, lr_decay_at=[55], lr_gamma=0.1, lr_num_warmup_epochs=5, max_epochs=65, max_steps=None, min_epochs=None, min_steps=None, momentum=0.9, move_metrics_to_cpu=False, multiple_trainloader_mode='max_size_cycle', num_nodes=1, num_processes=1, num_sanity_val_steps=2, optimizer='lbfgs', overfit_batches=0.0, plugins=None, precision=32, prepare_data_per_node=True, process_position=0, profiler=None, progress_bar_refresh_rate=None, reload_dataloaders_every_epoch=False, replace_sampler_ddp=True, resume_from_checkpoint=None, seed=42, stat_decay_grad=0.99, stat_decay_param=0.99, stochastic_weight_avg=False, sync_batchnorm=False, terminate_on_nan=False, tpu_cores=<function _gpus_arg_default at 0x7fad7d3db0d0>, track_grad_norm=-1, truncated_bptt_steps=None, update_freq=100, val_check_interval=1.0, vit_depth=6, vit_dim=512, vit_dropout=0.1, vit_emb_dropout=0.1, vit_heads=8, vit_mlp_dim=512, vit_patch_size=4, weight_decay=0.0001, weights_save_path=None, weights_summary='top')\n",
      "LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n",
      "hparams:  \"args\": Namespace(accelerator='ddp', accumulate_grad_batches=1, amp_backend='native', amp_level='O2', auto_lr_find=False, auto_scale_batch_size=False, auto_select_gpus=False, automatic_optimization=None, batch_size=64, benchmark=False, check_val_every_n_epoch=1, checkpoint_callback=None, dataset='cifar10', default_root_dir='./logs/vit_cifar10_lbfgs_test', deterministic=True, distributed_backend=None, enable_pl_optimizer=None, fast_dev_run=False, flush_logs_every_n_steps=100, gpus=4, grad_clip=1.0, grad_clip_fix_scaling=True, gradient_clip_val=0, history_size=10, lbfgs_damping=0.01, limit_predict_batches=1.0, limit_test_batches=1.0, limit_train_batches=1.0, limit_val_batches=1.0, log_every_n_steps=50, log_gpu_memory=None, logger=None, lr=0.025, lr_decay_at=[55], lr_gamma=0.1, lr_num_warmup_epochs=5, max_epochs=65, max_steps=None, min_epochs=None, min_steps=None, momentum=0.9, move_metrics_to_cpu=False, multiple_trainloader_mode='max_size_cycle', num_nodes=1, num_processes=1, num_sanity_val_steps=2, optimizer='lbfgs', overfit_batches=0.0, plugins=None, precision=32, prepare_data_per_node=True, process_position=0, profiler=None, progress_bar_refresh_rate=None, reload_dataloaders_every_epoch=False, replace_sampler_ddp=True, resume_from_checkpoint=None, seed=42, stat_decay_grad=0.99, stat_decay_param=0.99, stochastic_weight_avg=False, sync_batchnorm=False, terminate_on_nan=False, tpu_cores=<function _gpus_arg_default at 0x7fad7d3db0d0>, track_grad_norm=-1, truncated_bptt_steps=None, update_freq=100, val_check_interval=1.0, vit_depth=6, vit_dim=512, vit_dropout=0.1, vit_emb_dropout=0.1, vit_heads=8, vit_mlp_dim=512, vit_patch_size=4, weight_decay=0.0001, weights_save_path=None, weights_summary='top')\n",
      "[LBFGS] number of param groups: 1\n",
      "[LBFGS] initialize LBFGS optimizer:\n",
      "-------------------------------------\n",
      "Base Hessian update frequency: 100\n",
      "History vector size: 10\n",
      "Enable damping: True\n",
      "Momentum for param: 0.99\n",
      "Momentum for grad: 0.99\n",
      "-------------------------------------\n",
      "Adjusting learning rate of group 0 to 5.0000e-03.\n",
      "Global seed set to 42\n",
      "initializing ddp: GLOBAL_RANK: 1, MEMBER: 2/4\n",
      "/usr/local/lib/python3.6/dist-packages/pl_bolts/utils/warnings.py:32: UserWarning: You want to use `wandb` which is not installed yet, install it with `pip install wandb`.\n",
      "  f' install it with `pip install {pypi_name}`.' + extra_text\n",
      "/usr/local/lib/python3.6/dist-packages/pl_bolts/utils/warnings.py:32: UserWarning: You want to use `gym` which is not installed yet, install it with `pip install gym`.\n",
      "  f' install it with `pip install {pypi_name}`.' + extra_text\n",
      "Global seed set to 42\n",
      "Namespace(accelerator='ddp', accumulate_grad_batches=1, amp_backend='native', amp_level='O2', auto_lr_find=False, auto_scale_batch_size=False, auto_select_gpus=False, automatic_optimization=None, batch_size=64, benchmark=False, check_val_every_n_epoch=1, checkpoint_callback=None, dataset='cifar10', default_root_dir='./logs/vit_cifar10_lbfgs_test', deterministic=True, distributed_backend=None, enable_pl_optimizer=None, fast_dev_run=False, flush_logs_every_n_steps=100, gpus=4, grad_clip=1.0, grad_clip_fix_scaling=True, gradient_clip_val=0, history_size=10, lbfgs_damping=0.01, limit_predict_batches=1.0, limit_test_batches=1.0, limit_train_batches=1.0, limit_val_batches=1.0, log_every_n_steps=50, log_gpu_memory=None, logger=None, lr=0.025, lr_decay_at=[55], lr_gamma=0.1, lr_num_warmup_epochs=5, max_epochs=65, max_steps=None, min_epochs=None, min_steps=None, momentum=0.9, move_metrics_to_cpu=False, multiple_trainloader_mode='max_size_cycle', num_nodes=1, num_processes=1, num_sanity_val_steps=2, optimizer='lbfgs', overfit_batches=0.0, plugins=None, precision=32, prepare_data_per_node=True, process_position=0, profiler=None, progress_bar_refresh_rate=None, reload_dataloaders_every_epoch=False, replace_sampler_ddp=True, resume_from_checkpoint=None, seed=42, stat_decay_grad=0.99, stat_decay_param=0.99, stochastic_weight_avg=False, sync_batchnorm=False, terminate_on_nan=False, tpu_cores=<function _gpus_arg_default at 0x7f27ac20b0d0>, track_grad_norm=-1, truncated_bptt_steps=None, update_freq=100, val_check_interval=1.0, vit_depth=6, vit_dim=512, vit_dropout=0.1, vit_emb_dropout=0.1, vit_heads=8, vit_mlp_dim=512, vit_patch_size=4, weight_decay=0.0001, weights_save_path=None, weights_summary='top')\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "LOCAL_RANK: 2 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n",
      "hparams:  \"args\": Namespace(accelerator='ddp', accumulate_grad_batches=1, amp_backend='native', amp_level='O2', auto_lr_find=False, auto_scale_batch_size=False, auto_select_gpus=False, automatic_optimization=None, batch_size=64, benchmark=False, check_val_every_n_epoch=1, checkpoint_callback=None, dataset='cifar10', default_root_dir='./logs/vit_cifar10_lbfgs_test', deterministic=True, distributed_backend=None, enable_pl_optimizer=None, fast_dev_run=False, flush_logs_every_n_steps=100, gpus=4, grad_clip=1.0, grad_clip_fix_scaling=True, gradient_clip_val=0, history_size=10, lbfgs_damping=0.01, limit_predict_batches=1.0, limit_test_batches=1.0, limit_train_batches=1.0, limit_val_batches=1.0, log_every_n_steps=50, log_gpu_memory=None, logger=None, lr=0.025, lr_decay_at=[55], lr_gamma=0.1, lr_num_warmup_epochs=5, max_epochs=65, max_steps=None, min_epochs=None, min_steps=None, momentum=0.9, move_metrics_to_cpu=False, multiple_trainloader_mode='max_size_cycle', num_nodes=1, num_processes=1, num_sanity_val_steps=2, optimizer='lbfgs', overfit_batches=0.0, plugins=None, precision=32, prepare_data_per_node=True, process_position=0, profiler=None, progress_bar_refresh_rate=None, reload_dataloaders_every_epoch=False, replace_sampler_ddp=True, resume_from_checkpoint=None, seed=42, stat_decay_grad=0.99, stat_decay_param=0.99, stochastic_weight_avg=False, sync_batchnorm=False, terminate_on_nan=False, tpu_cores=<function _gpus_arg_default at 0x7f27ac20b0d0>, track_grad_norm=-1, truncated_bptt_steps=None, update_freq=100, val_check_interval=1.0, vit_depth=6, vit_dim=512, vit_dropout=0.1, vit_emb_dropout=0.1, vit_heads=8, vit_mlp_dim=512, vit_patch_size=4, weight_decay=0.0001, weights_save_path=None, weights_summary='top')\n",
      "[LBFGS] number of param groups: 1\n",
      "[LBFGS] initialize LBFGS optimizer:\n",
      "-------------------------------------\n",
      "Base Hessian update frequency: 100\n",
      "History vector size: 10\n",
      "Enable damping: True\n",
      "Momentum for param: 0.99\n",
      "Momentum for grad: 0.99\n",
      "-------------------------------------\n",
      "Adjusting learning rate of group 0 to 5.0000e-03.\n",
      "Global seed set to 42\n",
      "initializing ddp: GLOBAL_RANK: 2, MEMBER: 3/4\n",
      "/usr/local/lib/python3.6/dist-packages/pl_bolts/utils/warnings.py:32: UserWarning: You want to use `wandb` which is not installed yet, install it with `pip install wandb`.\n",
      "  f' install it with `pip install {pypi_name}`.' + extra_text\n",
      "/usr/local/lib/python3.6/dist-packages/pl_bolts/utils/warnings.py:32: UserWarning: You want to use `gym` which is not installed yet, install it with `pip install gym`.\n",
      "  f' install it with `pip install {pypi_name}`.' + extra_text\n",
      "Global seed set to 42\n",
      "Namespace(accelerator='ddp', accumulate_grad_batches=1, amp_backend='native', amp_level='O2', auto_lr_find=False, auto_scale_batch_size=False, auto_select_gpus=False, automatic_optimization=None, batch_size=64, benchmark=False, check_val_every_n_epoch=1, checkpoint_callback=None, dataset='cifar10', default_root_dir='./logs/vit_cifar10_lbfgs_test', deterministic=True, distributed_backend=None, enable_pl_optimizer=None, fast_dev_run=False, flush_logs_every_n_steps=100, gpus=4, grad_clip=1.0, grad_clip_fix_scaling=True, gradient_clip_val=0, history_size=10, lbfgs_damping=0.01, limit_predict_batches=1.0, limit_test_batches=1.0, limit_train_batches=1.0, limit_val_batches=1.0, log_every_n_steps=50, log_gpu_memory=None, logger=None, lr=0.025, lr_decay_at=[55], lr_gamma=0.1, lr_num_warmup_epochs=5, max_epochs=65, max_steps=None, min_epochs=None, min_steps=None, momentum=0.9, move_metrics_to_cpu=False, multiple_trainloader_mode='max_size_cycle', num_nodes=1, num_processes=1, num_sanity_val_steps=2, optimizer='lbfgs', overfit_batches=0.0, plugins=None, precision=32, prepare_data_per_node=True, process_position=0, profiler=None, progress_bar_refresh_rate=None, reload_dataloaders_every_epoch=False, replace_sampler_ddp=True, resume_from_checkpoint=None, seed=42, stat_decay_grad=0.99, stat_decay_param=0.99, stochastic_weight_avg=False, sync_batchnorm=False, terminate_on_nan=False, tpu_cores=<function _gpus_arg_default at 0x7f6e6a3d50d0>, track_grad_norm=-1, truncated_bptt_steps=None, update_freq=100, val_check_interval=1.0, vit_depth=6, vit_dim=512, vit_dropout=0.1, vit_emb_dropout=0.1, vit_heads=8, vit_mlp_dim=512, vit_patch_size=4, weight_decay=0.0001, weights_save_path=None, weights_summary='top')\n",
      "LOCAL_RANK: 3 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n",
      "hparams:  \"args\": Namespace(accelerator='ddp', accumulate_grad_batches=1, amp_backend='native', amp_level='O2', auto_lr_find=False, auto_scale_batch_size=False, auto_select_gpus=False, automatic_optimization=None, batch_size=64, benchmark=False, check_val_every_n_epoch=1, checkpoint_callback=None, dataset='cifar10', default_root_dir='./logs/vit_cifar10_lbfgs_test', deterministic=True, distributed_backend=None, enable_pl_optimizer=None, fast_dev_run=False, flush_logs_every_n_steps=100, gpus=4, grad_clip=1.0, grad_clip_fix_scaling=True, gradient_clip_val=0, history_size=10, lbfgs_damping=0.01, limit_predict_batches=1.0, limit_test_batches=1.0, limit_train_batches=1.0, limit_val_batches=1.0, log_every_n_steps=50, log_gpu_memory=None, logger=None, lr=0.025, lr_decay_at=[55], lr_gamma=0.1, lr_num_warmup_epochs=5, max_epochs=65, max_steps=None, min_epochs=None, min_steps=None, momentum=0.9, move_metrics_to_cpu=False, multiple_trainloader_mode='max_size_cycle', num_nodes=1, num_processes=1, num_sanity_val_steps=2, optimizer='lbfgs', overfit_batches=0.0, plugins=None, precision=32, prepare_data_per_node=True, process_position=0, profiler=None, progress_bar_refresh_rate=None, reload_dataloaders_every_epoch=False, replace_sampler_ddp=True, resume_from_checkpoint=None, seed=42, stat_decay_grad=0.99, stat_decay_param=0.99, stochastic_weight_avg=False, sync_batchnorm=False, terminate_on_nan=False, tpu_cores=<function _gpus_arg_default at 0x7f6e6a3d50d0>, track_grad_norm=-1, truncated_bptt_steps=None, update_freq=100, val_check_interval=1.0, vit_depth=6, vit_dim=512, vit_dropout=0.1, vit_emb_dropout=0.1, vit_heads=8, vit_mlp_dim=512, vit_patch_size=4, weight_decay=0.0001, weights_save_path=None, weights_summary='top')\n",
      "[LBFGS] number of param groups: 1\n",
      "[LBFGS] initialize LBFGS optimizer:\n",
      "-------------------------------------\n",
      "Base Hessian update frequency: 100\n",
      "History vector size: 10\n",
      "Enable damping: True\n",
      "Momentum for param: 0.99\n",
      "Momentum for grad: 0.99\n",
      "-------------------------------------\n",
      "Adjusting learning rate of group 0 to 5.0000e-03.\n",
      "Global seed set to 42\n",
      "initializing ddp: GLOBAL_RANK: 3, MEMBER: 4/4\n",
      "hparams:  \"args\": Namespace(accelerator='ddp', accumulate_grad_batches=1, amp_backend='native', amp_level='O2', auto_lr_find=False, auto_scale_batch_size=False, auto_select_gpus=False, automatic_optimization=None, batch_size=64, benchmark=False, check_val_every_n_epoch=1, checkpoint_callback=None, dataset='cifar10', default_root_dir='./logs/vit_cifar10_lbfgs_test', deterministic=True, distributed_backend=None, enable_pl_optimizer=None, fast_dev_run=False, flush_logs_every_n_steps=100, gpus=4, grad_clip=1.0, grad_clip_fix_scaling=True, gradient_clip_val=0, history_size=10, lbfgs_damping=0.01, limit_predict_batches=1.0, limit_test_batches=1.0, limit_train_batches=1.0, limit_val_batches=1.0, log_every_n_steps=50, log_gpu_memory=None, logger=None, lr=0.025, lr_decay_at=[55], lr_gamma=0.1, lr_num_warmup_epochs=5, max_epochs=65, max_steps=None, min_epochs=None, min_steps=None, momentum=0.9, move_metrics_to_cpu=False, multiple_trainloader_mode='max_size_cycle', num_nodes=1, num_processes=1, num_sanity_val_steps=2, optimizer='lbfgs', overfit_batches=0.0, plugins=None, precision=32, prepare_data_per_node=True, process_position=0, profiler=None, progress_bar_refresh_rate=None, reload_dataloaders_every_epoch=False, replace_sampler_ddp=True, resume_from_checkpoint=None, seed=42, stat_decay_grad=0.99, stat_decay_param=0.99, stochastic_weight_avg=False, sync_batchnorm=False, terminate_on_nan=False, tpu_cores=<function _gpus_arg_default at 0x7fb6fe2030d0>, track_grad_norm=-1, truncated_bptt_steps=None, update_freq=100, val_check_interval=1.0, vit_depth=6, vit_dim=512, vit_dropout=0.1, vit_emb_dropout=0.1, vit_heads=8, vit_mlp_dim=512, vit_patch_size=4, weight_decay=0.0001, weights_save_path=None, weights_summary='top')\n",
      "[LBFGS] number of param groups: 1\n",
      "[LBFGS] initialize LBFGS optimizer:\n",
      "-------------------------------------\n",
      "Base Hessian update frequency: 100\n",
      "History vector size: 10\n",
      "Enable damping: True\n",
      "Momentum for param: 0.99\n",
      "Momentum for grad: 0.99\n",
      "-------------------------------------\n",
      "Adjusting learning rate of group 0 to 5.0000e-03.\n",
      "Global seed set to 42\n",
      "initializing ddp: GLOBAL_RANK: 0, MEMBER: 1/4\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "libibverbs: Warning: couldn't open config directory '/etc/libibverbs.d'.\n",
      "libibverbs: Warning: couldn't open config directory '/etc/libibverbs.d'.\n",
      "libibverbs: Warning: couldn't open config directory '/etc/libibverbs.d'.\n",
      "libibverbs: Warning: couldn't open config directory '/etc/libibverbs.d'.\n",
      "2021-05-31 21:59:41.385180: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.0\n",
      "2021-05-31 21:59:41.391633: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.0\n",
      "\n",
      "  | Name  | Type | Params\n",
      "-------------------------------\n",
      "0 | model | ViT  | 9.5 M \n",
      "-------------------------------\n",
      "9.5 M     Trainable params\n",
      "0         Non-trainable params\n",
      "9.5 M     Total params\n",
      "38.095    Total estimated model params size (MB)\n",
      "Validation sanity check:   0%|                            | 0/2 [00:00<?, ?it/s]/usr/local/lib/python3.6/dist-packages/pytorch_lightning/utilities/distributed.py:52: UserWarning: Your validation_step returned None. Did you forget to return an output?\n",
      "  warnings.warn(*args, **kwargs)\n",
      "Epoch 0:   0%|                                          | 0/197 [00:00<?, ?it/s][W reducer.cpp:1050] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters, consider turning this flag off. Note that this warning may be a false positive your model has flow control causing later iterations to have unused parameters. (function operator())\n",
      "/root/slbfgs/pl_slbfgs/optimizer/lbfgs.py:359: UserWarning: This overload of add_ is deprecated:\n",
      "\tadd_(Number alpha, Tensor other)\n",
      "Consider using one of the following signatures instead:\n",
      "\tadd_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:1005.)\n",
      "  p.data.add_(-group[\"lr\"], dp)\n",
      "[W reducer.cpp:1050] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters, consider turning this flag off. Note that this warning may be a false positive your model has flow control causing later iterations to have unused parameters. (function operator())\n",
      "[W reducer.cpp:1050] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters, consider turning this flag off. Note that this warning may be a false positive your model has flow control causing later iterations to have unused parameters. (function operator())\n",
      "/root/slbfgs/pl_slbfgs/optimizer/lbfgs.py:359: UserWarning: This overload of add_ is deprecated:\n",
      "\tadd_(Number alpha, Tensor other)\n",
      "Consider using one of the following signatures instead:\n",
      "\tadd_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:1005.)\n",
      "  p.data.add_(-group[\"lr\"], dp)\n",
      "/root/slbfgs/pl_slbfgs/optimizer/lbfgs.py:359: UserWarning: This overload of add_ is deprecated:\n",
      "\tadd_(Number alpha, Tensor other)\n",
      "Consider using one of the following signatures instead:\n",
      "\tadd_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:1005.)\n",
      "  p.data.add_(-group[\"lr\"], dp)\n",
      "[W reducer.cpp:1050] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters, consider turning this flag off. Note that this warning may be a false positive your model has flow control causing later iterations to have unused parameters. (function operator())\n",
      "Epoch 0:   1%| | 1/197 [00:01<05:36,  1.72s/it, loss=2.33, v_num=5, val_loss=2.3/root/slbfgs/pl_slbfgs/optimizer/lbfgs.py:359: UserWarning: This overload of add_ is deprecated:\n",
      "\tadd_(Number alpha, Tensor other)\n",
      "Consider using one of the following signatures instead:\n",
      "\tadd_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:1005.)\n",
      "  p.data.add_(-group[\"lr\"], dp)\n",
      "Epoch 0:  80%|7| 157/197 [00:11<00:02, 13.79it/s, loss=2.19, v_num=5, val_loss=2\n",
      "Validating: 0it [00:00, ?it/s]\u001b[A\n",
      "Validating:   0%|                                        | 0/40 [00:00<?, ?it/s]\u001b[A\n",
      "Epoch 0:  81%|8| 159/197 [00:12<00:03, 12.52it/s, loss=2.19, v_num=5, val_loss=2\u001b[A\n",
      "Epoch 0:  84%|8| 165/197 [00:12<00:02, 12.88it/s, loss=2.19, v_num=5, val_loss=2\u001b[A\n",
      "Epoch 0:  87%|8| 171/197 [00:12<00:01, 13.22it/s, loss=2.19, v_num=5, val_loss=2\u001b[A\n",
      "Epoch 0:  90%|8| 177/197 [00:13<00:01, 13.58it/s, loss=2.19, v_num=5, val_loss=2\u001b[A\n",
      "Epoch 0:  93%|9| 183/197 [00:13<00:01, 13.93it/s, loss=2.19, v_num=5, val_loss=2\u001b[A\n",
      "Epoch 0:  96%|9| 189/197 [00:13<00:00, 14.28it/s, loss=2.19, v_num=5, val_loss=2\u001b[A\n",
      "Epoch 0:  99%|9| 195/197 [00:13<00:00, 14.62it/s, loss=2.19, v_num=5, val_loss=2\u001b[AAdjusting learning rate of group 0 to 5.0000e-03.\n",
      "Epoch 0: 100%|#| 197/197 [00:13<00:00, 14.60it/s, loss=2.19, v_num=5, val_loss=2\n",
      "Epoch 1:   0%| | 0/197 [00:00<?, ?it/s, loss=2.19, v_num=5, val_loss=2.160, val_\u001b[AAdjusting learning rate of group 0 to 5.0000e-03.\n",
      "Adjusting learning rate of group 0 to 5.0000e-03.\n",
      "Adjusting learning rate of group 0 to 5.0000e-03.\n",
      "Epoch 1:  80%|7| 157/197 [00:12<00:03, 12.91it/s, loss=2.07, v_num=5, val_loss=2\n",
      "Validating: 0it [00:00, ?it/s]\u001b[A\n",
      "Validating:   0%|                                        | 0/40 [00:00<?, ?it/s]\u001b[A\n",
      "Epoch 1:  82%|8| 162/197 [00:13<00:02, 11.82it/s, loss=2.07, v_num=5, val_loss=2\u001b[A\n",
      "Epoch 1:  85%|8| 168/197 [00:13<00:02, 12.14it/s, loss=2.07, v_num=5, val_loss=2\u001b[A\n",
      "Epoch 1:  88%|8| 174/197 [00:13<00:01, 12.48it/s, loss=2.07, v_num=5, val_loss=2\u001b[A\n",
      "Epoch 1:  91%|9| 180/197 [00:14<00:01, 12.81it/s, loss=2.07, v_num=5, val_loss=2\u001b[A\n",
      "Epoch 1:  94%|9| 186/197 [00:14<00:00, 13.13it/s, loss=2.07, v_num=5, val_loss=2\u001b[A\n",
      "Epoch 1:  97%|9| 192/197 [00:14<00:00, 13.45it/s, loss=2.07, v_num=5, val_loss=2\u001b[A\n",
      "Validating:  88%|###########################1   | 35/40 [00:02<00:00,  5.33it/s]\u001b[AAdjusting learning rate of group 0 to 1.0000e-02.\n",
      "Adjusting learning rate of group 0 to 1.0000e-02.\n",
      "Epoch 1: 100%|#| 197/197 [00:14<00:00, 13.57it/s, loss=2.07, v_num=5, val_loss=2\n",
      "Epoch 2:   0%| | 0/197 [00:00<?, ?it/s, loss=2.07, v_num=5, val_loss=2.070, val_\u001b[AAdjusting learning rate of group 0 to 1.0000e-02.\n",
      "Adjusting learning rate of group 0 to 1.0000e-02.\n",
      "Epoch 2:  80%|7| 157/197 [00:12<00:03, 12.23it/s, loss=1.94, v_num=5, val_loss=2\n",
      "Validating: 0it [00:00, ?it/s]\u001b[A\n",
      "Validating:   0%|                                        | 0/40 [00:00<?, ?it/s]\u001b[A\n",
      "Epoch 2:  82%|8| 162/197 [00:14<00:03, 11.26it/s, loss=1.94, v_num=5, val_loss=2\u001b[A\n",
      "Epoch 2:  85%|8| 168/197 [00:14<00:02, 11.57it/s, loss=1.94, v_num=5, val_loss=2\u001b[A\n",
      "Epoch 2:  88%|8| 174/197 [00:14<00:01, 11.89it/s, loss=1.94, v_num=5, val_loss=2\u001b[A\n",
      "Epoch 2:  91%|9| 180/197 [00:14<00:01, 12.21it/s, loss=1.94, v_num=5, val_loss=2\u001b[A\n",
      "Epoch 2:  94%|9| 186/197 [00:14<00:00, 12.53it/s, loss=1.94, v_num=5, val_loss=2\u001b[A\n",
      "Epoch 2:  97%|9| 192/197 [00:14<00:00, 12.84it/s, loss=1.94, v_num=5, val_loss=2\u001b[A\n",
      "Validating:  88%|###########################1   | 35/40 [00:02<00:00,  5.34it/s]\u001b[AAdjusting learning rate of group 0 to 1.5000e-02.\n",
      "Adjusting learning rate of group 0 to 1.5000e-02.\n",
      "Epoch 2: 100%|#| 197/197 [00:15<00:00, 12.97it/s, loss=1.94, v_num=5, val_loss=2\n",
      "Epoch 3:   0%| | 0/197 [00:00<?, ?it/s, loss=1.94, v_num=5, val_loss=2.000, val_\u001b[AAdjusting learning rate of group 0 to 1.5000e-02.\n",
      "Adjusting learning rate of group 0 to 1.5000e-02.\n",
      "Epoch 3:  80%|7| 157/197 [00:13<00:03, 12.02it/s, loss=1.84, v_num=5, val_loss=2\n",
      "Validating: 0it [00:00, ?it/s]\u001b[A\n",
      "Validating:   0%|                                        | 0/40 [00:00<?, ?it/s]\u001b[A\n",
      "Epoch 3:  82%|8| 162/197 [00:14<00:03, 11.12it/s, loss=1.84, v_num=5, val_loss=2\u001b[A\n",
      "Epoch 3:  85%|8| 168/197 [00:14<00:02, 11.45it/s, loss=1.84, v_num=5, val_loss=2\u001b[A\n",
      "Epoch 3:  88%|8| 174/197 [00:14<00:01, 11.77it/s, loss=1.84, v_num=5, val_loss=2\u001b[A\n",
      "Epoch 3:  91%|9| 180/197 [00:14<00:01, 12.09it/s, loss=1.84, v_num=5, val_loss=2\u001b[A\n",
      "Epoch 3:  94%|9| 186/197 [00:15<00:00, 12.40it/s, loss=1.84, v_num=5, val_loss=2\u001b[A\n",
      "Epoch 3:  97%|9| 192/197 [00:15<00:00, 12.71it/s, loss=1.84, v_num=5, val_loss=2\u001b[A\n",
      "Validating:  88%|###########################1   | 35/40 [00:02<00:00,  5.53it/s]\u001b[AAdjusting learning rate of group 0 to 2.0000e-02.\n",
      "Adjusting learning rate of group 0 to 2.0000e-02.\n",
      "Epoch 3: 100%|#| 197/197 [00:15<00:00, 12.85it/s, loss=1.84, v_num=5, val_loss=2\n",
      "Epoch 4:   0%| | 0/197 [00:00<?, ?it/s, loss=1.84, v_num=5, val_loss=2.030, val_\u001b[AAdjusting learning rate of group 0 to 2.0000e-02.\n",
      "Adjusting learning rate of group 0 to 2.0000e-02.\n",
      "Epoch 4:  80%|7| 157/197 [00:13<00:03, 11.98it/s, loss=1.7, v_num=5, val_loss=2.\n",
      "Validating: 0it [00:00, ?it/s]\u001b[A\n",
      "Validating:   0%|                                        | 0/40 [00:00<?, ?it/s]\u001b[A\n",
      "Epoch 4:  82%|8| 162/197 [00:14<00:03, 11.13it/s, loss=1.7, v_num=5, val_loss=2.\u001b[A\n",
      "Epoch 4:  85%|8| 168/197 [00:14<00:02, 11.44it/s, loss=1.7, v_num=5, val_loss=2.\u001b[A\n",
      "Epoch 4:  88%|8| 174/197 [00:14<00:01, 11.76it/s, loss=1.7, v_num=5, val_loss=2.\u001b[A\n",
      "Epoch 4:  91%|9| 180/197 [00:14<00:01, 12.08it/s, loss=1.7, v_num=5, val_loss=2.\u001b[A\n",
      "Epoch 4:  94%|9| 186/197 [00:15<00:00, 12.38it/s, loss=1.7, v_num=5, val_loss=2.\u001b[A\n",
      "Epoch 4:  97%|9| 192/197 [00:15<00:00, 12.70it/s, loss=1.7, v_num=5, val_loss=2.\u001b[A\n",
      "Validating:  88%|###########################1   | 35/40 [00:02<00:00,  5.69it/s]\u001b[AAdjusting learning rate of group 0 to 2.5000e-02.\n",
      "Epoch 4: 100%|#| 197/197 [00:15<00:00, 12.84it/s, loss=1.7, v_num=5, val_loss=1.\n",
      "Epoch 5:   0%| | 0/197 [00:00<?, ?it/s, loss=1.7, v_num=5, val_loss=1.730, val_aAdjusting learning rate of group 0 to 2.5000e-02.\n",
      "Adjusting learning rate of group 0 to 2.5000e-02.\n",
      "Adjusting learning rate of group 0 to 2.5000e-02.\n",
      "Epoch 5:  80%|7| 157/197 [00:13<00:03, 11.38it/s, loss=1.57, v_num=5, val_loss=1\n",
      "Validating: 0it [00:00, ?it/s]\u001b[A\n",
      "Validating:   0%|                                        | 0/40 [00:00<?, ?it/s]\u001b[A\n",
      "Epoch 5:  82%|8| 162/197 [00:15<00:03, 10.56it/s, loss=1.57, v_num=5, val_loss=1\u001b[A\n",
      "Epoch 5:  85%|8| 168/197 [00:15<00:02, 10.87it/s, loss=1.57, v_num=5, val_loss=1\u001b[A\n",
      "Epoch 5:  88%|8| 174/197 [00:15<00:02, 11.19it/s, loss=1.57, v_num=5, val_loss=1\u001b[A\n",
      "Epoch 5:  91%|9| 180/197 [00:15<00:01, 11.50it/s, loss=1.57, v_num=5, val_loss=1\u001b[A\n",
      "Epoch 5:  94%|9| 186/197 [00:15<00:00, 11.80it/s, loss=1.57, v_num=5, val_loss=1\u001b[A\n",
      "Epoch 5:  97%|9| 192/197 [00:15<00:00, 12.10it/s, loss=1.57, v_num=5, val_loss=1\u001b[A\n",
      "Validating:  90%|###########################9   | 36/40 [00:02<00:00,  5.34it/s]\u001b[AAdjusting learning rate of group 0 to 2.5000e-02.\n",
      "Adjusting learning rate of group 0 to 2.5000e-02.\n",
      "Epoch 5: 100%|#| 197/197 [00:16<00:00, 12.25it/s, loss=1.57, v_num=5, val_loss=1\n",
      "Epoch 6:   0%| | 0/197 [00:00<?, ?it/s, loss=1.57, v_num=5, val_loss=1.520, val_\u001b[AAdjusting learning rate of group 0 to 2.5000e-02.\n",
      "Adjusting learning rate of group 0 to 2.5000e-02.\n",
      "Epoch 6:  65%|6| 128/197 [00:11<00:06, 11.14it/s, loss=1.54, v_num=5, val_loss=1^C\n",
      "/usr/local/lib/python3.6/dist-packages/pytorch_lightning/utilities/distributed.py:52: UserWarning: Detected KeyboardInterrupt, attempting graceful shutdown...\n",
      "  warnings.warn(*args, **kwargs)\n",
      "Epoch 6:  65%|6| 129/197 [00:11<00:06, 10.95it/s, loss=1.52, v_num=5, val_loss=1\n"
     ]
    }
   ],
   "source": [
    "!python train_vit_cifar10.py    --gpus 4\\\n",
    "                                --optimizer lbfgs\\\n",
    "                                --default_root_dir './logs/vit_cifar10_slimqn' \\\n",
    "                                --vit_patch_size 4 \\\n",
    "                                --momentum 0.9\\\n",
    "                                --weight_decay 0.0001\\\n",
    "                                --max_epochs 100\\\n",
    "                                --lr 0.025\\\n",
    "                                --lr_num_warmup_epochs 5\\\n",
    "                                --lr_decay_at 90\\\n",
    "                                --lr_gamma 0.1 \\\n",
    "                                --batch_size 256 \\\n",
    "                                --stat_decay_param 0.99 \\\n",
    "                                --stat_decay_grad 0.99 \\\n",
    "                                --update_freq 100 \\\n",
    "                                --history_size 10 \\\n",
    "                                --lbfgs_damping 0.01\\\n",
    "                                --grad_clip 1.0 \\\n",
    "                                --grad_clip_fix_scaling"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
