| distributed init (rank 0): env://, gpu 0
| distributed init (rank 7): env://, gpu 7
| distributed init (rank 4): env://, gpu 4
| distributed init (rank 5): env://, gpu 5
| distributed init (rank 3): env://, gpu 3
| distributed init (rank 2): env://, gpu 2
| distributed init (rank 1): env://, gpu 1
| distributed init (rank 6): env://, gpu 6
Namespace(batch_size=128, epochs=300, update_freq=4, model='base', drop_path=0, input_size=256, layer_scale_init_value=1e-06, model_ema=False, model_ema_decay=0.9999, model_ema_force_cpu=False, model_ema_eval=False, opt='adamw', opt_eps=1e-08, opt_betas=None, clip_grad=5.0, momentum=0.9, weight_decay=0.05, weight_decay_end=None, lr=0.004, layer_decay=1.0, min_lr=1e-06, warmup_epochs=20, warmup_steps=-1, color_jitter=0.4, aa='rand-m9-mstd0.5-inc1', smoothing=0.1, train_interpolation='bicubic', crop_pct=None, reprob=0.25, remode='pixel', recount=1, resplit=False, mixup=0.8, cutmix=1.0, cutmix_minmax=None, mixup_prob=1.0, mixup_switch_prob=0.5, mixup_mode='batch', finetune='', head_init_scale=1.0, model_key='model|module', model_prefix='', data_path='/dev/shm/imagenet', eval_data_path=None, nb_classes=1000, imagenet_default_mean_and_std=True, data_set='IMNET', output_dir='./checkpoint_base_256_11.4G', log_dir=None, device='cuda', seed=0, resume='', auto_resume=True, save_ckpt=True, save_ckpt_freq=1, save_ckpt_num=3, start_epoch=0, eval=False, dist_eval=True, disable_eval=False, num_workers=10, pin_mem=True, world_size=8, local_rank=-1, dist_on_itp=False, dist_url='env://', use_amp=True, enable_wandb=False, project='convnext', wandb_ckpt=False, rank=0, gpu=0, distributed=True, dist_backend='nccl')
Transform = 
RandomResizedCropAndInterpolation(size=(256, 256), scale=(0.08, 1.0), ratio=(0.75, 1.3333), interpolation=bicubic)
RandomHorizontalFlip(p=0.5)
RandAugment(n=2, ops=
	AugmentOp(name=AutoContrast, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=Equalize, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=Invert, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=Rotate, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=PosterizeIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=SolarizeIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=SolarizeAdd, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ColorIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ContrastIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=BrightnessIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=SharpnessIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ShearX, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ShearY, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=TranslateXRel, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=TranslateYRel, p=0.5, m=9, mstd=0.5))
ToTensor()
Normalize(mean=tensor([0.4850, 0.4560, 0.4060]), std=tensor([0.2290, 0.2240, 0.2250]))
RandomErasing(p=0.25, mode=pixel, count=(1, 1))
---------------------------
reading from datapath /dev/shm/imagenet
Number of the class = 1000
Transform = 
Resize(size=292, interpolation=bicubic, max_size=None, antialias=True)
CenterCrop(size=(256, 256))
ToTensor()
Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
---------------------------
reading from datapath /dev/shm/imagenet
Number of the class = 1000
Sampler_train = <torch.utils.data.distributed.DistributedSampler object at 0x7feb8687af90>
Mixup is activated!
Model = RaCNN(
  (first_conv): ConvX(
    (conv): Conv2d(3, 48, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): Act(
      (act): GELU(approximate='none')
    )
  )
  (layer1): Sequential(
    (0): DownBlock(
      (mlp): Sequential(
        (0): ConvX(
          (conv): Conv2d(48, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (1): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (2): ConvX(
          (conv): Conv2d(192, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(48, 48, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=48, bias=False)
          (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (1): ConvX(
          (conv): Conv2d(48, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (drop_path): Identity()
    )
    (1): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(96, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(192, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(24, 192, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(192, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(96, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=96, bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.011)
    )
    (2): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(96, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(192, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(24, 192, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(192, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(96, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=96, bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.023)
    )
    (3): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(96, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(192, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(24, 192, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(192, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(96, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=96, bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.034)
    )
  )
  (layer2): Sequential(
    (0): DownBlock(
      (mlp): Sequential(
        (0): ConvX(
          (conv): Conv2d(96, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (1): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (2): ConvX(
          (conv): Conv2d(384, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=96, bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (1): ConvX(
          (conv): Conv2d(96, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (drop_path): DropPath(drop_prob=0.045)
    )
    (1): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.056)
    )
    (2): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.068)
    )
    (3): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.079)
    )
    (4): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.090)
    )
    (5): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.102)
    )
    (6): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.113)
    )
    (7): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.124)
    )
  )
  (layer3): Sequential(
    (0): DownBlock(
      (mlp): Sequential(
        (0): ConvX(
          (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (1): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (2): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (1): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (drop_path): DropPath(drop_prob=0.135)
    )
    (1): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.147)
    )
    (2): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.158)
    )
    (3): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.169)
    )
    (4): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.181)
    )
    (5): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.192)
    )
    (6): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.203)
    )
    (7): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.215)
    )
    (8): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.226)
    )
    (9): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.237)
    )
    (10): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.248)
    )
    (11): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.260)
    )
    (12): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.271)
    )
    (13): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.282)
    )
    (14): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.294)
    )
    (15): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.305)
    )
  )
  (layer4): Sequential(
    (0): DownBlock(
      (mlp): Sequential(
        (0): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (1): ConvX(
          (conv): Conv2d(1536, 1536, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=1536, bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (2): ConvX(
          (conv): Conv2d(1536, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (1): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (drop_path): DropPath(drop_prob=0.316)
    )
    (1): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(768, 3072, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(3072, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(1536, 1536, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1536, bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(1536, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(192, 1536, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(1536, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(768, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.327)
    )
    (2): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(768, 3072, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(3072, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(1536, 1536, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1536, bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(1536, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(192, 1536, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(1536, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(768, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.339)
    )
    (3): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(768, 3072, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(3072, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(1536, 1536, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1536, bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(1536, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(192, 1536, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(1536, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(768, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.350)
    )
  )
  (head): ConvX(
    (conv): Conv2d(768, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (norm): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): Act(
      (act): GELU(approximate='none')
    )
  )
  (gap): AdaptiveAvgPool2d(output_size=1)
  (classifier): MlpHead(
    (fc1): Linear(in_features=1024, out_features=2048, bias=False)
    (norm): BatchNorm1d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): Act(
      (act): GELU(approximate='none')
    )
    (drop): Dropout(p=0.2, inplace=False)
    (fc2): Linear(in_features=2048, out_features=1000, bias=False)
  )
)
number of params: 50901626
LR = 0.00400000
Batch size = 4096
Update frequent = 4
Number of training examples = 1281167
Number of training training per epoch = 312
Param groups = {
  "decay": {
    "weight_decay": 0.05,
    "params": [
      "first_conv.conv.weight",
      "layer1.0.mlp.0.conv.weight",
      "layer1.0.mlp.1.conv.weight",
      "layer1.0.mlp.2.conv.weight",
      "layer1.0.skip.0.conv.weight",
      "layer1.0.skip.1.conv.weight",
      "layer1.1.mlp.conv_in.conv.weight",
      "layer1.1.mlp.dw.conv.weight",
      "layer1.1.mlp.re.region.0.weight",
      "layer1.1.mlp.re.region.3.weight",
      "layer1.1.mlp.proj.conv.weight",
      "layer1.1.dcnn.conv_in.conv.weight",
      "layer1.1.dcnn.spe.conv.weight",
      "layer1.1.dcnn.att.logit_scale",
      "layer1.1.dcnn.proj.conv.weight",
      "layer1.2.mlp.conv_in.conv.weight",
      "layer1.2.mlp.dw.conv.weight",
      "layer1.2.mlp.re.region.0.weight",
      "layer1.2.mlp.re.region.3.weight",
      "layer1.2.mlp.proj.conv.weight",
      "layer1.2.dcnn.conv_in.conv.weight",
      "layer1.2.dcnn.spe.conv.weight",
      "layer1.2.dcnn.att.logit_scale",
      "layer1.2.dcnn.proj.conv.weight",
      "layer1.3.mlp.conv_in.conv.weight",
      "layer1.3.mlp.dw.conv.weight",
      "layer1.3.mlp.re.region.0.weight",
      "layer1.3.mlp.re.region.3.weight",
      "layer1.3.mlp.proj.conv.weight",
      "layer1.3.dcnn.conv_in.conv.weight",
      "layer1.3.dcnn.spe.conv.weight",
      "layer1.3.dcnn.att.logit_scale",
      "layer1.3.dcnn.proj.conv.weight",
      "layer2.0.mlp.0.conv.weight",
      "layer2.0.mlp.1.conv.weight",
      "layer2.0.mlp.2.conv.weight",
      "layer2.0.skip.0.conv.weight",
      "layer2.0.skip.1.conv.weight",
      "layer2.1.mlp.conv_in.conv.weight",
      "layer2.1.mlp.dw.conv.weight",
      "layer2.1.mlp.re.region.0.weight",
      "layer2.1.mlp.re.region.3.weight",
      "layer2.1.mlp.proj.conv.weight",
      "layer2.1.dcnn.conv_in.conv.weight",
      "layer2.1.dcnn.spe.conv.weight",
      "layer2.1.dcnn.att.logit_scale",
      "layer2.1.dcnn.proj.conv.weight",
      "layer2.2.mlp.conv_in.conv.weight",
      "layer2.2.mlp.dw.conv.weight",
      "layer2.2.mlp.re.region.0.weight",
      "layer2.2.mlp.re.region.3.weight",
      "layer2.2.mlp.proj.conv.weight",
      "layer2.2.dcnn.conv_in.conv.weight",
      "layer2.2.dcnn.spe.conv.weight",
      "layer2.2.dcnn.att.logit_scale",
      "layer2.2.dcnn.proj.conv.weight",
      "layer2.3.mlp.conv_in.conv.weight",
      "layer2.3.mlp.dw.conv.weight",
      "layer2.3.mlp.re.region.0.weight",
      "layer2.3.mlp.re.region.3.weight",
      "layer2.3.mlp.proj.conv.weight",
      "layer2.3.dcnn.conv_in.conv.weight",
      "layer2.3.dcnn.spe.conv.weight",
      "layer2.3.dcnn.att.logit_scale",
      "layer2.3.dcnn.proj.conv.weight",
      "layer2.4.mlp.conv_in.conv.weight",
      "layer2.4.mlp.dw.conv.weight",
      "layer2.4.mlp.re.region.0.weight",
      "layer2.4.mlp.re.region.3.weight",
      "layer2.4.mlp.proj.conv.weight",
      "layer2.4.dcnn.conv_in.conv.weight",
      "layer2.4.dcnn.spe.conv.weight",
      "layer2.4.dcnn.att.logit_scale",
      "layer2.4.dcnn.proj.conv.weight",
      "layer2.5.mlp.conv_in.conv.weight",
      "layer2.5.mlp.dw.conv.weight",
      "layer2.5.mlp.re.region.0.weight",
      "layer2.5.mlp.re.region.3.weight",
      "layer2.5.mlp.proj.conv.weight",
      "layer2.5.dcnn.conv_in.conv.weight",
      "layer2.5.dcnn.spe.conv.weight",
      "layer2.5.dcnn.att.logit_scale",
      "layer2.5.dcnn.proj.conv.weight",
      "layer2.6.mlp.conv_in.conv.weight",
      "layer2.6.mlp.dw.conv.weight",
      "layer2.6.mlp.re.region.0.weight",
      "layer2.6.mlp.re.region.3.weight",
      "layer2.6.mlp.proj.conv.weight",
      "layer2.6.dcnn.conv_in.conv.weight",
      "layer2.6.dcnn.spe.conv.weight",
      "layer2.6.dcnn.att.logit_scale",
      "layer2.6.dcnn.proj.conv.weight",
      "layer2.7.mlp.conv_in.conv.weight",
      "layer2.7.mlp.dw.conv.weight",
      "layer2.7.mlp.re.region.0.weight",
      "layer2.7.mlp.re.region.3.weight",
      "layer2.7.mlp.proj.conv.weight",
      "layer2.7.dcnn.conv_in.conv.weight",
      "layer2.7.dcnn.spe.conv.weight",
      "layer2.7.dcnn.att.logit_scale",
      "layer2.7.dcnn.proj.conv.weight",
      "layer3.0.mlp.0.conv.weight",
      "layer3.0.mlp.1.conv.weight",
      "layer3.0.mlp.2.conv.weight",
      "layer3.0.skip.0.conv.weight",
      "layer3.0.skip.1.conv.weight",
      "layer3.1.mlp.conv_in.conv.weight",
      "layer3.1.mlp.dw.conv.weight",
      "layer3.1.mlp.re.region.0.weight",
      "layer3.1.mlp.re.region.3.weight",
      "layer3.1.mlp.proj.conv.weight",
      "layer3.1.dcnn.conv_in.conv.weight",
      "layer3.1.dcnn.spe.conv.weight",
      "layer3.1.dcnn.att.logit_scale",
      "layer3.1.dcnn.proj.conv.weight",
      "layer3.2.mlp.conv_in.conv.weight",
      "layer3.2.mlp.dw.conv.weight",
      "layer3.2.mlp.re.region.0.weight",
      "layer3.2.mlp.re.region.3.weight",
      "layer3.2.mlp.proj.conv.weight",
      "layer3.2.dcnn.conv_in.conv.weight",
      "layer3.2.dcnn.spe.conv.weight",
      "layer3.2.dcnn.att.logit_scale",
      "layer3.2.dcnn.proj.conv.weight",
      "layer3.3.mlp.conv_in.conv.weight",
      "layer3.3.mlp.dw.conv.weight",
      "layer3.3.mlp.re.region.0.weight",
      "layer3.3.mlp.re.region.3.weight",
      "layer3.3.mlp.proj.conv.weight",
      "layer3.3.dcnn.conv_in.conv.weight",
      "layer3.3.dcnn.spe.conv.weight",
      "layer3.3.dcnn.att.logit_scale",
      "layer3.3.dcnn.proj.conv.weight",
      "layer3.4.mlp.conv_in.conv.weight",
      "layer3.4.mlp.dw.conv.weight",
      "layer3.4.mlp.re.region.0.weight",
      "layer3.4.mlp.re.region.3.weight",
      "layer3.4.mlp.proj.conv.weight",
      "layer3.4.dcnn.conv_in.conv.weight",
      "layer3.4.dcnn.spe.conv.weight",
      "layer3.4.dcnn.att.logit_scale",
      "layer3.4.dcnn.proj.conv.weight",
      "layer3.5.mlp.conv_in.conv.weight",
      "layer3.5.mlp.dw.conv.weight",
      "layer3.5.mlp.re.region.0.weight",
      "layer3.5.mlp.re.region.3.weight",
      "layer3.5.mlp.proj.conv.weight",
      "layer3.5.dcnn.conv_in.conv.weight",
      "layer3.5.dcnn.spe.conv.weight",
      "layer3.5.dcnn.att.logit_scale",
      "layer3.5.dcnn.proj.conv.weight",
      "layer3.6.mlp.conv_in.conv.weight",
      "layer3.6.mlp.dw.conv.weight",
      "layer3.6.mlp.re.region.0.weight",
      "layer3.6.mlp.re.region.3.weight",
      "layer3.6.mlp.proj.conv.weight",
      "layer3.6.dcnn.conv_in.conv.weight",
      "layer3.6.dcnn.spe.conv.weight",
      "layer3.6.dcnn.att.logit_scale",
      "layer3.6.dcnn.proj.conv.weight",
      "layer3.7.mlp.conv_in.conv.weight",
      "layer3.7.mlp.dw.conv.weight",
      "layer3.7.mlp.re.region.0.weight",
      "layer3.7.mlp.re.region.3.weight",
      "layer3.7.mlp.proj.conv.weight",
      "layer3.7.dcnn.conv_in.conv.weight",
      "layer3.7.dcnn.spe.conv.weight",
      "layer3.7.dcnn.att.logit_scale",
      "layer3.7.dcnn.proj.conv.weight",
      "layer3.8.mlp.conv_in.conv.weight",
      "layer3.8.mlp.dw.conv.weight",
      "layer3.8.mlp.re.region.0.weight",
      "layer3.8.mlp.re.region.3.weight",
      "layer3.8.mlp.proj.conv.weight",
      "layer3.8.dcnn.conv_in.conv.weight",
      "layer3.8.dcnn.spe.conv.weight",
      "layer3.8.dcnn.att.logit_scale",
      "layer3.8.dcnn.proj.conv.weight",
      "layer3.9.mlp.conv_in.conv.weight",
      "layer3.9.mlp.dw.conv.weight",
      "layer3.9.mlp.re.region.0.weight",
      "layer3.9.mlp.re.region.3.weight",
      "layer3.9.mlp.proj.conv.weight",
      "layer3.9.dcnn.conv_in.conv.weight",
      "layer3.9.dcnn.spe.conv.weight",
      "layer3.9.dcnn.att.logit_scale",
      "layer3.9.dcnn.proj.conv.weight",
      "layer3.10.mlp.conv_in.conv.weight",
      "layer3.10.mlp.dw.conv.weight",
      "layer3.10.mlp.re.region.0.weight",
      "layer3.10.mlp.re.region.3.weight",
      "layer3.10.mlp.proj.conv.weight",
      "layer3.10.dcnn.conv_in.conv.weight",
      "layer3.10.dcnn.spe.conv.weight",
      "layer3.10.dcnn.att.logit_scale",
      "layer3.10.dcnn.proj.conv.weight",
      "layer3.11.mlp.conv_in.conv.weight",
      "layer3.11.mlp.dw.conv.weight",
      "layer3.11.mlp.re.region.0.weight",
      "layer3.11.mlp.re.region.3.weight",
      "layer3.11.mlp.proj.conv.weight",
      "layer3.11.dcnn.conv_in.conv.weight",
      "layer3.11.dcnn.spe.conv.weight",
      "layer3.11.dcnn.att.logit_scale",
      "layer3.11.dcnn.proj.conv.weight",
      "layer3.12.mlp.conv_in.conv.weight",
      "layer3.12.mlp.dw.conv.weight",
      "layer3.12.mlp.re.region.0.weight",
      "layer3.12.mlp.re.region.3.weight",
      "layer3.12.mlp.proj.conv.weight",
      "layer3.12.dcnn.conv_in.conv.weight",
      "layer3.12.dcnn.spe.conv.weight",
      "layer3.12.dcnn.att.logit_scale",
      "layer3.12.dcnn.proj.conv.weight",
      "layer3.13.mlp.conv_in.conv.weight",
      "layer3.13.mlp.dw.conv.weight",
      "layer3.13.mlp.re.region.0.weight",
      "layer3.13.mlp.re.region.3.weight",
      "layer3.13.mlp.proj.conv.weight",
      "layer3.13.dcnn.conv_in.conv.weight",
      "layer3.13.dcnn.spe.conv.weight",
      "layer3.13.dcnn.att.logit_scale",
      "layer3.13.dcnn.proj.conv.weight",
      "layer3.14.mlp.conv_in.conv.weight",
      "layer3.14.mlp.dw.conv.weight",
      "layer3.14.mlp.re.region.0.weight",
      "layer3.14.mlp.re.region.3.weight",
      "layer3.14.mlp.proj.conv.weight",
      "layer3.14.dcnn.conv_in.conv.weight",
      "layer3.14.dcnn.spe.conv.weight",
      "layer3.14.dcnn.att.logit_scale",
      "layer3.14.dcnn.proj.conv.weight",
      "layer3.15.mlp.conv_in.conv.weight",
      "layer3.15.mlp.dw.conv.weight",
      "layer3.15.mlp.re.region.0.weight",
      "layer3.15.mlp.re.region.3.weight",
      "layer3.15.mlp.proj.conv.weight",
      "layer3.15.dcnn.conv_in.conv.weight",
      "layer3.15.dcnn.spe.conv.weight",
      "layer3.15.dcnn.att.logit_scale",
      "layer3.15.dcnn.proj.conv.weight",
      "layer4.0.mlp.0.conv.weight",
      "layer4.0.mlp.1.conv.weight",
      "layer4.0.mlp.2.conv.weight",
      "layer4.0.skip.0.conv.weight",
      "layer4.0.skip.1.conv.weight",
      "layer4.1.mlp.conv_in.conv.weight",
      "layer4.1.mlp.dw.conv.weight",
      "layer4.1.mlp.re.region.0.weight",
      "layer4.1.mlp.re.region.3.weight",
      "layer4.1.mlp.proj.conv.weight",
      "layer4.1.dcnn.conv_in.conv.weight",
      "layer4.1.dcnn.spe.conv.weight",
      "layer4.1.dcnn.att.logit_scale",
      "layer4.1.dcnn.proj.conv.weight",
      "layer4.2.mlp.conv_in.conv.weight",
      "layer4.2.mlp.dw.conv.weight",
      "layer4.2.mlp.re.region.0.weight",
      "layer4.2.mlp.re.region.3.weight",
      "layer4.2.mlp.proj.conv.weight",
      "layer4.2.dcnn.conv_in.conv.weight",
      "layer4.2.dcnn.spe.conv.weight",
      "layer4.2.dcnn.att.logit_scale",
      "layer4.2.dcnn.proj.conv.weight",
      "layer4.3.mlp.conv_in.conv.weight",
      "layer4.3.mlp.dw.conv.weight",
      "layer4.3.mlp.re.region.0.weight",
      "layer4.3.mlp.re.region.3.weight",
      "layer4.3.mlp.proj.conv.weight",
      "layer4.3.dcnn.conv_in.conv.weight",
      "layer4.3.dcnn.spe.conv.weight",
      "layer4.3.dcnn.att.logit_scale",
      "layer4.3.dcnn.proj.conv.weight",
      "head.conv.weight",
      "classifier.fc1.weight",
      "classifier.fc2.weight"
    ],
    "lr_scale": 1.0
  },
  "no_decay": {
    "weight_decay": 0.0,
    "params": [
      "first_conv.norm.weight",
      "first_conv.norm.bias",
      "layer1.0.mlp.0.norm.weight",
      "layer1.0.mlp.0.norm.bias",
      "layer1.0.mlp.1.norm.weight",
      "layer1.0.mlp.1.norm.bias",
      "layer1.0.mlp.2.norm.weight",
      "layer1.0.mlp.2.norm.bias",
      "layer1.0.skip.0.norm.weight",
      "layer1.0.skip.0.norm.bias",
      "layer1.0.skip.1.norm.weight",
      "layer1.0.skip.1.norm.bias",
      "layer1.1.mlp.conv_in.norm.weight",
      "layer1.1.mlp.conv_in.norm.bias",
      "layer1.1.mlp.dw.norm.weight",
      "layer1.1.mlp.dw.norm.bias",
      "layer1.1.mlp.re.region.1.weight",
      "layer1.1.mlp.re.region.1.bias",
      "layer1.1.mlp.re.region.3.bias",
      "layer1.1.mlp.proj.norm.weight",
      "layer1.1.mlp.proj.norm.bias",
      "layer1.1.dcnn.conv_in.norm.weight",
      "layer1.1.dcnn.conv_in.norm.bias",
      "layer1.1.dcnn.spe.norm.weight",
      "layer1.1.dcnn.spe.norm.bias",
      "layer1.1.dcnn.proj.norm.weight",
      "layer1.1.dcnn.proj.norm.bias",
      "layer1.2.mlp.conv_in.norm.weight",
      "layer1.2.mlp.conv_in.norm.bias",
      "layer1.2.mlp.dw.norm.weight",
      "layer1.2.mlp.dw.norm.bias",
      "layer1.2.mlp.re.region.1.weight",
      "layer1.2.mlp.re.region.1.bias",
      "layer1.2.mlp.re.region.3.bias",
      "layer1.2.mlp.proj.norm.weight",
      "layer1.2.mlp.proj.norm.bias",
      "layer1.2.dcnn.conv_in.norm.weight",
      "layer1.2.dcnn.conv_in.norm.bias",
      "layer1.2.dcnn.spe.norm.weight",
      "layer1.2.dcnn.spe.norm.bias",
      "layer1.2.dcnn.proj.norm.weight",
      "layer1.2.dcnn.proj.norm.bias",
      "layer1.3.mlp.conv_in.norm.weight",
      "layer1.3.mlp.conv_in.norm.bias",
      "layer1.3.mlp.dw.norm.weight",
      "layer1.3.mlp.dw.norm.bias",
      "layer1.3.mlp.re.region.1.weight",
      "layer1.3.mlp.re.region.1.bias",
      "layer1.3.mlp.re.region.3.bias",
      "layer1.3.mlp.proj.norm.weight",
      "layer1.3.mlp.proj.norm.bias",
      "layer1.3.dcnn.conv_in.norm.weight",
      "layer1.3.dcnn.conv_in.norm.bias",
      "layer1.3.dcnn.spe.norm.weight",
      "layer1.3.dcnn.spe.norm.bias",
      "layer1.3.dcnn.proj.norm.weight",
      "layer1.3.dcnn.proj.norm.bias",
      "layer2.0.mlp.0.norm.weight",
      "layer2.0.mlp.0.norm.bias",
      "layer2.0.mlp.1.norm.weight",
      "layer2.0.mlp.1.norm.bias",
      "layer2.0.mlp.2.norm.weight",
      "layer2.0.mlp.2.norm.bias",
      "layer2.0.skip.0.norm.weight",
      "layer2.0.skip.0.norm.bias",
      "layer2.0.skip.1.norm.weight",
      "layer2.0.skip.1.norm.bias",
      "layer2.1.mlp.conv_in.norm.weight",
      "layer2.1.mlp.conv_in.norm.bias",
      "layer2.1.mlp.dw.norm.weight",
      "layer2.1.mlp.dw.norm.bias",
      "layer2.1.mlp.re.region.1.weight",
      "layer2.1.mlp.re.region.1.bias",
      "layer2.1.mlp.re.region.3.bias",
      "layer2.1.mlp.proj.norm.weight",
      "layer2.1.mlp.proj.norm.bias",
      "layer2.1.dcnn.conv_in.norm.weight",
      "layer2.1.dcnn.conv_in.norm.bias",
      "layer2.1.dcnn.spe.norm.weight",
      "layer2.1.dcnn.spe.norm.bias",
      "layer2.1.dcnn.proj.norm.weight",
      "layer2.1.dcnn.proj.norm.bias",
      "layer2.2.mlp.conv_in.norm.weight",
      "layer2.2.mlp.conv_in.norm.bias",
      "layer2.2.mlp.dw.norm.weight",
      "layer2.2.mlp.dw.norm.bias",
      "layer2.2.mlp.re.region.1.weight",
      "layer2.2.mlp.re.region.1.bias",
      "layer2.2.mlp.re.region.3.bias",
      "layer2.2.mlp.proj.norm.weight",
      "layer2.2.mlp.proj.norm.bias",
      "layer2.2.dcnn.conv_in.norm.weight",
      "layer2.2.dcnn.conv_in.norm.bias",
      "layer2.2.dcnn.spe.norm.weight",
      "layer2.2.dcnn.spe.norm.bias",
      "layer2.2.dcnn.proj.norm.weight",
      "layer2.2.dcnn.proj.norm.bias",
      "layer2.3.mlp.conv_in.norm.weight",
      "layer2.3.mlp.conv_in.norm.bias",
      "layer2.3.mlp.dw.norm.weight",
      "layer2.3.mlp.dw.norm.bias",
      "layer2.3.mlp.re.region.1.weight",
      "layer2.3.mlp.re.region.1.bias",
      "layer2.3.mlp.re.region.3.bias",
      "layer2.3.mlp.proj.norm.weight",
      "layer2.3.mlp.proj.norm.bias",
      "layer2.3.dcnn.conv_in.norm.weight",
      "layer2.3.dcnn.conv_in.norm.bias",
      "layer2.3.dcnn.spe.norm.weight",
      "layer2.3.dcnn.spe.norm.bias",
      "layer2.3.dcnn.proj.norm.weight",
      "layer2.3.dcnn.proj.norm.bias",
      "layer2.4.mlp.conv_in.norm.weight",
      "layer2.4.mlp.conv_in.norm.bias",
      "layer2.4.mlp.dw.norm.weight",
      "layer2.4.mlp.dw.norm.bias",
      "layer2.4.mlp.re.region.1.weight",
      "layer2.4.mlp.re.region.1.bias",
      "layer2.4.mlp.re.region.3.bias",
      "layer2.4.mlp.proj.norm.weight",
      "layer2.4.mlp.proj.norm.bias",
      "layer2.4.dcnn.conv_in.norm.weight",
      "layer2.4.dcnn.conv_in.norm.bias",
      "layer2.4.dcnn.spe.norm.weight",
      "layer2.4.dcnn.spe.norm.bias",
      "layer2.4.dcnn.proj.norm.weight",
      "layer2.4.dcnn.proj.norm.bias",
      "layer2.5.mlp.conv_in.norm.weight",
      "layer2.5.mlp.conv_in.norm.bias",
      "layer2.5.mlp.dw.norm.weight",
      "layer2.5.mlp.dw.norm.bias",
      "layer2.5.mlp.re.region.1.weight",
      "layer2.5.mlp.re.region.1.bias",
      "layer2.5.mlp.re.region.3.bias",
      "layer2.5.mlp.proj.norm.weight",
      "layer2.5.mlp.proj.norm.bias",
      "layer2.5.dcnn.conv_in.norm.weight",
      "layer2.5.dcnn.conv_in.norm.bias",
      "layer2.5.dcnn.spe.norm.weight",
      "layer2.5.dcnn.spe.norm.bias",
      "layer2.5.dcnn.proj.norm.weight",
      "layer2.5.dcnn.proj.norm.bias",
      "layer2.6.mlp.conv_in.norm.weight",
      "layer2.6.mlp.conv_in.norm.bias",
      "layer2.6.mlp.dw.norm.weight",
      "layer2.6.mlp.dw.norm.bias",
      "layer2.6.mlp.re.region.1.weight",
      "layer2.6.mlp.re.region.1.bias",
      "layer2.6.mlp.re.region.3.bias",
      "layer2.6.mlp.proj.norm.weight",
      "layer2.6.mlp.proj.norm.bias",
      "layer2.6.dcnn.conv_in.norm.weight",
      "layer2.6.dcnn.conv_in.norm.bias",
      "layer2.6.dcnn.spe.norm.weight",
      "layer2.6.dcnn.spe.norm.bias",
      "layer2.6.dcnn.proj.norm.weight",
      "layer2.6.dcnn.proj.norm.bias",
      "layer2.7.mlp.conv_in.norm.weight",
      "layer2.7.mlp.conv_in.norm.bias",
      "layer2.7.mlp.dw.norm.weight",
      "layer2.7.mlp.dw.norm.bias",
      "layer2.7.mlp.re.region.1.weight",
      "layer2.7.mlp.re.region.1.bias",
      "layer2.7.mlp.re.region.3.bias",
      "layer2.7.mlp.proj.norm.weight",
      "layer2.7.mlp.proj.norm.bias",
      "layer2.7.dcnn.conv_in.norm.weight",
      "layer2.7.dcnn.conv_in.norm.bias",
      "layer2.7.dcnn.spe.norm.weight",
      "layer2.7.dcnn.spe.norm.bias",
      "layer2.7.dcnn.proj.norm.weight",
      "layer2.7.dcnn.proj.norm.bias",
      "layer3.0.mlp.0.norm.weight",
      "layer3.0.mlp.0.norm.bias",
      "layer3.0.mlp.1.norm.weight",
      "layer3.0.mlp.1.norm.bias",
      "layer3.0.mlp.2.norm.weight",
      "layer3.0.mlp.2.norm.bias",
      "layer3.0.skip.0.norm.weight",
      "layer3.0.skip.0.norm.bias",
      "layer3.0.skip.1.norm.weight",
      "layer3.0.skip.1.norm.bias",
      "layer3.1.mlp.conv_in.norm.weight",
      "layer3.1.mlp.conv_in.norm.bias",
      "layer3.1.mlp.dw.norm.weight",
      "layer3.1.mlp.dw.norm.bias",
      "layer3.1.mlp.re.region.1.weight",
      "layer3.1.mlp.re.region.1.bias",
      "layer3.1.mlp.re.region.3.bias",
      "layer3.1.mlp.proj.norm.weight",
      "layer3.1.mlp.proj.norm.bias",
      "layer3.1.dcnn.conv_in.norm.weight",
      "layer3.1.dcnn.conv_in.norm.bias",
      "layer3.1.dcnn.spe.norm.weight",
      "layer3.1.dcnn.spe.norm.bias",
      "layer3.1.dcnn.proj.norm.weight",
      "layer3.1.dcnn.proj.norm.bias",
      "layer3.2.mlp.conv_in.norm.weight",
      "layer3.2.mlp.conv_in.norm.bias",
      "layer3.2.mlp.dw.norm.weight",
      "layer3.2.mlp.dw.norm.bias",
      "layer3.2.mlp.re.region.1.weight",
      "layer3.2.mlp.re.region.1.bias",
      "layer3.2.mlp.re.region.3.bias",
      "layer3.2.mlp.proj.norm.weight",
      "layer3.2.mlp.proj.norm.bias",
      "layer3.2.dcnn.conv_in.norm.weight",
      "layer3.2.dcnn.conv_in.norm.bias",
      "layer3.2.dcnn.spe.norm.weight",
      "layer3.2.dcnn.spe.norm.bias",
      "layer3.2.dcnn.proj.norm.weight",
      "layer3.2.dcnn.proj.norm.bias",
      "layer3.3.mlp.conv_in.norm.weight",
      "layer3.3.mlp.conv_in.norm.bias",
      "layer3.3.mlp.dw.norm.weight",
      "layer3.3.mlp.dw.norm.bias",
      "layer3.3.mlp.re.region.1.weight",
      "layer3.3.mlp.re.region.1.bias",
      "layer3.3.mlp.re.region.3.bias",
      "layer3.3.mlp.proj.norm.weight",
      "layer3.3.mlp.proj.norm.bias",
      "layer3.3.dcnn.conv_in.norm.weight",
      "layer3.3.dcnn.conv_in.norm.bias",
      "layer3.3.dcnn.spe.norm.weight",
      "layer3.3.dcnn.spe.norm.bias",
      "layer3.3.dcnn.proj.norm.weight",
      "layer3.3.dcnn.proj.norm.bias",
      "layer3.4.mlp.conv_in.norm.weight",
      "layer3.4.mlp.conv_in.norm.bias",
      "layer3.4.mlp.dw.norm.weight",
      "layer3.4.mlp.dw.norm.bias",
      "layer3.4.mlp.re.region.1.weight",
      "layer3.4.mlp.re.region.1.bias",
      "layer3.4.mlp.re.region.3.bias",
      "layer3.4.mlp.proj.norm.weight",
      "layer3.4.mlp.proj.norm.bias",
      "layer3.4.dcnn.conv_in.norm.weight",
      "layer3.4.dcnn.conv_in.norm.bias",
      "layer3.4.dcnn.spe.norm.weight",
      "layer3.4.dcnn.spe.norm.bias",
      "layer3.4.dcnn.proj.norm.weight",
      "layer3.4.dcnn.proj.norm.bias",
      "layer3.5.mlp.conv_in.norm.weight",
      "layer3.5.mlp.conv_in.norm.bias",
      "layer3.5.mlp.dw.norm.weight",
      "layer3.5.mlp.dw.norm.bias",
      "layer3.5.mlp.re.region.1.weight",
      "layer3.5.mlp.re.region.1.bias",
      "layer3.5.mlp.re.region.3.bias",
      "layer3.5.mlp.proj.norm.weight",
      "layer3.5.mlp.proj.norm.bias",
      "layer3.5.dcnn.conv_in.norm.weight",
      "layer3.5.dcnn.conv_in.norm.bias",
      "layer3.5.dcnn.spe.norm.weight",
      "layer3.5.dcnn.spe.norm.bias",
      "layer3.5.dcnn.proj.norm.weight",
      "layer3.5.dcnn.proj.norm.bias",
      "layer3.6.mlp.conv_in.norm.weight",
      "layer3.6.mlp.conv_in.norm.bias",
      "layer3.6.mlp.dw.norm.weight",
      "layer3.6.mlp.dw.norm.bias",
      "layer3.6.mlp.re.region.1.weight",
      "layer3.6.mlp.re.region.1.bias",
      "layer3.6.mlp.re.region.3.bias",
      "layer3.6.mlp.proj.norm.weight",
      "layer3.6.mlp.proj.norm.bias",
      "layer3.6.dcnn.conv_in.norm.weight",
      "layer3.6.dcnn.conv_in.norm.bias",
      "layer3.6.dcnn.spe.norm.weight",
      "layer3.6.dcnn.spe.norm.bias",
      "layer3.6.dcnn.proj.norm.weight",
      "layer3.6.dcnn.proj.norm.bias",
      "layer3.7.mlp.conv_in.norm.weight",
      "layer3.7.mlp.conv_in.norm.bias",
      "layer3.7.mlp.dw.norm.weight",
      "layer3.7.mlp.dw.norm.bias",
      "layer3.7.mlp.re.region.1.weight",
      "layer3.7.mlp.re.region.1.bias",
      "layer3.7.mlp.re.region.3.bias",
      "layer3.7.mlp.proj.norm.weight",
      "layer3.7.mlp.proj.norm.bias",
      "layer3.7.dcnn.conv_in.norm.weight",
      "layer3.7.dcnn.conv_in.norm.bias",
      "layer3.7.dcnn.spe.norm.weight",
      "layer3.7.dcnn.spe.norm.bias",
      "layer3.7.dcnn.proj.norm.weight",
      "layer3.7.dcnn.proj.norm.bias",
      "layer3.8.mlp.conv_in.norm.weight",
      "layer3.8.mlp.conv_in.norm.bias",
      "layer3.8.mlp.dw.norm.weight",
      "layer3.8.mlp.dw.norm.bias",
      "layer3.8.mlp.re.region.1.weight",
      "layer3.8.mlp.re.region.1.bias",
      "layer3.8.mlp.re.region.3.bias",
      "layer3.8.mlp.proj.norm.weight",
      "layer3.8.mlp.proj.norm.bias",
      "layer3.8.dcnn.conv_in.norm.weight",
      "layer3.8.dcnn.conv_in.norm.bias",
      "layer3.8.dcnn.spe.norm.weight",
      "layer3.8.dcnn.spe.norm.bias",
      "layer3.8.dcnn.proj.norm.weight",
      "layer3.8.dcnn.proj.norm.bias",
      "layer3.9.mlp.conv_in.norm.weight",
      "layer3.9.mlp.conv_in.norm.bias",
      "layer3.9.mlp.dw.norm.weight",
      "layer3.9.mlp.dw.norm.bias",
      "layer3.9.mlp.re.region.1.weight",
      "layer3.9.mlp.re.region.1.bias",
      "layer3.9.mlp.re.region.3.bias",
      "layer3.9.mlp.proj.norm.weight",
      "layer3.9.mlp.proj.norm.bias",
      "layer3.9.dcnn.conv_in.norm.weight",
      "layer3.9.dcnn.conv_in.norm.bias",
      "layer3.9.dcnn.spe.norm.weight",
      "layer3.9.dcnn.spe.norm.bias",
      "layer3.9.dcnn.proj.norm.weight",
      "layer3.9.dcnn.proj.norm.bias",
      "layer3.10.mlp.conv_in.norm.weight",
      "layer3.10.mlp.conv_in.norm.bias",
      "layer3.10.mlp.dw.norm.weight",
      "layer3.10.mlp.dw.norm.bias",
      "layer3.10.mlp.re.region.1.weight",
      "layer3.10.mlp.re.region.1.bias",
      "layer3.10.mlp.re.region.3.bias",
      "layer3.10.mlp.proj.norm.weight",
      "layer3.10.mlp.proj.norm.bias",
      "layer3.10.dcnn.conv_in.norm.weight",
      "layer3.10.dcnn.conv_in.norm.bias",
      "layer3.10.dcnn.spe.norm.weight",
      "layer3.10.dcnn.spe.norm.bias",
      "layer3.10.dcnn.proj.norm.weight",
      "layer3.10.dcnn.proj.norm.bias",
      "layer3.11.mlp.conv_in.norm.weight",
      "layer3.11.mlp.conv_in.norm.bias",
      "layer3.11.mlp.dw.norm.weight",
      "layer3.11.mlp.dw.norm.bias",
      "layer3.11.mlp.re.region.1.weight",
      "layer3.11.mlp.re.region.1.bias",
      "layer3.11.mlp.re.region.3.bias",
      "layer3.11.mlp.proj.norm.weight",
      "layer3.11.mlp.proj.norm.bias",
      "layer3.11.dcnn.conv_in.norm.weight",
      "layer3.11.dcnn.conv_in.norm.bias",
      "layer3.11.dcnn.spe.norm.weight",
      "layer3.11.dcnn.spe.norm.bias",
      "layer3.11.dcnn.proj.norm.weight",
      "layer3.11.dcnn.proj.norm.bias",
      "layer3.12.mlp.conv_in.norm.weight",
      "layer3.12.mlp.conv_in.norm.bias",
      "layer3.12.mlp.dw.norm.weight",
      "layer3.12.mlp.dw.norm.bias",
      "layer3.12.mlp.re.region.1.weight",
      "layer3.12.mlp.re.region.1.bias",
      "layer3.12.mlp.re.region.3.bias",
      "layer3.12.mlp.proj.norm.weight",
      "layer3.12.mlp.proj.norm.bias",
      "layer3.12.dcnn.conv_in.norm.weight",
      "layer3.12.dcnn.conv_in.norm.bias",
      "layer3.12.dcnn.spe.norm.weight",
      "layer3.12.dcnn.spe.norm.bias",
      "layer3.12.dcnn.proj.norm.weight",
      "layer3.12.dcnn.proj.norm.bias",
      "layer3.13.mlp.conv_in.norm.weight",
      "layer3.13.mlp.conv_in.norm.bias",
      "layer3.13.mlp.dw.norm.weight",
      "layer3.13.mlp.dw.norm.bias",
      "layer3.13.mlp.re.region.1.weight",
      "layer3.13.mlp.re.region.1.bias",
      "layer3.13.mlp.re.region.3.bias",
      "layer3.13.mlp.proj.norm.weight",
      "layer3.13.mlp.proj.norm.bias",
      "layer3.13.dcnn.conv_in.norm.weight",
      "layer3.13.dcnn.conv_in.norm.bias",
      "layer3.13.dcnn.spe.norm.weight",
      "layer3.13.dcnn.spe.norm.bias",
      "layer3.13.dcnn.proj.norm.weight",
      "layer3.13.dcnn.proj.norm.bias",
      "layer3.14.mlp.conv_in.norm.weight",
      "layer3.14.mlp.conv_in.norm.bias",
      "layer3.14.mlp.dw.norm.weight",
      "layer3.14.mlp.dw.norm.bias",
      "layer3.14.mlp.re.region.1.weight",
      "layer3.14.mlp.re.region.1.bias",
      "layer3.14.mlp.re.region.3.bias",
      "layer3.14.mlp.proj.norm.weight",
      "layer3.14.mlp.proj.norm.bias",
      "layer3.14.dcnn.conv_in.norm.weight",
      "layer3.14.dcnn.conv_in.norm.bias",
      "layer3.14.dcnn.spe.norm.weight",
      "layer3.14.dcnn.spe.norm.bias",
      "layer3.14.dcnn.proj.norm.weight",
      "layer3.14.dcnn.proj.norm.bias",
      "layer3.15.mlp.conv_in.norm.weight",
      "layer3.15.mlp.conv_in.norm.bias",
      "layer3.15.mlp.dw.norm.weight",
      "layer3.15.mlp.dw.norm.bias",
      "layer3.15.mlp.re.region.1.weight",
      "layer3.15.mlp.re.region.1.bias",
      "layer3.15.mlp.re.region.3.bias",
      "layer3.15.mlp.proj.norm.weight",
      "layer3.15.mlp.proj.norm.bias",
      "layer3.15.dcnn.conv_in.norm.weight",
      "layer3.15.dcnn.conv_in.norm.bias",
      "layer3.15.dcnn.spe.norm.weight",
      "layer3.15.dcnn.spe.norm.bias",
      "layer3.15.dcnn.proj.norm.weight",
      "layer3.15.dcnn.proj.norm.bias",
      "layer4.0.mlp.0.norm.weight",
      "layer4.0.mlp.0.norm.bias",
      "layer4.0.mlp.1.norm.weight",
      "layer4.0.mlp.1.norm.bias",
      "layer4.0.mlp.2.norm.weight",
      "layer4.0.mlp.2.norm.bias",
      "layer4.0.skip.0.norm.weight",
      "layer4.0.skip.0.norm.bias",
      "layer4.0.skip.1.norm.weight",
      "layer4.0.skip.1.norm.bias",
      "layer4.1.mlp.conv_in.norm.weight",
      "layer4.1.mlp.conv_in.norm.bias",
      "layer4.1.mlp.dw.norm.weight",
      "layer4.1.mlp.dw.norm.bias",
      "layer4.1.mlp.re.region.1.weight",
      "layer4.1.mlp.re.region.1.bias",
      "layer4.1.mlp.re.region.3.bias",
      "layer4.1.mlp.proj.norm.weight",
      "layer4.1.mlp.proj.norm.bias",
      "layer4.1.dcnn.conv_in.norm.weight",
      "layer4.1.dcnn.conv_in.norm.bias",
      "layer4.1.dcnn.spe.norm.weight",
      "layer4.1.dcnn.spe.norm.bias",
      "layer4.1.dcnn.proj.norm.weight",
      "layer4.1.dcnn.proj.norm.bias",
      "layer4.2.mlp.conv_in.norm.weight",
      "layer4.2.mlp.conv_in.norm.bias",
      "layer4.2.mlp.dw.norm.weight",
      "layer4.2.mlp.dw.norm.bias",
      "layer4.2.mlp.re.region.1.weight",
      "layer4.2.mlp.re.region.1.bias",
      "layer4.2.mlp.re.region.3.bias",
      "layer4.2.mlp.proj.norm.weight",
      "layer4.2.mlp.proj.norm.bias",
      "layer4.2.dcnn.conv_in.norm.weight",
      "layer4.2.dcnn.conv_in.norm.bias",
      "layer4.2.dcnn.spe.norm.weight",
      "layer4.2.dcnn.spe.norm.bias",
      "layer4.2.dcnn.proj.norm.weight",
      "layer4.2.dcnn.proj.norm.bias",
      "layer4.3.mlp.conv_in.norm.weight",
      "layer4.3.mlp.conv_in.norm.bias",
      "layer4.3.mlp.dw.norm.weight",
      "layer4.3.mlp.dw.norm.bias",
      "layer4.3.mlp.re.region.1.weight",
      "layer4.3.mlp.re.region.1.bias",
      "layer4.3.mlp.re.region.3.bias",
      "layer4.3.mlp.proj.norm.weight",
      "layer4.3.mlp.proj.norm.bias",
      "layer4.3.dcnn.conv_in.norm.weight",
      "layer4.3.dcnn.conv_in.norm.bias",
      "layer4.3.dcnn.spe.norm.weight",
      "layer4.3.dcnn.spe.norm.bias",
      "layer4.3.dcnn.proj.norm.weight",
      "layer4.3.dcnn.proj.norm.bias",
      "head.norm.weight",
      "head.norm.bias",
      "classifier.norm.weight",
      "classifier.norm.bias"
    ],
    "lr_scale": 1.0
  }
}
Use Cosine LR scheduler
Set warmup steps = 6240
Set warmup steps = 0
Max WD = 0.0500000, Min WD = 0.0500000
criterion = SoftTargetCrossEntropy()
Auto resume checkpoint: 
Start training for 300 epochs
Epoch: [0]  [   0/1251]  eta: 5:09:42  lr: 0.000000  min_lr: 0.000000  loss: 7.0058 (7.0058)  weight_decay: 0.0500 (0.0500)  time: 14.8545  data: 3.4346  max mem: 54228
Epoch: [0]  [ 200/1251]  eta: 0:12:26  lr: 0.000032  min_lr: 0.000032  loss: 6.9405 (6.9557)  weight_decay: 0.0500 (0.0500)  grad_norm: 47.3418 (nan)  time: 0.6375  data: 0.0005  max mem: 54228
Epoch: [0]  [ 400/1251]  eta: 0:09:33  lr: 0.000064  min_lr: 0.000064  loss: 6.8562 (6.9297)  weight_decay: 0.0500 (0.0500)  grad_norm: 5.1069 (nan)  time: 0.6373  data: 0.0006  max mem: 54228
Epoch: [0]  [ 600/1251]  eta: 0:07:11  lr: 0.000096  min_lr: 0.000096  loss: 6.7502 (6.8873)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.2165 (nan)  time: 0.6375  data: 0.0006  max mem: 54228
Epoch: [0]  [ 800/1251]  eta: 0:04:55  lr: 0.000128  min_lr: 0.000128  loss: 6.6914 (6.8399)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.7478 (nan)  time: 0.6374  data: 0.0006  max mem: 54228
Epoch: [0]  [1000/1251]  eta: 0:02:43  lr: 0.000160  min_lr: 0.000160  loss: 6.5877 (6.7914)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.7546 (nan)  time: 0.6370  data: 0.0005  max mem: 54228
Epoch: [0]  [1200/1251]  eta: 0:00:33  lr: 0.000192  min_lr: 0.000192  loss: 6.6099 (6.7511)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.5521 (nan)  time: 0.6374  data: 0.0006  max mem: 54228
Epoch: [0]  [1250/1251]  eta: 0:00:00  lr: 0.000199  min_lr: 0.000199  loss: 6.4014 (6.7402)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.3838 (nan)  time: 0.5409  data: 0.0005  max mem: 54228
Epoch: [0] Total time: 0:13:31 (0.6486 s / it)
Averaged stats: lr: 0.000199  min_lr: 0.000199  loss: 6.4014 (6.7408)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.3838 (nan)
Test:  [ 0/25]  eta: 0:05:05  loss: 5.6778 (5.6778)  acc1: 3.2000 (3.2000)  acc5: 12.8000 (12.8000)  time: 12.2206  data: 7.4760  max mem: 54228
Test:  [10/25]  eta: 0:00:20  loss: 5.8257 (5.8078)  acc1: 3.2000 (3.6000)  acc5: 9.2000 (10.4000)  time: 1.3824  data: 0.6799  max mem: 54228
Test:  [20/25]  eta: 0:00:04  loss: 5.7519 (5.7716)  acc1: 3.2000 (3.3333)  acc5: 12.4000 (11.1238)  time: 0.2985  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 5.7262 (5.7147)  acc1: 3.6000 (3.8720)  acc5: 12.4000 (12.4480)  time: 0.2984  data: 0.0001  max mem: 54228
Test: Total time: 0:00:19 (0.7790 s / it)
* Acc@1 3.522 Acc@5 12.160 loss 5.733
Accuracy of the model on the 50000 test images: 3.5%
Max accuracy: 3.52%
Epoch: [1]  [   0/1251]  eta: 1:16:02  lr: 0.000200  min_lr: 0.000200  loss: 6.2042 (6.2042)  weight_decay: 0.0500 (0.0500)  time: 3.6473  data: 2.9438  max mem: 54228
Epoch: [1]  [ 200/1251]  eta: 0:11:16  lr: 0.000232  min_lr: 0.000232  loss: 6.3481 (6.4225)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.3053 (3.3928)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [1]  [ 400/1251]  eta: 0:09:01  lr: 0.000264  min_lr: 0.000264  loss: 6.3174 (6.4045)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.4157 (3.3844)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [1]  [ 600/1251]  eta: 0:06:52  lr: 0.000296  min_lr: 0.000296  loss: 6.3375 (6.3776)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.0929 (3.3074)  time: 0.6330  data: 0.0005  max mem: 54228
Epoch: [1]  [ 800/1251]  eta: 0:04:45  lr: 0.000328  min_lr: 0.000328  loss: 6.3113 (6.3575)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.0446 (3.2577)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [1]  [1000/1251]  eta: 0:02:38  lr: 0.000360  min_lr: 0.000360  loss: 6.2780 (6.3232)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.0823 (3.2289)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [1]  [1200/1251]  eta: 0:00:32  lr: 0.000392  min_lr: 0.000392  loss: 6.0030 (6.2875)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.1166 (3.2225)  time: 0.6285  data: 0.0004  max mem: 54228
Epoch: [1]  [1250/1251]  eta: 0:00:00  lr: 0.000399  min_lr: 0.000399  loss: 6.2721 (6.2834)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.1585 (3.2209)  time: 0.5332  data: 0.0005  max mem: 54228
Epoch: [1] Total time: 0:13:08 (0.6303 s / it)
Averaged stats: lr: 0.000399  min_lr: 0.000399  loss: 6.2721 (6.2820)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.1585 (3.2209)
Test:  [ 0/25]  eta: 0:02:39  loss: 4.6252 (4.6252)  acc1: 10.8000 (10.8000)  acc5: 31.6000 (31.6000)  time: 6.3756  data: 6.0480  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 4.6413 (4.6339)  acc1: 11.2000 (11.8182)  acc5: 31.6000 (30.3273)  time: 0.8522  data: 0.5501  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 4.7992 (4.7637)  acc1: 10.8000 (11.1619)  acc5: 27.6000 (29.1619)  time: 0.2998  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 4.7992 (4.7353)  acc1: 11.2000 (11.7120)  acc5: 28.0000 (29.8080)  time: 0.2998  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5485 s / it)
* Acc@1 11.672 Acc@5 30.064 loss 4.738
Accuracy of the model on the 50000 test images: 11.7%
Max accuracy: 11.67%
Epoch: [2]  [   0/1251]  eta: 1:13:11  lr: 0.000400  min_lr: 0.000400  loss: 6.3774 (6.3774)  weight_decay: 0.0500 (0.0500)  time: 3.5107  data: 2.8596  max mem: 54228
Epoch: [2]  [ 200/1251]  eta: 0:11:16  lr: 0.000432  min_lr: 0.000432  loss: 6.2198 (6.0581)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.7496 (2.8467)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [2]  [ 400/1251]  eta: 0:09:01  lr: 0.000464  min_lr: 0.000464  loss: 6.0309 (6.0316)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.1189 (3.0408)  time: 0.6288  data: 0.0004  max mem: 54228
Epoch: [2]  [ 600/1251]  eta: 0:06:52  lr: 0.000496  min_lr: 0.000496  loss: 5.7242 (5.9680)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.2588 (3.0710)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [2]  [ 800/1251]  eta: 0:04:45  lr: 0.000528  min_lr: 0.000528  loss: 5.9071 (5.9279)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.8250 (3.0290)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [2]  [1000/1251]  eta: 0:02:38  lr: 0.000560  min_lr: 0.000560  loss: 5.8466 (5.9001)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.6338 (2.9730)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [2]  [1200/1251]  eta: 0:00:32  lr: 0.000592  min_lr: 0.000592  loss: 5.8442 (5.8836)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.4301 (2.9490)  time: 0.6288  data: 0.0005  max mem: 54228
Epoch: [2]  [1250/1251]  eta: 0:00:00  lr: 0.000599  min_lr: 0.000599  loss: 5.6134 (5.8748)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.5347 (2.9497)  time: 0.5339  data: 0.0005  max mem: 54228
Epoch: [2] Total time: 0:13:08 (0.6306 s / it)
Averaged stats: lr: 0.000599  min_lr: 0.000599  loss: 5.6134 (5.8917)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.5347 (2.9497)
Test:  [ 0/25]  eta: 0:02:43  loss: 3.5744 (3.5744)  acc1: 27.6000 (27.6000)  acc5: 54.8000 (54.8000)  time: 6.5405  data: 6.2002  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 3.5744 (3.6752)  acc1: 26.4000 (24.2182)  acc5: 52.8000 (51.8909)  time: 0.8674  data: 0.5640  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 4.0204 (3.8995)  acc1: 20.0000 (22.0190)  acc5: 44.8000 (47.2381)  time: 0.3002  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 4.0204 (3.8581)  acc1: 20.0000 (23.2160)  acc5: 44.8000 (48.0160)  time: 0.3003  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5536 s / it)
* Acc@1 23.240 Acc@5 47.846 loss 3.866
Accuracy of the model on the 50000 test images: 23.2%
Max accuracy: 23.24%
Epoch: [3]  [   0/1251]  eta: 1:12:24  lr: 0.000600  min_lr: 0.000600  loss: 5.8018 (5.8018)  weight_decay: 0.0500 (0.0500)  time: 3.4727  data: 2.8269  max mem: 54228
Epoch: [3]  [ 200/1251]  eta: 0:11:14  lr: 0.000632  min_lr: 0.000632  loss: 5.4862 (5.6898)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.7466 (2.7127)  time: 0.6286  data: 0.0004  max mem: 54228
Epoch: [3]  [ 400/1251]  eta: 0:09:00  lr: 0.000664  min_lr: 0.000664  loss: 5.7033 (5.6389)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.3865 (2.6683)  time: 0.6278  data: 0.0004  max mem: 54228
Epoch: [3]  [ 600/1251]  eta: 0:06:52  lr: 0.000696  min_lr: 0.000696  loss: 5.3048 (5.6054)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.4417 (2.6263)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [3]  [ 800/1251]  eta: 0:04:45  lr: 0.000728  min_lr: 0.000728  loss: 5.0685 (5.5697)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.5690 (2.5699)  time: 0.6287  data: 0.0004  max mem: 54228
Epoch: [3]  [1000/1251]  eta: 0:02:38  lr: 0.000760  min_lr: 0.000760  loss: 5.6405 (5.5628)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.5290 (2.5893)  time: 0.6371  data: 0.0004  max mem: 54228
Epoch: [3]  [1200/1251]  eta: 0:00:32  lr: 0.000792  min_lr: 0.000792  loss: 5.5507 (5.5394)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.1871 (2.5409)  time: 0.6288  data: 0.0004  max mem: 54228
Epoch: [3]  [1250/1251]  eta: 0:00:00  lr: 0.000799  min_lr: 0.000799  loss: 5.6894 (5.5352)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.2040 (2.5272)  time: 0.5334  data: 0.0005  max mem: 54228
Epoch: [3] Total time: 0:13:08 (0.6303 s / it)
Averaged stats: lr: 0.000799  min_lr: 0.000799  loss: 5.6894 (5.5569)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.2040 (2.5272)
Test:  [ 0/25]  eta: 0:02:29  loss: 2.8023 (2.8023)  acc1: 42.4000 (42.4000)  acc5: 66.8000 (66.8000)  time: 5.9693  data: 5.6394  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 2.8023 (2.8684)  acc1: 41.2000 (37.6727)  acc5: 66.8000 (66.4727)  time: 0.8152  data: 0.5130  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 3.2353 (3.1471)  acc1: 31.2000 (34.0190)  acc5: 57.2000 (60.8952)  time: 0.2997  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 3.2690 (3.1346)  acc1: 32.4000 (34.5440)  acc5: 56.8000 (61.0400)  time: 0.2996  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5304 s / it)
* Acc@1 34.532 Acc@5 61.338 loss 3.139
Accuracy of the model on the 50000 test images: 34.5%
Max accuracy: 34.53%
Epoch: [4]  [   0/1251]  eta: 1:00:47  lr: 0.000800  min_lr: 0.000800  loss: 5.5307 (5.5307)  weight_decay: 0.0500 (0.0500)  time: 2.9154  data: 2.2752  max mem: 54228
Epoch: [4]  [ 200/1251]  eta: 0:11:12  lr: 0.000832  min_lr: 0.000832  loss: 5.2823 (5.4074)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.1687 (2.4183)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [4]  [ 400/1251]  eta: 0:09:00  lr: 0.000864  min_lr: 0.000864  loss: 5.4694 (5.3981)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.1506 (2.3702)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [4]  [ 600/1251]  eta: 0:06:51  lr: 0.000896  min_lr: 0.000896  loss: 5.2996 (5.3646)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.2700 (2.3332)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [4]  [ 800/1251]  eta: 0:04:44  lr: 0.000928  min_lr: 0.000928  loss: 5.1731 (5.3401)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.9794 (2.2952)  time: 0.6339  data: 0.0004  max mem: 54228
Epoch: [4]  [1000/1251]  eta: 0:02:38  lr: 0.000960  min_lr: 0.000960  loss: 5.1947 (5.3143)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.1528 (2.2683)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [4]  [1200/1251]  eta: 0:00:32  lr: 0.000992  min_lr: 0.000992  loss: 5.3195 (5.2939)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.9851 (2.2480)  time: 0.6286  data: 0.0004  max mem: 54228
Epoch: [4]  [1250/1251]  eta: 0:00:00  lr: 0.001000  min_lr: 0.001000  loss: 5.2909 (5.2842)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.8797 (2.2366)  time: 0.5336  data: 0.0007  max mem: 54228
Epoch: [4] Total time: 0:13:08 (0.6299 s / it)
Averaged stats: lr: 0.001000  min_lr: 0.001000  loss: 5.2909 (5.2882)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.8797 (2.2366)
Test:  [ 0/25]  eta: 0:02:47  loss: 2.2597 (2.2597)  acc1: 54.0000 (54.0000)  acc5: 77.6000 (77.6000)  time: 6.6833  data: 6.3553  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 2.3350 (2.4938)  acc1: 50.0000 (46.5818)  acc5: 78.4000 (74.9818)  time: 0.8801  data: 0.5781  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 2.9451 (2.7938)  acc1: 38.0000 (41.8667)  acc5: 65.6000 (68.4381)  time: 0.2997  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 2.9940 (2.7817)  acc1: 38.8000 (42.0640)  acc5: 62.8000 (68.4640)  time: 0.2996  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5587 s / it)
* Acc@1 41.882 Acc@5 68.848 loss 2.768
Accuracy of the model on the 50000 test images: 41.9%
Max accuracy: 41.88%
Epoch: [5]  [   0/1251]  eta: 1:04:03  lr: 0.001000  min_lr: 0.001000  loss: 4.9735 (4.9735)  weight_decay: 0.0500 (0.0500)  time: 3.0722  data: 2.4398  max mem: 54228
Epoch: [5]  [ 200/1251]  eta: 0:11:13  lr: 0.001032  min_lr: 0.001032  loss: 5.2866 (5.2462)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.0035 (2.0250)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [5]  [ 400/1251]  eta: 0:09:00  lr: 0.001064  min_lr: 0.001064  loss: 5.4317 (5.1770)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.8497 (2.0158)  time: 0.6279  data: 0.0004  max mem: 54228
Epoch: [5]  [ 600/1251]  eta: 0:06:51  lr: 0.001096  min_lr: 0.001096  loss: 5.1319 (5.1435)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.8728 (1.9405)  time: 0.6279  data: 0.0004  max mem: 54228
Epoch: [5]  [ 800/1251]  eta: 0:04:44  lr: 0.001128  min_lr: 0.001128  loss: 4.9790 (5.1139)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.8535 (1.8985)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [5]  [1000/1251]  eta: 0:02:38  lr: 0.001160  min_lr: 0.001160  loss: 5.4102 (5.1040)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7165 (1.8954)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [5]  [1200/1251]  eta: 0:00:32  lr: 0.001192  min_lr: 0.001192  loss: 5.2496 (5.0904)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7510 (1.8816)  time: 0.6276  data: 0.0004  max mem: 54228
Epoch: [5]  [1250/1251]  eta: 0:00:00  lr: 0.001200  min_lr: 0.001200  loss: 4.5838 (5.0816)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6238 (1.8841)  time: 0.5327  data: 0.0006  max mem: 54228
Epoch: [5] Total time: 0:13:07 (0.6296 s / it)
Averaged stats: lr: 0.001200  min_lr: 0.001200  loss: 4.5838 (5.0654)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6238 (1.8841)
Test:  [ 0/25]  eta: 0:02:47  loss: 1.9060 (1.9060)  acc1: 62.8000 (62.8000)  acc5: 81.2000 (81.2000)  time: 6.6879  data: 6.3499  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 2.0097 (2.0640)  acc1: 57.6000 (55.0909)  acc5: 81.2000 (80.6545)  time: 0.8794  data: 0.5775  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 2.5725 (2.4235)  acc1: 45.2000 (48.7238)  acc5: 71.2000 (74.5333)  time: 0.2984  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 2.6423 (2.4175)  acc1: 42.0000 (48.6400)  acc5: 70.0000 (74.4640)  time: 0.2983  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5586 s / it)
* Acc@1 48.136 Acc@5 74.680 loss 2.423
Accuracy of the model on the 50000 test images: 48.1%
Max accuracy: 48.14%
Epoch: [6]  [   0/1251]  eta: 1:08:52  lr: 0.001200  min_lr: 0.001200  loss: 4.0221 (4.0221)  weight_decay: 0.0500 (0.0500)  time: 3.3032  data: 2.6642  max mem: 54228
Epoch: [6]  [ 200/1251]  eta: 0:11:13  lr: 0.001232  min_lr: 0.001232  loss: 4.9700 (4.9693)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5642 (1.5708)  time: 0.6276  data: 0.0004  max mem: 54228
Epoch: [6]  [ 400/1251]  eta: 0:09:00  lr: 0.001264  min_lr: 0.001264  loss: 5.0941 (4.9322)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6446 (1.5881)  time: 0.6277  data: 0.0005  max mem: 54228
Epoch: [6]  [ 600/1251]  eta: 0:06:51  lr: 0.001296  min_lr: 0.001296  loss: 4.6157 (4.9307)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5452 (1.6091)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [6]  [ 800/1251]  eta: 0:04:44  lr: 0.001328  min_lr: 0.001328  loss: 5.0808 (4.9284)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5586 (1.6061)  time: 0.6329  data: 0.0004  max mem: 54228
Epoch: [6]  [1000/1251]  eta: 0:02:38  lr: 0.001360  min_lr: 0.001360  loss: 4.8299 (4.9174)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6478 (1.5933)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [6]  [1200/1251]  eta: 0:00:32  lr: 0.001393  min_lr: 0.001393  loss: 5.1096 (4.9194)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5222 (1.5858)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [6]  [1250/1251]  eta: 0:00:00  lr: 0.001400  min_lr: 0.001400  loss: 4.9918 (4.9149)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3867 (1.5759)  time: 0.5334  data: 0.0004  max mem: 54228
Epoch: [6] Total time: 0:13:07 (0.6296 s / it)
Averaged stats: lr: 0.001400  min_lr: 0.001400  loss: 4.9918 (4.8977)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3867 (1.5759)
Test:  [ 0/25]  eta: 0:02:30  loss: 1.7887 (1.7887)  acc1: 62.0000 (62.0000)  acc5: 85.6000 (85.6000)  time: 6.0032  data: 5.6358  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 1.7889 (1.8719)  acc1: 60.8000 (59.8182)  acc5: 86.4000 (84.2909)  time: 0.8175  data: 0.5127  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 2.3751 (2.1962)  acc1: 46.4000 (53.0476)  acc5: 74.0000 (78.4762)  time: 0.2986  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 2.4646 (2.2080)  acc1: 46.0000 (52.7200)  acc5: 72.8000 (78.2720)  time: 0.2985  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5306 s / it)
* Acc@1 52.624 Acc@5 78.502 loss 2.205
Accuracy of the model on the 50000 test images: 52.6%
Max accuracy: 52.62%
Epoch: [7]  [   0/1251]  eta: 1:13:44  lr: 0.001400  min_lr: 0.001400  loss: 3.5507 (3.5507)  weight_decay: 0.0500 (0.0500)  time: 3.5367  data: 2.9089  max mem: 54228
Epoch: [7]  [ 200/1251]  eta: 0:11:15  lr: 0.001432  min_lr: 0.001432  loss: 4.9051 (4.8632)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3730 (1.3665)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [7]  [ 400/1251]  eta: 0:09:01  lr: 0.001464  min_lr: 0.001464  loss: 5.0385 (4.8181)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3142 (1.3629)  time: 0.6294  data: 0.0004  max mem: 54228
Epoch: [7]  [ 600/1251]  eta: 0:06:52  lr: 0.001496  min_lr: 0.001496  loss: 4.9213 (4.7897)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3495 (1.3845)  time: 0.6297  data: 0.0004  max mem: 54228
Epoch: [7]  [ 800/1251]  eta: 0:04:45  lr: 0.001528  min_lr: 0.001528  loss: 5.0026 (4.7728)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3152 (1.3809)  time: 0.6296  data: 0.0004  max mem: 54228
Epoch: [7]  [1000/1251]  eta: 0:02:38  lr: 0.001561  min_lr: 0.001561  loss: 4.3929 (4.7545)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2641 (1.3827)  time: 0.6286  data: 0.0004  max mem: 54228
Epoch: [7]  [1200/1251]  eta: 0:00:32  lr: 0.001593  min_lr: 0.001593  loss: 4.8867 (4.7407)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2073 (1.3693)  time: 0.6290  data: 0.0004  max mem: 54228
Epoch: [7]  [1250/1251]  eta: 0:00:00  lr: 0.001600  min_lr: 0.001600  loss: 4.9378 (4.7383)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2159 (1.3624)  time: 0.5339  data: 0.0005  max mem: 54228
Epoch: [7] Total time: 0:13:09 (0.6308 s / it)
Averaged stats: lr: 0.001600  min_lr: 0.001600  loss: 4.9378 (4.7556)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2159 (1.3624)
Test:  [ 0/25]  eta: 0:02:34  loss: 1.5933 (1.5933)  acc1: 68.8000 (68.8000)  acc5: 88.0000 (88.0000)  time: 6.1989  data: 5.8563  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 1.6293 (1.7423)  acc1: 61.2000 (61.6000)  acc5: 88.0000 (86.3273)  time: 0.8364  data: 0.5328  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 2.1938 (2.0844)  acc1: 52.0000 (55.6762)  acc5: 78.0000 (80.8191)  time: 0.3000  data: 0.0003  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 2.2989 (2.0946)  acc1: 52.0000 (55.5520)  acc5: 75.6000 (80.2880)  time: 0.2999  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5428 s / it)
* Acc@1 55.602 Acc@5 80.660 loss 2.085
Accuracy of the model on the 50000 test images: 55.6%
Max accuracy: 55.60%
Epoch: [8]  [   0/1251]  eta: 1:11:20  lr: 0.001600  min_lr: 0.001600  loss: 5.1118 (5.1118)  weight_decay: 0.0500 (0.0500)  time: 3.4219  data: 2.7825  max mem: 54228
Epoch: [8]  [ 200/1251]  eta: 0:11:16  lr: 0.001632  min_lr: 0.001632  loss: 4.8520 (4.7162)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1508 (1.2504)  time: 0.6291  data: 0.0004  max mem: 54228
Epoch: [8]  [ 400/1251]  eta: 0:09:02  lr: 0.001664  min_lr: 0.001664  loss: 4.6912 (4.6805)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2746 (1.2677)  time: 0.6292  data: 0.0005  max mem: 54228
Epoch: [8]  [ 600/1251]  eta: 0:06:53  lr: 0.001696  min_lr: 0.001696  loss: 4.8589 (4.6968)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0418 (1.2488)  time: 0.6296  data: 0.0004  max mem: 54228
Epoch: [8]  [ 800/1251]  eta: 0:04:45  lr: 0.001728  min_lr: 0.001728  loss: 4.5691 (4.6743)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0679 (1.2304)  time: 0.6334  data: 0.0004  max mem: 54228
Epoch: [8]  [1000/1251]  eta: 0:02:38  lr: 0.001761  min_lr: 0.001761  loss: 4.2105 (4.6648)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1097 (1.2175)  time: 0.6288  data: 0.0005  max mem: 54228
Epoch: [8]  [1200/1251]  eta: 0:00:32  lr: 0.001793  min_lr: 0.001793  loss: 4.7789 (4.6639)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1238 (1.2145)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [8]  [1250/1251]  eta: 0:00:00  lr: 0.001800  min_lr: 0.001800  loss: 4.4441 (4.6598)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2694 (1.2186)  time: 0.5333  data: 0.0007  max mem: 54228
Epoch: [8] Total time: 0:13:09 (0.6309 s / it)
Averaged stats: lr: 0.001800  min_lr: 0.001800  loss: 4.4441 (4.6387)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2694 (1.2186)
Test:  [ 0/25]  eta: 0:02:27  loss: 1.4167 (1.4167)  acc1: 70.4000 (70.4000)  acc5: 90.4000 (90.4000)  time: 5.9079  data: 5.5763  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 1.4573 (1.6230)  acc1: 66.8000 (65.7455)  acc5: 90.0000 (88.1818)  time: 0.8100  data: 0.5074  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 2.0570 (1.9345)  acc1: 55.6000 (59.1048)  acc5: 81.6000 (83.1429)  time: 0.2999  data: 0.0003  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 2.1152 (1.9450)  acc1: 53.2000 (58.7840)  acc5: 78.0000 (82.8800)  time: 0.2998  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5293 s / it)
* Acc@1 58.630 Acc@5 82.616 loss 1.948
Accuracy of the model on the 50000 test images: 58.6%
Max accuracy: 58.63%
Epoch: [9]  [   0/1251]  eta: 0:58:41  lr: 0.001800  min_lr: 0.001800  loss: 3.7232 (3.7232)  weight_decay: 0.0500 (0.0500)  time: 2.8153  data: 2.1717  max mem: 54228
Epoch: [9]  [ 200/1251]  eta: 0:11:12  lr: 0.001832  min_lr: 0.001832  loss: 4.8140 (4.5925)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0067 (1.1826)  time: 0.6291  data: 0.0005  max mem: 54228
Epoch: [9]  [ 400/1251]  eta: 0:08:59  lr: 0.001864  min_lr: 0.001864  loss: 4.4811 (4.5911)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1545 (1.1587)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [9]  [ 600/1251]  eta: 0:06:52  lr: 0.001896  min_lr: 0.001896  loss: 4.6123 (4.5888)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1549 (1.1327)  time: 0.6285  data: 0.0004  max mem: 54228
Epoch: [9]  [ 800/1251]  eta: 0:04:44  lr: 0.001929  min_lr: 0.001929  loss: 4.4740 (4.5719)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0224 (1.1086)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [9]  [1000/1251]  eta: 0:02:38  lr: 0.001961  min_lr: 0.001961  loss: 4.8163 (4.5551)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0710 (1.1066)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [9]  [1200/1251]  eta: 0:00:32  lr: 0.001993  min_lr: 0.001993  loss: 4.6100 (4.5405)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1394 (1.1110)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [9]  [1250/1251]  eta: 0:00:00  lr: 0.002000  min_lr: 0.002000  loss: 4.6991 (4.5439)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0166 (1.1085)  time: 0.5335  data: 0.0005  max mem: 54228
Epoch: [9] Total time: 0:13:08 (0.6299 s / it)
Averaged stats: lr: 0.002000  min_lr: 0.002000  loss: 4.6991 (4.5537)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0166 (1.1085)
Test:  [ 0/25]  eta: 0:02:36  loss: 1.4917 (1.4917)  acc1: 72.0000 (72.0000)  acc5: 90.4000 (90.4000)  time: 6.2587  data: 5.9249  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 1.5072 (1.6136)  acc1: 64.4000 (67.1636)  acc5: 91.2000 (89.8909)  time: 0.8414  data: 0.5389  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 2.0184 (1.9079)  acc1: 58.8000 (61.1238)  acc5: 82.4000 (84.8191)  time: 0.2997  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 2.0799 (1.9227)  acc1: 58.0000 (60.7680)  acc5: 80.8000 (84.5120)  time: 0.2996  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5437 s / it)
* Acc@1 60.686 Acc@5 84.380 loss 1.914
Accuracy of the model on the 50000 test images: 60.7%
Max accuracy: 60.69%
Epoch: [10]  [   0/1251]  eta: 1:15:09  lr: 0.002000  min_lr: 0.002000  loss: 5.0596 (5.0596)  weight_decay: 0.0500 (0.0500)  time: 3.6045  data: 2.9592  max mem: 54228
Epoch: [10]  [ 200/1251]  eta: 0:11:15  lr: 0.002032  min_lr: 0.002032  loss: 4.6187 (4.4805)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8578 (0.9744)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [10]  [ 400/1251]  eta: 0:09:01  lr: 0.002064  min_lr: 0.002064  loss: 4.7396 (4.5233)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9592 (0.9898)  time: 0.6389  data: 0.0005  max mem: 54228
Epoch: [10]  [ 600/1251]  eta: 0:06:52  lr: 0.002096  min_lr: 0.002096  loss: 4.2584 (4.5038)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8195 (0.9702)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [10]  [ 800/1251]  eta: 0:04:45  lr: 0.002129  min_lr: 0.002129  loss: 4.6331 (4.5126)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8753 (0.9583)  time: 0.6334  data: 0.0005  max mem: 54228
Epoch: [10]  [1000/1251]  eta: 0:02:38  lr: 0.002161  min_lr: 0.002161  loss: 4.6591 (4.4960)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0742 (0.9749)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [10]  [1200/1251]  eta: 0:00:32  lr: 0.002193  min_lr: 0.002193  loss: 4.0023 (4.4915)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9579 (0.9753)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [10]  [1250/1251]  eta: 0:00:00  lr: 0.002200  min_lr: 0.002200  loss: 4.2663 (4.4882)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9715 (0.9775)  time: 0.5332  data: 0.0006  max mem: 54228
Epoch: [10] Total time: 0:13:08 (0.6301 s / it)
Averaged stats: lr: 0.002200  min_lr: 0.002200  loss: 4.2663 (4.4573)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9715 (0.9775)
Test:  [ 0/25]  eta: 0:02:44  loss: 1.1814 (1.1814)  acc1: 75.2000 (75.2000)  acc5: 92.4000 (92.4000)  time: 6.5685  data: 6.2328  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 1.3406 (1.4130)  acc1: 72.4000 (69.9273)  acc5: 92.0000 (90.6909)  time: 0.8696  data: 0.5669  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.8144 (1.7198)  acc1: 61.2000 (63.2952)  acc5: 82.8000 (86.0381)  time: 0.2996  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.9291 (1.7317)  acc1: 57.2000 (62.8800)  acc5: 81.2000 (85.7920)  time: 0.2996  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5548 s / it)
* Acc@1 62.638 Acc@5 85.508 loss 1.735
Accuracy of the model on the 50000 test images: 62.6%
Max accuracy: 62.64%
Epoch: [11]  [   0/1251]  eta: 1:12:54  lr: 0.002200  min_lr: 0.002200  loss: 5.0106 (5.0106)  weight_decay: 0.0500 (0.0500)  time: 3.4969  data: 2.8565  max mem: 54228
Epoch: [11]  [ 200/1251]  eta: 0:11:14  lr: 0.002232  min_lr: 0.002232  loss: 4.6643 (4.3919)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8962 (0.9396)  time: 0.6279  data: 0.0005  max mem: 54228
Epoch: [11]  [ 400/1251]  eta: 0:09:01  lr: 0.002264  min_lr: 0.002264  loss: 4.4554 (4.4097)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8035 (0.9115)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [11]  [ 600/1251]  eta: 0:06:52  lr: 0.002297  min_lr: 0.002297  loss: 4.3376 (4.4148)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8356 (0.9008)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [11]  [ 800/1251]  eta: 0:04:45  lr: 0.002329  min_lr: 0.002329  loss: 4.4092 (4.3992)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9162 (0.9019)  time: 0.6279  data: 0.0005  max mem: 54228
Epoch: [11]  [1000/1251]  eta: 0:02:38  lr: 0.002361  min_lr: 0.002361  loss: 4.5608 (4.4045)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8357 (0.9005)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [11]  [1200/1251]  eta: 0:00:32  lr: 0.002393  min_lr: 0.002393  loss: 4.2678 (4.3951)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8937 (0.8972)  time: 0.6278  data: 0.0005  max mem: 54228
Epoch: [11]  [1250/1251]  eta: 0:00:00  lr: 0.002400  min_lr: 0.002400  loss: 4.5203 (4.3971)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8937 (0.8938)  time: 0.5329  data: 0.0005  max mem: 54228
Epoch: [11] Total time: 0:13:08 (0.6300 s / it)
Averaged stats: lr: 0.002400  min_lr: 0.002400  loss: 4.5203 (4.3837)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8937 (0.8938)
Test:  [ 0/25]  eta: 0:02:40  loss: 1.3342 (1.3342)  acc1: 76.0000 (76.0000)  acc5: 90.0000 (90.0000)  time: 6.4210  data: 6.0643  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 1.3776 (1.5050)  acc1: 74.8000 (70.8727)  acc5: 92.8000 (91.2727)  time: 0.8562  data: 0.5516  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.9129 (1.8168)  acc1: 60.4000 (64.5714)  acc5: 86.4000 (86.5524)  time: 0.2996  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 2.0354 (1.8198)  acc1: 59.6000 (64.2880)  acc5: 82.8000 (86.5120)  time: 0.2996  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5485 s / it)
* Acc@1 63.934 Acc@5 86.348 loss 1.811
Accuracy of the model on the 50000 test images: 63.9%
Max accuracy: 63.93%
Epoch: [12]  [   0/1251]  eta: 1:10:58  lr: 0.002400  min_lr: 0.002400  loss: 4.3957 (4.3957)  weight_decay: 0.0500 (0.0500)  time: 3.4043  data: 2.7624  max mem: 54228
Epoch: [12]  [ 200/1251]  eta: 0:11:15  lr: 0.002432  min_lr: 0.002432  loss: 4.2841 (4.3855)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9772 (0.8980)  time: 0.6277  data: 0.0005  max mem: 54228
Epoch: [12]  [ 400/1251]  eta: 0:09:01  lr: 0.002464  min_lr: 0.002464  loss: 4.4131 (4.3508)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7892 (0.8762)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [12]  [ 600/1251]  eta: 0:06:52  lr: 0.002497  min_lr: 0.002497  loss: 4.1829 (4.3473)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7618 (0.8531)  time: 0.6380  data: 0.0005  max mem: 54228
Epoch: [12]  [ 800/1251]  eta: 0:04:45  lr: 0.002529  min_lr: 0.002529  loss: 4.4051 (4.3503)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8275 (0.8649)  time: 0.6293  data: 0.0004  max mem: 54228
Epoch: [12]  [1000/1251]  eta: 0:02:38  lr: 0.002561  min_lr: 0.002561  loss: 4.3611 (4.3272)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8681 (0.8656)  time: 0.6293  data: 0.0005  max mem: 54228
Epoch: [12]  [1200/1251]  eta: 0:00:32  lr: 0.002593  min_lr: 0.002593  loss: 4.4837 (4.3268)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7792 (0.8521)  time: 0.6288  data: 0.0005  max mem: 54228
Epoch: [12]  [1250/1251]  eta: 0:00:00  lr: 0.002600  min_lr: 0.002600  loss: 4.2267 (4.3238)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8225 (0.8535)  time: 0.5335  data: 0.0006  max mem: 54228
Epoch: [12] Total time: 0:13:08 (0.6306 s / it)
Averaged stats: lr: 0.002600  min_lr: 0.002600  loss: 4.2267 (4.3151)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8225 (0.8535)
Test:  [ 0/25]  eta: 0:02:10  loss: 1.1474 (1.1474)  acc1: 80.0000 (80.0000)  acc5: 93.6000 (93.6000)  time: 5.2173  data: 4.8727  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 1.3077 (1.3672)  acc1: 71.2000 (70.9818)  acc5: 93.6000 (91.9636)  time: 0.8286  data: 0.5250  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.7173 (1.6708)  acc1: 63.2000 (65.0095)  acc5: 85.6000 (87.1810)  time: 0.3447  data: 0.0451  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.7970 (1.6862)  acc1: 59.6000 (64.6720)  acc5: 84.0000 (87.0400)  time: 0.2997  data: 0.0002  max mem: 54228
Test: Total time: 0:00:13 (0.5365 s / it)
* Acc@1 64.714 Acc@5 87.022 loss 1.686
Accuracy of the model on the 50000 test images: 64.7%
Max accuracy: 64.71%
Epoch: [13]  [   0/1251]  eta: 1:03:05  lr: 0.002600  min_lr: 0.002600  loss: 3.7311 (3.7311)  weight_decay: 0.0500 (0.0500)  time: 3.0264  data: 2.3893  max mem: 54228
Epoch: [13]  [ 200/1251]  eta: 0:11:13  lr: 0.002632  min_lr: 0.002632  loss: 4.4510 (4.2246)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7068 (0.7378)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [13]  [ 400/1251]  eta: 0:08:59  lr: 0.002665  min_lr: 0.002665  loss: 3.9819 (4.2287)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8409 (0.7968)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [13]  [ 600/1251]  eta: 0:06:51  lr: 0.002697  min_lr: 0.002697  loss: 4.3122 (4.2588)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7173 (0.7800)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [13]  [ 800/1251]  eta: 0:04:44  lr: 0.002729  min_lr: 0.002729  loss: 4.1439 (4.2933)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8091 (0.7933)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [13]  [1000/1251]  eta: 0:02:38  lr: 0.002761  min_lr: 0.002761  loss: 4.2262 (4.2768)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6994 (0.7786)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [13]  [1200/1251]  eta: 0:00:32  lr: 0.002793  min_lr: 0.002793  loss: 4.3473 (4.2874)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7230 (0.7761)  time: 0.6289  data: 0.0004  max mem: 54228
Epoch: [13]  [1250/1251]  eta: 0:00:00  lr: 0.002800  min_lr: 0.002800  loss: 4.3250 (4.2869)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7727 (0.7782)  time: 0.5337  data: 0.0005  max mem: 54228
Epoch: [13] Total time: 0:13:07 (0.6297 s / it)
Averaged stats: lr: 0.002800  min_lr: 0.002800  loss: 4.3250 (4.2757)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7727 (0.7782)
Test:  [ 0/25]  eta: 0:02:33  loss: 1.1742 (1.1742)  acc1: 74.8000 (74.8000)  acc5: 92.4000 (92.4000)  time: 6.1402  data: 5.8075  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 1.1742 (1.3120)  acc1: 74.8000 (72.1455)  acc5: 92.4000 (92.4364)  time: 0.8310  data: 0.5283  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.6807 (1.6005)  acc1: 63.2000 (66.0000)  acc5: 86.4000 (87.8476)  time: 0.3000  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.7842 (1.6126)  acc1: 62.4000 (65.7600)  acc5: 84.4000 (87.8240)  time: 0.2999  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5387 s / it)
* Acc@1 66.008 Acc@5 87.834 loss 1.602
Accuracy of the model on the 50000 test images: 66.0%
Max accuracy: 66.01%
Epoch: [14]  [   0/1251]  eta: 1:15:17  lr: 0.002800  min_lr: 0.002800  loss: 4.4731 (4.4731)  weight_decay: 0.0500 (0.0500)  time: 3.6110  data: 2.9649  max mem: 54228
Epoch: [14]  [ 200/1251]  eta: 0:11:15  lr: 0.002833  min_lr: 0.002833  loss: 4.2878 (4.1504)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7299 (0.7542)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [14]  [ 400/1251]  eta: 0:09:01  lr: 0.002865  min_lr: 0.002865  loss: 4.4692 (4.1694)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6402 (0.7265)  time: 0.6287  data: 0.0004  max mem: 54228
Epoch: [14]  [ 600/1251]  eta: 0:06:52  lr: 0.002897  min_lr: 0.002897  loss: 4.1253 (4.1814)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6442 (0.7127)  time: 0.6288  data: 0.0004  max mem: 54228
Epoch: [14]  [ 800/1251]  eta: 0:04:45  lr: 0.002929  min_lr: 0.002929  loss: 4.4006 (4.2003)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7241 (0.7095)  time: 0.6335  data: 0.0004  max mem: 54228
Epoch: [14]  [1000/1251]  eta: 0:02:38  lr: 0.002961  min_lr: 0.002961  loss: 4.1508 (4.1955)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6249 (0.7112)  time: 0.6286  data: 0.0004  max mem: 54228
Epoch: [14]  [1200/1251]  eta: 0:00:32  lr: 0.002993  min_lr: 0.002993  loss: 4.1225 (4.1943)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7886 (0.7146)  time: 0.6285  data: 0.0004  max mem: 54228
Epoch: [14]  [1250/1251]  eta: 0:00:00  lr: 0.003000  min_lr: 0.003000  loss: 4.5098 (4.1884)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7254 (0.7137)  time: 0.5336  data: 0.0005  max mem: 54228
Epoch: [14] Total time: 0:13:08 (0.6304 s / it)
Averaged stats: lr: 0.003000  min_lr: 0.003000  loss: 4.5098 (4.2094)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7254 (0.7137)
Test:  [ 0/25]  eta: 0:02:36  loss: 1.1484 (1.1484)  acc1: 76.8000 (76.8000)  acc5: 93.6000 (93.6000)  time: 6.2467  data: 5.9181  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 1.2652 (1.3265)  acc1: 72.4000 (72.3273)  acc5: 93.6000 (92.6182)  time: 0.8405  data: 0.5383  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.6841 (1.5854)  acc1: 63.6000 (67.4476)  acc5: 88.0000 (88.8191)  time: 0.2997  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.8282 (1.5936)  acc1: 63.6000 (66.9280)  acc5: 86.4000 (88.6720)  time: 0.2997  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5437 s / it)
* Acc@1 66.684 Acc@5 88.194 loss 1.595
Accuracy of the model on the 50000 test images: 66.7%
Max accuracy: 66.68%
Epoch: [15]  [   0/1251]  eta: 1:04:36  lr: 0.003000  min_lr: 0.003000  loss: 3.3382 (3.3382)  weight_decay: 0.0500 (0.0500)  time: 3.0988  data: 2.4676  max mem: 54228
Epoch: [15]  [ 200/1251]  eta: 0:11:14  lr: 0.003033  min_lr: 0.003033  loss: 4.0950 (4.2136)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7418 (0.7236)  time: 0.6288  data: 0.0004  max mem: 54228
Epoch: [15]  [ 400/1251]  eta: 0:09:00  lr: 0.003065  min_lr: 0.003065  loss: 4.3977 (4.1886)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6442 (0.7100)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [15]  [ 600/1251]  eta: 0:06:52  lr: 0.003097  min_lr: 0.003097  loss: 4.2047 (4.1774)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6548 (0.7060)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [15]  [ 800/1251]  eta: 0:04:45  lr: 0.003129  min_lr: 0.003129  loss: 4.5124 (4.1750)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6387 (0.7060)  time: 0.6287  data: 0.0004  max mem: 54228
Epoch: [15]  [1000/1251]  eta: 0:02:38  lr: 0.003161  min_lr: 0.003161  loss: 4.4154 (4.1991)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6487 (0.6997)  time: 0.6289  data: 0.0004  max mem: 54228
Epoch: [15]  [1200/1251]  eta: 0:00:32  lr: 0.003193  min_lr: 0.003193  loss: 3.9618 (4.1912)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5724 (0.6872)  time: 0.6351  data: 0.0005  max mem: 54228
Epoch: [15]  [1250/1251]  eta: 0:00:00  lr: 0.003200  min_lr: 0.003200  loss: 4.1273 (4.1908)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6792 (0.6911)  time: 0.5336  data: 0.0006  max mem: 54228
Epoch: [15] Total time: 0:13:08 (0.6302 s / it)
Averaged stats: lr: 0.003200  min_lr: 0.003200  loss: 4.1273 (4.1808)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6792 (0.6911)
Test:  [ 0/25]  eta: 0:02:48  loss: 1.1341 (1.1341)  acc1: 76.8000 (76.8000)  acc5: 94.8000 (94.8000)  time: 6.7503  data: 6.4160  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 1.1989 (1.3170)  acc1: 72.4000 (72.8727)  acc5: 93.6000 (92.5455)  time: 0.8860  data: 0.5836  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 1.6953 (1.5634)  acc1: 62.8000 (67.7905)  acc5: 88.0000 (88.8191)  time: 0.2995  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.7583 (1.5707)  acc1: 62.8000 (67.5520)  acc5: 85.2000 (88.7040)  time: 0.2995  data: 0.0001  max mem: 54228
Test: Total time: 0:00:14 (0.5627 s / it)
* Acc@1 67.574 Acc@5 88.682 loss 1.565
Accuracy of the model on the 50000 test images: 67.6%
Max accuracy: 67.57%
Epoch: [16]  [   0/1251]  eta: 1:20:25  lr: 0.003201  min_lr: 0.003201  loss: 4.2488 (4.2488)  weight_decay: 0.0500 (0.0500)  time: 3.8575  data: 3.2327  max mem: 54228
Epoch: [16]  [ 200/1251]  eta: 0:11:17  lr: 0.003233  min_lr: 0.003233  loss: 4.1245 (4.1804)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6058 (0.6778)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [16]  [ 400/1251]  eta: 0:09:02  lr: 0.003265  min_lr: 0.003265  loss: 3.9837 (4.1711)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6109 (0.6575)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [16]  [ 600/1251]  eta: 0:06:53  lr: 0.003297  min_lr: 0.003297  loss: 4.3550 (4.1542)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6750 (0.6632)  time: 0.6285  data: 0.0004  max mem: 54228
Epoch: [16]  [ 800/1251]  eta: 0:04:45  lr: 0.003329  min_lr: 0.003329  loss: 4.3461 (4.1699)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6029 (0.6605)  time: 0.6331  data: 0.0005  max mem: 54228
Epoch: [16]  [1000/1251]  eta: 0:02:38  lr: 0.003361  min_lr: 0.003361  loss: 3.8361 (4.1572)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5480 (0.6557)  time: 0.6289  data: 0.0005  max mem: 54228
Epoch: [16]  [1200/1251]  eta: 0:00:32  lr: 0.003393  min_lr: 0.003393  loss: 4.5077 (4.1525)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6369 (0.6505)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [16]  [1250/1251]  eta: 0:00:00  lr: 0.003400  min_lr: 0.003400  loss: 4.2878 (4.1513)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6618 (0.6531)  time: 0.5332  data: 0.0005  max mem: 54228
Epoch: [16] Total time: 0:13:09 (0.6308 s / it)
Averaged stats: lr: 0.003400  min_lr: 0.003400  loss: 4.2878 (4.1407)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6618 (0.6531)
Test:  [ 0/25]  eta: 0:02:34  loss: 1.0845 (1.0845)  acc1: 79.2000 (79.2000)  acc5: 93.6000 (93.6000)  time: 6.1961  data: 5.8788  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 1.2772 (1.3138)  acc1: 73.6000 (73.3455)  acc5: 93.2000 (92.4727)  time: 0.8361  data: 0.5349  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.6180 (1.5561)  acc1: 65.6000 (68.3238)  acc5: 88.8000 (89.1810)  time: 0.2999  data: 0.0003  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.6207 (1.5640)  acc1: 66.8000 (68.2560)  acc5: 87.2000 (89.1200)  time: 0.2997  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5413 s / it)
* Acc@1 67.910 Acc@5 89.106 loss 1.558
Accuracy of the model on the 50000 test images: 67.9%
Max accuracy: 67.91%
Epoch: [17]  [   0/1251]  eta: 1:01:34  lr: 0.003401  min_lr: 0.003401  loss: 4.5218 (4.5218)  weight_decay: 0.0500 (0.0500)  time: 2.9533  data: 2.3153  max mem: 54228
Epoch: [17]  [ 200/1251]  eta: 0:11:12  lr: 0.003433  min_lr: 0.003433  loss: 4.1649 (4.1306)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6283 (0.6484)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [17]  [ 400/1251]  eta: 0:09:00  lr: 0.003465  min_lr: 0.003465  loss: 4.4281 (4.1054)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6293 (0.6450)  time: 0.6288  data: 0.0004  max mem: 54228
Epoch: [17]  [ 600/1251]  eta: 0:06:52  lr: 0.003497  min_lr: 0.003497  loss: 4.3989 (4.1034)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6191 (0.6385)  time: 0.6286  data: 0.0004  max mem: 54228
Epoch: [17]  [ 800/1251]  eta: 0:04:45  lr: 0.003529  min_lr: 0.003529  loss: 4.3055 (4.0967)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5366 (0.6385)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [17]  [1000/1251]  eta: 0:02:38  lr: 0.003561  min_lr: 0.003561  loss: 4.3207 (4.1085)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6235 (0.6327)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [17]  [1200/1251]  eta: 0:00:32  lr: 0.003593  min_lr: 0.003593  loss: 4.3983 (4.1073)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5638 (0.6263)  time: 0.6285  data: 0.0004  max mem: 54228
Epoch: [17]  [1250/1251]  eta: 0:00:00  lr: 0.003600  min_lr: 0.003600  loss: 4.3350 (4.1067)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6158 (0.6271)  time: 0.5333  data: 0.0006  max mem: 54228
Epoch: [17] Total time: 0:13:08 (0.6301 s / it)
Averaged stats: lr: 0.003600  min_lr: 0.003600  loss: 4.3350 (4.1076)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6158 (0.6271)
Test:  [ 0/25]  eta: 0:02:44  loss: 0.9908 (0.9908)  acc1: 78.8000 (78.8000)  acc5: 94.4000 (94.4000)  time: 6.5835  data: 6.2610  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 1.1701 (1.2099)  acc1: 76.0000 (74.5455)  acc5: 94.4000 (93.0909)  time: 0.8709  data: 0.5695  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.5280 (1.4817)  acc1: 65.6000 (69.1429)  acc5: 87.6000 (89.5810)  time: 0.2996  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.6931 (1.4936)  acc1: 65.6000 (68.8000)  acc5: 86.8000 (89.5520)  time: 0.2995  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5552 s / it)
* Acc@1 68.900 Acc@5 89.448 loss 1.489
Accuracy of the model on the 50000 test images: 68.9%
Max accuracy: 68.90%
Epoch: [18]  [   0/1251]  eta: 1:10:20  lr: 0.003601  min_lr: 0.003601  loss: 4.2473 (4.2473)  weight_decay: 0.0500 (0.0500)  time: 3.3739  data: 2.7323  max mem: 54228
Epoch: [18]  [ 200/1251]  eta: 0:11:15  lr: 0.003633  min_lr: 0.003633  loss: 4.3011 (4.1001)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5324 (0.6131)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [18]  [ 400/1251]  eta: 0:09:01  lr: 0.003665  min_lr: 0.003665  loss: 4.3047 (4.0764)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6012 (0.6180)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [18]  [ 600/1251]  eta: 0:06:52  lr: 0.003697  min_lr: 0.003697  loss: 4.2664 (4.0832)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5560 (0.6135)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [18]  [ 800/1251]  eta: 0:04:45  lr: 0.003729  min_lr: 0.003729  loss: 4.3054 (4.0828)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6075 (0.6200)  time: 0.6332  data: 0.0005  max mem: 54228
Epoch: [18]  [1000/1251]  eta: 0:02:38  lr: 0.003761  min_lr: 0.003761  loss: 4.2803 (4.0764)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5269 (0.6070)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [18]  [1200/1251]  eta: 0:00:32  lr: 0.003793  min_lr: 0.003793  loss: 4.2767 (4.0759)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5082 (0.6000)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [18]  [1250/1251]  eta: 0:00:00  lr: 0.003800  min_lr: 0.003800  loss: 3.7211 (4.0701)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5687 (0.5996)  time: 0.5385  data: 0.0005  max mem: 54228
Epoch: [18] Total time: 0:13:08 (0.6302 s / it)
Averaged stats: lr: 0.003800  min_lr: 0.003800  loss: 3.7211 (4.0645)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5687 (0.5996)
Test:  [ 0/25]  eta: 0:02:45  loss: 0.9502 (0.9502)  acc1: 81.6000 (81.6000)  acc5: 96.4000 (96.4000)  time: 6.6277  data: 6.2947  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 1.2292 (1.2107)  acc1: 77.2000 (75.0182)  acc5: 94.4000 (93.7818)  time: 0.8749  data: 0.5726  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 1.5109 (1.4485)  acc1: 66.8000 (69.6952)  acc5: 88.8000 (90.0000)  time: 0.2995  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.5799 (1.4554)  acc1: 65.6000 (69.2960)  acc5: 86.8000 (89.8400)  time: 0.2993  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5566 s / it)
* Acc@1 69.242 Acc@5 89.816 loss 1.453
Accuracy of the model on the 50000 test images: 69.2%
Max accuracy: 69.24%
Epoch: [19]  [   0/1251]  eta: 1:11:29  lr: 0.003801  min_lr: 0.003801  loss: 4.3482 (4.3482)  weight_decay: 0.0500 (0.0500)  time: 3.4287  data: 2.7875  max mem: 54228
Epoch: [19]  [ 200/1251]  eta: 0:11:14  lr: 0.003833  min_lr: 0.003833  loss: 4.1872 (4.0873)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5211 (0.6202)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [19]  [ 400/1251]  eta: 0:09:00  lr: 0.003865  min_lr: 0.003865  loss: 4.1692 (4.0625)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5686 (0.5834)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [19]  [ 600/1251]  eta: 0:06:52  lr: 0.003897  min_lr: 0.003897  loss: 4.3561 (4.0686)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5320 (0.5734)  time: 0.6288  data: 0.0005  max mem: 54228
Epoch: [19]  [ 800/1251]  eta: 0:04:45  lr: 0.003929  min_lr: 0.003929  loss: 4.1915 (4.0499)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5450 (0.5679)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [19]  [1000/1251]  eta: 0:02:38  lr: 0.003961  min_lr: 0.003961  loss: 3.4407 (4.0446)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4825 (0.5616)  time: 0.6288  data: 0.0005  max mem: 54228
Epoch: [19]  [1200/1251]  eta: 0:00:32  lr: 0.003993  min_lr: 0.003993  loss: 3.7832 (4.0372)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5454 (0.5682)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [19]  [1250/1251]  eta: 0:00:00  lr: 0.004000  min_lr: 0.004000  loss: 4.2793 (4.0400)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6139 (0.5746)  time: 0.5330  data: 0.0006  max mem: 54228
Epoch: [19] Total time: 0:13:08 (0.6301 s / it)
Averaged stats: lr: 0.004000  min_lr: 0.004000  loss: 4.2793 (4.0479)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6139 (0.5746)
Test:  [ 0/25]  eta: 0:02:33  loss: 1.0458 (1.0458)  acc1: 79.2000 (79.2000)  acc5: 93.6000 (93.6000)  time: 6.1277  data: 5.7904  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 1.2018 (1.2343)  acc1: 74.8000 (74.8364)  acc5: 93.6000 (93.1636)  time: 0.8636  data: 0.5609  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.5440 (1.4920)  acc1: 67.2000 (69.7714)  acc5: 88.8000 (89.7714)  time: 0.3183  data: 0.0190  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.6753 (1.5078)  acc1: 66.0000 (69.3440)  acc5: 86.8000 (89.5840)  time: 0.2994  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5526 s / it)
* Acc@1 69.076 Acc@5 89.842 loss 1.496
Accuracy of the model on the 50000 test images: 69.1%
Max accuracy: 69.24%
Epoch: [20]  [   0/1251]  eta: 1:29:24  lr: 0.004000  min_lr: 0.004000  loss: 4.6379 (4.6379)  weight_decay: 0.0500 (0.0500)  time: 4.2878  data: 1.8215  max mem: 54228
Epoch: [20]  [ 200/1251]  eta: 0:11:19  lr: 0.004000  min_lr: 0.004000  loss: 4.2375 (4.0640)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4959 (0.5456)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [20]  [ 400/1251]  eta: 0:09:03  lr: 0.004000  min_lr: 0.004000  loss: 4.1186 (4.0662)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4904 (0.5418)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [20]  [ 600/1251]  eta: 0:06:53  lr: 0.004000  min_lr: 0.004000  loss: 4.3055 (4.0538)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5766 (0.5632)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [20]  [ 800/1251]  eta: 0:04:45  lr: 0.004000  min_lr: 0.004000  loss: 4.3152 (4.0300)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5885 (0.5641)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [20]  [1000/1251]  eta: 0:02:38  lr: 0.004000  min_lr: 0.004000  loss: 3.7748 (4.0182)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5542 (0.5708)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [20]  [1200/1251]  eta: 0:00:32  lr: 0.004000  min_lr: 0.004000  loss: 3.7305 (4.0071)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5417 (0.5687)  time: 0.6286  data: 0.0004  max mem: 54228
Epoch: [20]  [1250/1251]  eta: 0:00:00  lr: 0.004000  min_lr: 0.004000  loss: 4.1934 (4.0097)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6029 (0.5708)  time: 0.5337  data: 0.0006  max mem: 54228
Epoch: [20] Total time: 0:13:09 (0.6310 s / it)
Averaged stats: lr: 0.004000  min_lr: 0.004000  loss: 4.1934 (4.0046)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6029 (0.5708)
Test:  [ 0/25]  eta: 0:02:28  loss: 1.0210 (1.0210)  acc1: 80.8000 (80.8000)  acc5: 96.8000 (96.8000)  time: 5.9583  data: 5.6058  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 1.2714 (1.2772)  acc1: 75.2000 (74.1091)  acc5: 94.8000 (94.1091)  time: 0.8148  data: 0.5100  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.5679 (1.5063)  acc1: 67.2000 (69.8286)  acc5: 88.8000 (90.7429)  time: 0.3004  data: 0.0003  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.6309 (1.5153)  acc1: 67.2000 (69.6640)  acc5: 87.6000 (90.6400)  time: 0.3005  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5339 s / it)
* Acc@1 69.842 Acc@5 90.346 loss 1.517
Accuracy of the model on the 50000 test images: 69.8%
Max accuracy: 69.84%
Epoch: [21]  [   0/1251]  eta: 1:20:08  lr: 0.004000  min_lr: 0.004000  loss: 3.0316 (3.0316)  weight_decay: 0.0500 (0.0500)  time: 3.8441  data: 3.2067  max mem: 54228
Epoch: [21]  [ 200/1251]  eta: 0:11:18  lr: 0.004000  min_lr: 0.004000  loss: 3.9460 (3.9244)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5995 (0.6015)  time: 0.6386  data: 0.0005  max mem: 54228
Epoch: [21]  [ 400/1251]  eta: 0:09:02  lr: 0.004000  min_lr: 0.004000  loss: 4.4331 (3.9783)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5317 (0.5765)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [21]  [ 600/1251]  eta: 0:06:52  lr: 0.004000  min_lr: 0.004000  loss: 4.2862 (3.9968)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4560 (0.5636)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [21]  [ 800/1251]  eta: 0:04:45  lr: 0.004000  min_lr: 0.004000  loss: 4.0340 (4.0041)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5036 (0.5562)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [21]  [1000/1251]  eta: 0:02:38  lr: 0.004000  min_lr: 0.004000  loss: 3.7038 (3.9949)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5214 (0.5545)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [21]  [1200/1251]  eta: 0:00:32  lr: 0.004000  min_lr: 0.004000  loss: 4.0098 (3.9858)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5853 (0.5549)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [21]  [1250/1251]  eta: 0:00:00  lr: 0.003999  min_lr: 0.003999  loss: 4.1800 (3.9871)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5576 (0.5573)  time: 0.5333  data: 0.0006  max mem: 54228
Epoch: [21] Total time: 0:13:08 (0.6304 s / it)
Averaged stats: lr: 0.003999  min_lr: 0.003999  loss: 4.1800 (3.9657)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5576 (0.5573)
Test:  [ 0/25]  eta: 0:02:36  loss: 0.9469 (0.9469)  acc1: 82.0000 (82.0000)  acc5: 95.6000 (95.6000)  time: 6.2704  data: 5.9384  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 1.0674 (1.1474)  acc1: 76.8000 (74.7273)  acc5: 93.6000 (93.9273)  time: 0.8421  data: 0.5401  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.4842 (1.3974)  acc1: 64.8000 (70.3238)  acc5: 88.4000 (90.4762)  time: 0.2991  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.6232 (1.4097)  acc1: 64.8000 (70.0160)  acc5: 87.6000 (90.3360)  time: 0.2991  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5418 s / it)
* Acc@1 70.208 Acc@5 90.654 loss 1.399
Accuracy of the model on the 50000 test images: 70.2%
Max accuracy: 70.21%
Epoch: [22]  [   0/1251]  eta: 1:19:20  lr: 0.003999  min_lr: 0.003999  loss: 4.5473 (4.5473)  weight_decay: 0.0500 (0.0500)  time: 3.8053  data: 3.1727  max mem: 54228
Epoch: [22]  [ 200/1251]  eta: 0:11:17  lr: 0.003999  min_lr: 0.003999  loss: 3.4256 (3.8708)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5038 (0.5647)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [22]  [ 400/1251]  eta: 0:09:01  lr: 0.003999  min_lr: 0.003999  loss: 4.0613 (3.8867)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4801 (0.5515)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [22]  [ 600/1251]  eta: 0:06:52  lr: 0.003999  min_lr: 0.003999  loss: 3.9719 (3.8867)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4716 (0.5622)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [22]  [ 800/1251]  eta: 0:04:45  lr: 0.003999  min_lr: 0.003999  loss: 3.7689 (3.8978)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4409 (0.5552)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [22]  [1000/1251]  eta: 0:02:38  lr: 0.003999  min_lr: 0.003999  loss: 3.7117 (3.9034)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4498 (0.5438)  time: 0.6279  data: 0.0005  max mem: 54228
Epoch: [22]  [1200/1251]  eta: 0:00:32  lr: 0.003999  min_lr: 0.003999  loss: 4.2663 (3.9147)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5495 (0.5471)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [22]  [1250/1251]  eta: 0:00:00  lr: 0.003999  min_lr: 0.003999  loss: 4.2488 (3.9209)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5885 (0.5500)  time: 0.5333  data: 0.0005  max mem: 54228
Epoch: [22] Total time: 0:13:08 (0.6306 s / it)
Averaged stats: lr: 0.003999  min_lr: 0.003999  loss: 4.2488 (3.9294)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5885 (0.5500)
Test:  [ 0/25]  eta: 0:02:28  loss: 0.9278 (0.9278)  acc1: 79.2000 (79.2000)  acc5: 96.0000 (96.0000)  time: 5.9444  data: 5.6270  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 1.1204 (1.1734)  acc1: 77.2000 (76.1091)  acc5: 94.4000 (93.7091)  time: 0.8129  data: 0.5119  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.4654 (1.3939)  acc1: 68.0000 (71.4857)  acc5: 89.2000 (90.6476)  time: 0.2994  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.5555 (1.4031)  acc1: 67.6000 (71.1360)  acc5: 88.0000 (90.4640)  time: 0.2990  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5292 s / it)
* Acc@1 70.838 Acc@5 90.822 loss 1.403
Accuracy of the model on the 50000 test images: 70.8%
Max accuracy: 70.84%
Epoch: [23]  [   0/1251]  eta: 1:13:15  lr: 0.003999  min_lr: 0.003999  loss: 4.3394 (4.3394)  weight_decay: 0.0500 (0.0500)  time: 3.5139  data: 2.8764  max mem: 54228
Epoch: [23]  [ 200/1251]  eta: 0:11:15  lr: 0.003999  min_lr: 0.003999  loss: 3.7670 (3.8647)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4541 (0.5350)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [23]  [ 400/1251]  eta: 0:09:00  lr: 0.003999  min_lr: 0.003999  loss: 4.1096 (3.8964)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4954 (0.5233)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [23]  [ 600/1251]  eta: 0:06:52  lr: 0.003998  min_lr: 0.003998  loss: 4.1569 (3.8801)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5949 (0.5439)  time: 0.6289  data: 0.0005  max mem: 54228
Epoch: [23]  [ 800/1251]  eta: 0:04:45  lr: 0.003998  min_lr: 0.003998  loss: 3.6776 (3.8716)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5379 (0.5403)  time: 0.6288  data: 0.0005  max mem: 54228
Epoch: [23]  [1000/1251]  eta: 0:02:38  lr: 0.003998  min_lr: 0.003998  loss: 3.8322 (3.8757)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4891 (0.5390)  time: 0.6288  data: 0.0005  max mem: 54228
Epoch: [23]  [1200/1251]  eta: 0:00:32  lr: 0.003998  min_lr: 0.003998  loss: 3.9796 (3.8770)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4884 (0.5326)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [23]  [1250/1251]  eta: 0:00:00  lr: 0.003998  min_lr: 0.003998  loss: 4.1124 (3.8787)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5759 (0.5399)  time: 0.5335  data: 0.0006  max mem: 54228
Epoch: [23] Total time: 0:13:08 (0.6304 s / it)
Averaged stats: lr: 0.003998  min_lr: 0.003998  loss: 4.1124 (3.9005)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5759 (0.5399)
Test:  [ 0/25]  eta: 0:02:30  loss: 0.8784 (0.8784)  acc1: 82.0000 (82.0000)  acc5: 96.4000 (96.4000)  time: 6.0124  data: 5.6660  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 1.0839 (1.1391)  acc1: 78.4000 (76.1818)  acc5: 95.2000 (94.7636)  time: 0.8187  data: 0.5154  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.4051 (1.3852)  acc1: 68.0000 (71.0857)  acc5: 91.2000 (91.2191)  time: 0.2991  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.4882 (1.3947)  acc1: 68.0000 (70.8480)  acc5: 88.8000 (91.1360)  time: 0.2990  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5312 s / it)
* Acc@1 71.356 Acc@5 91.018 loss 1.392
Accuracy of the model on the 50000 test images: 71.4%
Max accuracy: 71.36%
Epoch: [24]  [   0/1251]  eta: 1:06:48  lr: 0.003998  min_lr: 0.003998  loss: 4.2758 (4.2758)  weight_decay: 0.0500 (0.0500)  time: 3.2046  data: 2.5653  max mem: 54228
Epoch: [24]  [ 200/1251]  eta: 0:11:13  lr: 0.003998  min_lr: 0.003998  loss: 3.9963 (3.9575)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6124 (0.5645)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [24]  [ 400/1251]  eta: 0:09:00  lr: 0.003998  min_lr: 0.003998  loss: 3.8399 (3.9444)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5516 (0.5887)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [24]  [ 600/1251]  eta: 0:06:52  lr: 0.003997  min_lr: 0.003997  loss: 3.9692 (3.9362)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4147 (0.5497)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [24]  [ 800/1251]  eta: 0:04:45  lr: 0.003997  min_lr: 0.003997  loss: 3.7369 (3.9295)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5428 (0.5577)  time: 0.6360  data: 0.0005  max mem: 54228
Epoch: [24]  [1000/1251]  eta: 0:02:38  lr: 0.003997  min_lr: 0.003997  loss: 3.8928 (3.9246)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4567 (0.5403)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [24]  [1200/1251]  eta: 0:00:32  lr: 0.003997  min_lr: 0.003997  loss: 3.9668 (3.9181)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4794 (0.5342)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [24]  [1250/1251]  eta: 0:00:00  lr: 0.003997  min_lr: 0.003997  loss: 4.1891 (3.9196)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4671 (0.5315)  time: 0.5329  data: 0.0005  max mem: 54228
Epoch: [24] Total time: 0:13:07 (0.6298 s / it)
Averaged stats: lr: 0.003997  min_lr: 0.003997  loss: 4.1891 (3.8777)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4671 (0.5315)
Test:  [ 0/25]  eta: 0:02:44  loss: 0.9017 (0.9017)  acc1: 81.6000 (81.6000)  acc5: 97.2000 (97.2000)  time: 6.5910  data: 6.2591  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 1.0135 (1.1315)  acc1: 78.8000 (76.7636)  acc5: 94.8000 (94.2182)  time: 0.8712  data: 0.5694  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.4758 (1.3551)  acc1: 69.2000 (71.9619)  acc5: 90.0000 (91.0857)  time: 0.2991  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.4843 (1.3599)  acc1: 69.2000 (71.5680)  acc5: 89.6000 (91.1360)  time: 0.2989  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5550 s / it)
* Acc@1 72.150 Acc@5 91.492 loss 1.345
Accuracy of the model on the 50000 test images: 72.2%
Max accuracy: 72.15%
Epoch: [25]  [   0/1251]  eta: 1:09:05  lr: 0.003997  min_lr: 0.003997  loss: 3.4158 (3.4158)  weight_decay: 0.0500 (0.0500)  time: 3.3137  data: 2.6829  max mem: 54228
Epoch: [25]  [ 200/1251]  eta: 0:11:14  lr: 0.003997  min_lr: 0.003997  loss: 3.9345 (3.8965)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4689 (0.5528)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [25]  [ 400/1251]  eta: 0:09:00  lr: 0.003996  min_lr: 0.003996  loss: 4.0534 (3.8966)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5320 (0.5618)  time: 0.6286  data: 0.0004  max mem: 54228
Epoch: [25]  [ 600/1251]  eta: 0:06:52  lr: 0.003996  min_lr: 0.003996  loss: 3.7056 (3.8935)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4530 (0.5740)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [25]  [ 800/1251]  eta: 0:04:45  lr: 0.003996  min_lr: 0.003996  loss: 3.9323 (3.8742)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5839 (0.5670)  time: 0.6289  data: 0.0005  max mem: 54228
Epoch: [25]  [1000/1251]  eta: 0:02:38  lr: 0.003996  min_lr: 0.003996  loss: 3.8134 (3.8582)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5034 (0.5775)  time: 0.6285  data: 0.0004  max mem: 54228
Epoch: [25]  [1200/1251]  eta: 0:00:32  lr: 0.003996  min_lr: 0.003996  loss: 4.0696 (3.8506)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5571 (0.5743)  time: 0.6291  data: 0.0005  max mem: 54228
Epoch: [25]  [1250/1251]  eta: 0:00:00  lr: 0.003995  min_lr: 0.003995  loss: 3.8913 (3.8525)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4671 (0.5705)  time: 0.5337  data: 0.0005  max mem: 54228
Epoch: [25] Total time: 0:13:08 (0.6304 s / it)
Averaged stats: lr: 0.003995  min_lr: 0.003995  loss: 3.8913 (3.8518)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4671 (0.5705)
Test:  [ 0/25]  eta: 0:02:46  loss: 0.8756 (0.8756)  acc1: 80.8000 (80.8000)  acc5: 95.6000 (95.6000)  time: 6.6555  data: 6.3302  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.9595 (1.0732)  acc1: 78.0000 (76.8727)  acc5: 94.8000 (94.3636)  time: 0.8774  data: 0.5758  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 1.3410 (1.3020)  acc1: 68.4000 (72.2476)  acc5: 90.4000 (91.2762)  time: 0.2996  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.4098 (1.3086)  acc1: 68.8000 (71.9520)  acc5: 90.8000 (91.3280)  time: 0.2995  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5593 s / it)
* Acc@1 72.186 Acc@5 91.706 loss 1.303
Accuracy of the model on the 50000 test images: 72.2%
Max accuracy: 72.19%
Epoch: [26]  [   0/1251]  eta: 1:12:13  lr: 0.003995  min_lr: 0.003995  loss: 4.1443 (4.1443)  weight_decay: 0.0500 (0.0500)  time: 3.4640  data: 2.8318  max mem: 54228
Epoch: [26]  [ 200/1251]  eta: 0:11:15  lr: 0.003995  min_lr: 0.003995  loss: 3.8646 (3.8716)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5195 (0.5170)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [26]  [ 400/1251]  eta: 0:09:00  lr: 0.003995  min_lr: 0.003995  loss: 4.0918 (3.8670)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4458 (0.5115)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [26]  [ 600/1251]  eta: 0:06:52  lr: 0.003995  min_lr: 0.003995  loss: 3.7264 (3.8518)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4255 (0.5081)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [26]  [ 800/1251]  eta: 0:04:45  lr: 0.003994  min_lr: 0.003994  loss: 3.9064 (3.8431)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5500 (0.5180)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [26]  [1000/1251]  eta: 0:02:38  lr: 0.003994  min_lr: 0.003994  loss: 3.9981 (3.8464)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5211 (0.5245)  time: 0.6349  data: 0.0004  max mem: 54228
Epoch: [26]  [1200/1251]  eta: 0:00:32  lr: 0.003994  min_lr: 0.003994  loss: 3.8686 (3.8400)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5634 (0.5315)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [26]  [1250/1251]  eta: 0:00:00  lr: 0.003994  min_lr: 0.003994  loss: 3.7634 (3.8400)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4710 (0.5303)  time: 0.5334  data: 0.0006  max mem: 54228
Epoch: [26] Total time: 0:13:08 (0.6301 s / it)
Averaged stats: lr: 0.003994  min_lr: 0.003994  loss: 3.7634 (3.8251)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4710 (0.5303)
Test:  [ 0/25]  eta: 0:02:09  loss: 0.8131 (0.8131)  acc1: 82.0000 (82.0000)  acc5: 96.4000 (96.4000)  time: 5.1955  data: 4.8564  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 1.0320 (1.0525)  acc1: 79.2000 (77.3091)  acc5: 94.8000 (94.6909)  time: 0.8077  data: 0.5048  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.3474 (1.2605)  acc1: 71.6000 (72.7429)  acc5: 90.8000 (91.6381)  time: 0.3342  data: 0.0349  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.3575 (1.2704)  acc1: 71.6000 (72.6080)  acc5: 90.4000 (91.6160)  time: 0.2996  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5272 s / it)
* Acc@1 72.614 Acc@5 91.934 loss 1.258
Accuracy of the model on the 50000 test images: 72.6%
Max accuracy: 72.61%
Epoch: [27]  [   0/1251]  eta: 1:14:00  lr: 0.003994  min_lr: 0.003994  loss: 2.6967 (2.6967)  weight_decay: 0.0500 (0.0500)  time: 3.5492  data: 2.9217  max mem: 54228
Epoch: [27]  [ 200/1251]  eta: 0:11:15  lr: 0.003994  min_lr: 0.003994  loss: 4.0062 (3.7741)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5026 (0.5757)  time: 0.6277  data: 0.0005  max mem: 54228
Epoch: [27]  [ 400/1251]  eta: 0:09:01  lr: 0.003993  min_lr: 0.003993  loss: 3.7777 (3.7676)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4563 (0.5351)  time: 0.6279  data: 0.0005  max mem: 54228
Epoch: [27]  [ 600/1251]  eta: 0:06:52  lr: 0.003993  min_lr: 0.003993  loss: 3.8602 (3.7773)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5345 (nan)  time: 0.6277  data: 0.0004  max mem: 54228
Epoch: [27]  [ 800/1251]  eta: 0:04:45  lr: 0.003993  min_lr: 0.003993  loss: 3.8533 (3.7657)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5597 (nan)  time: 0.6278  data: 0.0004  max mem: 54228
Epoch: [27]  [1000/1251]  eta: 0:02:38  lr: 0.003992  min_lr: 0.003992  loss: 3.7162 (3.7582)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5134 (nan)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [27]  [1200/1251]  eta: 0:00:32  lr: 0.003992  min_lr: 0.003992  loss: 4.0807 (3.7631)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4870 (nan)  time: 0.6279  data: 0.0004  max mem: 54228
Epoch: [27]  [1250/1251]  eta: 0:00:00  lr: 0.003992  min_lr: 0.003992  loss: 3.7042 (3.7602)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6139 (nan)  time: 0.5330  data: 0.0006  max mem: 54228
Epoch: [27] Total time: 0:13:07 (0.6299 s / it)
Averaged stats: lr: 0.003992  min_lr: 0.003992  loss: 3.7042 (3.7977)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6139 (nan)
Test:  [ 0/25]  eta: 0:02:42  loss: 0.8345 (0.8345)  acc1: 83.6000 (83.6000)  acc5: 97.6000 (97.6000)  time: 6.4991  data: 6.1572  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 1.0972 (1.0799)  acc1: 76.8000 (78.0364)  acc5: 95.6000 (94.7273)  time: 0.8631  data: 0.5601  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.3463 (1.2940)  acc1: 70.4000 (73.3333)  acc5: 90.0000 (92.0571)  time: 0.2993  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.4350 (1.3087)  acc1: 70.4000 (72.8800)  acc5: 90.0000 (91.7600)  time: 0.2991  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5513 s / it)
* Acc@1 72.962 Acc@5 92.002 loss 1.302
Accuracy of the model on the 50000 test images: 73.0%
Max accuracy: 72.96%
Epoch: [28]  [   0/1251]  eta: 1:14:53  lr: 0.003992  min_lr: 0.003992  loss: 4.2812 (4.2812)  weight_decay: 0.0500 (0.0500)  time: 3.5921  data: 2.9615  max mem: 54228
Epoch: [28]  [ 200/1251]  eta: 0:11:16  lr: 0.003992  min_lr: 0.003992  loss: 3.7691 (3.8514)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6266 (0.5521)  time: 0.6279  data: 0.0004  max mem: 54228
Epoch: [28]  [ 400/1251]  eta: 0:09:01  lr: 0.003991  min_lr: 0.003991  loss: 3.8306 (3.8361)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4527 (0.5348)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [28]  [ 600/1251]  eta: 0:06:52  lr: 0.003991  min_lr: 0.003991  loss: 4.2521 (3.8148)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5113 (0.5495)  time: 0.6287  data: 0.0004  max mem: 54228
Epoch: [28]  [ 800/1251]  eta: 0:04:45  lr: 0.003991  min_lr: 0.003991  loss: 3.9957 (3.8140)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5299 (0.5680)  time: 0.6286  data: 0.0004  max mem: 54228
Epoch: [28]  [1000/1251]  eta: 0:02:38  lr: 0.003990  min_lr: 0.003990  loss: 3.6009 (3.8088)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4768 (0.5669)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [28]  [1200/1251]  eta: 0:00:32  lr: 0.003990  min_lr: 0.003990  loss: 3.6234 (3.8010)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5270 (0.5651)  time: 0.6285  data: 0.0004  max mem: 54228
Epoch: [28]  [1250/1251]  eta: 0:00:00  lr: 0.003990  min_lr: 0.003990  loss: 3.6671 (3.7934)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5270 (0.5641)  time: 0.5336  data: 0.0005  max mem: 54228
Epoch: [28] Total time: 0:13:08 (0.6303 s / it)
Averaged stats: lr: 0.003990  min_lr: 0.003990  loss: 3.6671 (3.7761)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5270 (0.5641)
Test:  [ 0/25]  eta: 0:02:33  loss: 0.8435 (0.8435)  acc1: 81.2000 (81.2000)  acc5: 98.0000 (98.0000)  time: 6.1244  data: 5.7907  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 1.0090 (1.0343)  acc1: 78.4000 (76.6909)  acc5: 96.0000 (95.2000)  time: 0.8295  data: 0.5267  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.2755 (1.2356)  acc1: 70.0000 (72.9333)  acc5: 91.2000 (92.1333)  time: 0.3003  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.3514 (1.2477)  acc1: 70.0000 (72.6880)  acc5: 89.6000 (92.0640)  time: 0.3004  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5374 s / it)
* Acc@1 73.042 Acc@5 92.120 loss 1.236
Accuracy of the model on the 50000 test images: 73.0%
Max accuracy: 73.04%
Epoch: [29]  [   0/1251]  eta: 1:15:17  lr: 0.003990  min_lr: 0.003990  loss: 3.7018 (3.7018)  weight_decay: 0.0500 (0.0500)  time: 3.6110  data: 2.9738  max mem: 54228
Epoch: [29]  [ 200/1251]  eta: 0:11:16  lr: 0.003989  min_lr: 0.003989  loss: 3.8825 (3.7346)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6238 (0.6093)  time: 0.6287  data: 0.0004  max mem: 54228
Epoch: [29]  [ 400/1251]  eta: 0:09:01  lr: 0.003989  min_lr: 0.003989  loss: 3.9580 (3.7401)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5688 (0.6084)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [29]  [ 600/1251]  eta: 0:06:52  lr: 0.003989  min_lr: 0.003989  loss: 3.8770 (3.7621)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6480 (0.6019)  time: 0.6285  data: 0.0004  max mem: 54228
Epoch: [29]  [ 800/1251]  eta: 0:04:45  lr: 0.003988  min_lr: 0.003988  loss: 3.8792 (3.7554)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5052 (0.5928)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [29]  [1000/1251]  eta: 0:02:38  lr: 0.003988  min_lr: 0.003988  loss: 4.1559 (3.7481)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5211 (0.5868)  time: 0.6288  data: 0.0005  max mem: 54228
Epoch: [29]  [1200/1251]  eta: 0:00:32  lr: 0.003988  min_lr: 0.003988  loss: 3.8902 (3.7509)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4787 (0.5831)  time: 0.6287  data: 0.0004  max mem: 54228
Epoch: [29]  [1250/1251]  eta: 0:00:00  lr: 0.003987  min_lr: 0.003987  loss: 3.8898 (3.7475)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4762 (0.5803)  time: 0.5330  data: 0.0005  max mem: 54228
Epoch: [29] Total time: 0:13:08 (0.6304 s / it)
Averaged stats: lr: 0.003987  min_lr: 0.003987  loss: 3.8898 (3.7563)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4762 (0.5803)
Test:  [ 0/25]  eta: 0:02:35  loss: 0.8766 (0.8766)  acc1: 83.6000 (83.6000)  acc5: 97.6000 (97.6000)  time: 6.2309  data: 5.9117  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 1.0101 (1.0705)  acc1: 79.6000 (77.7455)  acc5: 95.2000 (95.1636)  time: 0.8389  data: 0.5377  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.3475 (1.2948)  acc1: 69.6000 (73.0286)  acc5: 91.2000 (91.8857)  time: 0.2996  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.3976 (1.3039)  acc1: 69.2000 (72.6720)  acc5: 90.0000 (91.9200)  time: 0.2996  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5417 s / it)
* Acc@1 73.532 Acc@5 92.304 loss 1.286
Accuracy of the model on the 50000 test images: 73.5%
Max accuracy: 73.53%
Epoch: [30]  [   0/1251]  eta: 1:12:42  lr: 0.003987  min_lr: 0.003987  loss: 4.1105 (4.1105)  weight_decay: 0.0500 (0.0500)  time: 3.4873  data: 2.8393  max mem: 54228
Epoch: [30]  [ 200/1251]  eta: 0:11:15  lr: 0.003987  min_lr: 0.003987  loss: 3.8366 (3.6674)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4945 (0.5698)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [30]  [ 400/1251]  eta: 0:09:01  lr: 0.003987  min_lr: 0.003987  loss: 3.5406 (3.6933)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6298 (0.6071)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [30]  [ 600/1251]  eta: 0:06:52  lr: 0.003986  min_lr: 0.003986  loss: 3.9874 (3.7173)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5777 (0.6097)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [30]  [ 800/1251]  eta: 0:04:45  lr: 0.003986  min_lr: 0.003986  loss: 3.9249 (3.7227)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6319 (0.6107)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [30]  [1000/1251]  eta: 0:02:38  lr: 0.003985  min_lr: 0.003985  loss: 3.6015 (3.7190)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5423 (0.6023)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [30]  [1200/1251]  eta: 0:00:32  lr: 0.003985  min_lr: 0.003985  loss: 3.9566 (3.7141)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4980 (0.5940)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [30]  [1250/1251]  eta: 0:00:00  lr: 0.003985  min_lr: 0.003985  loss: 3.5277 (3.7154)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5100 (0.5905)  time: 0.5333  data: 0.0006  max mem: 54228
Epoch: [30] Total time: 0:13:08 (0.6303 s / it)
Averaged stats: lr: 0.003985  min_lr: 0.003985  loss: 3.5277 (3.7310)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5100 (0.5905)
Test:  [ 0/25]  eta: 0:02:39  loss: 0.8037 (0.8037)  acc1: 83.2000 (83.2000)  acc5: 96.8000 (96.8000)  time: 6.3811  data: 6.0620  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.9467 (0.9817)  acc1: 78.8000 (78.5091)  acc5: 96.8000 (95.8546)  time: 0.8530  data: 0.5514  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.2528 (1.2003)  acc1: 70.4000 (74.1714)  acc5: 92.8000 (92.7238)  time: 0.3003  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.3995 (1.2194)  acc1: 70.4000 (73.6320)  acc5: 90.4000 (92.5120)  time: 0.3004  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5492 s / it)
* Acc@1 73.804 Acc@5 92.380 loss 1.219
Accuracy of the model on the 50000 test images: 73.8%
Max accuracy: 73.80%
Epoch: [31]  [   0/1251]  eta: 1:02:16  lr: 0.003985  min_lr: 0.003985  loss: 3.2260 (3.2260)  weight_decay: 0.0500 (0.0500)  time: 2.9868  data: 2.3539  max mem: 54228
Epoch: [31]  [ 200/1251]  eta: 0:11:14  lr: 0.003984  min_lr: 0.003984  loss: 3.9242 (3.7162)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5602 (0.6616)  time: 0.6395  data: 0.0004  max mem: 54228
Epoch: [31]  [ 400/1251]  eta: 0:09:00  lr: 0.003984  min_lr: 0.003984  loss: 3.7581 (3.7464)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4989 (0.6277)  time: 0.6287  data: 0.0004  max mem: 54228
Epoch: [31]  [ 600/1251]  eta: 0:06:52  lr: 0.003983  min_lr: 0.003983  loss: 3.3758 (3.7229)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6053 (0.6116)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [31]  [ 800/1251]  eta: 0:04:45  lr: 0.003983  min_lr: 0.003983  loss: 3.7591 (3.7157)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5583 (0.6053)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [31]  [1000/1251]  eta: 0:02:38  lr: 0.003982  min_lr: 0.003982  loss: 3.9254 (3.7189)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5802 (0.5929)  time: 0.6287  data: 0.0004  max mem: 54228
Epoch: [31]  [1200/1251]  eta: 0:00:32  lr: 0.003982  min_lr: 0.003982  loss: 3.7469 (3.7211)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4585 (0.5951)  time: 0.6286  data: 0.0004  max mem: 54228
Epoch: [31]  [1250/1251]  eta: 0:00:00  lr: 0.003982  min_lr: 0.003982  loss: 3.8128 (3.7154)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5584 (0.5995)  time: 0.5333  data: 0.0005  max mem: 54228
Epoch: [31] Total time: 0:13:08 (0.6304 s / it)
Averaged stats: lr: 0.003982  min_lr: 0.003982  loss: 3.8128 (3.7141)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5584 (0.5995)
Test:  [ 0/25]  eta: 0:02:46  loss: 0.8590 (0.8590)  acc1: 83.2000 (83.2000)  acc5: 96.8000 (96.8000)  time: 6.6710  data: 6.3363  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.9885 (1.0238)  acc1: 79.2000 (78.5091)  acc5: 96.0000 (95.0909)  time: 0.8790  data: 0.5764  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 1.2756 (1.2206)  acc1: 72.0000 (74.1333)  acc5: 91.6000 (92.1714)  time: 0.2999  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.3343 (1.2287)  acc1: 72.0000 (73.7760)  acc5: 90.4000 (92.1280)  time: 0.2998  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5585 s / it)
* Acc@1 73.904 Acc@5 92.544 loss 1.224
Accuracy of the model on the 50000 test images: 73.9%
Max accuracy: 73.90%
Epoch: [32]  [   0/1251]  eta: 1:06:59  lr: 0.003982  min_lr: 0.003982  loss: 2.5892 (2.5892)  weight_decay: 0.0500 (0.0500)  time: 3.2128  data: 2.5782  max mem: 54228
Epoch: [32]  [ 200/1251]  eta: 0:11:16  lr: 0.003981  min_lr: 0.003981  loss: 3.7920 (3.6975)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5119 (0.5834)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [32]  [ 400/1251]  eta: 0:09:01  lr: 0.003981  min_lr: 0.003981  loss: 3.6849 (3.7054)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6862 (0.6347)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [32]  [ 600/1251]  eta: 0:06:52  lr: 0.003980  min_lr: 0.003980  loss: 3.8796 (3.6926)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5103 (0.6111)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [32]  [ 800/1251]  eta: 0:04:45  lr: 0.003980  min_lr: 0.003980  loss: 3.8930 (3.7104)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4837 (0.5993)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [32]  [1000/1251]  eta: 0:02:38  lr: 0.003979  min_lr: 0.003979  loss: 3.4280 (3.7117)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5749 (0.6144)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [32]  [1200/1251]  eta: 0:00:32  lr: 0.003979  min_lr: 0.003979  loss: 3.5455 (3.7052)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5220 (0.6035)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [32]  [1250/1251]  eta: 0:00:00  lr: 0.003979  min_lr: 0.003979  loss: 3.9253 (3.7037)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6182 (0.6204)  time: 0.5329  data: 0.0006  max mem: 54228
Epoch: [32] Total time: 0:13:08 (0.6305 s / it)
Averaged stats: lr: 0.003979  min_lr: 0.003979  loss: 3.9253 (3.6879)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6182 (0.6204)
Test:  [ 0/25]  eta: 0:02:42  loss: 0.8474 (0.8474)  acc1: 83.6000 (83.6000)  acc5: 98.4000 (98.4000)  time: 6.4981  data: 6.1784  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.9945 (1.0462)  acc1: 80.0000 (78.7273)  acc5: 96.8000 (96.1091)  time: 0.8629  data: 0.5620  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.2877 (1.2526)  acc1: 72.0000 (74.1524)  acc5: 92.4000 (92.9524)  time: 0.2992  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.3666 (1.2654)  acc1: 71.2000 (73.7120)  acc5: 91.2000 (92.8160)  time: 0.2991  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5519 s / it)
* Acc@1 74.442 Acc@5 92.660 loss 1.256
Accuracy of the model on the 50000 test images: 74.4%
Max accuracy: 74.44%
Epoch: [33]  [   0/1251]  eta: 1:06:56  lr: 0.003979  min_lr: 0.003979  loss: 4.0269 (4.0269)  weight_decay: 0.0500 (0.0500)  time: 3.2106  data: 2.5655  max mem: 54228
Epoch: [33]  [ 200/1251]  eta: 0:11:13  lr: 0.003978  min_lr: 0.003978  loss: 3.5233 (3.6624)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5361 (0.6006)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [33]  [ 400/1251]  eta: 0:09:00  lr: 0.003978  min_lr: 0.003978  loss: 3.9135 (3.7085)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4990 (0.5866)  time: 0.6360  data: 0.0004  max mem: 54228
Epoch: [33]  [ 600/1251]  eta: 0:06:52  lr: 0.003977  min_lr: 0.003977  loss: 3.8746 (3.7001)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4988 (0.5969)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [33]  [ 800/1251]  eta: 0:04:45  lr: 0.003977  min_lr: 0.003977  loss: 3.7370 (3.6854)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4964 (0.6077)  time: 0.6289  data: 0.0004  max mem: 54228
Epoch: [33]  [1000/1251]  eta: 0:02:38  lr: 0.003976  min_lr: 0.003976  loss: 3.6206 (3.6699)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7121 (0.6116)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [33]  [1200/1251]  eta: 0:00:32  lr: 0.003976  min_lr: 0.003976  loss: 3.9184 (3.6735)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5993 (0.6088)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [33]  [1250/1251]  eta: 0:00:00  lr: 0.003975  min_lr: 0.003975  loss: 3.8303 (3.6789)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6568 (0.6178)  time: 0.5335  data: 0.0005  max mem: 54228
Epoch: [33] Total time: 0:13:08 (0.6303 s / it)
Averaged stats: lr: 0.003975  min_lr: 0.003975  loss: 3.8303 (3.6738)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6568 (0.6178)
Test:  [ 0/25]  eta: 0:02:37  loss: 0.8402 (0.8402)  acc1: 84.4000 (84.4000)  acc5: 97.6000 (97.6000)  time: 6.3187  data: 5.9857  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.9886 (1.0264)  acc1: 78.8000 (78.7636)  acc5: 96.4000 (95.9273)  time: 0.8468  data: 0.5445  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.2834 (1.2374)  acc1: 72.4000 (74.2286)  acc5: 92.8000 (92.8571)  time: 0.2996  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.3489 (1.2481)  acc1: 72.0000 (74.1120)  acc5: 91.2000 (92.7360)  time: 0.2995  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5452 s / it)
* Acc@1 74.462 Acc@5 92.844 loss 1.239
Accuracy of the model on the 50000 test images: 74.5%
Max accuracy: 74.46%
Epoch: [34]  [   0/1251]  eta: 1:01:57  lr: 0.003975  min_lr: 0.003975  loss: 2.3551 (2.3551)  weight_decay: 0.0500 (0.0500)  time: 2.9716  data: 2.3343  max mem: 54228
Epoch: [34]  [ 200/1251]  eta: 0:11:12  lr: 0.003975  min_lr: 0.003975  loss: 3.7211 (3.6486)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4905 (0.5736)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [34]  [ 400/1251]  eta: 0:09:00  lr: 0.003974  min_lr: 0.003974  loss: 3.5529 (3.6598)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5453 (0.5817)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [34]  [ 600/1251]  eta: 0:06:52  lr: 0.003974  min_lr: 0.003974  loss: 3.5872 (3.6775)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5946 (0.6151)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [34]  [ 800/1251]  eta: 0:04:45  lr: 0.003973  min_lr: 0.003973  loss: 3.6761 (3.6552)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5749 (0.6321)  time: 0.6394  data: 0.0004  max mem: 54228
Epoch: [34]  [1000/1251]  eta: 0:02:38  lr: 0.003972  min_lr: 0.003972  loss: 3.8615 (3.6529)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6958 (0.6373)  time: 0.6277  data: 0.0004  max mem: 54228
Epoch: [34]  [1200/1251]  eta: 0:00:32  lr: 0.003972  min_lr: 0.003972  loss: 3.9615 (3.6652)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6947 (0.6370)  time: 0.6276  data: 0.0004  max mem: 54228
Epoch: [34]  [1250/1251]  eta: 0:00:00  lr: 0.003972  min_lr: 0.003972  loss: 3.7391 (3.6707)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5102 (0.6327)  time: 0.5329  data: 0.0004  max mem: 54228
Epoch: [34] Total time: 0:13:07 (0.6298 s / it)
Averaged stats: lr: 0.003972  min_lr: 0.003972  loss: 3.7391 (3.6595)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5102 (0.6327)
Test:  [ 0/25]  eta: 0:02:43  loss: 0.8088 (0.8088)  acc1: 85.6000 (85.6000)  acc5: 96.8000 (96.8000)  time: 6.5481  data: 6.2152  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.9803 (1.0403)  acc1: 80.0000 (79.2000)  acc5: 95.6000 (95.5636)  time: 0.8675  data: 0.5653  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.2474 (1.2508)  acc1: 70.8000 (74.8762)  acc5: 92.0000 (92.8191)  time: 0.2994  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.4008 (1.2600)  acc1: 70.8000 (74.2400)  acc5: 91.2000 (92.7360)  time: 0.2993  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5529 s / it)
* Acc@1 74.824 Acc@5 92.912 loss 1.246
Accuracy of the model on the 50000 test images: 74.8%
Max accuracy: 74.82%
Epoch: [35]  [   0/1251]  eta: 1:03:47  lr: 0.003972  min_lr: 0.003972  loss: 2.8244 (2.8244)  weight_decay: 0.0500 (0.0500)  time: 3.0595  data: 2.4286  max mem: 54228
Epoch: [35]  [ 200/1251]  eta: 0:11:15  lr: 0.003971  min_lr: 0.003971  loss: 3.6698 (3.6264)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5655 (0.6242)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [35]  [ 400/1251]  eta: 0:09:00  lr: 0.003971  min_lr: 0.003971  loss: 3.7994 (3.6228)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6063 (0.6360)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [35]  [ 600/1251]  eta: 0:06:52  lr: 0.003970  min_lr: 0.003970  loss: 3.9507 (3.6232)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6600 (0.6768)  time: 0.6336  data: 0.0004  max mem: 54228
Epoch: [35]  [ 800/1251]  eta: 0:04:45  lr: 0.003969  min_lr: 0.003969  loss: 3.5856 (3.6203)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6238 (0.6606)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [35]  [1000/1251]  eta: 0:02:38  lr: 0.003969  min_lr: 0.003969  loss: 3.9404 (3.6341)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5328 (0.6623)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [35]  [1200/1251]  eta: 0:00:32  lr: 0.003968  min_lr: 0.003968  loss: 3.7847 (3.6252)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6308 (0.6600)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [35]  [1250/1251]  eta: 0:00:00  lr: 0.003968  min_lr: 0.003968  loss: 3.6270 (3.6250)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6586 (0.6617)  time: 0.5333  data: 0.0005  max mem: 54228
Epoch: [35] Total time: 0:13:08 (0.6303 s / it)
Averaged stats: lr: 0.003968  min_lr: 0.003968  loss: 3.6270 (3.6396)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6586 (0.6617)
Test:  [ 0/25]  eta: 0:02:33  loss: 0.7922 (0.7922)  acc1: 84.8000 (84.8000)  acc5: 96.4000 (96.4000)  time: 6.1524  data: 5.8048  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.9026 (0.9333)  acc1: 77.6000 (79.2364)  acc5: 96.4000 (95.9273)  time: 0.8317  data: 0.5280  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.1902 (1.1542)  acc1: 74.4000 (74.7619)  acc5: 92.0000 (92.9524)  time: 0.3001  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.2996 (1.1653)  acc1: 72.4000 (74.5120)  acc5: 91.2000 (92.8320)  time: 0.3002  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5386 s / it)
* Acc@1 74.936 Acc@5 93.114 loss 1.153
Accuracy of the model on the 50000 test images: 74.9%
Max accuracy: 74.94%
Epoch: [36]  [   0/1251]  eta: 1:13:04  lr: 0.003968  min_lr: 0.003968  loss: 4.4005 (4.4005)  weight_decay: 0.0500 (0.0500)  time: 3.5045  data: 2.8593  max mem: 54228
Epoch: [36]  [ 200/1251]  eta: 0:11:14  lr: 0.003967  min_lr: 0.003967  loss: 3.6412 (3.6043)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6607 (0.6452)  time: 0.6277  data: 0.0004  max mem: 54228
Epoch: [36]  [ 400/1251]  eta: 0:09:00  lr: 0.003967  min_lr: 0.003967  loss: 3.7422 (3.5984)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5242 (0.6963)  time: 0.6276  data: 0.0005  max mem: 54228
Epoch: [36]  [ 600/1251]  eta: 0:06:52  lr: 0.003966  min_lr: 0.003966  loss: 3.9498 (3.6210)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6079 (0.6999)  time: 0.6276  data: 0.0004  max mem: 54228
Epoch: [36]  [ 800/1251]  eta: 0:04:44  lr: 0.003965  min_lr: 0.003965  loss: 3.8496 (3.6142)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6166 (0.6928)  time: 0.6273  data: 0.0004  max mem: 54228
Epoch: [36]  [1000/1251]  eta: 0:02:38  lr: 0.003965  min_lr: 0.003965  loss: 3.8684 (3.6251)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7117 (0.6779)  time: 0.6277  data: 0.0004  max mem: 54228
Epoch: [36]  [1200/1251]  eta: 0:00:32  lr: 0.003964  min_lr: 0.003964  loss: 3.8653 (3.6302)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5858 (0.6779)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [36]  [1250/1251]  eta: 0:00:00  lr: 0.003964  min_lr: 0.003964  loss: 3.5178 (3.6301)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6366 (0.6818)  time: 0.5329  data: 0.0006  max mem: 54228
Epoch: [36] Total time: 0:13:07 (0.6296 s / it)
Averaged stats: lr: 0.003964  min_lr: 0.003964  loss: 3.5178 (3.6250)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6366 (0.6818)
Test:  [ 0/25]  eta: 0:02:42  loss: 0.7321 (0.7321)  acc1: 87.6000 (87.6000)  acc5: 97.2000 (97.2000)  time: 6.5002  data: 6.1697  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.9901 (0.9755)  acc1: 79.2000 (79.1636)  acc5: 95.6000 (95.3091)  time: 0.8634  data: 0.5612  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.2497 (1.1877)  acc1: 70.8000 (74.4191)  acc5: 92.0000 (92.7238)  time: 0.3001  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.3190 (1.1983)  acc1: 70.8000 (74.3520)  acc5: 90.8000 (92.4960)  time: 0.3002  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5524 s / it)
* Acc@1 74.902 Acc@5 93.078 loss 1.177
Accuracy of the model on the 50000 test images: 74.9%
Max accuracy: 74.94%
Epoch: [37]  [   0/1251]  eta: 1:29:22  lr: 0.003964  min_lr: 0.003964  loss: 3.7194 (3.7194)  weight_decay: 0.0500 (0.0500)  time: 4.2866  data: 3.1539  max mem: 54228
Epoch: [37]  [ 200/1251]  eta: 0:11:20  lr: 0.003963  min_lr: 0.003963  loss: 3.7197 (3.5900)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5807 (0.7492)  time: 0.6287  data: 0.0004  max mem: 54228
Epoch: [37]  [ 400/1251]  eta: 0:09:04  lr: 0.003962  min_lr: 0.003962  loss: 3.7629 (3.6436)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7547 (0.7296)  time: 0.6286  data: 0.0004  max mem: 54228
Epoch: [37]  [ 600/1251]  eta: 0:06:54  lr: 0.003962  min_lr: 0.003962  loss: 3.5896 (3.6176)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5334 (0.7049)  time: 0.6332  data: 0.0005  max mem: 54228
Epoch: [37]  [ 800/1251]  eta: 0:04:46  lr: 0.003961  min_lr: 0.003961  loss: 3.5967 (3.6216)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7781 (0.6989)  time: 0.6339  data: 0.0004  max mem: 54228
Epoch: [37]  [1000/1251]  eta: 0:02:38  lr: 0.003960  min_lr: 0.003960  loss: 3.5285 (3.6117)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5286 (0.7077)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [37]  [1200/1251]  eta: 0:00:32  lr: 0.003960  min_lr: 0.003960  loss: 3.7306 (3.6032)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6871 (0.7011)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [37]  [1250/1251]  eta: 0:00:00  lr: 0.003959  min_lr: 0.003959  loss: 3.7767 (3.6054)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7453 (0.7002)  time: 0.5335  data: 0.0005  max mem: 54228
Epoch: [37] Total time: 0:13:09 (0.6314 s / it)
Averaged stats: lr: 0.003959  min_lr: 0.003959  loss: 3.7767 (3.6176)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7453 (0.7002)
Test:  [ 0/25]  eta: 0:02:28  loss: 0.8148 (0.8148)  acc1: 86.4000 (86.4000)  acc5: 97.6000 (97.6000)  time: 5.9452  data: 5.6160  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 1.0033 (1.0312)  acc1: 80.0000 (79.3455)  acc5: 96.4000 (95.7455)  time: 0.8551  data: 0.5531  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.2349 (1.2332)  acc1: 72.8000 (75.5619)  acc5: 92.4000 (93.2952)  time: 0.3231  data: 0.0235  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.3499 (1.2403)  acc1: 72.4000 (75.3440)  acc5: 92.0000 (93.1840)  time: 0.3000  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5485 s / it)
* Acc@1 75.240 Acc@5 93.226 loss 1.236
Accuracy of the model on the 50000 test images: 75.2%
Max accuracy: 75.24%
Epoch: [38]  [   0/1251]  eta: 1:05:35  lr: 0.003959  min_lr: 0.003959  loss: 4.0106 (4.0106)  weight_decay: 0.0500 (0.0500)  time: 3.1458  data: 2.4926  max mem: 54228
Epoch: [38]  [ 200/1251]  eta: 0:11:15  lr: 0.003959  min_lr: 0.003959  loss: 3.4360 (3.5772)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6638 (nan)  time: 0.6279  data: 0.0004  max mem: 54228
Epoch: [38]  [ 400/1251]  eta: 0:09:00  lr: 0.003958  min_lr: 0.003958  loss: 3.9245 (3.6046)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6375 (nan)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [38]  [ 600/1251]  eta: 0:06:52  lr: 0.003957  min_lr: 0.003957  loss: 3.7511 (3.6368)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7284 (nan)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [38]  [ 800/1251]  eta: 0:04:45  lr: 0.003956  min_lr: 0.003956  loss: 3.7896 (3.6328)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7988 (nan)  time: 0.6279  data: 0.0004  max mem: 54228
Epoch: [38]  [1000/1251]  eta: 0:02:38  lr: 0.003956  min_lr: 0.003956  loss: 3.7335 (3.6353)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7083 (nan)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [38]  [1200/1251]  eta: 0:00:32  lr: 0.003955  min_lr: 0.003955  loss: 3.5777 (3.6387)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6191 (nan)  time: 0.6366  data: 0.0004  max mem: 54228
Epoch: [38]  [1250/1251]  eta: 0:00:00  lr: 0.003955  min_lr: 0.003955  loss: 3.6647 (3.6336)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6256 (nan)  time: 0.5371  data: 0.0004  max mem: 54228
Epoch: [38] Total time: 0:13:08 (0.6302 s / it)
Averaged stats: lr: 0.003955  min_lr: 0.003955  loss: 3.6647 (3.6054)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6256 (nan)
Test:  [ 0/25]  eta: 0:02:48  loss: 0.7374 (0.7374)  acc1: 85.2000 (85.2000)  acc5: 97.6000 (97.6000)  time: 6.7483  data: 6.4300  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.9120 (0.9689)  acc1: 78.4000 (79.5636)  acc5: 96.8000 (95.7455)  time: 0.8857  data: 0.5848  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 1.1692 (1.1668)  acc1: 74.4000 (75.5429)  acc5: 92.4000 (93.3333)  time: 0.2993  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.2607 (1.1763)  acc1: 73.6000 (75.1520)  acc5: 91.6000 (93.2480)  time: 0.2992  data: 0.0001  max mem: 54228
Test: Total time: 0:00:14 (0.5612 s / it)
* Acc@1 75.506 Acc@5 93.368 loss 1.173
Accuracy of the model on the 50000 test images: 75.5%
Max accuracy: 75.51%
Epoch: [39]  [   0/1251]  eta: 1:08:37  lr: 0.003955  min_lr: 0.003955  loss: 2.6137 (2.6137)  weight_decay: 0.0500 (0.0500)  time: 3.2911  data: 2.6546  max mem: 54228
Epoch: [39]  [ 200/1251]  eta: 0:11:14  lr: 0.003954  min_lr: 0.003954  loss: 3.8976 (3.6128)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4993 (0.7026)  time: 0.6278  data: 0.0005  max mem: 54228
Epoch: [39]  [ 400/1251]  eta: 0:09:00  lr: 0.003953  min_lr: 0.003953  loss: 3.1111 (3.5803)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4583 (0.7195)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [39]  [ 600/1251]  eta: 0:06:52  lr: 0.003952  min_lr: 0.003952  loss: 3.2873 (3.5794)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6833 (0.7369)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [39]  [ 800/1251]  eta: 0:04:45  lr: 0.003952  min_lr: 0.003952  loss: 3.8768 (3.5812)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8845 (0.7578)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [39]  [1000/1251]  eta: 0:02:38  lr: 0.003951  min_lr: 0.003951  loss: 3.7898 (3.5861)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5524 (0.7414)  time: 0.6292  data: 0.0005  max mem: 54228
Epoch: [39]  [1200/1251]  eta: 0:00:32  lr: 0.003950  min_lr: 0.003950  loss: 3.7426 (3.5900)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [39]  [1250/1251]  eta: 0:00:00  lr: 0.003950  min_lr: 0.003950  loss: 3.5883 (3.5931)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.5337  data: 0.0006  max mem: 54228
Epoch: [39] Total time: 0:13:08 (0.6306 s / it)
Averaged stats: lr: 0.003950  min_lr: 0.003950  loss: 3.5883 (3.5896)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)
Test:  [ 0/25]  eta: 0:02:35  loss: 0.7541 (0.7541)  acc1: 85.2000 (85.2000)  acc5: 96.4000 (96.4000)  time: 6.2314  data: 5.8986  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.9448 (0.9983)  acc1: 81.2000 (80.0364)  acc5: 96.4000 (95.7455)  time: 0.8391  data: 0.5366  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.2694 (1.1906)  acc1: 73.6000 (75.5810)  acc5: 92.8000 (93.3333)  time: 0.3002  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.2916 (1.1966)  acc1: 73.6000 (75.4240)  acc5: 91.2000 (93.1360)  time: 0.3004  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5425 s / it)
* Acc@1 75.586 Acc@5 93.404 loss 1.191
Accuracy of the model on the 50000 test images: 75.6%
Max accuracy: 75.59%
Epoch: [40]  [   0/1251]  eta: 1:14:19  lr: 0.003950  min_lr: 0.003950  loss: 4.2264 (4.2264)  weight_decay: 0.0500 (0.0500)  time: 3.5651  data: 2.9287  max mem: 54228
Epoch: [40]  [ 200/1251]  eta: 0:11:15  lr: 0.003949  min_lr: 0.003949  loss: 3.6937 (3.6169)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6393 (0.6835)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [40]  [ 400/1251]  eta: 0:09:02  lr: 0.003948  min_lr: 0.003948  loss: 3.7617 (3.6107)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8260 (0.7626)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [40]  [ 600/1251]  eta: 0:06:52  lr: 0.003947  min_lr: 0.003947  loss: 3.7946 (3.5987)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6176 (0.7501)  time: 0.6279  data: 0.0004  max mem: 54228
Epoch: [40]  [ 800/1251]  eta: 0:04:45  lr: 0.003947  min_lr: 0.003947  loss: 3.7756 (3.5821)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5527 (0.7452)  time: 0.6286  data: 0.0004  max mem: 54228
Epoch: [40]  [1000/1251]  eta: 0:02:38  lr: 0.003946  min_lr: 0.003946  loss: 3.6131 (3.5796)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6495 (0.7485)  time: 0.6288  data: 0.0004  max mem: 54228
Epoch: [40]  [1200/1251]  eta: 0:00:32  lr: 0.003945  min_lr: 0.003945  loss: 3.7752 (3.5841)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7107 (0.7310)  time: 0.6289  data: 0.0004  max mem: 54228
Epoch: [40]  [1250/1251]  eta: 0:00:00  lr: 0.003945  min_lr: 0.003945  loss: 3.7593 (3.5825)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6356 (0.7301)  time: 0.5335  data: 0.0005  max mem: 54228
Epoch: [40] Total time: 0:13:08 (0.6307 s / it)
Averaged stats: lr: 0.003945  min_lr: 0.003945  loss: 3.7593 (3.5764)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6356 (0.7301)
Test:  [ 0/25]  eta: 0:02:35  loss: 0.7908 (0.7908)  acc1: 84.8000 (84.8000)  acc5: 97.6000 (97.6000)  time: 6.2079  data: 5.8775  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.9755 (0.9685)  acc1: 80.8000 (80.1091)  acc5: 96.4000 (96.0727)  time: 0.8369  data: 0.5346  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.1288 (1.1597)  acc1: 74.0000 (75.9238)  acc5: 93.2000 (93.4857)  time: 0.2999  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.3151 (1.1668)  acc1: 73.2000 (75.5680)  acc5: 91.6000 (93.4080)  time: 0.2999  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5398 s / it)
* Acc@1 75.818 Acc@5 93.460 loss 1.160
Accuracy of the model on the 50000 test images: 75.8%
Max accuracy: 75.82%
Epoch: [41]  [   0/1251]  eta: 1:18:43  lr: 0.003945  min_lr: 0.003945  loss: 3.7184 (3.7184)  weight_decay: 0.0500 (0.0500)  time: 3.7762  data: 3.1370  max mem: 54228
Epoch: [41]  [ 200/1251]  eta: 0:11:19  lr: 0.003944  min_lr: 0.003944  loss: 3.6866 (3.4846)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4930 (0.7224)  time: 0.6360  data: 0.0005  max mem: 54228
Epoch: [41]  [ 400/1251]  eta: 0:09:02  lr: 0.003943  min_lr: 0.003943  loss: 3.8540 (3.5222)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7906 (0.7696)  time: 0.6293  data: 0.0005  max mem: 54228
Epoch: [41]  [ 600/1251]  eta: 0:06:53  lr: 0.003942  min_lr: 0.003942  loss: 3.6180 (3.5258)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7240 (0.7775)  time: 0.6288  data: 0.0005  max mem: 54228
Epoch: [41]  [ 800/1251]  eta: 0:04:45  lr: 0.003941  min_lr: 0.003941  loss: 3.8674 (3.5286)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6355 (0.7752)  time: 0.6289  data: 0.0005  max mem: 54228
Epoch: [41]  [1000/1251]  eta: 0:02:38  lr: 0.003940  min_lr: 0.003940  loss: 3.6752 (3.5368)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5892 (0.7728)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [41]  [1200/1251]  eta: 0:00:32  lr: 0.003940  min_lr: 0.003940  loss: 3.6267 (3.5341)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6154 (0.7630)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [41]  [1250/1251]  eta: 0:00:00  lr: 0.003939  min_lr: 0.003939  loss: 3.3816 (3.5330)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6312 (0.7616)  time: 0.5380  data: 0.0006  max mem: 54228
Epoch: [41] Total time: 0:13:09 (0.6315 s / it)
Averaged stats: lr: 0.003939  min_lr: 0.003939  loss: 3.3816 (3.5603)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6312 (0.7616)
Test:  [ 0/25]  eta: 0:02:55  loss: 0.7455 (0.7455)  acc1: 85.2000 (85.2000)  acc5: 96.8000 (96.8000)  time: 7.0133  data: 6.6953  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.9240 (0.9108)  acc1: 80.0000 (80.5818)  acc5: 96.4000 (96.0727)  time: 0.9102  data: 0.6089  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 1.1353 (1.1131)  acc1: 73.2000 (76.2857)  acc5: 93.2000 (93.5048)  time: 0.2998  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.2584 (1.1242)  acc1: 73.2000 (75.9200)  acc5: 92.0000 (93.5040)  time: 0.2998  data: 0.0001  max mem: 54228
Test: Total time: 0:00:14 (0.5722 s / it)
* Acc@1 76.028 Acc@5 93.556 loss 1.117
Accuracy of the model on the 50000 test images: 76.0%
Max accuracy: 76.03%
Epoch: [42]  [   0/1251]  eta: 1:06:58  lr: 0.003939  min_lr: 0.003939  loss: 3.9906 (3.9906)  weight_decay: 0.0500 (0.0500)  time: 3.2126  data: 2.5598  max mem: 54228
Epoch: [42]  [ 200/1251]  eta: 0:11:16  lr: 0.003939  min_lr: 0.003939  loss: 3.7341 (3.5717)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7857 (0.8226)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [42]  [ 400/1251]  eta: 0:09:01  lr: 0.003938  min_lr: 0.003938  loss: 3.4929 (3.5712)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7224 (0.7878)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [42]  [ 600/1251]  eta: 0:06:52  lr: 0.003937  min_lr: 0.003937  loss: 3.4643 (3.5809)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5386 (0.7610)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [42]  [ 800/1251]  eta: 0:04:45  lr: 0.003936  min_lr: 0.003936  loss: 3.8473 (3.5839)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7702 (0.7812)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [42]  [1000/1251]  eta: 0:02:38  lr: 0.003935  min_lr: 0.003935  loss: 3.7890 (3.5662)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6694 (0.7808)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [42]  [1200/1251]  eta: 0:00:32  lr: 0.003934  min_lr: 0.003934  loss: 3.6087 (3.5756)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6254 (0.7762)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [42]  [1250/1251]  eta: 0:00:00  lr: 0.003934  min_lr: 0.003934  loss: 3.5379 (3.5716)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5709 (0.7679)  time: 0.5389  data: 0.0006  max mem: 54228
Epoch: [42] Total time: 0:13:08 (0.6306 s / it)
Averaged stats: lr: 0.003934  min_lr: 0.003934  loss: 3.5379 (3.5518)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5709 (0.7679)
Test:  [ 0/25]  eta: 0:02:34  loss: 0.8207 (0.8207)  acc1: 82.0000 (82.0000)  acc5: 97.2000 (97.2000)  time: 6.1826  data: 5.8427  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.9025 (0.9261)  acc1: 81.6000 (80.3636)  acc5: 96.4000 (96.0364)  time: 0.8354  data: 0.5315  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.1523 (1.1231)  acc1: 74.4000 (76.0952)  acc5: 92.8000 (93.6952)  time: 0.3007  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.2431 (1.1326)  acc1: 74.4000 (75.9200)  acc5: 92.4000 (93.5040)  time: 0.3008  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5490 s / it)
* Acc@1 76.222 Acc@5 93.726 loss 1.128
Accuracy of the model on the 50000 test images: 76.2%
Max accuracy: 76.22%
Epoch: [43]  [   0/1251]  eta: 1:11:20  lr: 0.003934  min_lr: 0.003934  loss: 4.0186 (4.0186)  weight_decay: 0.0500 (0.0500)  time: 3.4214  data: 2.7944  max mem: 54228
Epoch: [43]  [ 200/1251]  eta: 0:11:15  lr: 0.003933  min_lr: 0.003933  loss: 3.4689 (3.5036)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7431 (0.8707)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [43]  [ 400/1251]  eta: 0:09:02  lr: 0.003932  min_lr: 0.003932  loss: 3.6049 (3.5403)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6833 (0.7857)  time: 0.6425  data: 0.0005  max mem: 54228
Epoch: [43]  [ 600/1251]  eta: 0:06:52  lr: 0.003931  min_lr: 0.003931  loss: 3.6357 (3.5396)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6365 (0.8063)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [43]  [ 800/1251]  eta: 0:04:45  lr: 0.003930  min_lr: 0.003930  loss: 3.4660 (3.5440)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7988 (0.8161)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [43]  [1000/1251]  eta: 0:02:38  lr: 0.003929  min_lr: 0.003929  loss: 3.5946 (3.5311)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7768 (0.7961)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [43]  [1200/1251]  eta: 0:00:32  lr: 0.003928  min_lr: 0.003928  loss: 3.5846 (3.5326)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7869 (0.7937)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [43]  [1250/1251]  eta: 0:00:00  lr: 0.003928  min_lr: 0.003928  loss: 3.5410 (3.5296)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6350 (0.7899)  time: 0.5332  data: 0.0007  max mem: 54228
Epoch: [43] Total time: 0:13:08 (0.6305 s / it)
Averaged stats: lr: 0.003928  min_lr: 0.003928  loss: 3.5410 (3.5417)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6350 (0.7899)
Test:  [ 0/25]  eta: 0:02:46  loss: 0.8406 (0.8406)  acc1: 83.6000 (83.6000)  acc5: 97.6000 (97.6000)  time: 6.6439  data: 6.3150  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.9456 (0.9705)  acc1: 81.2000 (79.3091)  acc5: 96.0000 (96.1091)  time: 0.8760  data: 0.5744  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 1.2041 (1.1808)  acc1: 73.2000 (75.0286)  acc5: 93.2000 (93.5810)  time: 0.2990  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.3130 (1.1909)  acc1: 73.6000 (74.9120)  acc5: 92.0000 (93.3760)  time: 0.2989  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5581 s / it)
* Acc@1 75.806 Acc@5 93.634 loss 1.170
Accuracy of the model on the 50000 test images: 75.8%
Max accuracy: 76.22%
Epoch: [44]  [   0/1251]  eta: 1:28:15  lr: 0.003928  min_lr: 0.003928  loss: 3.3396 (3.3396)  weight_decay: 0.0500 (0.0500)  time: 4.2327  data: 3.3339  max mem: 54228
Epoch: [44]  [ 200/1251]  eta: 0:11:20  lr: 0.003927  min_lr: 0.003927  loss: 3.6953 (3.5468)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6589 (0.7600)  time: 0.6357  data: 0.0005  max mem: 54228
Epoch: [44]  [ 400/1251]  eta: 0:09:03  lr: 0.003926  min_lr: 0.003926  loss: 3.7853 (3.5164)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7999 (0.7302)  time: 0.6275  data: 0.0004  max mem: 54228
Epoch: [44]  [ 600/1251]  eta: 0:06:53  lr: 0.003925  min_lr: 0.003925  loss: 3.5211 (3.5058)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6903 (0.7628)  time: 0.6293  data: 0.0004  max mem: 54228
Epoch: [44]  [ 800/1251]  eta: 0:04:45  lr: 0.003924  min_lr: 0.003924  loss: 3.7665 (3.5196)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6559 (0.8074)  time: 0.6331  data: 0.0005  max mem: 54228
Epoch: [44]  [1000/1251]  eta: 0:02:38  lr: 0.003923  min_lr: 0.003923  loss: 3.7877 (3.5199)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6090 (0.7965)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [44]  [1200/1251]  eta: 0:00:32  lr: 0.003922  min_lr: 0.003922  loss: 3.7685 (3.5162)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8558 (0.8143)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [44]  [1250/1251]  eta: 0:00:00  lr: 0.003922  min_lr: 0.003922  loss: 3.5685 (3.5167)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6542 (0.8189)  time: 0.5388  data: 0.0006  max mem: 54228
Epoch: [44] Total time: 0:13:09 (0.6311 s / it)
Averaged stats: lr: 0.003922  min_lr: 0.003922  loss: 3.5685 (3.5335)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6542 (0.8189)
Test:  [ 0/25]  eta: 0:02:40  loss: 0.7534 (0.7534)  acc1: 84.8000 (84.8000)  acc5: 98.4000 (98.4000)  time: 6.4376  data: 6.0935  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.9764 (0.9514)  acc1: 81.2000 (80.9455)  acc5: 96.4000 (96.1818)  time: 0.8579  data: 0.5543  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.1857 (1.1329)  acc1: 73.6000 (76.8762)  acc5: 93.2000 (93.8095)  time: 0.2997  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.2396 (1.1430)  acc1: 73.2000 (76.5600)  acc5: 92.8000 (93.6800)  time: 0.2997  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5509 s / it)
* Acc@1 76.642 Acc@5 93.802 loss 1.132
Accuracy of the model on the 50000 test images: 76.6%
Max accuracy: 76.64%
Epoch: [45]  [   0/1251]  eta: 1:14:35  lr: 0.003922  min_lr: 0.003922  loss: 2.3550 (2.3550)  weight_decay: 0.0500 (0.0500)  time: 3.5772  data: 2.9526  max mem: 54228
Epoch: [45]  [ 200/1251]  eta: 0:11:19  lr: 0.003921  min_lr: 0.003921  loss: 3.7082 (3.5476)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6312 (0.7283)  time: 0.6288  data: 0.0005  max mem: 54228
Epoch: [45]  [ 400/1251]  eta: 0:09:02  lr: 0.003920  min_lr: 0.003920  loss: 3.6087 (3.5363)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9284 (0.7677)  time: 0.6355  data: 0.0004  max mem: 54228
Epoch: [45]  [ 600/1251]  eta: 0:06:53  lr: 0.003919  min_lr: 0.003919  loss: 3.7183 (3.5308)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4973 (0.7800)  time: 0.6336  data: 0.0004  max mem: 54228
Epoch: [45]  [ 800/1251]  eta: 0:04:45  lr: 0.003918  min_lr: 0.003918  loss: 3.1927 (3.5241)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6709 (0.7728)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [45]  [1000/1251]  eta: 0:02:38  lr: 0.003917  min_lr: 0.003917  loss: 3.8089 (3.5284)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6887 (0.7699)  time: 0.6290  data: 0.0005  max mem: 54228
Epoch: [45]  [1200/1251]  eta: 0:00:32  lr: 0.003916  min_lr: 0.003916  loss: 3.6911 (3.5362)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8995 (0.8005)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [45]  [1250/1251]  eta: 0:00:00  lr: 0.003916  min_lr: 0.003916  loss: 3.5169 (3.5367)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7790 (0.7960)  time: 0.5334  data: 0.0005  max mem: 54228
Epoch: [45] Total time: 0:13:09 (0.6312 s / it)
Averaged stats: lr: 0.003916  min_lr: 0.003916  loss: 3.5169 (3.5169)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7790 (0.7960)
Test:  [ 0/25]  eta: 0:02:35  loss: 0.7609 (0.7609)  acc1: 86.0000 (86.0000)  acc5: 96.4000 (96.4000)  time: 6.2279  data: 5.8744  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.9421 (0.9437)  acc1: 81.6000 (80.2909)  acc5: 96.4000 (96.0000)  time: 0.8387  data: 0.5343  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.1526 (1.1317)  acc1: 74.8000 (76.2667)  acc5: 93.2000 (93.7143)  time: 0.2998  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.1857 (1.1339)  acc1: 74.4000 (76.0320)  acc5: 92.0000 (93.6960)  time: 0.2998  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5402 s / it)
* Acc@1 76.638 Acc@5 93.890 loss 1.115
Accuracy of the model on the 50000 test images: 76.6%
Max accuracy: 76.64%
Epoch: [46]  [   0/1251]  eta: 1:20:12  lr: 0.003916  min_lr: 0.003916  loss: 4.1505 (4.1505)  weight_decay: 0.0500 (0.0500)  time: 3.8472  data: 2.8955  max mem: 54228
Epoch: [46]  [ 200/1251]  eta: 0:11:19  lr: 0.003914  min_lr: 0.003914  loss: 3.6735 (3.5625)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8253 (0.8236)  time: 0.6291  data: 0.0005  max mem: 54228
Epoch: [46]  [ 400/1251]  eta: 0:09:03  lr: 0.003913  min_lr: 0.003913  loss: 3.7735 (3.5369)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6978 (0.8093)  time: 0.6366  data: 0.0005  max mem: 54228
Epoch: [46]  [ 600/1251]  eta: 0:06:53  lr: 0.003912  min_lr: 0.003912  loss: 3.7885 (3.5425)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6538 (0.8193)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [46]  [ 800/1251]  eta: 0:04:45  lr: 0.003911  min_lr: 0.003911  loss: 3.5024 (3.5283)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5236 (0.8140)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [46]  [1000/1251]  eta: 0:02:38  lr: 0.003910  min_lr: 0.003910  loss: 3.5679 (3.5205)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7032 (0.8219)  time: 0.6345  data: 0.0004  max mem: 54228
Epoch: [46]  [1200/1251]  eta: 0:00:32  lr: 0.003909  min_lr: 0.003909  loss: 3.8638 (3.5228)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5447 (0.8107)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [46]  [1250/1251]  eta: 0:00:00  lr: 0.003909  min_lr: 0.003909  loss: 3.6231 (3.5246)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5447 (0.8083)  time: 0.5335  data: 0.0007  max mem: 54228
Epoch: [46] Total time: 0:13:09 (0.6314 s / it)
Averaged stats: lr: 0.003909  min_lr: 0.003909  loss: 3.6231 (3.5130)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5447 (0.8083)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.8229 (0.8229)  acc1: 84.4000 (84.4000)  acc5: 97.6000 (97.6000)  time: 5.5773  data: 5.2395  max mem: 54228
Test:  [10/25]  eta: 0:00:11  loss: 0.9893 (1.0095)  acc1: 79.6000 (81.2727)  acc5: 96.8000 (96.0364)  time: 0.7800  data: 0.4766  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.1887 (1.2056)  acc1: 74.4000 (77.0095)  acc5: 92.4000 (93.6000)  time: 0.3004  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.2768 (1.2168)  acc1: 74.4000 (76.6080)  acc5: 92.4000 (93.6000)  time: 0.3005  data: 0.0001  max mem: 54228
Test: Total time: 0:00:12 (0.5174 s / it)
* Acc@1 76.460 Acc@5 93.808 loss 1.210
Accuracy of the model on the 50000 test images: 76.5%
Max accuracy: 76.64%
Epoch: [47]  [   0/1251]  eta: 1:23:38  lr: 0.003909  min_lr: 0.003909  loss: 3.7304 (3.7304)  weight_decay: 0.0500 (0.0500)  time: 4.0115  data: 3.3175  max mem: 54228
Epoch: [47]  [ 200/1251]  eta: 0:11:18  lr: 0.003908  min_lr: 0.003908  loss: 3.5849 (3.5047)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0495 (0.9027)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [47]  [ 400/1251]  eta: 0:09:03  lr: 0.003907  min_lr: 0.003907  loss: 3.4597 (3.5247)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6282 (0.8436)  time: 0.6296  data: 0.0004  max mem: 54228
Epoch: [47]  [ 600/1251]  eta: 0:06:53  lr: 0.003906  min_lr: 0.003906  loss: 3.6766 (3.5208)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6274 (0.8614)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [47]  [ 800/1251]  eta: 0:04:45  lr: 0.003905  min_lr: 0.003905  loss: 3.5644 (3.5255)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8637 (0.8803)  time: 0.6341  data: 0.0005  max mem: 54228
Epoch: [47]  [1000/1251]  eta: 0:02:38  lr: 0.003904  min_lr: 0.003904  loss: 3.4735 (3.5233)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8160 (0.8808)  time: 0.6294  data: 0.0005  max mem: 54228
Epoch: [47]  [1200/1251]  eta: 0:00:32  lr: 0.003902  min_lr: 0.003902  loss: 3.7300 (3.5174)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6728 (0.8842)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [47]  [1250/1251]  eta: 0:00:00  lr: 0.003902  min_lr: 0.003902  loss: 3.5425 (3.5147)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7454 (0.8818)  time: 0.5335  data: 0.0007  max mem: 54228
Epoch: [47] Total time: 0:13:09 (0.6314 s / it)
Averaged stats: lr: 0.003902  min_lr: 0.003902  loss: 3.5425 (3.5030)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7454 (0.8818)
Test:  [ 0/25]  eta: 0:02:37  loss: 0.7941 (0.7941)  acc1: 84.8000 (84.8000)  acc5: 97.2000 (97.2000)  time: 6.3165  data: 6.0004  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.9810 (0.9818)  acc1: 80.8000 (79.6364)  acc5: 96.4000 (96.2909)  time: 0.8470  data: 0.5458  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.1741 (1.1648)  acc1: 74.4000 (76.0000)  acc5: 93.2000 (93.8286)  time: 0.2998  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.3090 (1.1692)  acc1: 73.6000 (75.7600)  acc5: 92.4000 (93.7120)  time: 0.2995  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5444 s / it)
* Acc@1 76.620 Acc@5 94.036 loss 1.155
Accuracy of the model on the 50000 test images: 76.6%
Max accuracy: 76.64%
Epoch: [48]  [   0/1251]  eta: 1:27:02  lr: 0.003902  min_lr: 0.003902  loss: 3.1025 (3.1025)  weight_decay: 0.0500 (0.0500)  time: 4.1746  data: 3.4665  max mem: 54228
Epoch: [48]  [ 200/1251]  eta: 0:11:21  lr: 0.003901  min_lr: 0.003901  loss: 3.6462 (3.5089)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7124 (0.7588)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [48]  [ 400/1251]  eta: 0:09:03  lr: 0.003900  min_lr: 0.003900  loss: 3.4123 (3.4929)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9991 (0.8318)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [48]  [ 600/1251]  eta: 0:06:53  lr: 0.003899  min_lr: 0.003899  loss: 3.6381 (3.4699)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7130 (nan)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [48]  [ 800/1251]  eta: 0:04:45  lr: 0.003898  min_lr: 0.003898  loss: 3.7259 (3.4870)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8584 (nan)  time: 0.6279  data: 0.0004  max mem: 54228
Epoch: [48]  [1000/1251]  eta: 0:02:38  lr: 0.003897  min_lr: 0.003897  loss: 3.6854 (3.4846)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6906 (nan)  time: 0.6277  data: 0.0004  max mem: 54228
Epoch: [48]  [1200/1251]  eta: 0:00:32  lr: 0.003895  min_lr: 0.003895  loss: 3.5865 (3.4823)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7825 (nan)  time: 0.6356  data: 0.0004  max mem: 54228
Epoch: [48]  [1250/1251]  eta: 0:00:00  lr: 0.003895  min_lr: 0.003895  loss: 3.8232 (3.4872)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6707 (nan)  time: 0.5337  data: 0.0005  max mem: 54228
Epoch: [48] Total time: 0:13:09 (0.6312 s / it)
Averaged stats: lr: 0.003895  min_lr: 0.003895  loss: 3.8232 (3.4928)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6707 (nan)
Test:  [ 0/25]  eta: 0:02:41  loss: 0.7728 (0.7728)  acc1: 85.2000 (85.2000)  acc5: 97.2000 (97.2000)  time: 6.4611  data: 6.1411  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.9396 (0.9870)  acc1: 83.6000 (80.5091)  acc5: 96.8000 (95.9636)  time: 0.8599  data: 0.5586  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.2072 (1.1456)  acc1: 74.4000 (76.8000)  acc5: 93.2000 (93.7714)  time: 0.2995  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.2349 (1.1538)  acc1: 73.2000 (76.4480)  acc5: 93.2000 (93.6800)  time: 0.2993  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5507 s / it)
* Acc@1 76.592 Acc@5 93.808 loss 1.141
Accuracy of the model on the 50000 test images: 76.6%
Max accuracy: 76.64%
Epoch: [49]  [   0/1251]  eta: 1:24:48  lr: 0.003895  min_lr: 0.003895  loss: 3.4683 (3.4683)  weight_decay: 0.0500 (0.0500)  time: 4.0675  data: 2.6833  max mem: 54228
Epoch: [49]  [ 200/1251]  eta: 0:11:18  lr: 0.003894  min_lr: 0.003894  loss: 3.6180 (3.5249)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6231 (0.7637)  time: 0.6276  data: 0.0005  max mem: 54228
Epoch: [49]  [ 400/1251]  eta: 0:09:01  lr: 0.003893  min_lr: 0.003893  loss: 3.5899 (3.5035)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7756 (0.7925)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [49]  [ 600/1251]  eta: 0:06:53  lr: 0.003892  min_lr: 0.003892  loss: 3.5208 (3.4804)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5414 (0.8187)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [49]  [ 800/1251]  eta: 0:04:45  lr: 0.003890  min_lr: 0.003890  loss: 3.3958 (3.4650)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7630 (0.8209)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [49]  [1000/1251]  eta: 0:02:38  lr: 0.003889  min_lr: 0.003889  loss: 3.3363 (3.4719)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7552 (0.8374)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [49]  [1200/1251]  eta: 0:00:32  lr: 0.003888  min_lr: 0.003888  loss: 3.6779 (3.4913)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6475 (0.8308)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [49]  [1250/1251]  eta: 0:00:00  lr: 0.003888  min_lr: 0.003888  loss: 3.7656 (3.4957)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9285 (0.8384)  time: 0.5331  data: 0.0007  max mem: 54228
Epoch: [49] Total time: 0:13:09 (0.6309 s / it)
Averaged stats: lr: 0.003888  min_lr: 0.003888  loss: 3.7656 (3.4822)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9285 (0.8384)
Test:  [ 0/25]  eta: 0:02:41  loss: 0.8525 (0.8525)  acc1: 83.6000 (83.6000)  acc5: 98.0000 (98.0000)  time: 6.4524  data: 6.1143  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 1.0122 (1.0298)  acc1: 82.0000 (80.4364)  acc5: 96.4000 (96.1091)  time: 0.8591  data: 0.5562  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.2432 (1.1972)  acc1: 74.4000 (76.6857)  acc5: 93.2000 (94.1524)  time: 0.2996  data: 0.0003  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.2432 (1.2028)  acc1: 74.4000 (76.5120)  acc5: 92.8000 (94.0000)  time: 0.2994  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5492 s / it)
* Acc@1 76.836 Acc@5 94.084 loss 1.198
Accuracy of the model on the 50000 test images: 76.8%
Max accuracy: 76.84%
Epoch: [50]  [   0/1251]  eta: 1:13:10  lr: 0.003888  min_lr: 0.003888  loss: 4.0742 (4.0742)  weight_decay: 0.0500 (0.0500)  time: 3.5098  data: 2.8712  max mem: 54228
Epoch: [50]  [ 200/1251]  eta: 0:11:15  lr: 0.003887  min_lr: 0.003887  loss: 3.6430 (3.4374)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9614 (1.0114)  time: 0.6284  data: 0.0006  max mem: 54228
Epoch: [50]  [ 400/1251]  eta: 0:09:02  lr: 0.003885  min_lr: 0.003885  loss: 3.5408 (3.4907)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9177 (0.9719)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [50]  [ 600/1251]  eta: 0:06:52  lr: 0.003884  min_lr: 0.003884  loss: 3.3312 (3.4605)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7668 (0.9226)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [50]  [ 800/1251]  eta: 0:04:45  lr: 0.003883  min_lr: 0.003883  loss: 3.5416 (3.4589)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7779 (0.9154)  time: 0.6331  data: 0.0005  max mem: 54228
Epoch: [50]  [1000/1251]  eta: 0:02:38  lr: 0.003882  min_lr: 0.003882  loss: 3.4109 (3.4630)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7555 (0.9406)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [50]  [1200/1251]  eta: 0:00:32  lr: 0.003881  min_lr: 0.003881  loss: 3.7293 (3.4693)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8917 (0.9556)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [50]  [1250/1251]  eta: 0:00:00  lr: 0.003880  min_lr: 0.003880  loss: 3.5510 (3.4704)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7026 (0.9466)  time: 0.5331  data: 0.0005  max mem: 54228
Epoch: [50] Total time: 0:13:08 (0.6307 s / it)
Averaged stats: lr: 0.003880  min_lr: 0.003880  loss: 3.5510 (3.4746)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7026 (0.9466)
Test:  [ 0/25]  eta: 0:02:36  loss: 0.6502 (0.6502)  acc1: 85.2000 (85.2000)  acc5: 97.6000 (97.6000)  time: 6.2729  data: 5.9336  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8460 (0.8716)  acc1: 81.6000 (81.0182)  acc5: 96.8000 (96.7636)  time: 0.8428  data: 0.5398  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.0855 (1.0531)  acc1: 74.0000 (77.1048)  acc5: 94.4000 (94.5143)  time: 0.2997  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.2238 (1.0648)  acc1: 74.8000 (76.8320)  acc5: 92.8000 (94.2400)  time: 0.2996  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5420 s / it)
* Acc@1 77.062 Acc@5 94.192 loss 1.055
Accuracy of the model on the 50000 test images: 77.1%
Max accuracy: 77.06%
Epoch: [51]  [   0/1251]  eta: 0:58:30  lr: 0.003880  min_lr: 0.003880  loss: 3.7013 (3.7013)  weight_decay: 0.0500 (0.0500)  time: 2.8065  data: 2.1757  max mem: 54228
Epoch: [51]  [ 200/1251]  eta: 0:11:14  lr: 0.003879  min_lr: 0.003879  loss: 3.4763 (3.4780)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8436 (0.8430)  time: 0.6285  data: 0.0004  max mem: 54228
Epoch: [51]  [ 400/1251]  eta: 0:09:00  lr: 0.003878  min_lr: 0.003878  loss: 3.3997 (3.4561)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7647 (0.9222)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [51]  [ 600/1251]  eta: 0:06:52  lr: 0.003877  min_lr: 0.003877  loss: 3.8107 (3.4783)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6239 (0.9138)  time: 0.6290  data: 0.0004  max mem: 54228
Epoch: [51]  [ 800/1251]  eta: 0:04:45  lr: 0.003875  min_lr: 0.003875  loss: 3.5258 (3.4571)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8658 (0.8849)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [51]  [1000/1251]  eta: 0:02:38  lr: 0.003874  min_lr: 0.003874  loss: 3.4742 (3.4690)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6136 (0.8565)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [51]  [1200/1251]  eta: 0:00:32  lr: 0.003873  min_lr: 0.003873  loss: 3.5011 (3.4677)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7613 (0.8481)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [51]  [1250/1251]  eta: 0:00:00  lr: 0.003873  min_lr: 0.003873  loss: 3.2481 (3.4662)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6854 (0.8478)  time: 0.5332  data: 0.0006  max mem: 54228
Epoch: [51] Total time: 0:13:08 (0.6304 s / it)
Averaged stats: lr: 0.003873  min_lr: 0.003873  loss: 3.2481 (3.4768)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6854 (0.8478)
Test:  [ 0/25]  eta: 0:02:42  loss: 0.7802 (0.7802)  acc1: 84.8000 (84.8000)  acc5: 97.6000 (97.6000)  time: 6.4836  data: 6.1672  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8619 (0.9018)  acc1: 81.6000 (80.8000)  acc5: 97.2000 (96.1818)  time: 0.8622  data: 0.5609  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.1161 (1.0757)  acc1: 74.8000 (76.8381)  acc5: 93.6000 (94.1905)  time: 0.2997  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.1713 (1.0864)  acc1: 74.8000 (76.6560)  acc5: 93.6000 (94.1920)  time: 0.2995  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5532 s / it)
* Acc@1 77.194 Acc@5 94.140 loss 1.075
Accuracy of the model on the 50000 test images: 77.2%
Max accuracy: 77.19%
Epoch: [52]  [   0/1251]  eta: 1:19:18  lr: 0.003873  min_lr: 0.003873  loss: 4.0375 (4.0375)  weight_decay: 0.0500 (0.0500)  time: 3.8041  data: 3.1659  max mem: 54228
Epoch: [52]  [ 200/1251]  eta: 0:11:17  lr: 0.003871  min_lr: 0.003871  loss: 3.7357 (3.4566)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9872 (1.0611)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [52]  [ 400/1251]  eta: 0:09:02  lr: 0.003870  min_lr: 0.003870  loss: 3.6018 (3.4389)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7360 (0.9660)  time: 0.6286  data: 0.0004  max mem: 54228
Epoch: [52]  [ 600/1251]  eta: 0:06:53  lr: 0.003869  min_lr: 0.003869  loss: 3.4875 (3.4511)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7564 (0.9502)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [52]  [ 800/1251]  eta: 0:04:45  lr: 0.003867  min_lr: 0.003867  loss: 3.6750 (3.4580)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7979 (0.9412)  time: 0.6286  data: 0.0004  max mem: 54228
Epoch: [52]  [1000/1251]  eta: 0:02:38  lr: 0.003866  min_lr: 0.003866  loss: 3.6757 (3.4669)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7968 (0.9008)  time: 0.6287  data: 0.0004  max mem: 54228
Epoch: [52]  [1200/1251]  eta: 0:00:32  lr: 0.003865  min_lr: 0.003865  loss: 3.6610 (3.4553)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9390 (0.9048)  time: 0.6285  data: 0.0004  max mem: 54228
Epoch: [52]  [1250/1251]  eta: 0:00:00  lr: 0.003865  min_lr: 0.003865  loss: 3.5195 (3.4621)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7622 (0.9042)  time: 0.5333  data: 0.0005  max mem: 54228
Epoch: [52] Total time: 0:13:09 (0.6314 s / it)
Averaged stats: lr: 0.003865  min_lr: 0.003865  loss: 3.5195 (3.4594)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7622 (0.9042)
Test:  [ 0/25]  eta: 0:02:39  loss: 0.6615 (0.6615)  acc1: 87.6000 (87.6000)  acc5: 98.0000 (98.0000)  time: 6.3896  data: 6.0507  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8879 (0.8948)  acc1: 82.0000 (81.6000)  acc5: 96.8000 (96.4727)  time: 0.8536  data: 0.5504  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.0834 (1.0942)  acc1: 73.6000 (77.2000)  acc5: 94.0000 (94.1333)  time: 0.2997  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.1500 (1.0957)  acc1: 75.2000 (77.1360)  acc5: 93.2000 (94.0800)  time: 0.2995  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5480 s / it)
* Acc@1 77.194 Acc@5 94.220 loss 1.095
Accuracy of the model on the 50000 test images: 77.2%
Max accuracy: 77.19%
Epoch: [53]  [   0/1251]  eta: 1:06:47  lr: 0.003865  min_lr: 0.003865  loss: 3.8462 (3.8462)  weight_decay: 0.0500 (0.0500)  time: 3.2031  data: 2.5741  max mem: 54228
Epoch: [53]  [ 200/1251]  eta: 0:11:13  lr: 0.003863  min_lr: 0.003863  loss: 3.3677 (3.4609)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9890 (1.0255)  time: 0.6293  data: 0.0004  max mem: 54228
Epoch: [53]  [ 400/1251]  eta: 0:09:01  lr: 0.003862  min_lr: 0.003862  loss: 3.1594 (3.4393)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0582 (1.0022)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [53]  [ 600/1251]  eta: 0:06:52  lr: 0.003861  min_lr: 0.003861  loss: 3.5759 (3.4299)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6976 (0.9627)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [53]  [ 800/1251]  eta: 0:04:45  lr: 0.003859  min_lr: 0.003859  loss: 3.6138 (3.4354)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7093 (0.9298)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [53]  [1000/1251]  eta: 0:02:38  lr: 0.003858  min_lr: 0.003858  loss: 3.5212 (3.4328)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9377 (0.9357)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [53]  [1200/1251]  eta: 0:00:32  lr: 0.003857  min_lr: 0.003857  loss: 3.7130 (3.4390)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9119 (0.9390)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [53]  [1250/1251]  eta: 0:00:00  lr: 0.003856  min_lr: 0.003856  loss: 3.4729 (3.4403)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7328 (0.9331)  time: 0.5333  data: 0.0006  max mem: 54228
Epoch: [53] Total time: 0:13:08 (0.6303 s / it)
Averaged stats: lr: 0.003856  min_lr: 0.003856  loss: 3.4729 (3.4556)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7328 (0.9331)
Test:  [ 0/25]  eta: 0:02:48  loss: 0.6934 (0.6934)  acc1: 86.0000 (86.0000)  acc5: 97.2000 (97.2000)  time: 6.7351  data: 6.4033  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.8639 (0.8936)  acc1: 83.2000 (81.7091)  acc5: 96.8000 (96.4364)  time: 0.8840  data: 0.5824  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 1.1010 (1.0772)  acc1: 75.6000 (77.5619)  acc5: 93.6000 (94.2095)  time: 0.2988  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.2097 (1.0915)  acc1: 74.8000 (77.1520)  acc5: 92.4000 (94.0000)  time: 0.2987  data: 0.0001  max mem: 54228
Test: Total time: 0:00:14 (0.5607 s / it)
* Acc@1 77.252 Acc@5 94.290 loss 1.089
Accuracy of the model on the 50000 test images: 77.3%
Max accuracy: 77.25%
Epoch: [54]  [   0/1251]  eta: 1:04:47  lr: 0.003856  min_lr: 0.003856  loss: 3.8198 (3.8198)  weight_decay: 0.0500 (0.0500)  time: 3.1078  data: 2.4710  max mem: 54228
Epoch: [54]  [ 200/1251]  eta: 0:11:15  lr: 0.003855  min_lr: 0.003855  loss: 3.7835 (3.4087)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7897 (0.9758)  time: 0.6356  data: 0.0005  max mem: 54228
Epoch: [54]  [ 400/1251]  eta: 0:09:01  lr: 0.003854  min_lr: 0.003854  loss: 3.5887 (3.4157)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7148 (0.8779)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [54]  [ 600/1251]  eta: 0:06:52  lr: 0.003852  min_lr: 0.003852  loss: 3.6295 (3.4079)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1145 (0.9566)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [54]  [ 800/1251]  eta: 0:04:45  lr: 0.003851  min_lr: 0.003851  loss: 3.6131 (3.4264)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0717 (0.9514)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [54]  [1000/1251]  eta: 0:02:38  lr: 0.003849  min_lr: 0.003849  loss: 3.5733 (3.4301)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8428 (0.9290)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [54]  [1200/1251]  eta: 0:00:32  lr: 0.003848  min_lr: 0.003848  loss: 3.5517 (3.4453)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8541 (0.9287)  time: 0.6279  data: 0.0005  max mem: 54228
Epoch: [54]  [1250/1251]  eta: 0:00:00  lr: 0.003848  min_lr: 0.003848  loss: 3.6241 (3.4450)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8037 (0.9291)  time: 0.5330  data: 0.0005  max mem: 54228
Epoch: [54] Total time: 0:13:08 (0.6301 s / it)
Averaged stats: lr: 0.003848  min_lr: 0.003848  loss: 3.6241 (3.4434)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8037 (0.9291)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.8334 (0.8334)  acc1: 85.6000 (85.6000)  acc5: 98.0000 (98.0000)  time: 5.3684  data: 5.0200  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.9602 (0.9955)  acc1: 83.2000 (80.8000)  acc5: 96.8000 (96.5818)  time: 0.8193  data: 0.5153  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.1691 (1.1695)  acc1: 74.0000 (76.9333)  acc5: 94.4000 (94.3048)  time: 0.3321  data: 0.0324  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.3147 (1.1810)  acc1: 74.8000 (76.7520)  acc5: 93.2000 (94.2560)  time: 0.2998  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5326 s / it)
* Acc@1 77.124 Acc@5 94.264 loss 1.165
Accuracy of the model on the 50000 test images: 77.1%
Max accuracy: 77.25%
Epoch: [55]  [   0/1251]  eta: 1:24:52  lr: 0.003848  min_lr: 0.003848  loss: 2.7772 (2.7772)  weight_decay: 0.0500 (0.0500)  time: 4.0711  data: 3.2819  max mem: 54228
Epoch: [55]  [ 200/1251]  eta: 0:11:21  lr: 0.003846  min_lr: 0.003846  loss: 3.3951 (3.4190)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9058 (1.0536)  time: 0.6278  data: 0.0005  max mem: 54228
Epoch: [55]  [ 400/1251]  eta: 0:09:03  lr: 0.003845  min_lr: 0.003845  loss: 3.6537 (3.4666)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8775 (0.9878)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [55]  [ 600/1251]  eta: 0:06:53  lr: 0.003844  min_lr: 0.003844  loss: 3.6128 (3.4623)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7122 (0.9570)  time: 0.6333  data: 0.0005  max mem: 54228
Epoch: [55]  [ 800/1251]  eta: 0:04:45  lr: 0.003842  min_lr: 0.003842  loss: 3.5572 (3.4629)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6835 (0.9134)  time: 0.6278  data: 0.0004  max mem: 54228
Epoch: [55]  [1000/1251]  eta: 0:02:38  lr: 0.003841  min_lr: 0.003841  loss: 3.5191 (3.4586)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7191 (0.9134)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [55]  [1200/1251]  eta: 0:00:32  lr: 0.003839  min_lr: 0.003839  loss: 3.4176 (3.4381)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3591 (nan)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [55]  [1250/1251]  eta: 0:00:00  lr: 0.003839  min_lr: 0.003839  loss: 3.5024 (3.4399)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7076 (nan)  time: 0.5337  data: 0.0007  max mem: 54228
Epoch: [55] Total time: 0:13:09 (0.6314 s / it)
Averaged stats: lr: 0.003839  min_lr: 0.003839  loss: 3.5024 (3.4323)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7076 (nan)
Test:  [ 0/25]  eta: 0:02:32  loss: 0.7426 (0.7426)  acc1: 85.6000 (85.6000)  acc5: 98.4000 (98.4000)  time: 6.1198  data: 5.7590  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.9261 (0.9405)  acc1: 82.8000 (81.1273)  acc5: 96.8000 (96.4364)  time: 0.8291  data: 0.5239  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.1612 (1.1413)  acc1: 74.0000 (77.1048)  acc5: 93.2000 (93.8286)  time: 0.2999  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.2703 (1.1477)  acc1: 74.0000 (77.0080)  acc5: 92.0000 (93.7440)  time: 0.2998  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5382 s / it)
* Acc@1 77.192 Acc@5 94.256 loss 1.144
Accuracy of the model on the 50000 test images: 77.2%
Max accuracy: 77.25%
Epoch: [56]  [   0/1251]  eta: 1:27:38  lr: 0.003839  min_lr: 0.003839  loss: 3.9776 (3.9776)  weight_decay: 0.0500 (0.0500)  time: 4.2035  data: 3.5089  max mem: 54228
Epoch: [56]  [ 200/1251]  eta: 0:11:18  lr: 0.003838  min_lr: 0.003838  loss: 3.4411 (3.4068)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8096 (0.8095)  time: 0.6279  data: 0.0004  max mem: 54228
Epoch: [56]  [ 400/1251]  eta: 0:09:03  lr: 0.003836  min_lr: 0.003836  loss: 3.6896 (3.4308)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8797 (0.9118)  time: 0.6364  data: 0.0005  max mem: 54228
Epoch: [56]  [ 600/1251]  eta: 0:06:53  lr: 0.003835  min_lr: 0.003835  loss: 3.6216 (3.4187)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7229 (0.8966)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [56]  [ 800/1251]  eta: 0:04:45  lr: 0.003833  min_lr: 0.003833  loss: 3.6542 (3.4271)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0611 (0.9353)  time: 0.6290  data: 0.0005  max mem: 54228
Epoch: [56]  [1000/1251]  eta: 0:02:38  lr: 0.003832  min_lr: 0.003832  loss: 3.6749 (3.4332)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6574 (0.9429)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [56]  [1200/1251]  eta: 0:00:32  lr: 0.003831  min_lr: 0.003831  loss: 3.5836 (3.4384)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9590 (0.9368)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [56]  [1250/1251]  eta: 0:00:00  lr: 0.003830  min_lr: 0.003830  loss: 3.4645 (3.4405)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7012 (0.9336)  time: 0.5331  data: 0.0006  max mem: 54228
Epoch: [56] Total time: 0:13:09 (0.6312 s / it)
Averaged stats: lr: 0.003830  min_lr: 0.003830  loss: 3.4645 (3.4371)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7012 (0.9336)
Test:  [ 0/25]  eta: 0:02:32  loss: 0.7539 (0.7539)  acc1: 83.6000 (83.6000)  acc5: 97.6000 (97.6000)  time: 6.0978  data: 5.7680  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.9467 (0.9060)  acc1: 81.2000 (81.4909)  acc5: 97.2000 (96.5455)  time: 0.8268  data: 0.5247  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.1059 (1.0689)  acc1: 76.0000 (77.3714)  acc5: 94.0000 (94.4000)  time: 0.2997  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.1975 (1.0762)  acc1: 75.6000 (77.2480)  acc5: 92.8000 (94.4640)  time: 0.2996  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5383 s / it)
* Acc@1 77.402 Acc@5 94.348 loss 1.072
Accuracy of the model on the 50000 test images: 77.4%
Max accuracy: 77.40%
Epoch: [57]  [   0/1251]  eta: 1:12:18  lr: 0.003830  min_lr: 0.003830  loss: 3.3811 (3.3811)  weight_decay: 0.0500 (0.0500)  time: 3.4684  data: 2.8401  max mem: 54228
Epoch: [57]  [ 200/1251]  eta: 0:11:15  lr: 0.003829  min_lr: 0.003829  loss: 3.1774 (3.3945)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9263 (1.1293)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [57]  [ 400/1251]  eta: 0:09:02  lr: 0.003827  min_lr: 0.003827  loss: 3.5489 (3.4222)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7015 (1.0366)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [57]  [ 600/1251]  eta: 0:06:52  lr: 0.003826  min_lr: 0.003826  loss: 3.5908 (3.4181)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7492 (0.9929)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [57]  [ 800/1251]  eta: 0:04:45  lr: 0.003824  min_lr: 0.003824  loss: 3.4207 (3.4141)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0221 (0.9986)  time: 0.6343  data: 0.0005  max mem: 54228
Epoch: [57]  [1000/1251]  eta: 0:02:38  lr: 0.003823  min_lr: 0.003823  loss: 3.4611 (3.4131)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6583 (0.9780)  time: 0.6290  data: 0.0005  max mem: 54228
Epoch: [57]  [1200/1251]  eta: 0:00:32  lr: 0.003821  min_lr: 0.003821  loss: 3.4261 (3.4095)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8312 (0.9719)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [57]  [1250/1251]  eta: 0:00:00  lr: 0.003821  min_lr: 0.003821  loss: 3.5216 (3.4085)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9100 (0.9735)  time: 0.5336  data: 0.0006  max mem: 54228
Epoch: [57] Total time: 0:13:09 (0.6311 s / it)
Averaged stats: lr: 0.003821  min_lr: 0.003821  loss: 3.5216 (3.4205)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9100 (0.9735)
Test:  [ 0/25]  eta: 0:02:41  loss: 0.7295 (0.7295)  acc1: 86.8000 (86.8000)  acc5: 98.8000 (98.8000)  time: 6.4675  data: 6.1278  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8986 (0.8987)  acc1: 81.6000 (80.9455)  acc5: 97.6000 (96.8727)  time: 0.8602  data: 0.5573  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.1415 (1.0700)  acc1: 73.6000 (77.1810)  acc5: 94.0000 (94.8571)  time: 0.2994  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.1686 (1.0800)  acc1: 73.6000 (76.9760)  acc5: 93.6000 (94.6880)  time: 0.2993  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5505 s / it)
* Acc@1 77.676 Acc@5 94.510 loss 1.076
Accuracy of the model on the 50000 test images: 77.7%
Max accuracy: 77.68%
Epoch: [58]  [   0/1251]  eta: 1:15:55  lr: 0.003821  min_lr: 0.003821  loss: 3.5433 (3.5433)  weight_decay: 0.0500 (0.0500)  time: 3.6414  data: 2.9954  max mem: 54228
Epoch: [58]  [ 200/1251]  eta: 0:11:18  lr: 0.003820  min_lr: 0.003820  loss: 3.4888 (3.4266)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8784 (1.0023)  time: 0.6291  data: 0.0005  max mem: 54228
Epoch: [58]  [ 400/1251]  eta: 0:09:02  lr: 0.003818  min_lr: 0.003818  loss: 3.6041 (3.4017)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7834 (0.9546)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [58]  [ 600/1251]  eta: 0:06:53  lr: 0.003817  min_lr: 0.003817  loss: 3.4421 (3.3917)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0160 (0.9714)  time: 0.6362  data: 0.0005  max mem: 54228
Epoch: [58]  [ 800/1251]  eta: 0:04:45  lr: 0.003815  min_lr: 0.003815  loss: 3.5402 (3.3951)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7488 (0.9573)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [58]  [1000/1251]  eta: 0:02:38  lr: 0.003813  min_lr: 0.003813  loss: 2.9931 (3.3836)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8728 (0.9461)  time: 0.6288  data: 0.0005  max mem: 54228
Epoch: [58]  [1200/1251]  eta: 0:00:32  lr: 0.003812  min_lr: 0.003812  loss: 3.3928 (3.3880)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7954 (0.9393)  time: 0.6391  data: 0.0005  max mem: 54228
Epoch: [58]  [1250/1251]  eta: 0:00:00  lr: 0.003812  min_lr: 0.003812  loss: 3.4075 (3.3855)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9741 (0.9498)  time: 0.5340  data: 0.0006  max mem: 54228
Epoch: [58] Total time: 0:13:10 (0.6315 s / it)
Averaged stats: lr: 0.003812  min_lr: 0.003812  loss: 3.4075 (3.4157)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9741 (0.9498)
Test:  [ 0/25]  eta: 0:02:51  loss: 0.7414 (0.7414)  acc1: 85.2000 (85.2000)  acc5: 97.6000 (97.6000)  time: 6.8633  data: 6.5266  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.8345 (0.8850)  acc1: 82.4000 (81.4909)  acc5: 97.2000 (96.6545)  time: 0.8963  data: 0.5936  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 1.0991 (1.0565)  acc1: 74.4000 (77.7524)  acc5: 94.4000 (94.5524)  time: 0.2995  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.1484 (1.0666)  acc1: 74.4000 (77.4880)  acc5: 92.8000 (94.4160)  time: 0.2995  data: 0.0001  max mem: 54228
Test: Total time: 0:00:14 (0.5653 s / it)
* Acc@1 77.824 Acc@5 94.498 loss 1.057
Accuracy of the model on the 50000 test images: 77.8%
Max accuracy: 77.82%
Epoch: [59]  [   0/1251]  eta: 1:05:24  lr: 0.003812  min_lr: 0.003812  loss: 2.3885 (2.3885)  weight_decay: 0.0500 (0.0500)  time: 3.1367  data: 2.5011  max mem: 54228
Epoch: [59]  [ 200/1251]  eta: 0:11:13  lr: 0.003810  min_lr: 0.003810  loss: 3.7346 (3.4422)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8687 (1.1944)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [59]  [ 400/1251]  eta: 0:08:59  lr: 0.003809  min_lr: 0.003809  loss: 3.2823 (3.4084)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6911 (0.9763)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [59]  [ 600/1251]  eta: 0:06:52  lr: 0.003807  min_lr: 0.003807  loss: 3.5568 (3.4287)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9696 (0.9880)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [59]  [ 800/1251]  eta: 0:04:45  lr: 0.003805  min_lr: 0.003805  loss: 3.4541 (3.4283)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7893 (0.9646)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [59]  [1000/1251]  eta: 0:02:38  lr: 0.003804  min_lr: 0.003804  loss: 3.4995 (3.4329)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0520 (0.9573)  time: 0.6335  data: 0.0005  max mem: 54228
Epoch: [59]  [1200/1251]  eta: 0:00:32  lr: 0.003802  min_lr: 0.003802  loss: 3.5180 (3.4338)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6568 (0.9541)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [59]  [1250/1251]  eta: 0:00:00  lr: 0.003802  min_lr: 0.003802  loss: 3.2368 (3.4320)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7338 (0.9496)  time: 0.5332  data: 0.0006  max mem: 54228
Epoch: [59] Total time: 0:13:08 (0.6302 s / it)
Averaged stats: lr: 0.003802  min_lr: 0.003802  loss: 3.2368 (3.4186)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7338 (0.9496)
Test:  [ 0/25]  eta: 0:01:43  loss: 0.6196 (0.6196)  acc1: 86.4000 (86.4000)  acc5: 98.8000 (98.8000)  time: 4.1296  data: 3.8012  max mem: 54228
Test:  [10/25]  eta: 0:00:11  loss: 0.8173 (0.8252)  acc1: 83.2000 (81.7818)  acc5: 97.2000 (96.8364)  time: 0.7999  data: 0.4975  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.0802 (1.0100)  acc1: 75.2000 (77.8095)  acc5: 93.2000 (94.4191)  time: 0.3837  data: 0.0836  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.1137 (1.0242)  acc1: 75.2000 (77.6960)  acc5: 92.8000 (94.2080)  time: 0.3004  data: 0.0002  max mem: 54228
Test: Total time: 0:00:13 (0.5250 s / it)
* Acc@1 77.608 Acc@5 94.496 loss 1.022
Accuracy of the model on the 50000 test images: 77.6%
Max accuracy: 77.82%
Epoch: [60]  [   0/1251]  eta: 1:29:08  lr: 0.003802  min_lr: 0.003802  loss: 3.4812 (3.4812)  weight_decay: 0.0500 (0.0500)  time: 4.2750  data: 1.7498  max mem: 54228
Epoch: [60]  [ 200/1251]  eta: 0:11:19  lr: 0.003800  min_lr: 0.003800  loss: 3.6523 (3.3498)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0195 (0.9758)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [60]  [ 400/1251]  eta: 0:09:03  lr: 0.003799  min_lr: 0.003799  loss: 3.5222 (3.4110)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0126 (1.0475)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [60]  [ 600/1251]  eta: 0:06:53  lr: 0.003797  min_lr: 0.003797  loss: 3.4536 (3.4136)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7180 (0.9768)  time: 0.6291  data: 0.0005  max mem: 54228
Epoch: [60]  [ 800/1251]  eta: 0:04:45  lr: 0.003796  min_lr: 0.003796  loss: 3.2645 (3.4079)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0128 (0.9961)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [60]  [1000/1251]  eta: 0:02:38  lr: 0.003794  min_lr: 0.003794  loss: 3.5783 (3.4113)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7294 (0.9541)  time: 0.6290  data: 0.0005  max mem: 54228
Epoch: [60]  [1200/1251]  eta: 0:00:32  lr: 0.003793  min_lr: 0.003793  loss: 3.6931 (3.4079)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9088 (0.9461)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [60]  [1250/1251]  eta: 0:00:00  lr: 0.003792  min_lr: 0.003792  loss: 3.4180 (3.4063)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7451 (0.9444)  time: 0.5335  data: 0.0006  max mem: 54228
Epoch: [60] Total time: 0:13:10 (0.6316 s / it)
Averaged stats: lr: 0.003792  min_lr: 0.003792  loss: 3.4180 (3.4023)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7451 (0.9444)
Test:  [ 0/25]  eta: 0:02:43  loss: 0.6674 (0.6674)  acc1: 87.2000 (87.2000)  acc5: 98.4000 (98.4000)  time: 6.5323  data: 6.1867  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8782 (0.8891)  acc1: 82.4000 (81.7091)  acc5: 97.6000 (96.7636)  time: 0.8661  data: 0.5627  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.0807 (1.0603)  acc1: 76.4000 (78.0000)  acc5: 93.6000 (94.4952)  time: 0.2993  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.1775 (1.0728)  acc1: 76.4000 (77.6800)  acc5: 93.6000 (94.4160)  time: 0.2992  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5540 s / it)
* Acc@1 77.664 Acc@5 94.436 loss 1.065
Accuracy of the model on the 50000 test images: 77.7%
Max accuracy: 77.82%
Epoch: [61]  [   0/1251]  eta: 1:25:59  lr: 0.003792  min_lr: 0.003792  loss: 3.4107 (3.4107)  weight_decay: 0.0500 (0.0500)  time: 4.1244  data: 2.8658  max mem: 54228
Epoch: [61]  [ 200/1251]  eta: 0:11:21  lr: 0.003791  min_lr: 0.003791  loss: 3.5510 (3.3969)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8041 (1.0997)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [61]  [ 400/1251]  eta: 0:09:03  lr: 0.003789  min_lr: 0.003789  loss: 3.4951 (3.4222)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8561 (0.9512)  time: 0.6278  data: 0.0005  max mem: 54228
Epoch: [61]  [ 600/1251]  eta: 0:06:53  lr: 0.003787  min_lr: 0.003787  loss: 3.4234 (3.4167)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8981 (0.9856)  time: 0.6278  data: 0.0005  max mem: 54228
Epoch: [61]  [ 800/1251]  eta: 0:04:45  lr: 0.003786  min_lr: 0.003786  loss: 3.4405 (3.3971)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9228 (0.9772)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [61]  [1000/1251]  eta: 0:02:38  lr: 0.003784  min_lr: 0.003784  loss: 3.5353 (3.4065)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2266 (1.0009)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [61]  [1200/1251]  eta: 0:00:32  lr: 0.003782  min_lr: 0.003782  loss: 3.5337 (3.4031)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9170 (0.9799)  time: 0.6336  data: 0.0005  max mem: 54228
Epoch: [61]  [1250/1251]  eta: 0:00:00  lr: 0.003782  min_lr: 0.003782  loss: 3.4110 (3.4022)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8328 (0.9775)  time: 0.5364  data: 0.0007  max mem: 54228
Epoch: [61] Total time: 0:13:09 (0.6310 s / it)
Averaged stats: lr: 0.003782  min_lr: 0.003782  loss: 3.4110 (3.3960)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8328 (0.9775)
Test:  [ 0/25]  eta: 0:02:47  loss: 0.6875 (0.6875)  acc1: 86.4000 (86.4000)  acc5: 97.6000 (97.6000)  time: 6.7043  data: 6.3675  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.8690 (0.8539)  acc1: 84.4000 (82.8727)  acc5: 96.0000 (96.5091)  time: 0.8810  data: 0.5791  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 1.0851 (1.0447)  acc1: 76.4000 (78.2667)  acc5: 94.0000 (94.0762)  time: 0.2986  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.1670 (1.0562)  acc1: 74.8000 (78.0640)  acc5: 92.8000 (94.1440)  time: 0.2985  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5590 s / it)
* Acc@1 77.960 Acc@5 94.498 loss 1.046
Accuracy of the model on the 50000 test images: 78.0%
Max accuracy: 77.96%
Epoch: [62]  [   0/1251]  eta: 1:19:32  lr: 0.003782  min_lr: 0.003782  loss: 3.8100 (3.8100)  weight_decay: 0.0500 (0.0500)  time: 3.8148  data: 3.1739  max mem: 54228
Epoch: [62]  [ 200/1251]  eta: 0:11:16  lr: 0.003780  min_lr: 0.003780  loss: 3.5417 (3.4563)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9676 (1.1360)  time: 0.6273  data: 0.0004  max mem: 54228
Epoch: [62]  [ 400/1251]  eta: 0:09:01  lr: 0.003779  min_lr: 0.003779  loss: 3.5539 (3.4495)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7331 (1.0525)  time: 0.6277  data: 0.0004  max mem: 54228
Epoch: [62]  [ 600/1251]  eta: 0:06:52  lr: 0.003777  min_lr: 0.003777  loss: 3.3863 (3.4140)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8783 (1.0238)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [62]  [ 800/1251]  eta: 0:04:45  lr: 0.003775  min_lr: 0.003775  loss: 3.4478 (3.4070)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [62]  [1000/1251]  eta: 0:02:38  lr: 0.003774  min_lr: 0.003774  loss: 3.4180 (3.3947)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0122 (nan)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [62]  [1200/1251]  eta: 0:00:32  lr: 0.003772  min_lr: 0.003772  loss: 3.4552 (3.3996)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7652 (nan)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [62]  [1250/1251]  eta: 0:00:00  lr: 0.003772  min_lr: 0.003772  loss: 3.6106 (3.3990)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7062 (nan)  time: 0.5333  data: 0.0006  max mem: 54228
Epoch: [62] Total time: 0:13:08 (0.6305 s / it)
Averaged stats: lr: 0.003772  min_lr: 0.003772  loss: 3.6106 (3.3971)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7062 (nan)
Test:  [ 0/25]  eta: 0:02:42  loss: 0.6941 (0.6941)  acc1: 87.6000 (87.6000)  acc5: 98.8000 (98.8000)  time: 6.4825  data: 6.1482  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8909 (0.9141)  acc1: 81.6000 (81.8909)  acc5: 96.8000 (96.6909)  time: 0.8609  data: 0.5592  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.1167 (1.1009)  acc1: 75.6000 (78.0762)  acc5: 94.8000 (94.4952)  time: 0.2986  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.2723 (1.1131)  acc1: 74.4000 (77.9200)  acc5: 92.8000 (94.4960)  time: 0.2985  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5498 s / it)
* Acc@1 77.986 Acc@5 94.676 loss 1.106
Accuracy of the model on the 50000 test images: 78.0%
Max accuracy: 77.99%
Epoch: [63]  [   0/1251]  eta: 1:13:46  lr: 0.003772  min_lr: 0.003772  loss: 3.4736 (3.4736)  weight_decay: 0.0500 (0.0500)  time: 3.5387  data: 2.9088  max mem: 54228
Epoch: [63]  [ 200/1251]  eta: 0:11:15  lr: 0.003770  min_lr: 0.003770  loss: 3.3361 (3.3675)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8700 (0.9692)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [63]  [ 400/1251]  eta: 0:09:01  lr: 0.003768  min_lr: 0.003768  loss: 3.3337 (3.3295)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7890 (0.9054)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [63]  [ 600/1251]  eta: 0:06:52  lr: 0.003767  min_lr: 0.003767  loss: 3.4982 (3.3286)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7984 (0.8897)  time: 0.6286  data: 0.0004  max mem: 54228
Epoch: [63]  [ 800/1251]  eta: 0:04:45  lr: 0.003765  min_lr: 0.003765  loss: 3.4441 (3.3354)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6240 (0.9450)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [63]  [1000/1251]  eta: 0:02:38  lr: 0.003763  min_lr: 0.003763  loss: 3.3255 (3.3536)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7070 (0.9566)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [63]  [1200/1251]  eta: 0:00:32  lr: 0.003762  min_lr: 0.003762  loss: 3.6290 (3.3619)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0337 (0.9666)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [63]  [1250/1251]  eta: 0:00:00  lr: 0.003761  min_lr: 0.003761  loss: 3.3738 (3.3639)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8161 (0.9627)  time: 0.5337  data: 0.0005  max mem: 54228
Epoch: [63] Total time: 0:13:09 (0.6308 s / it)
Averaged stats: lr: 0.003761  min_lr: 0.003761  loss: 3.3738 (3.3899)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8161 (0.9627)
Test:  [ 0/25]  eta: 0:02:44  loss: 0.6615 (0.6615)  acc1: 86.8000 (86.8000)  acc5: 99.2000 (99.2000)  time: 6.5829  data: 6.2407  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.8096 (0.8649)  acc1: 82.4000 (82.1818)  acc5: 96.8000 (96.4000)  time: 0.8710  data: 0.5677  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.0778 (1.0435)  acc1: 76.4000 (78.6095)  acc5: 94.4000 (94.2667)  time: 0.2997  data: 0.0003  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.1788 (1.0534)  acc1: 76.4000 (78.3200)  acc5: 92.4000 (94.1280)  time: 0.2995  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5548 s / it)
* Acc@1 78.172 Acc@5 94.578 loss 1.052
Accuracy of the model on the 50000 test images: 78.2%
Max accuracy: 78.17%
Epoch: [64]  [   0/1251]  eta: 1:14:34  lr: 0.003761  min_lr: 0.003761  loss: 3.3983 (3.3983)  weight_decay: 0.0500 (0.0500)  time: 3.5765  data: 2.9356  max mem: 54228
Epoch: [64]  [ 200/1251]  eta: 0:11:18  lr: 0.003760  min_lr: 0.003760  loss: 3.2102 (3.3439)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8830 (1.0586)  time: 0.6372  data: 0.0005  max mem: 54228
Epoch: [64]  [ 400/1251]  eta: 0:09:02  lr: 0.003758  min_lr: 0.003758  loss: 3.1348 (3.3841)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6727 (0.9836)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [64]  [ 600/1251]  eta: 0:06:52  lr: 0.003756  min_lr: 0.003756  loss: 3.3747 (3.3932)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0202 (1.0074)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [64]  [ 800/1251]  eta: 0:04:45  lr: 0.003754  min_lr: 0.003754  loss: 3.3671 (3.4048)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0173 (1.0326)  time: 0.6338  data: 0.0005  max mem: 54228
Epoch: [64]  [1000/1251]  eta: 0:02:38  lr: 0.003753  min_lr: 0.003753  loss: 3.5741 (3.4065)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8758 (1.0018)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [64]  [1200/1251]  eta: 0:00:32  lr: 0.003751  min_lr: 0.003751  loss: 3.0313 (3.3980)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0533 (1.0038)  time: 0.6278  data: 0.0005  max mem: 54228
Epoch: [64]  [1250/1251]  eta: 0:00:00  lr: 0.003751  min_lr: 0.003751  loss: 3.2742 (3.3983)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9613 (1.0049)  time: 0.5378  data: 0.0006  max mem: 54228
Epoch: [64] Total time: 0:13:08 (0.6305 s / it)
Averaged stats: lr: 0.003751  min_lr: 0.003751  loss: 3.2742 (3.3857)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9613 (1.0049)
Test:  [ 0/25]  eta: 0:02:41  loss: 0.6818 (0.6818)  acc1: 87.6000 (87.6000)  acc5: 98.4000 (98.4000)  time: 6.4769  data: 6.1527  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8424 (0.8541)  acc1: 81.6000 (81.8909)  acc5: 96.4000 (96.9091)  time: 0.8602  data: 0.5597  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.0633 (1.0341)  acc1: 75.2000 (77.7905)  acc5: 94.0000 (94.5905)  time: 0.2983  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.1222 (1.0364)  acc1: 75.2000 (77.8240)  acc5: 92.8000 (94.4960)  time: 0.2982  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5506 s / it)
* Acc@1 78.090 Acc@5 94.666 loss 1.026
Accuracy of the model on the 50000 test images: 78.1%
Max accuracy: 78.17%
Epoch: [65]  [   0/1251]  eta: 1:26:36  lr: 0.003751  min_lr: 0.003751  loss: 3.6685 (3.6685)  weight_decay: 0.0500 (0.0500)  time: 4.1537  data: 2.4894  max mem: 54228
Epoch: [65]  [ 200/1251]  eta: 0:11:20  lr: 0.003749  min_lr: 0.003749  loss: 3.5743 (3.3843)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8676 (0.8663)  time: 0.6276  data: 0.0005  max mem: 54228
Epoch: [65]  [ 400/1251]  eta: 0:09:02  lr: 0.003747  min_lr: 0.003747  loss: 3.3029 (3.3849)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8324 (0.9530)  time: 0.6278  data: 0.0005  max mem: 54228
Epoch: [65]  [ 600/1251]  eta: 0:06:53  lr: 0.003745  min_lr: 0.003745  loss: 3.6365 (3.3958)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1641 (0.9515)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [65]  [ 800/1251]  eta: 0:04:45  lr: 0.003744  min_lr: 0.003744  loss: 3.6108 (3.3954)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7355 (0.9353)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [65]  [1000/1251]  eta: 0:02:38  lr: 0.003742  min_lr: 0.003742  loss: 3.5617 (3.4021)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8436 (0.9441)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [65]  [1200/1251]  eta: 0:00:32  lr: 0.003740  min_lr: 0.003740  loss: 3.3242 (3.3965)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7209 (0.9501)  time: 0.6272  data: 0.0005  max mem: 54228
Epoch: [65]  [1250/1251]  eta: 0:00:00  lr: 0.003740  min_lr: 0.003740  loss: 3.6580 (3.3980)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7161 (0.9424)  time: 0.5326  data: 0.0007  max mem: 54228
Epoch: [65] Total time: 0:13:09 (0.6313 s / it)
Averaged stats: lr: 0.003740  min_lr: 0.003740  loss: 3.6580 (3.3791)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7161 (0.9424)
Test:  [ 0/25]  eta: 0:02:45  loss: 0.7086 (0.7086)  acc1: 86.0000 (86.0000)  acc5: 98.0000 (98.0000)  time: 6.6135  data: 6.2806  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.8385 (0.8631)  acc1: 82.0000 (81.4545)  acc5: 96.4000 (96.6182)  time: 0.8727  data: 0.5713  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.0863 (1.0271)  acc1: 74.8000 (77.9238)  acc5: 94.8000 (94.6476)  time: 0.2985  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.1133 (1.0377)  acc1: 76.0000 (77.7600)  acc5: 93.2000 (94.6080)  time: 0.2984  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5553 s / it)
* Acc@1 78.172 Acc@5 94.748 loss 1.025
Accuracy of the model on the 50000 test images: 78.2%
Max accuracy: 78.17%
Epoch: [66]  [   0/1251]  eta: 1:03:21  lr: 0.003740  min_lr: 0.003740  loss: 3.7353 (3.7353)  weight_decay: 0.0500 (0.0500)  time: 3.0390  data: 2.4075  max mem: 54228
Epoch: [66]  [ 200/1251]  eta: 0:11:12  lr: 0.003738  min_lr: 0.003738  loss: 3.4558 (3.3457)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8492 (1.0918)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [66]  [ 400/1251]  eta: 0:09:00  lr: 0.003736  min_lr: 0.003736  loss: 3.5883 (3.3722)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9349 (1.0286)  time: 0.6358  data: 0.0004  max mem: 54228
Epoch: [66]  [ 600/1251]  eta: 0:06:52  lr: 0.003734  min_lr: 0.003734  loss: 3.6218 (3.3917)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7784 (1.0020)  time: 0.6279  data: 0.0004  max mem: 54228
Epoch: [66]  [ 800/1251]  eta: 0:04:44  lr: 0.003732  min_lr: 0.003732  loss: 3.4063 (3.3979)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9092 (0.9681)  time: 0.6275  data: 0.0004  max mem: 54228
Epoch: [66]  [1000/1251]  eta: 0:02:38  lr: 0.003731  min_lr: 0.003731  loss: 3.5159 (3.3880)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8498 (0.9674)  time: 0.6274  data: 0.0004  max mem: 54228
Epoch: [66]  [1200/1251]  eta: 0:00:32  lr: 0.003729  min_lr: 0.003729  loss: 3.5654 (3.3874)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7072 (0.9772)  time: 0.6276  data: 0.0004  max mem: 54228
Epoch: [66]  [1250/1251]  eta: 0:00:00  lr: 0.003728  min_lr: 0.003728  loss: 3.3358 (3.3904)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7805 (0.9715)  time: 0.5329  data: 0.0006  max mem: 54228
Epoch: [66] Total time: 0:13:07 (0.6297 s / it)
Averaged stats: lr: 0.003728  min_lr: 0.003728  loss: 3.3358 (3.3685)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7805 (0.9715)
Test:  [ 0/25]  eta: 0:02:34  loss: 0.6886 (0.6886)  acc1: 88.4000 (88.4000)  acc5: 98.8000 (98.8000)  time: 6.1615  data: 5.8332  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.9155 (0.9113)  acc1: 82.4000 (81.3818)  acc5: 96.0000 (96.2909)  time: 0.8319  data: 0.5306  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.0481 (1.0635)  acc1: 76.0000 (77.9429)  acc5: 93.6000 (94.4000)  time: 0.2988  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.1435 (1.0723)  acc1: 76.0000 (77.5520)  acc5: 92.8000 (94.3200)  time: 0.2987  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5379 s / it)
* Acc@1 78.076 Acc@5 94.674 loss 1.053
Accuracy of the model on the 50000 test images: 78.1%
Max accuracy: 78.17%
Epoch: [67]  [   0/1251]  eta: 1:25:43  lr: 0.003728  min_lr: 0.003728  loss: 2.7149 (2.7149)  weight_decay: 0.0500 (0.0500)  time: 4.1119  data: 3.0204  max mem: 54228
Epoch: [67]  [ 200/1251]  eta: 0:11:19  lr: 0.003727  min_lr: 0.003727  loss: 3.5402 (3.3695)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9081 (0.9655)  time: 0.6361  data: 0.0005  max mem: 54228
Epoch: [67]  [ 400/1251]  eta: 0:09:03  lr: 0.003725  min_lr: 0.003725  loss: 3.7147 (3.3540)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8271 (1.0131)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [67]  [ 600/1251]  eta: 0:06:53  lr: 0.003723  min_lr: 0.003723  loss: 3.5980 (3.3576)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0175 (1.0020)  time: 0.6278  data: 0.0005  max mem: 54228
Epoch: [67]  [ 800/1251]  eta: 0:04:45  lr: 0.003721  min_lr: 0.003721  loss: 3.2352 (3.3591)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6978 (1.0070)  time: 0.6333  data: 0.0005  max mem: 54228
Epoch: [67]  [1000/1251]  eta: 0:02:38  lr: 0.003719  min_lr: 0.003719  loss: 3.5400 (3.3669)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8894 (1.0145)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [67]  [1200/1251]  eta: 0:00:32  lr: 0.003717  min_lr: 0.003717  loss: 3.7129 (3.3743)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8467 (1.0094)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [67]  [1250/1251]  eta: 0:00:00  lr: 0.003717  min_lr: 0.003717  loss: 3.4745 (3.3740)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9757 (1.0023)  time: 0.5332  data: 0.0006  max mem: 54228
Epoch: [67] Total time: 0:13:09 (0.6308 s / it)
Averaged stats: lr: 0.003717  min_lr: 0.003717  loss: 3.4745 (3.3669)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9757 (1.0023)
Test:  [ 0/25]  eta: 0:02:40  loss: 0.6497 (0.6497)  acc1: 88.8000 (88.8000)  acc5: 98.0000 (98.0000)  time: 6.4174  data: 6.0987  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8468 (0.8621)  acc1: 80.4000 (81.4182)  acc5: 96.4000 (96.3636)  time: 0.8562  data: 0.5547  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.0753 (1.0331)  acc1: 75.6000 (77.6571)  acc5: 94.0000 (94.5905)  time: 0.3002  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.1666 (1.0389)  acc1: 75.6000 (77.5040)  acc5: 93.6000 (94.5920)  time: 0.3004  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5496 s / it)
* Acc@1 78.254 Acc@5 94.730 loss 1.025
Accuracy of the model on the 50000 test images: 78.3%
Max accuracy: 78.25%
Epoch: [68]  [   0/1251]  eta: 1:12:29  lr: 0.003717  min_lr: 0.003717  loss: 3.5947 (3.5947)  weight_decay: 0.0500 (0.0500)  time: 3.4766  data: 2.8321  max mem: 54228
Epoch: [68]  [ 200/1251]  eta: 0:11:17  lr: 0.003715  min_lr: 0.003715  loss: 3.5309 (3.3401)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7689 (0.8768)  time: 0.6291  data: 0.0005  max mem: 54228
Epoch: [68]  [ 400/1251]  eta: 0:09:02  lr: 0.003713  min_lr: 0.003713  loss: 3.4324 (3.3372)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0248 (0.9664)  time: 0.6292  data: 0.0005  max mem: 54228
Epoch: [68]  [ 600/1251]  eta: 0:06:53  lr: 0.003711  min_lr: 0.003711  loss: 3.4508 (3.3459)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7182 (0.9804)  time: 0.6342  data: 0.0005  max mem: 54228
Epoch: [68]  [ 800/1251]  eta: 0:04:45  lr: 0.003710  min_lr: 0.003710  loss: 3.5555 (3.3485)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7561 (0.9429)  time: 0.6292  data: 0.0005  max mem: 54228
Epoch: [68]  [1000/1251]  eta: 0:02:38  lr: 0.003708  min_lr: 0.003708  loss: 3.5043 (3.3525)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0619 (0.9944)  time: 0.6291  data: 0.0005  max mem: 54228
Epoch: [68]  [1200/1251]  eta: 0:00:32  lr: 0.003706  min_lr: 0.003706  loss: 3.4517 (3.3620)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7937 (0.9610)  time: 0.6293  data: 0.0005  max mem: 54228
Epoch: [68]  [1250/1251]  eta: 0:00:00  lr: 0.003705  min_lr: 0.003705  loss: 3.4459 (3.3588)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8846 (0.9629)  time: 0.5335  data: 0.0006  max mem: 54228
Epoch: [68] Total time: 0:13:10 (0.6317 s / it)
Averaged stats: lr: 0.003705  min_lr: 0.003705  loss: 3.4459 (3.3662)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8846 (0.9629)
Test:  [ 0/25]  eta: 0:02:07  loss: 0.6932 (0.6932)  acc1: 87.2000 (87.2000)  acc5: 98.8000 (98.8000)  time: 5.0907  data: 4.7481  max mem: 54228
Test:  [10/25]  eta: 0:00:11  loss: 0.8593 (0.8576)  acc1: 82.4000 (81.7091)  acc5: 96.4000 (96.7636)  time: 0.7886  data: 0.4856  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.0718 (1.0318)  acc1: 76.0000 (78.2476)  acc5: 95.2000 (94.8000)  time: 0.3283  data: 0.0297  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.1735 (1.0486)  acc1: 76.0000 (77.9360)  acc5: 92.8000 (94.6240)  time: 0.2982  data: 0.0001  max mem: 54228
Test: Total time: 0:00:12 (0.5187 s / it)
* Acc@1 78.190 Acc@5 94.754 loss 1.046
Accuracy of the model on the 50000 test images: 78.2%
Max accuracy: 78.25%
Epoch: [69]  [   0/1251]  eta: 1:24:34  lr: 0.003705  min_lr: 0.003705  loss: 3.7313 (3.7313)  weight_decay: 0.0500 (0.0500)  time: 4.0566  data: 2.9422  max mem: 54228
Epoch: [69]  [ 200/1251]  eta: 0:11:18  lr: 0.003703  min_lr: 0.003703  loss: 3.3393 (3.3149)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6845 (0.9078)  time: 0.6275  data: 0.0004  max mem: 54228
Epoch: [69]  [ 400/1251]  eta: 0:09:01  lr: 0.003702  min_lr: 0.003702  loss: 3.5098 (3.3297)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0012 (1.0383)  time: 0.6273  data: 0.0005  max mem: 54228
Epoch: [69]  [ 600/1251]  eta: 0:06:52  lr: 0.003700  min_lr: 0.003700  loss: 3.6819 (3.3560)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.6274  data: 0.0005  max mem: 54228
Epoch: [69]  [ 800/1251]  eta: 0:04:45  lr: 0.003698  min_lr: 0.003698  loss: 3.6483 (3.3656)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8378 (nan)  time: 0.6275  data: 0.0005  max mem: 54228
Epoch: [69]  [1000/1251]  eta: 0:02:38  lr: 0.003696  min_lr: 0.003696  loss: 3.5266 (3.3673)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7761 (nan)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [69]  [1200/1251]  eta: 0:00:32  lr: 0.003694  min_lr: 0.003694  loss: 3.0815 (3.3650)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7813 (nan)  time: 0.6277  data: 0.0005  max mem: 54228
Epoch: [69]  [1250/1251]  eta: 0:00:00  lr: 0.003694  min_lr: 0.003694  loss: 3.5819 (3.3672)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7754 (nan)  time: 0.5328  data: 0.0005  max mem: 54228
Epoch: [69] Total time: 0:13:08 (0.6302 s / it)
Averaged stats: lr: 0.003694  min_lr: 0.003694  loss: 3.5819 (3.3615)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7754 (nan)
Test:  [ 0/25]  eta: 0:02:36  loss: 0.6997 (0.6997)  acc1: 88.0000 (88.0000)  acc5: 98.4000 (98.4000)  time: 6.2687  data: 5.9430  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8275 (0.8825)  acc1: 82.0000 (82.5091)  acc5: 96.8000 (96.7273)  time: 0.8422  data: 0.5405  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.0674 (1.0714)  acc1: 76.4000 (78.6857)  acc5: 94.8000 (94.6667)  time: 0.2994  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.2171 (1.0788)  acc1: 76.4000 (78.4000)  acc5: 93.2000 (94.6080)  time: 0.2993  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5434 s / it)
* Acc@1 78.456 Acc@5 94.812 loss 1.069
Accuracy of the model on the 50000 test images: 78.5%
Max accuracy: 78.46%
Epoch: [70]  [   0/1251]  eta: 1:11:29  lr: 0.003694  min_lr: 0.003694  loss: 3.5695 (3.5695)  weight_decay: 0.0500 (0.0500)  time: 3.4286  data: 2.7796  max mem: 54228
Epoch: [70]  [ 200/1251]  eta: 0:11:15  lr: 0.003692  min_lr: 0.003692  loss: 3.4198 (3.3289)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8740 (1.1262)  time: 0.6288  data: 0.0005  max mem: 54228
Epoch: [70]  [ 400/1251]  eta: 0:09:02  lr: 0.003690  min_lr: 0.003690  loss: 3.3362 (3.3206)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0768 (1.0812)  time: 0.6294  data: 0.0005  max mem: 54228
Epoch: [70]  [ 600/1251]  eta: 0:06:53  lr: 0.003688  min_lr: 0.003688  loss: 3.4485 (3.3237)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9110 (1.0125)  time: 0.6289  data: 0.0005  max mem: 54228
Epoch: [70]  [ 800/1251]  eta: 0:04:45  lr: 0.003686  min_lr: 0.003686  loss: 3.1611 (3.3249)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8794 (1.0010)  time: 0.6415  data: 0.0005  max mem: 54228
Epoch: [70]  [1000/1251]  eta: 0:02:38  lr: 0.003684  min_lr: 0.003684  loss: 3.2695 (3.3328)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8276 (0.9786)  time: 0.6279  data: 0.0005  max mem: 54228
Epoch: [70]  [1200/1251]  eta: 0:00:32  lr: 0.003682  min_lr: 0.003682  loss: 3.5849 (3.3423)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0565 (0.9930)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [70]  [1250/1251]  eta: 0:00:00  lr: 0.003682  min_lr: 0.003682  loss: 3.5647 (3.3444)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7867 (0.9937)  time: 0.5334  data: 0.0006  max mem: 54228
Epoch: [70] Total time: 0:13:09 (0.6309 s / it)
Averaged stats: lr: 0.003682  min_lr: 0.003682  loss: 3.5647 (3.3587)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7867 (0.9937)
Test:  [ 0/25]  eta: 0:02:34  loss: 0.7281 (0.7281)  acc1: 88.4000 (88.4000)  acc5: 98.8000 (98.8000)  time: 6.1740  data: 5.8472  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 1.0006 (0.9474)  acc1: 82.8000 (82.9455)  acc5: 96.8000 (97.0545)  time: 0.8337  data: 0.5319  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.1465 (1.1260)  acc1: 77.2000 (78.7619)  acc5: 94.4000 (94.7810)  time: 0.2996  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.2385 (1.1374)  acc1: 76.0000 (78.5440)  acc5: 93.6000 (94.7040)  time: 0.2996  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5386 s / it)
* Acc@1 78.414 Acc@5 94.780 loss 1.137
Accuracy of the model on the 50000 test images: 78.4%
Max accuracy: 78.46%
Epoch: [71]  [   0/1251]  eta: 1:30:33  lr: 0.003681  min_lr: 0.003681  loss: 3.5309 (3.5309)  weight_decay: 0.0500 (0.0500)  time: 4.3432  data: 2.6839  max mem: 54228
Epoch: [71]  [ 200/1251]  eta: 0:11:22  lr: 0.003680  min_lr: 0.003680  loss: 3.5742 (3.3110)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7971 (0.9228)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [71]  [ 400/1251]  eta: 0:09:03  lr: 0.003678  min_lr: 0.003678  loss: 3.6047 (3.3433)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8875 (0.9575)  time: 0.6274  data: 0.0005  max mem: 54228
Epoch: [71]  [ 600/1251]  eta: 0:06:53  lr: 0.003676  min_lr: 0.003676  loss: 3.2664 (3.3255)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7925 (0.9502)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [71]  [ 800/1251]  eta: 0:04:45  lr: 0.003674  min_lr: 0.003674  loss: 3.2367 (3.3256)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8749 (0.9354)  time: 0.6279  data: 0.0005  max mem: 54228
Epoch: [71]  [1000/1251]  eta: 0:02:38  lr: 0.003672  min_lr: 0.003672  loss: 3.4317 (3.3352)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9533 (nan)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [71]  [1200/1251]  eta: 0:00:32  lr: 0.003670  min_lr: 0.003670  loss: 3.5424 (3.3425)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6198 (nan)  time: 0.6353  data: 0.0005  max mem: 54228
Epoch: [71]  [1250/1251]  eta: 0:00:00  lr: 0.003669  min_lr: 0.003669  loss: 3.2449 (3.3402)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0192 (nan)  time: 0.5336  data: 0.0007  max mem: 54228
Epoch: [71] Total time: 0:13:09 (0.6313 s / it)
Averaged stats: lr: 0.003669  min_lr: 0.003669  loss: 3.2449 (3.3490)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0192 (nan)
Test:  [ 0/25]  eta: 0:02:52  loss: 0.6159 (0.6159)  acc1: 88.0000 (88.0000)  acc5: 99.2000 (99.2000)  time: 6.8893  data: 6.5611  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.8088 (0.8147)  acc1: 83.2000 (82.4000)  acc5: 97.2000 (96.9091)  time: 0.8987  data: 0.5968  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 1.0301 (0.9914)  acc1: 75.6000 (78.7238)  acc5: 94.8000 (94.8571)  time: 0.2996  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.1267 (1.0040)  acc1: 75.2000 (78.2720)  acc5: 93.6000 (94.9120)  time: 0.2996  data: 0.0001  max mem: 54228
Test: Total time: 0:00:14 (0.5696 s / it)
* Acc@1 78.422 Acc@5 94.990 loss 1.001
Accuracy of the model on the 50000 test images: 78.4%
Max accuracy: 78.46%
Epoch: [72]  [   0/1251]  eta: 1:26:46  lr: 0.003669  min_lr: 0.003669  loss: 3.6319 (3.6319)  weight_decay: 0.0500 (0.0500)  time: 4.1619  data: 3.4349  max mem: 54228
Epoch: [72]  [ 200/1251]  eta: 0:11:18  lr: 0.003667  min_lr: 0.003667  loss: 3.6181 (3.3790)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7959 (0.8094)  time: 0.6286  data: 0.0004  max mem: 54228
Epoch: [72]  [ 400/1251]  eta: 0:09:02  lr: 0.003665  min_lr: 0.003665  loss: 3.3586 (3.3205)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7306 (0.9677)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [72]  [ 600/1251]  eta: 0:06:53  lr: 0.003663  min_lr: 0.003663  loss: 3.0909 (3.3277)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9838 (0.9701)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [72]  [ 800/1251]  eta: 0:04:45  lr: 0.003661  min_lr: 0.003661  loss: 3.4295 (3.3355)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7142 (0.9425)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [72]  [1000/1251]  eta: 0:02:38  lr: 0.003659  min_lr: 0.003659  loss: 3.3825 (3.3220)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9028 (0.9894)  time: 0.6344  data: 0.0005  max mem: 54228
Epoch: [72]  [1200/1251]  eta: 0:00:32  lr: 0.003657  min_lr: 0.003657  loss: 3.2152 (3.3174)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1396 (0.9812)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [72]  [1250/1251]  eta: 0:00:00  lr: 0.003657  min_lr: 0.003657  loss: 3.5719 (3.3212)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9113 (0.9740)  time: 0.5334  data: 0.0005  max mem: 54228
Epoch: [72] Total time: 0:13:09 (0.6310 s / it)
Averaged stats: lr: 0.003657  min_lr: 0.003657  loss: 3.5719 (3.3468)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9113 (0.9740)
Test:  [ 0/25]  eta: 0:02:41  loss: 0.6529 (0.6529)  acc1: 86.8000 (86.8000)  acc5: 98.0000 (98.0000)  time: 6.4408  data: 6.1255  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8052 (0.8593)  acc1: 81.2000 (82.3273)  acc5: 97.2000 (96.6182)  time: 0.8585  data: 0.5572  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.0984 (1.0198)  acc1: 76.4000 (78.7619)  acc5: 94.0000 (94.7048)  time: 0.2998  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.1193 (1.0338)  acc1: 75.6000 (78.2880)  acc5: 94.0000 (94.5920)  time: 0.2994  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5499 s / it)
* Acc@1 78.476 Acc@5 94.830 loss 1.029
Accuracy of the model on the 50000 test images: 78.5%
Max accuracy: 78.48%
Epoch: [73]  [   0/1251]  eta: 1:17:36  lr: 0.003657  min_lr: 0.003657  loss: 2.5361 (2.5361)  weight_decay: 0.0500 (0.0500)  time: 3.7225  data: 3.0853  max mem: 54228
Epoch: [73]  [ 200/1251]  eta: 0:11:17  lr: 0.003655  min_lr: 0.003655  loss: 3.4394 (3.3021)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0756 (0.9934)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [73]  [ 400/1251]  eta: 0:09:02  lr: 0.003653  min_lr: 0.003653  loss: 3.5283 (3.3313)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7799 (0.9330)  time: 0.6286  data: 0.0004  max mem: 54228
Epoch: [73]  [ 600/1251]  eta: 0:06:53  lr: 0.003651  min_lr: 0.003651  loss: 3.4239 (3.3403)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8015 (0.9300)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [73]  [ 800/1251]  eta: 0:04:45  lr: 0.003649  min_lr: 0.003649  loss: 3.5004 (3.3387)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6671 (0.9361)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [73]  [1000/1251]  eta: 0:02:38  lr: 0.003647  min_lr: 0.003647  loss: 3.5251 (3.3399)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1333 (0.9542)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [73]  [1200/1251]  eta: 0:00:32  lr: 0.003645  min_lr: 0.003645  loss: 3.2278 (3.3338)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7234 (0.9540)  time: 0.6285  data: 0.0004  max mem: 54228
Epoch: [73]  [1250/1251]  eta: 0:00:00  lr: 0.003644  min_lr: 0.003644  loss: 3.4778 (3.3331)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7661 (0.9552)  time: 0.5336  data: 0.0005  max mem: 54228
Epoch: [73] Total time: 0:13:09 (0.6310 s / it)
Averaged stats: lr: 0.003644  min_lr: 0.003644  loss: 3.4778 (3.3365)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7661 (0.9552)
Test:  [ 0/25]  eta: 0:02:39  loss: 0.7020 (0.7020)  acc1: 87.6000 (87.6000)  acc5: 98.4000 (98.4000)  time: 6.3782  data: 6.0484  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8238 (0.8640)  acc1: 82.4000 (82.6909)  acc5: 97.6000 (96.8727)  time: 0.8523  data: 0.5501  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.0702 (1.0246)  acc1: 77.6000 (78.8191)  acc5: 94.4000 (95.0286)  time: 0.2995  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.1441 (1.0377)  acc1: 77.2000 (78.3680)  acc5: 94.4000 (94.8960)  time: 0.2994  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5484 s / it)
* Acc@1 78.726 Acc@5 94.828 loss 1.030
Accuracy of the model on the 50000 test images: 78.7%
Max accuracy: 78.73%
Epoch: [74]  [   0/1251]  eta: 1:16:11  lr: 0.003644  min_lr: 0.003644  loss: 4.0088 (4.0088)  weight_decay: 0.0500 (0.0500)  time: 3.6539  data: 3.0269  max mem: 54228
Epoch: [74]  [ 200/1251]  eta: 0:11:18  lr: 0.003642  min_lr: 0.003642  loss: 3.5976 (3.3305)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9679 (1.0110)  time: 0.6276  data: 0.0004  max mem: 54228
Epoch: [74]  [ 400/1251]  eta: 0:09:02  lr: 0.003640  min_lr: 0.003640  loss: 3.4041 (3.3284)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0874 (1.0514)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [74]  [ 600/1251]  eta: 0:06:52  lr: 0.003638  min_lr: 0.003638  loss: 3.6130 (3.3357)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1387 (1.0639)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [74]  [ 800/1251]  eta: 0:04:45  lr: 0.003636  min_lr: 0.003636  loss: 3.4819 (3.3426)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9249 (1.0218)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [74]  [1000/1251]  eta: 0:02:38  lr: 0.003634  min_lr: 0.003634  loss: 3.4398 (3.3433)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9712 (1.0108)  time: 0.6286  data: 0.0004  max mem: 54228
Epoch: [74]  [1200/1251]  eta: 0:00:32  lr: 0.003632  min_lr: 0.003632  loss: 3.5195 (3.3294)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9896 (0.9994)  time: 0.6368  data: 0.0005  max mem: 54228
Epoch: [74]  [1250/1251]  eta: 0:00:00  lr: 0.003631  min_lr: 0.003631  loss: 3.4553 (3.3309)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9676 (1.0001)  time: 0.5412  data: 0.0005  max mem: 54228
Epoch: [74] Total time: 0:13:09 (0.6311 s / it)
Averaged stats: lr: 0.003631  min_lr: 0.003631  loss: 3.4553 (3.3315)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9676 (1.0001)
Test:  [ 0/25]  eta: 0:02:41  loss: 0.7507 (0.7507)  acc1: 86.0000 (86.0000)  acc5: 98.4000 (98.4000)  time: 6.4592  data: 6.1318  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8862 (0.8833)  acc1: 83.6000 (82.1818)  acc5: 96.8000 (96.6545)  time: 0.8597  data: 0.5578  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.1043 (1.0485)  acc1: 77.2000 (78.8952)  acc5: 94.0000 (94.6476)  time: 0.2998  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.1689 (1.0560)  acc1: 77.2000 (78.7040)  acc5: 93.2000 (94.5600)  time: 0.2997  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5519 s / it)
* Acc@1 78.360 Acc@5 94.890 loss 1.057
Accuracy of the model on the 50000 test images: 78.4%
Max accuracy: 78.73%
Epoch: [75]  [   0/1251]  eta: 1:24:25  lr: 0.003631  min_lr: 0.003631  loss: 3.1337 (3.1337)  weight_decay: 0.0500 (0.0500)  time: 4.0494  data: 3.3137  max mem: 54228
Epoch: [75]  [ 200/1251]  eta: 0:11:19  lr: 0.003629  min_lr: 0.003629  loss: 3.5367 (3.2567)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9390 (1.0061)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [75]  [ 400/1251]  eta: 0:09:02  lr: 0.003627  min_lr: 0.003627  loss: 3.3894 (3.2889)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6675 (1.0394)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [75]  [ 600/1251]  eta: 0:06:53  lr: 0.003625  min_lr: 0.003625  loss: 2.9741 (3.2843)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6282 (0.9923)  time: 0.6342  data: 0.0005  max mem: 54228
Epoch: [75]  [ 800/1251]  eta: 0:04:45  lr: 0.003623  min_lr: 0.003623  loss: 3.5597 (3.2831)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1547 (1.0112)  time: 0.6274  data: 0.0005  max mem: 54228
Epoch: [75]  [1000/1251]  eta: 0:02:38  lr: 0.003621  min_lr: 0.003621  loss: 3.3205 (3.2814)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8932 (0.9927)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [75]  [1200/1251]  eta: 0:00:32  lr: 0.003619  min_lr: 0.003619  loss: 3.1945 (3.2913)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7090 (0.9896)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [75]  [1250/1251]  eta: 0:00:00  lr: 0.003618  min_lr: 0.003618  loss: 3.5389 (3.2936)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0529 (0.9969)  time: 0.5331  data: 0.0007  max mem: 54228
Epoch: [75] Total time: 0:13:09 (0.6309 s / it)
Averaged stats: lr: 0.003618  min_lr: 0.003618  loss: 3.5389 (3.3309)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0529 (0.9969)
Test:  [ 0/25]  eta: 0:02:36  loss: 0.7084 (0.7084)  acc1: 85.6000 (85.6000)  acc5: 98.4000 (98.4000)  time: 6.2624  data: 5.9420  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8039 (0.8363)  acc1: 82.8000 (82.8364)  acc5: 97.2000 (97.1636)  time: 0.8417  data: 0.5405  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.0136 (0.9947)  acc1: 76.4000 (78.8571)  acc5: 94.4000 (94.9333)  time: 0.2995  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0994 (1.0102)  acc1: 76.4000 (78.4800)  acc5: 94.0000 (94.9280)  time: 0.2994  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5425 s / it)
* Acc@1 78.644 Acc@5 95.028 loss 1.008
Accuracy of the model on the 50000 test images: 78.6%
Max accuracy: 78.73%
Epoch: [76]  [   0/1251]  eta: 1:26:32  lr: 0.003618  min_lr: 0.003618  loss: 4.0200 (4.0200)  weight_decay: 0.0500 (0.0500)  time: 4.1505  data: 2.5970  max mem: 54228
Epoch: [76]  [ 200/1251]  eta: 0:11:18  lr: 0.003616  min_lr: 0.003616  loss: 3.1992 (3.3663)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9335 (0.8742)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [76]  [ 400/1251]  eta: 0:09:03  lr: 0.003614  min_lr: 0.003614  loss: 3.5619 (3.3396)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8300 (0.8808)  time: 0.6279  data: 0.0004  max mem: 54228
Epoch: [76]  [ 600/1251]  eta: 0:06:53  lr: 0.003612  min_lr: 0.003612  loss: 3.1476 (3.3435)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9436 (0.9145)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [76]  [ 800/1251]  eta: 0:04:45  lr: 0.003610  min_lr: 0.003610  loss: 3.6497 (3.3520)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7760 (0.9459)  time: 0.6278  data: 0.0004  max mem: 54228
Epoch: [76]  [1000/1251]  eta: 0:02:38  lr: 0.003607  min_lr: 0.003607  loss: 3.4886 (3.3533)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8754 (0.9532)  time: 0.6273  data: 0.0004  max mem: 54228
Epoch: [76]  [1200/1251]  eta: 0:00:32  lr: 0.003605  min_lr: 0.003605  loss: 3.5152 (3.3617)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9468 (0.9404)  time: 0.6268  data: 0.0005  max mem: 54228
Epoch: [76]  [1250/1251]  eta: 0:00:00  lr: 0.003605  min_lr: 0.003605  loss: 3.4342 (3.3622)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8755 (0.9369)  time: 0.5321  data: 0.0007  max mem: 54228
Epoch: [76] Total time: 0:13:08 (0.6306 s / it)
Averaged stats: lr: 0.003605  min_lr: 0.003605  loss: 3.4342 (3.3266)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8755 (0.9369)
Test:  [ 0/25]  eta: 0:02:46  loss: 0.7412 (0.7412)  acc1: 84.8000 (84.8000)  acc5: 97.6000 (97.6000)  time: 6.6463  data: 6.3275  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.8075 (0.8656)  acc1: 82.4000 (82.8727)  acc5: 96.4000 (96.9818)  time: 0.8760  data: 0.5755  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 1.0613 (1.0278)  acc1: 77.6000 (79.0286)  acc5: 94.8000 (94.8381)  time: 0.2997  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.1322 (1.0452)  acc1: 76.8000 (78.5760)  acc5: 93.6000 (94.6080)  time: 0.3002  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5579 s / it)
* Acc@1 78.620 Acc@5 94.892 loss 1.044
Accuracy of the model on the 50000 test images: 78.6%
Max accuracy: 78.73%
Epoch: [77]  [   0/1251]  eta: 1:16:57  lr: 0.003605  min_lr: 0.003605  loss: 3.1854 (3.1854)  weight_decay: 0.0500 (0.0500)  time: 3.6913  data: 2.7882  max mem: 54228
Epoch: [77]  [ 200/1251]  eta: 0:11:18  lr: 0.003603  min_lr: 0.003603  loss: 3.2671 (3.2833)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9090 (0.9803)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [77]  [ 400/1251]  eta: 0:09:02  lr: 0.003601  min_lr: 0.003601  loss: 3.4185 (3.2939)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7643 (0.9949)  time: 0.6285  data: 0.0004  max mem: 54228
Epoch: [77]  [ 600/1251]  eta: 0:06:53  lr: 0.003598  min_lr: 0.003598  loss: 3.4077 (3.3021)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6698 (1.0143)  time: 0.6286  data: 0.0004  max mem: 54228
Epoch: [77]  [ 800/1251]  eta: 0:04:45  lr: 0.003596  min_lr: 0.003596  loss: 3.3314 (3.3012)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9011 (0.9829)  time: 0.6289  data: 0.0005  max mem: 54228
Epoch: [77]  [1000/1251]  eta: 0:02:38  lr: 0.003594  min_lr: 0.003594  loss: 3.2626 (3.3093)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7223 (0.9673)  time: 0.6288  data: 0.0005  max mem: 54228
Epoch: [77]  [1200/1251]  eta: 0:00:32  lr: 0.003592  min_lr: 0.003592  loss: 3.1920 (3.3068)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8154 (0.9622)  time: 0.6290  data: 0.0005  max mem: 54228
Epoch: [77]  [1250/1251]  eta: 0:00:00  lr: 0.003591  min_lr: 0.003591  loss: 3.4280 (3.3113)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7869 (0.9617)  time: 0.5335  data: 0.0005  max mem: 54228
Epoch: [77] Total time: 0:13:09 (0.6311 s / it)
Averaged stats: lr: 0.003591  min_lr: 0.003591  loss: 3.4280 (3.3194)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7869 (0.9617)
Test:  [ 0/25]  eta: 0:02:31  loss: 0.7471 (0.7471)  acc1: 86.4000 (86.4000)  acc5: 97.2000 (97.2000)  time: 6.0478  data: 5.7153  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8750 (0.8679)  acc1: 82.4000 (82.4727)  acc5: 96.8000 (96.5818)  time: 0.8223  data: 0.5199  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9921 (1.0292)  acc1: 78.4000 (79.0286)  acc5: 94.0000 (94.7619)  time: 0.2997  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.1518 (1.0432)  acc1: 77.6000 (78.7040)  acc5: 93.6000 (94.6400)  time: 0.2996  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5352 s / it)
* Acc@1 78.798 Acc@5 94.910 loss 1.038
Accuracy of the model on the 50000 test images: 78.8%
Max accuracy: 78.80%
Epoch: [78]  [   0/1251]  eta: 1:11:15  lr: 0.003591  min_lr: 0.003591  loss: 3.4966 (3.4966)  weight_decay: 0.0500 (0.0500)  time: 3.4174  data: 2.7848  max mem: 54228
Epoch: [78]  [ 200/1251]  eta: 0:11:17  lr: 0.003589  min_lr: 0.003589  loss: 3.5088 (3.3040)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7456 (0.8446)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [78]  [ 400/1251]  eta: 0:09:01  lr: 0.003587  min_lr: 0.003587  loss: 3.3958 (3.3255)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7447 (0.8810)  time: 0.6348  data: 0.0005  max mem: 54228
Epoch: [78]  [ 600/1251]  eta: 0:06:53  lr: 0.003585  min_lr: 0.003585  loss: 3.4819 (3.3466)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0046 (0.8923)  time: 0.6354  data: 0.0005  max mem: 54228
Epoch: [78]  [ 800/1251]  eta: 0:04:45  lr: 0.003583  min_lr: 0.003583  loss: 3.2118 (3.3364)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0213 (0.9114)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [78]  [1000/1251]  eta: 0:02:38  lr: 0.003580  min_lr: 0.003580  loss: 3.1738 (3.3354)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9486 (0.9242)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [78]  [1200/1251]  eta: 0:00:32  lr: 0.003578  min_lr: 0.003578  loss: 3.3639 (3.3331)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8647 (0.9455)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [78]  [1250/1251]  eta: 0:00:00  lr: 0.003578  min_lr: 0.003578  loss: 3.6558 (3.3356)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7078 (0.9427)  time: 0.5336  data: 0.0006  max mem: 54228
Epoch: [78] Total time: 0:13:09 (0.6308 s / it)
Averaged stats: lr: 0.003578  min_lr: 0.003578  loss: 3.6558 (3.3168)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7078 (0.9427)
Test:  [ 0/25]  eta: 0:02:42  loss: 0.7772 (0.7772)  acc1: 85.6000 (85.6000)  acc5: 98.0000 (98.0000)  time: 6.4881  data: 6.1550  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.9473 (0.9558)  acc1: 82.0000 (81.8545)  acc5: 97.2000 (96.9818)  time: 0.8622  data: 0.5599  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.1291 (1.1080)  acc1: 76.8000 (78.4762)  acc5: 95.2000 (95.0095)  time: 0.2996  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.2374 (1.1174)  acc1: 77.2000 (78.4480)  acc5: 94.4000 (94.9440)  time: 0.2996  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5537 s / it)
* Acc@1 78.528 Acc@5 94.958 loss 1.110
Accuracy of the model on the 50000 test images: 78.5%
Max accuracy: 78.80%
Epoch: [79]  [   0/1251]  eta: 1:26:50  lr: 0.003578  min_lr: 0.003578  loss: 3.7595 (3.7595)  weight_decay: 0.0500 (0.0500)  time: 4.1653  data: 2.5305  max mem: 54228
Epoch: [79]  [ 200/1251]  eta: 0:11:19  lr: 0.003575  min_lr: 0.003575  loss: 3.5407 (3.3113)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3345 (0.9997)  time: 0.6286  data: 0.0004  max mem: 54228
Epoch: [79]  [ 400/1251]  eta: 0:09:03  lr: 0.003573  min_lr: 0.003573  loss: 3.4880 (3.3266)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7756 (0.9482)  time: 0.6364  data: 0.0004  max mem: 54228
Epoch: [79]  [ 600/1251]  eta: 0:06:53  lr: 0.003571  min_lr: 0.003571  loss: 3.4327 (3.3011)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8702 (0.9415)  time: 0.6286  data: 0.0004  max mem: 54228
Epoch: [79]  [ 800/1251]  eta: 0:04:45  lr: 0.003569  min_lr: 0.003569  loss: 3.4290 (3.2990)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7937 (0.9057)  time: 0.6288  data: 0.0005  max mem: 54228
Epoch: [79]  [1000/1251]  eta: 0:02:38  lr: 0.003567  min_lr: 0.003567  loss: 3.3543 (3.3131)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8340 (0.9010)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [79]  [1200/1251]  eta: 0:00:32  lr: 0.003564  min_lr: 0.003564  loss: 3.4446 (3.3107)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7706 (0.9060)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [79]  [1250/1251]  eta: 0:00:00  lr: 0.003564  min_lr: 0.003564  loss: 3.3598 (3.3121)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7943 (0.9036)  time: 0.5374  data: 0.0006  max mem: 54228
Epoch: [79] Total time: 0:13:09 (0.6312 s / it)
Averaged stats: lr: 0.003564  min_lr: 0.003564  loss: 3.3598 (3.3074)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7943 (0.9036)
Test:  [ 0/25]  eta: 0:02:36  loss: 0.6283 (0.6283)  acc1: 87.2000 (87.2000)  acc5: 98.8000 (98.8000)  time: 6.2596  data: 5.9216  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8395 (0.8320)  acc1: 82.4000 (82.9455)  acc5: 96.8000 (97.0182)  time: 0.8418  data: 0.5387  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.0309 (0.9919)  acc1: 77.2000 (79.5048)  acc5: 94.4000 (94.7429)  time: 0.3002  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0736 (1.0044)  acc1: 77.2000 (79.1360)  acc5: 93.6000 (94.6880)  time: 0.3003  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5420 s / it)
* Acc@1 78.958 Acc@5 94.908 loss 1.004
Accuracy of the model on the 50000 test images: 79.0%
Max accuracy: 78.96%
Epoch: [80]  [   0/1251]  eta: 1:12:40  lr: 0.003564  min_lr: 0.003564  loss: 2.7430 (2.7430)  weight_decay: 0.0500 (0.0500)  time: 3.4852  data: 2.8509  max mem: 54228
Epoch: [80]  [ 200/1251]  eta: 0:11:15  lr: 0.003562  min_lr: 0.003562  loss: 3.4046 (3.2749)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8081 (0.9572)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [80]  [ 400/1251]  eta: 0:09:02  lr: 0.003559  min_lr: 0.003559  loss: 3.4193 (3.2890)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8211 (0.9586)  time: 0.6360  data: 0.0005  max mem: 54228
Epoch: [80]  [ 600/1251]  eta: 0:06:53  lr: 0.003557  min_lr: 0.003557  loss: 3.5089 (3.2954)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7954 (0.9599)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [80]  [ 800/1251]  eta: 0:04:45  lr: 0.003555  min_lr: 0.003555  loss: 3.4408 (3.2995)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8655 (0.9492)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [80]  [1000/1251]  eta: 0:02:38  lr: 0.003553  min_lr: 0.003553  loss: 3.6312 (3.3019)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6853 (0.9429)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [80]  [1200/1251]  eta: 0:00:32  lr: 0.003550  min_lr: 0.003550  loss: 3.1465 (3.2909)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7179 (0.9222)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [80]  [1250/1251]  eta: 0:00:00  lr: 0.003550  min_lr: 0.003550  loss: 3.6430 (3.2912)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6912 (0.9127)  time: 0.5334  data: 0.0007  max mem: 54228
Epoch: [80] Total time: 0:13:09 (0.6309 s / it)
Averaged stats: lr: 0.003550  min_lr: 0.003550  loss: 3.6430 (3.3047)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6912 (0.9127)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.7801 (0.7801)  acc1: 85.2000 (85.2000)  acc5: 98.8000 (98.8000)  time: 5.4371  data: 5.0940  max mem: 54228
Test:  [10/25]  eta: 0:00:11  loss: 0.9979 (0.9748)  acc1: 83.2000 (82.5455)  acc5: 97.6000 (96.9091)  time: 0.7945  data: 0.4921  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.2309 (1.1283)  acc1: 76.4000 (78.7048)  acc5: 94.8000 (95.0857)  time: 0.3142  data: 0.0160  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.2619 (1.1423)  acc1: 76.4000 (78.3680)  acc5: 94.0000 (94.8800)  time: 0.2983  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5277 s / it)
* Acc@1 78.840 Acc@5 95.010 loss 1.133
Accuracy of the model on the 50000 test images: 78.8%
Max accuracy: 78.96%
Epoch: [81]  [   0/1251]  eta: 1:23:31  lr: 0.003550  min_lr: 0.003550  loss: 3.8794 (3.8794)  weight_decay: 0.0500 (0.0500)  time: 4.0060  data: 3.0587  max mem: 54228
Epoch: [81]  [ 200/1251]  eta: 0:11:21  lr: 0.003547  min_lr: 0.003547  loss: 3.4182 (3.2224)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7232 (0.8805)  time: 0.6274  data: 0.0004  max mem: 54228
Epoch: [81]  [ 400/1251]  eta: 0:09:03  lr: 0.003545  min_lr: 0.003545  loss: 3.2807 (3.2695)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7544 (0.9448)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [81]  [ 600/1251]  eta: 0:06:53  lr: 0.003543  min_lr: 0.003543  loss: 3.4320 (3.2670)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7121 (0.8998)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [81]  [ 800/1251]  eta: 0:04:45  lr: 0.003541  min_lr: 0.003541  loss: 3.4278 (3.2790)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8623 (0.9196)  time: 0.6290  data: 0.0005  max mem: 54228
Epoch: [81]  [1000/1251]  eta: 0:02:38  lr: 0.003538  min_lr: 0.003538  loss: 3.4901 (3.2803)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8964 (0.9313)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [81]  [1200/1251]  eta: 0:00:32  lr: 0.003536  min_lr: 0.003536  loss: 3.5113 (3.2907)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0598 (0.9379)  time: 0.6338  data: 0.0005  max mem: 54228
Epoch: [81]  [1250/1251]  eta: 0:00:00  lr: 0.003535  min_lr: 0.003535  loss: 3.3587 (3.2881)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8983 (0.9378)  time: 0.5335  data: 0.0007  max mem: 54228
Epoch: [81] Total time: 0:13:09 (0.6313 s / it)
Averaged stats: lr: 0.003535  min_lr: 0.003535  loss: 3.3587 (3.2981)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8983 (0.9378)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.6947 (0.6947)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 5.8744  data: 5.5330  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8624 (0.8581)  acc1: 84.8000 (82.9091)  acc5: 97.2000 (96.9455)  time: 0.8064  data: 0.5033  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.0238 (1.0206)  acc1: 78.0000 (79.5619)  acc5: 94.4000 (94.9905)  time: 0.2996  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0985 (1.0348)  acc1: 77.6000 (79.2160)  acc5: 94.0000 (94.8800)  time: 0.2996  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5268 s / it)
* Acc@1 79.140 Acc@5 95.046 loss 1.029
Accuracy of the model on the 50000 test images: 79.1%
Max accuracy: 79.14%
Epoch: [82]  [   0/1251]  eta: 1:09:52  lr: 0.003535  min_lr: 0.003535  loss: 3.4553 (3.4553)  weight_decay: 0.0500 (0.0500)  time: 3.3514  data: 2.7204  max mem: 54228
Epoch: [82]  [ 200/1251]  eta: 0:11:14  lr: 0.003533  min_lr: 0.003533  loss: 3.2992 (3.2609)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0025 (0.9807)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [82]  [ 400/1251]  eta: 0:09:00  lr: 0.003531  min_lr: 0.003531  loss: 3.3068 (3.2713)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7425 (0.9822)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [82]  [ 600/1251]  eta: 0:06:52  lr: 0.003528  min_lr: 0.003528  loss: 3.4550 (3.2816)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8082 (0.9330)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [82]  [ 800/1251]  eta: 0:04:45  lr: 0.003526  min_lr: 0.003526  loss: 3.2770 (3.2729)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7127 (0.9139)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [82]  [1000/1251]  eta: 0:02:38  lr: 0.003524  min_lr: 0.003524  loss: 3.2752 (3.2832)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7261 (0.9055)  time: 0.6368  data: 0.0005  max mem: 54228
Epoch: [82]  [1200/1251]  eta: 0:00:32  lr: 0.003521  min_lr: 0.003521  loss: 3.3812 (3.2900)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6609 (0.9002)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [82]  [1250/1251]  eta: 0:00:00  lr: 0.003521  min_lr: 0.003521  loss: 3.2770 (3.2868)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9157 (0.9023)  time: 0.5334  data: 0.0005  max mem: 54228
Epoch: [82] Total time: 0:13:08 (0.6306 s / it)
Averaged stats: lr: 0.003521  min_lr: 0.003521  loss: 3.2770 (3.2986)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9157 (0.9023)
Test:  [ 0/25]  eta: 0:02:44  loss: 0.6391 (0.6391)  acc1: 87.2000 (87.2000)  acc5: 98.8000 (98.8000)  time: 6.5877  data: 6.2676  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.7925 (0.7940)  acc1: 84.0000 (83.4909)  acc5: 97.2000 (96.9455)  time: 0.8704  data: 0.5701  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9990 (0.9658)  acc1: 77.2000 (79.0667)  acc5: 94.8000 (94.9524)  time: 0.2991  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0637 (0.9774)  acc1: 76.4000 (78.8960)  acc5: 94.4000 (94.9120)  time: 0.2995  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5558 s / it)
* Acc@1 79.220 Acc@5 95.106 loss 0.970
Accuracy of the model on the 50000 test images: 79.2%
Max accuracy: 79.22%
Epoch: [83]  [   0/1251]  eta: 1:11:30  lr: 0.003521  min_lr: 0.003521  loss: 3.4250 (3.4250)  weight_decay: 0.0500 (0.0500)  time: 3.4295  data: 2.7944  max mem: 54228
Epoch: [83]  [ 200/1251]  eta: 0:11:15  lr: 0.003519  min_lr: 0.003519  loss: 3.2708 (3.2780)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8927 (0.9193)  time: 0.6275  data: 0.0005  max mem: 54228
Epoch: [83]  [ 400/1251]  eta: 0:09:01  lr: 0.003516  min_lr: 0.003516  loss: 3.2155 (3.2911)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8895 (0.9132)  time: 0.6275  data: 0.0005  max mem: 54228
Epoch: [83]  [ 600/1251]  eta: 0:06:52  lr: 0.003514  min_lr: 0.003514  loss: 3.3964 (3.2911)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8678 (0.9574)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [83]  [ 800/1251]  eta: 0:04:45  lr: 0.003512  min_lr: 0.003512  loss: 3.4761 (3.2930)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7834 (0.9484)  time: 0.6277  data: 0.0005  max mem: 54228
Epoch: [83]  [1000/1251]  eta: 0:02:38  lr: 0.003509  min_lr: 0.003509  loss: 3.3588 (3.2918)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8138 (0.9275)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [83]  [1200/1251]  eta: 0:00:32  lr: 0.003507  min_lr: 0.003507  loss: 3.5656 (3.3005)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8707 (0.9282)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [83]  [1250/1251]  eta: 0:00:00  lr: 0.003506  min_lr: 0.003506  loss: 3.5781 (3.2994)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7250 (0.9324)  time: 0.5335  data: 0.0007  max mem: 54228
Epoch: [83] Total time: 0:13:08 (0.6303 s / it)
Averaged stats: lr: 0.003506  min_lr: 0.003506  loss: 3.5781 (3.2913)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7250 (0.9324)
Test:  [ 0/25]  eta: 0:02:37  loss: 0.6361 (0.6361)  acc1: 87.2000 (87.2000)  acc5: 98.0000 (98.0000)  time: 6.3186  data: 5.9898  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8512 (0.8107)  acc1: 81.2000 (82.1818)  acc5: 97.2000 (97.0546)  time: 0.8469  data: 0.5448  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9750 (0.9818)  acc1: 78.0000 (78.4381)  acc5: 94.8000 (94.9143)  time: 0.2996  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0467 (0.9885)  acc1: 78.0000 (78.2880)  acc5: 94.4000 (94.8960)  time: 0.2995  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5469 s / it)
* Acc@1 79.166 Acc@5 95.058 loss 0.968
Accuracy of the model on the 50000 test images: 79.2%
Max accuracy: 79.22%
Epoch: [84]  [   0/1251]  eta: 1:27:24  lr: 0.003506  min_lr: 0.003506  loss: 3.8556 (3.8556)  weight_decay: 0.0500 (0.0500)  time: 4.1923  data: 2.9143  max mem: 54228
Epoch: [84]  [ 200/1251]  eta: 0:11:21  lr: 0.003504  min_lr: 0.003504  loss: 3.3588 (3.2376)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7499 (0.8134)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [84]  [ 400/1251]  eta: 0:09:03  lr: 0.003502  min_lr: 0.003502  loss: 3.2645 (3.2836)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7854 (0.8925)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [84]  [ 600/1251]  eta: 0:06:53  lr: 0.003499  min_lr: 0.003499  loss: 3.3130 (3.2762)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0437 (0.8859)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [84]  [ 800/1251]  eta: 0:04:46  lr: 0.003497  min_lr: 0.003497  loss: 3.5502 (3.2779)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0138 (0.8941)  time: 0.6291  data: 0.0005  max mem: 54228
Epoch: [84]  [1000/1251]  eta: 0:02:38  lr: 0.003494  min_lr: 0.003494  loss: 3.3927 (3.2841)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8153 (0.8820)  time: 0.6295  data: 0.0005  max mem: 54228
Epoch: [84]  [1200/1251]  eta: 0:00:32  lr: 0.003492  min_lr: 0.003492  loss: 3.4565 (3.2920)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8766 (nan)  time: 0.6372  data: 0.0005  max mem: 54228
Epoch: [84]  [1250/1251]  eta: 0:00:00  lr: 0.003491  min_lr: 0.003491  loss: 3.3169 (3.2858)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8564 (nan)  time: 0.5369  data: 0.0007  max mem: 54228
Epoch: [84] Total time: 0:13:10 (0.6318 s / it)
Averaged stats: lr: 0.003491  min_lr: 0.003491  loss: 3.3169 (3.2833)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8564 (nan)
Test:  [ 0/25]  eta: 0:02:36  loss: 0.5696 (0.5696)  acc1: 88.8000 (88.8000)  acc5: 98.0000 (98.0000)  time: 6.2743  data: 5.9364  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7824 (0.7692)  acc1: 83.6000 (84.0000)  acc5: 97.6000 (97.3818)  time: 0.8428  data: 0.5401  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9850 (0.9451)  acc1: 78.0000 (79.9429)  acc5: 94.4000 (95.2381)  time: 0.2993  data: 0.0003  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0555 (0.9516)  acc1: 77.2000 (79.6960)  acc5: 94.4000 (95.2800)  time: 0.2991  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5423 s / it)
* Acc@1 79.390 Acc@5 95.284 loss 0.945
Accuracy of the model on the 50000 test images: 79.4%
Max accuracy: 79.39%
Epoch: [85]  [   0/1251]  eta: 1:13:35  lr: 0.003491  min_lr: 0.003491  loss: 3.6861 (3.6861)  weight_decay: 0.0500 (0.0500)  time: 3.5298  data: 2.8913  max mem: 54228
Epoch: [85]  [ 200/1251]  eta: 0:11:16  lr: 0.003489  min_lr: 0.003489  loss: 3.1688 (3.2736)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7909 (1.0424)  time: 0.6278  data: 0.0005  max mem: 54228
Epoch: [85]  [ 400/1251]  eta: 0:09:01  lr: 0.003487  min_lr: 0.003487  loss: 3.2050 (3.2765)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6705 (0.9387)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [85]  [ 600/1251]  eta: 0:06:52  lr: 0.003484  min_lr: 0.003484  loss: 3.4988 (3.2739)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7600 (0.9351)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [85]  [ 800/1251]  eta: 0:04:45  lr: 0.003482  min_lr: 0.003482  loss: 3.4111 (3.2640)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1130 (0.9135)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [85]  [1000/1251]  eta: 0:02:38  lr: 0.003479  min_lr: 0.003479  loss: 3.6622 (3.2706)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8964 (0.9161)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [85]  [1200/1251]  eta: 0:00:32  lr: 0.003477  min_lr: 0.003477  loss: 3.5113 (3.2786)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7655 (0.9111)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [85]  [1250/1251]  eta: 0:00:00  lr: 0.003476  min_lr: 0.003476  loss: 2.9868 (3.2776)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8654 (0.9156)  time: 0.5393  data: 0.0006  max mem: 54228
Epoch: [85] Total time: 0:13:09 (0.6308 s / it)
Averaged stats: lr: 0.003476  min_lr: 0.003476  loss: 2.9868 (3.2858)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8654 (0.9156)
Test:  [ 0/25]  eta: 0:02:38  loss: 0.5756 (0.5756)  acc1: 88.8000 (88.8000)  acc5: 97.6000 (97.6000)  time: 6.3346  data: 6.0008  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7393 (0.7611)  acc1: 82.4000 (82.6182)  acc5: 97.2000 (97.0182)  time: 0.8483  data: 0.5458  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9542 (0.9243)  acc1: 76.8000 (78.8952)  acc5: 94.8000 (95.1048)  time: 0.2996  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0477 (0.9355)  acc1: 76.8000 (78.7840)  acc5: 94.0000 (95.1200)  time: 0.2995  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5455 s / it)
* Acc@1 79.482 Acc@5 95.244 loss 0.917
Accuracy of the model on the 50000 test images: 79.5%
Max accuracy: 79.48%
Epoch: [86]  [   0/1251]  eta: 1:20:42  lr: 0.003476  min_lr: 0.003476  loss: 3.4854 (3.4854)  weight_decay: 0.0500 (0.0500)  time: 3.8709  data: 3.2257  max mem: 54228
Epoch: [86]  [ 200/1251]  eta: 0:11:17  lr: 0.003474  min_lr: 0.003474  loss: 3.3137 (3.2531)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6468 (0.7052)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [86]  [ 400/1251]  eta: 0:09:03  lr: 0.003472  min_lr: 0.003472  loss: 3.5310 (3.2758)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7309 (0.8228)  time: 0.6389  data: 0.0004  max mem: 54228
Epoch: [86]  [ 600/1251]  eta: 0:06:53  lr: 0.003469  min_lr: 0.003469  loss: 3.2119 (3.2828)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7421 (0.8472)  time: 0.6292  data: 0.0004  max mem: 54228
Epoch: [86]  [ 800/1251]  eta: 0:04:45  lr: 0.003467  min_lr: 0.003467  loss: 3.5949 (3.2807)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8428 (0.8820)  time: 0.6288  data: 0.0004  max mem: 54228
Epoch: [86]  [1000/1251]  eta: 0:02:38  lr: 0.003464  min_lr: 0.003464  loss: 3.1636 (3.2745)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6454 (0.8688)  time: 0.6290  data: 0.0005  max mem: 54228
Epoch: [86]  [1200/1251]  eta: 0:00:32  lr: 0.003462  min_lr: 0.003462  loss: 3.4413 (3.2780)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1943 (0.9009)  time: 0.6291  data: 0.0005  max mem: 54228
Epoch: [86]  [1250/1251]  eta: 0:00:00  lr: 0.003461  min_lr: 0.003461  loss: 3.2438 (3.2792)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0590 (0.9034)  time: 0.5334  data: 0.0005  max mem: 54228
Epoch: [86] Total time: 0:13:09 (0.6313 s / it)
Averaged stats: lr: 0.003461  min_lr: 0.003461  loss: 3.2438 (3.2833)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0590 (0.9034)
Test:  [ 0/25]  eta: 0:02:33  loss: 0.6363 (0.6363)  acc1: 87.6000 (87.6000)  acc5: 98.8000 (98.8000)  time: 6.1380  data: 5.8216  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7821 (0.7894)  acc1: 84.0000 (83.1636)  acc5: 97.6000 (97.3455)  time: 0.8318  data: 0.5296  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9675 (0.9642)  acc1: 76.8000 (79.3524)  acc5: 94.4000 (95.1238)  time: 0.3002  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0699 (0.9758)  acc1: 76.0000 (79.0240)  acc5: 94.0000 (94.9600)  time: 0.2996  data: 0.0002  max mem: 54228
Test: Total time: 0:00:13 (0.5386 s / it)
* Acc@1 79.132 Acc@5 95.106 loss 0.963
Accuracy of the model on the 50000 test images: 79.1%
Max accuracy: 79.48%
Epoch: [87]  [   0/1251]  eta: 1:19:13  lr: 0.003461  min_lr: 0.003461  loss: 3.4667 (3.4667)  weight_decay: 0.0500 (0.0500)  time: 3.7997  data: 3.1100  max mem: 54228
Epoch: [87]  [ 200/1251]  eta: 0:11:20  lr: 0.003459  min_lr: 0.003459  loss: 2.9761 (3.2218)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7329 (0.8927)  time: 0.6433  data: 0.0005  max mem: 54228
Epoch: [87]  [ 400/1251]  eta: 0:09:03  lr: 0.003456  min_lr: 0.003456  loss: 3.3927 (3.2554)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1061 (0.9063)  time: 0.6292  data: 0.0005  max mem: 54228
Epoch: [87]  [ 600/1251]  eta: 0:06:53  lr: 0.003454  min_lr: 0.003454  loss: 3.5322 (3.2681)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7931 (0.8874)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [87]  [ 800/1251]  eta: 0:04:46  lr: 0.003451  min_lr: 0.003451  loss: 3.4332 (3.2655)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7091 (0.8664)  time: 0.6289  data: 0.0005  max mem: 54228
Epoch: [87]  [1000/1251]  eta: 0:02:38  lr: 0.003449  min_lr: 0.003449  loss: 3.3596 (3.2738)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7910 (0.8908)  time: 0.6288  data: 0.0005  max mem: 54228
Epoch: [87]  [1200/1251]  eta: 0:00:32  lr: 0.003446  min_lr: 0.003446  loss: 2.7867 (3.2702)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8099 (0.9006)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [87]  [1250/1251]  eta: 0:00:00  lr: 0.003446  min_lr: 0.003446  loss: 3.5003 (3.2693)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8138 (0.9045)  time: 0.5466  data: 0.0007  max mem: 54228
Epoch: [87] Total time: 0:13:10 (0.6315 s / it)
Averaged stats: lr: 0.003446  min_lr: 0.003446  loss: 3.5003 (3.2816)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8138 (0.9045)
Test:  [ 0/25]  eta: 0:02:29  loss: 0.6785 (0.6785)  acc1: 86.4000 (86.4000)  acc5: 98.4000 (98.4000)  time: 5.9976  data: 5.6628  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8303 (0.8289)  acc1: 82.4000 (82.8364)  acc5: 97.2000 (97.1273)  time: 0.8177  data: 0.5151  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9268 (0.9932)  acc1: 78.8000 (79.3905)  acc5: 94.4000 (95.1048)  time: 0.2997  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0909 (1.0015)  acc1: 78.0000 (79.2960)  acc5: 94.0000 (94.9920)  time: 0.2997  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5324 s / it)
* Acc@1 79.306 Acc@5 95.116 loss 0.998
Accuracy of the model on the 50000 test images: 79.3%
Max accuracy: 79.48%
Epoch: [88]  [   0/1251]  eta: 1:28:45  lr: 0.003446  min_lr: 0.003446  loss: 2.9315 (2.9315)  weight_decay: 0.0500 (0.0500)  time: 4.2568  data: 1.9886  max mem: 54228
Epoch: [88]  [ 200/1251]  eta: 0:11:20  lr: 0.003443  min_lr: 0.003443  loss: 3.4997 (3.2410)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1321 (0.9052)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [88]  [ 400/1251]  eta: 0:09:02  lr: 0.003441  min_lr: 0.003441  loss: 3.3879 (3.2441)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6603 (0.8813)  time: 0.6279  data: 0.0005  max mem: 54228
Epoch: [88]  [ 600/1251]  eta: 0:06:53  lr: 0.003438  min_lr: 0.003438  loss: 3.2163 (3.2543)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8063 (0.8933)  time: 0.6388  data: 0.0005  max mem: 54228
Epoch: [88]  [ 800/1251]  eta: 0:04:45  lr: 0.003436  min_lr: 0.003436  loss: 3.3266 (3.2638)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5629 (0.8763)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [88]  [1000/1251]  eta: 0:02:38  lr: 0.003433  min_lr: 0.003433  loss: 3.5883 (3.2603)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9775 (0.9325)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [88]  [1200/1251]  eta: 0:00:32  lr: 0.003431  min_lr: 0.003431  loss: 3.3119 (3.2588)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7698 (0.9157)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [88]  [1250/1251]  eta: 0:00:00  lr: 0.003430  min_lr: 0.003430  loss: 3.2815 (3.2597)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7051 (0.9117)  time: 0.5332  data: 0.0007  max mem: 54228
Epoch: [88] Total time: 0:13:10 (0.6315 s / it)
Averaged stats: lr: 0.003430  min_lr: 0.003430  loss: 3.2815 (3.2701)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7051 (0.9117)
Test:  [ 0/25]  eta: 0:02:39  loss: 0.6716 (0.6716)  acc1: 85.6000 (85.6000)  acc5: 98.0000 (98.0000)  time: 6.3880  data: 6.0476  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7960 (0.8341)  acc1: 83.6000 (83.1636)  acc5: 97.2000 (97.0909)  time: 0.8531  data: 0.5501  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.0233 (1.0074)  acc1: 76.4000 (79.2952)  acc5: 95.2000 (95.0476)  time: 0.2995  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0746 (1.0213)  acc1: 75.6000 (78.9120)  acc5: 93.6000 (94.8320)  time: 0.2994  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5471 s / it)
* Acc@1 79.316 Acc@5 95.166 loss 1.004
Accuracy of the model on the 50000 test images: 79.3%
Max accuracy: 79.48%
Epoch: [89]  [   0/1251]  eta: 1:24:51  lr: 0.003430  min_lr: 0.003430  loss: 3.5784 (3.5784)  weight_decay: 0.0500 (0.0500)  time: 4.0703  data: 2.5356  max mem: 54228
Epoch: [89]  [ 200/1251]  eta: 0:11:18  lr: 0.003428  min_lr: 0.003428  loss: 3.3306 (3.2644)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7389 (0.8902)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [89]  [ 400/1251]  eta: 0:09:02  lr: 0.003425  min_lr: 0.003425  loss: 3.1792 (3.2606)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6883 (0.9044)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [89]  [ 600/1251]  eta: 0:06:53  lr: 0.003423  min_lr: 0.003423  loss: 3.3010 (3.2516)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0035 (0.9132)  time: 0.6276  data: 0.0004  max mem: 54228
Epoch: [89]  [ 800/1251]  eta: 0:04:45  lr: 0.003420  min_lr: 0.003420  loss: 3.2825 (3.2592)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6173 (0.8850)  time: 0.6336  data: 0.0005  max mem: 54228
Epoch: [89]  [1000/1251]  eta: 0:02:38  lr: 0.003418  min_lr: 0.003418  loss: 3.5411 (3.2734)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7301 (0.8807)  time: 0.6354  data: 0.0005  max mem: 54228
Epoch: [89]  [1200/1251]  eta: 0:00:32  lr: 0.003415  min_lr: 0.003415  loss: 3.2566 (3.2670)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8320 (0.8868)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [89]  [1250/1251]  eta: 0:00:00  lr: 0.003414  min_lr: 0.003414  loss: 3.3589 (3.2711)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7105 (0.8838)  time: 0.5333  data: 0.0007  max mem: 54228
Epoch: [89] Total time: 0:13:09 (0.6308 s / it)
Averaged stats: lr: 0.003414  min_lr: 0.003414  loss: 3.3589 (3.2728)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7105 (0.8838)
Test:  [ 0/25]  eta: 0:02:37  loss: 0.5843 (0.5843)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 6.3099  data: 5.9726  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8034 (0.7952)  acc1: 82.8000 (82.9818)  acc5: 97.6000 (97.1273)  time: 0.8462  data: 0.5433  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9509 (0.9668)  acc1: 78.0000 (79.2571)  acc5: 94.0000 (95.0286)  time: 0.3001  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0054 (0.9759)  acc1: 78.0000 (79.1360)  acc5: 94.0000 (95.0240)  time: 0.3002  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5441 s / it)
* Acc@1 79.296 Acc@5 95.134 loss 0.968
Accuracy of the model on the 50000 test images: 79.3%
Max accuracy: 79.48%
Epoch: [90]  [   0/1251]  eta: 1:21:30  lr: 0.003414  min_lr: 0.003414  loss: 3.5757 (3.5757)  weight_decay: 0.0500 (0.0500)  time: 3.9096  data: 2.8820  max mem: 54228
Epoch: [90]  [ 200/1251]  eta: 0:11:19  lr: 0.003412  min_lr: 0.003412  loss: 3.3211 (3.2707)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9312 (1.1107)  time: 0.6366  data: 0.0005  max mem: 54228
Epoch: [90]  [ 400/1251]  eta: 0:09:03  lr: 0.003409  min_lr: 0.003409  loss: 3.4208 (3.2763)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6641 (0.9847)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [90]  [ 600/1251]  eta: 0:06:53  lr: 0.003407  min_lr: 0.003407  loss: 3.2889 (3.2772)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5930 (0.9098)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [90]  [ 800/1251]  eta: 0:04:45  lr: 0.003404  min_lr: 0.003404  loss: 3.5159 (3.2704)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7710 (0.9220)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [90]  [1000/1251]  eta: 0:02:38  lr: 0.003402  min_lr: 0.003402  loss: 3.5025 (3.2798)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7973 (0.8980)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [90]  [1200/1251]  eta: 0:00:32  lr: 0.003399  min_lr: 0.003399  loss: 3.3425 (3.2824)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6019 (0.9006)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [90]  [1250/1251]  eta: 0:00:00  lr: 0.003398  min_lr: 0.003398  loss: 3.2105 (3.2806)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6666 (0.9029)  time: 0.5335  data: 0.0006  max mem: 54228
Epoch: [90] Total time: 0:13:09 (0.6310 s / it)
Averaged stats: lr: 0.003398  min_lr: 0.003398  loss: 3.2105 (3.2732)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6666 (0.9029)
Test:  [ 0/25]  eta: 0:02:35  loss: 0.6943 (0.6943)  acc1: 86.0000 (86.0000)  acc5: 98.8000 (98.8000)  time: 6.2290  data: 5.9013  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8482 (0.8300)  acc1: 83.2000 (82.8727)  acc5: 97.2000 (96.9818)  time: 0.8390  data: 0.5368  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.0080 (0.9818)  acc1: 77.2000 (79.4095)  acc5: 95.2000 (95.3333)  time: 0.2999  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.1043 (0.9938)  acc1: 77.2000 (78.9920)  acc5: 94.0000 (95.1040)  time: 0.2999  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5424 s / it)
* Acc@1 79.530 Acc@5 95.266 loss 0.982
Accuracy of the model on the 50000 test images: 79.5%
Max accuracy: 79.53%
Epoch: [91]  [   0/1251]  eta: 1:21:57  lr: 0.003398  min_lr: 0.003398  loss: 3.6967 (3.6967)  weight_decay: 0.0500 (0.0500)  time: 3.9308  data: 3.2897  max mem: 54228
Epoch: [91]  [ 200/1251]  eta: 0:11:20  lr: 0.003396  min_lr: 0.003396  loss: 3.1175 (3.2545)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0336 (0.9123)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [91]  [ 400/1251]  eta: 0:09:03  lr: 0.003393  min_lr: 0.003393  loss: 3.4174 (3.2440)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7145 (0.8908)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [91]  [ 600/1251]  eta: 0:06:53  lr: 0.003391  min_lr: 0.003391  loss: 3.6485 (3.2305)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9290 (nan)  time: 0.6348  data: 0.0004  max mem: 54228
Epoch: [91]  [ 800/1251]  eta: 0:04:45  lr: 0.003388  min_lr: 0.003388  loss: 3.4974 (3.2491)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8141 (nan)  time: 0.6279  data: 0.0005  max mem: 54228
Epoch: [91]  [1000/1251]  eta: 0:02:38  lr: 0.003385  min_lr: 0.003385  loss: 3.4162 (3.2509)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8255 (nan)  time: 0.6278  data: 0.0004  max mem: 54228
Epoch: [91]  [1200/1251]  eta: 0:00:32  lr: 0.003383  min_lr: 0.003383  loss: 3.3327 (3.2517)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6398 (nan)  time: 0.6362  data: 0.0004  max mem: 54228
Epoch: [91]  [1250/1251]  eta: 0:00:00  lr: 0.003382  min_lr: 0.003382  loss: 3.3111 (3.2524)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6416 (nan)  time: 0.5336  data: 0.0006  max mem: 54228
Epoch: [91] Total time: 0:13:09 (0.6313 s / it)
Averaged stats: lr: 0.003382  min_lr: 0.003382  loss: 3.3111 (3.2623)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6416 (nan)
Test:  [ 0/25]  eta: 0:02:33  loss: 0.6033 (0.6033)  acc1: 89.2000 (89.2000)  acc5: 98.4000 (98.4000)  time: 6.1528  data: 5.7994  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8339 (0.7913)  acc1: 82.0000 (83.0182)  acc5: 97.6000 (97.2000)  time: 0.8318  data: 0.5275  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9664 (0.9385)  acc1: 77.2000 (79.6952)  acc5: 94.8000 (95.2381)  time: 0.2996  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0512 (0.9516)  acc1: 76.8000 (79.1040)  acc5: 94.4000 (95.0720)  time: 0.2995  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5393 s / it)
* Acc@1 79.526 Acc@5 95.360 loss 0.946
Accuracy of the model on the 50000 test images: 79.5%
Max accuracy: 79.53%
Epoch: [92]  [   0/1251]  eta: 1:28:13  lr: 0.003382  min_lr: 0.003382  loss: 3.3068 (3.3068)  weight_decay: 0.0500 (0.0500)  time: 4.2311  data: 2.7056  max mem: 54228
Epoch: [92]  [ 200/1251]  eta: 0:11:19  lr: 0.003380  min_lr: 0.003380  loss: 3.4880 (3.2133)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8560 (0.9901)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [92]  [ 400/1251]  eta: 0:09:02  lr: 0.003377  min_lr: 0.003377  loss: 3.4678 (3.2449)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8279 (0.9318)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [92]  [ 600/1251]  eta: 0:06:53  lr: 0.003374  min_lr: 0.003374  loss: 3.2066 (3.2363)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8347 (0.9072)  time: 0.6285  data: 0.0004  max mem: 54228
Epoch: [92]  [ 800/1251]  eta: 0:04:45  lr: 0.003372  min_lr: 0.003372  loss: 3.4892 (3.2517)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7455 (0.9077)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [92]  [1000/1251]  eta: 0:02:38  lr: 0.003369  min_lr: 0.003369  loss: 3.4668 (3.2594)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7014 (0.8943)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [92]  [1200/1251]  eta: 0:00:32  lr: 0.003367  min_lr: 0.003367  loss: 3.3053 (3.2541)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7560 (0.8942)  time: 0.6291  data: 0.0005  max mem: 54228
Epoch: [92]  [1250/1251]  eta: 0:00:00  lr: 0.003366  min_lr: 0.003366  loss: 3.2563 (3.2541)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7289 (0.8995)  time: 0.5335  data: 0.0005  max mem: 54228
Epoch: [92] Total time: 0:13:10 (0.6315 s / it)
Averaged stats: lr: 0.003366  min_lr: 0.003366  loss: 3.2563 (3.2706)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7289 (0.8995)
Test:  [ 0/25]  eta: 0:02:35  loss: 0.6602 (0.6602)  acc1: 87.2000 (87.2000)  acc5: 98.4000 (98.4000)  time: 6.2264  data: 5.9090  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8319 (0.8260)  acc1: 81.6000 (82.5818)  acc5: 96.8000 (97.1636)  time: 0.8386  data: 0.5375  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.0154 (0.9865)  acc1: 77.2000 (79.4476)  acc5: 95.6000 (95.1238)  time: 0.3001  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0328 (0.9981)  acc1: 76.4000 (78.9600)  acc5: 94.4000 (95.1360)  time: 0.3001  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5410 s / it)
* Acc@1 79.412 Acc@5 95.316 loss 0.988
Accuracy of the model on the 50000 test images: 79.4%
Max accuracy: 79.53%
Epoch: [93]  [   0/1251]  eta: 1:28:19  lr: 0.003366  min_lr: 0.003366  loss: 3.7717 (3.7717)  weight_decay: 0.0500 (0.0500)  time: 4.2364  data: 2.6192  max mem: 54228
Epoch: [93]  [ 200/1251]  eta: 0:11:19  lr: 0.003363  min_lr: 0.003363  loss: 3.3575 (3.2284)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8318 (0.8302)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [93]  [ 400/1251]  eta: 0:09:03  lr: 0.003361  min_lr: 0.003361  loss: 3.4907 (3.2223)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7961 (nan)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [93]  [ 600/1251]  eta: 0:06:53  lr: 0.003358  min_lr: 0.003358  loss: 3.2492 (3.2215)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7886 (nan)  time: 0.6279  data: 0.0005  max mem: 54228
Epoch: [93]  [ 800/1251]  eta: 0:04:45  lr: 0.003355  min_lr: 0.003355  loss: 3.3637 (3.2341)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9749 (nan)  time: 0.6361  data: 0.0005  max mem: 54228
Epoch: [93]  [1000/1251]  eta: 0:02:38  lr: 0.003353  min_lr: 0.003353  loss: 3.4343 (3.2474)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7932 (nan)  time: 0.6279  data: 0.0005  max mem: 54228
Epoch: [93]  [1200/1251]  eta: 0:00:32  lr: 0.003350  min_lr: 0.003350  loss: 3.3820 (3.2477)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6878 (nan)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [93]  [1250/1251]  eta: 0:00:00  lr: 0.003350  min_lr: 0.003350  loss: 3.3721 (3.2486)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6878 (nan)  time: 0.5333  data: 0.0007  max mem: 54228
Epoch: [93] Total time: 0:13:09 (0.6310 s / it)
Averaged stats: lr: 0.003350  min_lr: 0.003350  loss: 3.3721 (3.2492)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6878 (nan)
Test:  [ 0/25]  eta: 0:02:43  loss: 0.6456 (0.6456)  acc1: 88.0000 (88.0000)  acc5: 98.4000 (98.4000)  time: 6.5471  data: 6.2249  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.8238 (0.8205)  acc1: 83.6000 (83.5273)  acc5: 97.2000 (97.3091)  time: 0.8680  data: 0.5662  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.0008 (0.9838)  acc1: 77.6000 (79.5048)  acc5: 95.6000 (95.2952)  time: 0.3003  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0389 (0.9926)  acc1: 77.6000 (79.4560)  acc5: 94.4000 (95.2320)  time: 0.3005  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5542 s / it)
* Acc@1 79.578 Acc@5 95.280 loss 0.991
Accuracy of the model on the 50000 test images: 79.6%
Max accuracy: 79.58%
Epoch: [94]  [   0/1251]  eta: 1:03:35  lr: 0.003350  min_lr: 0.003350  loss: 3.6343 (3.6343)  weight_decay: 0.0500 (0.0500)  time: 3.0502  data: 2.4124  max mem: 54228
Epoch: [94]  [ 200/1251]  eta: 0:11:17  lr: 0.003347  min_lr: 0.003347  loss: 3.2713 (3.2138)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0272 (0.9989)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [94]  [ 400/1251]  eta: 0:09:01  lr: 0.003344  min_lr: 0.003344  loss: 3.3188 (3.2096)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6758 (0.8643)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [94]  [ 600/1251]  eta: 0:06:52  lr: 0.003342  min_lr: 0.003342  loss: 3.2364 (3.2387)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7444 (0.8949)  time: 0.6351  data: 0.0004  max mem: 54228
Epoch: [94]  [ 800/1251]  eta: 0:04:45  lr: 0.003339  min_lr: 0.003339  loss: 3.4188 (3.2554)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8263 (0.9023)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [94]  [1000/1251]  eta: 0:02:38  lr: 0.003336  min_lr: 0.003336  loss: 3.3348 (3.2507)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5432 (0.8769)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [94]  [1200/1251]  eta: 0:00:32  lr: 0.003334  min_lr: 0.003334  loss: 3.0618 (3.2546)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6334 (0.8709)  time: 0.6285  data: 0.0006  max mem: 54228
Epoch: [94]  [1250/1251]  eta: 0:00:00  lr: 0.003333  min_lr: 0.003333  loss: 3.5276 (3.2551)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7366 (0.8743)  time: 0.5336  data: 0.0006  max mem: 54228
Epoch: [94] Total time: 0:13:09 (0.6310 s / it)
Averaged stats: lr: 0.003333  min_lr: 0.003333  loss: 3.5276 (3.2584)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7366 (0.8743)
Test:  [ 0/25]  eta: 0:02:02  loss: 0.7169 (0.7169)  acc1: 87.2000 (87.2000)  acc5: 98.4000 (98.4000)  time: 4.9027  data: 4.5672  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8165 (0.8471)  acc1: 83.6000 (83.0545)  acc5: 98.0000 (97.4546)  time: 0.8175  data: 0.5150  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.0178 (1.0090)  acc1: 76.4000 (79.4667)  acc5: 95.6000 (95.2762)  time: 0.3541  data: 0.0549  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.1162 (1.0182)  acc1: 76.4000 (79.3600)  acc5: 94.4000 (95.1520)  time: 0.2993  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5315 s / it)
* Acc@1 79.858 Acc@5 95.362 loss 1.006
Accuracy of the model on the 50000 test images: 79.9%
Max accuracy: 79.86%
Epoch: [95]  [   0/1251]  eta: 1:09:11  lr: 0.003333  min_lr: 0.003333  loss: 2.3502 (2.3502)  weight_decay: 0.0500 (0.0500)  time: 3.3186  data: 2.6788  max mem: 54228
Epoch: [95]  [ 200/1251]  eta: 0:11:14  lr: 0.003330  min_lr: 0.003330  loss: 3.3430 (3.2042)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8495 (0.9521)  time: 0.6274  data: 0.0005  max mem: 54228
Epoch: [95]  [ 400/1251]  eta: 0:09:00  lr: 0.003327  min_lr: 0.003327  loss: 3.3622 (3.2039)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8310 (0.8924)  time: 0.6278  data: 0.0005  max mem: 54228
Epoch: [95]  [ 600/1251]  eta: 0:06:52  lr: 0.003325  min_lr: 0.003325  loss: 2.8774 (3.2139)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6160 (0.8559)  time: 0.6274  data: 0.0005  max mem: 54228
Epoch: [95]  [ 800/1251]  eta: 0:04:45  lr: 0.003322  min_lr: 0.003322  loss: 3.5010 (3.2407)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7313 (0.8866)  time: 0.6275  data: 0.0005  max mem: 54228
Epoch: [95]  [1000/1251]  eta: 0:02:38  lr: 0.003319  min_lr: 0.003319  loss: 3.4871 (3.2452)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7763 (0.8857)  time: 0.6342  data: 0.0005  max mem: 54228
Epoch: [95]  [1200/1251]  eta: 0:00:32  lr: 0.003317  min_lr: 0.003317  loss: 3.4007 (3.2438)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5760 (0.8849)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [95]  [1250/1251]  eta: 0:00:00  lr: 0.003316  min_lr: 0.003316  loss: 3.2380 (3.2435)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5883 (0.8747)  time: 0.5332  data: 0.0005  max mem: 54228
Epoch: [95] Total time: 0:13:08 (0.6303 s / it)
Averaged stats: lr: 0.003316  min_lr: 0.003316  loss: 3.2380 (3.2424)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5883 (0.8747)
Test:  [ 0/25]  eta: 0:02:46  loss: 0.6148 (0.6148)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 6.6767  data: 6.3588  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.8233 (0.7856)  acc1: 83.6000 (82.5818)  acc5: 97.6000 (97.3455)  time: 0.8794  data: 0.5784  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 0.9482 (0.9442)  acc1: 77.2000 (79.1429)  acc5: 95.2000 (95.2952)  time: 0.2996  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0708 (0.9585)  acc1: 76.8000 (78.9760)  acc5: 94.4000 (95.1840)  time: 0.2995  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5587 s / it)
* Acc@1 79.618 Acc@5 95.350 loss 0.942
Accuracy of the model on the 50000 test images: 79.6%
Max accuracy: 79.86%
Epoch: [96]  [   0/1251]  eta: 1:27:09  lr: 0.003316  min_lr: 0.003316  loss: 3.4567 (3.4567)  weight_decay: 0.0500 (0.0500)  time: 4.1801  data: 1.9024  max mem: 54228
Epoch: [96]  [ 200/1251]  eta: 0:11:19  lr: 0.003313  min_lr: 0.003313  loss: 3.5029 (3.2459)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6877 (0.9117)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [96]  [ 400/1251]  eta: 0:09:03  lr: 0.003311  min_lr: 0.003311  loss: 3.3709 (3.2548)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8300 (0.8517)  time: 0.6356  data: 0.0005  max mem: 54228
Epoch: [96]  [ 600/1251]  eta: 0:06:53  lr: 0.003308  min_lr: 0.003308  loss: 3.2664 (3.2639)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6907 (0.8611)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [96]  [ 800/1251]  eta: 0:04:45  lr: 0.003305  min_lr: 0.003305  loss: 3.5177 (3.2553)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6683 (0.8347)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [96]  [1000/1251]  eta: 0:02:38  lr: 0.003302  min_lr: 0.003302  loss: 3.2411 (3.2581)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7539 (0.8572)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [96]  [1200/1251]  eta: 0:00:32  lr: 0.003300  min_lr: 0.003300  loss: 3.4426 (3.2572)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0807 (0.8567)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [96]  [1250/1251]  eta: 0:00:00  lr: 0.003299  min_lr: 0.003299  loss: 3.2118 (3.2549)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8002 (0.8521)  time: 0.5334  data: 0.0007  max mem: 54228
Epoch: [96] Total time: 0:13:09 (0.6314 s / it)
Averaged stats: lr: 0.003299  min_lr: 0.003299  loss: 3.2118 (3.2433)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8002 (0.8521)
Test:  [ 0/25]  eta: 0:02:44  loss: 0.6424 (0.6424)  acc1: 88.0000 (88.0000)  acc5: 98.4000 (98.4000)  time: 6.5918  data: 6.2611  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.8474 (0.8028)  acc1: 82.0000 (83.1636)  acc5: 97.6000 (97.3455)  time: 0.8718  data: 0.5696  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.0253 (0.9494)  acc1: 78.0000 (79.9048)  acc5: 95.2000 (95.3905)  time: 0.2999  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0331 (0.9545)  acc1: 78.8000 (79.9840)  acc5: 94.0000 (95.3280)  time: 0.3000  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5580 s / it)
* Acc@1 79.792 Acc@5 95.380 loss 0.943
Accuracy of the model on the 50000 test images: 79.8%
Max accuracy: 79.86%
Epoch: [97]  [   0/1251]  eta: 1:25:30  lr: 0.003299  min_lr: 0.003299  loss: 3.2240 (3.2240)  weight_decay: 0.0500 (0.0500)  time: 4.1007  data: 1.8275  max mem: 54228
Epoch: [97]  [ 200/1251]  eta: 0:11:20  lr: 0.003296  min_lr: 0.003296  loss: 3.3122 (3.1714)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7995 (0.8732)  time: 0.6418  data: 0.0004  max mem: 54228
Epoch: [97]  [ 400/1251]  eta: 0:09:03  lr: 0.003294  min_lr: 0.003294  loss: 3.2806 (3.1931)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9419 (0.9047)  time: 0.6356  data: 0.0004  max mem: 54228
Epoch: [97]  [ 600/1251]  eta: 0:06:53  lr: 0.003291  min_lr: 0.003291  loss: 3.0720 (3.2018)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5878 (0.8811)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [97]  [ 800/1251]  eta: 0:04:45  lr: 0.003288  min_lr: 0.003288  loss: 3.3343 (3.2248)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7222 (0.9372)  time: 0.6285  data: 0.0004  max mem: 54228
Epoch: [97]  [1000/1251]  eta: 0:02:38  lr: 0.003285  min_lr: 0.003285  loss: 3.5074 (3.2453)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6726 (0.9163)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [97]  [1200/1251]  eta: 0:00:32  lr: 0.003283  min_lr: 0.003283  loss: 3.5142 (3.2575)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7213 (0.8970)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [97]  [1250/1251]  eta: 0:00:00  lr: 0.003282  min_lr: 0.003282  loss: 3.3483 (3.2562)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6618 (0.8891)  time: 0.5330  data: 0.0007  max mem: 54228
Epoch: [97] Total time: 0:13:09 (0.6311 s / it)
Averaged stats: lr: 0.003282  min_lr: 0.003282  loss: 3.3483 (3.2410)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6618 (0.8891)
Test:  [ 0/25]  eta: 0:02:44  loss: 0.6826 (0.6826)  acc1: 87.6000 (87.6000)  acc5: 98.4000 (98.4000)  time: 6.5755  data: 6.2561  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.8302 (0.8551)  acc1: 82.8000 (83.4182)  acc5: 97.6000 (97.2727)  time: 0.8704  data: 0.5691  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.0278 (0.9897)  acc1: 77.2000 (80.2286)  acc5: 94.4000 (95.2571)  time: 0.3001  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0659 (0.9960)  acc1: 77.2000 (80.0320)  acc5: 94.4000 (95.3600)  time: 0.3002  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5575 s / it)
* Acc@1 79.914 Acc@5 95.482 loss 0.994
Accuracy of the model on the 50000 test images: 79.9%
Max accuracy: 79.91%
Epoch: [98]  [   0/1251]  eta: 1:13:32  lr: 0.003282  min_lr: 0.003282  loss: 3.3377 (3.3377)  weight_decay: 0.0500 (0.0500)  time: 3.5269  data: 2.8874  max mem: 54228
Epoch: [98]  [ 200/1251]  eta: 0:11:17  lr: 0.003279  min_lr: 0.003279  loss: 3.0109 (3.1894)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8427 (0.8989)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [98]  [ 400/1251]  eta: 0:09:01  lr: 0.003276  min_lr: 0.003276  loss: 3.2366 (3.2258)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3713 (0.9749)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [98]  [ 600/1251]  eta: 0:06:52  lr: 0.003274  min_lr: 0.003274  loss: 3.1967 (3.2352)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6079 (0.8978)  time: 0.6277  data: 0.0004  max mem: 54228
Epoch: [98]  [ 800/1251]  eta: 0:04:45  lr: 0.003271  min_lr: 0.003271  loss: 3.4602 (3.2364)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8696 (0.8882)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [98]  [1000/1251]  eta: 0:02:38  lr: 0.003268  min_lr: 0.003268  loss: 3.4693 (3.2378)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7160 (0.8666)  time: 0.6290  data: 0.0005  max mem: 54228
Epoch: [98]  [1200/1251]  eta: 0:00:32  lr: 0.003265  min_lr: 0.003265  loss: 3.4218 (3.2394)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6223 (0.8496)  time: 0.6339  data: 0.0004  max mem: 54228
Epoch: [98]  [1250/1251]  eta: 0:00:00  lr: 0.003265  min_lr: 0.003265  loss: 3.3962 (3.2386)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7830 (0.8558)  time: 0.5333  data: 0.0005  max mem: 54228
Epoch: [98] Total time: 0:13:09 (0.6308 s / it)
Averaged stats: lr: 0.003265  min_lr: 0.003265  loss: 3.3962 (3.2309)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7830 (0.8558)
Test:  [ 0/25]  eta: 0:02:40  loss: 0.6661 (0.6661)  acc1: 87.6000 (87.6000)  acc5: 99.2000 (99.2000)  time: 6.4218  data: 6.0784  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8288 (0.8404)  acc1: 82.8000 (83.3818)  acc5: 97.6000 (97.3818)  time: 0.8565  data: 0.5529  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.0113 (1.0068)  acc1: 77.2000 (79.4286)  acc5: 94.8000 (95.3333)  time: 0.3002  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0745 (1.0136)  acc1: 76.4000 (79.1200)  acc5: 94.8000 (95.3600)  time: 0.3004  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5538 s / it)
* Acc@1 79.522 Acc@5 95.284 loss 1.003
Accuracy of the model on the 50000 test images: 79.5%
Max accuracy: 79.91%
Epoch: [99]  [   0/1251]  eta: 1:20:40  lr: 0.003265  min_lr: 0.003265  loss: 3.0266 (3.0266)  weight_decay: 0.0500 (0.0500)  time: 3.8695  data: 3.1093  max mem: 54228
Epoch: [99]  [ 200/1251]  eta: 0:11:17  lr: 0.003262  min_lr: 0.003262  loss: 3.1738 (3.2071)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7094 (0.8321)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [99]  [ 400/1251]  eta: 0:09:01  lr: 0.003259  min_lr: 0.003259  loss: 3.2074 (3.2167)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1490 (0.9241)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [99]  [ 600/1251]  eta: 0:06:53  lr: 0.003256  min_lr: 0.003256  loss: 3.4866 (3.2192)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1681 (0.9416)  time: 0.6277  data: 0.0005  max mem: 54228
Epoch: [99]  [ 800/1251]  eta: 0:04:45  lr: 0.003253  min_lr: 0.003253  loss: 3.2644 (3.2196)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9041 (0.9141)  time: 0.6278  data: 0.0005  max mem: 54228
Epoch: [99]  [1000/1251]  eta: 0:02:38  lr: 0.003251  min_lr: 0.003251  loss: 3.4620 (3.2254)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9873 (0.9109)  time: 0.6332  data: 0.0005  max mem: 54228
Epoch: [99]  [1200/1251]  eta: 0:00:32  lr: 0.003248  min_lr: 0.003248  loss: 3.2288 (3.2244)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8525 (0.8903)  time: 0.6278  data: 0.0005  max mem: 54228
Epoch: [99]  [1250/1251]  eta: 0:00:00  lr: 0.003247  min_lr: 0.003247  loss: 3.0874 (3.2209)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1509 (0.9051)  time: 0.5332  data: 0.0007  max mem: 54228
Epoch: [99] Total time: 0:13:09 (0.6308 s / it)
Averaged stats: lr: 0.003247  min_lr: 0.003247  loss: 3.0874 (3.2420)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1509 (0.9051)
Test:  [ 0/25]  eta: 0:02:35  loss: 0.6062 (0.6062)  acc1: 88.0000 (88.0000)  acc5: 99.2000 (99.2000)  time: 6.2029  data: 5.8681  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7472 (0.7696)  acc1: 82.8000 (83.6364)  acc5: 97.6000 (97.4909)  time: 0.8362  data: 0.5338  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9438 (0.9234)  acc1: 78.4000 (79.7714)  acc5: 95.2000 (95.6381)  time: 0.2993  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0247 (0.9297)  acc1: 76.4000 (79.5040)  acc5: 94.0000 (95.5360)  time: 0.2992  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5394 s / it)
* Acc@1 79.902 Acc@5 95.478 loss 0.915
Accuracy of the model on the 50000 test images: 79.9%
Max accuracy: 79.91%
Epoch: [100]  [   0/1251]  eta: 1:25:47  lr: 0.003247  min_lr: 0.003247  loss: 3.6818 (3.6818)  weight_decay: 0.0500 (0.0500)  time: 4.1145  data: 2.5633  max mem: 54228
Epoch: [100]  [ 200/1251]  eta: 0:11:19  lr: 0.003244  min_lr: 0.003244  loss: 3.1317 (3.1560)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6400 (0.7556)  time: 0.6348  data: 0.0005  max mem: 54228
Epoch: [100]  [ 400/1251]  eta: 0:09:03  lr: 0.003242  min_lr: 0.003242  loss: 2.8876 (3.1641)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8382 (0.8033)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [100]  [ 600/1251]  eta: 0:06:53  lr: 0.003239  min_lr: 0.003239  loss: 3.2815 (3.1796)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6767 (0.7792)  time: 0.6278  data: 0.0005  max mem: 54228
Epoch: [100]  [ 800/1251]  eta: 0:04:45  lr: 0.003236  min_lr: 0.003236  loss: 3.0740 (3.1986)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8913 (0.8283)  time: 0.6344  data: 0.0004  max mem: 54228
Epoch: [100]  [1000/1251]  eta: 0:02:38  lr: 0.003233  min_lr: 0.003233  loss: 3.3750 (3.2046)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7275 (0.8337)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [100]  [1200/1251]  eta: 0:00:32  lr: 0.003230  min_lr: 0.003230  loss: 3.3951 (3.2144)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6843 (0.8300)  time: 0.6278  data: 0.0005  max mem: 54228
Epoch: [100]  [1250/1251]  eta: 0:00:00  lr: 0.003230  min_lr: 0.003230  loss: 3.4431 (3.2186)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7872 (0.8324)  time: 0.5329  data: 0.0006  max mem: 54228
Epoch: [100] Total time: 0:13:09 (0.6310 s / it)
Averaged stats: lr: 0.003230  min_lr: 0.003230  loss: 3.4431 (3.2252)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7872 (0.8324)
Test:  [ 0/25]  eta: 0:02:41  loss: 0.6891 (0.6891)  acc1: 87.6000 (87.6000)  acc5: 98.4000 (98.4000)  time: 6.4580  data: 6.1213  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8307 (0.8408)  acc1: 81.2000 (83.4909)  acc5: 97.6000 (97.2727)  time: 0.8595  data: 0.5568  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9964 (0.9973)  acc1: 78.4000 (79.9429)  acc5: 94.4000 (95.1619)  time: 0.2996  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.1229 (1.0089)  acc1: 78.4000 (79.6480)  acc5: 94.0000 (95.0240)  time: 0.2995  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5520 s / it)
* Acc@1 79.674 Acc@5 95.306 loss 0.999
Accuracy of the model on the 50000 test images: 79.7%
Max accuracy: 79.91%
Epoch: [101]  [   0/1251]  eta: 1:29:34  lr: 0.003230  min_lr: 0.003230  loss: 3.3058 (3.3058)  weight_decay: 0.0500 (0.0500)  time: 4.2963  data: 2.7214  max mem: 54228
Epoch: [101]  [ 200/1251]  eta: 0:11:22  lr: 0.003227  min_lr: 0.003227  loss: 3.4024 (3.2414)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5689 (0.7585)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [101]  [ 400/1251]  eta: 0:09:03  lr: 0.003224  min_lr: 0.003224  loss: 3.3003 (3.2387)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8322 (0.8097)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [101]  [ 600/1251]  eta: 0:06:53  lr: 0.003221  min_lr: 0.003221  loss: 3.1086 (3.2304)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9122 (0.8410)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [101]  [ 800/1251]  eta: 0:04:46  lr: 0.003218  min_lr: 0.003218  loss: 3.2942 (3.2287)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8358 (0.8564)  time: 0.6289  data: 0.0005  max mem: 54228
Epoch: [101]  [1000/1251]  eta: 0:02:38  lr: 0.003215  min_lr: 0.003215  loss: 3.3184 (3.2323)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6901 (0.8319)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [101]  [1200/1251]  eta: 0:00:32  lr: 0.003212  min_lr: 0.003212  loss: 3.0944 (3.2411)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7466 (0.8220)  time: 0.6340  data: 0.0005  max mem: 54228
Epoch: [101]  [1250/1251]  eta: 0:00:00  lr: 0.003212  min_lr: 0.003212  loss: 3.2077 (3.2396)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6854 (0.8173)  time: 0.5329  data: 0.0007  max mem: 54228
Epoch: [101] Total time: 0:13:10 (0.6317 s / it)
Averaged stats: lr: 0.003212  min_lr: 0.003212  loss: 3.2077 (3.2271)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6854 (0.8173)
Test:  [ 0/25]  eta: 0:02:45  loss: 0.7031 (0.7031)  acc1: 87.2000 (87.2000)  acc5: 99.6000 (99.6000)  time: 6.6083  data: 6.2710  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.8963 (0.8497)  acc1: 84.0000 (83.6000)  acc5: 97.2000 (97.1636)  time: 0.8723  data: 0.5704  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.0005 (0.9912)  acc1: 77.6000 (80.1143)  acc5: 95.2000 (95.3333)  time: 0.2986  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0809 (0.9990)  acc1: 78.4000 (80.0160)  acc5: 94.0000 (95.2320)  time: 0.2985  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5547 s / it)
* Acc@1 80.054 Acc@5 95.366 loss 0.993
Accuracy of the model on the 50000 test images: 80.1%
Max accuracy: 80.05%
Epoch: [102]  [   0/1251]  eta: 1:15:45  lr: 0.003212  min_lr: 0.003212  loss: 3.2592 (3.2592)  weight_decay: 0.0500 (0.0500)  time: 3.6332  data: 2.9972  max mem: 54228
Epoch: [102]  [ 200/1251]  eta: 0:11:16  lr: 0.003209  min_lr: 0.003209  loss: 3.1537 (3.2466)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7642 (0.9087)  time: 0.6278  data: 0.0005  max mem: 54228
Epoch: [102]  [ 400/1251]  eta: 0:09:01  lr: 0.003206  min_lr: 0.003206  loss: 3.1920 (3.2120)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8462 (0.9445)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [102]  [ 600/1251]  eta: 0:06:52  lr: 0.003203  min_lr: 0.003203  loss: 3.3605 (3.2014)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8753 (0.9243)  time: 0.6276  data: 0.0005  max mem: 54228
Epoch: [102]  [ 800/1251]  eta: 0:04:45  lr: 0.003200  min_lr: 0.003200  loss: 3.2723 (3.2066)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7203 (0.9272)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [102]  [1000/1251]  eta: 0:02:38  lr: 0.003197  min_lr: 0.003197  loss: 3.4294 (3.2211)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.6277  data: 0.0005  max mem: 54228
Epoch: [102]  [1200/1251]  eta: 0:00:32  lr: 0.003195  min_lr: 0.003195  loss: 3.5245 (3.2238)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7396 (nan)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [102]  [1250/1251]  eta: 0:00:00  lr: 0.003194  min_lr: 0.003194  loss: 3.3223 (3.2254)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6574 (nan)  time: 0.5336  data: 0.0005  max mem: 54228
Epoch: [102] Total time: 0:13:08 (0.6306 s / it)
Averaged stats: lr: 0.003194  min_lr: 0.003194  loss: 3.3223 (3.2272)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6574 (nan)
Test:  [ 0/25]  eta: 0:02:35  loss: 0.6868 (0.6868)  acc1: 88.8000 (88.8000)  acc5: 98.0000 (98.0000)  time: 6.2292  data: 5.8991  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8383 (0.8065)  acc1: 84.8000 (84.0364)  acc5: 96.8000 (97.0546)  time: 0.8385  data: 0.5366  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.0077 (0.9561)  acc1: 78.4000 (80.1714)  acc5: 94.8000 (95.3714)  time: 0.2994  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9560 (0.9582)  acc1: 78.4000 (79.9680)  acc5: 94.4000 (95.3600)  time: 0.2993  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5405 s / it)
* Acc@1 80.180 Acc@5 95.486 loss 0.953
Accuracy of the model on the 50000 test images: 80.2%
Max accuracy: 80.18%
Epoch: [103]  [   0/1251]  eta: 1:01:47  lr: 0.003194  min_lr: 0.003194  loss: 3.4946 (3.4946)  weight_decay: 0.0500 (0.0500)  time: 2.9639  data: 2.3291  max mem: 54228
Epoch: [103]  [ 200/1251]  eta: 0:11:12  lr: 0.003191  min_lr: 0.003191  loss: 3.3960 (3.2048)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7242 (0.9499)  time: 0.6277  data: 0.0004  max mem: 54228
Epoch: [103]  [ 400/1251]  eta: 0:09:00  lr: 0.003188  min_lr: 0.003188  loss: 3.3501 (3.1912)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8630 (0.9325)  time: 0.6291  data: 0.0004  max mem: 54228
Epoch: [103]  [ 600/1251]  eta: 0:06:52  lr: 0.003185  min_lr: 0.003185  loss: 3.2022 (3.2189)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7720 (0.8954)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [103]  [ 800/1251]  eta: 0:04:45  lr: 0.003182  min_lr: 0.003182  loss: 3.2490 (3.2167)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8317 (0.9030)  time: 0.6285  data: 0.0004  max mem: 54228
Epoch: [103]  [1000/1251]  eta: 0:02:38  lr: 0.003179  min_lr: 0.003179  loss: 3.3456 (3.2229)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8618 (0.8803)  time: 0.6327  data: 0.0004  max mem: 54228
Epoch: [103]  [1200/1251]  eta: 0:00:32  lr: 0.003176  min_lr: 0.003176  loss: 3.3723 (3.2132)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7369 (0.8784)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [103]  [1250/1251]  eta: 0:00:00  lr: 0.003176  min_lr: 0.003176  loss: 3.1561 (3.2140)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6380 (0.8706)  time: 0.5333  data: 0.0005  max mem: 54228
Epoch: [103] Total time: 0:13:08 (0.6302 s / it)
Averaged stats: lr: 0.003176  min_lr: 0.003176  loss: 3.1561 (3.2226)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6380 (0.8706)
Test:  [ 0/25]  eta: 0:02:36  loss: 0.6858 (0.6858)  acc1: 86.8000 (86.8000)  acc5: 99.2000 (99.2000)  time: 6.2568  data: 5.9389  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7965 (0.7975)  acc1: 83.6000 (83.4182)  acc5: 97.6000 (97.5636)  time: 0.8414  data: 0.5403  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9857 (0.9478)  acc1: 78.8000 (80.0000)  acc5: 95.2000 (95.6381)  time: 0.2997  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0426 (0.9580)  acc1: 79.6000 (80.1120)  acc5: 94.4000 (95.4560)  time: 0.2995  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5423 s / it)
* Acc@1 80.158 Acc@5 95.508 loss 0.961
Accuracy of the model on the 50000 test images: 80.2%
Max accuracy: 80.18%
Epoch: [104]  [   0/1251]  eta: 1:24:01  lr: 0.003176  min_lr: 0.003176  loss: 3.4858 (3.4858)  weight_decay: 0.0500 (0.0500)  time: 4.0299  data: 2.6749  max mem: 54228
Epoch: [104]  [ 200/1251]  eta: 0:11:22  lr: 0.003173  min_lr: 0.003173  loss: 3.4860 (3.2386)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7602 (0.7922)  time: 0.6361  data: 0.0005  max mem: 54228
Epoch: [104]  [ 400/1251]  eta: 0:09:03  lr: 0.003170  min_lr: 0.003170  loss: 3.3574 (3.2131)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7936 (0.7943)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [104]  [ 600/1251]  eta: 0:06:53  lr: 0.003167  min_lr: 0.003167  loss: 3.3604 (3.2101)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7687 (0.8167)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [104]  [ 800/1251]  eta: 0:04:45  lr: 0.003164  min_lr: 0.003164  loss: 3.1631 (3.2044)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8451 (0.8140)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [104]  [1000/1251]  eta: 0:02:38  lr: 0.003161  min_lr: 0.003161  loss: 3.4232 (3.2039)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7319 (0.8327)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [104]  [1200/1251]  eta: 0:00:32  lr: 0.003158  min_lr: 0.003158  loss: 3.4433 (3.2019)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7335 (0.8211)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [104]  [1250/1251]  eta: 0:00:00  lr: 0.003158  min_lr: 0.003158  loss: 3.4142 (3.2035)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6952 (0.8159)  time: 0.5378  data: 0.0005  max mem: 54228
Epoch: [104] Total time: 0:13:09 (0.6314 s / it)
Averaged stats: lr: 0.003158  min_lr: 0.003158  loss: 3.4142 (3.2143)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6952 (0.8159)
Test:  [ 0/25]  eta: 0:02:47  loss: 0.6314 (0.6314)  acc1: 86.8000 (86.8000)  acc5: 98.8000 (98.8000)  time: 6.7018  data: 6.3781  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.8461 (0.8058)  acc1: 83.6000 (83.0545)  acc5: 97.6000 (97.3818)  time: 0.8817  data: 0.5801  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 1.0074 (0.9495)  acc1: 78.4000 (79.8857)  acc5: 95.6000 (95.6952)  time: 0.2996  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0657 (0.9627)  acc1: 78.4000 (79.4880)  acc5: 94.8000 (95.6640)  time: 0.2996  data: 0.0001  max mem: 54228
Test: Total time: 0:00:14 (0.5611 s / it)
* Acc@1 79.980 Acc@5 95.496 loss 0.953
Accuracy of the model on the 50000 test images: 80.0%
Max accuracy: 80.18%
Epoch: [105]  [   0/1251]  eta: 1:20:46  lr: 0.003158  min_lr: 0.003158  loss: 2.3489 (2.3489)  weight_decay: 0.0500 (0.0500)  time: 3.8742  data: 1.8827  max mem: 54228
Epoch: [105]  [ 200/1251]  eta: 0:11:18  lr: 0.003155  min_lr: 0.003155  loss: 3.3929 (3.2014)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8527 (0.8834)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [105]  [ 400/1251]  eta: 0:09:02  lr: 0.003152  min_lr: 0.003152  loss: 3.3936 (3.2209)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7140 (0.8762)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [105]  [ 600/1251]  eta: 0:06:53  lr: 0.003149  min_lr: 0.003149  loss: 3.4454 (3.2274)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7330 (0.8677)  time: 0.6291  data: 0.0005  max mem: 54228
Epoch: [105]  [ 800/1251]  eta: 0:04:45  lr: 0.003146  min_lr: 0.003146  loss: 3.2501 (3.2288)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5820 (0.8715)  time: 0.6289  data: 0.0005  max mem: 54228
Epoch: [105]  [1000/1251]  eta: 0:02:38  lr: 0.003143  min_lr: 0.003143  loss: 3.2795 (3.2173)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9209 (0.8721)  time: 0.6377  data: 0.0005  max mem: 54228
Epoch: [105]  [1200/1251]  eta: 0:00:32  lr: 0.003140  min_lr: 0.003140  loss: 3.2134 (3.2195)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8014 (0.8572)  time: 0.6292  data: 0.0005  max mem: 54228
Epoch: [105]  [1250/1251]  eta: 0:00:00  lr: 0.003139  min_lr: 0.003139  loss: 3.2153 (3.2168)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7054 (0.8581)  time: 0.5338  data: 0.0007  max mem: 54228
Epoch: [105] Total time: 0:13:10 (0.6317 s / it)
Averaged stats: lr: 0.003139  min_lr: 0.003139  loss: 3.2153 (3.2231)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7054 (0.8581)
Test:  [ 0/25]  eta: 0:02:41  loss: 0.5836 (0.5836)  acc1: 88.8000 (88.8000)  acc5: 99.2000 (99.2000)  time: 6.4740  data: 6.1445  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7578 (0.7666)  acc1: 82.8000 (84.1818)  acc5: 97.6000 (97.4909)  time: 0.8609  data: 0.5589  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9525 (0.9262)  acc1: 77.6000 (80.2095)  acc5: 94.8000 (95.6571)  time: 0.2994  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0161 (0.9401)  acc1: 77.6000 (79.7760)  acc5: 94.8000 (95.5680)  time: 0.2993  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5503 s / it)
* Acc@1 79.900 Acc@5 95.470 loss 0.934
Accuracy of the model on the 50000 test images: 79.9%
Max accuracy: 80.18%
Epoch: [106]  [   0/1251]  eta: 1:22:22  lr: 0.003139  min_lr: 0.003139  loss: 2.4330 (2.4330)  weight_decay: 0.0500 (0.0500)  time: 3.9506  data: 2.4095  max mem: 54228
Epoch: [106]  [ 200/1251]  eta: 0:11:17  lr: 0.003136  min_lr: 0.003136  loss: 3.3550 (3.1686)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7342 (0.8630)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [106]  [ 400/1251]  eta: 0:09:02  lr: 0.003133  min_lr: 0.003133  loss: 3.1091 (3.1948)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7953 (0.8768)  time: 0.6277  data: 0.0004  max mem: 54228
Epoch: [106]  [ 600/1251]  eta: 0:06:53  lr: 0.003130  min_lr: 0.003130  loss: 2.9279 (3.1925)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6229 (0.8795)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [106]  [ 800/1251]  eta: 0:04:45  lr: 0.003127  min_lr: 0.003127  loss: 3.2978 (3.1942)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6480 (0.8445)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [106]  [1000/1251]  eta: 0:02:38  lr: 0.003124  min_lr: 0.003124  loss: 3.1643 (3.1901)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5917 (0.8463)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [106]  [1200/1251]  eta: 0:00:32  lr: 0.003121  min_lr: 0.003121  loss: 3.3029 (3.1921)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8797 (0.8578)  time: 0.6277  data: 0.0004  max mem: 54228
Epoch: [106]  [1250/1251]  eta: 0:00:00  lr: 0.003121  min_lr: 0.003121  loss: 3.1282 (3.1913)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7548 (0.8554)  time: 0.5326  data: 0.0006  max mem: 54228
Epoch: [106] Total time: 0:13:08 (0.6306 s / it)
Averaged stats: lr: 0.003121  min_lr: 0.003121  loss: 3.1282 (3.2042)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7548 (0.8554)
Test:  [ 0/25]  eta: 0:02:47  loss: 0.6196 (0.6196)  acc1: 87.6000 (87.6000)  acc5: 98.4000 (98.4000)  time: 6.7200  data: 6.3946  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.7748 (0.7936)  acc1: 82.4000 (83.6727)  acc5: 98.0000 (97.5636)  time: 0.8823  data: 0.5816  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 0.9933 (0.9505)  acc1: 78.0000 (79.8667)  acc5: 95.2000 (95.4667)  time: 0.2984  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9961 (0.9549)  acc1: 76.8000 (79.6320)  acc5: 94.4000 (95.5040)  time: 0.2983  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5593 s / it)
* Acc@1 80.028 Acc@5 95.438 loss 0.950
Accuracy of the model on the 50000 test images: 80.0%
Max accuracy: 80.18%
Epoch: [107]  [   0/1251]  eta: 1:24:51  lr: 0.003121  min_lr: 0.003121  loss: 2.5125 (2.5125)  weight_decay: 0.0500 (0.0500)  time: 4.0697  data: 2.7331  max mem: 54228
Epoch: [107]  [ 200/1251]  eta: 0:11:21  lr: 0.003118  min_lr: 0.003118  loss: 3.0482 (3.2553)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8988 (0.8906)  time: 0.6355  data: 0.0004  max mem: 54228
Epoch: [107]  [ 400/1251]  eta: 0:09:03  lr: 0.003115  min_lr: 0.003115  loss: 3.5414 (3.2346)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8266 (0.8539)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [107]  [ 600/1251]  eta: 0:06:53  lr: 0.003112  min_lr: 0.003112  loss: 3.3527 (3.2445)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5397 (0.8916)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [107]  [ 800/1251]  eta: 0:04:45  lr: 0.003109  min_lr: 0.003109  loss: 3.1839 (3.2366)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7625 (0.8787)  time: 0.6278  data: 0.0004  max mem: 54228
Epoch: [107]  [1000/1251]  eta: 0:02:38  lr: 0.003106  min_lr: 0.003106  loss: 3.1615 (3.2239)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6541 (0.8457)  time: 0.6278  data: 0.0004  max mem: 54228
Epoch: [107]  [1200/1251]  eta: 0:00:32  lr: 0.003103  min_lr: 0.003103  loss: 3.3141 (3.2205)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9129 (0.8613)  time: 0.6340  data: 0.0004  max mem: 54228
Epoch: [107]  [1250/1251]  eta: 0:00:00  lr: 0.003102  min_lr: 0.003102  loss: 3.3341 (3.2210)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8570 (0.8631)  time: 0.5333  data: 0.0005  max mem: 54228
Epoch: [107] Total time: 0:13:09 (0.6308 s / it)
Averaged stats: lr: 0.003102  min_lr: 0.003102  loss: 3.3341 (3.2046)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8570 (0.8631)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.6292 (0.6292)  acc1: 87.2000 (87.2000)  acc5: 98.8000 (98.8000)  time: 5.2832  data: 4.9463  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8367 (0.8001)  acc1: 82.8000 (83.2364)  acc5: 98.0000 (97.3091)  time: 0.8219  data: 0.5189  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.0013 (0.9730)  acc1: 77.6000 (79.6000)  acc5: 94.4000 (95.5619)  time: 0.3377  data: 0.0381  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.1073 (0.9819)  acc1: 77.2000 (79.3280)  acc5: 94.4000 (95.3600)  time: 0.2999  data: 0.0002  max mem: 54228
Test: Total time: 0:00:13 (0.5336 s / it)
* Acc@1 79.968 Acc@5 95.512 loss 0.966
Accuracy of the model on the 50000 test images: 80.0%
Max accuracy: 80.18%
Epoch: [108]  [   0/1251]  eta: 1:30:26  lr: 0.003102  min_lr: 0.003102  loss: 2.0021 (2.0021)  weight_decay: 0.0500 (0.0500)  time: 4.3376  data: 1.8245  max mem: 54228
Epoch: [108]  [ 200/1251]  eta: 0:11:21  lr: 0.003099  min_lr: 0.003099  loss: 3.1579 (3.2030)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6909 (0.7376)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [108]  [ 400/1251]  eta: 0:09:03  lr: 0.003096  min_lr: 0.003096  loss: 3.0859 (3.1838)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5556 (0.7761)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [108]  [ 600/1251]  eta: 0:06:54  lr: 0.003093  min_lr: 0.003093  loss: 3.0455 (3.1793)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9576 (0.8114)  time: 0.6289  data: 0.0005  max mem: 54228
Epoch: [108]  [ 800/1251]  eta: 0:04:46  lr: 0.003090  min_lr: 0.003090  loss: 3.4858 (3.1826)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6946 (0.8179)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [108]  [1000/1251]  eta: 0:02:38  lr: 0.003087  min_lr: 0.003087  loss: 3.2725 (3.1961)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8198 (0.8195)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [108]  [1200/1251]  eta: 0:00:32  lr: 0.003084  min_lr: 0.003084  loss: 3.4135 (3.1946)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8925 (0.8375)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [108]  [1250/1251]  eta: 0:00:00  lr: 0.003083  min_lr: 0.003083  loss: 3.2174 (3.1982)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9171 (0.8399)  time: 0.5333  data: 0.0005  max mem: 54228
Epoch: [108] Total time: 0:13:10 (0.6316 s / it)
Averaged stats: lr: 0.003083  min_lr: 0.003083  loss: 3.2174 (3.2066)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9171 (0.8399)
Test:  [ 0/25]  eta: 0:02:38  loss: 0.6828 (0.6828)  acc1: 86.8000 (86.8000)  acc5: 98.8000 (98.8000)  time: 6.3394  data: 5.9989  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7734 (0.8150)  acc1: 84.8000 (83.2727)  acc5: 98.0000 (97.5636)  time: 0.8495  data: 0.5457  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.0139 (0.9685)  acc1: 78.0000 (79.5810)  acc5: 95.2000 (95.6952)  time: 0.3000  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0244 (0.9742)  acc1: 77.2000 (79.4080)  acc5: 95.6000 (95.7280)  time: 0.2995  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5455 s / it)
* Acc@1 79.898 Acc@5 95.426 loss 0.974
Accuracy of the model on the 50000 test images: 79.9%
Max accuracy: 80.18%
Epoch: [109]  [   0/1251]  eta: 1:25:29  lr: 0.003083  min_lr: 0.003083  loss: 3.0114 (3.0114)  weight_decay: 0.0500 (0.0500)  time: 4.1006  data: 3.1125  max mem: 54228
Epoch: [109]  [ 200/1251]  eta: 0:11:19  lr: 0.003080  min_lr: 0.003080  loss: 3.1712 (3.1927)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8375 (0.9388)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [109]  [ 400/1251]  eta: 0:09:03  lr: 0.003077  min_lr: 0.003077  loss: 3.2214 (3.2112)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7849 (0.8787)  time: 0.6454  data: 0.0005  max mem: 54228
Epoch: [109]  [ 600/1251]  eta: 0:06:53  lr: 0.003074  min_lr: 0.003074  loss: 3.4563 (3.1984)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8106 (0.8629)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [109]  [ 800/1251]  eta: 0:04:45  lr: 0.003071  min_lr: 0.003071  loss: 3.5102 (3.2026)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0064 (0.8872)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [109]  [1000/1251]  eta: 0:02:38  lr: 0.003068  min_lr: 0.003068  loss: 3.2809 (3.2013)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5717 (nan)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [109]  [1200/1251]  eta: 0:00:32  lr: 0.003065  min_lr: 0.003065  loss: 3.3569 (3.2034)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8416 (nan)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [109]  [1250/1251]  eta: 0:00:00  lr: 0.003064  min_lr: 0.003064  loss: 3.4011 (3.2041)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8588 (nan)  time: 0.5334  data: 0.0005  max mem: 54228
Epoch: [109] Total time: 0:13:09 (0.6312 s / it)
Averaged stats: lr: 0.003064  min_lr: 0.003064  loss: 3.4011 (3.1918)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8588 (nan)
Test:  [ 0/25]  eta: 0:02:33  loss: 0.6748 (0.6748)  acc1: 86.8000 (86.8000)  acc5: 98.8000 (98.8000)  time: 6.1222  data: 5.7762  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8361 (0.8172)  acc1: 83.6000 (83.4545)  acc5: 97.6000 (97.3818)  time: 0.8284  data: 0.5254  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.0046 (0.9688)  acc1: 78.4000 (80.3429)  acc5: 95.6000 (95.6381)  time: 0.2989  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0458 (0.9784)  acc1: 78.4000 (80.0000)  acc5: 94.8000 (95.5680)  time: 0.2988  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5362 s / it)
* Acc@1 80.192 Acc@5 95.644 loss 0.972
Accuracy of the model on the 50000 test images: 80.2%
Max accuracy: 80.19%
Epoch: [110]  [   0/1251]  eta: 1:18:14  lr: 0.003064  min_lr: 0.003064  loss: 3.7290 (3.7290)  weight_decay: 0.0500 (0.0500)  time: 3.7529  data: 3.1127  max mem: 54228
Epoch: [110]  [ 200/1251]  eta: 0:11:17  lr: 0.003061  min_lr: 0.003061  loss: 3.3025 (3.2248)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8193 (0.7880)  time: 0.6276  data: 0.0004  max mem: 54228
Epoch: [110]  [ 400/1251]  eta: 0:09:02  lr: 0.003058  min_lr: 0.003058  loss: 2.9769 (3.2268)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8013 (0.8319)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [110]  [ 600/1251]  eta: 0:06:52  lr: 0.003055  min_lr: 0.003055  loss: 3.2111 (3.2240)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6945 (0.8212)  time: 0.6279  data: 0.0004  max mem: 54228
Epoch: [110]  [ 800/1251]  eta: 0:04:45  lr: 0.003052  min_lr: 0.003052  loss: 3.2042 (3.2070)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7652 (0.8239)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [110]  [1000/1251]  eta: 0:02:38  lr: 0.003049  min_lr: 0.003049  loss: 3.1520 (3.2126)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9273 (0.8478)  time: 0.6338  data: 0.0004  max mem: 54228
Epoch: [110]  [1200/1251]  eta: 0:00:32  lr: 0.003046  min_lr: 0.003046  loss: 3.4030 (3.2038)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9125 (0.8576)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [110]  [1250/1251]  eta: 0:00:00  lr: 0.003045  min_lr: 0.003045  loss: 3.3487 (3.2051)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9648 (0.8631)  time: 0.5332  data: 0.0005  max mem: 54228
Epoch: [110] Total time: 0:13:08 (0.6306 s / it)
Averaged stats: lr: 0.003045  min_lr: 0.003045  loss: 3.3487 (3.1979)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9648 (0.8631)
Test:  [ 0/25]  eta: 0:02:48  loss: 0.6354 (0.6354)  acc1: 87.2000 (87.2000)  acc5: 99.2000 (99.2000)  time: 6.7327  data: 6.4036  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.7950 (0.8041)  acc1: 84.0000 (83.8545)  acc5: 97.2000 (97.0545)  time: 0.8847  data: 0.5825  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 0.9702 (0.9486)  acc1: 77.6000 (80.2667)  acc5: 94.8000 (95.6952)  time: 0.3000  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0483 (0.9619)  acc1: 77.6000 (79.8720)  acc5: 94.8000 (95.6000)  time: 0.3001  data: 0.0001  max mem: 54228
Test: Total time: 0:00:14 (0.5615 s / it)
* Acc@1 80.314 Acc@5 95.688 loss 0.953
Accuracy of the model on the 50000 test images: 80.3%
Max accuracy: 80.31%
Epoch: [111]  [   0/1251]  eta: 1:10:42  lr: 0.003045  min_lr: 0.003045  loss: 2.4510 (2.4510)  weight_decay: 0.0500 (0.0500)  time: 3.3913  data: 2.7660  max mem: 54228
Epoch: [111]  [ 200/1251]  eta: 0:11:17  lr: 0.003042  min_lr: 0.003042  loss: 3.0407 (3.1975)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9145 (0.8110)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [111]  [ 400/1251]  eta: 0:09:01  lr: 0.003039  min_lr: 0.003039  loss: 3.2262 (3.1865)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6647 (0.7553)  time: 0.6275  data: 0.0005  max mem: 54228
Epoch: [111]  [ 600/1251]  eta: 0:06:52  lr: 0.003036  min_lr: 0.003036  loss: 3.2436 (3.1877)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7340 (0.7757)  time: 0.6330  data: 0.0005  max mem: 54228
Epoch: [111]  [ 800/1251]  eta: 0:04:45  lr: 0.003033  min_lr: 0.003033  loss: 3.4262 (3.1996)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8252 (0.8103)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [111]  [1000/1251]  eta: 0:02:38  lr: 0.003030  min_lr: 0.003030  loss: 3.0231 (3.1891)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9180 (0.8166)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [111]  [1200/1251]  eta: 0:00:32  lr: 0.003027  min_lr: 0.003027  loss: 3.3487 (3.2038)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6219 (0.8080)  time: 0.6361  data: 0.0004  max mem: 54228
Epoch: [111]  [1250/1251]  eta: 0:00:00  lr: 0.003026  min_lr: 0.003026  loss: 3.2075 (3.2006)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6219 (0.8110)  time: 0.5333  data: 0.0006  max mem: 54228
Epoch: [111] Total time: 0:13:08 (0.6305 s / it)
Averaged stats: lr: 0.003026  min_lr: 0.003026  loss: 3.2075 (3.1905)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6219 (0.8110)
Test:  [ 0/25]  eta: 0:02:44  loss: 0.5756 (0.5756)  acc1: 88.0000 (88.0000)  acc5: 98.4000 (98.4000)  time: 6.5960  data: 6.2630  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.7565 (0.7500)  acc1: 83.6000 (83.9273)  acc5: 97.2000 (97.4545)  time: 0.8720  data: 0.5697  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9092 (0.8971)  acc1: 79.6000 (80.5143)  acc5: 95.2000 (95.6381)  time: 0.2995  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9685 (0.9103)  acc1: 77.6000 (80.0640)  acc5: 94.4000 (95.4240)  time: 0.2995  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5583 s / it)
* Acc@1 80.376 Acc@5 95.624 loss 0.911
Accuracy of the model on the 50000 test images: 80.4%
Max accuracy: 80.38%
Epoch: [112]  [   0/1251]  eta: 1:10:48  lr: 0.003026  min_lr: 0.003026  loss: 3.6758 (3.6758)  weight_decay: 0.0500 (0.0500)  time: 3.3957  data: 2.7669  max mem: 54228
Epoch: [112]  [ 200/1251]  eta: 0:11:14  lr: 0.003023  min_lr: 0.003023  loss: 3.0133 (3.1502)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4012 (1.0160)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [112]  [ 400/1251]  eta: 0:09:01  lr: 0.003020  min_lr: 0.003020  loss: 3.2833 (3.1476)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7537 (0.8886)  time: 0.6363  data: 0.0004  max mem: 54228
Epoch: [112]  [ 600/1251]  eta: 0:06:52  lr: 0.003017  min_lr: 0.003017  loss: 2.8800 (3.1414)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7752 (0.8907)  time: 0.6288  data: 0.0004  max mem: 54228
Epoch: [112]  [ 800/1251]  eta: 0:04:45  lr: 0.003014  min_lr: 0.003014  loss: 3.1876 (3.1558)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9035 (0.8887)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [112]  [1000/1251]  eta: 0:02:38  lr: 0.003011  min_lr: 0.003011  loss: 3.0640 (3.1650)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7902 (0.8759)  time: 0.6288  data: 0.0004  max mem: 54228
Epoch: [112]  [1200/1251]  eta: 0:00:32  lr: 0.003007  min_lr: 0.003007  loss: 3.1480 (3.1689)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8715 (0.8800)  time: 0.6287  data: 0.0004  max mem: 54228
Epoch: [112]  [1250/1251]  eta: 0:00:00  lr: 0.003007  min_lr: 0.003007  loss: 3.3041 (3.1700)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8190 (0.8797)  time: 0.5331  data: 0.0005  max mem: 54228
Epoch: [112] Total time: 0:13:08 (0.6306 s / it)
Averaged stats: lr: 0.003007  min_lr: 0.003007  loss: 3.3041 (3.1850)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8190 (0.8797)
Test:  [ 0/25]  eta: 0:02:48  loss: 0.7228 (0.7228)  acc1: 88.0000 (88.0000)  acc5: 98.8000 (98.8000)  time: 6.7331  data: 6.4099  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.8102 (0.8374)  acc1: 85.6000 (84.6182)  acc5: 97.6000 (97.4182)  time: 0.8845  data: 0.5830  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 1.0509 (0.9908)  acc1: 78.8000 (80.9905)  acc5: 95.6000 (95.7143)  time: 0.2994  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0485 (1.0000)  acc1: 78.0000 (80.6240)  acc5: 94.8000 (95.5680)  time: 0.2994  data: 0.0001  max mem: 54228
Test: Total time: 0:00:14 (0.5615 s / it)
* Acc@1 80.154 Acc@5 95.628 loss 0.996
Accuracy of the model on the 50000 test images: 80.2%
Max accuracy: 80.38%
Epoch: [113]  [   0/1251]  eta: 1:30:56  lr: 0.003007  min_lr: 0.003007  loss: 3.8685 (3.8685)  weight_decay: 0.0500 (0.0500)  time: 4.3621  data: 2.6510  max mem: 54228
Epoch: [113]  [ 200/1251]  eta: 0:11:20  lr: 0.003004  min_lr: 0.003004  loss: 3.1510 (3.1691)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7459 (0.8303)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [113]  [ 400/1251]  eta: 0:09:04  lr: 0.003000  min_lr: 0.003000  loss: 3.0685 (3.1809)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7699 (0.8129)  time: 0.6289  data: 0.0005  max mem: 54228
Epoch: [113]  [ 600/1251]  eta: 0:06:54  lr: 0.002997  min_lr: 0.002997  loss: 3.2857 (3.1716)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8429 (0.8400)  time: 0.6288  data: 0.0006  max mem: 54228
Epoch: [113]  [ 800/1251]  eta: 0:04:46  lr: 0.002994  min_lr: 0.002994  loss: 3.3903 (3.1732)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8172 (0.8443)  time: 0.6290  data: 0.0005  max mem: 54228
Epoch: [113]  [1000/1251]  eta: 0:02:39  lr: 0.002991  min_lr: 0.002991  loss: 3.3401 (3.1738)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7251 (0.8435)  time: 0.6287  data: 0.0006  max mem: 54228
Epoch: [113]  [1200/1251]  eta: 0:00:32  lr: 0.002988  min_lr: 0.002988  loss: 3.4819 (3.1852)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7074 (0.8339)  time: 0.6336  data: 0.0005  max mem: 54228
Epoch: [113]  [1250/1251]  eta: 0:00:00  lr: 0.002987  min_lr: 0.002987  loss: 3.3137 (3.1827)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6256 (0.8354)  time: 0.5338  data: 0.0007  max mem: 54228
Epoch: [113] Total time: 0:13:10 (0.6318 s / it)
Averaged stats: lr: 0.002987  min_lr: 0.002987  loss: 3.3137 (3.1840)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6256 (0.8354)
Test:  [ 0/25]  eta: 0:02:46  loss: 0.6239 (0.6239)  acc1: 86.8000 (86.8000)  acc5: 98.0000 (98.0000)  time: 6.6482  data: 6.3228  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.7344 (0.7585)  acc1: 84.0000 (83.7455)  acc5: 97.6000 (97.4182)  time: 0.8767  data: 0.5751  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 0.9594 (0.8949)  acc1: 80.0000 (80.8000)  acc5: 95.6000 (95.8857)  time: 0.2995  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9413 (0.9045)  acc1: 80.0000 (80.5600)  acc5: 95.6000 (95.8720)  time: 0.2994  data: 0.0001  max mem: 54228
Test: Total time: 0:00:14 (0.5607 s / it)
* Acc@1 80.508 Acc@5 95.730 loss 0.911
Accuracy of the model on the 50000 test images: 80.5%
Max accuracy: 80.51%
Epoch: [114]  [   0/1251]  eta: 1:09:28  lr: 0.002987  min_lr: 0.002987  loss: 2.9809 (2.9809)  weight_decay: 0.0500 (0.0500)  time: 3.3321  data: 2.7013  max mem: 54228
Epoch: [114]  [ 200/1251]  eta: 0:11:16  lr: 0.002984  min_lr: 0.002984  loss: 3.2530 (3.1614)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8906 (0.9014)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [114]  [ 400/1251]  eta: 0:09:01  lr: 0.002981  min_lr: 0.002981  loss: 3.1324 (3.1606)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8048 (0.8617)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [114]  [ 600/1251]  eta: 0:06:52  lr: 0.002978  min_lr: 0.002978  loss: 3.3171 (3.1559)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6954 (0.8913)  time: 0.6341  data: 0.0005  max mem: 54228
Epoch: [114]  [ 800/1251]  eta: 0:04:45  lr: 0.002975  min_lr: 0.002975  loss: 3.3228 (3.1559)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8083 (0.8720)  time: 0.6285  data: 0.0006  max mem: 54228
Epoch: [114]  [1000/1251]  eta: 0:02:38  lr: 0.002972  min_lr: 0.002972  loss: 3.2820 (3.1599)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6350 (0.8614)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [114]  [1200/1251]  eta: 0:00:32  lr: 0.002968  min_lr: 0.002968  loss: 3.3638 (3.1656)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6941 (0.8501)  time: 0.6418  data: 0.0005  max mem: 54228
Epoch: [114]  [1250/1251]  eta: 0:00:00  lr: 0.002968  min_lr: 0.002968  loss: 3.1891 (3.1660)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8359 (0.8473)  time: 0.5330  data: 0.0006  max mem: 54228
Epoch: [114] Total time: 0:13:09 (0.6308 s / it)
Averaged stats: lr: 0.002968  min_lr: 0.002968  loss: 3.1891 (3.1783)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8359 (0.8473)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.5972 (0.5972)  acc1: 88.4000 (88.4000)  acc5: 99.2000 (99.2000)  time: 5.3234  data: 4.9933  max mem: 54228
Test:  [10/25]  eta: 0:00:11  loss: 0.7400 (0.7647)  acc1: 82.4000 (83.7818)  acc5: 97.6000 (97.4182)  time: 0.7877  data: 0.4865  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9280 (0.9188)  acc1: 78.4000 (80.4571)  acc5: 94.8000 (95.5619)  time: 0.3161  data: 0.0179  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0082 (0.9326)  acc1: 77.6000 (80.1920)  acc5: 94.8000 (95.5680)  time: 0.2983  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5231 s / it)
* Acc@1 80.420 Acc@5 95.698 loss 0.926
Accuracy of the model on the 50000 test images: 80.4%
Max accuracy: 80.51%
Epoch: [115]  [   0/1251]  eta: 1:30:34  lr: 0.002968  min_lr: 0.002968  loss: 3.6698 (3.6698)  weight_decay: 0.0500 (0.0500)  time: 4.3441  data: 2.9060  max mem: 54228
Epoch: [115]  [ 200/1251]  eta: 0:11:19  lr: 0.002965  min_lr: 0.002965  loss: 3.2484 (3.1786)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9830 (0.8598)  time: 0.6279  data: 0.0005  max mem: 54228
Epoch: [115]  [ 400/1251]  eta: 0:09:02  lr: 0.002961  min_lr: 0.002961  loss: 3.1058 (3.2064)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7943 (0.8594)  time: 0.6279  data: 0.0005  max mem: 54228
Epoch: [115]  [ 600/1251]  eta: 0:06:53  lr: 0.002958  min_lr: 0.002958  loss: 3.3797 (3.2006)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0360 (0.8765)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [115]  [ 800/1251]  eta: 0:04:45  lr: 0.002955  min_lr: 0.002955  loss: 3.3774 (3.1982)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6896 (0.8419)  time: 0.6278  data: 0.0005  max mem: 54228
Epoch: [115]  [1000/1251]  eta: 0:02:38  lr: 0.002952  min_lr: 0.002952  loss: 3.3557 (3.1989)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6902 (0.8246)  time: 0.6360  data: 0.0005  max mem: 54228
Epoch: [115]  [1200/1251]  eta: 0:00:32  lr: 0.002949  min_lr: 0.002949  loss: 3.2205 (3.1914)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6358 (0.8158)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [115]  [1250/1251]  eta: 0:00:00  lr: 0.002948  min_lr: 0.002948  loss: 3.4459 (3.1933)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6706 (0.8218)  time: 0.5335  data: 0.0005  max mem: 54228
Epoch: [115] Total time: 0:13:09 (0.6310 s / it)
Averaged stats: lr: 0.002948  min_lr: 0.002948  loss: 3.4459 (3.1735)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6706 (0.8218)
Test:  [ 0/25]  eta: 0:02:35  loss: 0.6869 (0.6869)  acc1: 88.4000 (88.4000)  acc5: 98.8000 (98.8000)  time: 6.2306  data: 5.9028  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8341 (0.8406)  acc1: 82.0000 (83.3455)  acc5: 98.0000 (97.5636)  time: 0.8388  data: 0.5369  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.0104 (0.9807)  acc1: 79.6000 (80.0381)  acc5: 95.2000 (95.6762)  time: 0.2996  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0428 (0.9871)  acc1: 78.8000 (79.7440)  acc5: 95.2000 (95.6160)  time: 0.2996  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5408 s / it)
* Acc@1 80.200 Acc@5 95.536 loss 0.984
Accuracy of the model on the 50000 test images: 80.2%
Max accuracy: 80.51%
Epoch: [116]  [   0/1251]  eta: 1:27:07  lr: 0.002948  min_lr: 0.002948  loss: 3.5055 (3.5055)  weight_decay: 0.0500 (0.0500)  time: 4.1787  data: 2.9279  max mem: 54228
Epoch: [116]  [ 200/1251]  eta: 0:11:19  lr: 0.002945  min_lr: 0.002945  loss: 3.0355 (3.1878)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.6293  data: 0.0004  max mem: 54228
Epoch: [116]  [ 400/1251]  eta: 0:09:03  lr: 0.002942  min_lr: 0.002942  loss: 3.2166 (3.1960)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7141 (nan)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [116]  [ 600/1251]  eta: 0:06:53  lr: 0.002938  min_lr: 0.002938  loss: 3.1068 (3.1776)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6875 (nan)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [116]  [ 800/1251]  eta: 0:04:45  lr: 0.002935  min_lr: 0.002935  loss: 3.2488 (3.1774)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6573 (nan)  time: 0.6291  data: 0.0005  max mem: 54228
Epoch: [116]  [1000/1251]  eta: 0:02:38  lr: 0.002932  min_lr: 0.002932  loss: 3.2854 (3.1705)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7786 (nan)  time: 0.6288  data: 0.0005  max mem: 54228
Epoch: [116]  [1200/1251]  eta: 0:00:32  lr: 0.002929  min_lr: 0.002929  loss: 3.2107 (3.1719)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8310 (nan)  time: 0.6287  data: 0.0004  max mem: 54228
Epoch: [116]  [1250/1251]  eta: 0:00:00  lr: 0.002928  min_lr: 0.002928  loss: 3.3744 (3.1726)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8821 (nan)  time: 0.5337  data: 0.0006  max mem: 54228
Epoch: [116] Total time: 0:13:10 (0.6315 s / it)
Averaged stats: lr: 0.002928  min_lr: 0.002928  loss: 3.3744 (3.1724)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8821 (nan)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.6257 (0.6257)  acc1: 88.4000 (88.4000)  acc5: 99.6000 (99.6000)  time: 5.8603  data: 5.5217  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7825 (0.8006)  acc1: 84.8000 (84.1818)  acc5: 97.6000 (97.4182)  time: 0.8054  data: 0.5023  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9150 (0.9468)  acc1: 78.0000 (80.2286)  acc5: 95.6000 (96.0000)  time: 0.3001  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0386 (0.9528)  acc1: 77.6000 (79.8720)  acc5: 95.2000 (95.8560)  time: 0.3001  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5272 s / it)
* Acc@1 80.372 Acc@5 95.712 loss 0.952
Accuracy of the model on the 50000 test images: 80.4%
Max accuracy: 80.51%
Epoch: [117]  [   0/1251]  eta: 1:20:18  lr: 0.002928  min_lr: 0.002928  loss: 2.8274 (2.8274)  weight_decay: 0.0500 (0.0500)  time: 3.8517  data: 2.8593  max mem: 54228
Epoch: [117]  [ 200/1251]  eta: 0:11:19  lr: 0.002925  min_lr: 0.002925  loss: 3.1480 (3.1791)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7509 (0.7922)  time: 0.6279  data: 0.0005  max mem: 54228
Epoch: [117]  [ 400/1251]  eta: 0:09:02  lr: 0.002922  min_lr: 0.002922  loss: 3.3360 (3.1580)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9675 (0.8444)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [117]  [ 600/1251]  eta: 0:06:53  lr: 0.002919  min_lr: 0.002919  loss: 3.2005 (3.1572)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7501 (0.8254)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [117]  [ 800/1251]  eta: 0:04:45  lr: 0.002915  min_lr: 0.002915  loss: 3.0064 (3.1537)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7669 (0.8260)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [117]  [1000/1251]  eta: 0:02:38  lr: 0.002912  min_lr: 0.002912  loss: 3.0511 (3.1574)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8231 (0.8574)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [117]  [1200/1251]  eta: 0:00:32  lr: 0.002909  min_lr: 0.002909  loss: 3.1762 (3.1580)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6582 (0.8448)  time: 0.6340  data: 0.0004  max mem: 54228
Epoch: [117]  [1250/1251]  eta: 0:00:00  lr: 0.002908  min_lr: 0.002908  loss: 3.3635 (3.1579)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6577 (0.8472)  time: 0.5426  data: 0.0005  max mem: 54228
Epoch: [117] Total time: 0:13:09 (0.6310 s / it)
Averaged stats: lr: 0.002908  min_lr: 0.002908  loss: 3.3635 (3.1634)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6577 (0.8472)
Test:  [ 0/25]  eta: 0:02:36  loss: 0.5965 (0.5965)  acc1: 87.6000 (87.6000)  acc5: 99.2000 (99.2000)  time: 6.2452  data: 5.9018  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8157 (0.7837)  acc1: 82.4000 (83.3818)  acc5: 97.2000 (97.3455)  time: 0.8605  data: 0.5573  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9765 (0.9184)  acc1: 78.4000 (80.6476)  acc5: 95.6000 (95.7333)  time: 0.3107  data: 0.0114  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0023 (0.9346)  acc1: 78.0000 (80.1440)  acc5: 95.6000 (95.7120)  time: 0.2994  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5547 s / it)
* Acc@1 80.400 Acc@5 95.692 loss 0.932
Accuracy of the model on the 50000 test images: 80.4%
Max accuracy: 80.51%
Epoch: [118]  [   0/1251]  eta: 1:26:22  lr: 0.002908  min_lr: 0.002908  loss: 3.8212 (3.8212)  weight_decay: 0.0500 (0.0500)  time: 4.1428  data: 3.2383  max mem: 54228
Epoch: [118]  [ 200/1251]  eta: 0:11:19  lr: 0.002905  min_lr: 0.002905  loss: 3.2158 (3.1837)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8000 (0.9634)  time: 0.6290  data: 0.0005  max mem: 54228
Epoch: [118]  [ 400/1251]  eta: 0:09:02  lr: 0.002902  min_lr: 0.002902  loss: 3.2688 (3.1930)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9128 (0.9414)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [118]  [ 600/1251]  eta: 0:06:53  lr: 0.002899  min_lr: 0.002899  loss: 3.2159 (3.1641)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6150 (0.8955)  time: 0.6293  data: 0.0005  max mem: 54228
Epoch: [118]  [ 800/1251]  eta: 0:04:45  lr: 0.002895  min_lr: 0.002895  loss: 3.3747 (3.1796)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9439 (0.8858)  time: 0.6288  data: 0.0005  max mem: 54228
Epoch: [118]  [1000/1251]  eta: 0:02:38  lr: 0.002892  min_lr: 0.002892  loss: 3.2899 (3.1710)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8220 (0.8683)  time: 0.6291  data: 0.0005  max mem: 54228
Epoch: [118]  [1200/1251]  eta: 0:00:32  lr: 0.002889  min_lr: 0.002889  loss: 3.0518 (3.1690)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6729 (0.8656)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [118]  [1250/1251]  eta: 0:00:00  lr: 0.002888  min_lr: 0.002888  loss: 3.3685 (3.1703)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8659 (0.8659)  time: 0.5337  data: 0.0007  max mem: 54228
Epoch: [118] Total time: 0:13:10 (0.6316 s / it)
Averaged stats: lr: 0.002888  min_lr: 0.002888  loss: 3.3685 (3.1609)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8659 (0.8659)
Test:  [ 0/25]  eta: 0:02:42  loss: 0.6334 (0.6334)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 6.5042  data: 6.1628  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8488 (0.8270)  acc1: 83.2000 (84.2182)  acc5: 97.6000 (97.3091)  time: 0.8637  data: 0.5605  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9936 (0.9694)  acc1: 79.6000 (80.5524)  acc5: 95.2000 (95.5810)  time: 0.2995  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0299 (0.9775)  acc1: 78.4000 (80.2720)  acc5: 94.8000 (95.6480)  time: 0.2994  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5519 s / it)
* Acc@1 80.470 Acc@5 95.642 loss 0.976
Accuracy of the model on the 50000 test images: 80.5%
Max accuracy: 80.51%
Epoch: [119]  [   0/1251]  eta: 1:20:30  lr: 0.002888  min_lr: 0.002888  loss: 2.5436 (2.5436)  weight_decay: 0.0500 (0.0500)  time: 3.8610  data: 3.1599  max mem: 54228
Epoch: [119]  [ 200/1251]  eta: 0:11:17  lr: 0.002885  min_lr: 0.002885  loss: 3.2118 (3.1481)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8220 (0.8785)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [119]  [ 400/1251]  eta: 0:09:02  lr: 0.002882  min_lr: 0.002882  loss: 3.1269 (3.1535)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8668 (0.8287)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [119]  [ 600/1251]  eta: 0:06:53  lr: 0.002879  min_lr: 0.002879  loss: 3.1404 (3.1764)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7684 (0.8278)  time: 0.6290  data: 0.0005  max mem: 54228
Epoch: [119]  [ 800/1251]  eta: 0:04:45  lr: 0.002875  min_lr: 0.002875  loss: 3.3209 (3.1729)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6900 (0.8221)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [119]  [1000/1251]  eta: 0:02:38  lr: 0.002872  min_lr: 0.002872  loss: 3.2724 (3.1686)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6689 (0.8332)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [119]  [1200/1251]  eta: 0:00:32  lr: 0.002869  min_lr: 0.002869  loss: 3.4778 (3.1782)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7267 (0.8356)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [119]  [1250/1251]  eta: 0:00:00  lr: 0.002868  min_lr: 0.002868  loss: 3.2923 (3.1812)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8083 (0.8420)  time: 0.5333  data: 0.0007  max mem: 54228
Epoch: [119] Total time: 0:13:09 (0.6309 s / it)
Averaged stats: lr: 0.002868  min_lr: 0.002868  loss: 3.2923 (3.1663)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8083 (0.8420)
Test:  [ 0/25]  eta: 0:02:43  loss: 0.6648 (0.6648)  acc1: 88.0000 (88.0000)  acc5: 98.4000 (98.4000)  time: 6.5318  data: 6.2047  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7951 (0.8135)  acc1: 84.0000 (83.0182)  acc5: 98.0000 (97.3455)  time: 0.8664  data: 0.5644  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.0479 (0.9499)  acc1: 78.0000 (79.8667)  acc5: 94.4000 (95.7905)  time: 0.2996  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0125 (0.9595)  acc1: 77.2000 (79.5520)  acc5: 94.4000 (95.6960)  time: 0.2995  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5534 s / it)
* Acc@1 80.276 Acc@5 95.746 loss 0.952
Accuracy of the model on the 50000 test images: 80.3%
Max accuracy: 80.51%
Epoch: [120]  [   0/1251]  eta: 1:26:22  lr: 0.002868  min_lr: 0.002868  loss: 3.0333 (3.0333)  weight_decay: 0.0500 (0.0500)  time: 4.1424  data: 2.1536  max mem: 54228
Epoch: [120]  [ 200/1251]  eta: 0:11:20  lr: 0.002865  min_lr: 0.002865  loss: 3.3324 (3.1739)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9701 (0.9044)  time: 0.6354  data: 0.0005  max mem: 54228
Epoch: [120]  [ 400/1251]  eta: 0:09:03  lr: 0.002862  min_lr: 0.002862  loss: 3.2721 (3.1791)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0041 (0.8655)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [120]  [ 600/1251]  eta: 0:06:53  lr: 0.002858  min_lr: 0.002858  loss: 3.0978 (3.1703)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8770 (0.8679)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [120]  [ 800/1251]  eta: 0:04:45  lr: 0.002855  min_lr: 0.002855  loss: 3.2363 (3.1681)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9957 (0.8778)  time: 0.6276  data: 0.0005  max mem: 54228
Epoch: [120]  [1000/1251]  eta: 0:02:38  lr: 0.002852  min_lr: 0.002852  loss: 3.3057 (3.1516)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8241 (0.8739)  time: 0.6277  data: 0.0005  max mem: 54228
Epoch: [120]  [1200/1251]  eta: 0:00:32  lr: 0.002849  min_lr: 0.002849  loss: 3.3244 (3.1508)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7724 (0.8835)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [120]  [1250/1251]  eta: 0:00:00  lr: 0.002848  min_lr: 0.002848  loss: 3.2631 (3.1541)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7557 (0.8791)  time: 0.5382  data: 0.0005  max mem: 54228
Epoch: [120] Total time: 0:13:09 (0.6308 s / it)
Averaged stats: lr: 0.002848  min_lr: 0.002848  loss: 3.2631 (3.1621)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7557 (0.8791)
Test:  [ 0/25]  eta: 0:02:50  loss: 0.7408 (0.7408)  acc1: 88.4000 (88.4000)  acc5: 98.4000 (98.4000)  time: 6.8274  data: 6.4998  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.8423 (0.8512)  acc1: 85.2000 (83.7455)  acc5: 97.6000 (97.2727)  time: 0.8930  data: 0.5912  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 1.0197 (0.9802)  acc1: 78.4000 (80.5714)  acc5: 95.6000 (95.6762)  time: 0.2995  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0197 (0.9892)  acc1: 78.4000 (80.2560)  acc5: 94.4000 (95.5360)  time: 0.2994  data: 0.0001  max mem: 54228
Test: Total time: 0:00:14 (0.5645 s / it)
* Acc@1 80.656 Acc@5 95.696 loss 0.975
Accuracy of the model on the 50000 test images: 80.7%
Max accuracy: 80.66%
Epoch: [121]  [   0/1251]  eta: 1:20:42  lr: 0.002848  min_lr: 0.002848  loss: 3.1989 (3.1989)  weight_decay: 0.0500 (0.0500)  time: 3.8712  data: 3.2317  max mem: 54228
Epoch: [121]  [ 200/1251]  eta: 0:11:19  lr: 0.002845  min_lr: 0.002845  loss: 3.3764 (3.2132)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6837 (0.8406)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [121]  [ 400/1251]  eta: 0:09:02  lr: 0.002841  min_lr: 0.002841  loss: 3.1875 (3.1713)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6769 (0.8761)  time: 0.6277  data: 0.0005  max mem: 54228
Epoch: [121]  [ 600/1251]  eta: 0:06:53  lr: 0.002838  min_lr: 0.002838  loss: 3.2107 (3.1598)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6133 (0.8487)  time: 0.6381  data: 0.0005  max mem: 54228
Epoch: [121]  [ 800/1251]  eta: 0:04:45  lr: 0.002835  min_lr: 0.002835  loss: 3.2528 (3.1725)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9281 (0.8691)  time: 0.6297  data: 0.0005  max mem: 54228
Epoch: [121]  [1000/1251]  eta: 0:02:38  lr: 0.002831  min_lr: 0.002831  loss: 3.3098 (3.1614)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8739 (0.8620)  time: 0.6289  data: 0.0005  max mem: 54228
Epoch: [121]  [1200/1251]  eta: 0:00:32  lr: 0.002828  min_lr: 0.002828  loss: 3.0422 (3.1553)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5814 (0.8436)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [121]  [1250/1251]  eta: 0:00:00  lr: 0.002827  min_lr: 0.002827  loss: 3.2576 (3.1581)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6809 (0.8429)  time: 0.5337  data: 0.0006  max mem: 54228
Epoch: [121] Total time: 0:13:09 (0.6314 s / it)
Averaged stats: lr: 0.002827  min_lr: 0.002827  loss: 3.2576 (3.1653)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6809 (0.8429)
Test:  [ 0/25]  eta: 0:02:47  loss: 0.6535 (0.6535)  acc1: 88.0000 (88.0000)  acc5: 99.2000 (99.2000)  time: 6.7027  data: 6.3700  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.8269 (0.8243)  acc1: 83.2000 (83.9273)  acc5: 97.6000 (97.3455)  time: 0.8818  data: 0.5794  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 1.0324 (0.9867)  acc1: 77.6000 (80.0762)  acc5: 95.6000 (95.7524)  time: 0.2995  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0463 (0.9955)  acc1: 77.6000 (79.9200)  acc5: 94.4000 (95.6960)  time: 0.2995  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5591 s / it)
* Acc@1 80.424 Acc@5 95.652 loss 0.986
Accuracy of the model on the 50000 test images: 80.4%
Max accuracy: 80.66%
Epoch: [122]  [   0/1251]  eta: 1:24:40  lr: 0.002827  min_lr: 0.002827  loss: 3.1100 (3.1100)  weight_decay: 0.0500 (0.0500)  time: 4.0610  data: 1.8019  max mem: 54228
Epoch: [122]  [ 200/1251]  eta: 0:11:19  lr: 0.002824  min_lr: 0.002824  loss: 2.9059 (3.1512)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7306 (0.7912)  time: 0.6288  data: 0.0004  max mem: 54228
Epoch: [122]  [ 400/1251]  eta: 0:09:03  lr: 0.002821  min_lr: 0.002821  loss: 3.1291 (3.1329)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7194 (0.8298)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [122]  [ 600/1251]  eta: 0:06:53  lr: 0.002818  min_lr: 0.002818  loss: 3.2835 (3.1448)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6662 (0.8297)  time: 0.6285  data: 0.0004  max mem: 54228
Epoch: [122]  [ 800/1251]  eta: 0:04:45  lr: 0.002814  min_lr: 0.002814  loss: 3.3313 (3.1555)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9043 (0.8364)  time: 0.6285  data: 0.0004  max mem: 54228
Epoch: [122]  [1000/1251]  eta: 0:02:38  lr: 0.002811  min_lr: 0.002811  loss: 3.0715 (3.1532)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8165 (0.8380)  time: 0.6289  data: 0.0004  max mem: 54228
Epoch: [122]  [1200/1251]  eta: 0:00:32  lr: 0.002808  min_lr: 0.002808  loss: 3.0467 (3.1538)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1239 (0.8476)  time: 0.6338  data: 0.0004  max mem: 54228
Epoch: [122]  [1250/1251]  eta: 0:00:00  lr: 0.002807  min_lr: 0.002807  loss: 3.3055 (3.1560)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1042 (0.8541)  time: 0.5337  data: 0.0006  max mem: 54228
Epoch: [122] Total time: 0:13:09 (0.6313 s / it)
Averaged stats: lr: 0.002807  min_lr: 0.002807  loss: 3.3055 (3.1421)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1042 (0.8541)
Test:  [ 0/25]  eta: 0:01:51  loss: 0.6908 (0.6908)  acc1: 87.2000 (87.2000)  acc5: 99.6000 (99.6000)  time: 4.4585  data: 4.1309  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8549 (0.8528)  acc1: 85.2000 (83.6727)  acc5: 98.0000 (97.5636)  time: 0.8428  data: 0.5369  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.0570 (0.9917)  acc1: 78.8000 (80.1524)  acc5: 94.8000 (96.0762)  time: 0.3908  data: 0.0888  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0971 (1.0035)  acc1: 78.0000 (79.7600)  acc5: 94.8000 (96.0320)  time: 0.3002  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5439 s / it)
* Acc@1 80.602 Acc@5 95.782 loss 1.003
Accuracy of the model on the 50000 test images: 80.6%
Max accuracy: 80.66%
Epoch: [123]  [   0/1251]  eta: 1:28:44  lr: 0.002807  min_lr: 0.002807  loss: 3.2569 (3.2569)  weight_decay: 0.0500 (0.0500)  time: 4.2559  data: 1.8430  max mem: 54228
Epoch: [123]  [ 200/1251]  eta: 0:11:20  lr: 0.002804  min_lr: 0.002804  loss: 3.4063 (3.1929)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8484 (0.8209)  time: 0.6361  data: 0.0005  max mem: 54228
Epoch: [123]  [ 400/1251]  eta: 0:09:03  lr: 0.002800  min_lr: 0.002800  loss: 2.9502 (3.1567)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8055 (0.8362)  time: 0.6291  data: 0.0004  max mem: 54228
Epoch: [123]  [ 600/1251]  eta: 0:06:53  lr: 0.002797  min_lr: 0.002797  loss: 3.2792 (3.1580)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7859 (0.8240)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [123]  [ 800/1251]  eta: 0:04:46  lr: 0.002794  min_lr: 0.002794  loss: 3.1961 (3.1470)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7446 (0.8481)  time: 0.6352  data: 0.0004  max mem: 54228
Epoch: [123]  [1000/1251]  eta: 0:02:38  lr: 0.002790  min_lr: 0.002790  loss: 3.3434 (3.1439)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7005 (0.8453)  time: 0.6294  data: 0.0005  max mem: 54228
Epoch: [123]  [1200/1251]  eta: 0:00:32  lr: 0.002787  min_lr: 0.002787  loss: 3.2178 (3.1461)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8150 (nan)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [123]  [1250/1251]  eta: 0:00:00  lr: 0.002786  min_lr: 0.002786  loss: 3.2046 (3.1460)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8205 (nan)  time: 0.5336  data: 0.0005  max mem: 54228
Epoch: [123] Total time: 0:13:09 (0.6315 s / it)
Averaged stats: lr: 0.002786  min_lr: 0.002786  loss: 3.2046 (3.1479)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8205 (nan)
Test:  [ 0/25]  eta: 0:02:31  loss: 0.5701 (0.5701)  acc1: 90.4000 (90.4000)  acc5: 98.8000 (98.8000)  time: 6.0791  data: 5.7527  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.7205 (0.7207)  acc1: 83.6000 (84.4364)  acc5: 98.0000 (97.6727)  time: 0.8716  data: 0.5700  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8896 (0.8611)  acc1: 78.4000 (80.7810)  acc5: 96.0000 (96.1905)  time: 0.3251  data: 0.0259  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.8979 (0.8720)  acc1: 78.4000 (80.4640)  acc5: 95.6000 (96.0640)  time: 0.2994  data: 0.0002  max mem: 54228
Test: Total time: 0:00:13 (0.5555 s / it)
* Acc@1 80.786 Acc@5 95.820 loss 0.875
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 80.79%
Epoch: [124]  [   0/1251]  eta: 1:13:53  lr: 0.002786  min_lr: 0.002786  loss: 3.1983 (3.1983)  weight_decay: 0.0500 (0.0500)  time: 3.5440  data: 2.9170  max mem: 54228
Epoch: [124]  [ 200/1251]  eta: 0:11:19  lr: 0.002783  min_lr: 0.002783  loss: 3.3193 (3.1042)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8621 (0.9720)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [124]  [ 400/1251]  eta: 0:09:02  lr: 0.002780  min_lr: 0.002780  loss: 2.8187 (3.1195)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6790 (0.8491)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [124]  [ 600/1251]  eta: 0:06:53  lr: 0.002776  min_lr: 0.002776  loss: 3.1183 (3.1378)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6438 (0.8212)  time: 0.6294  data: 0.0005  max mem: 54228
Epoch: [124]  [ 800/1251]  eta: 0:04:45  lr: 0.002773  min_lr: 0.002773  loss: 3.0593 (3.1335)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8078 (0.8199)  time: 0.6293  data: 0.0005  max mem: 54228
Epoch: [124]  [1000/1251]  eta: 0:02:38  lr: 0.002770  min_lr: 0.002770  loss: 3.3431 (3.1410)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6395 (0.8383)  time: 0.6291  data: 0.0005  max mem: 54228
Epoch: [124]  [1200/1251]  eta: 0:00:32  lr: 0.002766  min_lr: 0.002766  loss: 3.3150 (3.1482)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8748 (0.8517)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [124]  [1250/1251]  eta: 0:00:00  lr: 0.002766  min_lr: 0.002766  loss: 3.3597 (3.1542)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7709 (0.8533)  time: 0.5333  data: 0.0006  max mem: 54228
Epoch: [124] Total time: 0:13:10 (0.6317 s / it)
Averaged stats: lr: 0.002766  min_lr: 0.002766  loss: 3.3597 (3.1451)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7709 (0.8533)
Test:  [ 0/25]  eta: 0:02:39  loss: 0.6825 (0.6825)  acc1: 89.6000 (89.6000)  acc5: 99.2000 (99.2000)  time: 6.3674  data: 6.0464  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8136 (0.8468)  acc1: 84.4000 (83.2000)  acc5: 97.6000 (97.5636)  time: 0.8516  data: 0.5500  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9793 (0.9915)  acc1: 78.8000 (80.2095)  acc5: 95.6000 (95.9810)  time: 0.3000  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.1152 (1.0015)  acc1: 78.8000 (79.9520)  acc5: 95.2000 (95.8240)  time: 0.3000  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5467 s / it)
* Acc@1 80.408 Acc@5 95.666 loss 0.999
Accuracy of the model on the 50000 test images: 80.4%
Max accuracy: 80.79%
Epoch: [125]  [   0/1251]  eta: 1:26:40  lr: 0.002766  min_lr: 0.002766  loss: 2.0285 (2.0285)  weight_decay: 0.0500 (0.0500)  time: 4.1571  data: 3.3531  max mem: 54228
Epoch: [125]  [ 200/1251]  eta: 0:11:18  lr: 0.002762  min_lr: 0.002762  loss: 3.0632 (3.0867)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7832 (0.8577)  time: 0.6278  data: 0.0004  max mem: 54228
Epoch: [125]  [ 400/1251]  eta: 0:09:01  lr: 0.002759  min_lr: 0.002759  loss: 3.2654 (3.1046)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7293 (0.8040)  time: 0.6279  data: 0.0005  max mem: 54228
Epoch: [125]  [ 600/1251]  eta: 0:06:53  lr: 0.002756  min_lr: 0.002756  loss: 3.2850 (3.1198)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8325 (0.8458)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [125]  [ 800/1251]  eta: 0:04:45  lr: 0.002752  min_lr: 0.002752  loss: 3.0023 (3.1190)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7281 (0.8487)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [125]  [1000/1251]  eta: 0:02:38  lr: 0.002749  min_lr: 0.002749  loss: 3.3112 (3.1283)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8692 (0.8542)  time: 0.6328  data: 0.0004  max mem: 54228
Epoch: [125]  [1200/1251]  eta: 0:00:32  lr: 0.002746  min_lr: 0.002746  loss: 3.4275 (3.1374)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6662 (0.8564)  time: 0.6276  data: 0.0004  max mem: 54228
Epoch: [125]  [1250/1251]  eta: 0:00:00  lr: 0.002745  min_lr: 0.002745  loss: 3.3393 (3.1402)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7647 (0.8606)  time: 0.5327  data: 0.0005  max mem: 54228
Epoch: [125] Total time: 0:13:09 (0.6308 s / it)
Averaged stats: lr: 0.002745  min_lr: 0.002745  loss: 3.3393 (3.1511)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7647 (0.8606)
Test:  [ 0/25]  eta: 0:02:45  loss: 0.7078 (0.7078)  acc1: 86.0000 (86.0000)  acc5: 98.4000 (98.4000)  time: 6.6241  data: 6.2891  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.8094 (0.8450)  acc1: 82.8000 (83.8909)  acc5: 97.6000 (97.2364)  time: 0.8745  data: 0.5720  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 1.0170 (0.9936)  acc1: 77.6000 (80.0571)  acc5: 95.2000 (95.4857)  time: 0.2993  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0883 (1.0017)  acc1: 77.6000 (79.5840)  acc5: 94.8000 (95.5040)  time: 0.2991  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5561 s / it)
* Acc@1 80.614 Acc@5 95.776 loss 0.987
Accuracy of the model on the 50000 test images: 80.6%
Max accuracy: 80.79%
Epoch: [126]  [   0/1251]  eta: 1:29:23  lr: 0.002745  min_lr: 0.002745  loss: 3.4460 (3.4460)  weight_decay: 0.0500 (0.0500)  time: 4.2878  data: 1.8686  max mem: 54228
Epoch: [126]  [ 200/1251]  eta: 0:11:20  lr: 0.002742  min_lr: 0.002742  loss: 3.0808 (3.1293)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7935 (0.8198)  time: 0.6361  data: 0.0004  max mem: 54228
Epoch: [126]  [ 400/1251]  eta: 0:09:03  lr: 0.002738  min_lr: 0.002738  loss: 3.2251 (3.1357)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7013 (0.8206)  time: 0.6285  data: 0.0004  max mem: 54228
Epoch: [126]  [ 600/1251]  eta: 0:06:53  lr: 0.002735  min_lr: 0.002735  loss: 3.4084 (3.1315)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7089 (0.8171)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [126]  [ 800/1251]  eta: 0:04:45  lr: 0.002732  min_lr: 0.002732  loss: 3.0746 (3.1360)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8066 (0.8385)  time: 0.6338  data: 0.0004  max mem: 54228
Epoch: [126]  [1000/1251]  eta: 0:02:38  lr: 0.002728  min_lr: 0.002728  loss: 3.2502 (3.1383)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6686 (0.8309)  time: 0.6279  data: 0.0004  max mem: 54228
Epoch: [126]  [1200/1251]  eta: 0:00:32  lr: 0.002725  min_lr: 0.002725  loss: 3.2632 (3.1385)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7445 (0.8585)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [126]  [1250/1251]  eta: 0:00:00  lr: 0.002724  min_lr: 0.002724  loss: 3.1469 (3.1361)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6352 (0.8508)  time: 0.5331  data: 0.0006  max mem: 54228
Epoch: [126] Total time: 0:13:09 (0.6309 s / it)
Averaged stats: lr: 0.002724  min_lr: 0.002724  loss: 3.1469 (3.1421)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6352 (0.8508)
Test:  [ 0/25]  eta: 0:02:50  loss: 0.6239 (0.6239)  acc1: 90.0000 (90.0000)  acc5: 98.4000 (98.4000)  time: 6.8046  data: 6.4733  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.8232 (0.7885)  acc1: 84.4000 (84.3636)  acc5: 97.2000 (97.0545)  time: 0.8913  data: 0.5888  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 0.9373 (0.9233)  acc1: 79.6000 (80.8381)  acc5: 95.6000 (95.9429)  time: 0.3002  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0147 (0.9373)  acc1: 78.4000 (80.3680)  acc5: 96.0000 (95.8880)  time: 0.3003  data: 0.0001  max mem: 54228
Test: Total time: 0:00:14 (0.5652 s / it)
* Acc@1 80.848 Acc@5 95.810 loss 0.932
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 80.85%
Epoch: [127]  [   0/1251]  eta: 1:13:30  lr: 0.002724  min_lr: 0.002724  loss: 2.8933 (2.8933)  weight_decay: 0.0500 (0.0500)  time: 3.5256  data: 2.8903  max mem: 54228
Epoch: [127]  [ 200/1251]  eta: 0:11:18  lr: 0.002721  min_lr: 0.002721  loss: 3.1921 (3.1051)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7559 (0.8074)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [127]  [ 400/1251]  eta: 0:09:02  lr: 0.002717  min_lr: 0.002717  loss: 3.2569 (3.0809)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9178 (0.8268)  time: 0.6286  data: 0.0004  max mem: 54228
Epoch: [127]  [ 600/1251]  eta: 0:06:52  lr: 0.002714  min_lr: 0.002714  loss: 3.0357 (3.1041)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8172 (0.8300)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [127]  [ 800/1251]  eta: 0:04:45  lr: 0.002711  min_lr: 0.002711  loss: 3.4258 (3.1111)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8874 (0.8664)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [127]  [1000/1251]  eta: 0:02:38  lr: 0.002707  min_lr: 0.002707  loss: 3.3548 (3.1224)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8864 (0.8561)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [127]  [1200/1251]  eta: 0:00:32  lr: 0.002704  min_lr: 0.002704  loss: 3.0311 (3.1176)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8302 (0.8482)  time: 0.6370  data: 0.0005  max mem: 54228
Epoch: [127]  [1250/1251]  eta: 0:00:00  lr: 0.002703  min_lr: 0.002703  loss: 3.2214 (3.1187)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0358 (0.8561)  time: 0.5339  data: 0.0005  max mem: 54228
Epoch: [127] Total time: 0:13:09 (0.6313 s / it)
Averaged stats: lr: 0.002703  min_lr: 0.002703  loss: 3.2214 (3.1312)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0358 (0.8561)
Test:  [ 0/25]  eta: 0:02:39  loss: 0.6271 (0.6271)  acc1: 90.8000 (90.8000)  acc5: 98.0000 (98.0000)  time: 6.3985  data: 6.0810  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7945 (0.8115)  acc1: 84.8000 (84.1091)  acc5: 97.6000 (97.4546)  time: 0.8544  data: 0.5531  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9908 (0.9590)  acc1: 78.0000 (80.6667)  acc5: 95.6000 (95.9619)  time: 0.3002  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0074 (0.9667)  acc1: 78.0000 (80.4000)  acc5: 95.6000 (95.8880)  time: 0.3003  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5494 s / it)
* Acc@1 80.750 Acc@5 95.934 loss 0.959
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 80.85%
Epoch: [128]  [   0/1251]  eta: 1:28:18  lr: 0.002703  min_lr: 0.002703  loss: 2.5307 (2.5307)  weight_decay: 0.0500 (0.0500)  time: 4.2353  data: 3.3404  max mem: 54228
Epoch: [128]  [ 200/1251]  eta: 0:11:19  lr: 0.002700  min_lr: 0.002700  loss: 3.2877 (3.1149)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7754 (0.7548)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [128]  [ 400/1251]  eta: 0:09:02  lr: 0.002696  min_lr: 0.002696  loss: 3.4231 (3.1289)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7103 (0.8497)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [128]  [ 600/1251]  eta: 0:06:53  lr: 0.002693  min_lr: 0.002693  loss: 3.1896 (3.1280)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7383 (0.8677)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [128]  [ 800/1251]  eta: 0:04:45  lr: 0.002690  min_lr: 0.002690  loss: 3.0855 (3.1374)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6353 (0.8322)  time: 0.6334  data: 0.0004  max mem: 54228
Epoch: [128]  [1000/1251]  eta: 0:02:38  lr: 0.002686  min_lr: 0.002686  loss: 2.9908 (3.1382)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6974 (0.8180)  time: 0.6359  data: 0.0004  max mem: 54228
Epoch: [128]  [1200/1251]  eta: 0:00:32  lr: 0.002683  min_lr: 0.002683  loss: 3.1883 (3.1398)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0939 (0.8249)  time: 0.6288  data: 0.0004  max mem: 54228
Epoch: [128]  [1250/1251]  eta: 0:00:00  lr: 0.002682  min_lr: 0.002682  loss: 3.2471 (3.1417)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9436 (0.8256)  time: 0.5336  data: 0.0006  max mem: 54228
Epoch: [128] Total time: 0:13:09 (0.6312 s / it)
Averaged stats: lr: 0.002682  min_lr: 0.002682  loss: 3.2471 (3.1366)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9436 (0.8256)
Test:  [ 0/25]  eta: 0:02:44  loss: 0.6537 (0.6537)  acc1: 88.4000 (88.4000)  acc5: 97.6000 (97.6000)  time: 6.5738  data: 6.2561  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.8116 (0.7938)  acc1: 83.2000 (84.3636)  acc5: 97.6000 (97.3455)  time: 0.8700  data: 0.5690  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9667 (0.9400)  acc1: 79.6000 (80.8571)  acc5: 95.6000 (95.8286)  time: 0.2997  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0326 (0.9475)  acc1: 78.8000 (80.7040)  acc5: 95.2000 (95.7760)  time: 0.2995  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5552 s / it)
* Acc@1 80.824 Acc@5 95.838 loss 0.947
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 80.85%
Epoch: [129]  [   0/1251]  eta: 1:27:02  lr: 0.002682  min_lr: 0.002682  loss: 3.5834 (3.5834)  weight_decay: 0.0500 (0.0500)  time: 4.1748  data: 1.9658  max mem: 54228
Epoch: [129]  [ 200/1251]  eta: 0:11:20  lr: 0.002679  min_lr: 0.002679  loss: 2.7956 (3.1143)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9148 (0.8283)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [129]  [ 400/1251]  eta: 0:09:03  lr: 0.002675  min_lr: 0.002675  loss: 3.3342 (3.1305)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7221 (0.8150)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [129]  [ 600/1251]  eta: 0:06:53  lr: 0.002672  min_lr: 0.002672  loss: 3.2810 (3.1263)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8638 (0.8662)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [129]  [ 800/1251]  eta: 0:04:45  lr: 0.002668  min_lr: 0.002668  loss: 2.9638 (3.1248)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8773 (0.8801)  time: 0.6282  data: 0.0006  max mem: 54228
Epoch: [129]  [1000/1251]  eta: 0:02:38  lr: 0.002665  min_lr: 0.002665  loss: 2.9631 (3.1320)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7352 (0.8627)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [129]  [1200/1251]  eta: 0:00:32  lr: 0.002662  min_lr: 0.002662  loss: 3.1447 (3.1331)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8040 (0.8666)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [129]  [1250/1251]  eta: 0:00:00  lr: 0.002661  min_lr: 0.002661  loss: 3.2898 (3.1362)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7851 (0.8733)  time: 0.5338  data: 0.0007  max mem: 54228
Epoch: [129] Total time: 0:13:09 (0.6315 s / it)
Averaged stats: lr: 0.002661  min_lr: 0.002661  loss: 3.2898 (3.1193)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7851 (0.8733)
Test:  [ 0/25]  eta: 0:02:45  loss: 0.6379 (0.6379)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 6.6110  data: 6.2799  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.8001 (0.7879)  acc1: 83.2000 (84.1818)  acc5: 97.6000 (97.4909)  time: 0.8740  data: 0.5712  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 1.0046 (0.9342)  acc1: 79.2000 (81.1619)  acc5: 96.0000 (95.9048)  time: 0.3003  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0077 (0.9416)  acc1: 79.2000 (80.7680)  acc5: 95.6000 (95.8400)  time: 0.3003  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5585 s / it)
* Acc@1 80.880 Acc@5 95.858 loss 0.931
Accuracy of the model on the 50000 test images: 80.9%
Max accuracy: 80.88%
Epoch: [130]  [   0/1251]  eta: 1:13:28  lr: 0.002661  min_lr: 0.002661  loss: 2.3240 (2.3240)  weight_decay: 0.0500 (0.0500)  time: 3.5239  data: 2.8914  max mem: 54228
Epoch: [130]  [ 200/1251]  eta: 0:11:17  lr: 0.002657  min_lr: 0.002657  loss: 3.3196 (3.0974)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7298 (0.8019)  time: 0.6359  data: 0.0004  max mem: 54228
Epoch: [130]  [ 400/1251]  eta: 0:09:01  lr: 0.002654  min_lr: 0.002654  loss: 3.0411 (3.1065)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7819 (0.8331)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [130]  [ 600/1251]  eta: 0:06:52  lr: 0.002651  min_lr: 0.002651  loss: 3.2273 (3.1169)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7179 (0.8832)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [130]  [ 800/1251]  eta: 0:04:45  lr: 0.002647  min_lr: 0.002647  loss: 3.2217 (3.1022)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6997 (0.8526)  time: 0.6289  data: 0.0005  max mem: 54228
Epoch: [130]  [1000/1251]  eta: 0:02:38  lr: 0.002644  min_lr: 0.002644  loss: 2.9801 (3.1022)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9091 (0.8737)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [130]  [1200/1251]  eta: 0:00:32  lr: 0.002640  min_lr: 0.002640  loss: 3.1850 (3.1076)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.6278  data: 0.0005  max mem: 54228
Epoch: [130]  [1250/1251]  eta: 0:00:00  lr: 0.002640  min_lr: 0.002640  loss: 3.2496 (3.1117)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.5333  data: 0.0007  max mem: 54228
Epoch: [130] Total time: 0:13:09 (0.6308 s / it)
Averaged stats: lr: 0.002640  min_lr: 0.002640  loss: 3.2496 (3.1193)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)
Test:  [ 0/25]  eta: 0:02:36  loss: 0.6376 (0.6376)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 6.2433  data: 5.9265  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8425 (0.8183)  acc1: 84.8000 (84.8000)  acc5: 98.0000 (97.5636)  time: 0.8402  data: 0.5392  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.0394 (0.9627)  acc1: 78.8000 (81.2000)  acc5: 95.2000 (96.1143)  time: 0.2997  data: 0.0003  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0844 (0.9730)  acc1: 78.8000 (81.0080)  acc5: 94.8000 (96.0160)  time: 0.2996  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5421 s / it)
* Acc@1 80.916 Acc@5 95.770 loss 0.971
Accuracy of the model on the 50000 test images: 80.9%
Max accuracy: 80.92%
Epoch: [131]  [   0/1251]  eta: 1:00:10  lr: 0.002640  min_lr: 0.002640  loss: 3.2917 (3.2917)  weight_decay: 0.0500 (0.0500)  time: 2.8858  data: 2.2570  max mem: 54228
Epoch: [131]  [ 200/1251]  eta: 0:11:12  lr: 0.002636  min_lr: 0.002636  loss: 3.2458 (3.1046)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8643 (0.8996)  time: 0.6286  data: 0.0004  max mem: 54228
Epoch: [131]  [ 400/1251]  eta: 0:08:59  lr: 0.002633  min_lr: 0.002633  loss: 3.2151 (3.1354)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [131]  [ 600/1251]  eta: 0:06:52  lr: 0.002629  min_lr: 0.002629  loss: 3.1511 (3.1173)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6879 (nan)  time: 0.6289  data: 0.0004  max mem: 54228
Epoch: [131]  [ 800/1251]  eta: 0:04:45  lr: 0.002626  min_lr: 0.002626  loss: 3.1766 (3.1319)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9334 (nan)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [131]  [1000/1251]  eta: 0:02:38  lr: 0.002623  min_lr: 0.002623  loss: 3.2160 (3.1338)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7033 (nan)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [131]  [1200/1251]  eta: 0:00:32  lr: 0.002619  min_lr: 0.002619  loss: 3.0627 (3.1304)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7466 (nan)  time: 0.6277  data: 0.0004  max mem: 54228
Epoch: [131]  [1250/1251]  eta: 0:00:00  lr: 0.002618  min_lr: 0.002618  loss: 3.2263 (3.1295)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5777 (nan)  time: 0.5330  data: 0.0006  max mem: 54228
Epoch: [131] Total time: 0:13:08 (0.6301 s / it)
Averaged stats: lr: 0.002618  min_lr: 0.002618  loss: 3.2263 (3.1184)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5777 (nan)
Test:  [ 0/25]  eta: 0:02:38  loss: 0.6447 (0.6447)  acc1: 89.2000 (89.2000)  acc5: 98.4000 (98.4000)  time: 6.3437  data: 6.0087  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8077 (0.7943)  acc1: 84.4000 (84.7636)  acc5: 98.0000 (97.6364)  time: 0.8491  data: 0.5466  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9221 (0.9401)  acc1: 79.2000 (81.3143)  acc5: 96.0000 (96.0191)  time: 0.2997  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0262 (0.9506)  acc1: 78.4000 (80.9440)  acc5: 95.6000 (95.9360)  time: 0.2997  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5453 s / it)
* Acc@1 80.824 Acc@5 95.826 loss 0.954
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 80.92%
Epoch: [132]  [   0/1251]  eta: 1:32:58  lr: 0.002618  min_lr: 0.002618  loss: 3.1864 (3.1864)  weight_decay: 0.0500 (0.0500)  time: 4.4596  data: 3.3273  max mem: 54228
Epoch: [132]  [ 200/1251]  eta: 0:11:20  lr: 0.002615  min_lr: 0.002615  loss: 3.0482 (3.1087)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6741 (0.7966)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [132]  [ 400/1251]  eta: 0:09:03  lr: 0.002612  min_lr: 0.002612  loss: 3.3251 (3.0801)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7153 (0.7778)  time: 0.6279  data: 0.0005  max mem: 54228
Epoch: [132]  [ 600/1251]  eta: 0:06:53  lr: 0.002608  min_lr: 0.002608  loss: 3.3470 (3.0971)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9286 (0.8404)  time: 0.6278  data: 0.0004  max mem: 54228
Epoch: [132]  [ 800/1251]  eta: 0:04:45  lr: 0.002605  min_lr: 0.002605  loss: 3.3621 (3.1143)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6593 (0.8407)  time: 0.6278  data: 0.0005  max mem: 54228
Epoch: [132]  [1000/1251]  eta: 0:02:38  lr: 0.002601  min_lr: 0.002601  loss: 3.3686 (3.1143)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1399 (0.8581)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [132]  [1200/1251]  eta: 0:00:32  lr: 0.002598  min_lr: 0.002598  loss: 3.1886 (3.1171)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7065 (0.8773)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [132]  [1250/1251]  eta: 0:00:00  lr: 0.002597  min_lr: 0.002597  loss: 3.2504 (3.1169)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8423 (0.8764)  time: 0.5334  data: 0.0007  max mem: 54228
Epoch: [132] Total time: 0:13:09 (0.6311 s / it)
Averaged stats: lr: 0.002597  min_lr: 0.002597  loss: 3.2504 (3.1168)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8423 (0.8764)
Test:  [ 0/25]  eta: 0:02:30  loss: 0.6835 (0.6835)  acc1: 88.0000 (88.0000)  acc5: 99.2000 (99.2000)  time: 6.0002  data: 5.6789  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8593 (0.8669)  acc1: 82.4000 (83.8909)  acc5: 98.0000 (97.5273)  time: 0.8181  data: 0.5165  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.0508 (1.0013)  acc1: 78.8000 (80.7810)  acc5: 95.2000 (96.0000)  time: 0.3002  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0653 (1.0128)  acc1: 78.8000 (80.4320)  acc5: 95.2000 (95.9840)  time: 0.3004  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5327 s / it)
* Acc@1 80.916 Acc@5 95.856 loss 1.007
Accuracy of the model on the 50000 test images: 80.9%
Max accuracy: 80.92%
Epoch: [133]  [   0/1251]  eta: 1:14:35  lr: 0.002597  min_lr: 0.002597  loss: 2.6309 (2.6309)  weight_decay: 0.0500 (0.0500)  time: 3.5774  data: 2.9313  max mem: 54228
Epoch: [133]  [ 200/1251]  eta: 0:11:18  lr: 0.002594  min_lr: 0.002594  loss: 3.2035 (3.0605)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6600 (0.7655)  time: 0.6469  data: 0.0005  max mem: 54228
Epoch: [133]  [ 400/1251]  eta: 0:09:02  lr: 0.002590  min_lr: 0.002590  loss: 3.3425 (3.0664)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8472 (0.8105)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [133]  [ 600/1251]  eta: 0:06:53  lr: 0.002587  min_lr: 0.002587  loss: 3.0729 (3.0804)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7262 (0.8178)  time: 0.6289  data: 0.0005  max mem: 54228
Epoch: [133]  [ 800/1251]  eta: 0:04:45  lr: 0.002583  min_lr: 0.002583  loss: 3.1738 (3.0827)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9023 (0.8431)  time: 0.6369  data: 0.0004  max mem: 54228
Epoch: [133]  [1000/1251]  eta: 0:02:38  lr: 0.002580  min_lr: 0.002580  loss: 3.0390 (3.0965)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8362 (0.8494)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [133]  [1200/1251]  eta: 0:00:32  lr: 0.002576  min_lr: 0.002576  loss: 3.1132 (3.0992)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8308 (0.8887)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [133]  [1250/1251]  eta: 0:00:00  lr: 0.002576  min_lr: 0.002576  loss: 3.3821 (3.1039)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9480 (0.8917)  time: 0.5332  data: 0.0006  max mem: 54228
Epoch: [133] Total time: 0:13:09 (0.6309 s / it)
Averaged stats: lr: 0.002576  min_lr: 0.002576  loss: 3.3821 (3.1099)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9480 (0.8917)
Test:  [ 0/25]  eta: 0:02:45  loss: 0.7174 (0.7174)  acc1: 87.2000 (87.2000)  acc5: 99.2000 (99.2000)  time: 6.6324  data: 6.3170  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.8541 (0.8709)  acc1: 82.8000 (83.8182)  acc5: 97.6000 (97.5273)  time: 0.8758  data: 0.5746  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 1.0831 (1.0151)  acc1: 79.2000 (80.7048)  acc5: 94.8000 (95.7714)  time: 0.2998  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0646 (1.0240)  acc1: 80.0000 (80.5120)  acc5: 94.8000 (95.7280)  time: 0.2996  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5573 s / it)
* Acc@1 80.802 Acc@5 95.782 loss 1.016
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 80.92%
Epoch: [134]  [   0/1251]  eta: 1:25:53  lr: 0.002576  min_lr: 0.002576  loss: 2.3601 (2.3601)  weight_decay: 0.0500 (0.0500)  time: 4.1198  data: 2.9596  max mem: 54228
Epoch: [134]  [ 200/1251]  eta: 0:11:21  lr: 0.002572  min_lr: 0.002572  loss: 3.2184 (3.0680)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7647 (0.7712)  time: 0.6286  data: 0.0004  max mem: 54228
Epoch: [134]  [ 400/1251]  eta: 0:09:03  lr: 0.002569  min_lr: 0.002569  loss: 3.1741 (3.0970)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6364 (0.7521)  time: 0.6278  data: 0.0004  max mem: 54228
Epoch: [134]  [ 600/1251]  eta: 0:06:53  lr: 0.002565  min_lr: 0.002565  loss: 3.0184 (3.0980)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6416 (0.7471)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [134]  [ 800/1251]  eta: 0:04:45  lr: 0.002562  min_lr: 0.002562  loss: 3.0803 (3.1083)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6255 (0.7500)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [134]  [1000/1251]  eta: 0:02:38  lr: 0.002558  min_lr: 0.002558  loss: 3.0621 (3.1052)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7532 (0.7788)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [134]  [1200/1251]  eta: 0:00:32  lr: 0.002555  min_lr: 0.002555  loss: 3.1219 (3.1122)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6199 (0.7768)  time: 0.6287  data: 0.0004  max mem: 54228
Epoch: [134]  [1250/1251]  eta: 0:00:00  lr: 0.002554  min_lr: 0.002554  loss: 3.1633 (3.1115)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6573 (0.7754)  time: 0.5336  data: 0.0006  max mem: 54228
Epoch: [134] Total time: 0:13:09 (0.6311 s / it)
Averaged stats: lr: 0.002554  min_lr: 0.002554  loss: 3.1633 (3.1105)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6573 (0.7754)
Test:  [ 0/25]  eta: 0:02:39  loss: 0.6296 (0.6296)  acc1: 87.6000 (87.6000)  acc5: 98.0000 (98.0000)  time: 6.3609  data: 6.0341  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7572 (0.7592)  acc1: 83.2000 (83.4909)  acc5: 98.0000 (97.5273)  time: 0.8507  data: 0.5489  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9497 (0.9057)  acc1: 79.2000 (80.4191)  acc5: 95.6000 (96.0191)  time: 0.2997  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0024 (0.9142)  acc1: 79.2000 (80.3680)  acc5: 95.6000 (95.9680)  time: 0.2996  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5470 s / it)
* Acc@1 81.086 Acc@5 95.962 loss 0.900
Accuracy of the model on the 50000 test images: 81.1%
Max accuracy: 81.09%
Epoch: [135]  [   0/1251]  eta: 1:07:37  lr: 0.002554  min_lr: 0.002554  loss: 3.5162 (3.5162)  weight_decay: 0.0500 (0.0500)  time: 3.2431  data: 2.6142  max mem: 54228
Epoch: [135]  [ 200/1251]  eta: 0:11:14  lr: 0.002551  min_lr: 0.002551  loss: 2.9755 (3.1200)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7578 (0.9014)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [135]  [ 400/1251]  eta: 0:09:01  lr: 0.002547  min_lr: 0.002547  loss: 3.0281 (3.1137)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7769 (0.8870)  time: 0.6365  data: 0.0004  max mem: 54228
Epoch: [135]  [ 600/1251]  eta: 0:06:52  lr: 0.002544  min_lr: 0.002544  loss: 3.2592 (3.1192)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8535 (0.9065)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [135]  [ 800/1251]  eta: 0:04:45  lr: 0.002540  min_lr: 0.002540  loss: 3.3085 (3.1113)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5452 (0.8783)  time: 0.6279  data: 0.0005  max mem: 54228
Epoch: [135]  [1000/1251]  eta: 0:02:38  lr: 0.002537  min_lr: 0.002537  loss: 3.2397 (3.1159)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8195 (0.8697)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [135]  [1200/1251]  eta: 0:00:32  lr: 0.002533  min_lr: 0.002533  loss: 2.8553 (3.1083)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9849 (0.8790)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [135]  [1250/1251]  eta: 0:00:00  lr: 0.002533  min_lr: 0.002533  loss: 3.2862 (3.1088)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0954 (0.8872)  time: 0.5332  data: 0.0005  max mem: 54228
Epoch: [135] Total time: 0:13:08 (0.6305 s / it)
Averaged stats: lr: 0.002533  min_lr: 0.002533  loss: 3.2862 (3.1065)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0954 (0.8872)
Test:  [ 0/25]  eta: 0:02:39  loss: 0.6727 (0.6727)  acc1: 87.6000 (87.6000)  acc5: 97.6000 (97.6000)  time: 6.3661  data: 6.0291  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7499 (0.7829)  acc1: 84.8000 (84.3636)  acc5: 97.6000 (97.5636)  time: 0.8511  data: 0.5484  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9843 (0.9338)  acc1: 78.0000 (80.9524)  acc5: 96.0000 (95.9810)  time: 0.2995  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0532 (0.9441)  acc1: 77.6000 (80.7520)  acc5: 95.2000 (95.8080)  time: 0.2995  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5475 s / it)
* Acc@1 81.048 Acc@5 95.940 loss 0.947
Accuracy of the model on the 50000 test images: 81.0%
Max accuracy: 81.09%
Epoch: [136]  [   0/1251]  eta: 1:27:11  lr: 0.002532  min_lr: 0.002532  loss: 3.2163 (3.2163)  weight_decay: 0.0500 (0.0500)  time: 4.1818  data: 1.9108  max mem: 54228
Epoch: [136]  [ 200/1251]  eta: 0:11:19  lr: 0.002529  min_lr: 0.002529  loss: 3.2605 (3.0916)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7475 (0.7621)  time: 0.6286  data: 0.0004  max mem: 54228
Epoch: [136]  [ 400/1251]  eta: 0:09:03  lr: 0.002526  min_lr: 0.002526  loss: 3.0671 (3.0997)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6402 (0.8186)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [136]  [ 600/1251]  eta: 0:06:53  lr: 0.002522  min_lr: 0.002522  loss: 3.3530 (3.1150)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8952 (0.8268)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [136]  [ 800/1251]  eta: 0:04:45  lr: 0.002519  min_lr: 0.002519  loss: 3.3729 (3.1093)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8633 (0.8648)  time: 0.6332  data: 0.0004  max mem: 54228
Epoch: [136]  [1000/1251]  eta: 0:02:38  lr: 0.002515  min_lr: 0.002515  loss: 3.1128 (3.1138)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6265 (0.8348)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [136]  [1200/1251]  eta: 0:00:32  lr: 0.002512  min_lr: 0.002512  loss: 3.2159 (3.1117)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9414 (0.8314)  time: 0.6289  data: 0.0004  max mem: 54228
Epoch: [136]  [1250/1251]  eta: 0:00:00  lr: 0.002511  min_lr: 0.002511  loss: 3.2784 (3.1107)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9993 (0.8384)  time: 0.5335  data: 0.0005  max mem: 54228
Epoch: [136] Total time: 0:13:09 (0.6312 s / it)
Averaged stats: lr: 0.002511  min_lr: 0.002511  loss: 3.2784 (3.1098)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9993 (0.8384)
Test:  [ 0/25]  eta: 0:02:54  loss: 0.6173 (0.6173)  acc1: 87.2000 (87.2000)  acc5: 98.8000 (98.8000)  time: 6.9776  data: 6.6486  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.7661 (0.7885)  acc1: 85.2000 (83.8182)  acc5: 97.6000 (97.5636)  time: 0.9068  data: 0.6047  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 0.9525 (0.9296)  acc1: 78.8000 (80.5524)  acc5: 95.6000 (96.2476)  time: 0.2996  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0311 (0.9402)  acc1: 78.8000 (80.3840)  acc5: 95.2000 (96.0640)  time: 0.2995  data: 0.0001  max mem: 54228
Test: Total time: 0:00:14 (0.5712 s / it)
* Acc@1 81.256 Acc@5 95.998 loss 0.923
Accuracy of the model on the 50000 test images: 81.3%
Max accuracy: 81.26%
Epoch: [137]  [   0/1251]  eta: 1:17:57  lr: 0.002511  min_lr: 0.002511  loss: 3.4256 (3.4256)  weight_decay: 0.0500 (0.0500)  time: 3.7392  data: 3.1066  max mem: 54228
Epoch: [137]  [ 200/1251]  eta: 0:11:19  lr: 0.002507  min_lr: 0.002507  loss: 3.2857 (3.0971)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7425 (0.8901)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [137]  [ 400/1251]  eta: 0:09:02  lr: 0.002504  min_lr: 0.002504  loss: 3.3249 (3.1173)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6649 (0.8651)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [137]  [ 600/1251]  eta: 0:06:53  lr: 0.002500  min_lr: 0.002500  loss: 3.4357 (3.1101)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9078 (0.8528)  time: 0.6416  data: 0.0004  max mem: 54228
Epoch: [137]  [ 800/1251]  eta: 0:04:45  lr: 0.002497  min_lr: 0.002497  loss: 3.2976 (3.1022)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7745 (0.8346)  time: 0.6288  data: 0.0004  max mem: 54228
Epoch: [137]  [1000/1251]  eta: 0:02:38  lr: 0.002493  min_lr: 0.002493  loss: 3.0286 (3.1016)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7950 (0.8546)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [137]  [1200/1251]  eta: 0:00:32  lr: 0.002490  min_lr: 0.002490  loss: 2.9022 (3.0925)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6801 (0.8206)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [137]  [1250/1251]  eta: 0:00:00  lr: 0.002489  min_lr: 0.002489  loss: 3.3522 (3.0979)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6743 (0.8144)  time: 0.5336  data: 0.0007  max mem: 54228
Epoch: [137] Total time: 0:13:09 (0.6314 s / it)
Averaged stats: lr: 0.002489  min_lr: 0.002489  loss: 3.3522 (3.1022)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6743 (0.8144)
Test:  [ 0/25]  eta: 0:02:38  loss: 0.6842 (0.6842)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 6.3254  data: 5.9904  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8368 (0.8505)  acc1: 84.0000 (84.7273)  acc5: 98.0000 (97.7455)  time: 0.8475  data: 0.5449  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.0260 (0.9837)  acc1: 80.8000 (80.9714)  acc5: 96.4000 (96.1714)  time: 0.2995  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0260 (0.9944)  acc1: 79.6000 (80.7040)  acc5: 96.0000 (96.0800)  time: 0.2994  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5472 s / it)
* Acc@1 81.200 Acc@5 95.924 loss 0.985
Accuracy of the model on the 50000 test images: 81.2%
Max accuracy: 81.26%
Epoch: [138]  [   0/1251]  eta: 1:26:56  lr: 0.002489  min_lr: 0.002489  loss: 3.2738 (3.2738)  weight_decay: 0.0500 (0.0500)  time: 4.1696  data: 3.1627  max mem: 54228
Epoch: [138]  [ 200/1251]  eta: 0:11:18  lr: 0.002486  min_lr: 0.002486  loss: 3.4087 (3.0762)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7784 (0.9790)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [138]  [ 400/1251]  eta: 0:09:02  lr: 0.002482  min_lr: 0.002482  loss: 2.7419 (3.0837)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6770 (0.8959)  time: 0.6288  data: 0.0004  max mem: 54228
Epoch: [138]  [ 600/1251]  eta: 0:06:53  lr: 0.002479  min_lr: 0.002479  loss: 3.2912 (3.0824)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6774 (0.8902)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [138]  [ 800/1251]  eta: 0:04:45  lr: 0.002475  min_lr: 0.002475  loss: 2.7654 (3.0903)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7103 (0.8966)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [138]  [1000/1251]  eta: 0:02:38  lr: 0.002472  min_lr: 0.002472  loss: 3.2422 (3.0950)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7483 (0.8685)  time: 0.6360  data: 0.0004  max mem: 54228
Epoch: [138]  [1200/1251]  eta: 0:00:32  lr: 0.002468  min_lr: 0.002468  loss: 3.3347 (3.0980)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7605 (0.8749)  time: 0.6296  data: 0.0004  max mem: 54228
Epoch: [138]  [1250/1251]  eta: 0:00:00  lr: 0.002467  min_lr: 0.002467  loss: 3.1723 (3.0954)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7291 (0.8733)  time: 0.5337  data: 0.0006  max mem: 54228
Epoch: [138] Total time: 0:13:09 (0.6314 s / it)
Averaged stats: lr: 0.002467  min_lr: 0.002467  loss: 3.1723 (3.0932)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7291 (0.8733)
Test:  [ 0/25]  eta: 0:02:34  loss: 0.6730 (0.6730)  acc1: 88.0000 (88.0000)  acc5: 98.0000 (98.0000)  time: 6.1954  data: 5.8508  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7841 (0.7903)  acc1: 84.4000 (84.4364)  acc5: 98.0000 (97.6364)  time: 0.8362  data: 0.5322  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9497 (0.9320)  acc1: 80.4000 (81.2191)  acc5: 95.6000 (96.0952)  time: 0.3004  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0381 (0.9450)  acc1: 79.2000 (80.8960)  acc5: 95.6000 (95.9520)  time: 0.3006  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5429 s / it)
* Acc@1 81.120 Acc@5 95.970 loss 0.937
Accuracy of the model on the 50000 test images: 81.1%
Max accuracy: 81.26%
Epoch: [139]  [   0/1251]  eta: 1:29:39  lr: 0.002467  min_lr: 0.002467  loss: 2.9060 (2.9060)  weight_decay: 0.0500 (0.0500)  time: 4.3005  data: 1.7783  max mem: 54228
Epoch: [139]  [ 200/1251]  eta: 0:11:20  lr: 0.002464  min_lr: 0.002464  loss: 2.8133 (3.1106)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7325 (0.8398)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [139]  [ 400/1251]  eta: 0:09:03  lr: 0.002460  min_lr: 0.002460  loss: 3.0558 (3.1013)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6474 (0.7760)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [139]  [ 600/1251]  eta: 0:06:53  lr: 0.002457  min_lr: 0.002457  loss: 3.2444 (3.1046)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8474 (0.8538)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [139]  [ 800/1251]  eta: 0:04:45  lr: 0.002453  min_lr: 0.002453  loss: 3.1189 (3.0985)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7045 (0.8470)  time: 0.6277  data: 0.0005  max mem: 54228
Epoch: [139]  [1000/1251]  eta: 0:02:38  lr: 0.002450  min_lr: 0.002450  loss: 3.2575 (3.1076)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8540 (0.8637)  time: 0.6278  data: 0.0005  max mem: 54228
Epoch: [139]  [1200/1251]  eta: 0:00:32  lr: 0.002446  min_lr: 0.002446  loss: 3.1266 (3.1071)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7780 (0.8672)  time: 0.6277  data: 0.0005  max mem: 54228
Epoch: [139]  [1250/1251]  eta: 0:00:00  lr: 0.002446  min_lr: 0.002446  loss: 3.0385 (3.1035)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6725 (0.8592)  time: 0.5373  data: 0.0007  max mem: 54228
Epoch: [139] Total time: 0:13:09 (0.6309 s / it)
Averaged stats: lr: 0.002446  min_lr: 0.002446  loss: 3.0385 (3.0924)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6725 (0.8592)
Test:  [ 0/25]  eta: 0:02:49  loss: 0.5764 (0.5764)  acc1: 88.4000 (88.4000)  acc5: 99.6000 (99.6000)  time: 6.7634  data: 6.4248  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.7374 (0.7460)  acc1: 84.8000 (84.0727)  acc5: 97.6000 (97.6364)  time: 0.8869  data: 0.5843  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 0.8872 (0.8833)  acc1: 80.4000 (81.2571)  acc5: 96.0000 (96.1524)  time: 0.2991  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9579 (0.8932)  acc1: 79.6000 (80.9920)  acc5: 95.6000 (96.1600)  time: 0.2990  data: 0.0001  max mem: 54228
Test: Total time: 0:00:14 (0.5625 s / it)
* Acc@1 81.166 Acc@5 96.014 loss 0.889
Accuracy of the model on the 50000 test images: 81.2%
Max accuracy: 81.26%
Epoch: [140]  [   0/1251]  eta: 1:25:43  lr: 0.002445  min_lr: 0.002445  loss: 3.6011 (3.6011)  weight_decay: 0.0500 (0.0500)  time: 4.1117  data: 2.8213  max mem: 54228
Epoch: [140]  [ 200/1251]  eta: 0:11:21  lr: 0.002442  min_lr: 0.002442  loss: 3.2268 (3.0585)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7246 (0.8481)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [140]  [ 400/1251]  eta: 0:09:03  lr: 0.002438  min_lr: 0.002438  loss: 3.2034 (3.0589)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6818 (0.8091)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [140]  [ 600/1251]  eta: 0:06:53  lr: 0.002435  min_lr: 0.002435  loss: 3.1094 (3.0523)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9675 (0.8681)  time: 0.6288  data: 0.0005  max mem: 54228
Epoch: [140]  [ 800/1251]  eta: 0:04:45  lr: 0.002431  min_lr: 0.002431  loss: 3.1377 (3.0647)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6912 (nan)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [140]  [1000/1251]  eta: 0:02:38  lr: 0.002428  min_lr: 0.002428  loss: 3.2472 (3.0740)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6875 (nan)  time: 0.6328  data: 0.0005  max mem: 54228
Epoch: [140]  [1200/1251]  eta: 0:00:32  lr: 0.002424  min_lr: 0.002424  loss: 3.3093 (3.0809)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8191 (nan)  time: 0.6358  data: 0.0005  max mem: 54228
Epoch: [140]  [1250/1251]  eta: 0:00:00  lr: 0.002424  min_lr: 0.002424  loss: 3.1729 (3.0847)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8767 (nan)  time: 0.5336  data: 0.0007  max mem: 54228
Epoch: [140] Total time: 0:13:09 (0.6313 s / it)
Averaged stats: lr: 0.002424  min_lr: 0.002424  loss: 3.1729 (3.0856)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8767 (nan)
Test:  [ 0/25]  eta: 0:02:41  loss: 0.6511 (0.6511)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 6.4528  data: 6.1168  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8206 (0.8198)  acc1: 84.4000 (84.3636)  acc5: 97.6000 (97.5636)  time: 0.8590  data: 0.5564  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9830 (0.9311)  acc1: 80.4000 (81.2571)  acc5: 95.6000 (96.1143)  time: 0.2996  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9830 (0.9390)  acc1: 80.0000 (80.7680)  acc5: 95.6000 (96.0480)  time: 0.2995  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5492 s / it)
* Acc@1 81.236 Acc@5 95.942 loss 0.936
Accuracy of the model on the 50000 test images: 81.2%
Max accuracy: 81.26%
Epoch: [141]  [   0/1251]  eta: 1:25:40  lr: 0.002424  min_lr: 0.002424  loss: 3.4043 (3.4043)  weight_decay: 0.0500 (0.0500)  time: 4.1095  data: 1.9374  max mem: 54228
Epoch: [141]  [ 200/1251]  eta: 0:11:19  lr: 0.002420  min_lr: 0.002420  loss: 3.3362 (3.0922)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0479 (1.0514)  time: 0.6279  data: 0.0005  max mem: 54228
Epoch: [141]  [ 400/1251]  eta: 0:09:02  lr: 0.002417  min_lr: 0.002417  loss: 2.6019 (3.0810)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8522 (0.9458)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [141]  [ 600/1251]  eta: 0:06:53  lr: 0.002413  min_lr: 0.002413  loss: 3.2311 (3.0785)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8069 (0.9075)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [141]  [ 800/1251]  eta: 0:04:45  lr: 0.002409  min_lr: 0.002409  loss: 2.7638 (3.0784)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7254 (0.8852)  time: 0.6293  data: 0.0005  max mem: 54228
Epoch: [141]  [1000/1251]  eta: 0:02:38  lr: 0.002406  min_lr: 0.002406  loss: 3.2579 (3.0704)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8571 (0.8947)  time: 0.6289  data: 0.0005  max mem: 54228
Epoch: [141]  [1200/1251]  eta: 0:00:32  lr: 0.002402  min_lr: 0.002402  loss: 3.2246 (3.0686)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6745 (0.8777)  time: 0.6288  data: 0.0005  max mem: 54228
Epoch: [141]  [1250/1251]  eta: 0:00:00  lr: 0.002402  min_lr: 0.002402  loss: 3.2543 (3.0717)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7468 (0.8738)  time: 0.5339  data: 0.0007  max mem: 54228
Epoch: [141] Total time: 0:13:10 (0.6318 s / it)
Averaged stats: lr: 0.002402  min_lr: 0.002402  loss: 3.2543 (3.0825)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7468 (0.8738)
Test:  [ 0/25]  eta: 0:02:39  loss: 0.6698 (0.6698)  acc1: 89.6000 (89.6000)  acc5: 98.4000 (98.4000)  time: 6.3721  data: 6.0428  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7765 (0.8264)  acc1: 84.8000 (84.9818)  acc5: 98.0000 (97.7091)  time: 0.8516  data: 0.5496  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.0258 (0.9646)  acc1: 79.2000 (81.4476)  acc5: 95.2000 (95.9619)  time: 0.2997  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0455 (0.9748)  acc1: 79.2000 (80.9440)  acc5: 95.2000 (95.9200)  time: 0.2997  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5462 s / it)
* Acc@1 81.148 Acc@5 95.968 loss 0.975
Accuracy of the model on the 50000 test images: 81.1%
Max accuracy: 81.26%
Epoch: [142]  [   0/1251]  eta: 1:29:07  lr: 0.002402  min_lr: 0.002402  loss: 2.6614 (2.6614)  weight_decay: 0.0500 (0.0500)  time: 4.2745  data: 3.5396  max mem: 54228
Epoch: [142]  [ 200/1251]  eta: 0:11:19  lr: 0.002398  min_lr: 0.002398  loss: 3.1309 (2.9911)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6451 (0.8196)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [142]  [ 400/1251]  eta: 0:09:03  lr: 0.002395  min_lr: 0.002395  loss: 3.1250 (3.0492)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8722 (0.8592)  time: 0.6368  data: 0.0004  max mem: 54228
Epoch: [142]  [ 600/1251]  eta: 0:06:54  lr: 0.002391  min_lr: 0.002391  loss: 3.1429 (3.0585)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9371 (0.9003)  time: 0.6288  data: 0.0004  max mem: 54228
Epoch: [142]  [ 800/1251]  eta: 0:04:46  lr: 0.002387  min_lr: 0.002387  loss: 3.3319 (3.0733)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7916 (0.8771)  time: 0.6288  data: 0.0004  max mem: 54228
Epoch: [142]  [1000/1251]  eta: 0:02:39  lr: 0.002384  min_lr: 0.002384  loss: 3.1006 (3.0652)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8766 (0.8941)  time: 0.6289  data: 0.0004  max mem: 54228
Epoch: [142]  [1200/1251]  eta: 0:00:32  lr: 0.002380  min_lr: 0.002380  loss: 3.0933 (3.0674)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6571 (0.8927)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [142]  [1250/1251]  eta: 0:00:00  lr: 0.002380  min_lr: 0.002380  loss: 2.9352 (3.0643)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8117 (0.8946)  time: 0.5332  data: 0.0006  max mem: 54228
Epoch: [142] Total time: 0:13:10 (0.6318 s / it)
Averaged stats: lr: 0.002380  min_lr: 0.002380  loss: 2.9352 (3.0806)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8117 (0.8946)
Test:  [ 0/25]  eta: 0:02:37  loss: 0.5987 (0.5987)  acc1: 85.6000 (85.6000)  acc5: 97.6000 (97.6000)  time: 6.2933  data: 5.9561  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7214 (0.7278)  acc1: 85.2000 (84.1091)  acc5: 97.6000 (97.4545)  time: 0.8447  data: 0.5418  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9049 (0.8664)  acc1: 80.4000 (81.2571)  acc5: 95.6000 (96.0381)  time: 0.2996  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9429 (0.8709)  acc1: 80.4000 (81.0240)  acc5: 96.0000 (96.1760)  time: 0.2995  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5429 s / it)
* Acc@1 81.382 Acc@5 96.084 loss 0.861
Accuracy of the model on the 50000 test images: 81.4%
Max accuracy: 81.38%
Epoch: [143]  [   0/1251]  eta: 1:15:51  lr: 0.002380  min_lr: 0.002380  loss: 2.8219 (2.8219)  weight_decay: 0.0500 (0.0500)  time: 3.6382  data: 3.0080  max mem: 54228
Epoch: [143]  [ 200/1251]  eta: 0:11:17  lr: 0.002376  min_lr: 0.002376  loss: 3.0115 (3.0556)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7875 (0.8604)  time: 0.6277  data: 0.0004  max mem: 54228
Epoch: [143]  [ 400/1251]  eta: 0:09:02  lr: 0.002373  min_lr: 0.002373  loss: 3.0231 (3.0631)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9444 (0.8478)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [143]  [ 600/1251]  eta: 0:06:52  lr: 0.002369  min_lr: 0.002369  loss: 3.0449 (3.0544)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7218 (0.8332)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [143]  [ 800/1251]  eta: 0:04:45  lr: 0.002365  min_lr: 0.002365  loss: 3.0145 (3.0633)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7850 (0.8349)  time: 0.6335  data: 0.0004  max mem: 54228
Epoch: [143]  [1000/1251]  eta: 0:02:38  lr: 0.002362  min_lr: 0.002362  loss: 3.2097 (3.0655)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6174 (0.8432)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [143]  [1200/1251]  eta: 0:00:32  lr: 0.002358  min_lr: 0.002358  loss: 3.2940 (3.0712)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8273 (0.8541)  time: 0.6276  data: 0.0004  max mem: 54228
Epoch: [143]  [1250/1251]  eta: 0:00:00  lr: 0.002358  min_lr: 0.002358  loss: 3.2022 (3.0735)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8647 (0.8600)  time: 0.5396  data: 0.0006  max mem: 54228
Epoch: [143] Total time: 0:13:09 (0.6308 s / it)
Averaged stats: lr: 0.002358  min_lr: 0.002358  loss: 3.2022 (3.0803)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8647 (0.8600)
Test:  [ 0/25]  eta: 0:02:28  loss: 0.6478 (0.6478)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 5.9512  data: 5.6169  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7877 (0.8225)  acc1: 83.2000 (84.4727)  acc5: 97.6000 (97.3091)  time: 0.8134  data: 0.5109  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.0078 (0.9530)  acc1: 79.6000 (81.4286)  acc5: 96.0000 (95.8095)  time: 0.2997  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0078 (0.9600)  acc1: 79.2000 (81.0080)  acc5: 96.0000 (95.8560)  time: 0.2997  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5306 s / it)
* Acc@1 81.484 Acc@5 96.026 loss 0.954
Accuracy of the model on the 50000 test images: 81.5%
Max accuracy: 81.48%
Epoch: [144]  [   0/1251]  eta: 1:03:03  lr: 0.002358  min_lr: 0.002358  loss: 3.2928 (3.2928)  weight_decay: 0.0500 (0.0500)  time: 3.0247  data: 2.3774  max mem: 54228
Epoch: [144]  [ 200/1251]  eta: 0:11:14  lr: 0.002354  min_lr: 0.002354  loss: 3.2741 (3.0827)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7757 (0.8812)  time: 0.6277  data: 0.0004  max mem: 54228
Epoch: [144]  [ 400/1251]  eta: 0:08:59  lr: 0.002350  min_lr: 0.002350  loss: 3.3201 (3.0971)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7541 (0.8539)  time: 0.6278  data: 0.0004  max mem: 54228
Epoch: [144]  [ 600/1251]  eta: 0:06:52  lr: 0.002347  min_lr: 0.002347  loss: 3.1685 (3.0809)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9496 (0.8991)  time: 0.6289  data: 0.0004  max mem: 54228
Epoch: [144]  [ 800/1251]  eta: 0:04:45  lr: 0.002343  min_lr: 0.002343  loss: 3.2799 (3.0890)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7379 (0.8899)  time: 0.6291  data: 0.0004  max mem: 54228
Epoch: [144]  [1000/1251]  eta: 0:02:38  lr: 0.002340  min_lr: 0.002340  loss: 3.2205 (3.0942)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7438 (0.8765)  time: 0.6289  data: 0.0004  max mem: 54228
Epoch: [144]  [1200/1251]  eta: 0:00:32  lr: 0.002336  min_lr: 0.002336  loss: 3.1817 (3.0912)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8564 (0.8797)  time: 0.6277  data: 0.0004  max mem: 54228
Epoch: [144]  [1250/1251]  eta: 0:00:00  lr: 0.002335  min_lr: 0.002335  loss: 3.0222 (3.0897)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8564 (0.8811)  time: 0.5330  data: 0.0005  max mem: 54228
Epoch: [144] Total time: 0:13:08 (0.6306 s / it)
Averaged stats: lr: 0.002335  min_lr: 0.002335  loss: 3.0222 (3.0790)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8564 (0.8811)
Test:  [ 0/25]  eta: 0:02:43  loss: 0.5761 (0.5761)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 6.5207  data: 6.1778  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7966 (0.7623)  acc1: 83.6000 (84.9091)  acc5: 97.2000 (97.3455)  time: 0.8658  data: 0.5623  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9414 (0.8950)  acc1: 79.6000 (81.3714)  acc5: 95.6000 (96.0191)  time: 0.2999  data: 0.0004  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9681 (0.9078)  acc1: 78.4000 (80.9760)  acc5: 95.2000 (95.9200)  time: 0.2999  data: 0.0003  max mem: 54228
Test: Total time: 0:00:13 (0.5526 s / it)
* Acc@1 81.436 Acc@5 95.992 loss 0.902
Accuracy of the model on the 50000 test images: 81.4%
Max accuracy: 81.48%
Epoch: [145]  [   0/1251]  eta: 1:26:18  lr: 0.002335  min_lr: 0.002335  loss: 3.4190 (3.4190)  weight_decay: 0.0500 (0.0500)  time: 4.1398  data: 3.1287  max mem: 54228
Epoch: [145]  [ 200/1251]  eta: 0:11:18  lr: 0.002332  min_lr: 0.002332  loss: 2.9903 (3.0158)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8842 (1.0184)  time: 0.6279  data: 0.0005  max mem: 54228
Epoch: [145]  [ 400/1251]  eta: 0:09:03  lr: 0.002328  min_lr: 0.002328  loss: 3.1314 (3.0340)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6197 (0.8944)  time: 0.6364  data: 0.0005  max mem: 54228
Epoch: [145]  [ 600/1251]  eta: 0:06:53  lr: 0.002325  min_lr: 0.002325  loss: 2.9507 (3.0510)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6304 (0.8843)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [145]  [ 800/1251]  eta: 0:04:45  lr: 0.002321  min_lr: 0.002321  loss: 3.1555 (3.0520)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8312 (0.8680)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [145]  [1000/1251]  eta: 0:02:38  lr: 0.002318  min_lr: 0.002318  loss: 3.2634 (3.0491)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7048 (0.8579)  time: 0.6291  data: 0.0006  max mem: 54228
Epoch: [145]  [1200/1251]  eta: 0:00:32  lr: 0.002314  min_lr: 0.002314  loss: 3.1453 (3.0533)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8797 (0.8540)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [145]  [1250/1251]  eta: 0:00:00  lr: 0.002313  min_lr: 0.002313  loss: 3.1647 (3.0553)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9021 (0.8586)  time: 0.5335  data: 0.0005  max mem: 54228
Epoch: [145] Total time: 0:13:09 (0.6311 s / it)
Averaged stats: lr: 0.002313  min_lr: 0.002313  loss: 3.1647 (3.0660)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9021 (0.8586)
Test:  [ 0/25]  eta: 0:02:35  loss: 0.6727 (0.6727)  acc1: 90.4000 (90.4000)  acc5: 98.0000 (98.0000)  time: 6.2267  data: 5.8788  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7995 (0.8086)  acc1: 84.8000 (84.9091)  acc5: 98.0000 (97.3455)  time: 0.8386  data: 0.5348  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9844 (0.9573)  acc1: 79.6000 (81.6952)  acc5: 96.0000 (95.9238)  time: 0.2995  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0726 (0.9672)  acc1: 78.0000 (81.2960)  acc5: 96.0000 (95.8880)  time: 0.2994  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5417 s / it)
* Acc@1 81.484 Acc@5 96.006 loss 0.963
Accuracy of the model on the 50000 test images: 81.5%
Max accuracy: 81.48%
Epoch: [146]  [   0/1251]  eta: 1:24:48  lr: 0.002313  min_lr: 0.002313  loss: 3.5666 (3.5666)  weight_decay: 0.0500 (0.0500)  time: 4.0677  data: 3.1165  max mem: 54228
Epoch: [146]  [ 200/1251]  eta: 0:11:19  lr: 0.002310  min_lr: 0.002310  loss: 3.0938 (3.0379)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7399 (0.7870)  time: 0.6277  data: 0.0005  max mem: 54228
Epoch: [146]  [ 400/1251]  eta: 0:09:02  lr: 0.002306  min_lr: 0.002306  loss: 3.2801 (3.0582)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7028 (0.8141)  time: 0.6276  data: 0.0005  max mem: 54228
Epoch: [146]  [ 600/1251]  eta: 0:06:53  lr: 0.002303  min_lr: 0.002303  loss: 2.8154 (3.0505)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6835 (0.8174)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [146]  [ 800/1251]  eta: 0:04:45  lr: 0.002299  min_lr: 0.002299  loss: 2.7214 (3.0422)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7544 (0.8247)  time: 0.6279  data: 0.0006  max mem: 54228
Epoch: [146]  [1000/1251]  eta: 0:02:38  lr: 0.002296  min_lr: 0.002296  loss: 2.9515 (3.0515)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6332 (0.8094)  time: 0.6280  data: 0.0006  max mem: 54228
Epoch: [146]  [1200/1251]  eta: 0:00:32  lr: 0.002292  min_lr: 0.002292  loss: 2.9964 (3.0610)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9745 (0.8202)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [146]  [1250/1251]  eta: 0:00:00  lr: 0.002291  min_lr: 0.002291  loss: 3.1523 (3.0640)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6320 (0.8137)  time: 0.5334  data: 0.0006  max mem: 54228
Epoch: [146] Total time: 0:13:09 (0.6308 s / it)
Averaged stats: lr: 0.002291  min_lr: 0.002291  loss: 3.1523 (3.0542)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6320 (0.8137)
Test:  [ 0/25]  eta: 0:02:31  loss: 0.6004 (0.6004)  acc1: 88.4000 (88.4000)  acc5: 98.8000 (98.8000)  time: 6.0666  data: 5.7364  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7967 (0.7736)  acc1: 85.2000 (84.8727)  acc5: 97.6000 (97.4909)  time: 0.8239  data: 0.5218  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9945 (0.9020)  acc1: 80.4000 (81.8857)  acc5: 95.2000 (96.0381)  time: 0.2995  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9945 (0.9155)  acc1: 79.6000 (81.4080)  acc5: 95.2000 (95.9040)  time: 0.2994  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5343 s / it)
* Acc@1 81.602 Acc@5 96.006 loss 0.910
Accuracy of the model on the 50000 test images: 81.6%
Max accuracy: 81.60%
Epoch: [147]  [   0/1251]  eta: 1:09:00  lr: 0.002291  min_lr: 0.002291  loss: 3.4183 (3.4183)  weight_decay: 0.0500 (0.0500)  time: 3.3097  data: 2.6723  max mem: 54228
Epoch: [147]  [ 200/1251]  eta: 0:11:16  lr: 0.002288  min_lr: 0.002288  loss: 3.0573 (3.0144)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7037 (0.7205)  time: 0.6277  data: 0.0005  max mem: 54228
Epoch: [147]  [ 400/1251]  eta: 0:09:01  lr: 0.002284  min_lr: 0.002284  loss: 3.0494 (3.0108)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7962 (0.8537)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [147]  [ 600/1251]  eta: 0:06:52  lr: 0.002280  min_lr: 0.002280  loss: 3.0559 (3.0209)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6981 (0.8497)  time: 0.6351  data: 0.0005  max mem: 54228
Epoch: [147]  [ 800/1251]  eta: 0:04:45  lr: 0.002277  min_lr: 0.002277  loss: 3.1835 (3.0250)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7901 (0.8605)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [147]  [1000/1251]  eta: 0:02:38  lr: 0.002273  min_lr: 0.002273  loss: 3.2413 (3.0344)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8180 (0.8778)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [147]  [1200/1251]  eta: 0:00:32  lr: 0.002270  min_lr: 0.002270  loss: 3.1647 (3.0439)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0327 (0.8888)  time: 0.6288  data: 0.0005  max mem: 54228
Epoch: [147]  [1250/1251]  eta: 0:00:00  lr: 0.002269  min_lr: 0.002269  loss: 2.9550 (3.0416)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0418 (0.8958)  time: 0.5333  data: 0.0006  max mem: 54228
Epoch: [147] Total time: 0:13:08 (0.6305 s / it)
Averaged stats: lr: 0.002269  min_lr: 0.002269  loss: 2.9550 (3.0599)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0418 (0.8958)
Test:  [ 0/25]  eta: 0:02:39  loss: 0.5920 (0.5920)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 6.3851  data: 6.0344  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7401 (0.7439)  acc1: 85.2000 (84.9818)  acc5: 98.0000 (97.7091)  time: 0.8526  data: 0.5489  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9010 (0.8752)  acc1: 79.6000 (81.7143)  acc5: 95.6000 (96.0571)  time: 0.2991  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9606 (0.8888)  acc1: 79.2000 (81.3280)  acc5: 95.6000 (96.0800)  time: 0.2990  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5467 s / it)
* Acc@1 81.504 Acc@5 96.142 loss 0.887
Accuracy of the model on the 50000 test images: 81.5%
Max accuracy: 81.60%
Epoch: [148]  [   0/1251]  eta: 1:25:40  lr: 0.002269  min_lr: 0.002269  loss: 2.9409 (2.9409)  weight_decay: 0.0500 (0.0500)  time: 4.1094  data: 2.8997  max mem: 54228
Epoch: [148]  [ 200/1251]  eta: 0:11:18  lr: 0.002265  min_lr: 0.002265  loss: 3.3197 (3.0140)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7849 (0.8172)  time: 0.6278  data: 0.0004  max mem: 54228
Epoch: [148]  [ 400/1251]  eta: 0:09:02  lr: 0.002262  min_lr: 0.002262  loss: 3.1421 (3.0339)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6170 (0.8070)  time: 0.6285  data: 0.0004  max mem: 54228
Epoch: [148]  [ 600/1251]  eta: 0:06:53  lr: 0.002258  min_lr: 0.002258  loss: 3.1691 (3.0352)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9640 (0.8584)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [148]  [ 800/1251]  eta: 0:04:45  lr: 0.002255  min_lr: 0.002255  loss: 3.1489 (3.0293)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8037 (0.8432)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [148]  [1000/1251]  eta: 0:02:38  lr: 0.002251  min_lr: 0.002251  loss: 3.1029 (3.0379)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7532 (0.8296)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [148]  [1200/1251]  eta: 0:00:32  lr: 0.002248  min_lr: 0.002248  loss: 3.0512 (3.0423)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7506 (0.8287)  time: 0.6344  data: 0.0004  max mem: 54228
Epoch: [148]  [1250/1251]  eta: 0:00:00  lr: 0.002247  min_lr: 0.002247  loss: 3.2820 (3.0503)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7870 (0.8277)  time: 0.5332  data: 0.0005  max mem: 54228
Epoch: [148] Total time: 0:13:09 (0.6311 s / it)
Averaged stats: lr: 0.002247  min_lr: 0.002247  loss: 3.2820 (3.0578)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7870 (0.8277)
Test:  [ 0/25]  eta: 0:02:42  loss: 0.6998 (0.6998)  acc1: 88.4000 (88.4000)  acc5: 98.8000 (98.8000)  time: 6.4968  data: 6.1604  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7933 (0.8198)  acc1: 83.2000 (84.7636)  acc5: 98.0000 (97.6727)  time: 0.8631  data: 0.5603  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9559 (0.9605)  acc1: 80.8000 (81.7524)  acc5: 95.6000 (96.1143)  time: 0.2997  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0487 (0.9737)  acc1: 80.0000 (81.4080)  acc5: 95.2000 (96.0960)  time: 0.2999  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5518 s / it)
* Acc@1 81.656 Acc@5 96.094 loss 0.968
Accuracy of the model on the 50000 test images: 81.7%
Max accuracy: 81.66%
Epoch: [149]  [   0/1251]  eta: 1:17:03  lr: 0.002247  min_lr: 0.002247  loss: 1.7342 (1.7342)  weight_decay: 0.0500 (0.0500)  time: 3.6960  data: 3.0569  max mem: 54228
Epoch: [149]  [ 200/1251]  eta: 0:11:17  lr: 0.002243  min_lr: 0.002243  loss: 2.7455 (3.0339)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9797 (0.8839)  time: 0.6361  data: 0.0005  max mem: 54228
Epoch: [149]  [ 400/1251]  eta: 0:09:02  lr: 0.002240  min_lr: 0.002240  loss: 3.1570 (3.0105)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7265 (0.8196)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [149]  [ 600/1251]  eta: 0:06:53  lr: 0.002236  min_lr: 0.002236  loss: 3.1643 (3.0422)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7923 (0.8093)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [149]  [ 800/1251]  eta: 0:04:45  lr: 0.002232  min_lr: 0.002232  loss: 3.3691 (3.0499)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7523 (0.8032)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [149]  [1000/1251]  eta: 0:02:38  lr: 0.002229  min_lr: 0.002229  loss: 3.2288 (3.0532)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8072 (0.8111)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [149]  [1200/1251]  eta: 0:00:32  lr: 0.002225  min_lr: 0.002225  loss: 3.1424 (3.0605)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6788 (0.8213)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [149]  [1250/1251]  eta: 0:00:00  lr: 0.002224  min_lr: 0.002224  loss: 3.1653 (3.0615)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8709 (0.8281)  time: 0.5335  data: 0.0005  max mem: 54228
Epoch: [149] Total time: 0:13:09 (0.6308 s / it)
Averaged stats: lr: 0.002224  min_lr: 0.002224  loss: 3.1653 (3.0585)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8709 (0.8281)
Test:  [ 0/25]  eta: 0:02:28  loss: 0.6871 (0.6871)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 5.9422  data: 5.6097  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8513 (0.8438)  acc1: 85.6000 (85.3091)  acc5: 97.2000 (97.2727)  time: 0.8130  data: 0.5103  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9863 (0.9790)  acc1: 79.6000 (81.6000)  acc5: 96.0000 (95.9048)  time: 0.3002  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0293 (0.9878)  acc1: 78.8000 (81.3440)  acc5: 95.6000 (95.9040)  time: 0.3004  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5345 s / it)
* Acc@1 81.550 Acc@5 96.110 loss 0.981
Accuracy of the model on the 50000 test images: 81.6%
Max accuracy: 81.66%
Epoch: [150]  [   0/1251]  eta: 1:22:32  lr: 0.002224  min_lr: 0.002224  loss: 3.0358 (3.0358)  weight_decay: 0.0500 (0.0500)  time: 3.9590  data: 1.7660  max mem: 54228
Epoch: [150]  [ 200/1251]  eta: 0:11:21  lr: 0.002221  min_lr: 0.002221  loss: 2.8801 (3.0217)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8325 (0.8833)  time: 0.6289  data: 0.0005  max mem: 54228
Epoch: [150]  [ 400/1251]  eta: 0:09:03  lr: 0.002217  min_lr: 0.002217  loss: 3.3067 (3.0532)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1372 (0.9004)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [150]  [ 600/1251]  eta: 0:06:53  lr: 0.002214  min_lr: 0.002214  loss: 3.1541 (3.0676)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7968 (0.9091)  time: 0.6342  data: 0.0005  max mem: 54228
Epoch: [150]  [ 800/1251]  eta: 0:04:45  lr: 0.002210  min_lr: 0.002210  loss: 3.2114 (3.0590)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0937 (0.9192)  time: 0.6290  data: 0.0005  max mem: 54228
Epoch: [150]  [1000/1251]  eta: 0:02:38  lr: 0.002207  min_lr: 0.002207  loss: 3.2519 (3.0695)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6877 (0.8878)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [150]  [1200/1251]  eta: 0:00:32  lr: 0.002203  min_lr: 0.002203  loss: 3.0657 (3.0744)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6839 (0.8917)  time: 0.6347  data: 0.0005  max mem: 54228
Epoch: [150]  [1250/1251]  eta: 0:00:00  lr: 0.002202  min_lr: 0.002202  loss: 3.1193 (3.0727)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8449 (0.8972)  time: 0.5340  data: 0.0006  max mem: 54228
Epoch: [150] Total time: 0:13:10 (0.6316 s / it)
Averaged stats: lr: 0.002202  min_lr: 0.002202  loss: 3.1193 (3.0571)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8449 (0.8972)
Test:  [ 0/25]  eta: 0:02:38  loss: 0.6184 (0.6184)  acc1: 88.0000 (88.0000)  acc5: 98.8000 (98.8000)  time: 6.3226  data: 5.9850  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7398 (0.7648)  acc1: 85.2000 (84.9818)  acc5: 97.6000 (97.5636)  time: 0.8476  data: 0.5445  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9616 (0.9028)  acc1: 79.2000 (81.6191)  acc5: 96.0000 (96.2476)  time: 0.3003  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9984 (0.9107)  acc1: 79.2000 (81.4560)  acc5: 95.6000 (96.1440)  time: 0.3004  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5478 s / it)
* Acc@1 81.668 Acc@5 96.218 loss 0.909
Accuracy of the model on the 50000 test images: 81.7%
Max accuracy: 81.67%
Epoch: [151]  [   0/1251]  eta: 1:11:32  lr: 0.002202  min_lr: 0.002202  loss: 3.4865 (3.4865)  weight_decay: 0.0500 (0.0500)  time: 3.4310  data: 2.7936  max mem: 54228
Epoch: [151]  [ 200/1251]  eta: 0:11:15  lr: 0.002198  min_lr: 0.002198  loss: 3.1854 (3.0732)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7501 (0.7475)  time: 0.6279  data: 0.0004  max mem: 54228
Epoch: [151]  [ 400/1251]  eta: 0:09:00  lr: 0.002195  min_lr: 0.002195  loss: 3.0561 (3.0435)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7158 (0.7939)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [151]  [ 600/1251]  eta: 0:06:52  lr: 0.002191  min_lr: 0.002191  loss: 2.9238 (3.0342)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8573 (0.7964)  time: 0.6274  data: 0.0005  max mem: 54228
Epoch: [151]  [ 800/1251]  eta: 0:04:45  lr: 0.002188  min_lr: 0.002188  loss: 3.2180 (3.0312)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7325 (0.8511)  time: 0.6277  data: 0.0004  max mem: 54228
Epoch: [151]  [1000/1251]  eta: 0:02:38  lr: 0.002184  min_lr: 0.002184  loss: 3.1558 (3.0399)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8715 (0.8488)  time: 0.6277  data: 0.0005  max mem: 54228
Epoch: [151]  [1200/1251]  eta: 0:00:32  lr: 0.002181  min_lr: 0.002181  loss: 3.1095 (3.0431)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8875 (0.8499)  time: 0.6275  data: 0.0004  max mem: 54228
Epoch: [151]  [1250/1251]  eta: 0:00:00  lr: 0.002180  min_lr: 0.002180  loss: 2.9444 (3.0417)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6656 (0.8418)  time: 0.5329  data: 0.0005  max mem: 54228
Epoch: [151] Total time: 0:13:08 (0.6300 s / it)
Averaged stats: lr: 0.002180  min_lr: 0.002180  loss: 2.9444 (3.0379)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6656 (0.8418)
Test:  [ 0/25]  eta: 0:02:41  loss: 0.5759 (0.5759)  acc1: 86.4000 (86.4000)  acc5: 99.2000 (99.2000)  time: 6.4770  data: 6.1509  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7126 (0.7216)  acc1: 83.6000 (84.2909)  acc5: 97.6000 (97.3818)  time: 0.8608  data: 0.5595  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9330 (0.8528)  acc1: 80.8000 (81.2571)  acc5: 96.0000 (95.9810)  time: 0.2990  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.8988 (0.8560)  acc1: 80.8000 (81.1520)  acc5: 96.0000 (96.0640)  time: 0.2990  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5502 s / it)
* Acc@1 81.510 Acc@5 96.256 loss 0.847
Accuracy of the model on the 50000 test images: 81.5%
Max accuracy: 81.67%
Epoch: [152]  [   0/1251]  eta: 1:28:50  lr: 0.002180  min_lr: 0.002180  loss: 2.6690 (2.6690)  weight_decay: 0.0500 (0.0500)  time: 4.2611  data: 1.6664  max mem: 54228
Epoch: [152]  [ 200/1251]  eta: 0:11:19  lr: 0.002176  min_lr: 0.002176  loss: 2.8972 (3.0555)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6762 (0.9026)  time: 0.6278  data: 0.0004  max mem: 54228
Epoch: [152]  [ 400/1251]  eta: 0:09:03  lr: 0.002173  min_lr: 0.002173  loss: 3.1216 (3.0413)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8222 (0.8354)  time: 0.6272  data: 0.0004  max mem: 54228
Epoch: [152]  [ 600/1251]  eta: 0:06:53  lr: 0.002169  min_lr: 0.002169  loss: 3.1617 (3.0155)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7398 (0.8530)  time: 0.6276  data: 0.0004  max mem: 54228
Epoch: [152]  [ 800/1251]  eta: 0:04:45  lr: 0.002165  min_lr: 0.002165  loss: 3.1792 (3.0122)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8432 (0.8722)  time: 0.6332  data: 0.0004  max mem: 54228
Epoch: [152]  [1000/1251]  eta: 0:02:38  lr: 0.002162  min_lr: 0.002162  loss: 3.0943 (3.0219)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7191 (0.8544)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [152]  [1200/1251]  eta: 0:00:32  lr: 0.002158  min_lr: 0.002158  loss: 3.2156 (3.0190)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7021 (0.8414)  time: 0.6326  data: 0.0004  max mem: 54228
Epoch: [152]  [1250/1251]  eta: 0:00:00  lr: 0.002157  min_lr: 0.002157  loss: 3.1956 (3.0183)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7818 (0.8490)  time: 0.5332  data: 0.0005  max mem: 54228
Epoch: [152] Total time: 0:13:09 (0.6307 s / it)
Averaged stats: lr: 0.002157  min_lr: 0.002157  loss: 3.1956 (3.0464)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7818 (0.8490)
Test:  [ 0/25]  eta: 0:02:01  loss: 0.6446 (0.6446)  acc1: 86.8000 (86.8000)  acc5: 98.4000 (98.4000)  time: 4.8510  data: 4.5090  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7635 (0.7616)  acc1: 85.6000 (84.4727)  acc5: 97.6000 (97.4182)  time: 0.8000  data: 0.4968  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9477 (0.9001)  acc1: 78.8000 (80.9143)  acc5: 96.0000 (96.0191)  time: 0.3473  data: 0.0479  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9751 (0.9003)  acc1: 78.8000 (80.8800)  acc5: 95.2000 (95.9840)  time: 0.2998  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5250 s / it)
* Acc@1 81.622 Acc@5 96.198 loss 0.886
Accuracy of the model on the 50000 test images: 81.6%
Max accuracy: 81.67%
Epoch: [153]  [   0/1251]  eta: 1:22:11  lr: 0.002157  min_lr: 0.002157  loss: 2.2935 (2.2935)  weight_decay: 0.0500 (0.0500)  time: 3.9422  data: 3.1926  max mem: 54228
Epoch: [153]  [ 200/1251]  eta: 0:11:19  lr: 0.002154  min_lr: 0.002154  loss: 3.4040 (3.0150)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7938 (0.8750)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [153]  [ 400/1251]  eta: 0:09:02  lr: 0.002150  min_lr: 0.002150  loss: 2.9086 (3.0214)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7148 (0.8658)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [153]  [ 600/1251]  eta: 0:06:53  lr: 0.002147  min_lr: 0.002147  loss: 3.1570 (3.0277)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.6275  data: 0.0004  max mem: 54228
Epoch: [153]  [ 800/1251]  eta: 0:04:45  lr: 0.002143  min_lr: 0.002143  loss: 2.9122 (3.0368)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7310 (nan)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [153]  [1000/1251]  eta: 0:02:38  lr: 0.002139  min_lr: 0.002139  loss: 3.2188 (3.0527)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8395 (nan)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [153]  [1200/1251]  eta: 0:00:32  lr: 0.002136  min_lr: 0.002136  loss: 3.0613 (3.0482)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7415 (nan)  time: 0.6347  data: 0.0004  max mem: 54228
Epoch: [153]  [1250/1251]  eta: 0:00:00  lr: 0.002135  min_lr: 0.002135  loss: 3.0591 (3.0490)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7870 (nan)  time: 0.5332  data: 0.0005  max mem: 54228
Epoch: [153] Total time: 0:13:09 (0.6308 s / it)
Averaged stats: lr: 0.002135  min_lr: 0.002135  loss: 3.0591 (3.0361)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7870 (nan)
Test:  [ 0/25]  eta: 0:02:47  loss: 0.6040 (0.6040)  acc1: 86.0000 (86.0000)  acc5: 99.2000 (99.2000)  time: 6.6938  data: 6.3581  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.7215 (0.7298)  acc1: 85.2000 (84.4727)  acc5: 98.0000 (97.6727)  time: 0.8814  data: 0.5783  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 0.8681 (0.8574)  acc1: 79.6000 (81.3905)  acc5: 96.0000 (96.4381)  time: 0.3003  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9537 (0.8690)  acc1: 79.6000 (81.0240)  acc5: 95.6000 (96.3200)  time: 0.3004  data: 0.0001  max mem: 54228
Test: Total time: 0:00:14 (0.5609 s / it)
* Acc@1 81.686 Acc@5 96.190 loss 0.860
Accuracy of the model on the 50000 test images: 81.7%
Max accuracy: 81.69%
Epoch: [154]  [   0/1251]  eta: 1:01:24  lr: 0.002135  min_lr: 0.002135  loss: 2.0457 (2.0457)  weight_decay: 0.0500 (0.0500)  time: 2.9450  data: 2.3026  max mem: 54228
Epoch: [154]  [ 200/1251]  eta: 0:11:14  lr: 0.002131  min_lr: 0.002131  loss: 2.7423 (3.0481)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9391 (0.9015)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [154]  [ 400/1251]  eta: 0:09:00  lr: 0.002128  min_lr: 0.002128  loss: 3.1360 (3.0477)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6488 (0.8225)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [154]  [ 600/1251]  eta: 0:06:52  lr: 0.002124  min_lr: 0.002124  loss: 3.1713 (3.0479)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1112 (0.8702)  time: 0.6289  data: 0.0005  max mem: 54228
Epoch: [154]  [ 800/1251]  eta: 0:04:45  lr: 0.002121  min_lr: 0.002121  loss: 2.8466 (3.0457)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0390 (0.8978)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [154]  [1000/1251]  eta: 0:02:38  lr: 0.002117  min_lr: 0.002117  loss: 3.1877 (3.0504)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7128 (0.8726)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [154]  [1200/1251]  eta: 0:00:32  lr: 0.002113  min_lr: 0.002113  loss: 3.1501 (3.0517)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1056 (0.8765)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [154]  [1250/1251]  eta: 0:00:00  lr: 0.002113  min_lr: 0.002113  loss: 3.4115 (3.0537)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7490 (0.8692)  time: 0.5335  data: 0.0006  max mem: 54228
Epoch: [154] Total time: 0:13:08 (0.6306 s / it)
Averaged stats: lr: 0.002113  min_lr: 0.002113  loss: 3.4115 (3.0347)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7490 (0.8692)
Test:  [ 0/25]  eta: 0:02:40  loss: 0.6369 (0.6369)  acc1: 90.8000 (90.8000)  acc5: 98.0000 (98.0000)  time: 6.4057  data: 6.0665  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8183 (0.8191)  acc1: 84.0000 (84.6545)  acc5: 98.0000 (97.4546)  time: 0.8547  data: 0.5518  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9833 (0.9550)  acc1: 78.0000 (81.0857)  acc5: 96.0000 (95.9619)  time: 0.2996  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0632 (0.9658)  acc1: 77.6000 (80.5920)  acc5: 95.2000 (95.9360)  time: 0.2995  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5500 s / it)
* Acc@1 81.470 Acc@5 96.102 loss 0.954
Accuracy of the model on the 50000 test images: 81.5%
Max accuracy: 81.69%
Epoch: [155]  [   0/1251]  eta: 1:28:32  lr: 0.002113  min_lr: 0.002113  loss: 3.3121 (3.3121)  weight_decay: 0.0500 (0.0500)  time: 4.2466  data: 2.3801  max mem: 54228
Epoch: [155]  [ 200/1251]  eta: 0:11:18  lr: 0.002109  min_lr: 0.002109  loss: 2.9917 (2.9851)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6740 (0.7983)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [155]  [ 400/1251]  eta: 0:09:02  lr: 0.002105  min_lr: 0.002105  loss: 3.0731 (2.9942)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8232 (0.8452)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [155]  [ 600/1251]  eta: 0:06:53  lr: 0.002102  min_lr: 0.002102  loss: 3.0677 (3.0104)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7532 (0.8321)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [155]  [ 800/1251]  eta: 0:04:45  lr: 0.002098  min_lr: 0.002098  loss: 3.1979 (3.0124)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8359 (0.8348)  time: 0.6279  data: 0.0004  max mem: 54228
Epoch: [155]  [1000/1251]  eta: 0:02:38  lr: 0.002095  min_lr: 0.002095  loss: 3.1102 (3.0209)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7341 (0.8452)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [155]  [1200/1251]  eta: 0:00:32  lr: 0.002091  min_lr: 0.002091  loss: 3.1777 (3.0303)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9949 (0.8584)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [155]  [1250/1251]  eta: 0:00:00  lr: 0.002090  min_lr: 0.002090  loss: 2.7968 (3.0288)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9175 (0.8569)  time: 0.5332  data: 0.0006  max mem: 54228
Epoch: [155] Total time: 0:13:09 (0.6311 s / it)
Averaged stats: lr: 0.002090  min_lr: 0.002090  loss: 2.7968 (3.0267)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9175 (0.8569)
Test:  [ 0/25]  eta: 0:02:35  loss: 0.6370 (0.6370)  acc1: 87.2000 (87.2000)  acc5: 98.8000 (98.8000)  time: 6.2333  data: 5.9031  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7937 (0.7624)  acc1: 85.2000 (84.2182)  acc5: 97.6000 (97.6000)  time: 0.8396  data: 0.5371  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8909 (0.8968)  acc1: 79.2000 (81.1048)  acc5: 95.6000 (96.0191)  time: 0.3001  data: 0.0003  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9621 (0.9048)  acc1: 78.4000 (80.8160)  acc5: 96.0000 (96.0480)  time: 0.3001  data: 0.0002  max mem: 54228
Test: Total time: 0:00:13 (0.5415 s / it)
* Acc@1 81.658 Acc@5 96.114 loss 0.897
Accuracy of the model on the 50000 test images: 81.7%
Max accuracy: 81.69%
Epoch: [156]  [   0/1251]  eta: 1:28:57  lr: 0.002090  min_lr: 0.002090  loss: 2.7545 (2.7545)  weight_decay: 0.0500 (0.0500)  time: 4.2665  data: 3.1944  max mem: 54228
Epoch: [156]  [ 200/1251]  eta: 0:11:20  lr: 0.002087  min_lr: 0.002087  loss: 3.0504 (2.9775)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9849 (0.9209)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [156]  [ 400/1251]  eta: 0:09:03  lr: 0.002083  min_lr: 0.002083  loss: 3.1014 (2.9874)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7104 (0.9206)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [156]  [ 600/1251]  eta: 0:06:53  lr: 0.002079  min_lr: 0.002079  loss: 2.9773 (2.9901)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7131 (0.8712)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [156]  [ 800/1251]  eta: 0:04:46  lr: 0.002076  min_lr: 0.002076  loss: 3.3391 (2.9959)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7592 (0.8717)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [156]  [1000/1251]  eta: 0:02:38  lr: 0.002072  min_lr: 0.002072  loss: 3.1030 (3.0013)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8832 (0.8799)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [156]  [1200/1251]  eta: 0:00:32  lr: 0.002069  min_lr: 0.002069  loss: 3.1064 (3.0045)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7069 (0.8742)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [156]  [1250/1251]  eta: 0:00:00  lr: 0.002068  min_lr: 0.002068  loss: 3.0384 (3.0010)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6008 (0.8653)  time: 0.5413  data: 0.0007  max mem: 54228
Epoch: [156] Total time: 0:13:09 (0.6314 s / it)
Averaged stats: lr: 0.002068  min_lr: 0.002068  loss: 3.0384 (3.0217)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6008 (0.8653)
Test:  [ 0/25]  eta: 0:02:43  loss: 0.6658 (0.6658)  acc1: 87.2000 (87.2000)  acc5: 97.6000 (97.6000)  time: 6.5450  data: 6.2065  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.7564 (0.7546)  acc1: 84.4000 (84.8000)  acc5: 97.6000 (97.3818)  time: 0.8674  data: 0.5645  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8942 (0.8967)  acc1: 78.4000 (81.0286)  acc5: 95.6000 (95.8286)  time: 0.2996  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9731 (0.9018)  acc1: 80.0000 (80.9760)  acc5: 95.2000 (95.8720)  time: 0.2994  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5552 s / it)
* Acc@1 81.588 Acc@5 96.120 loss 0.893
Accuracy of the model on the 50000 test images: 81.6%
Max accuracy: 81.69%
Epoch: [157]  [   0/1251]  eta: 1:28:47  lr: 0.002068  min_lr: 0.002068  loss: 3.4153 (3.4153)  weight_decay: 0.0500 (0.0500)  time: 4.2584  data: 3.2092  max mem: 54228
Epoch: [157]  [ 200/1251]  eta: 0:11:21  lr: 0.002064  min_lr: 0.002064  loss: 3.1970 (3.0144)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8264 (0.8722)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [157]  [ 400/1251]  eta: 0:09:03  lr: 0.002061  min_lr: 0.002061  loss: 2.9706 (3.0346)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9623 (0.8934)  time: 0.6279  data: 0.0005  max mem: 54228
Epoch: [157]  [ 600/1251]  eta: 0:06:53  lr: 0.002057  min_lr: 0.002057  loss: 2.9406 (3.0117)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6748 (0.8953)  time: 0.6330  data: 0.0005  max mem: 54228
Epoch: [157]  [ 800/1251]  eta: 0:04:45  lr: 0.002053  min_lr: 0.002053  loss: 3.1322 (3.0102)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8028 (0.8739)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [157]  [1000/1251]  eta: 0:02:38  lr: 0.002050  min_lr: 0.002050  loss: 2.9314 (3.0155)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8335 (0.9127)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [157]  [1200/1251]  eta: 0:00:32  lr: 0.002046  min_lr: 0.002046  loss: 3.0645 (3.0154)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7101 (0.8884)  time: 0.6366  data: 0.0005  max mem: 54228
Epoch: [157]  [1250/1251]  eta: 0:00:00  lr: 0.002045  min_lr: 0.002045  loss: 3.1188 (3.0116)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8269 (0.8909)  time: 0.5335  data: 0.0007  max mem: 54228
Epoch: [157] Total time: 0:13:10 (0.6316 s / it)
Averaged stats: lr: 0.002045  min_lr: 0.002045  loss: 3.1188 (3.0207)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8269 (0.8909)
Test:  [ 0/25]  eta: 0:02:39  loss: 0.6167 (0.6167)  acc1: 86.4000 (86.4000)  acc5: 98.4000 (98.4000)  time: 6.3678  data: 6.0267  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7510 (0.7232)  acc1: 84.4000 (84.2545)  acc5: 98.0000 (97.7091)  time: 0.8515  data: 0.5482  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8964 (0.8552)  acc1: 79.2000 (81.3333)  acc5: 96.0000 (96.2476)  time: 0.3000  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9379 (0.8647)  acc1: 78.4000 (80.9280)  acc5: 95.6000 (96.1440)  time: 0.3001  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5475 s / it)
* Acc@1 81.752 Acc@5 96.232 loss 0.851
Accuracy of the model on the 50000 test images: 81.8%
Max accuracy: 81.75%
Epoch: [158]  [   0/1251]  eta: 1:17:32  lr: 0.002045  min_lr: 0.002045  loss: 3.3275 (3.3275)  weight_decay: 0.0500 (0.0500)  time: 3.7192  data: 3.0817  max mem: 54228
Epoch: [158]  [ 200/1251]  eta: 0:11:16  lr: 0.002042  min_lr: 0.002042  loss: 3.1648 (3.0363)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7196 (0.9728)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [158]  [ 400/1251]  eta: 0:09:01  lr: 0.002038  min_lr: 0.002038  loss: 3.1559 (3.0483)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7297 (0.9234)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [158]  [ 600/1251]  eta: 0:06:52  lr: 0.002035  min_lr: 0.002035  loss: 3.0425 (3.0520)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9099 (0.8731)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [158]  [ 800/1251]  eta: 0:04:45  lr: 0.002031  min_lr: 0.002031  loss: 3.1248 (3.0395)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7453 (0.8861)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [158]  [1000/1251]  eta: 0:02:38  lr: 0.002027  min_lr: 0.002027  loss: 3.3041 (3.0434)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8478 (0.9063)  time: 0.6287  data: 0.0004  max mem: 54228
Epoch: [158]  [1200/1251]  eta: 0:00:32  lr: 0.002024  min_lr: 0.002024  loss: 3.2425 (3.0516)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8472 (0.8959)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [158]  [1250/1251]  eta: 0:00:00  lr: 0.002023  min_lr: 0.002023  loss: 3.2079 (3.0530)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6064 (0.8886)  time: 0.5336  data: 0.0005  max mem: 54228
Epoch: [158] Total time: 0:13:08 (0.6307 s / it)
Averaged stats: lr: 0.002023  min_lr: 0.002023  loss: 3.2079 (3.0224)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6064 (0.8886)
Test:  [ 0/25]  eta: 0:02:32  loss: 0.6588 (0.6588)  acc1: 87.6000 (87.6000)  acc5: 98.8000 (98.8000)  time: 6.1017  data: 5.7771  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7728 (0.7881)  acc1: 84.0000 (84.6546)  acc5: 98.0000 (97.7091)  time: 0.8269  data: 0.5255  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9323 (0.9217)  acc1: 79.6000 (81.2952)  acc5: 96.0000 (96.1333)  time: 0.2993  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0054 (0.9324)  acc1: 79.2000 (80.8000)  acc5: 95.2000 (96.0800)  time: 0.2993  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5368 s / it)
* Acc@1 81.750 Acc@5 96.204 loss 0.913
Accuracy of the model on the 50000 test images: 81.8%
Max accuracy: 81.75%
Epoch: [159]  [   0/1251]  eta: 1:24:48  lr: 0.002023  min_lr: 0.002023  loss: 3.4409 (3.4409)  weight_decay: 0.0500 (0.0500)  time: 4.0678  data: 2.5383  max mem: 54228
Epoch: [159]  [ 200/1251]  eta: 0:11:18  lr: 0.002019  min_lr: 0.002019  loss: 3.2212 (2.9828)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6650 (0.8648)  time: 0.6285  data: 0.0004  max mem: 54228
Epoch: [159]  [ 400/1251]  eta: 0:09:02  lr: 0.002016  min_lr: 0.002016  loss: 3.0760 (2.9844)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7550 (0.8761)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [159]  [ 600/1251]  eta: 0:06:53  lr: 0.002012  min_lr: 0.002012  loss: 2.9353 (2.9873)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7376 (0.8811)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [159]  [ 800/1251]  eta: 0:04:45  lr: 0.002009  min_lr: 0.002009  loss: 3.1307 (2.9902)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6534 (0.8660)  time: 0.6278  data: 0.0004  max mem: 54228
Epoch: [159]  [1000/1251]  eta: 0:02:38  lr: 0.002005  min_lr: 0.002005  loss: 3.0500 (3.0030)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7073 (0.8631)  time: 0.6279  data: 0.0004  max mem: 54228
Epoch: [159]  [1200/1251]  eta: 0:00:32  lr: 0.002001  min_lr: 0.002001  loss: 3.1730 (3.0031)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [159]  [1250/1251]  eta: 0:00:00  lr: 0.002001  min_lr: 0.002001  loss: 2.8533 (3.0036)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8279 (nan)  time: 0.5332  data: 0.0005  max mem: 54228
Epoch: [159] Total time: 0:13:09 (0.6308 s / it)
Averaged stats: lr: 0.002001  min_lr: 0.002001  loss: 2.8533 (3.0116)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8279 (nan)
Test:  [ 0/25]  eta: 0:02:42  loss: 0.5917 (0.5917)  acc1: 87.6000 (87.6000)  acc5: 98.8000 (98.8000)  time: 6.5053  data: 6.1674  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7710 (0.7664)  acc1: 84.0000 (84.6909)  acc5: 98.0000 (97.6364)  time: 0.8638  data: 0.5610  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9523 (0.9008)  acc1: 80.4000 (81.7714)  acc5: 96.4000 (96.2857)  time: 0.2996  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9825 (0.9129)  acc1: 80.4000 (81.3600)  acc5: 96.0000 (96.1920)  time: 0.2994  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5516 s / it)
* Acc@1 81.956 Acc@5 96.208 loss 0.904
Accuracy of the model on the 50000 test images: 82.0%
Max accuracy: 81.96%
Epoch: [160]  [   0/1251]  eta: 1:15:40  lr: 0.002001  min_lr: 0.002001  loss: 3.1688 (3.1688)  weight_decay: 0.0500 (0.0500)  time: 3.6298  data: 2.9874  max mem: 54228
Epoch: [160]  [ 200/1251]  eta: 0:11:18  lr: 0.001997  min_lr: 0.001997  loss: 2.8962 (3.0020)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6699 (0.8085)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [160]  [ 400/1251]  eta: 0:09:02  lr: 0.001993  min_lr: 0.001993  loss: 3.0993 (2.9763)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6672 (0.8254)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [160]  [ 600/1251]  eta: 0:06:53  lr: 0.001990  min_lr: 0.001990  loss: 3.0705 (2.9822)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9455 (0.8338)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [160]  [ 800/1251]  eta: 0:04:45  lr: 0.001986  min_lr: 0.001986  loss: 3.0220 (2.9888)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7512 (0.8373)  time: 0.6288  data: 0.0004  max mem: 54228
Epoch: [160]  [1000/1251]  eta: 0:02:38  lr: 0.001983  min_lr: 0.001983  loss: 3.2137 (2.9947)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8139 (0.8415)  time: 0.6277  data: 0.0004  max mem: 54228
Epoch: [160]  [1200/1251]  eta: 0:00:32  lr: 0.001979  min_lr: 0.001979  loss: 3.1258 (3.0055)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0123 (0.8782)  time: 0.6333  data: 0.0005  max mem: 54228
Epoch: [160]  [1250/1251]  eta: 0:00:00  lr: 0.001978  min_lr: 0.001978  loss: 3.2599 (3.0111)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0090 (0.8886)  time: 0.5334  data: 0.0006  max mem: 54228
Epoch: [160] Total time: 0:13:09 (0.6309 s / it)
Averaged stats: lr: 0.001978  min_lr: 0.001978  loss: 3.2599 (3.0064)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0090 (0.8886)
Test:  [ 0/25]  eta: 0:02:30  loss: 0.6757 (0.6757)  acc1: 87.2000 (87.2000)  acc5: 98.8000 (98.8000)  time: 6.0293  data: 5.7038  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8236 (0.7920)  acc1: 85.2000 (84.7273)  acc5: 97.6000 (97.4546)  time: 0.8198  data: 0.5188  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9118 (0.9171)  acc1: 80.0000 (81.8095)  acc5: 96.0000 (95.9619)  time: 0.2987  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9892 (0.9303)  acc1: 80.0000 (81.4080)  acc5: 96.0000 (95.9840)  time: 0.2987  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5334 s / it)
* Acc@1 81.782 Acc@5 96.200 loss 0.925
Accuracy of the model on the 50000 test images: 81.8%
Max accuracy: 81.96%
Epoch: [161]  [   0/1251]  eta: 1:21:39  lr: 0.001978  min_lr: 0.001978  loss: 2.3081 (2.3081)  weight_decay: 0.0500 (0.0500)  time: 3.9168  data: 2.8605  max mem: 54228
Epoch: [161]  [ 200/1251]  eta: 0:11:18  lr: 0.001974  min_lr: 0.001974  loss: 3.1708 (3.0194)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0043 (0.8831)  time: 0.6277  data: 0.0004  max mem: 54228
Epoch: [161]  [ 400/1251]  eta: 0:09:01  lr: 0.001971  min_lr: 0.001971  loss: 3.0256 (3.0312)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7705 (0.8744)  time: 0.6276  data: 0.0004  max mem: 54228
Epoch: [161]  [ 600/1251]  eta: 0:06:53  lr: 0.001967  min_lr: 0.001967  loss: 3.0683 (3.0258)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6537 (0.8412)  time: 0.6277  data: 0.0004  max mem: 54228
Epoch: [161]  [ 800/1251]  eta: 0:04:45  lr: 0.001964  min_lr: 0.001964  loss: 2.8067 (3.0228)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8706 (0.8993)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [161]  [1000/1251]  eta: 0:02:38  lr: 0.001960  min_lr: 0.001960  loss: 2.9634 (3.0250)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8435 (nan)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [161]  [1200/1251]  eta: 0:00:32  lr: 0.001956  min_lr: 0.001956  loss: 2.9727 (3.0148)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6560 (nan)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [161]  [1250/1251]  eta: 0:00:00  lr: 0.001956  min_lr: 0.001956  loss: 3.0956 (3.0166)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6560 (nan)  time: 0.5331  data: 0.0005  max mem: 54228
Epoch: [161] Total time: 0:13:08 (0.6306 s / it)
Averaged stats: lr: 0.001956  min_lr: 0.001956  loss: 3.0956 (3.0104)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6560 (nan)
Test:  [ 0/25]  eta: 0:02:36  loss: 0.6408 (0.6408)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 6.2437  data: 5.9107  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7818 (0.7685)  acc1: 85.2000 (84.4364)  acc5: 98.4000 (98.0364)  time: 0.8403  data: 0.5377  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9487 (0.9055)  acc1: 80.4000 (81.3524)  acc5: 96.4000 (96.5143)  time: 0.2998  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9870 (0.9137)  acc1: 79.6000 (81.0400)  acc5: 95.6000 (96.4160)  time: 0.2997  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5419 s / it)
* Acc@1 81.860 Acc@5 96.338 loss 0.904
Accuracy of the model on the 50000 test images: 81.9%
Max accuracy: 81.96%
Epoch: [162]  [   0/1251]  eta: 1:26:52  lr: 0.001956  min_lr: 0.001956  loss: 3.0911 (3.0911)  weight_decay: 0.0500 (0.0500)  time: 4.1663  data: 2.6989  max mem: 54228
Epoch: [162]  [ 200/1251]  eta: 0:11:18  lr: 0.001952  min_lr: 0.001952  loss: 3.1953 (2.9603)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7837 (0.8389)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [162]  [ 400/1251]  eta: 0:09:03  lr: 0.001948  min_lr: 0.001948  loss: 3.1820 (2.9763)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9025 (0.8429)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [162]  [ 600/1251]  eta: 0:06:53  lr: 0.001945  min_lr: 0.001945  loss: 2.8755 (2.9832)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8174 (0.8497)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [162]  [ 800/1251]  eta: 0:04:45  lr: 0.001941  min_lr: 0.001941  loss: 3.1631 (2.9822)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8615 (0.8600)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [162]  [1000/1251]  eta: 0:02:38  lr: 0.001938  min_lr: 0.001938  loss: 3.1365 (2.9841)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7890 (0.8579)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [162]  [1200/1251]  eta: 0:00:32  lr: 0.001934  min_lr: 0.001934  loss: 3.3051 (2.9887)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6675 (0.8634)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [162]  [1250/1251]  eta: 0:00:00  lr: 0.001933  min_lr: 0.001933  loss: 2.9101 (2.9900)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7620 (0.8637)  time: 0.5335  data: 0.0005  max mem: 54228
Epoch: [162] Total time: 0:13:09 (0.6311 s / it)
Averaged stats: lr: 0.001933  min_lr: 0.001933  loss: 2.9101 (3.0030)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7620 (0.8637)
Test:  [ 0/25]  eta: 0:02:42  loss: 0.6029 (0.6029)  acc1: 90.4000 (90.4000)  acc5: 98.8000 (98.8000)  time: 6.5016  data: 6.1633  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7719 (0.7476)  acc1: 84.4000 (85.2000)  acc5: 98.4000 (97.6727)  time: 0.8634  data: 0.5606  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8800 (0.8849)  acc1: 81.2000 (81.9810)  acc5: 96.0000 (96.1714)  time: 0.2995  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9817 (0.8959)  acc1: 80.4000 (81.4560)  acc5: 95.6000 (96.1600)  time: 0.2995  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5529 s / it)
* Acc@1 81.872 Acc@5 96.290 loss 0.889
Accuracy of the model on the 50000 test images: 81.9%
Max accuracy: 81.96%
Epoch: [163]  [   0/1251]  eta: 1:26:10  lr: 0.001933  min_lr: 0.001933  loss: 3.4738 (3.4738)  weight_decay: 0.0500 (0.0500)  time: 4.1329  data: 3.1469  max mem: 54228
Epoch: [163]  [ 200/1251]  eta: 0:11:21  lr: 0.001930  min_lr: 0.001930  loss: 3.0188 (2.9689)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7445 (0.8916)  time: 0.6364  data: 0.0004  max mem: 54228
Epoch: [163]  [ 400/1251]  eta: 0:09:02  lr: 0.001926  min_lr: 0.001926  loss: 3.2458 (2.9839)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7348 (0.8794)  time: 0.6274  data: 0.0005  max mem: 54228
Epoch: [163]  [ 600/1251]  eta: 0:06:53  lr: 0.001922  min_lr: 0.001922  loss: 3.1503 (3.0035)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9063 (0.8941)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [163]  [ 800/1251]  eta: 0:04:45  lr: 0.001919  min_lr: 0.001919  loss: 2.6729 (2.9875)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8065 (0.9051)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [163]  [1000/1251]  eta: 0:02:38  lr: 0.001915  min_lr: 0.001915  loss: 3.1859 (2.9939)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7224 (0.8763)  time: 0.6285  data: 0.0004  max mem: 54228
Epoch: [163]  [1200/1251]  eta: 0:00:32  lr: 0.001912  min_lr: 0.001912  loss: 3.1221 (2.9958)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8478 (0.8871)  time: 0.6367  data: 0.0005  max mem: 54228
Epoch: [163]  [1250/1251]  eta: 0:00:00  lr: 0.001911  min_lr: 0.001911  loss: 3.1083 (2.9956)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2283 (0.8972)  time: 0.5385  data: 0.0007  max mem: 54228
Epoch: [163] Total time: 0:13:09 (0.6309 s / it)
Averaged stats: lr: 0.001911  min_lr: 0.001911  loss: 3.1083 (2.9952)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2283 (0.8972)
Test:  [ 0/25]  eta: 0:02:39  loss: 0.6148 (0.6148)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 6.3983  data: 6.0538  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7790 (0.7768)  acc1: 86.0000 (85.0545)  acc5: 97.6000 (97.6364)  time: 0.8540  data: 0.5506  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9302 (0.9023)  acc1: 80.4000 (81.7333)  acc5: 95.6000 (95.9810)  time: 0.2999  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9729 (0.9081)  acc1: 80.4000 (81.5680)  acc5: 95.6000 (96.0160)  time: 0.3000  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5474 s / it)
* Acc@1 81.864 Acc@5 96.160 loss 0.900
Accuracy of the model on the 50000 test images: 81.9%
Max accuracy: 81.96%
Epoch: [164]  [   0/1251]  eta: 1:26:08  lr: 0.001911  min_lr: 0.001911  loss: 3.0624 (3.0624)  weight_decay: 0.0500 (0.0500)  time: 4.1316  data: 2.0278  max mem: 54228
Epoch: [164]  [ 200/1251]  eta: 0:11:19  lr: 0.001907  min_lr: 0.001907  loss: 2.9504 (2.9469)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7127 (0.8039)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [164]  [ 400/1251]  eta: 0:09:02  lr: 0.001904  min_lr: 0.001904  loss: 3.0922 (2.9680)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7117 (0.7876)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [164]  [ 600/1251]  eta: 0:06:53  lr: 0.001900  min_lr: 0.001900  loss: 3.0779 (2.9645)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9869 (0.8476)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [164]  [ 800/1251]  eta: 0:04:45  lr: 0.001896  min_lr: 0.001896  loss: 3.2012 (2.9728)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9178 (0.8773)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [164]  [1000/1251]  eta: 0:02:38  lr: 0.001893  min_lr: 0.001893  loss: 2.8862 (2.9797)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8374 (nan)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [164]  [1200/1251]  eta: 0:00:32  lr: 0.001889  min_lr: 0.001889  loss: 3.2477 (2.9874)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8798 (nan)  time: 0.6278  data: 0.0005  max mem: 54228
Epoch: [164]  [1250/1251]  eta: 0:00:00  lr: 0.001888  min_lr: 0.001888  loss: 3.1646 (2.9825)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9465 (nan)  time: 0.5331  data: 0.0007  max mem: 54228
Epoch: [164] Total time: 0:13:09 (0.6313 s / it)
Averaged stats: lr: 0.001888  min_lr: 0.001888  loss: 3.1646 (2.9926)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9465 (nan)
Test:  [ 0/25]  eta: 0:02:39  loss: 0.6588 (0.6588)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 6.3873  data: 6.0506  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7808 (0.8091)  acc1: 84.8000 (84.9818)  acc5: 98.0000 (97.8909)  time: 0.8522  data: 0.5504  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9998 (0.9443)  acc1: 80.8000 (80.9905)  acc5: 96.0000 (96.3238)  time: 0.2985  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0091 (0.9528)  acc1: 79.6000 (80.8640)  acc5: 96.0000 (96.3840)  time: 0.2983  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5461 s / it)
* Acc@1 81.874 Acc@5 96.354 loss 0.941
Accuracy of the model on the 50000 test images: 81.9%
Max accuracy: 81.96%
Epoch: [165]  [   0/1251]  eta: 1:23:27  lr: 0.001888  min_lr: 0.001888  loss: 2.9965 (2.9965)  weight_decay: 0.0500 (0.0500)  time: 4.0025  data: 2.7838  max mem: 54228
Epoch: [165]  [ 200/1251]  eta: 0:11:17  lr: 0.001885  min_lr: 0.001885  loss: 3.0770 (2.9594)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2054 (0.9804)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [165]  [ 400/1251]  eta: 0:09:02  lr: 0.001881  min_lr: 0.001881  loss: 3.0118 (2.9623)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7827 (0.9220)  time: 0.6278  data: 0.0004  max mem: 54228
Epoch: [165]  [ 600/1251]  eta: 0:06:53  lr: 0.001878  min_lr: 0.001878  loss: 2.7819 (2.9673)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6994 (0.8861)  time: 0.6278  data: 0.0004  max mem: 54228
Epoch: [165]  [ 800/1251]  eta: 0:04:45  lr: 0.001874  min_lr: 0.001874  loss: 3.0783 (2.9687)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6358 (0.8685)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [165]  [1000/1251]  eta: 0:02:38  lr: 0.001870  min_lr: 0.001870  loss: 2.8022 (2.9756)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9179 (0.8722)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [165]  [1200/1251]  eta: 0:00:32  lr: 0.001867  min_lr: 0.001867  loss: 3.1039 (2.9810)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7415 (0.8716)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [165]  [1250/1251]  eta: 0:00:00  lr: 0.001866  min_lr: 0.001866  loss: 3.2554 (2.9839)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7408 (0.8677)  time: 0.5333  data: 0.0007  max mem: 54228
Epoch: [165] Total time: 0:13:09 (0.6308 s / it)
Averaged stats: lr: 0.001866  min_lr: 0.001866  loss: 3.2554 (2.9818)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7408 (0.8677)
Test:  [ 0/25]  eta: 0:02:36  loss: 0.6211 (0.6211)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 6.2474  data: 5.8944  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7591 (0.7729)  acc1: 84.4000 (85.2364)  acc5: 97.6000 (97.4909)  time: 0.8404  data: 0.5362  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9415 (0.9035)  acc1: 80.4000 (81.8095)  acc5: 95.6000 (96.1905)  time: 0.2995  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9551 (0.9095)  acc1: 80.4000 (81.6320)  acc5: 95.6000 (96.1440)  time: 0.2993  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5411 s / it)
* Acc@1 82.160 Acc@5 96.320 loss 0.905
Accuracy of the model on the 50000 test images: 82.2%
Max accuracy: 82.16%
Epoch: [166]  [   0/1251]  eta: 1:22:06  lr: 0.001866  min_lr: 0.001866  loss: 2.8705 (2.8705)  weight_decay: 0.0500 (0.0500)  time: 3.9383  data: 3.3029  max mem: 54228
Epoch: [166]  [ 200/1251]  eta: 0:11:18  lr: 0.001862  min_lr: 0.001862  loss: 2.8775 (2.8839)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6846 (0.7969)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [166]  [ 400/1251]  eta: 0:09:02  lr: 0.001859  min_lr: 0.001859  loss: 2.9399 (2.9490)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8837 (0.9108)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [166]  [ 600/1251]  eta: 0:06:53  lr: 0.001855  min_lr: 0.001855  loss: 3.2243 (2.9559)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8357 (0.9235)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [166]  [ 800/1251]  eta: 0:04:45  lr: 0.001852  min_lr: 0.001852  loss: 3.0837 (2.9603)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7909 (0.9041)  time: 0.6337  data: 0.0005  max mem: 54228
Epoch: [166]  [1000/1251]  eta: 0:02:38  lr: 0.001848  min_lr: 0.001848  loss: 3.2939 (2.9656)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7625 (0.8837)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [166]  [1200/1251]  eta: 0:00:32  lr: 0.001844  min_lr: 0.001844  loss: 3.0600 (2.9660)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8063 (0.8918)  time: 0.6279  data: 0.0004  max mem: 54228
Epoch: [166]  [1250/1251]  eta: 0:00:00  lr: 0.001844  min_lr: 0.001844  loss: 3.1095 (2.9679)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9156 (0.8914)  time: 0.5372  data: 0.0005  max mem: 54228
Epoch: [166] Total time: 0:13:09 (0.6310 s / it)
Averaged stats: lr: 0.001844  min_lr: 0.001844  loss: 3.1095 (2.9821)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9156 (0.8914)
Test:  [ 0/25]  eta: 0:02:40  loss: 0.5832 (0.5832)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 6.4059  data: 6.0813  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7657 (0.7366)  acc1: 85.6000 (85.7091)  acc5: 98.4000 (97.7455)  time: 0.8548  data: 0.5531  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9242 (0.8717)  acc1: 79.6000 (82.4762)  acc5: 95.6000 (96.3429)  time: 0.2996  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9642 (0.8824)  acc1: 79.6000 (82.0800)  acc5: 95.6000 (96.3200)  time: 0.2995  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5481 s / it)
* Acc@1 82.210 Acc@5 96.402 loss 0.877
Accuracy of the model on the 50000 test images: 82.2%
Max accuracy: 82.21%
Epoch: [167]  [   0/1251]  eta: 1:10:33  lr: 0.001844  min_lr: 0.001844  loss: 2.3108 (2.3108)  weight_decay: 0.0500 (0.0500)  time: 3.3839  data: 2.7414  max mem: 54228
Epoch: [167]  [ 200/1251]  eta: 0:11:17  lr: 0.001840  min_lr: 0.001840  loss: 2.9894 (2.9656)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7078 (0.8819)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [167]  [ 400/1251]  eta: 0:09:01  lr: 0.001836  min_lr: 0.001836  loss: 3.0709 (2.9648)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8320 (0.8536)  time: 0.6277  data: 0.0004  max mem: 54228
Epoch: [167]  [ 600/1251]  eta: 0:06:52  lr: 0.001833  min_lr: 0.001833  loss: 3.0516 (2.9625)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7465 (0.8777)  time: 0.6333  data: 0.0005  max mem: 54228
Epoch: [167]  [ 800/1251]  eta: 0:04:45  lr: 0.001829  min_lr: 0.001829  loss: 3.0083 (2.9660)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9175 (0.8704)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [167]  [1000/1251]  eta: 0:02:38  lr: 0.001826  min_lr: 0.001826  loss: 2.9684 (2.9695)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7756 (0.8640)  time: 0.6285  data: 0.0004  max mem: 54228
Epoch: [167]  [1200/1251]  eta: 0:00:32  lr: 0.001822  min_lr: 0.001822  loss: 3.0792 (2.9697)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8456 (0.8806)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [167]  [1250/1251]  eta: 0:00:00  lr: 0.001821  min_lr: 0.001821  loss: 3.1920 (2.9701)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8047 (0.8810)  time: 0.5386  data: 0.0006  max mem: 54228
Epoch: [167] Total time: 0:13:08 (0.6306 s / it)
Averaged stats: lr: 0.001821  min_lr: 0.001821  loss: 3.1920 (2.9759)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8047 (0.8810)
Test:  [ 0/25]  eta: 0:02:10  loss: 0.6455 (0.6455)  acc1: 88.0000 (88.0000)  acc5: 99.2000 (99.2000)  time: 5.2183  data: 4.8953  max mem: 54228
Test:  [10/25]  eta: 0:00:11  loss: 0.7669 (0.7684)  acc1: 84.8000 (85.7091)  acc5: 98.0000 (97.8182)  time: 0.7981  data: 0.4963  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9690 (0.9095)  acc1: 80.8000 (82.0381)  acc5: 95.6000 (96.2857)  time: 0.3288  data: 0.0282  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0040 (0.9196)  acc1: 79.6000 (81.6480)  acc5: 95.6000 (96.3200)  time: 0.3010  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5247 s / it)
* Acc@1 82.166 Acc@5 96.296 loss 0.911
Accuracy of the model on the 50000 test images: 82.2%
Max accuracy: 82.21%
Epoch: [168]  [   0/1251]  eta: 1:25:28  lr: 0.001821  min_lr: 0.001821  loss: 2.5762 (2.5762)  weight_decay: 0.0500 (0.0500)  time: 4.0998  data: 2.8727  max mem: 54228
Epoch: [168]  [ 200/1251]  eta: 0:11:18  lr: 0.001818  min_lr: 0.001818  loss: 3.0686 (2.9417)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8925 (0.9371)  time: 0.6279  data: 0.0004  max mem: 54228
Epoch: [168]  [ 400/1251]  eta: 0:09:02  lr: 0.001814  min_lr: 0.001814  loss: 3.0614 (2.9466)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8394 (0.9017)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [168]  [ 600/1251]  eta: 0:06:53  lr: 0.001811  min_lr: 0.001811  loss: 2.8851 (2.9623)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8134 (0.9051)  time: 0.6279  data: 0.0004  max mem: 54228
Epoch: [168]  [ 800/1251]  eta: 0:04:45  lr: 0.001807  min_lr: 0.001807  loss: 3.0142 (2.9638)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8239 (0.9047)  time: 0.6279  data: 0.0005  max mem: 54228
Epoch: [168]  [1000/1251]  eta: 0:02:38  lr: 0.001803  min_lr: 0.001803  loss: 3.1293 (2.9735)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8052 (0.9604)  time: 0.6328  data: 0.0005  max mem: 54228
Epoch: [168]  [1200/1251]  eta: 0:00:32  lr: 0.001800  min_lr: 0.001800  loss: 3.1665 (2.9735)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7016 (0.9336)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [168]  [1250/1251]  eta: 0:00:00  lr: 0.001799  min_lr: 0.001799  loss: 3.1471 (2.9733)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7038 (0.9279)  time: 0.5334  data: 0.0007  max mem: 54228
Epoch: [168] Total time: 0:13:08 (0.6307 s / it)
Averaged stats: lr: 0.001799  min_lr: 0.001799  loss: 3.1471 (2.9776)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7038 (0.9279)
Test:  [ 0/25]  eta: 0:02:30  loss: 0.5941 (0.5941)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 6.0012  data: 5.6762  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7341 (0.7375)  acc1: 85.6000 (85.7455)  acc5: 98.0000 (97.8182)  time: 0.8181  data: 0.5163  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8918 (0.8669)  acc1: 80.8000 (82.4191)  acc5: 96.4000 (96.5143)  time: 0.2997  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9447 (0.8748)  acc1: 79.6000 (82.0480)  acc5: 96.0000 (96.5440)  time: 0.2996  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5317 s / it)
* Acc@1 82.238 Acc@5 96.440 loss 0.877
Accuracy of the model on the 50000 test images: 82.2%
Max accuracy: 82.24%
Epoch: [169]  [   0/1251]  eta: 1:15:14  lr: 0.001799  min_lr: 0.001799  loss: 2.4714 (2.4714)  weight_decay: 0.0500 (0.0500)  time: 3.6085  data: 2.9816  max mem: 54228
Epoch: [169]  [ 200/1251]  eta: 0:11:16  lr: 0.001795  min_lr: 0.001795  loss: 3.1242 (2.9655)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7759 (0.8654)  time: 0.6356  data: 0.0005  max mem: 54228
Epoch: [169]  [ 400/1251]  eta: 0:09:01  lr: 0.001792  min_lr: 0.001792  loss: 3.1443 (2.9535)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9279 (0.8825)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [169]  [ 600/1251]  eta: 0:06:52  lr: 0.001788  min_lr: 0.001788  loss: 3.0714 (2.9608)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9359 (0.8667)  time: 0.6283  data: 0.0006  max mem: 54228
Epoch: [169]  [ 800/1251]  eta: 0:04:45  lr: 0.001785  min_lr: 0.001785  loss: 3.0785 (2.9731)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0274 (0.8724)  time: 0.6331  data: 0.0005  max mem: 54228
Epoch: [169]  [1000/1251]  eta: 0:02:38  lr: 0.001781  min_lr: 0.001781  loss: 2.9853 (2.9784)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9759 (0.8722)  time: 0.6276  data: 0.0005  max mem: 54228
Epoch: [169]  [1200/1251]  eta: 0:00:32  lr: 0.001777  min_lr: 0.001777  loss: 3.1119 (2.9773)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9895 (0.8931)  time: 0.6275  data: 0.0005  max mem: 54228
Epoch: [169]  [1250/1251]  eta: 0:00:00  lr: 0.001777  min_lr: 0.001777  loss: 3.1443 (2.9761)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0020 (0.8979)  time: 0.5328  data: 0.0006  max mem: 54228
Epoch: [169] Total time: 0:13:08 (0.6301 s / it)
Averaged stats: lr: 0.001777  min_lr: 0.001777  loss: 3.1443 (2.9681)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0020 (0.8979)
Test:  [ 0/25]  eta: 0:02:01  loss: 0.6046 (0.6046)  acc1: 88.8000 (88.8000)  acc5: 99.6000 (99.6000)  time: 4.8777  data: 4.5366  max mem: 54228
Test:  [10/25]  eta: 0:00:11  loss: 0.7332 (0.7388)  acc1: 85.6000 (85.3818)  acc5: 97.6000 (97.7818)  time: 0.7773  data: 0.4742  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9477 (0.8666)  acc1: 80.4000 (82.0191)  acc5: 96.0000 (96.2476)  time: 0.3333  data: 0.0340  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9177 (0.8734)  acc1: 78.8000 (81.5520)  acc5: 96.8000 (96.3840)  time: 0.2994  data: 0.0001  max mem: 54228
Test: Total time: 0:00:12 (0.5138 s / it)
* Acc@1 82.180 Acc@5 96.388 loss 0.863
Accuracy of the model on the 50000 test images: 82.2%
Max accuracy: 82.24%
Epoch: [170]  [   0/1251]  eta: 1:23:45  lr: 0.001777  min_lr: 0.001777  loss: 3.4811 (3.4811)  weight_decay: 0.0500 (0.0500)  time: 4.0175  data: 3.1668  max mem: 54228
Epoch: [170]  [ 200/1251]  eta: 0:11:20  lr: 0.001773  min_lr: 0.001773  loss: 3.0184 (2.9514)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8837 (0.9251)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [170]  [ 400/1251]  eta: 0:09:02  lr: 0.001769  min_lr: 0.001769  loss: 3.1434 (2.9325)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8513 (0.9603)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [170]  [ 600/1251]  eta: 0:06:53  lr: 0.001766  min_lr: 0.001766  loss: 3.1829 (2.9317)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7342 (0.9163)  time: 0.6286  data: 0.0004  max mem: 54228
Epoch: [170]  [ 800/1251]  eta: 0:04:45  lr: 0.001762  min_lr: 0.001762  loss: 3.1458 (2.9469)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8373 (0.9076)  time: 0.6288  data: 0.0004  max mem: 54228
Epoch: [170]  [1000/1251]  eta: 0:02:38  lr: 0.001759  min_lr: 0.001759  loss: 3.0617 (2.9513)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7066 (0.9347)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [170]  [1200/1251]  eta: 0:00:32  lr: 0.001755  min_lr: 0.001755  loss: 3.2626 (2.9680)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6823 (0.9048)  time: 0.6347  data: 0.0005  max mem: 54228
Epoch: [170]  [1250/1251]  eta: 0:00:00  lr: 0.001754  min_lr: 0.001754  loss: 2.9877 (2.9689)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7450 (0.9014)  time: 0.5335  data: 0.0005  max mem: 54228
Epoch: [170] Total time: 0:13:10 (0.6316 s / it)
Averaged stats: lr: 0.001754  min_lr: 0.001754  loss: 2.9877 (2.9666)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7450 (0.9014)
Test:  [ 0/25]  eta: 0:02:38  loss: 0.6370 (0.6370)  acc1: 89.6000 (89.6000)  acc5: 98.0000 (98.0000)  time: 6.3311  data: 6.0082  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7714 (0.7538)  acc1: 87.6000 (86.4364)  acc5: 98.0000 (97.7818)  time: 0.8479  data: 0.5465  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9252 (0.8892)  acc1: 80.4000 (82.3429)  acc5: 96.0000 (96.3810)  time: 0.2997  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9357 (0.8946)  acc1: 80.4000 (82.0800)  acc5: 96.0000 (96.4000)  time: 0.2997  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5500 s / it)
* Acc@1 82.320 Acc@5 96.378 loss 0.891
Accuracy of the model on the 50000 test images: 82.3%
Max accuracy: 82.32%
Epoch: [171]  [   0/1251]  eta: 1:05:47  lr: 0.001754  min_lr: 0.001754  loss: 3.1325 (3.1325)  weight_decay: 0.0500 (0.0500)  time: 3.1552  data: 2.5267  max mem: 54228
Epoch: [171]  [ 200/1251]  eta: 0:11:14  lr: 0.001751  min_lr: 0.001751  loss: 3.1082 (2.9410)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8861 (0.8724)  time: 0.6358  data: 0.0004  max mem: 54228
Epoch: [171]  [ 400/1251]  eta: 0:09:00  lr: 0.001747  min_lr: 0.001747  loss: 3.1777 (2.9513)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7422 (0.8744)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [171]  [ 600/1251]  eta: 0:06:52  lr: 0.001744  min_lr: 0.001744  loss: 2.8345 (2.9451)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8329 (0.8840)  time: 0.6279  data: 0.0004  max mem: 54228
Epoch: [171]  [ 800/1251]  eta: 0:04:45  lr: 0.001740  min_lr: 0.001740  loss: 3.0093 (2.9542)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6874 (0.8931)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [171]  [1000/1251]  eta: 0:02:38  lr: 0.001737  min_lr: 0.001737  loss: 3.0452 (2.9795)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7753 (0.9063)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [171]  [1200/1251]  eta: 0:00:32  lr: 0.001733  min_lr: 0.001733  loss: 3.0492 (2.9794)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8684 (0.9089)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [171]  [1250/1251]  eta: 0:00:00  lr: 0.001732  min_lr: 0.001732  loss: 2.9202 (2.9797)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0807 (0.9154)  time: 0.5335  data: 0.0007  max mem: 54228
Epoch: [171] Total time: 0:13:08 (0.6302 s / it)
Averaged stats: lr: 0.001732  min_lr: 0.001732  loss: 2.9202 (2.9645)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0807 (0.9154)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.6025 (0.6025)  acc1: 87.6000 (87.6000)  acc5: 98.8000 (98.8000)  time: 5.6779  data: 5.3560  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7151 (0.7299)  acc1: 85.6000 (85.6000)  acc5: 97.6000 (97.6000)  time: 0.8257  data: 0.5244  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9181 (0.8638)  acc1: 81.6000 (82.3810)  acc5: 96.8000 (96.3810)  time: 0.3200  data: 0.0207  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9499 (0.8711)  acc1: 80.4000 (82.1920)  acc5: 96.4000 (96.4000)  time: 0.2995  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5356 s / it)
* Acc@1 82.358 Acc@5 96.432 loss 0.859
Accuracy of the model on the 50000 test images: 82.4%
Max accuracy: 82.36%
Epoch: [172]  [   0/1251]  eta: 1:20:09  lr: 0.001732  min_lr: 0.001732  loss: 3.2740 (3.2740)  weight_decay: 0.0500 (0.0500)  time: 3.8442  data: 3.2198  max mem: 54228
Epoch: [172]  [ 200/1251]  eta: 0:11:16  lr: 0.001729  min_lr: 0.001729  loss: 2.9328 (2.9046)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8203 (0.9768)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [172]  [ 400/1251]  eta: 0:09:02  lr: 0.001725  min_lr: 0.001725  loss: 3.1024 (2.9481)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7980 (0.9693)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [172]  [ 600/1251]  eta: 0:06:53  lr: 0.001721  min_lr: 0.001721  loss: 3.2291 (2.9831)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8566 (0.9597)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [172]  [ 800/1251]  eta: 0:04:45  lr: 0.001718  min_lr: 0.001718  loss: 2.9520 (2.9822)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0449 (0.9903)  time: 0.6365  data: 0.0005  max mem: 54228
Epoch: [172]  [1000/1251]  eta: 0:02:38  lr: 0.001714  min_lr: 0.001714  loss: 2.9472 (2.9808)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7562 (0.9655)  time: 0.6288  data: 0.0004  max mem: 54228
Epoch: [172]  [1200/1251]  eta: 0:00:32  lr: 0.001711  min_lr: 0.001711  loss: 3.2220 (2.9805)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6363 (0.9316)  time: 0.6287  data: 0.0004  max mem: 54228
Epoch: [172]  [1250/1251]  eta: 0:00:00  lr: 0.001710  min_lr: 0.001710  loss: 3.1636 (2.9810)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5930 (0.9230)  time: 0.5335  data: 0.0007  max mem: 54228
Epoch: [172] Total time: 0:13:09 (0.6311 s / it)
Averaged stats: lr: 0.001710  min_lr: 0.001710  loss: 3.1636 (2.9657)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5930 (0.9230)
Test:  [ 0/25]  eta: 0:02:41  loss: 0.6893 (0.6893)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 6.4653  data: 6.1273  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8381 (0.8378)  acc1: 86.0000 (85.6000)  acc5: 97.6000 (97.5636)  time: 0.8602  data: 0.5573  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.0235 (0.9704)  acc1: 81.6000 (82.5333)  acc5: 96.0000 (96.2667)  time: 0.2997  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0286 (0.9799)  acc1: 80.0000 (82.1120)  acc5: 95.6000 (96.1760)  time: 0.2998  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5511 s / it)
* Acc@1 82.402 Acc@5 96.366 loss 0.967
Accuracy of the model on the 50000 test images: 82.4%
Max accuracy: 82.40%
Epoch: [173]  [   0/1251]  eta: 1:12:05  lr: 0.001710  min_lr: 0.001710  loss: 3.3146 (3.3146)  weight_decay: 0.0500 (0.0500)  time: 3.4575  data: 2.8316  max mem: 54228
Epoch: [173]  [ 200/1251]  eta: 0:11:18  lr: 0.001706  min_lr: 0.001706  loss: 3.0972 (2.9575)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6770 (0.7554)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [173]  [ 400/1251]  eta: 0:09:02  lr: 0.001703  min_lr: 0.001703  loss: 3.1237 (2.9519)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8809 (0.8781)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [173]  [ 600/1251]  eta: 0:06:52  lr: 0.001699  min_lr: 0.001699  loss: 3.1430 (2.9355)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8992 (0.8706)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [173]  [ 800/1251]  eta: 0:04:45  lr: 0.001696  min_lr: 0.001696  loss: 3.0906 (2.9340)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0780 (0.9069)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [173]  [1000/1251]  eta: 0:02:38  lr: 0.001692  min_lr: 0.001692  loss: 2.9755 (2.9387)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9337 (0.9114)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [173]  [1200/1251]  eta: 0:00:32  lr: 0.001689  min_lr: 0.001689  loss: 3.1655 (2.9492)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8733 (0.9086)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [173]  [1250/1251]  eta: 0:00:00  lr: 0.001688  min_lr: 0.001688  loss: 3.2584 (2.9497)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6432 (0.8996)  time: 0.5390  data: 0.0008  max mem: 54228
Epoch: [173] Total time: 0:13:09 (0.6308 s / it)
Averaged stats: lr: 0.001688  min_lr: 0.001688  loss: 3.2584 (2.9598)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6432 (0.8996)
Test:  [ 0/25]  eta: 0:02:40  loss: 0.6319 (0.6319)  acc1: 90.0000 (90.0000)  acc5: 98.4000 (98.4000)  time: 6.4373  data: 6.0939  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8106 (0.8073)  acc1: 85.6000 (84.8000)  acc5: 98.4000 (97.6364)  time: 0.8578  data: 0.5544  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.0157 (0.9356)  acc1: 79.6000 (81.7143)  acc5: 96.0000 (96.3810)  time: 0.2998  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0135 (0.9445)  acc1: 79.6000 (81.4720)  acc5: 95.6000 (96.4000)  time: 0.2996  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5487 s / it)
* Acc@1 82.380 Acc@5 96.440 loss 0.929
Accuracy of the model on the 50000 test images: 82.4%
Max accuracy: 82.40%
Epoch: [174]  [   0/1251]  eta: 1:24:39  lr: 0.001688  min_lr: 0.001688  loss: 3.1710 (3.1710)  weight_decay: 0.0500 (0.0500)  time: 4.0603  data: 1.8625  max mem: 54228
Epoch: [174]  [ 200/1251]  eta: 0:11:19  lr: 0.001684  min_lr: 0.001684  loss: 3.0277 (2.9608)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9158 (0.8852)  time: 0.6290  data: 0.0004  max mem: 54228
Epoch: [174]  [ 400/1251]  eta: 0:09:02  lr: 0.001681  min_lr: 0.001681  loss: 3.0197 (2.9386)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7106 (0.9203)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [174]  [ 600/1251]  eta: 0:06:53  lr: 0.001677  min_lr: 0.001677  loss: 2.7952 (2.9441)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7417 (0.9151)  time: 0.6274  data: 0.0004  max mem: 54228
Epoch: [174]  [ 800/1251]  eta: 0:04:45  lr: 0.001674  min_lr: 0.001674  loss: 2.9176 (2.9484)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6818 (0.9124)  time: 0.6275  data: 0.0004  max mem: 54228
Epoch: [174]  [1000/1251]  eta: 0:02:38  lr: 0.001670  min_lr: 0.001670  loss: 3.0433 (2.9461)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1348 (0.9423)  time: 0.6357  data: 0.0004  max mem: 54228
Epoch: [174]  [1200/1251]  eta: 0:00:32  lr: 0.001666  min_lr: 0.001666  loss: 3.0760 (2.9439)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7406 (0.9228)  time: 0.6277  data: 0.0005  max mem: 54228
Epoch: [174]  [1250/1251]  eta: 0:00:00  lr: 0.001666  min_lr: 0.001666  loss: 2.8168 (2.9428)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8945 (0.9222)  time: 0.5331  data: 0.0007  max mem: 54228
Epoch: [174] Total time: 0:13:09 (0.6309 s / it)
Averaged stats: lr: 0.001666  min_lr: 0.001666  loss: 2.8168 (2.9514)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8945 (0.9222)
Test:  [ 0/25]  eta: 0:01:58  loss: 0.5518 (0.5518)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 4.7312  data: 4.3985  max mem: 54228
Test:  [10/25]  eta: 0:00:11  loss: 0.7354 (0.7112)  acc1: 85.6000 (85.9273)  acc5: 98.0000 (97.4909)  time: 0.7720  data: 0.4698  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9136 (0.8380)  acc1: 82.0000 (82.4762)  acc5: 95.6000 (96.1524)  time: 0.3377  data: 0.0385  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9136 (0.8483)  acc1: 81.2000 (82.1280)  acc5: 95.6000 (96.1760)  time: 0.2993  data: 0.0001  max mem: 54228
Test: Total time: 0:00:12 (0.5120 s / it)
* Acc@1 82.298 Acc@5 96.372 loss 0.838
Accuracy of the model on the 50000 test images: 82.3%
Max accuracy: 82.40%
Epoch: [175]  [   0/1251]  eta: 1:27:53  lr: 0.001666  min_lr: 0.001666  loss: 3.1202 (3.1202)  weight_decay: 0.0500 (0.0500)  time: 4.2156  data: 3.1301  max mem: 54228
Epoch: [175]  [ 200/1251]  eta: 0:11:18  lr: 0.001662  min_lr: 0.001662  loss: 3.1548 (2.9477)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7939 (0.8472)  time: 0.6276  data: 0.0005  max mem: 54228
Epoch: [175]  [ 400/1251]  eta: 0:09:03  lr: 0.001658  min_lr: 0.001658  loss: 3.0805 (2.9364)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7677 (0.8414)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [175]  [ 600/1251]  eta: 0:06:53  lr: 0.001655  min_lr: 0.001655  loss: 2.9013 (2.9334)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9446 (0.8930)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [175]  [ 800/1251]  eta: 0:04:45  lr: 0.001651  min_lr: 0.001651  loss: 2.8943 (2.9415)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7939 (0.8939)  time: 0.6279  data: 0.0004  max mem: 54228
Epoch: [175]  [1000/1251]  eta: 0:02:38  lr: 0.001648  min_lr: 0.001648  loss: 3.0781 (2.9445)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7390 (0.8923)  time: 0.6278  data: 0.0005  max mem: 54228
Epoch: [175]  [1200/1251]  eta: 0:00:32  lr: 0.001644  min_lr: 0.001644  loss: 2.9858 (2.9446)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8056 (0.8881)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [175]  [1250/1251]  eta: 0:00:00  lr: 0.001644  min_lr: 0.001644  loss: 3.0970 (2.9440)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8007 (0.8860)  time: 0.5331  data: 0.0006  max mem: 54228
Epoch: [175] Total time: 0:13:09 (0.6311 s / it)
Averaged stats: lr: 0.001644  min_lr: 0.001644  loss: 3.0970 (2.9471)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8007 (0.8860)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.6548 (0.6548)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 5.7731  data: 5.4421  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7810 (0.7879)  acc1: 84.8000 (85.1273)  acc5: 98.0000 (97.7455)  time: 0.8349  data: 0.5333  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9957 (0.9166)  acc1: 81.2000 (81.8857)  acc5: 96.0000 (96.5333)  time: 0.3198  data: 0.0213  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9666 (0.9220)  acc1: 80.8000 (81.6480)  acc5: 96.0000 (96.4960)  time: 0.2987  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5388 s / it)
* Acc@1 82.150 Acc@5 96.356 loss 0.912
Accuracy of the model on the 50000 test images: 82.2%
Max accuracy: 82.40%
Epoch: [176]  [   0/1251]  eta: 1:28:55  lr: 0.001643  min_lr: 0.001643  loss: 2.1553 (2.1553)  weight_decay: 0.0500 (0.0500)  time: 4.2649  data: 1.9956  max mem: 54228
Epoch: [176]  [ 200/1251]  eta: 0:11:20  lr: 0.001640  min_lr: 0.001640  loss: 3.1319 (2.9580)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9236 (0.8842)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [176]  [ 400/1251]  eta: 0:09:03  lr: 0.001636  min_lr: 0.001636  loss: 2.7454 (2.9366)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8278 (0.9294)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [176]  [ 600/1251]  eta: 0:06:53  lr: 0.001633  min_lr: 0.001633  loss: 3.0816 (2.9539)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7628 (0.9627)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [176]  [ 800/1251]  eta: 0:04:46  lr: 0.001629  min_lr: 0.001629  loss: 3.0697 (2.9521)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0303 (0.9889)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [176]  [1000/1251]  eta: 0:02:38  lr: 0.001626  min_lr: 0.001626  loss: 3.0977 (2.9556)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9406 (0.9727)  time: 0.6279  data: 0.0005  max mem: 54228
Epoch: [176]  [1200/1251]  eta: 0:00:32  lr: 0.001622  min_lr: 0.001622  loss: 3.0098 (2.9443)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9676 (0.9883)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [176]  [1250/1251]  eta: 0:00:00  lr: 0.001621  min_lr: 0.001621  loss: 3.1721 (2.9473)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7162 (0.9795)  time: 0.5336  data: 0.0007  max mem: 54228
Epoch: [176] Total time: 0:13:09 (0.6314 s / it)
Averaged stats: lr: 0.001621  min_lr: 0.001621  loss: 3.1721 (2.9387)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7162 (0.9795)
Test:  [ 0/25]  eta: 0:02:34  loss: 0.6631 (0.6631)  acc1: 90.0000 (90.0000)  acc5: 98.8000 (98.8000)  time: 6.1775  data: 5.8550  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7953 (0.8023)  acc1: 86.0000 (85.5273)  acc5: 98.0000 (97.9273)  time: 0.8341  data: 0.5326  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9990 (0.9299)  acc1: 81.2000 (82.0952)  acc5: 95.6000 (96.3238)  time: 0.2996  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0125 (0.9403)  acc1: 79.6000 (81.7440)  acc5: 95.6000 (96.3200)  time: 0.2996  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5393 s / it)
* Acc@1 82.214 Acc@5 96.362 loss 0.929
Accuracy of the model on the 50000 test images: 82.2%
Max accuracy: 82.40%
Epoch: [177]  [   0/1251]  eta: 1:20:38  lr: 0.001621  min_lr: 0.001621  loss: 3.3465 (3.3465)  weight_decay: 0.0500 (0.0500)  time: 3.8677  data: 2.7017  max mem: 54228
Epoch: [177]  [ 200/1251]  eta: 0:11:19  lr: 0.001618  min_lr: 0.001618  loss: 2.8672 (2.9684)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7709 (0.8605)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [177]  [ 400/1251]  eta: 0:09:02  lr: 0.001614  min_lr: 0.001614  loss: 2.7414 (2.9379)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1224 (0.9252)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [177]  [ 600/1251]  eta: 0:06:53  lr: 0.001611  min_lr: 0.001611  loss: 3.1381 (2.9248)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9804 (0.9178)  time: 0.6330  data: 0.0004  max mem: 54228
Epoch: [177]  [ 800/1251]  eta: 0:04:45  lr: 0.001607  min_lr: 0.001607  loss: 3.1020 (2.9345)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7125 (0.8893)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [177]  [1000/1251]  eta: 0:02:38  lr: 0.001604  min_lr: 0.001604  loss: 2.8272 (2.9262)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0353 (0.9373)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [177]  [1200/1251]  eta: 0:00:32  lr: 0.001600  min_lr: 0.001600  loss: 2.9014 (2.9343)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7480 (0.9071)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [177]  [1250/1251]  eta: 0:00:00  lr: 0.001599  min_lr: 0.001599  loss: 2.9826 (2.9371)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8148 (0.9059)  time: 0.5337  data: 0.0009  max mem: 54228
Epoch: [177] Total time: 0:13:09 (0.6311 s / it)
Averaged stats: lr: 0.001599  min_lr: 0.001599  loss: 2.9826 (2.9369)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8148 (0.9059)
Test:  [ 0/25]  eta: 0:02:47  loss: 0.5929 (0.5929)  acc1: 88.4000 (88.4000)  acc5: 98.8000 (98.8000)  time: 6.7078  data: 6.3800  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.7982 (0.7559)  acc1: 84.4000 (85.3455)  acc5: 97.6000 (97.5273)  time: 0.8825  data: 0.5803  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 0.9400 (0.8884)  acc1: 80.8000 (82.0191)  acc5: 96.0000 (96.3429)  time: 0.3000  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9475 (0.8920)  acc1: 80.8000 (82.0000)  acc5: 96.0000 (96.3360)  time: 0.3001  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5597 s / it)
* Acc@1 82.380 Acc@5 96.400 loss 0.885
Accuracy of the model on the 50000 test images: 82.4%
Max accuracy: 82.40%
Epoch: [178]  [   0/1251]  eta: 1:27:59  lr: 0.001599  min_lr: 0.001599  loss: 3.3839 (3.3839)  weight_decay: 0.0500 (0.0500)  time: 4.2199  data: 3.3987  max mem: 54228
Epoch: [178]  [ 200/1251]  eta: 0:11:19  lr: 0.001596  min_lr: 0.001596  loss: 2.9448 (2.9016)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8745 (0.9030)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [178]  [ 400/1251]  eta: 0:09:02  lr: 0.001592  min_lr: 0.001592  loss: 2.9561 (2.9140)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0059 (0.9392)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [178]  [ 600/1251]  eta: 0:06:53  lr: 0.001589  min_lr: 0.001589  loss: 3.1333 (2.9309)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7903 (0.9124)  time: 0.6289  data: 0.0005  max mem: 54228
Epoch: [178]  [ 800/1251]  eta: 0:04:45  lr: 0.001585  min_lr: 0.001585  loss: 2.9956 (2.9271)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8992 (0.9061)  time: 0.6294  data: 0.0005  max mem: 54228
Epoch: [178]  [1000/1251]  eta: 0:02:38  lr: 0.001582  min_lr: 0.001582  loss: 2.9194 (2.9266)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9265 (0.9196)  time: 0.6373  data: 0.0005  max mem: 54228
Epoch: [178]  [1200/1251]  eta: 0:00:32  lr: 0.001578  min_lr: 0.001578  loss: 2.9447 (2.9284)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7924 (0.9068)  time: 0.6334  data: 0.0004  max mem: 54228
Epoch: [178]  [1250/1251]  eta: 0:00:00  lr: 0.001578  min_lr: 0.001578  loss: 3.0381 (2.9296)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1432 (0.9164)  time: 0.5337  data: 0.0005  max mem: 54228
Epoch: [178] Total time: 0:13:10 (0.6316 s / it)
Averaged stats: lr: 0.001578  min_lr: 0.001578  loss: 3.0381 (2.9358)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1432 (0.9164)
Test:  [ 0/25]  eta: 0:02:43  loss: 0.5868 (0.5868)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 6.5448  data: 6.2108  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.8121 (0.7832)  acc1: 85.6000 (85.0546)  acc5: 97.6000 (97.4545)  time: 0.8673  data: 0.5649  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9602 (0.9142)  acc1: 80.4000 (81.9238)  acc5: 96.0000 (96.3619)  time: 0.2995  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9839 (0.9189)  acc1: 80.4000 (81.8880)  acc5: 96.0000 (96.3520)  time: 0.2994  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5534 s / it)
* Acc@1 82.430 Acc@5 96.420 loss 0.906
Accuracy of the model on the 50000 test images: 82.4%
Max accuracy: 82.43%
Epoch: [179]  [   0/1251]  eta: 1:16:19  lr: 0.001577  min_lr: 0.001577  loss: 2.6884 (2.6884)  weight_decay: 0.0500 (0.0500)  time: 3.6609  data: 3.0260  max mem: 54228
Epoch: [179]  [ 200/1251]  eta: 0:11:17  lr: 0.001574  min_lr: 0.001574  loss: 3.1584 (2.9207)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6741 (0.8602)  time: 0.6386  data: 0.0005  max mem: 54228
Epoch: [179]  [ 400/1251]  eta: 0:09:02  lr: 0.001570  min_lr: 0.001570  loss: 3.1005 (2.9304)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9108 (0.8781)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [179]  [ 600/1251]  eta: 0:06:52  lr: 0.001567  min_lr: 0.001567  loss: 3.0199 (2.9159)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8354 (0.9532)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [179]  [ 800/1251]  eta: 0:04:45  lr: 0.001563  min_lr: 0.001563  loss: 3.1644 (2.9211)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9509 (0.9854)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [179]  [1000/1251]  eta: 0:02:38  lr: 0.001560  min_lr: 0.001560  loss: 3.1702 (2.9102)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9307 (0.9734)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [179]  [1200/1251]  eta: 0:00:32  lr: 0.001556  min_lr: 0.001556  loss: 2.9601 (2.9137)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8038 (0.9833)  time: 0.6288  data: 0.0004  max mem: 54228
Epoch: [179]  [1250/1251]  eta: 0:00:00  lr: 0.001556  min_lr: 0.001556  loss: 3.1748 (2.9163)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8207 (0.9856)  time: 0.5335  data: 0.0004  max mem: 54228
Epoch: [179] Total time: 0:13:08 (0.6306 s / it)
Averaged stats: lr: 0.001556  min_lr: 0.001556  loss: 3.1748 (2.9325)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8207 (0.9856)
Test:  [ 0/25]  eta: 0:02:54  loss: 0.5713 (0.5713)  acc1: 90.4000 (90.4000)  acc5: 98.4000 (98.4000)  time: 6.9787  data: 6.6530  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.6790 (0.7121)  acc1: 86.0000 (85.8182)  acc5: 98.4000 (97.7818)  time: 0.9068  data: 0.6051  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 0.9241 (0.8371)  acc1: 82.0000 (82.7810)  acc5: 96.0000 (96.4381)  time: 0.2994  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9212 (0.8465)  acc1: 81.2000 (82.4960)  acc5: 96.0000 (96.4160)  time: 0.2992  data: 0.0001  max mem: 54228
Test: Total time: 0:00:14 (0.5715 s / it)
* Acc@1 82.708 Acc@5 96.456 loss 0.842
Accuracy of the model on the 50000 test images: 82.7%
Max accuracy: 82.71%
Epoch: [180]  [   0/1251]  eta: 1:13:39  lr: 0.001556  min_lr: 0.001556  loss: 3.2500 (3.2500)  weight_decay: 0.0500 (0.0500)  time: 3.5329  data: 2.8970  max mem: 54228
Epoch: [180]  [ 200/1251]  eta: 0:11:19  lr: 0.001552  min_lr: 0.001552  loss: 3.0137 (2.8908)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [180]  [ 400/1251]  eta: 0:09:02  lr: 0.001549  min_lr: 0.001549  loss: 2.8960 (2.9014)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8315 (nan)  time: 0.6285  data: 0.0004  max mem: 54228
Epoch: [180]  [ 600/1251]  eta: 0:06:53  lr: 0.001545  min_lr: 0.001545  loss: 2.8892 (2.9209)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6734 (nan)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [180]  [ 800/1251]  eta: 0:04:45  lr: 0.001542  min_lr: 0.001542  loss: 3.2301 (2.9335)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8483 (nan)  time: 0.6286  data: 0.0004  max mem: 54228
Epoch: [180]  [1000/1251]  eta: 0:02:38  lr: 0.001538  min_lr: 0.001538  loss: 3.2073 (2.9442)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9822 (nan)  time: 0.6287  data: 0.0004  max mem: 54228
Epoch: [180]  [1200/1251]  eta: 0:00:32  lr: 0.001535  min_lr: 0.001535  loss: 3.1244 (2.9383)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0484 (nan)  time: 0.6394  data: 0.0005  max mem: 54228
Epoch: [180]  [1250/1251]  eta: 0:00:00  lr: 0.001534  min_lr: 0.001534  loss: 3.0557 (2.9413)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8657 (nan)  time: 0.5332  data: 0.0005  max mem: 54228
Epoch: [180] Total time: 0:13:10 (0.6317 s / it)
Averaged stats: lr: 0.001534  min_lr: 0.001534  loss: 3.0557 (2.9259)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8657 (nan)
Test:  [ 0/25]  eta: 0:02:36  loss: 0.5935 (0.5935)  acc1: 88.4000 (88.4000)  acc5: 99.2000 (99.2000)  time: 6.2461  data: 5.9070  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7434 (0.7292)  acc1: 86.0000 (85.4909)  acc5: 98.0000 (97.8909)  time: 0.8403  data: 0.5373  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9098 (0.8653)  acc1: 80.8000 (82.3429)  acc5: 96.4000 (96.4571)  time: 0.2995  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9390 (0.8746)  acc1: 80.0000 (82.0800)  acc5: 96.4000 (96.4800)  time: 0.2994  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5415 s / it)
* Acc@1 82.468 Acc@5 96.434 loss 0.870
Accuracy of the model on the 50000 test images: 82.5%
Max accuracy: 82.71%
Epoch: [181]  [   0/1251]  eta: 1:26:13  lr: 0.001534  min_lr: 0.001534  loss: 3.3484 (3.3484)  weight_decay: 0.0500 (0.0500)  time: 4.1352  data: 3.3758  max mem: 54228
Epoch: [181]  [ 200/1251]  eta: 0:11:18  lr: 0.001530  min_lr: 0.001530  loss: 2.9364 (2.9428)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7308 (0.7987)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [181]  [ 400/1251]  eta: 0:09:02  lr: 0.001527  min_lr: 0.001527  loss: 3.0224 (2.9324)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7210 (0.8036)  time: 0.6385  data: 0.0005  max mem: 54228
Epoch: [181]  [ 600/1251]  eta: 0:06:53  lr: 0.001523  min_lr: 0.001523  loss: 2.8670 (2.9277)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8739 (0.8543)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [181]  [ 800/1251]  eta: 0:04:45  lr: 0.001520  min_lr: 0.001520  loss: 2.7499 (2.9264)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9999 (0.9232)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [181]  [1000/1251]  eta: 0:02:38  lr: 0.001516  min_lr: 0.001516  loss: 3.0849 (2.9339)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7610 (0.9199)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [181]  [1200/1251]  eta: 0:00:32  lr: 0.001513  min_lr: 0.001513  loss: 3.0599 (2.9355)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8457 (0.9135)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [181]  [1250/1251]  eta: 0:00:00  lr: 0.001512  min_lr: 0.001512  loss: 3.0899 (2.9367)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7714 (0.9076)  time: 0.5335  data: 0.0007  max mem: 54228
Epoch: [181] Total time: 0:13:09 (0.6312 s / it)
Averaged stats: lr: 0.001512  min_lr: 0.001512  loss: 3.0899 (2.9237)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7714 (0.9076)
Test:  [ 0/25]  eta: 0:02:29  loss: 0.6465 (0.6465)  acc1: 91.6000 (91.6000)  acc5: 98.0000 (98.0000)  time: 5.9790  data: 5.6546  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7958 (0.7777)  acc1: 86.8000 (86.4000)  acc5: 97.6000 (97.6364)  time: 0.8159  data: 0.5144  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9887 (0.9138)  acc1: 81.2000 (82.8571)  acc5: 96.4000 (96.4571)  time: 0.2995  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9821 (0.9230)  acc1: 80.4000 (82.5120)  acc5: 96.4000 (96.4800)  time: 0.2994  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5328 s / it)
* Acc@1 82.614 Acc@5 96.496 loss 0.920
Accuracy of the model on the 50000 test images: 82.6%
Max accuracy: 82.71%
Epoch: [182]  [   0/1251]  eta: 1:29:14  lr: 0.001512  min_lr: 0.001512  loss: 2.8136 (2.8136)  weight_decay: 0.0500 (0.0500)  time: 4.2802  data: 1.8703  max mem: 54228
Epoch: [182]  [ 200/1251]  eta: 0:11:20  lr: 0.001508  min_lr: 0.001508  loss: 3.0820 (2.9415)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9628 (0.9312)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [182]  [ 400/1251]  eta: 0:09:04  lr: 0.001505  min_lr: 0.001505  loss: 2.9477 (2.9075)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8012 (0.9132)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [182]  [ 600/1251]  eta: 0:06:53  lr: 0.001501  min_lr: 0.001501  loss: 3.1578 (2.9247)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8198 (0.9368)  time: 0.6277  data: 0.0005  max mem: 54228
Epoch: [182]  [ 800/1251]  eta: 0:04:45  lr: 0.001498  min_lr: 0.001498  loss: 2.9761 (2.9234)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7386 (0.9129)  time: 0.6276  data: 0.0004  max mem: 54228
Epoch: [182]  [1000/1251]  eta: 0:02:38  lr: 0.001495  min_lr: 0.001495  loss: 3.0531 (2.9211)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7966 (0.9040)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [182]  [1200/1251]  eta: 0:00:32  lr: 0.001491  min_lr: 0.001491  loss: 2.9134 (2.9158)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7580 (0.9142)  time: 0.6272  data: 0.0004  max mem: 54228
Epoch: [182]  [1250/1251]  eta: 0:00:00  lr: 0.001490  min_lr: 0.001490  loss: 2.9391 (2.9148)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8092 (0.9162)  time: 0.5377  data: 0.0005  max mem: 54228
Epoch: [182] Total time: 0:13:09 (0.6311 s / it)
Averaged stats: lr: 0.001490  min_lr: 0.001490  loss: 2.9391 (2.9130)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8092 (0.9162)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.6206 (0.6206)  acc1: 89.6000 (89.6000)  acc5: 99.2000 (99.2000)  time: 5.3077  data: 4.9785  max mem: 54228
Test:  [10/25]  eta: 0:00:11  loss: 0.7573 (0.7513)  acc1: 86.4000 (85.8545)  acc5: 98.0000 (97.7455)  time: 0.7544  data: 0.4531  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9198 (0.8770)  acc1: 81.6000 (82.5714)  acc5: 96.0000 (96.4381)  time: 0.2987  data: 0.0004  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9396 (0.8840)  acc1: 80.8000 (82.3200)  acc5: 95.6000 (96.3840)  time: 0.2984  data: 0.0002  max mem: 54228
Test: Total time: 0:00:12 (0.5060 s / it)
* Acc@1 82.588 Acc@5 96.486 loss 0.876
Accuracy of the model on the 50000 test images: 82.6%
Max accuracy: 82.71%
Epoch: [183]  [   0/1251]  eta: 1:28:23  lr: 0.001490  min_lr: 0.001490  loss: 2.7503 (2.7503)  weight_decay: 0.0500 (0.0500)  time: 4.2391  data: 2.8517  max mem: 54228
Epoch: [183]  [ 200/1251]  eta: 0:11:21  lr: 0.001487  min_lr: 0.001487  loss: 3.0867 (2.8969)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8351 (0.9050)  time: 0.6383  data: 0.0005  max mem: 54228
Epoch: [183]  [ 400/1251]  eta: 0:09:03  lr: 0.001483  min_lr: 0.001483  loss: 2.6189 (2.9014)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7336 (0.9521)  time: 0.6285  data: 0.0004  max mem: 54228
Epoch: [183]  [ 600/1251]  eta: 0:06:53  lr: 0.001480  min_lr: 0.001480  loss: 3.0925 (2.9055)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0446 (0.9752)  time: 0.6357  data: 0.0004  max mem: 54228
Epoch: [183]  [ 800/1251]  eta: 0:04:45  lr: 0.001476  min_lr: 0.001476  loss: 3.1454 (2.9065)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8686 (0.9481)  time: 0.6279  data: 0.0005  max mem: 54228
Epoch: [183]  [1000/1251]  eta: 0:02:38  lr: 0.001473  min_lr: 0.001473  loss: 3.0110 (2.9020)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8275 (0.9462)  time: 0.6279  data: 0.0004  max mem: 54228
Epoch: [183]  [1200/1251]  eta: 0:00:32  lr: 0.001469  min_lr: 0.001469  loss: 3.0846 (2.9090)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8968 (0.9427)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [183]  [1250/1251]  eta: 0:00:00  lr: 0.001469  min_lr: 0.001469  loss: 3.2141 (2.9160)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9128 (0.9440)  time: 0.5334  data: 0.0005  max mem: 54228
Epoch: [183] Total time: 0:13:09 (0.6312 s / it)
Averaged stats: lr: 0.001469  min_lr: 0.001469  loss: 3.2141 (2.9143)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9128 (0.9440)
Test:  [ 0/25]  eta: 0:02:35  loss: 0.6492 (0.6492)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 6.2166  data: 5.8690  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7659 (0.8003)  acc1: 85.6000 (85.6727)  acc5: 97.6000 (97.6000)  time: 0.8377  data: 0.5339  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9642 (0.9248)  acc1: 80.0000 (82.3429)  acc5: 96.4000 (96.3619)  time: 0.2998  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9805 (0.9320)  acc1: 80.0000 (82.0320)  acc5: 96.0000 (96.3520)  time: 0.3000  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5425 s / it)
* Acc@1 82.618 Acc@5 96.438 loss 0.926
Accuracy of the model on the 50000 test images: 82.6%
Max accuracy: 82.71%
Epoch: [184]  [   0/1251]  eta: 1:21:13  lr: 0.001469  min_lr: 0.001469  loss: 2.2865 (2.2865)  weight_decay: 0.0500 (0.0500)  time: 3.8955  data: 2.6525  max mem: 54228
Epoch: [184]  [ 200/1251]  eta: 0:11:19  lr: 0.001465  min_lr: 0.001465  loss: 3.0523 (2.8800)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8118 (0.9250)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [184]  [ 400/1251]  eta: 0:09:02  lr: 0.001462  min_lr: 0.001462  loss: 3.1080 (2.8935)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0476 (0.9448)  time: 0.6278  data: 0.0006  max mem: 54228
Epoch: [184]  [ 600/1251]  eta: 0:06:53  lr: 0.001458  min_lr: 0.001458  loss: 3.0216 (2.9117)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8277 (0.9460)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [184]  [ 800/1251]  eta: 0:04:45  lr: 0.001455  min_lr: 0.001455  loss: 3.0303 (2.9109)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6716 (0.9296)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [184]  [1000/1251]  eta: 0:02:38  lr: 0.001451  min_lr: 0.001451  loss: 3.0522 (2.9177)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0237 (0.9272)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [184]  [1200/1251]  eta: 0:00:32  lr: 0.001448  min_lr: 0.001448  loss: 2.9674 (2.9158)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0203 (0.9319)  time: 0.6276  data: 0.0005  max mem: 54228
Epoch: [184]  [1250/1251]  eta: 0:00:00  lr: 0.001447  min_lr: 0.001447  loss: 2.9899 (2.9139)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8778 (0.9302)  time: 0.5332  data: 0.0006  max mem: 54228
Epoch: [184] Total time: 0:13:09 (0.6313 s / it)
Averaged stats: lr: 0.001447  min_lr: 0.001447  loss: 2.9899 (2.9124)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8778 (0.9302)
Test:  [ 0/25]  eta: 0:02:50  loss: 0.5681 (0.5681)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 6.8077  data: 6.4961  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.7269 (0.7307)  acc1: 86.8000 (85.5273)  acc5: 97.6000 (97.4909)  time: 0.8958  data: 0.5908  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 0.9045 (0.8689)  acc1: 80.0000 (82.3048)  acc5: 95.6000 (96.2476)  time: 0.3053  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9573 (0.8769)  acc1: 80.4000 (82.1760)  acc5: 95.6000 (96.2400)  time: 0.3058  data: 0.0001  max mem: 54228
Test: Total time: 0:00:14 (0.5736 s / it)
* Acc@1 82.712 Acc@5 96.490 loss 0.861
Accuracy of the model on the 50000 test images: 82.7%
Max accuracy: 82.71%
Epoch: [185]  [   0/1251]  eta: 1:16:42  lr: 0.001447  min_lr: 0.001447  loss: 3.2288 (3.2288)  weight_decay: 0.0500 (0.0500)  time: 3.6788  data: 3.0503  max mem: 54228
Epoch: [185]  [ 200/1251]  eta: 0:11:15  lr: 0.001444  min_lr: 0.001444  loss: 2.8642 (2.8154)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8152 (0.9747)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [185]  [ 400/1251]  eta: 0:09:01  lr: 0.001440  min_lr: 0.001440  loss: 3.0819 (2.8417)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7584 (0.9201)  time: 0.6360  data: 0.0004  max mem: 54228
Epoch: [185]  [ 600/1251]  eta: 0:06:52  lr: 0.001437  min_lr: 0.001437  loss: 3.0250 (2.8662)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8484 (0.8964)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [185]  [ 800/1251]  eta: 0:04:45  lr: 0.001433  min_lr: 0.001433  loss: 3.0364 (2.8734)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8280 (0.8881)  time: 0.6287  data: 0.0004  max mem: 54228
Epoch: [185]  [1000/1251]  eta: 0:02:38  lr: 0.001430  min_lr: 0.001430  loss: 2.9683 (2.8746)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9526 (0.8883)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [185]  [1200/1251]  eta: 0:00:32  lr: 0.001426  min_lr: 0.001426  loss: 3.1351 (2.8788)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7608 (0.8941)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [185]  [1250/1251]  eta: 0:00:00  lr: 0.001426  min_lr: 0.001426  loss: 2.8859 (2.8806)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8726 (0.9029)  time: 0.5332  data: 0.0005  max mem: 54228
Epoch: [185] Total time: 0:13:08 (0.6307 s / it)
Averaged stats: lr: 0.001426  min_lr: 0.001426  loss: 2.8859 (2.9013)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8726 (0.9029)
Test:  [ 0/25]  eta: 0:02:50  loss: 0.5919 (0.5919)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 6.8014  data: 6.4658  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.7402 (0.7469)  acc1: 84.0000 (85.3818)  acc5: 98.0000 (97.7091)  time: 0.8900  data: 0.5881  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 0.9653 (0.8810)  acc1: 80.8000 (82.4191)  acc5: 95.6000 (96.3619)  time: 0.2987  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9653 (0.8899)  acc1: 81.2000 (82.1600)  acc5: 95.6000 (96.3040)  time: 0.2985  data: 0.0001  max mem: 54228
Test: Total time: 0:00:14 (0.5634 s / it)
* Acc@1 82.692 Acc@5 96.554 loss 0.876
Accuracy of the model on the 50000 test images: 82.7%
Max accuracy: 82.71%
Epoch: [186]  [   0/1251]  eta: 1:22:52  lr: 0.001425  min_lr: 0.001425  loss: 2.8804 (2.8804)  weight_decay: 0.0500 (0.0500)  time: 3.9749  data: 3.1830  max mem: 54228
Epoch: [186]  [ 200/1251]  eta: 0:11:19  lr: 0.001422  min_lr: 0.001422  loss: 2.8404 (2.8650)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9296 (1.0299)  time: 0.6354  data: 0.0004  max mem: 54228
Epoch: [186]  [ 400/1251]  eta: 0:09:03  lr: 0.001419  min_lr: 0.001419  loss: 2.8402 (2.8856)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9095 (0.9948)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [186]  [ 600/1251]  eta: 0:06:53  lr: 0.001415  min_lr: 0.001415  loss: 3.0492 (2.8845)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8221 (0.9783)  time: 0.6285  data: 0.0004  max mem: 54228
Epoch: [186]  [ 800/1251]  eta: 0:04:46  lr: 0.001412  min_lr: 0.001412  loss: 3.0797 (2.8929)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0013 (0.9648)  time: 0.6288  data: 0.0004  max mem: 54228
Epoch: [186]  [1000/1251]  eta: 0:02:39  lr: 0.001408  min_lr: 0.001408  loss: 2.7884 (2.8945)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8231 (0.9522)  time: 0.6287  data: 0.0004  max mem: 54228
Epoch: [186]  [1200/1251]  eta: 0:00:32  lr: 0.001405  min_lr: 0.001405  loss: 3.0173 (2.8970)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2234 (0.9945)  time: 0.6362  data: 0.0004  max mem: 54228
Epoch: [186]  [1250/1251]  eta: 0:00:00  lr: 0.001404  min_lr: 0.001404  loss: 3.0550 (2.8984)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8930 (0.9929)  time: 0.5333  data: 0.0007  max mem: 54228
Epoch: [186] Total time: 0:13:10 (0.6319 s / it)
Averaged stats: lr: 0.001404  min_lr: 0.001404  loss: 3.0550 (2.9048)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8930 (0.9929)
Test:  [ 0/25]  eta: 0:02:44  loss: 0.6460 (0.6460)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 6.5897  data: 6.2432  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.7579 (0.7657)  acc1: 86.0000 (85.8182)  acc5: 98.4000 (97.8182)  time: 0.8708  data: 0.5679  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.0021 (0.9040)  acc1: 80.8000 (82.0952)  acc5: 96.0000 (96.4381)  time: 0.2987  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9480 (0.9114)  acc1: 80.4000 (81.8720)  acc5: 96.4000 (96.4960)  time: 0.2985  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5544 s / it)
* Acc@1 82.662 Acc@5 96.538 loss 0.902
Accuracy of the model on the 50000 test images: 82.7%
Max accuracy: 82.71%
Epoch: [187]  [   0/1251]  eta: 1:26:21  lr: 0.001404  min_lr: 0.001404  loss: 2.9760 (2.9760)  weight_decay: 0.0500 (0.0500)  time: 4.1416  data: 3.4553  max mem: 54228
Epoch: [187]  [ 200/1251]  eta: 0:11:20  lr: 0.001401  min_lr: 0.001401  loss: 3.0694 (2.9315)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8103 (0.8560)  time: 0.6274  data: 0.0004  max mem: 54228
Epoch: [187]  [ 400/1251]  eta: 0:09:02  lr: 0.001397  min_lr: 0.001397  loss: 2.7331 (2.9016)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0506 (nan)  time: 0.6278  data: 0.0004  max mem: 54228
Epoch: [187]  [ 600/1251]  eta: 0:06:53  lr: 0.001394  min_lr: 0.001394  loss: 2.7651 (2.8940)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8068 (nan)  time: 0.6398  data: 0.0004  max mem: 54228
Epoch: [187]  [ 800/1251]  eta: 0:04:45  lr: 0.001390  min_lr: 0.001390  loss: 3.1538 (2.9081)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1096 (nan)  time: 0.6279  data: 0.0005  max mem: 54228
Epoch: [187]  [1000/1251]  eta: 0:02:38  lr: 0.001387  min_lr: 0.001387  loss: 3.0405 (2.9035)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8291 (nan)  time: 0.6279  data: 0.0005  max mem: 54228
Epoch: [187]  [1200/1251]  eta: 0:00:32  lr: 0.001383  min_lr: 0.001383  loss: 2.9962 (2.9053)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7474 (nan)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [187]  [1250/1251]  eta: 0:00:00  lr: 0.001383  min_lr: 0.001383  loss: 3.0949 (2.9034)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7474 (nan)  time: 0.5333  data: 0.0007  max mem: 54228
Epoch: [187] Total time: 0:13:09 (0.6310 s / it)
Averaged stats: lr: 0.001383  min_lr: 0.001383  loss: 3.0949 (2.8905)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7474 (nan)
Test:  [ 0/25]  eta: 0:02:45  loss: 0.5816 (0.5816)  acc1: 90.4000 (90.4000)  acc5: 99.6000 (99.6000)  time: 6.6225  data: 6.2828  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.7199 (0.7206)  acc1: 86.8000 (85.7818)  acc5: 98.0000 (97.9273)  time: 0.8743  data: 0.5715  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 0.9056 (0.8611)  acc1: 80.8000 (82.4952)  acc5: 96.0000 (96.5905)  time: 0.2994  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9358 (0.8694)  acc1: 80.8000 (82.2560)  acc5: 96.0000 (96.5280)  time: 0.2994  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5560 s / it)
* Acc@1 82.796 Acc@5 96.516 loss 0.863
Accuracy of the model on the 50000 test images: 82.8%
Max accuracy: 82.80%
Epoch: [188]  [   0/1251]  eta: 1:14:07  lr: 0.001383  min_lr: 0.001383  loss: 3.3841 (3.3841)  weight_decay: 0.0500 (0.0500)  time: 3.5554  data: 2.9152  max mem: 54228
Epoch: [188]  [ 200/1251]  eta: 0:11:15  lr: 0.001379  min_lr: 0.001379  loss: 2.7114 (2.8564)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7994 (0.9707)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [188]  [ 400/1251]  eta: 0:09:01  lr: 0.001376  min_lr: 0.001376  loss: 2.9602 (2.8628)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8985 (1.0127)  time: 0.6285  data: 0.0004  max mem: 54228
Epoch: [188]  [ 600/1251]  eta: 0:06:52  lr: 0.001372  min_lr: 0.001372  loss: 2.8502 (2.8795)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8623 (0.9954)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [188]  [ 800/1251]  eta: 0:04:45  lr: 0.001369  min_lr: 0.001369  loss: 2.9756 (2.8966)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8154 (0.9912)  time: 0.6352  data: 0.0004  max mem: 54228
Epoch: [188]  [1000/1251]  eta: 0:02:38  lr: 0.001366  min_lr: 0.001366  loss: 2.8082 (2.8874)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8084 (0.9709)  time: 0.6345  data: 0.0004  max mem: 54228
Epoch: [188]  [1200/1251]  eta: 0:00:32  lr: 0.001362  min_lr: 0.001362  loss: 3.1368 (2.8865)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7847 (0.9724)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [188]  [1250/1251]  eta: 0:00:00  lr: 0.001361  min_lr: 0.001361  loss: 3.1235 (2.8878)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7815 (0.9763)  time: 0.5331  data: 0.0006  max mem: 54228
Epoch: [188] Total time: 0:13:09 (0.6307 s / it)
Averaged stats: lr: 0.001361  min_lr: 0.001361  loss: 3.1235 (2.8905)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7815 (0.9763)
Test:  [ 0/25]  eta: 0:02:36  loss: 0.7114 (0.7114)  acc1: 88.0000 (88.0000)  acc5: 98.8000 (98.8000)  time: 6.2729  data: 5.9217  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7709 (0.7991)  acc1: 86.4000 (85.3455)  acc5: 98.4000 (97.7455)  time: 0.8459  data: 0.5419  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9622 (0.9279)  acc1: 80.0000 (82.1714)  acc5: 96.4000 (96.6286)  time: 0.3014  data: 0.0020  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0342 (0.9376)  acc1: 79.6000 (81.7760)  acc5: 96.4000 (96.6400)  time: 0.2995  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5460 s / it)
* Acc@1 82.582 Acc@5 96.470 loss 0.930
Accuracy of the model on the 50000 test images: 82.6%
Max accuracy: 82.80%
Epoch: [189]  [   0/1251]  eta: 1:20:09  lr: 0.001361  min_lr: 0.001361  loss: 2.8062 (2.8062)  weight_decay: 0.0500 (0.0500)  time: 3.8449  data: 3.1203  max mem: 54228
Epoch: [189]  [ 200/1251]  eta: 0:11:18  lr: 0.001358  min_lr: 0.001358  loss: 2.9026 (2.8785)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1080 (1.1607)  time: 0.6272  data: 0.0004  max mem: 54228
Epoch: [189]  [ 400/1251]  eta: 0:09:02  lr: 0.001355  min_lr: 0.001355  loss: 3.0223 (2.8863)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7674 (1.0436)  time: 0.6275  data: 0.0004  max mem: 54228
Epoch: [189]  [ 600/1251]  eta: 0:06:53  lr: 0.001351  min_lr: 0.001351  loss: 2.9069 (2.8745)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9633 (1.0112)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [189]  [ 800/1251]  eta: 0:04:45  lr: 0.001348  min_lr: 0.001348  loss: 3.0928 (2.8811)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9080 (1.0161)  time: 0.6330  data: 0.0004  max mem: 54228
Epoch: [189]  [1000/1251]  eta: 0:02:38  lr: 0.001344  min_lr: 0.001344  loss: 2.8863 (2.8738)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7379 (0.9858)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [189]  [1200/1251]  eta: 0:00:32  lr: 0.001341  min_lr: 0.001341  loss: 2.8835 (2.8815)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7488 (0.9623)  time: 0.6277  data: 0.0004  max mem: 54228
Epoch: [189]  [1250/1251]  eta: 0:00:00  lr: 0.001340  min_lr: 0.001340  loss: 2.8631 (2.8833)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7214 (0.9568)  time: 0.5377  data: 0.0005  max mem: 54228
Epoch: [189] Total time: 0:13:09 (0.6307 s / it)
Averaged stats: lr: 0.001340  min_lr: 0.001340  loss: 2.8631 (2.8915)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7214 (0.9568)
Test:  [ 0/25]  eta: 0:02:50  loss: 0.5670 (0.5670)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 6.8186  data: 6.4918  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.7297 (0.7033)  acc1: 86.0000 (86.0000)  acc5: 98.0000 (97.9636)  time: 0.8925  data: 0.5905  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 0.8693 (0.8292)  acc1: 81.2000 (82.9905)  acc5: 96.4000 (96.8571)  time: 0.2997  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.8741 (0.8365)  acc1: 81.2000 (82.8320)  acc5: 96.4000 (96.8000)  time: 0.2997  data: 0.0001  max mem: 54228
Test: Total time: 0:00:14 (0.5688 s / it)
* Acc@1 82.982 Acc@5 96.618 loss 0.826
Accuracy of the model on the 50000 test images: 83.0%
Max accuracy: 82.98%
Epoch: [190]  [   0/1251]  eta: 1:17:43  lr: 0.001340  min_lr: 0.001340  loss: 3.0238 (3.0238)  weight_decay: 0.0500 (0.0500)  time: 3.7280  data: 3.1026  max mem: 54228
Epoch: [190]  [ 200/1251]  eta: 0:11:17  lr: 0.001337  min_lr: 0.001337  loss: 3.0415 (2.9055)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9144 (0.9487)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [190]  [ 400/1251]  eta: 0:09:02  lr: 0.001333  min_lr: 0.001333  loss: 2.8191 (2.8824)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8779 (0.9344)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [190]  [ 600/1251]  eta: 0:06:52  lr: 0.001330  min_lr: 0.001330  loss: 3.1502 (2.8829)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9090 (0.9651)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [190]  [ 800/1251]  eta: 0:04:45  lr: 0.001327  min_lr: 0.001327  loss: 2.8625 (2.8844)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7377 (0.9371)  time: 0.6346  data: 0.0005  max mem: 54228
Epoch: [190]  [1000/1251]  eta: 0:02:38  lr: 0.001323  min_lr: 0.001323  loss: 2.9737 (2.8880)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8052 (0.9466)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [190]  [1200/1251]  eta: 0:00:32  lr: 0.001320  min_lr: 0.001320  loss: 3.1302 (2.8969)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8409 (0.9510)  time: 0.6416  data: 0.0005  max mem: 54228
Epoch: [190]  [1250/1251]  eta: 0:00:00  lr: 0.001319  min_lr: 0.001319  loss: 2.9858 (2.8976)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9028 (0.9524)  time: 0.5329  data: 0.0006  max mem: 54228
Epoch: [190] Total time: 0:13:09 (0.6309 s / it)
Averaged stats: lr: 0.001319  min_lr: 0.001319  loss: 2.9858 (2.8800)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9028 (0.9524)
Test:  [ 0/25]  eta: 0:02:45  loss: 0.6446 (0.6446)  acc1: 88.4000 (88.4000)  acc5: 98.4000 (98.4000)  time: 6.6086  data: 6.2709  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.7335 (0.7431)  acc1: 86.0000 (85.5273)  acc5: 98.4000 (97.9273)  time: 0.8723  data: 0.5705  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8829 (0.8608)  acc1: 80.4000 (82.6286)  acc5: 96.8000 (96.7048)  time: 0.2985  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9301 (0.8698)  acc1: 80.4000 (82.3840)  acc5: 96.0000 (96.6080)  time: 0.2983  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5554 s / it)
* Acc@1 82.996 Acc@5 96.634 loss 0.856
Accuracy of the model on the 50000 test images: 83.0%
Max accuracy: 83.00%
Epoch: [191]  [   0/1251]  eta: 1:03:38  lr: 0.001319  min_lr: 0.001319  loss: 2.3508 (2.3508)  weight_decay: 0.0500 (0.0500)  time: 3.0524  data: 2.4196  max mem: 54228
Epoch: [191]  [ 200/1251]  eta: 0:11:12  lr: 0.001316  min_lr: 0.001316  loss: 2.9659 (2.8411)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8782 (0.9251)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [191]  [ 400/1251]  eta: 0:08:59  lr: 0.001312  min_lr: 0.001312  loss: 2.9850 (2.8766)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8421 (0.9478)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [191]  [ 600/1251]  eta: 0:06:52  lr: 0.001309  min_lr: 0.001309  loss: 2.8222 (2.8757)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9086 (0.9609)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [191]  [ 800/1251]  eta: 0:04:44  lr: 0.001305  min_lr: 0.001305  loss: 2.9396 (2.8775)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7458 (0.9320)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [191]  [1000/1251]  eta: 0:02:38  lr: 0.001302  min_lr: 0.001302  loss: 3.0730 (2.8807)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8245 (0.9379)  time: 0.6294  data: 0.0004  max mem: 54228
Epoch: [191]  [1200/1251]  eta: 0:00:32  lr: 0.001299  min_lr: 0.001299  loss: 2.7548 (2.8757)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8890 (0.9466)  time: 0.6291  data: 0.0004  max mem: 54228
Epoch: [191]  [1250/1251]  eta: 0:00:00  lr: 0.001298  min_lr: 0.001298  loss: 3.0251 (2.8778)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8180 (0.9423)  time: 0.5335  data: 0.0007  max mem: 54228
Epoch: [191] Total time: 0:13:08 (0.6301 s / it)
Averaged stats: lr: 0.001298  min_lr: 0.001298  loss: 3.0251 (2.8727)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8180 (0.9423)
Test:  [ 0/25]  eta: 0:02:41  loss: 0.5725 (0.5725)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 6.4753  data: 6.1345  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7401 (0.7278)  acc1: 86.0000 (85.6727)  acc5: 98.4000 (97.7818)  time: 0.8612  data: 0.5580  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9312 (0.8536)  acc1: 82.4000 (82.7238)  acc5: 96.4000 (96.6476)  time: 0.3000  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9312 (0.8609)  acc1: 82.0000 (82.4800)  acc5: 96.4000 (96.6240)  time: 0.3001  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5548 s / it)
* Acc@1 82.882 Acc@5 96.606 loss 0.851
Accuracy of the model on the 50000 test images: 82.9%
Max accuracy: 83.00%
Epoch: [192]  [   0/1251]  eta: 1:26:14  lr: 0.001298  min_lr: 0.001298  loss: 2.6453 (2.6453)  weight_decay: 0.0500 (0.0500)  time: 4.1363  data: 2.6989  max mem: 54228
Epoch: [192]  [ 200/1251]  eta: 0:11:19  lr: 0.001295  min_lr: 0.001295  loss: 2.9319 (2.8139)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9885 (0.9890)  time: 0.6357  data: 0.0004  max mem: 54228
Epoch: [192]  [ 400/1251]  eta: 0:09:02  lr: 0.001291  min_lr: 0.001291  loss: 2.9135 (2.8520)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9412 (1.0471)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [192]  [ 600/1251]  eta: 0:06:53  lr: 0.001288  min_lr: 0.001288  loss: 2.9434 (2.8424)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0134 (1.0503)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [192]  [ 800/1251]  eta: 0:04:45  lr: 0.001284  min_lr: 0.001284  loss: 2.8875 (2.8440)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8404 (1.0250)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [192]  [1000/1251]  eta: 0:02:38  lr: 0.001281  min_lr: 0.001281  loss: 3.0336 (2.8433)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0021 (1.0023)  time: 0.6288  data: 0.0005  max mem: 54228
Epoch: [192]  [1200/1251]  eta: 0:00:32  lr: 0.001278  min_lr: 0.001278  loss: 3.1499 (2.8486)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8631 (1.0031)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [192]  [1250/1251]  eta: 0:00:00  lr: 0.001277  min_lr: 0.001277  loss: 2.8872 (2.8491)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8631 (0.9996)  time: 0.5337  data: 0.0007  max mem: 54228
Epoch: [192] Total time: 0:13:09 (0.6309 s / it)
Averaged stats: lr: 0.001277  min_lr: 0.001277  loss: 2.8872 (2.8686)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8631 (0.9996)
Test:  [ 0/25]  eta: 0:02:46  loss: 0.6345 (0.6345)  acc1: 89.2000 (89.2000)  acc5: 98.0000 (98.0000)  time: 6.6579  data: 6.3315  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.7857 (0.7455)  acc1: 84.4000 (86.0364)  acc5: 98.0000 (97.6364)  time: 0.8776  data: 0.5759  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 0.9408 (0.8784)  acc1: 82.8000 (82.6286)  acc5: 96.4000 (96.4381)  time: 0.2994  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9408 (0.8857)  acc1: 81.6000 (82.3040)  acc5: 96.4000 (96.4320)  time: 0.2993  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5584 s / it)
* Acc@1 82.938 Acc@5 96.558 loss 0.874
Accuracy of the model on the 50000 test images: 82.9%
Max accuracy: 83.00%
Epoch: [193]  [   0/1251]  eta: 1:23:50  lr: 0.001277  min_lr: 0.001277  loss: 2.8157 (2.8157)  weight_decay: 0.0500 (0.0500)  time: 4.0211  data: 2.0662  max mem: 54228
Epoch: [193]  [ 200/1251]  eta: 0:11:21  lr: 0.001274  min_lr: 0.001274  loss: 2.8108 (2.8193)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1023 (1.0661)  time: 0.6277  data: 0.0004  max mem: 54228
Epoch: [193]  [ 400/1251]  eta: 0:09:02  lr: 0.001270  min_lr: 0.001270  loss: 3.0419 (2.8574)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1746 (1.0377)  time: 0.6277  data: 0.0004  max mem: 54228
Epoch: [193]  [ 600/1251]  eta: 0:06:53  lr: 0.001267  min_lr: 0.001267  loss: 2.8686 (2.8665)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8238 (0.9688)  time: 0.6279  data: 0.0004  max mem: 54228
Epoch: [193]  [ 800/1251]  eta: 0:04:45  lr: 0.001264  min_lr: 0.001264  loss: 3.0176 (2.8823)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8179 (0.9614)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [193]  [1000/1251]  eta: 0:02:38  lr: 0.001260  min_lr: 0.001260  loss: 2.8973 (2.8736)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9391 (0.9788)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [193]  [1200/1251]  eta: 0:00:32  lr: 0.001257  min_lr: 0.001257  loss: 2.9660 (2.8768)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0205 (0.9725)  time: 0.6276  data: 0.0004  max mem: 54228
Epoch: [193]  [1250/1251]  eta: 0:00:00  lr: 0.001256  min_lr: 0.001256  loss: 2.8244 (2.8781)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0689 (0.9748)  time: 0.5366  data: 0.0006  max mem: 54228
Epoch: [193] Total time: 0:13:09 (0.6311 s / it)
Averaged stats: lr: 0.001256  min_lr: 0.001256  loss: 2.8244 (2.8670)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0689 (0.9748)
Test:  [ 0/25]  eta: 0:02:38  loss: 0.6252 (0.6252)  acc1: 88.4000 (88.4000)  acc5: 99.2000 (99.2000)  time: 6.3369  data: 6.0118  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7654 (0.7629)  acc1: 86.8000 (86.1455)  acc5: 97.2000 (97.5273)  time: 0.8476  data: 0.5469  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9593 (0.8762)  acc1: 81.6000 (83.0476)  acc5: 96.0000 (96.5905)  time: 0.2985  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9397 (0.8852)  acc1: 81.2000 (82.7840)  acc5: 96.4000 (96.6080)  time: 0.2984  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5448 s / it)
* Acc@1 83.212 Acc@5 96.584 loss 0.879
Accuracy of the model on the 50000 test images: 83.2%
Max accuracy: 83.21%
Epoch: [194]  [   0/1251]  eta: 1:08:25  lr: 0.001256  min_lr: 0.001256  loss: 1.9678 (1.9678)  weight_decay: 0.0500 (0.0500)  time: 3.2814  data: 2.6444  max mem: 54228
Epoch: [194]  [ 200/1251]  eta: 0:11:13  lr: 0.001253  min_lr: 0.001253  loss: 3.0767 (2.8201)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8379 (0.9585)  time: 0.6278  data: 0.0005  max mem: 54228
Epoch: [194]  [ 400/1251]  eta: 0:09:00  lr: 0.001249  min_lr: 0.001249  loss: 2.8840 (2.8419)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8558 (0.9546)  time: 0.6279  data: 0.0004  max mem: 54228
Epoch: [194]  [ 600/1251]  eta: 0:06:52  lr: 0.001246  min_lr: 0.001246  loss: 2.9738 (2.8454)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9174 (0.9505)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [194]  [ 800/1251]  eta: 0:04:45  lr: 0.001243  min_lr: 0.001243  loss: 2.9727 (2.8480)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9987 (0.9529)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [194]  [1000/1251]  eta: 0:02:38  lr: 0.001239  min_lr: 0.001239  loss: 3.0624 (2.8476)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8723 (0.9771)  time: 0.6277  data: 0.0004  max mem: 54228
Epoch: [194]  [1200/1251]  eta: 0:00:32  lr: 0.001236  min_lr: 0.001236  loss: 2.7909 (2.8488)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0142 (nan)  time: 0.6277  data: 0.0004  max mem: 54228
Epoch: [194]  [1250/1251]  eta: 0:00:00  lr: 0.001235  min_lr: 0.001235  loss: 2.9332 (2.8508)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7947 (nan)  time: 0.5329  data: 0.0005  max mem: 54228
Epoch: [194] Total time: 0:13:08 (0.6300 s / it)
Averaged stats: lr: 0.001235  min_lr: 0.001235  loss: 2.9332 (2.8701)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7947 (nan)
Test:  [ 0/25]  eta: 0:02:40  loss: 0.6205 (0.6205)  acc1: 88.8000 (88.8000)  acc5: 99.2000 (99.2000)  time: 6.4217  data: 6.0944  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7777 (0.7989)  acc1: 86.8000 (86.0000)  acc5: 97.2000 (97.6000)  time: 0.8561  data: 0.5544  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9640 (0.9253)  acc1: 81.2000 (82.7238)  acc5: 96.0000 (96.4762)  time: 0.2993  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9854 (0.9330)  acc1: 81.2000 (82.5440)  acc5: 95.6000 (96.4320)  time: 0.2992  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5484 s / it)
* Acc@1 83.094 Acc@5 96.562 loss 0.920
Accuracy of the model on the 50000 test images: 83.1%
Max accuracy: 83.21%
Epoch: [195]  [   0/1251]  eta: 1:26:22  lr: 0.001235  min_lr: 0.001235  loss: 2.7885 (2.7885)  weight_decay: 0.0500 (0.0500)  time: 4.1428  data: 2.5606  max mem: 54228
Epoch: [195]  [ 200/1251]  eta: 0:11:19  lr: 0.001232  min_lr: 0.001232  loss: 2.9623 (2.8366)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0879 (1.0907)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [195]  [ 400/1251]  eta: 0:09:03  lr: 0.001229  min_lr: 0.001229  loss: 3.0541 (2.8453)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8372 (1.0867)  time: 0.6286  data: 0.0004  max mem: 54228
Epoch: [195]  [ 600/1251]  eta: 0:06:53  lr: 0.001225  min_lr: 0.001225  loss: 2.9261 (2.8509)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9233 (nan)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [195]  [ 800/1251]  eta: 0:04:45  lr: 0.001222  min_lr: 0.001222  loss: 3.1187 (2.8666)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9761 (nan)  time: 0.6370  data: 0.0004  max mem: 54228
Epoch: [195]  [1000/1251]  eta: 0:02:38  lr: 0.001219  min_lr: 0.001219  loss: 3.0002 (2.8719)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8023 (nan)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [195]  [1200/1251]  eta: 0:00:32  lr: 0.001215  min_lr: 0.001215  loss: 3.0315 (2.8655)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8026 (nan)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [195]  [1250/1251]  eta: 0:00:00  lr: 0.001215  min_lr: 0.001215  loss: 2.8405 (2.8654)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9757 (nan)  time: 0.5334  data: 0.0005  max mem: 54228
Epoch: [195] Total time: 0:13:10 (0.6316 s / it)
Averaged stats: lr: 0.001215  min_lr: 0.001215  loss: 2.8405 (2.8590)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9757 (nan)
Test:  [ 0/25]  eta: 0:02:05  loss: 0.5694 (0.5694)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 5.0009  data: 4.6822  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.6920 (0.7050)  acc1: 86.8000 (86.2545)  acc5: 98.0000 (97.9273)  time: 0.8566  data: 0.5554  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8963 (0.8327)  acc1: 81.2000 (82.8191)  acc5: 96.4000 (96.6667)  time: 0.3708  data: 0.0714  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9036 (0.8453)  acc1: 81.2000 (82.5280)  acc5: 96.0000 (96.6080)  time: 0.2995  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5497 s / it)
* Acc@1 83.264 Acc@5 96.620 loss 0.841
Accuracy of the model on the 50000 test images: 83.3%
Max accuracy: 83.26%
Epoch: [196]  [   0/1251]  eta: 1:09:47  lr: 0.001215  min_lr: 0.001215  loss: 3.0499 (3.0499)  weight_decay: 0.0500 (0.0500)  time: 3.3476  data: 2.7171  max mem: 54228
Epoch: [196]  [ 200/1251]  eta: 0:11:17  lr: 0.001211  min_lr: 0.001211  loss: 2.8222 (2.8803)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9307 (0.9445)  time: 0.6358  data: 0.0004  max mem: 54228
Epoch: [196]  [ 400/1251]  eta: 0:09:01  lr: 0.001208  min_lr: 0.001208  loss: 3.0467 (2.8663)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8209 (0.9160)  time: 0.6278  data: 0.0004  max mem: 54228
Epoch: [196]  [ 600/1251]  eta: 0:06:52  lr: 0.001205  min_lr: 0.001205  loss: 2.8953 (2.8614)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9759 (0.9681)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [196]  [ 800/1251]  eta: 0:04:45  lr: 0.001201  min_lr: 0.001201  loss: 2.8745 (2.8608)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0636 (0.9984)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [196]  [1000/1251]  eta: 0:02:38  lr: 0.001198  min_lr: 0.001198  loss: 2.8269 (2.8522)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8998 (1.0161)  time: 0.6277  data: 0.0004  max mem: 54228
Epoch: [196]  [1200/1251]  eta: 0:00:32  lr: 0.001195  min_lr: 0.001195  loss: 2.8279 (2.8511)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7239 (0.9895)  time: 0.6376  data: 0.0005  max mem: 54228
Epoch: [196]  [1250/1251]  eta: 0:00:00  lr: 0.001194  min_lr: 0.001194  loss: 3.0043 (2.8501)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8026 (0.9850)  time: 0.5327  data: 0.0006  max mem: 54228
Epoch: [196] Total time: 0:13:08 (0.6304 s / it)
Averaged stats: lr: 0.001194  min_lr: 0.001194  loss: 3.0043 (2.8521)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8026 (0.9850)
Test:  [ 0/25]  eta: 0:02:44  loss: 0.6158 (0.6158)  acc1: 89.6000 (89.6000)  acc5: 98.4000 (98.4000)  time: 6.5621  data: 6.2356  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.7831 (0.7695)  acc1: 85.6000 (85.7455)  acc5: 98.0000 (97.6000)  time: 0.8680  data: 0.5672  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9085 (0.8876)  acc1: 81.2000 (82.6286)  acc5: 96.4000 (96.6095)  time: 0.2984  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9482 (0.8992)  acc1: 80.8000 (82.3200)  acc5: 96.0000 (96.4800)  time: 0.2982  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5540 s / it)
* Acc@1 82.946 Acc@5 96.620 loss 0.890
Accuracy of the model on the 50000 test images: 82.9%
Max accuracy: 83.26%
Epoch: [197]  [   0/1251]  eta: 1:24:41  lr: 0.001194  min_lr: 0.001194  loss: 3.0459 (3.0459)  weight_decay: 0.0500 (0.0500)  time: 4.0618  data: 2.6810  max mem: 54228
Epoch: [197]  [ 200/1251]  eta: 0:11:20  lr: 0.001191  min_lr: 0.001191  loss: 2.7098 (2.8064)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7890 (0.9281)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [197]  [ 400/1251]  eta: 0:09:02  lr: 0.001187  min_lr: 0.001187  loss: 2.9700 (2.8197)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1241 (1.0213)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [197]  [ 600/1251]  eta: 0:06:53  lr: 0.001184  min_lr: 0.001184  loss: 3.0368 (2.8230)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8266 (0.9932)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [197]  [ 800/1251]  eta: 0:04:45  lr: 0.001181  min_lr: 0.001181  loss: 2.7851 (2.8376)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8649 (0.9876)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [197]  [1000/1251]  eta: 0:02:38  lr: 0.001178  min_lr: 0.001178  loss: 2.9830 (2.8339)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9379 (1.0039)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [197]  [1200/1251]  eta: 0:00:32  lr: 0.001174  min_lr: 0.001174  loss: 2.9070 (2.8343)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9120 (0.9850)  time: 0.6341  data: 0.0005  max mem: 54228
Epoch: [197]  [1250/1251]  eta: 0:00:00  lr: 0.001174  min_lr: 0.001174  loss: 3.0507 (2.8373)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8547 (0.9828)  time: 0.5334  data: 0.0005  max mem: 54228
Epoch: [197] Total time: 0:13:09 (0.6315 s / it)
Averaged stats: lr: 0.001174  min_lr: 0.001174  loss: 3.0507 (2.8483)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8547 (0.9828)
Test:  [ 0/25]  eta: 0:02:47  loss: 0.6166 (0.6166)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 6.6827  data: 6.3514  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.7773 (0.7666)  acc1: 86.4000 (85.9273)  acc5: 97.6000 (97.7455)  time: 0.8799  data: 0.5777  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 0.9453 (0.8887)  acc1: 81.2000 (82.8381)  acc5: 96.4000 (96.4762)  time: 0.2994  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9726 (0.8970)  acc1: 80.4000 (82.4320)  acc5: 96.4000 (96.5280)  time: 0.2993  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5599 s / it)
* Acc@1 83.104 Acc@5 96.620 loss 0.887
Accuracy of the model on the 50000 test images: 83.1%
Max accuracy: 83.26%
Epoch: [198]  [   0/1251]  eta: 1:25:02  lr: 0.001174  min_lr: 0.001174  loss: 1.9881 (1.9881)  weight_decay: 0.0500 (0.0500)  time: 4.0786  data: 3.1354  max mem: 54228
Epoch: [198]  [ 200/1251]  eta: 0:11:19  lr: 0.001170  min_lr: 0.001170  loss: 2.8118 (2.7736)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8585 (0.9167)  time: 0.6290  data: 0.0004  max mem: 54228
Epoch: [198]  [ 400/1251]  eta: 0:09:03  lr: 0.001167  min_lr: 0.001167  loss: 3.0402 (2.8116)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0576 (0.9816)  time: 0.6434  data: 0.0004  max mem: 54228
Epoch: [198]  [ 600/1251]  eta: 0:06:53  lr: 0.001164  min_lr: 0.001164  loss: 2.8408 (2.8107)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8119 (0.9748)  time: 0.6301  data: 0.0004  max mem: 54228
Epoch: [198]  [ 800/1251]  eta: 0:04:46  lr: 0.001161  min_lr: 0.001161  loss: 2.9562 (2.8326)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7852 (0.9581)  time: 0.6290  data: 0.0005  max mem: 54228
Epoch: [198]  [1000/1251]  eta: 0:02:39  lr: 0.001157  min_lr: 0.001157  loss: 2.9096 (2.8360)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2418 (0.9949)  time: 0.6290  data: 0.0005  max mem: 54228
Epoch: [198]  [1200/1251]  eta: 0:00:32  lr: 0.001154  min_lr: 0.001154  loss: 2.9640 (2.8371)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8809 (0.9981)  time: 0.6289  data: 0.0005  max mem: 54228
Epoch: [198]  [1250/1251]  eta: 0:00:00  lr: 0.001153  min_lr: 0.001153  loss: 2.4844 (2.8330)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8684 (1.0035)  time: 0.5338  data: 0.0007  max mem: 54228
Epoch: [198] Total time: 0:13:10 (0.6321 s / it)
Averaged stats: lr: 0.001153  min_lr: 0.001153  loss: 2.4844 (2.8388)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8684 (1.0035)
Test:  [ 0/25]  eta: 0:02:48  loss: 0.4852 (0.4852)  acc1: 89.6000 (89.6000)  acc5: 99.2000 (99.2000)  time: 6.7549  data: 6.4190  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.6594 (0.6423)  acc1: 86.8000 (86.1818)  acc5: 98.0000 (97.7818)  time: 0.8866  data: 0.5839  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 0.8122 (0.7699)  acc1: 81.6000 (83.0667)  acc5: 96.4000 (96.6286)  time: 0.2997  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.8382 (0.7812)  acc1: 81.6000 (82.5920)  acc5: 96.0000 (96.5920)  time: 0.2997  data: 0.0001  max mem: 54228
Test: Total time: 0:00:14 (0.5632 s / it)
* Acc@1 83.180 Acc@5 96.698 loss 0.772
Accuracy of the model on the 50000 test images: 83.2%
Max accuracy: 83.26%
Epoch: [199]  [   0/1251]  eta: 1:22:31  lr: 0.001153  min_lr: 0.001153  loss: 3.0887 (3.0887)  weight_decay: 0.0500 (0.0500)  time: 3.9578  data: 2.8739  max mem: 54228
Epoch: [199]  [ 200/1251]  eta: 0:11:19  lr: 0.001150  min_lr: 0.001150  loss: 2.8647 (2.7650)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8240 (1.0098)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [199]  [ 400/1251]  eta: 0:09:02  lr: 0.001147  min_lr: 0.001147  loss: 2.8833 (2.8260)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7424 (0.9438)  time: 0.6277  data: 0.0005  max mem: 54228
Epoch: [199]  [ 600/1251]  eta: 0:06:53  lr: 0.001143  min_lr: 0.001143  loss: 2.7763 (2.8242)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9513 (0.9382)  time: 0.6277  data: 0.0005  max mem: 54228
Epoch: [199]  [ 800/1251]  eta: 0:04:45  lr: 0.001140  min_lr: 0.001140  loss: 3.0242 (2.8398)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8415 (0.9767)  time: 0.6403  data: 0.0005  max mem: 54228
Epoch: [199]  [1000/1251]  eta: 0:02:38  lr: 0.001137  min_lr: 0.001137  loss: 2.9317 (2.8437)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0551 (0.9686)  time: 0.6274  data: 0.0005  max mem: 54228
Epoch: [199]  [1200/1251]  eta: 0:00:32  lr: 0.001134  min_lr: 0.001134  loss: 3.0354 (2.8388)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8533 (0.9895)  time: 0.6274  data: 0.0005  max mem: 54228
Epoch: [199]  [1250/1251]  eta: 0:00:00  lr: 0.001133  min_lr: 0.001133  loss: 2.9094 (2.8365)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9300 (0.9949)  time: 0.5380  data: 0.0007  max mem: 54228
Epoch: [199] Total time: 0:13:08 (0.6304 s / it)
Averaged stats: lr: 0.001133  min_lr: 0.001133  loss: 2.9094 (2.8391)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9300 (0.9949)
Test:  [ 0/25]  eta: 0:02:04  loss: 0.5828 (0.5828)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 4.9935  data: 4.6723  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7486 (0.7231)  acc1: 86.8000 (86.2182)  acc5: 98.0000 (97.7455)  time: 0.8035  data: 0.5032  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8979 (0.8548)  acc1: 81.6000 (82.8952)  acc5: 96.0000 (96.5714)  time: 0.3413  data: 0.0432  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.8908 (0.8606)  acc1: 81.6000 (82.7200)  acc5: 96.0000 (96.5280)  time: 0.2982  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5243 s / it)
* Acc@1 83.212 Acc@5 96.718 loss 0.850
Accuracy of the model on the 50000 test images: 83.2%
Max accuracy: 83.26%
Epoch: [200]  [   0/1251]  eta: 1:24:16  lr: 0.001133  min_lr: 0.001133  loss: 2.8304 (2.8304)  weight_decay: 0.0500 (0.0500)  time: 4.0416  data: 3.0446  max mem: 54228
Epoch: [200]  [ 200/1251]  eta: 0:11:18  lr: 0.001130  min_lr: 0.001130  loss: 2.9228 (2.8512)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0001 (0.9626)  time: 0.6276  data: 0.0004  max mem: 54228
Epoch: [200]  [ 400/1251]  eta: 0:09:01  lr: 0.001126  min_lr: 0.001126  loss: 3.0129 (2.8190)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8923 (1.0154)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [200]  [ 600/1251]  eta: 0:06:53  lr: 0.001123  min_lr: 0.001123  loss: 2.9689 (2.8303)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2719 (1.0641)  time: 0.6279  data: 0.0005  max mem: 54228
Epoch: [200]  [ 800/1251]  eta: 0:04:45  lr: 0.001120  min_lr: 0.001120  loss: 2.8820 (2.8293)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7766 (1.0203)  time: 0.6278  data: 0.0004  max mem: 54228
Epoch: [200]  [1000/1251]  eta: 0:02:38  lr: 0.001117  min_lr: 0.001117  loss: 2.9324 (2.8257)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8937 (0.9886)  time: 0.6278  data: 0.0005  max mem: 54228
Epoch: [200]  [1200/1251]  eta: 0:00:32  lr: 0.001114  min_lr: 0.001114  loss: 3.0119 (2.8257)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8689 (0.9722)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [200]  [1250/1251]  eta: 0:00:00  lr: 0.001113  min_lr: 0.001113  loss: 2.6874 (2.8236)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8869 (0.9710)  time: 0.5332  data: 0.0007  max mem: 54228
Epoch: [200] Total time: 0:13:08 (0.6306 s / it)
Averaged stats: lr: 0.001113  min_lr: 0.001113  loss: 2.6874 (2.8303)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8869 (0.9710)
Test:  [ 0/25]  eta: 0:02:33  loss: 0.5365 (0.5365)  acc1: 90.0000 (90.0000)  acc5: 98.4000 (98.4000)  time: 6.1224  data: 5.7772  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.6772 (0.6676)  acc1: 86.4000 (86.6546)  acc5: 98.4000 (97.8909)  time: 0.8288  data: 0.5255  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8541 (0.7881)  acc1: 82.8000 (83.6762)  acc5: 96.0000 (96.7619)  time: 0.2994  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.8640 (0.7966)  acc1: 81.2000 (83.1680)  acc5: 96.4000 (96.7680)  time: 0.2994  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5368 s / it)
* Acc@1 83.322 Acc@5 96.636 loss 0.791
Accuracy of the model on the 50000 test images: 83.3%
Max accuracy: 83.32%
Epoch: [201]  [   0/1251]  eta: 1:18:04  lr: 0.001113  min_lr: 0.001113  loss: 2.8341 (2.8341)  weight_decay: 0.0500 (0.0500)  time: 3.7445  data: 3.1134  max mem: 54228
Epoch: [201]  [ 200/1251]  eta: 0:11:17  lr: 0.001110  min_lr: 0.001110  loss: 2.4133 (2.8349)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8325 (0.9558)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [201]  [ 400/1251]  eta: 0:09:02  lr: 0.001106  min_lr: 0.001106  loss: 2.9573 (2.8115)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7401 (0.9166)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [201]  [ 600/1251]  eta: 0:06:52  lr: 0.001103  min_lr: 0.001103  loss: 2.8478 (2.8125)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9828 (0.9895)  time: 0.6278  data: 0.0005  max mem: 54228
Epoch: [201]  [ 800/1251]  eta: 0:04:45  lr: 0.001100  min_lr: 0.001100  loss: 2.9160 (2.8173)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1926 (1.0408)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [201]  [1000/1251]  eta: 0:02:38  lr: 0.001097  min_lr: 0.001097  loss: 2.7031 (2.8249)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9426 (1.0066)  time: 0.6391  data: 0.0005  max mem: 54228
Epoch: [201]  [1200/1251]  eta: 0:00:32  lr: 0.001094  min_lr: 0.001094  loss: 2.9183 (2.8242)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0280 (1.0141)  time: 0.6279  data: 0.0005  max mem: 54228
Epoch: [201]  [1250/1251]  eta: 0:00:00  lr: 0.001093  min_lr: 0.001093  loss: 2.6895 (2.8229)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8707 (1.0093)  time: 0.5329  data: 0.0006  max mem: 54228
Epoch: [201] Total time: 0:13:08 (0.6306 s / it)
Averaged stats: lr: 0.001093  min_lr: 0.001093  loss: 2.6895 (2.8335)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8707 (1.0093)
Test:  [ 0/25]  eta: 0:02:38  loss: 0.5397 (0.5397)  acc1: 92.4000 (92.4000)  acc5: 99.2000 (99.2000)  time: 6.3309  data: 6.0149  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7266 (0.7181)  acc1: 86.4000 (86.1455)  acc5: 98.4000 (97.9273)  time: 0.8493  data: 0.5471  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9237 (0.8436)  acc1: 80.4000 (82.8000)  acc5: 96.4000 (96.6286)  time: 0.3020  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.8939 (0.8505)  acc1: 80.4000 (82.5280)  acc5: 96.4000 (96.6240)  time: 0.3028  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5530 s / it)
* Acc@1 83.074 Acc@5 96.600 loss 0.845
Accuracy of the model on the 50000 test images: 83.1%
Max accuracy: 83.32%
Epoch: [202]  [   0/1251]  eta: 1:25:57  lr: 0.001093  min_lr: 0.001093  loss: 2.8483 (2.8483)  weight_decay: 0.0500 (0.0500)  time: 4.1228  data: 3.3152  max mem: 54228
Epoch: [202]  [ 200/1251]  eta: 0:11:18  lr: 0.001090  min_lr: 0.001090  loss: 3.0484 (2.8201)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0479 (0.8866)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [202]  [ 400/1251]  eta: 0:09:02  lr: 0.001086  min_lr: 0.001086  loss: 2.9059 (2.8202)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9628 (0.9260)  time: 0.6276  data: 0.0005  max mem: 54228
Epoch: [202]  [ 600/1251]  eta: 0:06:53  lr: 0.001083  min_lr: 0.001083  loss: 2.8489 (2.8237)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0276 (0.9804)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [202]  [ 800/1251]  eta: 0:04:45  lr: 0.001080  min_lr: 0.001080  loss: 2.7920 (2.8262)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9564 (0.9795)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [202]  [1000/1251]  eta: 0:02:38  lr: 0.001077  min_lr: 0.001077  loss: 2.9149 (2.8152)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0045 (0.9905)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [202]  [1200/1251]  eta: 0:00:32  lr: 0.001074  min_lr: 0.001074  loss: 3.0176 (2.8129)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8967 (0.9958)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [202]  [1250/1251]  eta: 0:00:00  lr: 0.001073  min_lr: 0.001073  loss: 3.0472 (2.8130)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8034 (0.9941)  time: 0.5335  data: 0.0007  max mem: 54228
Epoch: [202] Total time: 0:13:09 (0.6309 s / it)
Averaged stats: lr: 0.001073  min_lr: 0.001073  loss: 3.0472 (2.8203)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8034 (0.9941)
Test:  [ 0/25]  eta: 0:02:43  loss: 0.5870 (0.5870)  acc1: 91.6000 (91.6000)  acc5: 99.2000 (99.2000)  time: 6.5405  data: 6.2133  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.7353 (0.7580)  acc1: 86.8000 (86.6909)  acc5: 98.0000 (97.7091)  time: 0.8671  data: 0.5651  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9513 (0.8888)  acc1: 80.4000 (83.2191)  acc5: 96.0000 (96.5714)  time: 0.2997  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9672 (0.8971)  acc1: 80.0000 (82.7520)  acc5: 96.8000 (96.5920)  time: 0.2996  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5539 s / it)
* Acc@1 83.240 Acc@5 96.712 loss 0.884
Accuracy of the model on the 50000 test images: 83.2%
Max accuracy: 83.32%
Epoch: [203]  [   0/1251]  eta: 1:27:11  lr: 0.001073  min_lr: 0.001073  loss: 2.2512 (2.2512)  weight_decay: 0.0500 (0.0500)  time: 4.1815  data: 3.0475  max mem: 54228
Epoch: [203]  [ 200/1251]  eta: 0:11:20  lr: 0.001070  min_lr: 0.001070  loss: 2.8810 (2.7117)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8245 (0.9706)  time: 0.6273  data: 0.0005  max mem: 54228
Epoch: [203]  [ 400/1251]  eta: 0:09:03  lr: 0.001066  min_lr: 0.001066  loss: 2.9091 (2.7321)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8783 (0.9637)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [203]  [ 600/1251]  eta: 0:06:53  lr: 0.001063  min_lr: 0.001063  loss: 3.0443 (2.7794)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9006 (0.9600)  time: 0.6340  data: 0.0005  max mem: 54228
Epoch: [203]  [ 800/1251]  eta: 0:04:45  lr: 0.001060  min_lr: 0.001060  loss: 3.0148 (2.7947)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0134 (0.9724)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [203]  [1000/1251]  eta: 0:02:38  lr: 0.001057  min_lr: 0.001057  loss: 2.9063 (2.7972)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8224 (0.9750)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [203]  [1200/1251]  eta: 0:00:32  lr: 0.001054  min_lr: 0.001054  loss: 3.0060 (2.8101)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0637 (0.9788)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [203]  [1250/1251]  eta: 0:00:00  lr: 0.001053  min_lr: 0.001053  loss: 3.0397 (2.8156)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0247 (0.9807)  time: 0.5335  data: 0.0007  max mem: 54228
Epoch: [203] Total time: 0:13:09 (0.6312 s / it)
Averaged stats: lr: 0.001053  min_lr: 0.001053  loss: 3.0397 (2.8203)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0247 (0.9807)
Test:  [ 0/25]  eta: 0:02:37  loss: 0.6367 (0.6367)  acc1: 91.6000 (91.6000)  acc5: 99.2000 (99.2000)  time: 6.3042  data: 5.9723  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.8312 (0.8066)  acc1: 86.0000 (86.0000)  acc5: 98.0000 (97.8909)  time: 0.8456  data: 0.5433  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 1.0156 (0.9356)  acc1: 81.2000 (82.5905)  acc5: 96.4000 (96.6476)  time: 0.2995  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 1.0156 (0.9463)  acc1: 81.2000 (82.4320)  acc5: 96.0000 (96.5280)  time: 0.2995  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5443 s / it)
* Acc@1 83.108 Acc@5 96.646 loss 0.935
Accuracy of the model on the 50000 test images: 83.1%
Max accuracy: 83.32%
Epoch: [204]  [   0/1251]  eta: 1:28:09  lr: 0.001053  min_lr: 0.001053  loss: 1.7086 (1.7086)  weight_decay: 0.0500 (0.0500)  time: 4.2283  data: 2.8307  max mem: 54228
Epoch: [204]  [ 200/1251]  eta: 0:11:19  lr: 0.001050  min_lr: 0.001050  loss: 2.9864 (2.7651)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0199 (1.1723)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [204]  [ 400/1251]  eta: 0:09:02  lr: 0.001047  min_lr: 0.001047  loss: 2.8527 (2.8114)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7437 (1.0259)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [204]  [ 600/1251]  eta: 0:06:53  lr: 0.001044  min_lr: 0.001044  loss: 2.9596 (2.8194)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9349 (1.0754)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [204]  [ 800/1251]  eta: 0:04:45  lr: 0.001040  min_lr: 0.001040  loss: 2.9619 (2.8171)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8502 (1.0409)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [204]  [1000/1251]  eta: 0:02:38  lr: 0.001037  min_lr: 0.001037  loss: 2.9445 (2.8079)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7696 (1.0110)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [204]  [1200/1251]  eta: 0:00:32  lr: 0.001034  min_lr: 0.001034  loss: 2.8546 (2.8063)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9476 (0.9938)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [204]  [1250/1251]  eta: 0:00:00  lr: 0.001033  min_lr: 0.001033  loss: 2.9847 (2.8065)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0208 (1.0014)  time: 0.5331  data: 0.0007  max mem: 54228
Epoch: [204] Total time: 0:13:09 (0.6312 s / it)
Averaged stats: lr: 0.001033  min_lr: 0.001033  loss: 2.9847 (2.8130)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0208 (1.0014)
Test:  [ 0/25]  eta: 0:02:47  loss: 0.6033 (0.6033)  acc1: 88.8000 (88.8000)  acc5: 99.2000 (99.2000)  time: 6.6846  data: 6.3730  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.7290 (0.7376)  acc1: 86.0000 (85.9273)  acc5: 98.4000 (97.9273)  time: 0.8843  data: 0.5796  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 0.9149 (0.8655)  acc1: 82.4000 (82.8571)  acc5: 96.0000 (96.7619)  time: 0.3022  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9149 (0.8748)  acc1: 82.0000 (82.7360)  acc5: 96.4000 (96.7200)  time: 0.3010  data: 0.0001  max mem: 54228
Test: Total time: 0:00:14 (0.5632 s / it)
* Acc@1 83.322 Acc@5 96.728 loss 0.866
Accuracy of the model on the 50000 test images: 83.3%
Max accuracy: 83.32%
Epoch: [205]  [   0/1251]  eta: 1:27:53  lr: 0.001033  min_lr: 0.001033  loss: 3.1538 (3.1538)  weight_decay: 0.0500 (0.0500)  time: 4.2150  data: 2.9440  max mem: 54228
Epoch: [205]  [ 200/1251]  eta: 0:11:18  lr: 0.001030  min_lr: 0.001030  loss: 2.8786 (2.7780)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8607 (0.9957)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [205]  [ 400/1251]  eta: 0:09:03  lr: 0.001027  min_lr: 0.001027  loss: 2.9718 (2.7732)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0956 (1.0379)  time: 0.6293  data: 0.0005  max mem: 54228
Epoch: [205]  [ 600/1251]  eta: 0:06:53  lr: 0.001024  min_lr: 0.001024  loss: 3.0114 (2.7914)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9757 (0.9972)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [205]  [ 800/1251]  eta: 0:04:45  lr: 0.001021  min_lr: 0.001021  loss: 2.9402 (2.7819)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8705 (1.0217)  time: 0.6360  data: 0.0004  max mem: 54228
Epoch: [205]  [1000/1251]  eta: 0:02:38  lr: 0.001018  min_lr: 0.001018  loss: 2.9830 (2.7952)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9311 (1.0319)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [205]  [1200/1251]  eta: 0:00:32  lr: 0.001014  min_lr: 0.001014  loss: 2.8500 (2.8006)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9005 (1.0678)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [205]  [1250/1251]  eta: 0:00:00  lr: 0.001014  min_lr: 0.001014  loss: 2.8363 (2.8000)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9841 (1.0747)  time: 0.5335  data: 0.0006  max mem: 54228
Epoch: [205] Total time: 0:13:09 (0.6310 s / it)
Averaged stats: lr: 0.001014  min_lr: 0.001014  loss: 2.8363 (2.8113)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9841 (1.0747)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.5843 (0.5843)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.6421  data: 5.2954  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.7220 (0.7253)  acc1: 86.0000 (86.3636)  acc5: 98.0000 (97.9636)  time: 0.8711  data: 0.5675  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8817 (0.8475)  acc1: 82.4000 (83.2571)  acc5: 96.4000 (96.8191)  time: 0.3466  data: 0.0474  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9168 (0.8571)  acc1: 81.6000 (82.6880)  acc5: 96.4000 (96.7360)  time: 0.2993  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5551 s / it)
* Acc@1 83.346 Acc@5 96.766 loss 0.846
Accuracy of the model on the 50000 test images: 83.3%
Max accuracy: 83.35%
Epoch: [206]  [   0/1251]  eta: 1:12:39  lr: 0.001014  min_lr: 0.001014  loss: 2.9777 (2.9777)  weight_decay: 0.0500 (0.0500)  time: 3.4849  data: 2.8476  max mem: 54228
Epoch: [206]  [ 200/1251]  eta: 0:11:19  lr: 0.001011  min_lr: 0.001011  loss: 3.0356 (2.8028)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7465 (0.9565)  time: 0.6375  data: 0.0004  max mem: 54228
Epoch: [206]  [ 400/1251]  eta: 0:09:02  lr: 0.001007  min_lr: 0.001007  loss: 2.7110 (2.7864)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8098 (0.9626)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [206]  [ 600/1251]  eta: 0:06:53  lr: 0.001004  min_lr: 0.001004  loss: 2.9957 (2.7917)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1178 (1.0069)  time: 0.6285  data: 0.0004  max mem: 54228
Epoch: [206]  [ 800/1251]  eta: 0:04:45  lr: 0.001001  min_lr: 0.001001  loss: 2.9998 (2.7973)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8253 (1.0090)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [206]  [1000/1251]  eta: 0:02:38  lr: 0.000998  min_lr: 0.000998  loss: 2.9753 (2.8000)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8889 (0.9878)  time: 0.6289  data: 0.0004  max mem: 54228
Epoch: [206]  [1200/1251]  eta: 0:00:32  lr: 0.000995  min_lr: 0.000995  loss: 2.6327 (2.7951)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9861 (0.9829)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [206]  [1250/1251]  eta: 0:00:00  lr: 0.000994  min_lr: 0.000994  loss: 2.7993 (2.7959)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9886 (0.9833)  time: 0.5329  data: 0.0005  max mem: 54228
Epoch: [206] Total time: 0:13:09 (0.6310 s / it)
Averaged stats: lr: 0.000994  min_lr: 0.000994  loss: 2.7993 (2.8082)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9886 (0.9833)
Test:  [ 0/25]  eta: 0:02:36  loss: 0.5623 (0.5623)  acc1: 89.2000 (89.2000)  acc5: 99.6000 (99.6000)  time: 6.2701  data: 5.9348  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.6833 (0.7024)  acc1: 86.0000 (86.5455)  acc5: 98.4000 (98.0727)  time: 0.8426  data: 0.5398  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8467 (0.8264)  acc1: 82.0000 (83.4286)  acc5: 96.8000 (96.7619)  time: 0.2999  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.8467 (0.8337)  acc1: 82.0000 (83.1520)  acc5: 96.4000 (96.6880)  time: 0.2998  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5432 s / it)
* Acc@1 83.418 Acc@5 96.722 loss 0.827
Accuracy of the model on the 50000 test images: 83.4%
Max accuracy: 83.42%
Epoch: [207]  [   0/1251]  eta: 1:17:43  lr: 0.000994  min_lr: 0.000994  loss: 3.0593 (3.0593)  weight_decay: 0.0500 (0.0500)  time: 3.7275  data: 3.0857  max mem: 54228
Epoch: [207]  [ 200/1251]  eta: 0:11:17  lr: 0.000991  min_lr: 0.000991  loss: 3.0309 (2.8162)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0752 (1.0315)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [207]  [ 400/1251]  eta: 0:09:01  lr: 0.000988  min_lr: 0.000988  loss: 3.0873 (2.7971)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9648 (1.0114)  time: 0.6278  data: 0.0005  max mem: 54228
Epoch: [207]  [ 600/1251]  eta: 0:06:53  lr: 0.000985  min_lr: 0.000985  loss: 3.0387 (2.7882)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8956 (0.9824)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [207]  [ 800/1251]  eta: 0:04:45  lr: 0.000982  min_lr: 0.000982  loss: 2.9137 (2.8029)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9459 (0.9940)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [207]  [1000/1251]  eta: 0:02:38  lr: 0.000979  min_lr: 0.000979  loss: 2.8637 (2.8081)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9091 (1.0001)  time: 0.6288  data: 0.0005  max mem: 54228
Epoch: [207]  [1200/1251]  eta: 0:00:32  lr: 0.000976  min_lr: 0.000976  loss: 2.7106 (2.7999)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9688 (1.0017)  time: 0.6292  data: 0.0005  max mem: 54228
Epoch: [207]  [1250/1251]  eta: 0:00:00  lr: 0.000975  min_lr: 0.000975  loss: 3.0133 (2.8023)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9491 (1.0050)  time: 0.5337  data: 0.0005  max mem: 54228
Epoch: [207] Total time: 0:13:09 (0.6311 s / it)
Averaged stats: lr: 0.000975  min_lr: 0.000975  loss: 3.0133 (2.8006)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9491 (1.0050)
Test:  [ 0/25]  eta: 0:02:48  loss: 0.6643 (0.6643)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 6.7396  data: 6.4051  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.7681 (0.7651)  acc1: 87.2000 (86.7636)  acc5: 98.0000 (97.8909)  time: 0.8849  data: 0.5826  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 0.9714 (0.8923)  acc1: 82.8000 (83.2381)  acc5: 96.0000 (96.6286)  time: 0.2994  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9786 (0.9015)  acc1: 81.2000 (82.9440)  acc5: 96.0000 (96.6080)  time: 0.2992  data: 0.0001  max mem: 54228
Test: Total time: 0:00:14 (0.5633 s / it)
* Acc@1 83.388 Acc@5 96.664 loss 0.896
Accuracy of the model on the 50000 test images: 83.4%
Max accuracy: 83.42%
Epoch: [208]  [   0/1251]  eta: 1:28:26  lr: 0.000975  min_lr: 0.000975  loss: 2.9569 (2.9569)  weight_decay: 0.0500 (0.0500)  time: 4.2419  data: 1.9269  max mem: 54228
Epoch: [208]  [ 200/1251]  eta: 0:11:19  lr: 0.000972  min_lr: 0.000972  loss: 2.9317 (2.7705)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8636 (nan)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [208]  [ 400/1251]  eta: 0:09:03  lr: 0.000969  min_lr: 0.000969  loss: 2.8516 (2.7919)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8106 (nan)  time: 0.6360  data: 0.0005  max mem: 54228
Epoch: [208]  [ 600/1251]  eta: 0:06:53  lr: 0.000966  min_lr: 0.000966  loss: 2.9854 (2.7884)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1014 (nan)  time: 0.6344  data: 0.0004  max mem: 54228
Epoch: [208]  [ 800/1251]  eta: 0:04:45  lr: 0.000963  min_lr: 0.000963  loss: 2.8859 (2.7969)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9788 (nan)  time: 0.6274  data: 0.0005  max mem: 54228
Epoch: [208]  [1000/1251]  eta: 0:02:38  lr: 0.000960  min_lr: 0.000960  loss: 2.9957 (2.7918)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9425 (nan)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [208]  [1200/1251]  eta: 0:00:32  lr: 0.000956  min_lr: 0.000956  loss: 2.9615 (2.7919)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9321 (nan)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [208]  [1250/1251]  eta: 0:00:00  lr: 0.000956  min_lr: 0.000956  loss: 2.5354 (2.7881)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9176 (nan)  time: 0.5329  data: 0.0007  max mem: 54228
Epoch: [208] Total time: 0:13:09 (0.6309 s / it)
Averaged stats: lr: 0.000956  min_lr: 0.000956  loss: 2.5354 (2.7902)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9176 (nan)
Test:  [ 0/25]  eta: 0:02:39  loss: 0.5162 (0.5162)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 6.3848  data: 6.0563  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.6488 (0.6668)  acc1: 86.4000 (86.8000)  acc5: 98.0000 (97.7818)  time: 0.8523  data: 0.5509  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8606 (0.7920)  acc1: 81.6000 (83.5238)  acc5: 96.4000 (96.6857)  time: 0.2989  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.8642 (0.8024)  acc1: 81.2000 (83.1840)  acc5: 96.4000 (96.6400)  time: 0.2988  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5472 s / it)
* Acc@1 83.442 Acc@5 96.692 loss 0.803
Accuracy of the model on the 50000 test images: 83.4%
Max accuracy: 83.44%
Epoch: [209]  [   0/1251]  eta: 1:16:38  lr: 0.000956  min_lr: 0.000956  loss: 2.6691 (2.6691)  weight_decay: 0.0500 (0.0500)  time: 3.6756  data: 3.0304  max mem: 54228
Epoch: [209]  [ 200/1251]  eta: 0:11:16  lr: 0.000953  min_lr: 0.000953  loss: 2.9043 (2.8143)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1086 (1.0904)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [209]  [ 400/1251]  eta: 0:09:02  lr: 0.000950  min_lr: 0.000950  loss: 3.0146 (2.7831)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9007 (1.0354)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [209]  [ 600/1251]  eta: 0:06:52  lr: 0.000947  min_lr: 0.000947  loss: 2.6019 (2.7916)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9244 (1.0173)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [209]  [ 800/1251]  eta: 0:04:45  lr: 0.000944  min_lr: 0.000944  loss: 2.8999 (2.7945)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0729 (1.0420)  time: 0.6330  data: 0.0004  max mem: 54228
Epoch: [209]  [1000/1251]  eta: 0:02:38  lr: 0.000940  min_lr: 0.000940  loss: 3.0347 (2.8052)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9836 (1.0402)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [209]  [1200/1251]  eta: 0:00:32  lr: 0.000937  min_lr: 0.000937  loss: 2.7882 (2.8034)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0339 (1.0663)  time: 0.6289  data: 0.0004  max mem: 54228
Epoch: [209]  [1250/1251]  eta: 0:00:00  lr: 0.000937  min_lr: 0.000937  loss: 2.6626 (2.7990)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8845 (1.0653)  time: 0.5330  data: 0.0005  max mem: 54228
Epoch: [209] Total time: 0:13:08 (0.6306 s / it)
Averaged stats: lr: 0.000937  min_lr: 0.000937  loss: 2.6626 (2.7870)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8845 (1.0653)
Test:  [ 0/25]  eta: 0:02:29  loss: 0.6219 (0.6219)  acc1: 87.6000 (87.6000)  acc5: 98.4000 (98.4000)  time: 5.9742  data: 5.6290  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7105 (0.7191)  acc1: 86.4000 (86.3636)  acc5: 98.0000 (97.7455)  time: 0.8149  data: 0.5121  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8992 (0.8439)  acc1: 81.2000 (83.1619)  acc5: 96.4000 (96.7619)  time: 0.2988  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.8992 (0.8531)  acc1: 80.8000 (83.0080)  acc5: 96.4000 (96.7040)  time: 0.2987  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5294 s / it)
* Acc@1 83.304 Acc@5 96.734 loss 0.848
Accuracy of the model on the 50000 test images: 83.3%
Max accuracy: 83.44%
Epoch: [210]  [   0/1251]  eta: 1:27:01  lr: 0.000937  min_lr: 0.000937  loss: 3.1649 (3.1649)  weight_decay: 0.0500 (0.0500)  time: 4.1742  data: 3.4513  max mem: 54228
Epoch: [210]  [ 200/1251]  eta: 0:11:21  lr: 0.000934  min_lr: 0.000934  loss: 2.9533 (2.8333)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8878 (1.0208)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [210]  [ 400/1251]  eta: 0:09:03  lr: 0.000931  min_lr: 0.000931  loss: 2.9470 (2.7968)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0518 (1.0712)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [210]  [ 600/1251]  eta: 0:06:53  lr: 0.000928  min_lr: 0.000928  loss: 2.9006 (2.7868)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8042 (1.0239)  time: 0.6344  data: 0.0005  max mem: 54228
Epoch: [210]  [ 800/1251]  eta: 0:04:45  lr: 0.000925  min_lr: 0.000925  loss: 2.6158 (2.7817)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9624 (1.0120)  time: 0.6288  data: 0.0004  max mem: 54228
Epoch: [210]  [1000/1251]  eta: 0:02:38  lr: 0.000922  min_lr: 0.000922  loss: 2.9232 (2.7953)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9968 (1.0298)  time: 0.6279  data: 0.0005  max mem: 54228
Epoch: [210]  [1200/1251]  eta: 0:00:32  lr: 0.000918  min_lr: 0.000918  loss: 2.6422 (2.7958)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8016 (1.0127)  time: 0.6279  data: 0.0005  max mem: 54228
Epoch: [210]  [1250/1251]  eta: 0:00:00  lr: 0.000918  min_lr: 0.000918  loss: 2.9144 (2.7929)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8251 (1.0078)  time: 0.5330  data: 0.0007  max mem: 54228
Epoch: [210] Total time: 0:13:09 (0.6313 s / it)
Averaged stats: lr: 0.000918  min_lr: 0.000918  loss: 2.9144 (2.7882)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8251 (1.0078)
Test:  [ 0/25]  eta: 0:02:48  loss: 0.6344 (0.6344)  acc1: 88.0000 (88.0000)  acc5: 98.8000 (98.8000)  time: 6.7424  data: 6.4107  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.7303 (0.7481)  acc1: 86.8000 (86.5091)  acc5: 98.0000 (97.9636)  time: 0.8854  data: 0.5831  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 0.9430 (0.8695)  acc1: 81.2000 (83.4095)  acc5: 96.4000 (96.8000)  time: 0.3001  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9430 (0.8807)  acc1: 81.2000 (83.1520)  acc5: 96.0000 (96.7200)  time: 0.3003  data: 0.0001  max mem: 54228
Test: Total time: 0:00:14 (0.5623 s / it)
* Acc@1 83.616 Acc@5 96.702 loss 0.869
Accuracy of the model on the 50000 test images: 83.6%
Max accuracy: 83.62%
Epoch: [211]  [   0/1251]  eta: 1:20:00  lr: 0.000918  min_lr: 0.000918  loss: 1.7758 (1.7758)  weight_decay: 0.0500 (0.0500)  time: 3.8376  data: 3.2064  max mem: 54228
Epoch: [211]  [ 200/1251]  eta: 0:11:16  lr: 0.000915  min_lr: 0.000915  loss: 2.8577 (2.7467)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9618 (0.9853)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [211]  [ 400/1251]  eta: 0:09:02  lr: 0.000912  min_lr: 0.000912  loss: 2.8421 (2.7685)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8259 (0.9743)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [211]  [ 600/1251]  eta: 0:06:53  lr: 0.000909  min_lr: 0.000909  loss: 2.7798 (2.7722)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2147 (1.0099)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [211]  [ 800/1251]  eta: 0:04:45  lr: 0.000906  min_lr: 0.000906  loss: 2.7832 (2.7773)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9674 (1.0389)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [211]  [1000/1251]  eta: 0:02:38  lr: 0.000903  min_lr: 0.000903  loss: 2.8703 (2.7899)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9856 (1.0376)  time: 0.6393  data: 0.0005  max mem: 54228
Epoch: [211]  [1200/1251]  eta: 0:00:32  lr: 0.000900  min_lr: 0.000900  loss: 2.6891 (2.7892)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9160 (1.0314)  time: 0.6288  data: 0.0005  max mem: 54228
Epoch: [211]  [1250/1251]  eta: 0:00:00  lr: 0.000899  min_lr: 0.000899  loss: 2.9710 (2.7933)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9005 (1.0294)  time: 0.5335  data: 0.0006  max mem: 54228
Epoch: [211] Total time: 0:13:09 (0.6312 s / it)
Averaged stats: lr: 0.000899  min_lr: 0.000899  loss: 2.9710 (2.7811)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9005 (1.0294)
Test:  [ 0/25]  eta: 0:02:37  loss: 0.6162 (0.6162)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 6.2941  data: 5.9584  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7254 (0.7533)  acc1: 86.4000 (86.2909)  acc5: 97.6000 (97.9636)  time: 0.8445  data: 0.5420  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9564 (0.8729)  acc1: 81.2000 (83.0857)  acc5: 96.4000 (96.8381)  time: 0.2994  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9445 (0.8820)  acc1: 81.2000 (82.8160)  acc5: 96.4000 (96.7200)  time: 0.2993  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5443 s / it)
* Acc@1 83.554 Acc@5 96.778 loss 0.871
Accuracy of the model on the 50000 test images: 83.6%
Max accuracy: 83.62%
Epoch: [212]  [   0/1251]  eta: 1:27:32  lr: 0.000899  min_lr: 0.000899  loss: 2.9130 (2.9130)  weight_decay: 0.0500 (0.0500)  time: 4.1989  data: 3.3876  max mem: 54228
Epoch: [212]  [ 200/1251]  eta: 0:11:20  lr: 0.000896  min_lr: 0.000896  loss: 2.8413 (2.7514)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9286 (1.0124)  time: 0.6358  data: 0.0004  max mem: 54228
Epoch: [212]  [ 400/1251]  eta: 0:09:03  lr: 0.000893  min_lr: 0.000893  loss: 2.5115 (2.7492)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0477 (1.0498)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [212]  [ 600/1251]  eta: 0:06:53  lr: 0.000890  min_lr: 0.000890  loss: 2.8128 (2.7604)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8560 (1.0439)  time: 0.6279  data: 0.0004  max mem: 54228
Epoch: [212]  [ 800/1251]  eta: 0:04:45  lr: 0.000887  min_lr: 0.000887  loss: 2.8280 (2.7581)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8023 (1.0175)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [212]  [1000/1251]  eta: 0:02:38  lr: 0.000884  min_lr: 0.000884  loss: 2.9488 (2.7639)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8348 (1.0129)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [212]  [1200/1251]  eta: 0:00:32  lr: 0.000881  min_lr: 0.000881  loss: 2.8816 (2.7733)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9114 (1.0364)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [212]  [1250/1251]  eta: 0:00:00  lr: 0.000880  min_lr: 0.000880  loss: 2.8947 (2.7708)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7725 (1.0281)  time: 0.5335  data: 0.0006  max mem: 54228
Epoch: [212] Total time: 0:13:09 (0.6307 s / it)
Averaged stats: lr: 0.000880  min_lr: 0.000880  loss: 2.8947 (2.7747)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7725 (1.0281)
Test:  [ 0/25]  eta: 0:02:34  loss: 0.5617 (0.5617)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 6.1672  data: 5.8358  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7493 (0.7314)  acc1: 86.4000 (87.0182)  acc5: 98.0000 (97.8182)  time: 0.8337  data: 0.5308  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9483 (0.8580)  acc1: 82.4000 (83.7333)  acc5: 96.4000 (96.7619)  time: 0.3004  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9481 (0.8716)  acc1: 81.6000 (83.4080)  acc5: 96.4000 (96.6720)  time: 0.3005  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5448 s / it)
* Acc@1 83.688 Acc@5 96.798 loss 0.867
Accuracy of the model on the 50000 test images: 83.7%
Max accuracy: 83.69%
Epoch: [213]  [   0/1251]  eta: 1:18:20  lr: 0.000880  min_lr: 0.000880  loss: 3.1182 (3.1182)  weight_decay: 0.0500 (0.0500)  time: 3.7577  data: 3.1188  max mem: 54228
Epoch: [213]  [ 200/1251]  eta: 0:11:19  lr: 0.000877  min_lr: 0.000877  loss: 2.9068 (2.7199)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0519 (1.1325)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [213]  [ 400/1251]  eta: 0:09:02  lr: 0.000874  min_lr: 0.000874  loss: 2.8729 (2.7359)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0848 (1.1958)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [213]  [ 600/1251]  eta: 0:06:53  lr: 0.000871  min_lr: 0.000871  loss: 2.9675 (2.7568)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8867 (1.1214)  time: 0.6384  data: 0.0005  max mem: 54228
Epoch: [213]  [ 800/1251]  eta: 0:04:45  lr: 0.000868  min_lr: 0.000868  loss: 2.8547 (2.7541)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8605 (1.0866)  time: 0.6288  data: 0.0005  max mem: 54228
Epoch: [213]  [1000/1251]  eta: 0:02:38  lr: 0.000865  min_lr: 0.000865  loss: 2.9071 (2.7689)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0088 (1.0852)  time: 0.6277  data: 0.0005  max mem: 54228
Epoch: [213]  [1200/1251]  eta: 0:00:32  lr: 0.000863  min_lr: 0.000863  loss: 2.6714 (2.7646)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0174 (1.0786)  time: 0.6276  data: 0.0005  max mem: 54228
Epoch: [213]  [1250/1251]  eta: 0:00:00  lr: 0.000862  min_lr: 0.000862  loss: 2.7424 (2.7647)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8928 (1.0727)  time: 0.5329  data: 0.0006  max mem: 54228
Epoch: [213] Total time: 0:13:09 (0.6310 s / it)
Averaged stats: lr: 0.000862  min_lr: 0.000862  loss: 2.7424 (2.7708)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8928 (1.0727)
Test:  [ 0/25]  eta: 0:02:45  loss: 0.5669 (0.5669)  acc1: 90.4000 (90.4000)  acc5: 98.8000 (98.8000)  time: 6.6184  data: 6.2958  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.6965 (0.7043)  acc1: 86.8000 (86.6909)  acc5: 98.4000 (98.0364)  time: 0.8742  data: 0.5727  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 0.8858 (0.8191)  acc1: 81.6000 (83.2762)  acc5: 96.4000 (96.9905)  time: 0.2995  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.8858 (0.8317)  acc1: 80.8000 (82.8160)  acc5: 96.4000 (96.9920)  time: 0.2994  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5570 s / it)
* Acc@1 83.678 Acc@5 96.770 loss 0.829
Accuracy of the model on the 50000 test images: 83.7%
Max accuracy: 83.69%
Epoch: [214]  [   0/1251]  eta: 1:22:57  lr: 0.000862  min_lr: 0.000862  loss: 2.2312 (2.2312)  weight_decay: 0.0500 (0.0500)  time: 3.9788  data: 2.8396  max mem: 54228
Epoch: [214]  [ 200/1251]  eta: 0:11:18  lr: 0.000859  min_lr: 0.000859  loss: 2.8219 (2.7154)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8841 (0.9838)  time: 0.6279  data: 0.0005  max mem: 54228
Epoch: [214]  [ 400/1251]  eta: 0:09:01  lr: 0.000856  min_lr: 0.000856  loss: 2.7107 (2.7419)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1061 (1.1320)  time: 0.6278  data: 0.0004  max mem: 54228
Epoch: [214]  [ 600/1251]  eta: 0:06:53  lr: 0.000853  min_lr: 0.000853  loss: 2.6773 (2.7535)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8617 (1.0602)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [214]  [ 800/1251]  eta: 0:04:45  lr: 0.000850  min_lr: 0.000850  loss: 2.8061 (2.7434)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9210 (1.0357)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [214]  [1000/1251]  eta: 0:02:38  lr: 0.000847  min_lr: 0.000847  loss: 2.7979 (2.7454)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.6338  data: 0.0005  max mem: 54228
Epoch: [214]  [1200/1251]  eta: 0:00:32  lr: 0.000844  min_lr: 0.000844  loss: 2.8046 (2.7579)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9794 (nan)  time: 0.6353  data: 0.0005  max mem: 54228
Epoch: [214]  [1250/1251]  eta: 0:00:00  lr: 0.000844  min_lr: 0.000844  loss: 2.9623 (2.7570)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0240 (nan)  time: 0.5335  data: 0.0007  max mem: 54228
Epoch: [214] Total time: 0:13:08 (0.6307 s / it)
Averaged stats: lr: 0.000844  min_lr: 0.000844  loss: 2.9623 (2.7659)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0240 (nan)
Test:  [ 0/25]  eta: 0:02:35  loss: 0.5773 (0.5773)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 6.2191  data: 5.8823  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7118 (0.7058)  acc1: 87.2000 (86.9091)  acc5: 98.0000 (98.1455)  time: 0.8378  data: 0.5351  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8756 (0.8302)  acc1: 81.2000 (83.3714)  acc5: 96.4000 (96.8191)  time: 0.2996  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.8957 (0.8402)  acc1: 81.2000 (83.1520)  acc5: 96.4000 (96.8000)  time: 0.2995  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5405 s / it)
* Acc@1 83.616 Acc@5 96.794 loss 0.832
Accuracy of the model on the 50000 test images: 83.6%
Max accuracy: 83.69%
Epoch: [215]  [   0/1251]  eta: 1:23:52  lr: 0.000843  min_lr: 0.000843  loss: 2.7566 (2.7566)  weight_decay: 0.0500 (0.0500)  time: 4.0226  data: 2.5498  max mem: 54228
Epoch: [215]  [ 200/1251]  eta: 0:11:17  lr: 0.000841  min_lr: 0.000841  loss: 2.8606 (2.7327)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8941 (1.0456)  time: 0.6276  data: 0.0005  max mem: 54228
Epoch: [215]  [ 400/1251]  eta: 0:09:02  lr: 0.000838  min_lr: 0.000838  loss: 2.9556 (2.7593)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9914 (1.1203)  time: 0.6278  data: 0.0005  max mem: 54228
Epoch: [215]  [ 600/1251]  eta: 0:06:53  lr: 0.000835  min_lr: 0.000835  loss: 2.8651 (2.7453)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8240 (1.0643)  time: 0.6279  data: 0.0005  max mem: 54228
Epoch: [215]  [ 800/1251]  eta: 0:04:45  lr: 0.000832  min_lr: 0.000832  loss: 2.8467 (2.7374)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8811 (1.0651)  time: 0.6330  data: 0.0004  max mem: 54228
Epoch: [215]  [1000/1251]  eta: 0:02:38  lr: 0.000829  min_lr: 0.000829  loss: 2.9508 (2.7443)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9775 (1.0625)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [215]  [1200/1251]  eta: 0:00:32  lr: 0.000826  min_lr: 0.000826  loss: 2.7097 (2.7464)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9125 (1.0709)  time: 0.6277  data: 0.0005  max mem: 54228
Epoch: [215]  [1250/1251]  eta: 0:00:00  lr: 0.000825  min_lr: 0.000825  loss: 2.9632 (2.7457)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1245 (1.0755)  time: 0.5330  data: 0.0006  max mem: 54228
Epoch: [215] Total time: 0:13:08 (0.6305 s / it)
Averaged stats: lr: 0.000825  min_lr: 0.000825  loss: 2.9632 (2.7590)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1245 (1.0755)
Test:  [ 0/25]  eta: 0:02:38  loss: 0.6039 (0.6039)  acc1: 89.6000 (89.6000)  acc5: 98.4000 (98.4000)  time: 6.3215  data: 6.0065  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7029 (0.7138)  acc1: 87.2000 (86.2545)  acc5: 98.4000 (98.0364)  time: 0.8505  data: 0.5463  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9039 (0.8415)  acc1: 81.6000 (83.3143)  acc5: 96.4000 (96.7619)  time: 0.3040  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9055 (0.8501)  acc1: 81.6000 (83.0240)  acc5: 96.4000 (96.7520)  time: 0.3046  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5567 s / it)
* Acc@1 83.594 Acc@5 96.762 loss 0.843
Accuracy of the model on the 50000 test images: 83.6%
Max accuracy: 83.69%
Epoch: [216]  [   0/1251]  eta: 1:21:44  lr: 0.000825  min_lr: 0.000825  loss: 2.1935 (2.1935)  weight_decay: 0.0500 (0.0500)  time: 3.9202  data: 2.5830  max mem: 54228
Epoch: [216]  [ 200/1251]  eta: 0:11:20  lr: 0.000822  min_lr: 0.000822  loss: 2.8790 (2.8031)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9803 (0.9578)  time: 0.6279  data: 0.0004  max mem: 54228
Epoch: [216]  [ 400/1251]  eta: 0:09:02  lr: 0.000819  min_lr: 0.000819  loss: 2.5744 (2.7831)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0549 (1.0513)  time: 0.6276  data: 0.0004  max mem: 54228
Epoch: [216]  [ 600/1251]  eta: 0:06:53  lr: 0.000817  min_lr: 0.000817  loss: 2.7530 (2.7731)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9306 (1.0582)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [216]  [ 800/1251]  eta: 0:04:45  lr: 0.000814  min_lr: 0.000814  loss: 2.8575 (2.7725)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8991 (1.0261)  time: 0.6279  data: 0.0005  max mem: 54228
Epoch: [216]  [1000/1251]  eta: 0:02:38  lr: 0.000811  min_lr: 0.000811  loss: 2.9228 (2.7670)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0039 (1.0433)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [216]  [1200/1251]  eta: 0:00:32  lr: 0.000808  min_lr: 0.000808  loss: 2.7641 (2.7725)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0100 (1.0473)  time: 0.6342  data: 0.0004  max mem: 54228
Epoch: [216]  [1250/1251]  eta: 0:00:00  lr: 0.000807  min_lr: 0.000807  loss: 2.7659 (2.7739)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0278 (1.0460)  time: 0.5368  data: 0.0005  max mem: 54228
Epoch: [216] Total time: 0:13:08 (0.6307 s / it)
Averaged stats: lr: 0.000807  min_lr: 0.000807  loss: 2.7659 (2.7620)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0278 (1.0460)
Test:  [ 0/25]  eta: 0:02:40  loss: 0.6445 (0.6445)  acc1: 90.4000 (90.4000)  acc5: 98.4000 (98.4000)  time: 6.4327  data: 6.1002  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7421 (0.7539)  acc1: 86.4000 (87.0545)  acc5: 98.0000 (97.9636)  time: 0.8573  data: 0.5550  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9232 (0.8784)  acc1: 82.0000 (83.9619)  acc5: 96.4000 (96.7810)  time: 0.2996  data: 0.0003  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9385 (0.8891)  acc1: 82.0000 (83.6960)  acc5: 96.4000 (96.6880)  time: 0.2994  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5487 s / it)
* Acc@1 83.734 Acc@5 96.812 loss 0.878
Accuracy of the model on the 50000 test images: 83.7%
Max accuracy: 83.73%
Epoch: [217]  [   0/1251]  eta: 1:06:53  lr: 0.000807  min_lr: 0.000807  loss: 3.1415 (3.1415)  weight_decay: 0.0500 (0.0500)  time: 3.2081  data: 2.5706  max mem: 54228
Epoch: [217]  [ 200/1251]  eta: 0:11:14  lr: 0.000804  min_lr: 0.000804  loss: 2.9565 (2.7361)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8832 (0.9981)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [217]  [ 400/1251]  eta: 0:09:00  lr: 0.000801  min_lr: 0.000801  loss: 2.8949 (2.7469)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9428 (1.0262)  time: 0.6277  data: 0.0004  max mem: 54228
Epoch: [217]  [ 600/1251]  eta: 0:06:52  lr: 0.000799  min_lr: 0.000799  loss: 2.7063 (2.7401)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9812 (1.0303)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [217]  [ 800/1251]  eta: 0:04:45  lr: 0.000796  min_lr: 0.000796  loss: 2.6957 (2.7478)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9590 (1.0467)  time: 0.6279  data: 0.0005  max mem: 54228
Epoch: [217]  [1000/1251]  eta: 0:02:38  lr: 0.000793  min_lr: 0.000793  loss: 2.7998 (2.7467)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1031 (1.0627)  time: 0.6332  data: 0.0005  max mem: 54228
Epoch: [217]  [1200/1251]  eta: 0:00:32  lr: 0.000790  min_lr: 0.000790  loss: 2.7962 (2.7445)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8242 (1.0475)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [217]  [1250/1251]  eta: 0:00:00  lr: 0.000789  min_lr: 0.000789  loss: 2.8195 (2.7475)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9691 (1.0480)  time: 0.5334  data: 0.0005  max mem: 54228
Epoch: [217] Total time: 0:13:08 (0.6303 s / it)
Averaged stats: lr: 0.000789  min_lr: 0.000789  loss: 2.8195 (2.7479)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9691 (1.0480)
Test:  [ 0/25]  eta: 0:02:34  loss: 0.6258 (0.6258)  acc1: 90.4000 (90.4000)  acc5: 98.8000 (98.8000)  time: 6.1932  data: 5.8614  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7420 (0.7575)  acc1: 87.2000 (86.9091)  acc5: 98.4000 (97.9273)  time: 0.8353  data: 0.5331  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9606 (0.8764)  acc1: 82.0000 (83.9048)  acc5: 96.0000 (96.6667)  time: 0.2994  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9606 (0.8887)  acc1: 82.0000 (83.5040)  acc5: 95.6000 (96.5760)  time: 0.2993  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5404 s / it)
* Acc@1 83.712 Acc@5 96.830 loss 0.878
Accuracy of the model on the 50000 test images: 83.7%
Max accuracy: 83.73%
Epoch: [218]  [   0/1251]  eta: 1:22:26  lr: 0.000789  min_lr: 0.000789  loss: 2.1865 (2.1865)  weight_decay: 0.0500 (0.0500)  time: 3.9544  data: 2.6140  max mem: 54228
Epoch: [218]  [ 200/1251]  eta: 0:11:18  lr: 0.000786  min_lr: 0.000786  loss: 3.0665 (2.7826)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9508 (1.0473)  time: 0.6361  data: 0.0004  max mem: 54228
Epoch: [218]  [ 400/1251]  eta: 0:09:03  lr: 0.000784  min_lr: 0.000784  loss: 2.8960 (2.7533)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9448 (1.0341)  time: 0.6364  data: 0.0004  max mem: 54228
Epoch: [218]  [ 600/1251]  eta: 0:06:53  lr: 0.000781  min_lr: 0.000781  loss: 2.8756 (2.7684)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9704 (1.0323)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [218]  [ 800/1251]  eta: 0:04:45  lr: 0.000778  min_lr: 0.000778  loss: 2.8883 (2.7672)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7906 (1.0086)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [218]  [1000/1251]  eta: 0:02:38  lr: 0.000775  min_lr: 0.000775  loss: 2.8812 (2.7670)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8680 (1.0060)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [218]  [1200/1251]  eta: 0:00:32  lr: 0.000772  min_lr: 0.000772  loss: 2.9578 (2.7656)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0499 (1.0665)  time: 0.6285  data: 0.0006  max mem: 54228
Epoch: [218]  [1250/1251]  eta: 0:00:00  lr: 0.000772  min_lr: 0.000772  loss: 2.9469 (2.7648)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1189 (1.0794)  time: 0.5335  data: 0.0007  max mem: 54228
Epoch: [218] Total time: 0:13:09 (0.6312 s / it)
Averaged stats: lr: 0.000772  min_lr: 0.000772  loss: 2.9469 (2.7528)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1189 (1.0794)
Test:  [ 0/25]  eta: 0:02:35  loss: 0.5628 (0.5628)  acc1: 90.4000 (90.4000)  acc5: 98.4000 (98.4000)  time: 6.2077  data: 5.8803  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7043 (0.7088)  acc1: 85.2000 (86.1455)  acc5: 97.6000 (97.8545)  time: 0.8366  data: 0.5349  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9111 (0.8329)  acc1: 81.6000 (83.2762)  acc5: 96.4000 (96.8571)  time: 0.2995  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9223 (0.8459)  acc1: 81.6000 (83.0400)  acc5: 96.4000 (96.8160)  time: 0.2994  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5400 s / it)
* Acc@1 83.660 Acc@5 96.896 loss 0.829
Accuracy of the model on the 50000 test images: 83.7%
Max accuracy: 83.73%
Epoch: [219]  [   0/1251]  eta: 1:29:52  lr: 0.000771  min_lr: 0.000771  loss: 2.2234 (2.2234)  weight_decay: 0.0500 (0.0500)  time: 4.3102  data: 2.6496  max mem: 54228
Epoch: [219]  [ 200/1251]  eta: 0:11:21  lr: 0.000769  min_lr: 0.000769  loss: 2.9796 (2.7836)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1008 (1.1696)  time: 0.6354  data: 0.0004  max mem: 54228
Epoch: [219]  [ 400/1251]  eta: 0:09:03  lr: 0.000766  min_lr: 0.000766  loss: 2.7371 (2.7635)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7861 (1.0570)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [219]  [ 600/1251]  eta: 0:06:53  lr: 0.000763  min_lr: 0.000763  loss: 2.7006 (2.7439)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8963 (1.0768)  time: 0.6277  data: 0.0004  max mem: 54228
Epoch: [219]  [ 800/1251]  eta: 0:04:46  lr: 0.000760  min_lr: 0.000760  loss: 2.9290 (2.7381)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9424 (1.0582)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [219]  [1000/1251]  eta: 0:02:38  lr: 0.000757  min_lr: 0.000757  loss: 2.7543 (2.7394)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9250 (1.0529)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [219]  [1200/1251]  eta: 0:00:32  lr: 0.000755  min_lr: 0.000755  loss: 2.8124 (2.7397)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0790 (1.0670)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [219]  [1250/1251]  eta: 0:00:00  lr: 0.000754  min_lr: 0.000754  loss: 2.7202 (2.7394)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1153 (1.0729)  time: 0.5382  data: 0.0006  max mem: 54228
Epoch: [219] Total time: 0:13:09 (0.6313 s / it)
Averaged stats: lr: 0.000754  min_lr: 0.000754  loss: 2.7202 (2.7472)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1153 (1.0729)
Test:  [ 0/25]  eta: 0:02:48  loss: 0.5565 (0.5565)  acc1: 88.8000 (88.8000)  acc5: 99.2000 (99.2000)  time: 6.7533  data: 6.4289  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.6856 (0.7035)  acc1: 86.0000 (86.1455)  acc5: 98.4000 (98.0000)  time: 0.8865  data: 0.5848  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 0.8697 (0.8281)  acc1: 82.0000 (83.2000)  acc5: 96.8000 (96.8571)  time: 0.2996  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9149 (0.8374)  acc1: 81.2000 (82.8960)  acc5: 96.8000 (96.8160)  time: 0.2995  data: 0.0001  max mem: 54228
Test: Total time: 0:00:14 (0.5644 s / it)
* Acc@1 83.714 Acc@5 96.850 loss 0.820
Accuracy of the model on the 50000 test images: 83.7%
Max accuracy: 83.73%
Epoch: [220]  [   0/1251]  eta: 1:26:04  lr: 0.000754  min_lr: 0.000754  loss: 3.1732 (3.1732)  weight_decay: 0.0500 (0.0500)  time: 4.1282  data: 2.3309  max mem: 54228
Epoch: [220]  [ 200/1251]  eta: 0:11:20  lr: 0.000751  min_lr: 0.000751  loss: 2.9092 (2.7341)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7924 (1.0497)  time: 0.6278  data: 0.0005  max mem: 54228
Epoch: [220]  [ 400/1251]  eta: 0:09:03  lr: 0.000748  min_lr: 0.000748  loss: 2.6607 (2.7374)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0479 (1.0681)  time: 0.6276  data: 0.0004  max mem: 54228
Epoch: [220]  [ 600/1251]  eta: 0:06:53  lr: 0.000745  min_lr: 0.000745  loss: 2.4721 (2.7358)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1738 (1.1251)  time: 0.6380  data: 0.0004  max mem: 54228
Epoch: [220]  [ 800/1251]  eta: 0:04:45  lr: 0.000743  min_lr: 0.000743  loss: 2.8765 (2.7414)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9391 (1.0957)  time: 0.6275  data: 0.0005  max mem: 54228
Epoch: [220]  [1000/1251]  eta: 0:02:38  lr: 0.000740  min_lr: 0.000740  loss: 2.8864 (2.7460)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9601 (1.0736)  time: 0.6272  data: 0.0005  max mem: 54228
Epoch: [220]  [1200/1251]  eta: 0:00:32  lr: 0.000737  min_lr: 0.000737  loss: 2.8110 (2.7507)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9253 (1.0751)  time: 0.6273  data: 0.0004  max mem: 54228
Epoch: [220]  [1250/1251]  eta: 0:00:00  lr: 0.000736  min_lr: 0.000736  loss: 2.7210 (2.7527)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8983 (1.0708)  time: 0.5329  data: 0.0006  max mem: 54228
Epoch: [220] Total time: 0:13:09 (0.6308 s / it)
Averaged stats: lr: 0.000736  min_lr: 0.000736  loss: 2.7210 (2.7406)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8983 (1.0708)
Test:  [ 0/25]  eta: 0:02:04  loss: 0.5367 (0.5367)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 4.9829  data: 4.6310  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.6932 (0.6988)  acc1: 87.2000 (87.1273)  acc5: 98.0000 (97.9273)  time: 0.8123  data: 0.5081  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8811 (0.8247)  acc1: 82.0000 (83.6191)  acc5: 96.8000 (96.8000)  time: 0.3476  data: 0.0479  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9075 (0.8397)  acc1: 81.2000 (83.1840)  acc5: 96.8000 (96.7520)  time: 0.2998  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5302 s / it)
* Acc@1 83.728 Acc@5 96.900 loss 0.827
Accuracy of the model on the 50000 test images: 83.7%
Max accuracy: 83.73%
Epoch: [221]  [   0/1251]  eta: 1:24:13  lr: 0.000736  min_lr: 0.000736  loss: 3.0387 (3.0387)  weight_decay: 0.0500 (0.0500)  time: 4.0398  data: 3.3137  max mem: 54228
Epoch: [221]  [ 200/1251]  eta: 0:11:19  lr: 0.000734  min_lr: 0.000734  loss: 2.9422 (2.7075)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0538 (0.9793)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [221]  [ 400/1251]  eta: 0:09:02  lr: 0.000731  min_lr: 0.000731  loss: 2.8120 (2.7390)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9114 (1.0482)  time: 0.6270  data: 0.0004  max mem: 54228
Epoch: [221]  [ 600/1251]  eta: 0:06:53  lr: 0.000728  min_lr: 0.000728  loss: 2.9046 (2.7477)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3135 (1.0771)  time: 0.6278  data: 0.0005  max mem: 54228
Epoch: [221]  [ 800/1251]  eta: 0:04:45  lr: 0.000725  min_lr: 0.000725  loss: 2.5719 (2.7462)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9104 (1.0992)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [221]  [1000/1251]  eta: 0:02:38  lr: 0.000722  min_lr: 0.000722  loss: 2.9520 (2.7433)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9242 (1.0716)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [221]  [1200/1251]  eta: 0:00:32  lr: 0.000720  min_lr: 0.000720  loss: 2.8144 (2.7396)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0724 (1.0726)  time: 0.6323  data: 0.0004  max mem: 54228
Epoch: [221]  [1250/1251]  eta: 0:00:00  lr: 0.000719  min_lr: 0.000719  loss: 2.5577 (2.7357)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0724 (1.0737)  time: 0.5328  data: 0.0006  max mem: 54228
Epoch: [221] Total time: 0:13:09 (0.6310 s / it)
Averaged stats: lr: 0.000719  min_lr: 0.000719  loss: 2.5577 (2.7289)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0724 (1.0737)
Test:  [ 0/25]  eta: 0:02:35  loss: 0.5886 (0.5886)  acc1: 90.4000 (90.4000)  acc5: 98.8000 (98.8000)  time: 6.2090  data: 5.8797  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7303 (0.7221)  acc1: 86.4000 (86.7636)  acc5: 98.0000 (97.8546)  time: 0.8370  data: 0.5349  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9498 (0.8550)  acc1: 80.8000 (83.1238)  acc5: 96.4000 (96.7619)  time: 0.2996  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9127 (0.8638)  acc1: 80.8000 (82.7840)  acc5: 96.4000 (96.7200)  time: 0.2995  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5401 s / it)
* Acc@1 83.682 Acc@5 96.866 loss 0.850
Accuracy of the model on the 50000 test images: 83.7%
Max accuracy: 83.73%
Epoch: [222]  [   0/1251]  eta: 1:28:46  lr: 0.000719  min_lr: 0.000719  loss: 2.6032 (2.6032)  weight_decay: 0.0500 (0.0500)  time: 4.2576  data: 1.8596  max mem: 54228
Epoch: [222]  [ 200/1251]  eta: 0:11:21  lr: 0.000716  min_lr: 0.000716  loss: 2.7250 (2.7292)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8570 (1.1139)  time: 0.6364  data: 0.0005  max mem: 54228
Epoch: [222]  [ 400/1251]  eta: 0:09:04  lr: 0.000714  min_lr: 0.000714  loss: 2.7949 (2.7421)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0421 (1.0676)  time: 0.6382  data: 0.0004  max mem: 54228
Epoch: [222]  [ 600/1251]  eta: 0:06:53  lr: 0.000711  min_lr: 0.000711  loss: 2.7706 (2.7191)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8407 (1.0520)  time: 0.6278  data: 0.0004  max mem: 54228
Epoch: [222]  [ 800/1251]  eta: 0:04:46  lr: 0.000708  min_lr: 0.000708  loss: 2.9021 (2.7248)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8349 (1.0357)  time: 0.6352  data: 0.0005  max mem: 54228
Epoch: [222]  [1000/1251]  eta: 0:02:38  lr: 0.000705  min_lr: 0.000705  loss: 2.7303 (2.7261)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9295 (1.0249)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [222]  [1200/1251]  eta: 0:00:32  lr: 0.000703  min_lr: 0.000703  loss: 2.6878 (2.7296)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2416 (1.0815)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [222]  [1250/1251]  eta: 0:00:00  lr: 0.000702  min_lr: 0.000702  loss: 2.8955 (2.7287)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8630 (1.0779)  time: 0.5331  data: 0.0005  max mem: 54228
Epoch: [222] Total time: 0:13:09 (0.6314 s / it)
Averaged stats: lr: 0.000702  min_lr: 0.000702  loss: 2.8955 (2.7277)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8630 (1.0779)
Test:  [ 0/25]  eta: 0:02:31  loss: 0.6323 (0.6323)  acc1: 91.2000 (91.2000)  acc5: 99.6000 (99.6000)  time: 6.0709  data: 5.7254  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7787 (0.7890)  acc1: 87.2000 (87.1273)  acc5: 97.6000 (98.1091)  time: 0.8243  data: 0.5208  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9814 (0.9122)  acc1: 82.8000 (83.7333)  acc5: 96.8000 (96.8000)  time: 0.2996  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9810 (0.9228)  acc1: 80.8000 (83.2800)  acc5: 96.0000 (96.7680)  time: 0.2995  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5354 s / it)
* Acc@1 83.872 Acc@5 96.852 loss 0.912
Accuracy of the model on the 50000 test images: 83.9%
Max accuracy: 83.87%
Epoch: [223]  [   0/1251]  eta: 1:17:51  lr: 0.000702  min_lr: 0.000702  loss: 2.6471 (2.6471)  weight_decay: 0.0500 (0.0500)  time: 3.7346  data: 3.1103  max mem: 54228
Epoch: [223]  [ 200/1251]  eta: 0:11:20  lr: 0.000699  min_lr: 0.000699  loss: 2.7826 (2.6723)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1824 (1.0514)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [223]  [ 400/1251]  eta: 0:09:02  lr: 0.000696  min_lr: 0.000696  loss: 2.9059 (2.7117)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8608 (1.0221)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [223]  [ 600/1251]  eta: 0:06:53  lr: 0.000694  min_lr: 0.000694  loss: 2.9073 (2.7225)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0365 (1.0566)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [223]  [ 800/1251]  eta: 0:04:45  lr: 0.000691  min_lr: 0.000691  loss: 2.8238 (2.7230)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1449 (1.0855)  time: 0.6279  data: 0.0005  max mem: 54228
Epoch: [223]  [1000/1251]  eta: 0:02:38  lr: 0.000688  min_lr: 0.000688  loss: 2.8770 (2.7258)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1876 (1.0968)  time: 0.6293  data: 0.0005  max mem: 54228
Epoch: [223]  [1200/1251]  eta: 0:00:32  lr: 0.000686  min_lr: 0.000686  loss: 2.9118 (2.7261)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9317 (1.0755)  time: 0.6387  data: 0.0005  max mem: 54228
Epoch: [223]  [1250/1251]  eta: 0:00:00  lr: 0.000685  min_lr: 0.000685  loss: 2.9167 (2.7264)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9317 (1.0677)  time: 0.5331  data: 0.0004  max mem: 54228
Epoch: [223] Total time: 0:13:09 (0.6314 s / it)
Averaged stats: lr: 0.000685  min_lr: 0.000685  loss: 2.9167 (2.7228)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9317 (1.0677)
Test:  [ 0/25]  eta: 0:02:36  loss: 0.5760 (0.5760)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 6.2449  data: 5.9294  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7322 (0.7115)  acc1: 87.6000 (86.9455)  acc5: 98.0000 (98.0364)  time: 0.8412  data: 0.5394  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8835 (0.8373)  acc1: 82.8000 (83.6381)  acc5: 96.8000 (97.0667)  time: 0.3002  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.8932 (0.8496)  acc1: 82.0000 (83.2160)  acc5: 96.8000 (96.9920)  time: 0.2996  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5429 s / it)
* Acc@1 83.780 Acc@5 96.796 loss 0.840
Accuracy of the model on the 50000 test images: 83.8%
Max accuracy: 83.87%
Epoch: [224]  [   0/1251]  eta: 1:31:40  lr: 0.000685  min_lr: 0.000685  loss: 3.1384 (3.1384)  weight_decay: 0.0500 (0.0500)  time: 4.3965  data: 2.0657  max mem: 54228
Epoch: [224]  [ 200/1251]  eta: 0:11:19  lr: 0.000682  min_lr: 0.000682  loss: 2.8016 (2.7003)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9569 (1.0922)  time: 0.6279  data: 0.0005  max mem: 54228
Epoch: [224]  [ 400/1251]  eta: 0:09:02  lr: 0.000680  min_lr: 0.000680  loss: 3.0187 (2.7248)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.6277  data: 0.0005  max mem: 54228
Epoch: [224]  [ 600/1251]  eta: 0:06:53  lr: 0.000677  min_lr: 0.000677  loss: 2.7535 (2.7296)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9198 (nan)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [224]  [ 800/1251]  eta: 0:04:45  lr: 0.000674  min_lr: 0.000674  loss: 2.9874 (2.7330)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1677 (nan)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [224]  [1000/1251]  eta: 0:02:38  lr: 0.000671  min_lr: 0.000671  loss: 2.7661 (2.7363)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2483 (nan)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [224]  [1200/1251]  eta: 0:00:32  lr: 0.000669  min_lr: 0.000669  loss: 2.6201 (2.7257)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8826 (nan)  time: 0.6279  data: 0.0005  max mem: 54228
Epoch: [224]  [1250/1251]  eta: 0:00:00  lr: 0.000668  min_lr: 0.000668  loss: 2.8953 (2.7237)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1105 (nan)  time: 0.5329  data: 0.0005  max mem: 54228
Epoch: [224] Total time: 0:13:09 (0.6315 s / it)
Averaged stats: lr: 0.000668  min_lr: 0.000668  loss: 2.8953 (2.7154)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1105 (nan)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.5791 (0.5791)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 5.3418  data: 5.0032  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7048 (0.7008)  acc1: 87.2000 (86.6909)  acc5: 98.0000 (97.8909)  time: 0.8027  data: 0.4997  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8675 (0.8218)  acc1: 81.6000 (83.5810)  acc5: 96.4000 (96.9333)  time: 0.3242  data: 0.0247  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.8988 (0.8328)  acc1: 82.0000 (83.3760)  acc5: 96.4000 (96.8480)  time: 0.2997  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5252 s / it)
* Acc@1 83.920 Acc@5 96.956 loss 0.826
Accuracy of the model on the 50000 test images: 83.9%
Max accuracy: 83.92%
Epoch: [225]  [   0/1251]  eta: 1:09:40  lr: 0.000668  min_lr: 0.000668  loss: 2.9726 (2.9726)  weight_decay: 0.0500 (0.0500)  time: 3.3419  data: 2.7094  max mem: 54228
Epoch: [225]  [ 200/1251]  eta: 0:11:14  lr: 0.000665  min_lr: 0.000665  loss: 2.7646 (2.7308)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1069 (1.1710)  time: 0.6277  data: 0.0005  max mem: 54228
Epoch: [225]  [ 400/1251]  eta: 0:09:01  lr: 0.000663  min_lr: 0.000663  loss: 2.8346 (2.7099)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0979 (1.1579)  time: 0.6285  data: 0.0004  max mem: 54228
Epoch: [225]  [ 600/1251]  eta: 0:06:52  lr: 0.000660  min_lr: 0.000660  loss: 2.7944 (2.7121)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9250 (1.1203)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [225]  [ 800/1251]  eta: 0:04:45  lr: 0.000657  min_lr: 0.000657  loss: 2.7361 (2.7079)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0697 (1.0926)  time: 0.6344  data: 0.0005  max mem: 54228
Epoch: [225]  [1000/1251]  eta: 0:02:38  lr: 0.000655  min_lr: 0.000655  loss: 2.7359 (2.7074)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9899 (1.0980)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [225]  [1200/1251]  eta: 0:00:32  lr: 0.000652  min_lr: 0.000652  loss: 2.7995 (2.7075)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2880 (1.1250)  time: 0.6286  data: 0.0004  max mem: 54228
Epoch: [225]  [1250/1251]  eta: 0:00:00  lr: 0.000652  min_lr: 0.000652  loss: 2.7258 (2.7066)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2489 (1.1251)  time: 0.5333  data: 0.0006  max mem: 54228
Epoch: [225] Total time: 0:13:09 (0.6307 s / it)
Averaged stats: lr: 0.000652  min_lr: 0.000652  loss: 2.7258 (2.7177)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2489 (1.1251)
Test:  [ 0/25]  eta: 0:02:35  loss: 0.5826 (0.5826)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 6.2397  data: 5.9009  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7011 (0.6987)  acc1: 86.8000 (86.8727)  acc5: 98.0000 (97.8909)  time: 0.8396  data: 0.5367  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8520 (0.8210)  acc1: 82.4000 (83.9238)  acc5: 96.4000 (96.9333)  time: 0.2995  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.8796 (0.8305)  acc1: 82.0000 (83.5840)  acc5: 96.4000 (96.8480)  time: 0.2994  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5414 s / it)
* Acc@1 84.046 Acc@5 96.854 loss 0.821
Accuracy of the model on the 50000 test images: 84.0%
Max accuracy: 84.05%
Epoch: [226]  [   0/1251]  eta: 1:20:32  lr: 0.000651  min_lr: 0.000651  loss: 2.3990 (2.3990)  weight_decay: 0.0500 (0.0500)  time: 3.8631  data: 3.2275  max mem: 54228
Epoch: [226]  [ 200/1251]  eta: 0:11:20  lr: 0.000649  min_lr: 0.000649  loss: 2.8106 (2.6862)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1247 (1.0958)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [226]  [ 400/1251]  eta: 0:09:02  lr: 0.000646  min_lr: 0.000646  loss: 2.7828 (2.6748)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9116 (1.0863)  time: 0.6276  data: 0.0004  max mem: 54228
Epoch: [226]  [ 600/1251]  eta: 0:06:53  lr: 0.000644  min_lr: 0.000644  loss: 2.8809 (2.6925)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9035 (1.0904)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [226]  [ 800/1251]  eta: 0:04:45  lr: 0.000641  min_lr: 0.000641  loss: 2.8076 (2.7001)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0487 (1.1003)  time: 0.6286  data: 0.0004  max mem: 54228
Epoch: [226]  [1000/1251]  eta: 0:02:38  lr: 0.000638  min_lr: 0.000638  loss: 2.3542 (2.7065)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9922 (1.1061)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [226]  [1200/1251]  eta: 0:00:32  lr: 0.000636  min_lr: 0.000636  loss: 2.7740 (2.6970)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9774 (1.1227)  time: 0.6286  data: 0.0004  max mem: 54228
Epoch: [226]  [1250/1251]  eta: 0:00:00  lr: 0.000635  min_lr: 0.000635  loss: 2.5921 (2.6979)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9557 (1.1246)  time: 0.5336  data: 0.0006  max mem: 54228
Epoch: [226] Total time: 0:13:09 (0.6313 s / it)
Averaged stats: lr: 0.000635  min_lr: 0.000635  loss: 2.5921 (2.7036)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9557 (1.1246)
Test:  [ 0/25]  eta: 0:02:45  loss: 0.5731 (0.5731)  acc1: 90.8000 (90.8000)  acc5: 98.8000 (98.8000)  time: 6.6054  data: 6.2756  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.6916 (0.6975)  acc1: 86.0000 (86.3636)  acc5: 98.0000 (97.7818)  time: 0.8728  data: 0.5708  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8846 (0.8230)  acc1: 81.2000 (83.3524)  acc5: 96.8000 (96.7810)  time: 0.2994  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.8974 (0.8328)  acc1: 80.8000 (83.0720)  acc5: 96.8000 (96.8160)  time: 0.2993  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5558 s / it)
* Acc@1 83.990 Acc@5 96.930 loss 0.819
Accuracy of the model on the 50000 test images: 84.0%
Max accuracy: 84.05%
Epoch: [227]  [   0/1251]  eta: 1:28:05  lr: 0.000635  min_lr: 0.000635  loss: 2.0828 (2.0828)  weight_decay: 0.0500 (0.0500)  time: 4.2251  data: 3.5567  max mem: 54228
Epoch: [227]  [ 200/1251]  eta: 0:11:20  lr: 0.000632  min_lr: 0.000632  loss: 2.8320 (2.7119)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9578 (0.9869)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [227]  [ 400/1251]  eta: 0:09:02  lr: 0.000630  min_lr: 0.000630  loss: 2.9109 (2.7244)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0974 (1.0693)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [227]  [ 600/1251]  eta: 0:06:53  lr: 0.000627  min_lr: 0.000627  loss: 2.7865 (2.7272)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0452 (1.1006)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [227]  [ 800/1251]  eta: 0:04:45  lr: 0.000625  min_lr: 0.000625  loss: 2.7952 (2.7282)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8781 (1.0568)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [227]  [1000/1251]  eta: 0:02:38  lr: 0.000622  min_lr: 0.000622  loss: 2.5892 (2.7210)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9114 (1.0498)  time: 0.6343  data: 0.0005  max mem: 54228
Epoch: [227]  [1200/1251]  eta: 0:00:32  lr: 0.000619  min_lr: 0.000619  loss: 2.6722 (2.7185)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8980 (1.0452)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [227]  [1250/1251]  eta: 0:00:00  lr: 0.000619  min_lr: 0.000619  loss: 2.7663 (2.7169)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8857 (1.0416)  time: 0.5334  data: 0.0007  max mem: 54228
Epoch: [227] Total time: 0:13:10 (0.6316 s / it)
Averaged stats: lr: 0.000619  min_lr: 0.000619  loss: 2.7663 (2.7018)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8857 (1.0416)
Test:  [ 0/25]  eta: 0:02:35  loss: 0.5563 (0.5563)  acc1: 90.4000 (90.4000)  acc5: 98.8000 (98.8000)  time: 6.2297  data: 5.9013  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.6606 (0.6865)  acc1: 86.4000 (86.6909)  acc5: 98.0000 (97.8546)  time: 0.8388  data: 0.5368  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8521 (0.8158)  acc1: 82.0000 (83.5810)  acc5: 96.0000 (96.5905)  time: 0.2996  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.8621 (0.8222)  acc1: 81.2000 (83.3760)  acc5: 96.4000 (96.5920)  time: 0.2994  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5401 s / it)
* Acc@1 84.098 Acc@5 96.864 loss 0.810
Accuracy of the model on the 50000 test images: 84.1%
Max accuracy: 84.10%
Epoch: [228]  [   0/1251]  eta: 1:13:46  lr: 0.000619  min_lr: 0.000619  loss: 2.5087 (2.5087)  weight_decay: 0.0500 (0.0500)  time: 3.5387  data: 2.8985  max mem: 54228
Epoch: [228]  [ 200/1251]  eta: 0:11:15  lr: 0.000616  min_lr: 0.000616  loss: 2.7739 (2.6762)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9278 (1.1694)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [228]  [ 400/1251]  eta: 0:09:02  lr: 0.000614  min_lr: 0.000614  loss: 2.8062 (2.6851)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9134 (1.0851)  time: 0.6288  data: 0.0004  max mem: 54228
Epoch: [228]  [ 600/1251]  eta: 0:06:53  lr: 0.000611  min_lr: 0.000611  loss: 2.6500 (2.6762)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0099 (1.0968)  time: 0.6290  data: 0.0004  max mem: 54228
Epoch: [228]  [ 800/1251]  eta: 0:04:45  lr: 0.000608  min_lr: 0.000608  loss: 2.7856 (2.6825)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8792 (1.0821)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [228]  [1000/1251]  eta: 0:02:38  lr: 0.000606  min_lr: 0.000606  loss: 2.7029 (2.6839)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.6289  data: 0.0005  max mem: 54228
Epoch: [228]  [1200/1251]  eta: 0:00:32  lr: 0.000603  min_lr: 0.000603  loss: 2.8282 (2.6909)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0777 (nan)  time: 0.6292  data: 0.0005  max mem: 54228
Epoch: [228]  [1250/1251]  eta: 0:00:00  lr: 0.000603  min_lr: 0.000603  loss: 2.8867 (2.6931)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1356 (nan)  time: 0.5336  data: 0.0006  max mem: 54228
Epoch: [228] Total time: 0:13:09 (0.6314 s / it)
Averaged stats: lr: 0.000603  min_lr: 0.000603  loss: 2.8867 (2.6990)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1356 (nan)
Test:  [ 0/25]  eta: 0:02:29  loss: 0.6221 (0.6221)  acc1: 90.0000 (90.0000)  acc5: 98.4000 (98.4000)  time: 5.9860  data: 5.6534  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.6962 (0.7289)  acc1: 86.0000 (86.8364)  acc5: 98.0000 (98.0000)  time: 0.8167  data: 0.5143  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8986 (0.8516)  acc1: 81.6000 (83.7714)  acc5: 96.0000 (96.7048)  time: 0.2997  data: 0.0003  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9014 (0.8602)  acc1: 82.0000 (83.6160)  acc5: 96.4000 (96.7360)  time: 0.2996  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5309 s / it)
* Acc@1 84.130 Acc@5 96.934 loss 0.854
Accuracy of the model on the 50000 test images: 84.1%
Max accuracy: 84.13%
Epoch: [229]  [   0/1251]  eta: 1:12:32  lr: 0.000603  min_lr: 0.000603  loss: 2.9264 (2.9264)  weight_decay: 0.0500 (0.0500)  time: 3.4793  data: 2.8475  max mem: 54228
Epoch: [229]  [ 200/1251]  eta: 0:11:17  lr: 0.000600  min_lr: 0.000600  loss: 2.9043 (2.7194)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2079 (1.1257)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [229]  [ 400/1251]  eta: 0:09:02  lr: 0.000597  min_lr: 0.000597  loss: 2.9310 (2.7201)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5527 (1.2200)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [229]  [ 600/1251]  eta: 0:06:53  lr: 0.000595  min_lr: 0.000595  loss: 2.7736 (2.7094)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9136 (1.1717)  time: 0.6355  data: 0.0005  max mem: 54228
Epoch: [229]  [ 800/1251]  eta: 0:04:45  lr: 0.000592  min_lr: 0.000592  loss: 2.8366 (2.7125)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9881 (1.1271)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [229]  [1000/1251]  eta: 0:02:38  lr: 0.000590  min_lr: 0.000590  loss: 2.6757 (2.6933)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0102 (1.1067)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [229]  [1200/1251]  eta: 0:00:32  lr: 0.000587  min_lr: 0.000587  loss: 2.8932 (2.7020)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0244 (1.0944)  time: 0.6274  data: 0.0005  max mem: 54228
Epoch: [229]  [1250/1251]  eta: 0:00:00  lr: 0.000587  min_lr: 0.000587  loss: 2.6978 (2.6988)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0280 (1.0985)  time: 0.5331  data: 0.0007  max mem: 54228
Epoch: [229] Total time: 0:13:09 (0.6309 s / it)
Averaged stats: lr: 0.000587  min_lr: 0.000587  loss: 2.6978 (2.6959)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0280 (1.0985)
Test:  [ 0/25]  eta: 0:02:51  loss: 0.5487 (0.5487)  acc1: 90.4000 (90.4000)  acc5: 98.8000 (98.8000)  time: 6.8563  data: 6.5318  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.6733 (0.6908)  acc1: 86.8000 (86.7636)  acc5: 98.4000 (97.9273)  time: 0.8955  data: 0.5941  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 0.8715 (0.8114)  acc1: 81.6000 (83.7905)  acc5: 96.4000 (96.8571)  time: 0.2993  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.8715 (0.8211)  acc1: 82.0000 (83.6000)  acc5: 96.4000 (96.7680)  time: 0.2993  data: 0.0001  max mem: 54228
Test: Total time: 0:00:14 (0.5657 s / it)
* Acc@1 84.020 Acc@5 96.920 loss 0.815
Accuracy of the model on the 50000 test images: 84.0%
Max accuracy: 84.13%
Epoch: [230]  [   0/1251]  eta: 1:23:38  lr: 0.000587  min_lr: 0.000587  loss: 2.3287 (2.3287)  weight_decay: 0.0500 (0.0500)  time: 4.0118  data: 3.2317  max mem: 54228
Epoch: [230]  [ 200/1251]  eta: 0:11:19  lr: 0.000584  min_lr: 0.000584  loss: 2.7302 (2.6734)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8752 (1.1429)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [230]  [ 400/1251]  eta: 0:09:02  lr: 0.000582  min_lr: 0.000582  loss: 2.8168 (2.6688)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1585 (1.1103)  time: 0.6382  data: 0.0005  max mem: 54228
Epoch: [230]  [ 600/1251]  eta: 0:06:53  lr: 0.000579  min_lr: 0.000579  loss: 2.4856 (2.6635)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9865 (1.1110)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [230]  [ 800/1251]  eta: 0:04:45  lr: 0.000577  min_lr: 0.000577  loss: 2.8065 (2.6728)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0157 (1.1026)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [230]  [1000/1251]  eta: 0:02:38  lr: 0.000574  min_lr: 0.000574  loss: 2.7498 (2.6759)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9420 (1.0800)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [230]  [1200/1251]  eta: 0:00:32  lr: 0.000571  min_lr: 0.000571  loss: 2.7888 (2.6738)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2538 (1.1066)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [230]  [1250/1251]  eta: 0:00:00  lr: 0.000571  min_lr: 0.000571  loss: 2.8602 (2.6779)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1872 (1.1156)  time: 0.5335  data: 0.0005  max mem: 54228
Epoch: [230] Total time: 0:13:10 (0.6316 s / it)
Averaged stats: lr: 0.000571  min_lr: 0.000571  loss: 2.8602 (2.6832)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1872 (1.1156)
Test:  [ 0/25]  eta: 0:02:04  loss: 0.5659 (0.5659)  acc1: 92.0000 (92.0000)  acc5: 98.4000 (98.4000)  time: 5.0000  data: 4.6762  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7061 (0.7162)  acc1: 87.2000 (87.0546)  acc5: 98.0000 (97.7818)  time: 0.8022  data: 0.5006  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8852 (0.8412)  acc1: 82.4000 (84.0000)  acc5: 96.4000 (96.5143)  time: 0.3409  data: 0.0416  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9058 (0.8502)  acc1: 82.4000 (83.7280)  acc5: 95.6000 (96.4160)  time: 0.2995  data: 0.0002  max mem: 54228
Test: Total time: 0:00:13 (0.5250 s / it)
* Acc@1 84.094 Acc@5 96.886 loss 0.843
Accuracy of the model on the 50000 test images: 84.1%
Max accuracy: 84.13%
Epoch: [231]  [   0/1251]  eta: 1:23:43  lr: 0.000571  min_lr: 0.000571  loss: 1.9557 (1.9557)  weight_decay: 0.0500 (0.0500)  time: 4.0152  data: 3.3286  max mem: 54228
Epoch: [231]  [ 200/1251]  eta: 0:11:18  lr: 0.000568  min_lr: 0.000568  loss: 2.6291 (2.6550)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9541 (1.1268)  time: 0.6278  data: 0.0005  max mem: 54228
Epoch: [231]  [ 400/1251]  eta: 0:09:02  lr: 0.000566  min_lr: 0.000566  loss: 2.8049 (2.6569)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9434 (1.0570)  time: 0.6356  data: 0.0004  max mem: 54228
Epoch: [231]  [ 600/1251]  eta: 0:06:53  lr: 0.000563  min_lr: 0.000563  loss: 2.8130 (2.6600)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0486 (1.1493)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [231]  [ 800/1251]  eta: 0:04:45  lr: 0.000561  min_lr: 0.000561  loss: 2.7489 (2.6806)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0619 (1.1557)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [231]  [1000/1251]  eta: 0:02:38  lr: 0.000558  min_lr: 0.000558  loss: 2.6665 (2.6782)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0380 (1.1491)  time: 0.6276  data: 0.0004  max mem: 54228
Epoch: [231]  [1200/1251]  eta: 0:00:32  lr: 0.000556  min_lr: 0.000556  loss: 2.6287 (2.6830)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9098 (1.1323)  time: 0.6277  data: 0.0005  max mem: 54228
Epoch: [231]  [1250/1251]  eta: 0:00:00  lr: 0.000555  min_lr: 0.000555  loss: 2.6375 (2.6822)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9049 (1.1291)  time: 0.5329  data: 0.0007  max mem: 54228
Epoch: [231] Total time: 0:13:09 (0.6310 s / it)
Averaged stats: lr: 0.000555  min_lr: 0.000555  loss: 2.6375 (2.6810)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9049 (1.1291)
Test:  [ 0/25]  eta: 0:01:48  loss: 0.5962 (0.5962)  acc1: 90.8000 (90.8000)  acc5: 98.8000 (98.8000)  time: 4.3276  data: 4.0011  max mem: 54228
Test:  [10/25]  eta: 0:00:11  loss: 0.6890 (0.7111)  acc1: 86.8000 (87.0909)  acc5: 97.6000 (97.9273)  time: 0.7553  data: 0.4544  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8622 (0.8376)  acc1: 82.0000 (83.6571)  acc5: 96.4000 (96.9524)  time: 0.3481  data: 0.0499  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9149 (0.8466)  acc1: 82.0000 (83.6000)  acc5: 96.4000 (96.8960)  time: 0.2983  data: 0.0002  max mem: 54228
Test: Total time: 0:00:12 (0.5034 s / it)
* Acc@1 84.284 Acc@5 96.908 loss 0.840
Accuracy of the model on the 50000 test images: 84.3%
Max accuracy: 84.28%
Epoch: [232]  [   0/1251]  eta: 1:09:00  lr: 0.000555  min_lr: 0.000555  loss: 2.8727 (2.8727)  weight_decay: 0.0500 (0.0500)  time: 3.3097  data: 2.6682  max mem: 54228
Epoch: [232]  [ 200/1251]  eta: 0:11:14  lr: 0.000553  min_lr: 0.000553  loss: 2.6423 (2.6962)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0183 (1.2732)  time: 0.6273  data: 0.0005  max mem: 54228
Epoch: [232]  [ 400/1251]  eta: 0:09:01  lr: 0.000550  min_lr: 0.000550  loss: 2.5634 (2.6846)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0300 (1.2002)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [232]  [ 600/1251]  eta: 0:06:52  lr: 0.000548  min_lr: 0.000548  loss: 2.7457 (2.6694)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0023 (1.1605)  time: 0.6278  data: 0.0005  max mem: 54228
Epoch: [232]  [ 800/1251]  eta: 0:04:45  lr: 0.000545  min_lr: 0.000545  loss: 2.7462 (2.6745)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1221 (1.1628)  time: 0.6327  data: 0.0005  max mem: 54228
Epoch: [232]  [1000/1251]  eta: 0:02:38  lr: 0.000543  min_lr: 0.000543  loss: 2.7979 (2.6822)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1929 (1.1851)  time: 0.6277  data: 0.0004  max mem: 54228
Epoch: [232]  [1200/1251]  eta: 0:00:32  lr: 0.000540  min_lr: 0.000540  loss: 2.3241 (2.6772)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9433 (1.1623)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [232]  [1250/1251]  eta: 0:00:00  lr: 0.000540  min_lr: 0.000540  loss: 2.6283 (2.6762)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2130 (1.1658)  time: 0.5334  data: 0.0006  max mem: 54228
Epoch: [232] Total time: 0:13:08 (0.6302 s / it)
Averaged stats: lr: 0.000540  min_lr: 0.000540  loss: 2.6283 (2.6857)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2130 (1.1658)
Test:  [ 0/25]  eta: 0:02:31  loss: 0.5372 (0.5372)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 6.0636  data: 5.7203  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.6548 (0.6763)  acc1: 86.4000 (86.7636)  acc5: 98.0000 (97.9273)  time: 0.8240  data: 0.5204  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8326 (0.7955)  acc1: 82.0000 (84.0571)  acc5: 96.4000 (96.7810)  time: 0.3000  data: 0.0003  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.8666 (0.8090)  acc1: 81.6000 (83.6960)  acc5: 96.4000 (96.7200)  time: 0.3000  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5352 s / it)
* Acc@1 84.216 Acc@5 96.890 loss 0.801
Accuracy of the model on the 50000 test images: 84.2%
Max accuracy: 84.28%
Epoch: [233]  [   0/1251]  eta: 1:23:12  lr: 0.000540  min_lr: 0.000540  loss: 2.8783 (2.8783)  weight_decay: 0.0500 (0.0500)  time: 3.9906  data: 2.7410  max mem: 54228
Epoch: [233]  [ 200/1251]  eta: 0:11:21  lr: 0.000537  min_lr: 0.000537  loss: 2.6531 (2.6627)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8866 (1.0190)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [233]  [ 400/1251]  eta: 0:09:03  lr: 0.000535  min_lr: 0.000535  loss: 2.6874 (2.6677)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9561 (1.0835)  time: 0.6278  data: 0.0005  max mem: 54228
Epoch: [233]  [ 600/1251]  eta: 0:06:53  lr: 0.000533  min_lr: 0.000533  loss: 2.6556 (2.6641)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0121 (1.0898)  time: 0.6363  data: 0.0005  max mem: 54228
Epoch: [233]  [ 800/1251]  eta: 0:04:46  lr: 0.000530  min_lr: 0.000530  loss: 2.5682 (2.6531)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1313 (1.1273)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [233]  [1000/1251]  eta: 0:02:38  lr: 0.000528  min_lr: 0.000528  loss: 2.5879 (2.6612)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0094 (1.1361)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [233]  [1200/1251]  eta: 0:00:32  lr: 0.000525  min_lr: 0.000525  loss: 2.9543 (2.6669)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0823 (1.1327)  time: 0.6290  data: 0.0005  max mem: 54228
Epoch: [233]  [1250/1251]  eta: 0:00:00  lr: 0.000525  min_lr: 0.000525  loss: 2.5354 (2.6644)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2930 (1.1477)  time: 0.5339  data: 0.0007  max mem: 54228
Epoch: [233] Total time: 0:13:10 (0.6318 s / it)
Averaged stats: lr: 0.000525  min_lr: 0.000525  loss: 2.5354 (2.6627)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2930 (1.1477)
Test:  [ 0/25]  eta: 0:02:44  loss: 0.5838 (0.5838)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 6.5913  data: 6.2568  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.6771 (0.6840)  acc1: 87.6000 (87.0546)  acc5: 97.6000 (98.0364)  time: 0.8715  data: 0.5691  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8473 (0.8097)  acc1: 82.0000 (83.9238)  acc5: 96.8000 (96.8191)  time: 0.2998  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.8774 (0.8222)  acc1: 82.0000 (83.6000)  acc5: 96.0000 (96.6880)  time: 0.2999  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5575 s / it)
* Acc@1 84.116 Acc@5 96.878 loss 0.804
Accuracy of the model on the 50000 test images: 84.1%
Max accuracy: 84.28%
Epoch: [234]  [   0/1251]  eta: 1:27:25  lr: 0.000525  min_lr: 0.000525  loss: 1.5431 (1.5431)  weight_decay: 0.0500 (0.0500)  time: 4.1934  data: 2.4849  max mem: 54228
Epoch: [234]  [ 200/1251]  eta: 0:11:18  lr: 0.000522  min_lr: 0.000522  loss: 2.5694 (2.6564)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0308 (1.0500)  time: 0.6277  data: 0.0004  max mem: 54228
Epoch: [234]  [ 400/1251]  eta: 0:09:02  lr: 0.000520  min_lr: 0.000520  loss: 2.7999 (2.6399)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2377 (1.1111)  time: 0.6362  data: 0.0005  max mem: 54228
Epoch: [234]  [ 600/1251]  eta: 0:06:53  lr: 0.000517  min_lr: 0.000517  loss: 2.7644 (2.6547)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9398 (1.0883)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [234]  [ 800/1251]  eta: 0:04:45  lr: 0.000515  min_lr: 0.000515  loss: 2.7663 (2.6570)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1674 (1.1240)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [234]  [1000/1251]  eta: 0:02:38  lr: 0.000513  min_lr: 0.000513  loss: 2.5989 (2.6661)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0682 (1.1470)  time: 0.6337  data: 0.0005  max mem: 54228
Epoch: [234]  [1200/1251]  eta: 0:00:32  lr: 0.000510  min_lr: 0.000510  loss: 2.7671 (2.6653)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8417 (1.1321)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [234]  [1250/1251]  eta: 0:00:00  lr: 0.000510  min_lr: 0.000510  loss: 2.6546 (2.6646)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8720 (1.1251)  time: 0.5334  data: 0.0005  max mem: 54228
Epoch: [234] Total time: 0:13:09 (0.6310 s / it)
Averaged stats: lr: 0.000510  min_lr: 0.000510  loss: 2.6546 (2.6689)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8720 (1.1251)
Test:  [ 0/25]  eta: 0:02:42  loss: 0.5830 (0.5830)  acc1: 92.0000 (92.0000)  acc5: 99.2000 (99.2000)  time: 6.4807  data: 6.1577  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7093 (0.7203)  acc1: 86.8000 (87.5273)  acc5: 98.0000 (97.8546)  time: 0.8616  data: 0.5601  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8918 (0.8441)  acc1: 82.0000 (84.1714)  acc5: 96.8000 (96.8381)  time: 0.2995  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9203 (0.8571)  acc1: 81.2000 (83.6960)  acc5: 96.8000 (96.8000)  time: 0.2995  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5516 s / it)
* Acc@1 84.016 Acc@5 96.858 loss 0.849
Accuracy of the model on the 50000 test images: 84.0%
Max accuracy: 84.28%
Epoch: [235]  [   0/1251]  eta: 1:23:51  lr: 0.000510  min_lr: 0.000510  loss: 2.9264 (2.9264)  weight_decay: 0.0500 (0.0500)  time: 4.0222  data: 2.9366  max mem: 54228
Epoch: [235]  [ 200/1251]  eta: 0:11:18  lr: 0.000507  min_lr: 0.000507  loss: 2.7422 (2.6447)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0995 (1.1113)  time: 0.6289  data: 0.0005  max mem: 54228
Epoch: [235]  [ 400/1251]  eta: 0:09:03  lr: 0.000505  min_lr: 0.000505  loss: 2.7911 (2.6719)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9953 (1.1334)  time: 0.6288  data: 0.0005  max mem: 54228
Epoch: [235]  [ 600/1251]  eta: 0:06:53  lr: 0.000502  min_lr: 0.000502  loss: 2.8116 (2.6824)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0157 (1.1263)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [235]  [ 800/1251]  eta: 0:04:45  lr: 0.000500  min_lr: 0.000500  loss: 2.8681 (2.6741)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8794 (1.1098)  time: 0.6277  data: 0.0005  max mem: 54228
Epoch: [235]  [1000/1251]  eta: 0:02:38  lr: 0.000498  min_lr: 0.000498  loss: 2.9151 (2.6599)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1282 (1.1104)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [235]  [1200/1251]  eta: 0:00:32  lr: 0.000495  min_lr: 0.000495  loss: 2.6804 (2.6613)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2166 (1.1233)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [235]  [1250/1251]  eta: 0:00:00  lr: 0.000495  min_lr: 0.000495  loss: 2.6184 (2.6588)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0362 (1.1201)  time: 0.5331  data: 0.0006  max mem: 54228
Epoch: [235] Total time: 0:13:09 (0.6313 s / it)
Averaged stats: lr: 0.000495  min_lr: 0.000495  loss: 2.6184 (2.6602)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0362 (1.1201)
Test:  [ 0/25]  eta: 0:02:42  loss: 0.5971 (0.5971)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 6.5066  data: 6.1795  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7295 (0.7369)  acc1: 86.8000 (86.6909)  acc5: 98.0000 (97.9636)  time: 0.8642  data: 0.5622  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8768 (0.8497)  acc1: 82.0000 (83.8095)  acc5: 96.8000 (96.8571)  time: 0.3002  data: 0.0003  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9195 (0.8620)  acc1: 82.0000 (83.5680)  acc5: 96.4000 (96.8480)  time: 0.3002  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5527 s / it)
* Acc@1 84.102 Acc@5 96.846 loss 0.854
Accuracy of the model on the 50000 test images: 84.1%
Max accuracy: 84.28%
Epoch: [236]  [   0/1251]  eta: 1:21:53  lr: 0.000495  min_lr: 0.000495  loss: 3.1312 (3.1312)  weight_decay: 0.0500 (0.0500)  time: 3.9276  data: 2.6417  max mem: 54228
Epoch: [236]  [ 200/1251]  eta: 0:11:20  lr: 0.000492  min_lr: 0.000492  loss: 2.8481 (2.6449)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0148 (1.0395)  time: 0.6273  data: 0.0004  max mem: 54228
Epoch: [236]  [ 400/1251]  eta: 0:09:02  lr: 0.000490  min_lr: 0.000490  loss: 2.6635 (2.6489)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2444 (1.1123)  time: 0.6277  data: 0.0004  max mem: 54228
Epoch: [236]  [ 600/1251]  eta: 0:06:52  lr: 0.000488  min_lr: 0.000488  loss: 2.7028 (2.6524)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0334 (1.0972)  time: 0.6278  data: 0.0004  max mem: 54228
Epoch: [236]  [ 800/1251]  eta: 0:04:45  lr: 0.000485  min_lr: 0.000485  loss: 2.8438 (2.6689)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2033 (1.1718)  time: 0.6275  data: 0.0004  max mem: 54228
Epoch: [236]  [1000/1251]  eta: 0:02:38  lr: 0.000483  min_lr: 0.000483  loss: 2.6325 (2.6703)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9914 (1.1687)  time: 0.6276  data: 0.0004  max mem: 54228
Epoch: [236]  [1200/1251]  eta: 0:00:32  lr: 0.000481  min_lr: 0.000481  loss: 2.9769 (2.6715)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9281 (1.1702)  time: 0.6276  data: 0.0004  max mem: 54228
Epoch: [236]  [1250/1251]  eta: 0:00:00  lr: 0.000480  min_lr: 0.000480  loss: 2.7707 (2.6720)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9281 (1.1647)  time: 0.5332  data: 0.0006  max mem: 54228
Epoch: [236] Total time: 0:13:09 (0.6308 s / it)
Averaged stats: lr: 0.000480  min_lr: 0.000480  loss: 2.7707 (2.6616)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9281 (1.1647)
Test:  [ 0/25]  eta: 0:02:39  loss: 0.6187 (0.6187)  acc1: 91.2000 (91.2000)  acc5: 98.8000 (98.8000)  time: 6.3664  data: 6.0301  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7528 (0.7525)  acc1: 86.8000 (86.8727)  acc5: 98.0000 (97.9636)  time: 0.8512  data: 0.5485  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9128 (0.8693)  acc1: 83.2000 (84.2286)  acc5: 96.4000 (96.8762)  time: 0.2995  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9475 (0.8815)  acc1: 81.6000 (83.8560)  acc5: 96.4000 (96.7840)  time: 0.2995  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5464 s / it)
* Acc@1 84.328 Acc@5 96.902 loss 0.872
Accuracy of the model on the 50000 test images: 84.3%
Max accuracy: 84.33%
Epoch: [237]  [   0/1251]  eta: 1:19:35  lr: 0.000480  min_lr: 0.000480  loss: 2.4911 (2.4911)  weight_decay: 0.0500 (0.0500)  time: 3.8174  data: 3.1706  max mem: 54228
Epoch: [237]  [ 200/1251]  eta: 0:11:16  lr: 0.000478  min_lr: 0.000478  loss: 2.8394 (2.6352)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0975 (1.1930)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [237]  [ 400/1251]  eta: 0:09:01  lr: 0.000475  min_lr: 0.000475  loss: 2.6147 (2.6251)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9024 (1.0971)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [237]  [ 600/1251]  eta: 0:06:53  lr: 0.000473  min_lr: 0.000473  loss: 2.7001 (2.6383)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0912 (1.0803)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [237]  [ 800/1251]  eta: 0:04:45  lr: 0.000471  min_lr: 0.000471  loss: 2.7412 (2.6337)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9632 (1.0692)  time: 0.6286  data: 0.0004  max mem: 54228
Epoch: [237]  [1000/1251]  eta: 0:02:38  lr: 0.000468  min_lr: 0.000468  loss: 2.7412 (2.6403)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0625 (1.0907)  time: 0.6340  data: 0.0005  max mem: 54228
Epoch: [237]  [1200/1251]  eta: 0:00:32  lr: 0.000466  min_lr: 0.000466  loss: 2.8213 (2.6453)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2006 (1.1159)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [237]  [1250/1251]  eta: 0:00:00  lr: 0.000466  min_lr: 0.000466  loss: 2.8187 (2.6458)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1206 (1.1188)  time: 0.5334  data: 0.0006  max mem: 54228
Epoch: [237] Total time: 0:13:09 (0.6311 s / it)
Averaged stats: lr: 0.000466  min_lr: 0.000466  loss: 2.8187 (2.6546)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1206 (1.1188)
Test:  [ 0/25]  eta: 0:02:44  loss: 0.5967 (0.5967)  acc1: 91.6000 (91.6000)  acc5: 99.2000 (99.2000)  time: 6.5806  data: 6.2637  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.7574 (0.7312)  acc1: 88.0000 (87.2727)  acc5: 98.0000 (97.9636)  time: 0.8708  data: 0.5697  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9063 (0.8465)  acc1: 82.4000 (84.4191)  acc5: 97.2000 (96.9524)  time: 0.3002  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9278 (0.8600)  acc1: 82.0000 (84.0640)  acc5: 96.8000 (96.9760)  time: 0.3003  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5583 s / it)
* Acc@1 84.168 Acc@5 96.868 loss 0.855
Accuracy of the model on the 50000 test images: 84.2%
Max accuracy: 84.33%
Epoch: [238]  [   0/1251]  eta: 1:24:33  lr: 0.000466  min_lr: 0.000466  loss: 1.8404 (1.8404)  weight_decay: 0.0500 (0.0500)  time: 4.0553  data: 2.7309  max mem: 54228
Epoch: [238]  [ 200/1251]  eta: 0:11:18  lr: 0.000463  min_lr: 0.000463  loss: 2.7213 (2.6061)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0311 (1.2430)  time: 0.6279  data: 0.0004  max mem: 54228
Epoch: [238]  [ 400/1251]  eta: 0:09:03  lr: 0.000461  min_lr: 0.000461  loss: 2.7532 (2.6425)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0864 (1.2271)  time: 0.6279  data: 0.0004  max mem: 54228
Epoch: [238]  [ 600/1251]  eta: 0:06:53  lr: 0.000459  min_lr: 0.000459  loss: 2.6550 (2.6433)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0358 (1.2056)  time: 0.6279  data: 0.0004  max mem: 54228
Epoch: [238]  [ 800/1251]  eta: 0:04:45  lr: 0.000456  min_lr: 0.000456  loss: 2.8070 (2.6463)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3638 (1.2296)  time: 0.6278  data: 0.0005  max mem: 54228
Epoch: [238]  [1000/1251]  eta: 0:02:38  lr: 0.000454  min_lr: 0.000454  loss: 2.8718 (2.6603)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9000 (1.2009)  time: 0.6276  data: 0.0005  max mem: 54228
Epoch: [238]  [1200/1251]  eta: 0:00:32  lr: 0.000452  min_lr: 0.000452  loss: 2.7523 (2.6551)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0584 (1.1794)  time: 0.6278  data: 0.0004  max mem: 54228
Epoch: [238]  [1250/1251]  eta: 0:00:00  lr: 0.000451  min_lr: 0.000451  loss: 2.6272 (2.6558)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8778 (1.1720)  time: 0.5332  data: 0.0007  max mem: 54228
Epoch: [238] Total time: 0:13:09 (0.6308 s / it)
Averaged stats: lr: 0.000451  min_lr: 0.000451  loss: 2.6272 (2.6452)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8778 (1.1720)
Test:  [ 0/25]  eta: 0:02:50  loss: 0.5688 (0.5688)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 6.8023  data: 6.4675  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.7105 (0.7072)  acc1: 86.8000 (87.0545)  acc5: 98.0000 (98.1818)  time: 0.8902  data: 0.5883  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 0.8775 (0.8225)  acc1: 82.4000 (84.0191)  acc5: 97.2000 (97.0476)  time: 0.2988  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9073 (0.8325)  acc1: 82.4000 (83.7440)  acc5: 96.8000 (97.0240)  time: 0.2988  data: 0.0001  max mem: 54228
Test: Total time: 0:00:14 (0.5632 s / it)
* Acc@1 84.298 Acc@5 96.980 loss 0.828
Accuracy of the model on the 50000 test images: 84.3%
Max accuracy: 84.33%
Epoch: [239]  [   0/1251]  eta: 1:25:05  lr: 0.000451  min_lr: 0.000451  loss: 2.9240 (2.9240)  weight_decay: 0.0500 (0.0500)  time: 4.0813  data: 1.9694  max mem: 54228
Epoch: [239]  [ 200/1251]  eta: 0:11:20  lr: 0.000449  min_lr: 0.000449  loss: 2.6547 (2.6399)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0784 (1.1407)  time: 0.6427  data: 0.0004  max mem: 54228
Epoch: [239]  [ 400/1251]  eta: 0:09:03  lr: 0.000447  min_lr: 0.000447  loss: 2.8607 (2.6548)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0103 (1.1672)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [239]  [ 600/1251]  eta: 0:06:53  lr: 0.000445  min_lr: 0.000445  loss: 2.8371 (2.6297)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2298 (1.2171)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [239]  [ 800/1251]  eta: 0:04:45  lr: 0.000442  min_lr: 0.000442  loss: 2.7377 (2.6326)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0033 (1.2321)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [239]  [1000/1251]  eta: 0:02:38  lr: 0.000440  min_lr: 0.000440  loss: 2.5332 (2.6328)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0243 (1.2065)  time: 0.6277  data: 0.0004  max mem: 54228
Epoch: [239]  [1200/1251]  eta: 0:00:32  lr: 0.000438  min_lr: 0.000438  loss: 2.5241 (2.6344)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1890 (1.1876)  time: 0.6354  data: 0.0004  max mem: 54228
Epoch: [239]  [1250/1251]  eta: 0:00:00  lr: 0.000437  min_lr: 0.000437  loss: 2.5079 (2.6350)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1678 (1.1921)  time: 0.5330  data: 0.0005  max mem: 54228
Epoch: [239] Total time: 0:13:09 (0.6309 s / it)
Averaged stats: lr: 0.000437  min_lr: 0.000437  loss: 2.5079 (2.6391)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1678 (1.1921)
Test:  [ 0/25]  eta: 0:02:11  loss: 0.5376 (0.5376)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 5.2566  data: 4.9225  max mem: 54228
Test:  [10/25]  eta: 0:00:11  loss: 0.6686 (0.6773)  acc1: 86.8000 (87.2000)  acc5: 98.0000 (98.0000)  time: 0.7507  data: 0.4479  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8370 (0.8050)  acc1: 82.4000 (84.1524)  acc5: 96.8000 (96.8762)  time: 0.3003  data: 0.0003  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.8679 (0.8154)  acc1: 82.4000 (83.8720)  acc5: 96.0000 (96.7680)  time: 0.3004  data: 0.0001  max mem: 54228
Test: Total time: 0:00:12 (0.5057 s / it)
* Acc@1 84.250 Acc@5 96.922 loss 0.806
Accuracy of the model on the 50000 test images: 84.3%
Max accuracy: 84.33%
Epoch: [240]  [   0/1251]  eta: 1:16:36  lr: 0.000437  min_lr: 0.000437  loss: 2.9207 (2.9207)  weight_decay: 0.0500 (0.0500)  time: 3.6742  data: 2.1740  max mem: 54228
Epoch: [240]  [ 200/1251]  eta: 0:11:19  lr: 0.000435  min_lr: 0.000435  loss: 2.5549 (2.6381)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9959 (1.0757)  time: 0.6290  data: 0.0005  max mem: 54228
Epoch: [240]  [ 400/1251]  eta: 0:09:02  lr: 0.000433  min_lr: 0.000433  loss: 2.8051 (2.6284)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9805 (1.0432)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [240]  [ 600/1251]  eta: 0:06:53  lr: 0.000431  min_lr: 0.000431  loss: 2.6891 (2.6390)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9883 (1.0726)  time: 0.6327  data: 0.0005  max mem: 54228
Epoch: [240]  [ 800/1251]  eta: 0:04:45  lr: 0.000428  min_lr: 0.000428  loss: 2.7531 (2.6335)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1547 (1.1104)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [240]  [1000/1251]  eta: 0:02:38  lr: 0.000426  min_lr: 0.000426  loss: 2.7868 (2.6359)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9491 (1.1027)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [240]  [1200/1251]  eta: 0:00:32  lr: 0.000424  min_lr: 0.000424  loss: 2.8169 (2.6321)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0270 (1.1079)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [240]  [1250/1251]  eta: 0:00:00  lr: 0.000423  min_lr: 0.000423  loss: 2.6656 (2.6315)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0270 (1.1088)  time: 0.5330  data: 0.0007  max mem: 54228
Epoch: [240] Total time: 0:13:09 (0.6309 s / it)
Averaged stats: lr: 0.000423  min_lr: 0.000423  loss: 2.6656 (2.6435)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0270 (1.1088)
Test:  [ 0/25]  eta: 0:02:38  loss: 0.5853 (0.5853)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 6.3262  data: 5.9849  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.6985 (0.6985)  acc1: 86.8000 (86.7273)  acc5: 98.0000 (98.0727)  time: 0.8472  data: 0.5445  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8838 (0.8157)  acc1: 81.6000 (83.9810)  acc5: 96.8000 (96.9905)  time: 0.2991  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.8825 (0.8249)  acc1: 81.6000 (83.7440)  acc5: 96.8000 (96.9600)  time: 0.2989  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5438 s / it)
* Acc@1 84.458 Acc@5 96.932 loss 0.818
Accuracy of the model on the 50000 test images: 84.5%
Max accuracy: 84.46%
Epoch: [241]  [   0/1251]  eta: 1:07:43  lr: 0.000423  min_lr: 0.000423  loss: 2.3463 (2.3463)  weight_decay: 0.0500 (0.0500)  time: 3.2479  data: 2.6118  max mem: 54228
Epoch: [241]  [ 200/1251]  eta: 0:11:13  lr: 0.000421  min_lr: 0.000421  loss: 2.7969 (2.6287)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1492 (1.2039)  time: 0.6278  data: 0.0004  max mem: 54228
Epoch: [241]  [ 400/1251]  eta: 0:09:01  lr: 0.000419  min_lr: 0.000419  loss: 2.7310 (2.6484)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0214 (1.1677)  time: 0.6357  data: 0.0005  max mem: 54228
Epoch: [241]  [ 600/1251]  eta: 0:06:52  lr: 0.000417  min_lr: 0.000417  loss: 2.6333 (2.6461)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9656 (1.1597)  time: 0.6279  data: 0.0005  max mem: 54228
Epoch: [241]  [ 800/1251]  eta: 0:04:45  lr: 0.000415  min_lr: 0.000415  loss: 2.6365 (2.6544)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0147 (1.1692)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [241]  [1000/1251]  eta: 0:02:38  lr: 0.000412  min_lr: 0.000412  loss: 2.8177 (2.6646)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.6276  data: 0.0005  max mem: 54228
Epoch: [241]  [1200/1251]  eta: 0:00:32  lr: 0.000410  min_lr: 0.000410  loss: 2.7323 (2.6568)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0158 (nan)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [241]  [1250/1251]  eta: 0:00:00  lr: 0.000410  min_lr: 0.000410  loss: 2.8122 (2.6594)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9257 (nan)  time: 0.5335  data: 0.0007  max mem: 54228
Epoch: [241] Total time: 0:13:08 (0.6302 s / it)
Averaged stats: lr: 0.000410  min_lr: 0.000410  loss: 2.8122 (2.6389)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9257 (nan)
Test:  [ 0/25]  eta: 0:01:55  loss: 0.6494 (0.6494)  acc1: 90.0000 (90.0000)  acc5: 98.8000 (98.8000)  time: 4.6260  data: 4.2978  max mem: 54228
Test:  [10/25]  eta: 0:00:11  loss: 0.7533 (0.7726)  acc1: 87.2000 (87.3091)  acc5: 98.0000 (98.0000)  time: 0.7844  data: 0.4825  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9396 (0.8907)  acc1: 82.8000 (84.4000)  acc5: 96.8000 (96.8762)  time: 0.3497  data: 0.0505  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9679 (0.9012)  acc1: 82.8000 (84.0320)  acc5: 96.4000 (96.8000)  time: 0.2993  data: 0.0001  max mem: 54228
Test: Total time: 0:00:12 (0.5173 s / it)
* Acc@1 84.408 Acc@5 96.934 loss 0.894
Accuracy of the model on the 50000 test images: 84.4%
Max accuracy: 84.46%
Epoch: [242]  [   0/1251]  eta: 1:28:46  lr: 0.000410  min_lr: 0.000410  loss: 2.7090 (2.7090)  weight_decay: 0.0500 (0.0500)  time: 4.2578  data: 2.5578  max mem: 54228
Epoch: [242]  [ 200/1251]  eta: 0:11:21  lr: 0.000407  min_lr: 0.000407  loss: 2.8157 (2.6361)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9795 (1.1632)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [242]  [ 400/1251]  eta: 0:09:03  lr: 0.000405  min_lr: 0.000405  loss: 2.7422 (2.6567)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9892 (1.1394)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [242]  [ 600/1251]  eta: 0:06:53  lr: 0.000403  min_lr: 0.000403  loss: 2.6789 (2.6457)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3366 (1.1587)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [242]  [ 800/1251]  eta: 0:04:46  lr: 0.000401  min_lr: 0.000401  loss: 2.7773 (2.6426)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1298 (1.1523)  time: 0.6370  data: 0.0005  max mem: 54228
Epoch: [242]  [1000/1251]  eta: 0:02:38  lr: 0.000399  min_lr: 0.000399  loss: 2.4571 (2.6406)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0503 (1.1506)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [242]  [1200/1251]  eta: 0:00:32  lr: 0.000397  min_lr: 0.000397  loss: 2.6907 (2.6460)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9939 (1.1355)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [242]  [1250/1251]  eta: 0:00:00  lr: 0.000396  min_lr: 0.000396  loss: 2.6726 (2.6448)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9969 (1.1344)  time: 0.5335  data: 0.0005  max mem: 54228
Epoch: [242] Total time: 0:13:09 (0.6313 s / it)
Averaged stats: lr: 0.000396  min_lr: 0.000396  loss: 2.6726 (2.6313)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9969 (1.1344)
Test:  [ 0/25]  eta: 0:02:41  loss: 0.5389 (0.5389)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 6.4448  data: 6.1187  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.6694 (0.6782)  acc1: 87.2000 (87.0546)  acc5: 98.0000 (98.0727)  time: 0.8584  data: 0.5566  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8589 (0.7956)  acc1: 82.0000 (84.0571)  acc5: 96.8000 (96.9333)  time: 0.3001  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.8682 (0.8049)  acc1: 82.0000 (83.7600)  acc5: 96.4000 (96.8960)  time: 0.3002  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5497 s / it)
* Acc@1 84.458 Acc@5 96.992 loss 0.796
Accuracy of the model on the 50000 test images: 84.5%
Max accuracy: 84.46%
Epoch: [243]  [   0/1251]  eta: 1:20:12  lr: 0.000396  min_lr: 0.000396  loss: 2.6886 (2.6886)  weight_decay: 0.0500 (0.0500)  time: 3.8473  data: 2.9049  max mem: 54228
Epoch: [243]  [ 200/1251]  eta: 0:11:18  lr: 0.000394  min_lr: 0.000394  loss: 2.7887 (2.5951)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0717 (1.1565)  time: 0.6285  data: 0.0004  max mem: 54228
Epoch: [243]  [ 400/1251]  eta: 0:09:02  lr: 0.000392  min_lr: 0.000392  loss: 2.6425 (2.5938)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0934 (1.2433)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [243]  [ 600/1251]  eta: 0:06:53  lr: 0.000390  min_lr: 0.000390  loss: 2.7496 (2.6137)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0293 (1.2060)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [243]  [ 800/1251]  eta: 0:04:45  lr: 0.000388  min_lr: 0.000388  loss: 2.6468 (2.6073)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0733 (1.2022)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [243]  [1000/1251]  eta: 0:02:38  lr: 0.000385  min_lr: 0.000385  loss: 2.7304 (2.6087)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0180 (1.2135)  time: 0.6340  data: 0.0005  max mem: 54228
Epoch: [243]  [1200/1251]  eta: 0:00:32  lr: 0.000383  min_lr: 0.000383  loss: 2.7858 (2.6153)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1784 (1.2007)  time: 0.6369  data: 0.0005  max mem: 54228
Epoch: [243]  [1250/1251]  eta: 0:00:00  lr: 0.000383  min_lr: 0.000383  loss: 2.7803 (2.6182)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0521 (1.1949)  time: 0.5333  data: 0.0007  max mem: 54228
Epoch: [243] Total time: 0:13:09 (0.6312 s / it)
Averaged stats: lr: 0.000383  min_lr: 0.000383  loss: 2.7803 (2.6280)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0521 (1.1949)
Test:  [ 0/25]  eta: 0:02:27  loss: 0.6517 (0.6517)  acc1: 90.0000 (90.0000)  acc5: 98.8000 (98.8000)  time: 5.9109  data: 5.5874  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7771 (0.7789)  acc1: 86.8000 (86.9818)  acc5: 97.6000 (97.7455)  time: 0.8099  data: 0.5083  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9745 (0.8964)  acc1: 82.8000 (84.2476)  acc5: 96.8000 (96.7238)  time: 0.2997  data: 0.0003  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9651 (0.9072)  acc1: 82.8000 (83.9040)  acc5: 96.8000 (96.7360)  time: 0.2997  data: 0.0002  max mem: 54228
Test: Total time: 0:00:13 (0.5295 s / it)
* Acc@1 84.378 Acc@5 96.962 loss 0.896
Accuracy of the model on the 50000 test images: 84.4%
Max accuracy: 84.46%
Epoch: [244]  [   0/1251]  eta: 1:20:47  lr: 0.000383  min_lr: 0.000383  loss: 2.7856 (2.7856)  weight_decay: 0.0500 (0.0500)  time: 3.8749  data: 3.1893  max mem: 54228
Epoch: [244]  [ 200/1251]  eta: 0:11:17  lr: 0.000381  min_lr: 0.000381  loss: 2.4626 (2.6356)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9812 (1.1727)  time: 0.6277  data: 0.0004  max mem: 54228
Epoch: [244]  [ 400/1251]  eta: 0:09:01  lr: 0.000379  min_lr: 0.000379  loss: 2.6865 (2.6418)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9138 (1.1864)  time: 0.6276  data: 0.0005  max mem: 54228
Epoch: [244]  [ 600/1251]  eta: 0:06:52  lr: 0.000377  min_lr: 0.000377  loss: 2.8807 (2.6356)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3308 (1.1756)  time: 0.6278  data: 0.0005  max mem: 54228
Epoch: [244]  [ 800/1251]  eta: 0:04:45  lr: 0.000374  min_lr: 0.000374  loss: 2.7254 (2.6342)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9484 (1.1622)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [244]  [1000/1251]  eta: 0:02:38  lr: 0.000372  min_lr: 0.000372  loss: 2.8207 (2.6316)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1773 (1.1732)  time: 0.6282  data: 0.0005  max mem: 54228
Epoch: [244]  [1200/1251]  eta: 0:00:32  lr: 0.000370  min_lr: 0.000370  loss: 2.7107 (2.6347)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9950 (nan)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [244]  [1250/1251]  eta: 0:00:00  lr: 0.000370  min_lr: 0.000370  loss: 2.7885 (2.6390)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0938 (nan)  time: 0.5331  data: 0.0006  max mem: 54228
Epoch: [244] Total time: 0:13:08 (0.6305 s / it)
Averaged stats: lr: 0.000370  min_lr: 0.000370  loss: 2.7885 (2.6202)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0938 (nan)
Test:  [ 0/25]  eta: 0:02:37  loss: 0.6110 (0.6110)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 6.2834  data: 5.9557  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7242 (0.7344)  acc1: 87.6000 (87.2727)  acc5: 97.6000 (97.7455)  time: 0.8438  data: 0.5418  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.9292 (0.8558)  acc1: 82.8000 (84.3429)  acc5: 96.4000 (96.7238)  time: 0.3002  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9292 (0.8677)  acc1: 82.8000 (84.0640)  acc5: 96.4000 (96.6560)  time: 0.3003  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5464 s / it)
* Acc@1 84.412 Acc@5 96.928 loss 0.862
Accuracy of the model on the 50000 test images: 84.4%
Max accuracy: 84.46%
Epoch: [245]  [   0/1251]  eta: 1:22:54  lr: 0.000370  min_lr: 0.000370  loss: 2.8041 (2.8041)  weight_decay: 0.0500 (0.0500)  time: 3.9767  data: 1.9077  max mem: 54228
Epoch: [245]  [ 200/1251]  eta: 0:11:18  lr: 0.000368  min_lr: 0.000368  loss: 2.6213 (2.5742)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9296 (1.1153)  time: 0.6384  data: 0.0004  max mem: 54228
Epoch: [245]  [ 400/1251]  eta: 0:09:02  lr: 0.000366  min_lr: 0.000366  loss: 2.7229 (2.6123)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2223 (1.1878)  time: 0.6277  data: 0.0005  max mem: 54228
Epoch: [245]  [ 600/1251]  eta: 0:06:53  lr: 0.000364  min_lr: 0.000364  loss: 2.7605 (2.6119)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9613 (1.1885)  time: 0.6278  data: 0.0005  max mem: 54228
Epoch: [245]  [ 800/1251]  eta: 0:04:45  lr: 0.000362  min_lr: 0.000362  loss: 2.6949 (2.6112)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1939 (1.1967)  time: 0.6278  data: 0.0005  max mem: 54228
Epoch: [245]  [1000/1251]  eta: 0:02:38  lr: 0.000359  min_lr: 0.000359  loss: 2.7855 (2.6203)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0887 (1.2144)  time: 0.6275  data: 0.0005  max mem: 54228
Epoch: [245]  [1200/1251]  eta: 0:00:32  lr: 0.000357  min_lr: 0.000357  loss: 2.8403 (2.6257)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9847 (1.1922)  time: 0.6279  data: 0.0005  max mem: 54228
Epoch: [245]  [1250/1251]  eta: 0:00:00  lr: 0.000357  min_lr: 0.000357  loss: 2.7106 (2.6278)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0218 (1.1874)  time: 0.5332  data: 0.0006  max mem: 54228
Epoch: [245] Total time: 0:13:08 (0.6305 s / it)
Averaged stats: lr: 0.000357  min_lr: 0.000357  loss: 2.7106 (2.6158)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0218 (1.1874)
Test:  [ 0/25]  eta: 0:02:33  loss: 0.5680 (0.5680)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 6.1291  data: 5.7911  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7004 (0.7114)  acc1: 87.6000 (87.4182)  acc5: 97.6000 (97.8546)  time: 0.8300  data: 0.5268  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8898 (0.8286)  acc1: 83.2000 (84.3810)  acc5: 96.4000 (96.9333)  time: 0.3003  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.8900 (0.8395)  acc1: 82.8000 (84.0640)  acc5: 96.4000 (96.8800)  time: 0.3004  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5372 s / it)
* Acc@1 84.390 Acc@5 96.958 loss 0.833
Accuracy of the model on the 50000 test images: 84.4%
Max accuracy: 84.46%
Epoch: [246]  [   0/1251]  eta: 1:27:54  lr: 0.000357  min_lr: 0.000357  loss: 2.2404 (2.2404)  weight_decay: 0.0500 (0.0500)  time: 4.2164  data: 3.5857  max mem: 54228
Epoch: [246]  [ 200/1251]  eta: 0:11:21  lr: 0.000355  min_lr: 0.000355  loss: 2.7530 (2.5874)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1769 (1.1420)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [246]  [ 400/1251]  eta: 0:09:03  lr: 0.000353  min_lr: 0.000353  loss: 2.6548 (2.5828)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9709 (1.1575)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [246]  [ 600/1251]  eta: 0:06:53  lr: 0.000351  min_lr: 0.000351  loss: 2.5058 (2.5780)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0485 (1.1778)  time: 0.6277  data: 0.0005  max mem: 54228
Epoch: [246]  [ 800/1251]  eta: 0:04:45  lr: 0.000349  min_lr: 0.000349  loss: 2.7432 (2.5919)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1070 (1.1946)  time: 0.6279  data: 0.0005  max mem: 54228
Epoch: [246]  [1000/1251]  eta: 0:02:38  lr: 0.000347  min_lr: 0.000347  loss: 2.8044 (2.5892)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0813 (1.1827)  time: 0.6278  data: 0.0004  max mem: 54228
Epoch: [246]  [1200/1251]  eta: 0:00:32  lr: 0.000345  min_lr: 0.000345  loss: 2.8296 (2.5885)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8987 (1.1710)  time: 0.6279  data: 0.0005  max mem: 54228
Epoch: [246]  [1250/1251]  eta: 0:00:00  lr: 0.000344  min_lr: 0.000344  loss: 2.7297 (2.5899)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9488 (1.1644)  time: 0.5382  data: 0.0006  max mem: 54228
Epoch: [246] Total time: 0:13:09 (0.6311 s / it)
Averaged stats: lr: 0.000344  min_lr: 0.000344  loss: 2.7297 (2.6094)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9488 (1.1644)
Test:  [ 0/25]  eta: 0:02:45  loss: 0.5820 (0.5820)  acc1: 90.4000 (90.4000)  acc5: 98.8000 (98.8000)  time: 6.6093  data: 6.2775  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.7029 (0.7068)  acc1: 88.0000 (87.3455)  acc5: 98.0000 (97.8182)  time: 0.8733  data: 0.5710  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 0.8693 (0.8216)  acc1: 82.4000 (84.4762)  acc5: 96.4000 (96.8000)  time: 0.2997  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.8693 (0.8305)  acc1: 82.4000 (84.1280)  acc5: 96.4000 (96.8320)  time: 0.2997  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5577 s / it)
* Acc@1 84.486 Acc@5 96.946 loss 0.823
Accuracy of the model on the 50000 test images: 84.5%
Max accuracy: 84.49%
Epoch: [247]  [   0/1251]  eta: 1:12:32  lr: 0.000344  min_lr: 0.000344  loss: 2.8372 (2.8372)  weight_decay: 0.0500 (0.0500)  time: 3.4794  data: 2.8481  max mem: 54228
Epoch: [247]  [ 200/1251]  eta: 0:11:15  lr: 0.000342  min_lr: 0.000342  loss: 2.6059 (2.5946)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0821 (1.0939)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [247]  [ 400/1251]  eta: 0:09:00  lr: 0.000340  min_lr: 0.000340  loss: 2.8081 (2.6069)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2790 (1.1651)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [247]  [ 600/1251]  eta: 0:06:52  lr: 0.000338  min_lr: 0.000338  loss: 2.6430 (2.5903)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0814 (1.1455)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [247]  [ 800/1251]  eta: 0:04:45  lr: 0.000336  min_lr: 0.000336  loss: 2.5925 (2.5817)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0755 (1.1539)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [247]  [1000/1251]  eta: 0:02:38  lr: 0.000334  min_lr: 0.000334  loss: 2.4026 (2.5756)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2379 (1.1980)  time: 0.6287  data: 0.0004  max mem: 54228
Epoch: [247]  [1200/1251]  eta: 0:00:32  lr: 0.000332  min_lr: 0.000332  loss: 2.6630 (2.5730)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3066 (1.2083)  time: 0.6276  data: 0.0004  max mem: 54228
Epoch: [247]  [1250/1251]  eta: 0:00:00  lr: 0.000332  min_lr: 0.000332  loss: 2.6077 (2.5739)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0511 (1.2133)  time: 0.5327  data: 0.0005  max mem: 54228
Epoch: [247] Total time: 0:13:08 (0.6306 s / it)
Averaged stats: lr: 0.000332  min_lr: 0.000332  loss: 2.6077 (2.6012)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0511 (1.2133)
Test:  [ 0/25]  eta: 0:02:38  loss: 0.5693 (0.5693)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 6.3416  data: 6.0007  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.6952 (0.7134)  acc1: 87.6000 (87.1273)  acc5: 98.0000 (98.0364)  time: 0.8491  data: 0.5459  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8844 (0.8354)  acc1: 81.6000 (84.0191)  acc5: 96.4000 (96.8571)  time: 0.2997  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.8978 (0.8465)  acc1: 81.6000 (83.7280)  acc5: 96.4000 (96.8320)  time: 0.2996  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5466 s / it)
* Acc@1 84.400 Acc@5 96.974 loss 0.837
Accuracy of the model on the 50000 test images: 84.4%
Max accuracy: 84.49%
Epoch: [248]  [   0/1251]  eta: 1:24:18  lr: 0.000332  min_lr: 0.000332  loss: 3.0962 (3.0962)  weight_decay: 0.0500 (0.0500)  time: 4.0436  data: 2.6480  max mem: 54228
Epoch: [248]  [ 200/1251]  eta: 0:11:18  lr: 0.000330  min_lr: 0.000330  loss: 2.7522 (2.5854)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9284 (1.0050)  time: 0.6285  data: 0.0004  max mem: 54228
Epoch: [248]  [ 400/1251]  eta: 0:09:03  lr: 0.000328  min_lr: 0.000328  loss: 2.8200 (2.6137)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2394 (1.1008)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [248]  [ 600/1251]  eta: 0:06:53  lr: 0.000326  min_lr: 0.000326  loss: 2.4977 (2.5958)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0819 (1.1487)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [248]  [ 800/1251]  eta: 0:04:45  lr: 0.000324  min_lr: 0.000324  loss: 2.7755 (2.5964)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0817 (1.1536)  time: 0.6345  data: 0.0005  max mem: 54228
Epoch: [248]  [1000/1251]  eta: 0:02:38  lr: 0.000322  min_lr: 0.000322  loss: 2.6928 (2.6076)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1191 (1.1717)  time: 0.6278  data: 0.0004  max mem: 54228
Epoch: [248]  [1200/1251]  eta: 0:00:32  lr: 0.000320  min_lr: 0.000320  loss: 2.3845 (2.6079)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1560 (1.1703)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [248]  [1250/1251]  eta: 0:00:00  lr: 0.000320  min_lr: 0.000320  loss: 2.4361 (2.6081)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1541 (1.1711)  time: 0.5330  data: 0.0006  max mem: 54228
Epoch: [248] Total time: 0:13:08 (0.6307 s / it)
Averaged stats: lr: 0.000320  min_lr: 0.000320  loss: 2.4361 (2.6023)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1541 (1.1711)
Test:  [ 0/25]  eta: 0:02:34  loss: 0.5864 (0.5864)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 6.1652  data: 5.8415  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.6963 (0.7150)  acc1: 87.6000 (87.5273)  acc5: 98.0000 (97.9273)  time: 0.8326  data: 0.5315  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8823 (0.8375)  acc1: 82.4000 (84.3619)  acc5: 96.8000 (96.8952)  time: 0.2991  data: 0.0003  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9051 (0.8498)  acc1: 82.0000 (84.0320)  acc5: 96.4000 (96.7520)  time: 0.2990  data: 0.0002  max mem: 54228
Test: Total time: 0:00:13 (0.5384 s / it)
* Acc@1 84.444 Acc@5 97.014 loss 0.838
Accuracy of the model on the 50000 test images: 84.4%
Max accuracy: 84.49%
Epoch: [249]  [   0/1251]  eta: 1:20:27  lr: 0.000320  min_lr: 0.000320  loss: 3.0798 (3.0798)  weight_decay: 0.0500 (0.0500)  time: 3.8588  data: 2.6955  max mem: 54228
Epoch: [249]  [ 200/1251]  eta: 0:11:20  lr: 0.000318  min_lr: 0.000318  loss: 2.6440 (2.5794)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9203 (1.1148)  time: 0.6432  data: 0.0005  max mem: 54228
Epoch: [249]  [ 400/1251]  eta: 0:09:02  lr: 0.000316  min_lr: 0.000316  loss: 2.7219 (2.6027)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2471 (1.1724)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [249]  [ 600/1251]  eta: 0:06:53  lr: 0.000314  min_lr: 0.000314  loss: 2.7980 (2.6041)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3705 (1.2659)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [249]  [ 800/1251]  eta: 0:04:45  lr: 0.000312  min_lr: 0.000312  loss: 2.7509 (2.6027)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0949 (1.2101)  time: 0.6288  data: 0.0005  max mem: 54228
Epoch: [249]  [1000/1251]  eta: 0:02:38  lr: 0.000310  min_lr: 0.000310  loss: 2.7329 (2.6055)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0220 (1.2154)  time: 0.6288  data: 0.0005  max mem: 54228
Epoch: [249]  [1200/1251]  eta: 0:00:32  lr: 0.000308  min_lr: 0.000308  loss: 2.4118 (2.6022)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1919 (1.2170)  time: 0.6378  data: 0.0005  max mem: 54228
Epoch: [249]  [1250/1251]  eta: 0:00:00  lr: 0.000308  min_lr: 0.000308  loss: 2.8706 (2.6063)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0392 (1.2102)  time: 0.5337  data: 0.0007  max mem: 54228
Epoch: [249] Total time: 0:13:10 (0.6316 s / it)
Averaged stats: lr: 0.000308  min_lr: 0.000308  loss: 2.8706 (2.6027)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0392 (1.2102)
Test:  [ 0/25]  eta: 0:02:44  loss: 0.6176 (0.6176)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 6.5988  data: 6.2827  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.7369 (0.7432)  acc1: 87.6000 (87.4546)  acc5: 98.0000 (98.1455)  time: 0.8732  data: 0.5715  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 0.9073 (0.8560)  acc1: 82.0000 (84.4000)  acc5: 96.8000 (96.9714)  time: 0.3004  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9186 (0.8678)  acc1: 82.0000 (84.0320)  acc5: 96.8000 (96.8320)  time: 0.3002  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5571 s / it)
* Acc@1 84.458 Acc@5 96.974 loss 0.860
Accuracy of the model on the 50000 test images: 84.5%
Max accuracy: 84.49%
Epoch: [250]  [   0/1251]  eta: 1:27:37  lr: 0.000307  min_lr: 0.000307  loss: 2.4413 (2.4413)  weight_decay: 0.0500 (0.0500)  time: 4.2025  data: 1.7947  max mem: 54228
Epoch: [250]  [ 200/1251]  eta: 0:11:20  lr: 0.000306  min_lr: 0.000306  loss: 2.6566 (2.5217)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0115 (1.0847)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [250]  [ 400/1251]  eta: 0:09:02  lr: 0.000304  min_lr: 0.000304  loss: 2.6800 (2.5575)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0558 (1.1481)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [250]  [ 600/1251]  eta: 0:06:53  lr: 0.000302  min_lr: 0.000302  loss: 2.4415 (2.5789)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3130 (1.2175)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [250]  [ 800/1251]  eta: 0:04:45  lr: 0.000300  min_lr: 0.000300  loss: 2.6244 (2.5807)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0475 (1.1816)  time: 0.6284  data: 0.0004  max mem: 54228
Epoch: [250]  [1000/1251]  eta: 0:02:38  lr: 0.000298  min_lr: 0.000298  loss: 2.6768 (2.5861)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1123 (1.1704)  time: 0.6288  data: 0.0005  max mem: 54228
Epoch: [250]  [1200/1251]  eta: 0:00:32  lr: 0.000296  min_lr: 0.000296  loss: 2.4988 (2.5838)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0690 (1.1790)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [250]  [1250/1251]  eta: 0:00:00  lr: 0.000296  min_lr: 0.000296  loss: 2.3117 (2.5837)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0690 (1.1745)  time: 0.5331  data: 0.0005  max mem: 54228
Epoch: [250] Total time: 0:13:09 (0.6312 s / it)
Averaged stats: lr: 0.000296  min_lr: 0.000296  loss: 2.3117 (2.5928)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0690 (1.1745)
Test:  [ 0/25]  eta: 0:02:27  loss: 0.5336 (0.5336)  acc1: 91.2000 (91.2000)  acc5: 98.8000 (98.8000)  time: 5.9044  data: 5.5666  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.6489 (0.6526)  acc1: 88.4000 (87.7091)  acc5: 98.0000 (98.0000)  time: 0.8410  data: 0.5383  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8001 (0.7728)  acc1: 81.6000 (84.3238)  acc5: 96.8000 (96.8571)  time: 0.3171  data: 0.0177  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.8107 (0.7822)  acc1: 81.6000 (84.0320)  acc5: 96.4000 (96.8320)  time: 0.2995  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5435 s / it)
* Acc@1 84.548 Acc@5 96.948 loss 0.775
Accuracy of the model on the 50000 test images: 84.5%
Max accuracy: 84.55%
Epoch: [251]  [   0/1251]  eta: 1:14:03  lr: 0.000296  min_lr: 0.000296  loss: 2.2050 (2.2050)  weight_decay: 0.0500 (0.0500)  time: 3.5520  data: 2.9080  max mem: 54228
Epoch: [251]  [ 200/1251]  eta: 0:11:16  lr: 0.000294  min_lr: 0.000294  loss: 2.7835 (2.6064)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0373 (1.1564)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [251]  [ 400/1251]  eta: 0:09:01  lr: 0.000292  min_lr: 0.000292  loss: 2.7290 (2.6127)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0396 (1.2302)  time: 0.6359  data: 0.0005  max mem: 54228
Epoch: [251]  [ 600/1251]  eta: 0:06:52  lr: 0.000290  min_lr: 0.000290  loss: 2.6645 (2.6048)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0556 (1.2539)  time: 0.6279  data: 0.0005  max mem: 54228
Epoch: [251]  [ 800/1251]  eta: 0:04:45  lr: 0.000288  min_lr: 0.000288  loss: 2.4882 (2.5971)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1474 (1.2414)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [251]  [1000/1251]  eta: 0:02:38  lr: 0.000286  min_lr: 0.000286  loss: 2.5550 (2.5873)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1179 (1.2333)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [251]  [1200/1251]  eta: 0:00:32  lr: 0.000284  min_lr: 0.000284  loss: 2.7395 (2.5899)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3485 (1.2455)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [251]  [1250/1251]  eta: 0:00:00  lr: 0.000284  min_lr: 0.000284  loss: 2.3793 (2.5896)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2279 (1.2446)  time: 0.5338  data: 0.0007  max mem: 54228
Epoch: [251] Total time: 0:13:08 (0.6306 s / it)
Averaged stats: lr: 0.000284  min_lr: 0.000284  loss: 2.3793 (2.5837)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2279 (1.2446)
Test:  [ 0/25]  eta: 0:02:37  loss: 0.5708 (0.5708)  acc1: 90.8000 (90.8000)  acc5: 98.8000 (98.8000)  time: 6.3071  data: 5.9860  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.6765 (0.6959)  acc1: 87.6000 (87.2000)  acc5: 98.4000 (98.0000)  time: 0.8461  data: 0.5446  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8743 (0.8121)  acc1: 82.0000 (84.1714)  acc5: 96.4000 (96.9143)  time: 0.3003  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.8743 (0.8216)  acc1: 82.0000 (83.9200)  acc5: 96.8000 (96.9440)  time: 0.3004  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5460 s / it)
* Acc@1 84.656 Acc@5 96.970 loss 0.814
Accuracy of the model on the 50000 test images: 84.7%
Max accuracy: 84.66%
Epoch: [252]  [   0/1251]  eta: 1:12:07  lr: 0.000284  min_lr: 0.000284  loss: 2.5633 (2.5633)  weight_decay: 0.0500 (0.0500)  time: 3.4593  data: 2.8231  max mem: 54228
Epoch: [252]  [ 200/1251]  eta: 0:11:16  lr: 0.000282  min_lr: 0.000282  loss: 2.7262 (2.5705)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1116 (1.1129)  time: 0.6288  data: 0.0004  max mem: 54228
Epoch: [252]  [ 400/1251]  eta: 0:09:02  lr: 0.000280  min_lr: 0.000280  loss: 2.6896 (2.5664)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.6280  data: 0.0004  max mem: 54228
Epoch: [252]  [ 600/1251]  eta: 0:06:52  lr: 0.000279  min_lr: 0.000279  loss: 2.7302 (2.5747)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0937 (nan)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [252]  [ 800/1251]  eta: 0:04:45  lr: 0.000277  min_lr: 0.000277  loss: 2.6266 (2.5793)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0747 (nan)  time: 0.6332  data: 0.0005  max mem: 54228
Epoch: [252]  [1000/1251]  eta: 0:02:38  lr: 0.000275  min_lr: 0.000275  loss: 2.6467 (2.5696)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9388 (nan)  time: 0.6292  data: 0.0005  max mem: 54228
Epoch: [252]  [1200/1251]  eta: 0:00:32  lr: 0.000273  min_lr: 0.000273  loss: 2.6735 (2.5674)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0400 (nan)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [252]  [1250/1251]  eta: 0:00:00  lr: 0.000273  min_lr: 0.000273  loss: 2.7288 (2.5677)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0163 (nan)  time: 0.5336  data: 0.0006  max mem: 54228
Epoch: [252] Total time: 0:13:09 (0.6311 s / it)
Averaged stats: lr: 0.000273  min_lr: 0.000273  loss: 2.7288 (2.5805)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0163 (nan)
Test:  [ 0/25]  eta: 0:02:50  loss: 0.5806 (0.5806)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 6.8233  data: 6.5119  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.7135 (0.7141)  acc1: 88.0000 (87.3818)  acc5: 98.0000 (98.0727)  time: 0.8971  data: 0.5923  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 0.8800 (0.8406)  acc1: 82.8000 (84.4000)  acc5: 96.4000 (96.8191)  time: 0.3022  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9156 (0.8542)  acc1: 82.4000 (84.0960)  acc5: 96.4000 (96.8000)  time: 0.3009  data: 0.0001  max mem: 54228
Test: Total time: 0:00:14 (0.5690 s / it)
* Acc@1 84.532 Acc@5 96.970 loss 0.849
Accuracy of the model on the 50000 test images: 84.5%
Max accuracy: 84.66%
Epoch: [253]  [   0/1251]  eta: 1:31:04  lr: 0.000273  min_lr: 0.000273  loss: 1.7984 (1.7984)  weight_decay: 0.0500 (0.0500)  time: 4.3685  data: 3.0072  max mem: 54228
Epoch: [253]  [ 200/1251]  eta: 0:11:22  lr: 0.000271  min_lr: 0.000271  loss: 2.6556 (2.5763)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2105 (1.1455)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [253]  [ 400/1251]  eta: 0:09:03  lr: 0.000269  min_lr: 0.000269  loss: 2.7080 (2.5961)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3847 (1.2120)  time: 0.6279  data: 0.0004  max mem: 54228
Epoch: [253]  [ 600/1251]  eta: 0:06:53  lr: 0.000267  min_lr: 0.000267  loss: 2.5157 (2.5956)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9916 (1.1955)  time: 0.6273  data: 0.0004  max mem: 54228
Epoch: [253]  [ 800/1251]  eta: 0:04:45  lr: 0.000265  min_lr: 0.000265  loss: 2.4874 (2.5910)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2752 (1.2187)  time: 0.6281  data: 0.0004  max mem: 54228
Epoch: [253]  [1000/1251]  eta: 0:02:38  lr: 0.000264  min_lr: 0.000264  loss: 2.6208 (2.5769)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0779 (1.1987)  time: 0.6278  data: 0.0005  max mem: 54228
Epoch: [253]  [1200/1251]  eta: 0:00:32  lr: 0.000262  min_lr: 0.000262  loss: 2.6614 (2.5810)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0870 (1.1937)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [253]  [1250/1251]  eta: 0:00:00  lr: 0.000261  min_lr: 0.000261  loss: 2.6036 (2.5811)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1442 (1.1998)  time: 0.5332  data: 0.0005  max mem: 54228
Epoch: [253] Total time: 0:13:09 (0.6312 s / it)
Averaged stats: lr: 0.000261  min_lr: 0.000261  loss: 2.6036 (2.5788)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1442 (1.1998)
Test:  [ 0/25]  eta: 0:02:46  loss: 0.6223 (0.6223)  acc1: 89.6000 (89.6000)  acc5: 99.2000 (99.2000)  time: 6.6534  data: 6.3187  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.7288 (0.7338)  acc1: 87.2000 (86.9091)  acc5: 97.6000 (97.7818)  time: 0.8774  data: 0.5747  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 0.9034 (0.8477)  acc1: 82.8000 (84.1143)  acc5: 96.4000 (96.7238)  time: 0.3002  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9217 (0.8593)  acc1: 82.4000 (83.8240)  acc5: 96.4000 (96.7200)  time: 0.3003  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5579 s / it)
* Acc@1 84.390 Acc@5 96.804 loss 0.851
Accuracy of the model on the 50000 test images: 84.4%
Max accuracy: 84.66%
Epoch: [254]  [   0/1251]  eta: 1:22:35  lr: 0.000261  min_lr: 0.000261  loss: 1.5335 (1.5335)  weight_decay: 0.0500 (0.0500)  time: 3.9614  data: 2.0517  max mem: 54228
Epoch: [254]  [ 200/1251]  eta: 0:11:18  lr: 0.000260  min_lr: 0.000260  loss: 2.4883 (2.5764)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0486 (1.2014)  time: 0.6279  data: 0.0004  max mem: 54228
Epoch: [254]  [ 400/1251]  eta: 0:09:02  lr: 0.000258  min_lr: 0.000258  loss: 2.6715 (2.5634)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0830 (1.2261)  time: 0.6287  data: 0.0004  max mem: 54228
Epoch: [254]  [ 600/1251]  eta: 0:06:53  lr: 0.000256  min_lr: 0.000256  loss: 2.4654 (2.5608)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0215 (1.2298)  time: 0.6277  data: 0.0005  max mem: 54228
Epoch: [254]  [ 800/1251]  eta: 0:04:45  lr: 0.000254  min_lr: 0.000254  loss: 2.8152 (2.5546)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9474 (1.1804)  time: 0.6278  data: 0.0005  max mem: 54228
Epoch: [254]  [1000/1251]  eta: 0:02:38  lr: 0.000253  min_lr: 0.000253  loss: 2.6189 (2.5562)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1156 (1.1953)  time: 0.6425  data: 0.0004  max mem: 54228
Epoch: [254]  [1200/1251]  eta: 0:00:32  lr: 0.000251  min_lr: 0.000251  loss: 2.6940 (2.5563)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2194 (1.2060)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [254]  [1250/1251]  eta: 0:00:00  lr: 0.000251  min_lr: 0.000251  loss: 2.7025 (2.5555)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1365 (1.2072)  time: 0.5335  data: 0.0005  max mem: 54228
Epoch: [254] Total time: 0:13:09 (0.6309 s / it)
Averaged stats: lr: 0.000251  min_lr: 0.000251  loss: 2.7025 (2.5758)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1365 (1.2072)
Test:  [ 0/25]  eta: 0:02:38  loss: 0.5750 (0.5750)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 6.3568  data: 6.0390  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7110 (0.7101)  acc1: 86.8000 (87.1273)  acc5: 98.0000 (98.0000)  time: 0.8504  data: 0.5494  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8729 (0.8293)  acc1: 82.8000 (84.3619)  acc5: 96.4000 (96.8000)  time: 0.2995  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9066 (0.8412)  acc1: 82.4000 (84.0800)  acc5: 96.0000 (96.7680)  time: 0.2993  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5503 s / it)
* Acc@1 84.594 Acc@5 96.958 loss 0.834
Accuracy of the model on the 50000 test images: 84.6%
Max accuracy: 84.66%
Epoch: [255]  [   0/1251]  eta: 1:25:12  lr: 0.000250  min_lr: 0.000250  loss: 2.3038 (2.3038)  weight_decay: 0.0500 (0.0500)  time: 4.0865  data: 2.6771  max mem: 54228
Epoch: [255]  [ 200/1251]  eta: 0:11:20  lr: 0.000249  min_lr: 0.000249  loss: 2.7040 (2.5692)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2263 (1.3683)  time: 0.6390  data: 0.0005  max mem: 54228
Epoch: [255]  [ 400/1251]  eta: 0:09:03  lr: 0.000247  min_lr: 0.000247  loss: 2.7339 (2.5674)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1015 (1.3062)  time: 0.6289  data: 0.0005  max mem: 54228
Epoch: [255]  [ 600/1251]  eta: 0:06:53  lr: 0.000245  min_lr: 0.000245  loss: 2.5054 (2.5587)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0734 (1.2926)  time: 0.6284  data: 0.0005  max mem: 54228
Epoch: [255]  [ 800/1251]  eta: 0:04:45  lr: 0.000244  min_lr: 0.000244  loss: 2.7060 (2.5618)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1405 (1.2420)  time: 0.6288  data: 0.0005  max mem: 54228
Epoch: [255]  [1000/1251]  eta: 0:02:38  lr: 0.000242  min_lr: 0.000242  loss: 2.8361 (2.5557)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2456 (1.2479)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [255]  [1200/1251]  eta: 0:00:32  lr: 0.000240  min_lr: 0.000240  loss: 2.7346 (2.5669)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3244 (1.2504)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [255]  [1250/1251]  eta: 0:00:00  lr: 0.000240  min_lr: 0.000240  loss: 2.7644 (2.5687)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1200 (1.2512)  time: 0.5333  data: 0.0006  max mem: 54228
Epoch: [255] Total time: 0:13:09 (0.6314 s / it)
Averaged stats: lr: 0.000240  min_lr: 0.000240  loss: 2.7644 (2.5669)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1200 (1.2512)
Test:  [ 0/25]  eta: 0:02:31  loss: 0.5602 (0.5602)  acc1: 91.6000 (91.6000)  acc5: 99.2000 (99.2000)  time: 6.0636  data: 5.7361  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.6861 (0.6901)  acc1: 87.2000 (87.5636)  acc5: 98.0000 (97.9273)  time: 0.8237  data: 0.5218  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8540 (0.8165)  acc1: 82.8000 (84.3619)  acc5: 96.8000 (96.8571)  time: 0.3001  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9024 (0.8303)  acc1: 82.0000 (84.0800)  acc5: 96.8000 (96.8640)  time: 0.3002  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5364 s / it)
* Acc@1 84.560 Acc@5 97.086 loss 0.820
Accuracy of the model on the 50000 test images: 84.6%
Max accuracy: 84.66%
Epoch: [256]  [   0/1251]  eta: 1:27:02  lr: 0.000240  min_lr: 0.000240  loss: 2.9903 (2.9903)  weight_decay: 0.0500 (0.0500)  time: 4.1750  data: 3.4837  max mem: 54228
Epoch: [256]  [ 200/1251]  eta: 0:11:22  lr: 0.000238  min_lr: 0.000238  loss: 2.6753 (2.5956)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0668 (1.1708)  time: 0.6286  data: 0.0004  max mem: 54228
Epoch: [256]  [ 400/1251]  eta: 0:09:03  lr: 0.000236  min_lr: 0.000236  loss: 2.6526 (2.5876)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2077 (1.1787)  time: 0.6278  data: 0.0004  max mem: 54228
Epoch: [256]  [ 600/1251]  eta: 0:06:53  lr: 0.000235  min_lr: 0.000235  loss: 2.6823 (2.5829)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7364 (1.2962)  time: 0.6289  data: 0.0004  max mem: 54228
Epoch: [256]  [ 800/1251]  eta: 0:04:46  lr: 0.000233  min_lr: 0.000233  loss: 2.6977 (2.5811)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1557 (1.3112)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [256]  [1000/1251]  eta: 0:02:38  lr: 0.000231  min_lr: 0.000231  loss: 2.4613 (2.5729)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0749 (1.2884)  time: 0.6292  data: 0.0004  max mem: 54228
Epoch: [256]  [1200/1251]  eta: 0:00:32  lr: 0.000230  min_lr: 0.000230  loss: 2.8014 (2.5786)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2988 (1.2957)  time: 0.6335  data: 0.0004  max mem: 54228
Epoch: [256]  [1250/1251]  eta: 0:00:00  lr: 0.000229  min_lr: 0.000229  loss: 2.4872 (2.5753)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1859 (1.2937)  time: 0.5338  data: 0.0006  max mem: 54228
Epoch: [256] Total time: 0:13:10 (0.6318 s / it)
Averaged stats: lr: 0.000229  min_lr: 0.000229  loss: 2.4872 (2.5637)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1859 (1.2937)
Test:  [ 0/25]  eta: 0:02:39  loss: 0.5278 (0.5278)  acc1: 91.6000 (91.6000)  acc5: 99.2000 (99.2000)  time: 6.3716  data: 6.0369  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.6520 (0.6616)  acc1: 86.8000 (87.4909)  acc5: 98.0000 (98.0364)  time: 0.8517  data: 0.5491  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8493 (0.7816)  acc1: 82.8000 (84.6667)  acc5: 96.8000 (96.8571)  time: 0.2995  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.8674 (0.7955)  acc1: 82.4000 (84.3680)  acc5: 96.4000 (96.8160)  time: 0.2995  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5460 s / it)
* Acc@1 84.664 Acc@5 96.988 loss 0.788
Accuracy of the model on the 50000 test images: 84.7%
Max accuracy: 84.66%
Epoch: [257]  [   0/1251]  eta: 1:07:27  lr: 0.000229  min_lr: 0.000229  loss: 2.8283 (2.8283)  weight_decay: 0.0500 (0.0500)  time: 3.2356  data: 2.6055  max mem: 54228
Epoch: [257]  [ 200/1251]  eta: 0:11:13  lr: 0.000228  min_lr: 0.000228  loss: 2.4374 (2.5922)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0155 (1.1641)  time: 0.6285  data: 0.0006  max mem: 54228
Epoch: [257]  [ 400/1251]  eta: 0:09:00  lr: 0.000226  min_lr: 0.000226  loss: 2.3671 (2.5617)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2294 (1.2514)  time: 0.6359  data: 0.0005  max mem: 54228
Epoch: [257]  [ 600/1251]  eta: 0:06:52  lr: 0.000224  min_lr: 0.000224  loss: 2.5562 (2.5644)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9847 (1.1896)  time: 0.6278  data: 0.0005  max mem: 54228
Epoch: [257]  [ 800/1251]  eta: 0:04:45  lr: 0.000223  min_lr: 0.000223  loss: 2.5799 (2.5652)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1587 (1.1888)  time: 0.6357  data: 0.0005  max mem: 54228
Epoch: [257]  [1000/1251]  eta: 0:02:38  lr: 0.000221  min_lr: 0.000221  loss: 2.7998 (2.5649)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0965 (1.2075)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [257]  [1200/1251]  eta: 0:00:32  lr: 0.000219  min_lr: 0.000219  loss: 2.7071 (2.5603)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2709 (1.2447)  time: 0.6289  data: 0.0005  max mem: 54228
Epoch: [257]  [1250/1251]  eta: 0:00:00  lr: 0.000219  min_lr: 0.000219  loss: 2.7315 (2.5626)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0775 (1.2386)  time: 0.5331  data: 0.0005  max mem: 54228
Epoch: [257] Total time: 0:13:08 (0.6302 s / it)
Averaged stats: lr: 0.000219  min_lr: 0.000219  loss: 2.7315 (2.5636)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0775 (1.2386)
Test:  [ 0/25]  eta: 0:02:42  loss: 0.5892 (0.5892)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 6.4994  data: 6.1488  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.7166 (0.7363)  acc1: 87.2000 (87.1273)  acc5: 97.6000 (97.9636)  time: 0.8636  data: 0.5593  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8882 (0.8535)  acc1: 82.8000 (84.3238)  acc5: 96.4000 (96.8571)  time: 0.3003  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.9244 (0.8664)  acc1: 82.8000 (84.1120)  acc5: 96.4000 (96.8160)  time: 0.3004  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5527 s / it)
* Acc@1 84.524 Acc@5 96.962 loss 0.857
Accuracy of the model on the 50000 test images: 84.5%
Max accuracy: 84.66%
Epoch: [258]  [   0/1251]  eta: 1:22:41  lr: 0.000219  min_lr: 0.000219  loss: 1.8368 (1.8368)  weight_decay: 0.0500 (0.0500)  time: 3.9664  data: 2.5665  max mem: 54228
Epoch: [258]  [ 200/1251]  eta: 0:11:19  lr: 0.000217  min_lr: 0.000217  loss: 2.6110 (2.5403)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2539 (1.2861)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [258]  [ 400/1251]  eta: 0:09:03  lr: 0.000216  min_lr: 0.000216  loss: 2.6581 (2.5351)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3493 (1.3133)  time: 0.6282  data: 0.0004  max mem: 54228
Epoch: [258]  [ 600/1251]  eta: 0:06:53  lr: 0.000214  min_lr: 0.000214  loss: 2.6401 (2.5343)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9846 (1.2688)  time: 0.6278  data: 0.0005  max mem: 54228
Epoch: [258]  [ 800/1251]  eta: 0:04:45  lr: 0.000212  min_lr: 0.000212  loss: 2.6603 (2.5397)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8910 (1.2283)  time: 0.6278  data: 0.0005  max mem: 54228
Epoch: [258]  [1000/1251]  eta: 0:02:38  lr: 0.000211  min_lr: 0.000211  loss: 2.5693 (2.5462)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0254 (1.2088)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [258]  [1200/1251]  eta: 0:00:32  lr: 0.000209  min_lr: 0.000209  loss: 2.6008 (2.5522)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3107 (1.2274)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [258]  [1250/1251]  eta: 0:00:00  lr: 0.000209  min_lr: 0.000209  loss: 2.4760 (2.5507)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2642 (1.2395)  time: 0.5332  data: 0.0007  max mem: 54228
Epoch: [258] Total time: 0:13:09 (0.6310 s / it)
Averaged stats: lr: 0.000209  min_lr: 0.000209  loss: 2.4760 (2.5565)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2642 (1.2395)
Test:  [ 0/25]  eta: 0:02:43  loss: 0.5168 (0.5168)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 6.5432  data: 6.2095  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.6505 (0.6547)  acc1: 86.8000 (87.3091)  acc5: 98.0000 (97.9636)  time: 0.8668  data: 0.5648  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8318 (0.7740)  acc1: 83.2000 (84.4381)  acc5: 96.8000 (96.9714)  time: 0.2990  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.8557 (0.7883)  acc1: 82.8000 (84.1760)  acc5: 96.4000 (96.8800)  time: 0.2989  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5527 s / it)
* Acc@1 84.678 Acc@5 97.022 loss 0.782
Accuracy of the model on the 50000 test images: 84.7%
Max accuracy: 84.68%
Epoch: [259]  [   0/1251]  eta: 1:04:11  lr: 0.000209  min_lr: 0.000209  loss: 2.1204 (2.1204)  weight_decay: 0.0500 (0.0500)  time: 3.0790  data: 2.4441  max mem: 54228
Epoch: [259]  [ 200/1251]  eta: 0:11:15  lr: 0.000207  min_lr: 0.000207  loss: 2.7321 (2.5814)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1385 (1.3819)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [259]  [ 400/1251]  eta: 0:09:01  lr: 0.000206  min_lr: 0.000206  loss: 2.6111 (2.5518)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2027 (1.3059)  time: 0.6283  data: 0.0005  max mem: 54228
Epoch: [259]  [ 600/1251]  eta: 0:06:52  lr: 0.000204  min_lr: 0.000204  loss: 2.5968 (2.5663)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1216 (1.3079)  time: 0.6367  data: 0.0005  max mem: 54228
Epoch: [259]  [ 800/1251]  eta: 0:04:45  lr: 0.000203  min_lr: 0.000203  loss: 2.6647 (2.5716)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3316 (1.2974)  time: 0.6332  data: 0.0005  max mem: 54228
Epoch: [259]  [1000/1251]  eta: 0:02:38  lr: 0.000201  min_lr: 0.000201  loss: 2.6877 (2.5702)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0929 (1.2887)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [259]  [1200/1251]  eta: 0:00:32  lr: 0.000199  min_lr: 0.000199  loss: 2.7314 (2.5740)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0114 (1.2632)  time: 0.6286  data: 0.0005  max mem: 54228
Epoch: [259]  [1250/1251]  eta: 0:00:00  lr: 0.000199  min_lr: 0.000199  loss: 2.5182 (2.5706)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0963 (1.2593)  time: 0.5378  data: 0.0005  max mem: 54228
Epoch: [259] Total time: 0:13:08 (0.6306 s / it)
Averaged stats: lr: 0.000199  min_lr: 0.000199  loss: 2.5182 (2.5655)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0963 (1.2593)
Test:  [ 0/25]  eta: 0:02:37  loss: 0.5421 (0.5421)  acc1: 92.0000 (92.0000)  acc5: 99.2000 (99.2000)  time: 6.2878  data: 5.9632  max mem: 54228
Test:  [10/25]  eta: 0:00:12  loss: 0.6722 (0.6724)  acc1: 87.2000 (87.6000)  acc5: 98.0000 (98.0364)  time: 0.8441  data: 0.5424  max mem: 54228
Test:  [20/25]  eta: 0:00:02  loss: 0.8258 (0.7874)  acc1: 84.0000 (84.7048)  acc5: 96.4000 (96.8762)  time: 0.2996  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.8554 (0.8004)  acc1: 83.2000 (84.3680)  acc5: 96.4000 (96.8640)  time: 0.2996  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5431 s / it)
* Acc@1 84.636 Acc@5 97.026 loss 0.797
Accuracy of the model on the 50000 test images: 84.6%
Max accuracy: 84.68%
Epoch: [260]  [   0/1251]  eta: 1:28:42  lr: 0.000199  min_lr: 0.000199  loss: 2.9600 (2.9600)  weight_decay: 0.0500 (0.0500)  time: 4.2545  data: 2.4946  max mem: 54228
Epoch: [260]  [ 200/1251]  eta: 0:11:20  lr: 0.000197  min_lr: 0.000197  loss: 2.7869 (2.5605)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1377 (1.1562)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [260]  [ 400/1251]  eta: 0:09:02  lr: 0.000196  min_lr: 0.000196  loss: 2.6064 (2.5194)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1350 (1.1674)  time: 0.6285  data: 0.0005  max mem: 54228
Epoch: [260]  [ 600/1251]  eta: 0:06:53  lr: 0.000194  min_lr: 0.000194  loss: 2.3277 (2.5289)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1070 (1.1910)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [260]  [ 800/1251]  eta: 0:04:45  lr: 0.000193  min_lr: 0.000193  loss: 2.7394 (2.5509)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3164 (1.2555)  time: 0.6281  data: 0.0005  max mem: 54228
Epoch: [260]  [1000/1251]  eta: 0:02:38  lr: 0.000191  min_lr: 0.000191  loss: 2.6032 (2.5533)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0617 (1.2497)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [260]  [1200/1251]  eta: 0:00:32  lr: 0.000190  min_lr: 0.000190  loss: 2.5343 (2.5485)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1150 (1.2533)  time: 0.6287  data: 0.0005  max mem: 54228
Epoch: [260]  [1250/1251]  eta: 0:00:00  lr: 0.000189  min_lr: 0.000189  loss: 2.5137 (2.5468)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0890 (1.2553)  time: 0.5335  data: 0.0005  max mem: 54228
Epoch: [260] Total time: 0:13:09 (0.6312 s / it)
Averaged stats: lr: 0.000189  min_lr: 0.000189  loss: 2.5137 (2.5489)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0890 (1.2553)
Test:  [ 0/25]  eta: 0:02:45  loss: 0.5310 (0.5310)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 6.6142  data: 6.2895  max mem: 54228
Test:  [10/25]  eta: 0:00:13  loss: 0.6463 (0.6593)  acc1: 87.6000 (87.6000)  acc5: 98.0000 (98.0727)  time: 0.8739  data: 0.5721  max mem: 54228
Test:  [20/25]  eta: 0:00:03  loss: 0.8429 (0.7777)  acc1: 83.6000 (84.7048)  acc5: 96.4000 (96.9524)  time: 0.3001  data: 0.0002  max mem: 54228
Test:  [24/25]  eta: 0:00:00  loss: 0.8429 (0.7890)  acc1: 82.8000 (84.3520)  acc5: 96.4000 (96.9280)  time: 0.3002  data: 0.0001  max mem: 54228
Test: Total time: 0:00:13 (0.5564 s / it)
* Acc@1 84.724 Acc@5 97.030 loss 0.787
Accuracy of the model on the 50000 test images: 84.7%
Max accuracy: 84.72%
Epoch: [261]  [   0/1251]  eta: 1:11:08  lr: 0.000189  min_lr: 0.000189  loss: 3.0498 (3.0498)  weight_decay: 0.0500 (0.0500)  time: 3.4117  data: 2.7780  max mem: 54228
Epoch: [261]  [ 200/1251]  eta: 0:11:14  lr: 0.000188  min_lr: 0.000188  loss: 2.6024 (2.5422)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2989 (1.5590)  time: 0.6280  data: 0.0005  max mem: 54228
Epoch: [261]  [ 400/1251]  eta: 0:09:01  lr: 0.000186  min_lr: 0.000186  loss: 2.8166 (2.5423)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9959 (1.3368)  time: 0.6361  data: 0.0005  max mem: 54228
Epoch: [261]  [ 600/1251]  eta: 0:06:52  lr: 0.000185  min_lr: 0.000185  loss: 2.6161 (2.5510)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1128 (1.3162)  time: 0.6283  data: 0.0004  max mem: 54228
Epoch: [261]  [ 800/1251]  eta: 0:04:45  lr: 0.000183  min_lr: 0.000183  loss: 2.5433 (2.5395)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1696 (1.2926)  time: 0.6340  data: 0.0004  max mem: 54228
| distributed init (rank 0): env://, gpu 0
| distributed init (rank 5): env://, gpu 5
| distributed init (rank 4): env://, gpu 4
| distributed init (rank 7): env://, gpu 7
| distributed init (rank 6): env://, gpu 6
| distributed init (rank 1): env://, gpu 1
| distributed init (rank 3): env://, gpu 3
| distributed init (rank 2): env://, gpu 2
Namespace(batch_size=128, epochs=300, update_freq=4, model='base', drop_path=0, input_size=256, layer_scale_init_value=1e-06, model_ema=False, model_ema_decay=0.9999, model_ema_force_cpu=False, model_ema_eval=False, opt='adamw', opt_eps=1e-08, opt_betas=None, clip_grad=5.0, momentum=0.9, weight_decay=0.05, weight_decay_end=None, lr=0.004, layer_decay=1.0, min_lr=1e-06, warmup_epochs=20, warmup_steps=-1, color_jitter=0.4, aa='rand-m9-mstd0.5-inc1', smoothing=0.1, train_interpolation='bicubic', crop_pct=None, reprob=0.25, remode='pixel', recount=1, resplit=False, mixup=0.8, cutmix=1.0, cutmix_minmax=None, mixup_prob=1.0, mixup_switch_prob=0.5, mixup_mode='batch', finetune='', head_init_scale=1.0, model_key='model|module', model_prefix='', data_path='/dev/shm/imagenet', eval_data_path=None, nb_classes=1000, imagenet_default_mean_and_std=True, data_set='IMNET', output_dir='./checkpoint_base_256_11.4G', log_dir=None, device='cuda', seed=0, resume='', auto_resume=True, save_ckpt=True, save_ckpt_freq=1, save_ckpt_num=3, start_epoch=0, eval=False, dist_eval=True, disable_eval=False, num_workers=10, pin_mem=True, world_size=8, local_rank=-1, dist_on_itp=False, dist_url='env://', use_amp=True, enable_wandb=False, project='convnext', wandb_ckpt=False, rank=0, gpu=0, distributed=True, dist_backend='nccl')
Transform = 
RandomResizedCropAndInterpolation(size=(256, 256), scale=(0.08, 1.0), ratio=(0.75, 1.3333), interpolation=bicubic)
RandomHorizontalFlip(p=0.5)
RandAugment(n=2, ops=
	AugmentOp(name=AutoContrast, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=Equalize, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=Invert, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=Rotate, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=PosterizeIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=SolarizeIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=SolarizeAdd, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ColorIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ContrastIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=BrightnessIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=SharpnessIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ShearX, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ShearY, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=TranslateXRel, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=TranslateYRel, p=0.5, m=9, mstd=0.5))
ToTensor()
Normalize(mean=tensor([0.4850, 0.4560, 0.4060]), std=tensor([0.2290, 0.2240, 0.2250]))
RandomErasing(p=0.25, mode=pixel, count=(1, 1))
---------------------------
reading from datapath /dev/shm/imagenet
Number of the class = 1000
Transform = 
Resize(size=292, interpolation=bicubic, max_size=None, antialias=True)
CenterCrop(size=(256, 256))
ToTensor()
Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
---------------------------
reading from datapath /dev/shm/imagenet
Number of the class = 1000
Sampler_train = <torch.utils.data.distributed.DistributedSampler object at 0x7f363d255c10>
Mixup is activated!
Model = RaCNN(
  (first_conv): ConvX(
    (conv): Conv2d(3, 48, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): Act(
      (act): GELU(approximate='none')
    )
  )
  (layer1): Sequential(
    (0): DownBlock(
      (mlp): Sequential(
        (0): ConvX(
          (conv): Conv2d(48, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (1): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (2): ConvX(
          (conv): Conv2d(192, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(48, 48, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=48, bias=False)
          (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (1): ConvX(
          (conv): Conv2d(48, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (drop_path): Identity()
    )
    (1): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(96, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(192, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(24, 192, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(192, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(96, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=96, bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.013)
    )
    (2): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(96, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(192, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(24, 192, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(192, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(96, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=96, bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.026)
    )
    (3): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(96, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(192, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(24, 192, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(192, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(96, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=96, bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.039)
    )
  )
  (layer2): Sequential(
    (0): DownBlock(
      (mlp): Sequential(
        (0): ConvX(
          (conv): Conv2d(96, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (1): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (2): ConvX(
          (conv): Conv2d(384, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=96, bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (1): ConvX(
          (conv): Conv2d(96, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (drop_path): DropPath(drop_prob=0.052)
    )
    (1): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.065)
    )
    (2): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.077)
    )
    (3): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.090)
    )
    (4): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.103)
    )
    (5): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.116)
    )
    (6): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.129)
    )
    (7): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.142)
    )
  )
  (layer3): Sequential(
    (0): DownBlock(
      (mlp): Sequential(
        (0): ConvX(
          (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (1): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (2): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (1): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (drop_path): DropPath(drop_prob=0.155)
    )
    (1): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.168)
    )
    (2): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.181)
    )
    (3): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.194)
    )
    (4): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.206)
    )
    (5): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.219)
    )
    (6): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.232)
    )
    (7): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.245)
    )
    (8): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.258)
    )
    (9): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.271)
    )
    (10): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.284)
    )
    (11): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.297)
    )
    (12): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.310)
    )
    (13): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.323)
    )
    (14): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.335)
    )
    (15): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.348)
    )
  )
  (layer4): Sequential(
    (0): DownBlock(
      (mlp): Sequential(
        (0): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (1): ConvX(
          (conv): Conv2d(1536, 1536, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=1536, bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (2): ConvX(
          (conv): Conv2d(1536, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (1): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (drop_path): DropPath(drop_prob=0.361)
    )
    (1): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(768, 3072, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(3072, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(1536, 1536, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1536, bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(1536, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(192, 1536, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(1536, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(768, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.374)
    )
    (2): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(768, 3072, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(3072, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(1536, 1536, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1536, bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(1536, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(192, 1536, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(1536, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(768, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.387)
    )
    (3): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(768, 3072, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(3072, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(1536, 1536, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1536, bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(1536, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(192, 1536, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(1536, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(768, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.400)
    )
  )
  (head): ConvX(
    (conv): Conv2d(768, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (norm): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): Act(
      (act): GELU(approximate='none')
    )
  )
  (gap): AdaptiveAvgPool2d(output_size=1)
  (classifier): MlpHead(
    (fc1): Linear(in_features=1024, out_features=2048, bias=False)
    (norm): BatchNorm1d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): Act(
      (act): GELU(approximate='none')
    )
    (drop): Dropout(p=0.2, inplace=False)
    (fc2): Linear(in_features=2048, out_features=1000, bias=False)
  )
)
number of params: 50901626
LR = 0.00400000
Batch size = 4096
Update frequency = 4
Number of training examples = 1281167
Number of training steps per epoch = 312
Param groups = {
  "decay": {
    "weight_decay": 0.05,
    "params": [
      "first_conv.conv.weight",
      "layer1.0.mlp.0.conv.weight",
      "layer1.0.mlp.1.conv.weight",
      "layer1.0.mlp.2.conv.weight",
      "layer1.0.skip.0.conv.weight",
      "layer1.0.skip.1.conv.weight",
      "layer1.1.mlp.conv_in.conv.weight",
      "layer1.1.mlp.dw.conv.weight",
      "layer1.1.mlp.re.region.0.weight",
      "layer1.1.mlp.re.region.3.weight",
      "layer1.1.mlp.proj.conv.weight",
      "layer1.1.dcnn.conv_in.conv.weight",
      "layer1.1.dcnn.spe.conv.weight",
      "layer1.1.dcnn.att.logit_scale",
      "layer1.1.dcnn.proj.conv.weight",
      "layer1.2.mlp.conv_in.conv.weight",
      "layer1.2.mlp.dw.conv.weight",
      "layer1.2.mlp.re.region.0.weight",
      "layer1.2.mlp.re.region.3.weight",
      "layer1.2.mlp.proj.conv.weight",
      "layer1.2.dcnn.conv_in.conv.weight",
      "layer1.2.dcnn.spe.conv.weight",
      "layer1.2.dcnn.att.logit_scale",
      "layer1.2.dcnn.proj.conv.weight",
      "layer1.3.mlp.conv_in.conv.weight",
      "layer1.3.mlp.dw.conv.weight",
      "layer1.3.mlp.re.region.0.weight",
      "layer1.3.mlp.re.region.3.weight",
      "layer1.3.mlp.proj.conv.weight",
      "layer1.3.dcnn.conv_in.conv.weight",
      "layer1.3.dcnn.spe.conv.weight",
      "layer1.3.dcnn.att.logit_scale",
      "layer1.3.dcnn.proj.conv.weight",
      "layer2.0.mlp.0.conv.weight",
      "layer2.0.mlp.1.conv.weight",
      "layer2.0.mlp.2.conv.weight",
      "layer2.0.skip.0.conv.weight",
      "layer2.0.skip.1.conv.weight",
      "layer2.1.mlp.conv_in.conv.weight",
      "layer2.1.mlp.dw.conv.weight",
      "layer2.1.mlp.re.region.0.weight",
      "layer2.1.mlp.re.region.3.weight",
      "layer2.1.mlp.proj.conv.weight",
      "layer2.1.dcnn.conv_in.conv.weight",
      "layer2.1.dcnn.spe.conv.weight",
      "layer2.1.dcnn.att.logit_scale",
      "layer2.1.dcnn.proj.conv.weight",
      "layer2.2.mlp.conv_in.conv.weight",
      "layer2.2.mlp.dw.conv.weight",
      "layer2.2.mlp.re.region.0.weight",
      "layer2.2.mlp.re.region.3.weight",
      "layer2.2.mlp.proj.conv.weight",
      "layer2.2.dcnn.conv_in.conv.weight",
      "layer2.2.dcnn.spe.conv.weight",
      "layer2.2.dcnn.att.logit_scale",
      "layer2.2.dcnn.proj.conv.weight",
      "layer2.3.mlp.conv_in.conv.weight",
      "layer2.3.mlp.dw.conv.weight",
      "layer2.3.mlp.re.region.0.weight",
      "layer2.3.mlp.re.region.3.weight",
      "layer2.3.mlp.proj.conv.weight",
      "layer2.3.dcnn.conv_in.conv.weight",
      "layer2.3.dcnn.spe.conv.weight",
      "layer2.3.dcnn.att.logit_scale",
      "layer2.3.dcnn.proj.conv.weight",
      "layer2.4.mlp.conv_in.conv.weight",
      "layer2.4.mlp.dw.conv.weight",
      "layer2.4.mlp.re.region.0.weight",
      "layer2.4.mlp.re.region.3.weight",
      "layer2.4.mlp.proj.conv.weight",
      "layer2.4.dcnn.conv_in.conv.weight",
      "layer2.4.dcnn.spe.conv.weight",
      "layer2.4.dcnn.att.logit_scale",
      "layer2.4.dcnn.proj.conv.weight",
      "layer2.5.mlp.conv_in.conv.weight",
      "layer2.5.mlp.dw.conv.weight",
      "layer2.5.mlp.re.region.0.weight",
      "layer2.5.mlp.re.region.3.weight",
      "layer2.5.mlp.proj.conv.weight",
      "layer2.5.dcnn.conv_in.conv.weight",
      "layer2.5.dcnn.spe.conv.weight",
      "layer2.5.dcnn.att.logit_scale",
      "layer2.5.dcnn.proj.conv.weight",
      "layer2.6.mlp.conv_in.conv.weight",
      "layer2.6.mlp.dw.conv.weight",
      "layer2.6.mlp.re.region.0.weight",
      "layer2.6.mlp.re.region.3.weight",
      "layer2.6.mlp.proj.conv.weight",
      "layer2.6.dcnn.conv_in.conv.weight",
      "layer2.6.dcnn.spe.conv.weight",
      "layer2.6.dcnn.att.logit_scale",
      "layer2.6.dcnn.proj.conv.weight",
      "layer2.7.mlp.conv_in.conv.weight",
      "layer2.7.mlp.dw.conv.weight",
      "layer2.7.mlp.re.region.0.weight",
      "layer2.7.mlp.re.region.3.weight",
      "layer2.7.mlp.proj.conv.weight",
      "layer2.7.dcnn.conv_in.conv.weight",
      "layer2.7.dcnn.spe.conv.weight",
      "layer2.7.dcnn.att.logit_scale",
      "layer2.7.dcnn.proj.conv.weight",
      "layer3.0.mlp.0.conv.weight",
      "layer3.0.mlp.1.conv.weight",
      "layer3.0.mlp.2.conv.weight",
      "layer3.0.skip.0.conv.weight",
      "layer3.0.skip.1.conv.weight",
      "layer3.1.mlp.conv_in.conv.weight",
      "layer3.1.mlp.dw.conv.weight",
      "layer3.1.mlp.re.region.0.weight",
      "layer3.1.mlp.re.region.3.weight",
      "layer3.1.mlp.proj.conv.weight",
      "layer3.1.dcnn.conv_in.conv.weight",
      "layer3.1.dcnn.spe.conv.weight",
      "layer3.1.dcnn.att.logit_scale",
      "layer3.1.dcnn.proj.conv.weight",
      "layer3.2.mlp.conv_in.conv.weight",
      "layer3.2.mlp.dw.conv.weight",
      "layer3.2.mlp.re.region.0.weight",
      "layer3.2.mlp.re.region.3.weight",
      "layer3.2.mlp.proj.conv.weight",
      "layer3.2.dcnn.conv_in.conv.weight",
      "layer3.2.dcnn.spe.conv.weight",
      "layer3.2.dcnn.att.logit_scale",
      "layer3.2.dcnn.proj.conv.weight",
      "layer3.3.mlp.conv_in.conv.weight",
      "layer3.3.mlp.dw.conv.weight",
      "layer3.3.mlp.re.region.0.weight",
      "layer3.3.mlp.re.region.3.weight",
      "layer3.3.mlp.proj.conv.weight",
      "layer3.3.dcnn.conv_in.conv.weight",
      "layer3.3.dcnn.spe.conv.weight",
      "layer3.3.dcnn.att.logit_scale",
      "layer3.3.dcnn.proj.conv.weight",
      "layer3.4.mlp.conv_in.conv.weight",
      "layer3.4.mlp.dw.conv.weight",
      "layer3.4.mlp.re.region.0.weight",
      "layer3.4.mlp.re.region.3.weight",
      "layer3.4.mlp.proj.conv.weight",
      "layer3.4.dcnn.conv_in.conv.weight",
      "layer3.4.dcnn.spe.conv.weight",
      "layer3.4.dcnn.att.logit_scale",
      "layer3.4.dcnn.proj.conv.weight",
      "layer3.5.mlp.conv_in.conv.weight",
      "layer3.5.mlp.dw.conv.weight",
      "layer3.5.mlp.re.region.0.weight",
      "layer3.5.mlp.re.region.3.weight",
      "layer3.5.mlp.proj.conv.weight",
      "layer3.5.dcnn.conv_in.conv.weight",
      "layer3.5.dcnn.spe.conv.weight",
      "layer3.5.dcnn.att.logit_scale",
      "layer3.5.dcnn.proj.conv.weight",
      "layer3.6.mlp.conv_in.conv.weight",
      "layer3.6.mlp.dw.conv.weight",
      "layer3.6.mlp.re.region.0.weight",
      "layer3.6.mlp.re.region.3.weight",
      "layer3.6.mlp.proj.conv.weight",
      "layer3.6.dcnn.conv_in.conv.weight",
      "layer3.6.dcnn.spe.conv.weight",
      "layer3.6.dcnn.att.logit_scale",
      "layer3.6.dcnn.proj.conv.weight",
      "layer3.7.mlp.conv_in.conv.weight",
      "layer3.7.mlp.dw.conv.weight",
      "layer3.7.mlp.re.region.0.weight",
      "layer3.7.mlp.re.region.3.weight",
      "layer3.7.mlp.proj.conv.weight",
      "layer3.7.dcnn.conv_in.conv.weight",
      "layer3.7.dcnn.spe.conv.weight",
      "layer3.7.dcnn.att.logit_scale",
      "layer3.7.dcnn.proj.conv.weight",
      "layer3.8.mlp.conv_in.conv.weight",
      "layer3.8.mlp.dw.conv.weight",
      "layer3.8.mlp.re.region.0.weight",
      "layer3.8.mlp.re.region.3.weight",
      "layer3.8.mlp.proj.conv.weight",
      "layer3.8.dcnn.conv_in.conv.weight",
      "layer3.8.dcnn.spe.conv.weight",
      "layer3.8.dcnn.att.logit_scale",
      "layer3.8.dcnn.proj.conv.weight",
      "layer3.9.mlp.conv_in.conv.weight",
      "layer3.9.mlp.dw.conv.weight",
      "layer3.9.mlp.re.region.0.weight",
      "layer3.9.mlp.re.region.3.weight",
      "layer3.9.mlp.proj.conv.weight",
      "layer3.9.dcnn.conv_in.conv.weight",
      "layer3.9.dcnn.spe.conv.weight",
      "layer3.9.dcnn.att.logit_scale",
      "layer3.9.dcnn.proj.conv.weight",
      "layer3.10.mlp.conv_in.conv.weight",
      "layer3.10.mlp.dw.conv.weight",
      "layer3.10.mlp.re.region.0.weight",
      "layer3.10.mlp.re.region.3.weight",
      "layer3.10.mlp.proj.conv.weight",
      "layer3.10.dcnn.conv_in.conv.weight",
      "layer3.10.dcnn.spe.conv.weight",
      "layer3.10.dcnn.att.logit_scale",
      "layer3.10.dcnn.proj.conv.weight",
      "layer3.11.mlp.conv_in.conv.weight",
      "layer3.11.mlp.dw.conv.weight",
      "layer3.11.mlp.re.region.0.weight",
      "layer3.11.mlp.re.region.3.weight",
      "layer3.11.mlp.proj.conv.weight",
      "layer3.11.dcnn.conv_in.conv.weight",
      "layer3.11.dcnn.spe.conv.weight",
      "layer3.11.dcnn.att.logit_scale",
      "layer3.11.dcnn.proj.conv.weight",
      "layer3.12.mlp.conv_in.conv.weight",
      "layer3.12.mlp.dw.conv.weight",
      "layer3.12.mlp.re.region.0.weight",
      "layer3.12.mlp.re.region.3.weight",
      "layer3.12.mlp.proj.conv.weight",
      "layer3.12.dcnn.conv_in.conv.weight",
      "layer3.12.dcnn.spe.conv.weight",
      "layer3.12.dcnn.att.logit_scale",
      "layer3.12.dcnn.proj.conv.weight",
      "layer3.13.mlp.conv_in.conv.weight",
      "layer3.13.mlp.dw.conv.weight",
      "layer3.13.mlp.re.region.0.weight",
      "layer3.13.mlp.re.region.3.weight",
      "layer3.13.mlp.proj.conv.weight",
      "layer3.13.dcnn.conv_in.conv.weight",
      "layer3.13.dcnn.spe.conv.weight",
      "layer3.13.dcnn.att.logit_scale",
      "layer3.13.dcnn.proj.conv.weight",
      "layer3.14.mlp.conv_in.conv.weight",
      "layer3.14.mlp.dw.conv.weight",
      "layer3.14.mlp.re.region.0.weight",
      "layer3.14.mlp.re.region.3.weight",
      "layer3.14.mlp.proj.conv.weight",
      "layer3.14.dcnn.conv_in.conv.weight",
      "layer3.14.dcnn.spe.conv.weight",
      "layer3.14.dcnn.att.logit_scale",
      "layer3.14.dcnn.proj.conv.weight",
      "layer3.15.mlp.conv_in.conv.weight",
      "layer3.15.mlp.dw.conv.weight",
      "layer3.15.mlp.re.region.0.weight",
      "layer3.15.mlp.re.region.3.weight",
      "layer3.15.mlp.proj.conv.weight",
      "layer3.15.dcnn.conv_in.conv.weight",
      "layer3.15.dcnn.spe.conv.weight",
      "layer3.15.dcnn.att.logit_scale",
      "layer3.15.dcnn.proj.conv.weight",
      "layer4.0.mlp.0.conv.weight",
      "layer4.0.mlp.1.conv.weight",
      "layer4.0.mlp.2.conv.weight",
      "layer4.0.skip.0.conv.weight",
      "layer4.0.skip.1.conv.weight",
      "layer4.1.mlp.conv_in.conv.weight",
      "layer4.1.mlp.dw.conv.weight",
      "layer4.1.mlp.re.region.0.weight",
      "layer4.1.mlp.re.region.3.weight",
      "layer4.1.mlp.proj.conv.weight",
      "layer4.1.dcnn.conv_in.conv.weight",
      "layer4.1.dcnn.spe.conv.weight",
      "layer4.1.dcnn.att.logit_scale",
      "layer4.1.dcnn.proj.conv.weight",
      "layer4.2.mlp.conv_in.conv.weight",
      "layer4.2.mlp.dw.conv.weight",
      "layer4.2.mlp.re.region.0.weight",
      "layer4.2.mlp.re.region.3.weight",
      "layer4.2.mlp.proj.conv.weight",
      "layer4.2.dcnn.conv_in.conv.weight",
      "layer4.2.dcnn.spe.conv.weight",
      "layer4.2.dcnn.att.logit_scale",
      "layer4.2.dcnn.proj.conv.weight",
      "layer4.3.mlp.conv_in.conv.weight",
      "layer4.3.mlp.dw.conv.weight",
      "layer4.3.mlp.re.region.0.weight",
      "layer4.3.mlp.re.region.3.weight",
      "layer4.3.mlp.proj.conv.weight",
      "layer4.3.dcnn.conv_in.conv.weight",
      "layer4.3.dcnn.spe.conv.weight",
      "layer4.3.dcnn.att.logit_scale",
      "layer4.3.dcnn.proj.conv.weight",
      "head.conv.weight",
      "classifier.fc1.weight",
      "classifier.fc2.weight"
    ],
    "lr_scale": 1.0
  },
  "no_decay": {
    "weight_decay": 0.0,
    "params": [
      "first_conv.norm.weight",
      "first_conv.norm.bias",
      "layer1.0.mlp.0.norm.weight",
      "layer1.0.mlp.0.norm.bias",
      "layer1.0.mlp.1.norm.weight",
      "layer1.0.mlp.1.norm.bias",
      "layer1.0.mlp.2.norm.weight",
      "layer1.0.mlp.2.norm.bias",
      "layer1.0.skip.0.norm.weight",
      "layer1.0.skip.0.norm.bias",
      "layer1.0.skip.1.norm.weight",
      "layer1.0.skip.1.norm.bias",
      "layer1.1.mlp.conv_in.norm.weight",
      "layer1.1.mlp.conv_in.norm.bias",
      "layer1.1.mlp.dw.norm.weight",
      "layer1.1.mlp.dw.norm.bias",
      "layer1.1.mlp.re.region.1.weight",
      "layer1.1.mlp.re.region.1.bias",
      "layer1.1.mlp.re.region.3.bias",
      "layer1.1.mlp.proj.norm.weight",
      "layer1.1.mlp.proj.norm.bias",
      "layer1.1.dcnn.conv_in.norm.weight",
      "layer1.1.dcnn.conv_in.norm.bias",
      "layer1.1.dcnn.spe.norm.weight",
      "layer1.1.dcnn.spe.norm.bias",
      "layer1.1.dcnn.proj.norm.weight",
      "layer1.1.dcnn.proj.norm.bias",
      "layer1.2.mlp.conv_in.norm.weight",
      "layer1.2.mlp.conv_in.norm.bias",
      "layer1.2.mlp.dw.norm.weight",
      "layer1.2.mlp.dw.norm.bias",
      "layer1.2.mlp.re.region.1.weight",
      "layer1.2.mlp.re.region.1.bias",
      "layer1.2.mlp.re.region.3.bias",
      "layer1.2.mlp.proj.norm.weight",
      "layer1.2.mlp.proj.norm.bias",
      "layer1.2.dcnn.conv_in.norm.weight",
      "layer1.2.dcnn.conv_in.norm.bias",
      "layer1.2.dcnn.spe.norm.weight",
      "layer1.2.dcnn.spe.norm.bias",
      "layer1.2.dcnn.proj.norm.weight",
      "layer1.2.dcnn.proj.norm.bias",
      "layer1.3.mlp.conv_in.norm.weight",
      "layer1.3.mlp.conv_in.norm.bias",
      "layer1.3.mlp.dw.norm.weight",
      "layer1.3.mlp.dw.norm.bias",
      "layer1.3.mlp.re.region.1.weight",
      "layer1.3.mlp.re.region.1.bias",
      "layer1.3.mlp.re.region.3.bias",
      "layer1.3.mlp.proj.norm.weight",
      "layer1.3.mlp.proj.norm.bias",
      "layer1.3.dcnn.conv_in.norm.weight",
      "layer1.3.dcnn.conv_in.norm.bias",
      "layer1.3.dcnn.spe.norm.weight",
      "layer1.3.dcnn.spe.norm.bias",
      "layer1.3.dcnn.proj.norm.weight",
      "layer1.3.dcnn.proj.norm.bias",
      "layer2.0.mlp.0.norm.weight",
      "layer2.0.mlp.0.norm.bias",
      "layer2.0.mlp.1.norm.weight",
      "layer2.0.mlp.1.norm.bias",
      "layer2.0.mlp.2.norm.weight",
      "layer2.0.mlp.2.norm.bias",
      "layer2.0.skip.0.norm.weight",
      "layer2.0.skip.0.norm.bias",
      "layer2.0.skip.1.norm.weight",
      "layer2.0.skip.1.norm.bias",
      "layer2.1.mlp.conv_in.norm.weight",
      "layer2.1.mlp.conv_in.norm.bias",
      "layer2.1.mlp.dw.norm.weight",
      "layer2.1.mlp.dw.norm.bias",
      "layer2.1.mlp.re.region.1.weight",
      "layer2.1.mlp.re.region.1.bias",
      "layer2.1.mlp.re.region.3.bias",
      "layer2.1.mlp.proj.norm.weight",
      "layer2.1.mlp.proj.norm.bias",
      "layer2.1.dcnn.conv_in.norm.weight",
      "layer2.1.dcnn.conv_in.norm.bias",
      "layer2.1.dcnn.spe.norm.weight",
      "layer2.1.dcnn.spe.norm.bias",
      "layer2.1.dcnn.proj.norm.weight",
      "layer2.1.dcnn.proj.norm.bias",
      "layer2.2.mlp.conv_in.norm.weight",
      "layer2.2.mlp.conv_in.norm.bias",
      "layer2.2.mlp.dw.norm.weight",
      "layer2.2.mlp.dw.norm.bias",
      "layer2.2.mlp.re.region.1.weight",
      "layer2.2.mlp.re.region.1.bias",
      "layer2.2.mlp.re.region.3.bias",
      "layer2.2.mlp.proj.norm.weight",
      "layer2.2.mlp.proj.norm.bias",
      "layer2.2.dcnn.conv_in.norm.weight",
      "layer2.2.dcnn.conv_in.norm.bias",
      "layer2.2.dcnn.spe.norm.weight",
      "layer2.2.dcnn.spe.norm.bias",
      "layer2.2.dcnn.proj.norm.weight",
      "layer2.2.dcnn.proj.norm.bias",
      "layer2.3.mlp.conv_in.norm.weight",
      "layer2.3.mlp.conv_in.norm.bias",
      "layer2.3.mlp.dw.norm.weight",
      "layer2.3.mlp.dw.norm.bias",
      "layer2.3.mlp.re.region.1.weight",
      "layer2.3.mlp.re.region.1.bias",
      "layer2.3.mlp.re.region.3.bias",
      "layer2.3.mlp.proj.norm.weight",
      "layer2.3.mlp.proj.norm.bias",
      "layer2.3.dcnn.conv_in.norm.weight",
      "layer2.3.dcnn.conv_in.norm.bias",
      "layer2.3.dcnn.spe.norm.weight",
      "layer2.3.dcnn.spe.norm.bias",
      "layer2.3.dcnn.proj.norm.weight",
      "layer2.3.dcnn.proj.norm.bias",
      "layer2.4.mlp.conv_in.norm.weight",
      "layer2.4.mlp.conv_in.norm.bias",
      "layer2.4.mlp.dw.norm.weight",
      "layer2.4.mlp.dw.norm.bias",
      "layer2.4.mlp.re.region.1.weight",
      "layer2.4.mlp.re.region.1.bias",
      "layer2.4.mlp.re.region.3.bias",
      "layer2.4.mlp.proj.norm.weight",
      "layer2.4.mlp.proj.norm.bias",
      "layer2.4.dcnn.conv_in.norm.weight",
      "layer2.4.dcnn.conv_in.norm.bias",
      "layer2.4.dcnn.spe.norm.weight",
      "layer2.4.dcnn.spe.norm.bias",
      "layer2.4.dcnn.proj.norm.weight",
      "layer2.4.dcnn.proj.norm.bias",
      "layer2.5.mlp.conv_in.norm.weight",
      "layer2.5.mlp.conv_in.norm.bias",
      "layer2.5.mlp.dw.norm.weight",
      "layer2.5.mlp.dw.norm.bias",
      "layer2.5.mlp.re.region.1.weight",
      "layer2.5.mlp.re.region.1.bias",
      "layer2.5.mlp.re.region.3.bias",
      "layer2.5.mlp.proj.norm.weight",
      "layer2.5.mlp.proj.norm.bias",
      "layer2.5.dcnn.conv_in.norm.weight",
      "layer2.5.dcnn.conv_in.norm.bias",
      "layer2.5.dcnn.spe.norm.weight",
      "layer2.5.dcnn.spe.norm.bias",
      "layer2.5.dcnn.proj.norm.weight",
      "layer2.5.dcnn.proj.norm.bias",
      "layer2.6.mlp.conv_in.norm.weight",
      "layer2.6.mlp.conv_in.norm.bias",
      "layer2.6.mlp.dw.norm.weight",
      "layer2.6.mlp.dw.norm.bias",
      "layer2.6.mlp.re.region.1.weight",
      "layer2.6.mlp.re.region.1.bias",
      "layer2.6.mlp.re.region.3.bias",
      "layer2.6.mlp.proj.norm.weight",
      "layer2.6.mlp.proj.norm.bias",
      "layer2.6.dcnn.conv_in.norm.weight",
      "layer2.6.dcnn.conv_in.norm.bias",
      "layer2.6.dcnn.spe.norm.weight",
      "layer2.6.dcnn.spe.norm.bias",
      "layer2.6.dcnn.proj.norm.weight",
      "layer2.6.dcnn.proj.norm.bias",
      "layer2.7.mlp.conv_in.norm.weight",
      "layer2.7.mlp.conv_in.norm.bias",
      "layer2.7.mlp.dw.norm.weight",
      "layer2.7.mlp.dw.norm.bias",
      "layer2.7.mlp.re.region.1.weight",
      "layer2.7.mlp.re.region.1.bias",
      "layer2.7.mlp.re.region.3.bias",
      "layer2.7.mlp.proj.norm.weight",
      "layer2.7.mlp.proj.norm.bias",
      "layer2.7.dcnn.conv_in.norm.weight",
      "layer2.7.dcnn.conv_in.norm.bias",
      "layer2.7.dcnn.spe.norm.weight",
      "layer2.7.dcnn.spe.norm.bias",
      "layer2.7.dcnn.proj.norm.weight",
      "layer2.7.dcnn.proj.norm.bias",
      "layer3.0.mlp.0.norm.weight",
      "layer3.0.mlp.0.norm.bias",
      "layer3.0.mlp.1.norm.weight",
      "layer3.0.mlp.1.norm.bias",
      "layer3.0.mlp.2.norm.weight",
      "layer3.0.mlp.2.norm.bias",
      "layer3.0.skip.0.norm.weight",
      "layer3.0.skip.0.norm.bias",
      "layer3.0.skip.1.norm.weight",
      "layer3.0.skip.1.norm.bias",
      "layer3.1.mlp.conv_in.norm.weight",
      "layer3.1.mlp.conv_in.norm.bias",
      "layer3.1.mlp.dw.norm.weight",
      "layer3.1.mlp.dw.norm.bias",
      "layer3.1.mlp.re.region.1.weight",
      "layer3.1.mlp.re.region.1.bias",
      "layer3.1.mlp.re.region.3.bias",
      "layer3.1.mlp.proj.norm.weight",
      "layer3.1.mlp.proj.norm.bias",
      "layer3.1.dcnn.conv_in.norm.weight",
      "layer3.1.dcnn.conv_in.norm.bias",
      "layer3.1.dcnn.spe.norm.weight",
      "layer3.1.dcnn.spe.norm.bias",
      "layer3.1.dcnn.proj.norm.weight",
      "layer3.1.dcnn.proj.norm.bias",
      "layer3.2.mlp.conv_in.norm.weight",
      "layer3.2.mlp.conv_in.norm.bias",
      "layer3.2.mlp.dw.norm.weight",
      "layer3.2.mlp.dw.norm.bias",
      "layer3.2.mlp.re.region.1.weight",
      "layer3.2.mlp.re.region.1.bias",
      "layer3.2.mlp.re.region.3.bias",
      "layer3.2.mlp.proj.norm.weight",
      "layer3.2.mlp.proj.norm.bias",
      "layer3.2.dcnn.conv_in.norm.weight",
      "layer3.2.dcnn.conv_in.norm.bias",
      "layer3.2.dcnn.spe.norm.weight",
      "layer3.2.dcnn.spe.norm.bias",
      "layer3.2.dcnn.proj.norm.weight",
      "layer3.2.dcnn.proj.norm.bias",
      "layer3.3.mlp.conv_in.norm.weight",
      "layer3.3.mlp.conv_in.norm.bias",
      "layer3.3.mlp.dw.norm.weight",
      "layer3.3.mlp.dw.norm.bias",
      "layer3.3.mlp.re.region.1.weight",
      "layer3.3.mlp.re.region.1.bias",
      "layer3.3.mlp.re.region.3.bias",
      "layer3.3.mlp.proj.norm.weight",
      "layer3.3.mlp.proj.norm.bias",
      "layer3.3.dcnn.conv_in.norm.weight",
      "layer3.3.dcnn.conv_in.norm.bias",
      "layer3.3.dcnn.spe.norm.weight",
      "layer3.3.dcnn.spe.norm.bias",
      "layer3.3.dcnn.proj.norm.weight",
      "layer3.3.dcnn.proj.norm.bias",
      "layer3.4.mlp.conv_in.norm.weight",
      "layer3.4.mlp.conv_in.norm.bias",
      "layer3.4.mlp.dw.norm.weight",
      "layer3.4.mlp.dw.norm.bias",
      "layer3.4.mlp.re.region.1.weight",
      "layer3.4.mlp.re.region.1.bias",
      "layer3.4.mlp.re.region.3.bias",
      "layer3.4.mlp.proj.norm.weight",
      "layer3.4.mlp.proj.norm.bias",
      "layer3.4.dcnn.conv_in.norm.weight",
      "layer3.4.dcnn.conv_in.norm.bias",
      "layer3.4.dcnn.spe.norm.weight",
      "layer3.4.dcnn.spe.norm.bias",
      "layer3.4.dcnn.proj.norm.weight",
      "layer3.4.dcnn.proj.norm.bias",
      "layer3.5.mlp.conv_in.norm.weight",
      "layer3.5.mlp.conv_in.norm.bias",
      "layer3.5.mlp.dw.norm.weight",
      "layer3.5.mlp.dw.norm.bias",
      "layer3.5.mlp.re.region.1.weight",
      "layer3.5.mlp.re.region.1.bias",
      "layer3.5.mlp.re.region.3.bias",
      "layer3.5.mlp.proj.norm.weight",
      "layer3.5.mlp.proj.norm.bias",
      "layer3.5.dcnn.conv_in.norm.weight",
      "layer3.5.dcnn.conv_in.norm.bias",
      "layer3.5.dcnn.spe.norm.weight",
      "layer3.5.dcnn.spe.norm.bias",
      "layer3.5.dcnn.proj.norm.weight",
      "layer3.5.dcnn.proj.norm.bias",
      "layer3.6.mlp.conv_in.norm.weight",
      "layer3.6.mlp.conv_in.norm.bias",
      "layer3.6.mlp.dw.norm.weight",
      "layer3.6.mlp.dw.norm.bias",
      "layer3.6.mlp.re.region.1.weight",
      "layer3.6.mlp.re.region.1.bias",
      "layer3.6.mlp.re.region.3.bias",
      "layer3.6.mlp.proj.norm.weight",
      "layer3.6.mlp.proj.norm.bias",
      "layer3.6.dcnn.conv_in.norm.weight",
      "layer3.6.dcnn.conv_in.norm.bias",
      "layer3.6.dcnn.spe.norm.weight",
      "layer3.6.dcnn.spe.norm.bias",
      "layer3.6.dcnn.proj.norm.weight",
      "layer3.6.dcnn.proj.norm.bias",
      "layer3.7.mlp.conv_in.norm.weight",
      "layer3.7.mlp.conv_in.norm.bias",
      "layer3.7.mlp.dw.norm.weight",
      "layer3.7.mlp.dw.norm.bias",
      "layer3.7.mlp.re.region.1.weight",
      "layer3.7.mlp.re.region.1.bias",
      "layer3.7.mlp.re.region.3.bias",
      "layer3.7.mlp.proj.norm.weight",
      "layer3.7.mlp.proj.norm.bias",
      "layer3.7.dcnn.conv_in.norm.weight",
      "layer3.7.dcnn.conv_in.norm.bias",
      "layer3.7.dcnn.spe.norm.weight",
      "layer3.7.dcnn.spe.norm.bias",
      "layer3.7.dcnn.proj.norm.weight",
      "layer3.7.dcnn.proj.norm.bias",
      "layer3.8.mlp.conv_in.norm.weight",
      "layer3.8.mlp.conv_in.norm.bias",
      "layer3.8.mlp.dw.norm.weight",
      "layer3.8.mlp.dw.norm.bias",
      "layer3.8.mlp.re.region.1.weight",
      "layer3.8.mlp.re.region.1.bias",
      "layer3.8.mlp.re.region.3.bias",
      "layer3.8.mlp.proj.norm.weight",
      "layer3.8.mlp.proj.norm.bias",
      "layer3.8.dcnn.conv_in.norm.weight",
      "layer3.8.dcnn.conv_in.norm.bias",
      "layer3.8.dcnn.spe.norm.weight",
      "layer3.8.dcnn.spe.norm.bias",
      "layer3.8.dcnn.proj.norm.weight",
      "layer3.8.dcnn.proj.norm.bias",
      "layer3.9.mlp.conv_in.norm.weight",
      "layer3.9.mlp.conv_in.norm.bias",
      "layer3.9.mlp.dw.norm.weight",
      "layer3.9.mlp.dw.norm.bias",
      "layer3.9.mlp.re.region.1.weight",
      "layer3.9.mlp.re.region.1.bias",
      "layer3.9.mlp.re.region.3.bias",
      "layer3.9.mlp.proj.norm.weight",
      "layer3.9.mlp.proj.norm.bias",
      "layer3.9.dcnn.conv_in.norm.weight",
      "layer3.9.dcnn.conv_in.norm.bias",
      "layer3.9.dcnn.spe.norm.weight",
      "layer3.9.dcnn.spe.norm.bias",
      "layer3.9.dcnn.proj.norm.weight",
      "layer3.9.dcnn.proj.norm.bias",
      "layer3.10.mlp.conv_in.norm.weight",
      "layer3.10.mlp.conv_in.norm.bias",
      "layer3.10.mlp.dw.norm.weight",
      "layer3.10.mlp.dw.norm.bias",
      "layer3.10.mlp.re.region.1.weight",
      "layer3.10.mlp.re.region.1.bias",
      "layer3.10.mlp.re.region.3.bias",
      "layer3.10.mlp.proj.norm.weight",
      "layer3.10.mlp.proj.norm.bias",
      "layer3.10.dcnn.conv_in.norm.weight",
      "layer3.10.dcnn.conv_in.norm.bias",
      "layer3.10.dcnn.spe.norm.weight",
      "layer3.10.dcnn.spe.norm.bias",
      "layer3.10.dcnn.proj.norm.weight",
      "layer3.10.dcnn.proj.norm.bias",
      "layer3.11.mlp.conv_in.norm.weight",
      "layer3.11.mlp.conv_in.norm.bias",
      "layer3.11.mlp.dw.norm.weight",
      "layer3.11.mlp.dw.norm.bias",
      "layer3.11.mlp.re.region.1.weight",
      "layer3.11.mlp.re.region.1.bias",
      "layer3.11.mlp.re.region.3.bias",
      "layer3.11.mlp.proj.norm.weight",
      "layer3.11.mlp.proj.norm.bias",
      "layer3.11.dcnn.conv_in.norm.weight",
      "layer3.11.dcnn.conv_in.norm.bias",
      "layer3.11.dcnn.spe.norm.weight",
      "layer3.11.dcnn.spe.norm.bias",
      "layer3.11.dcnn.proj.norm.weight",
      "layer3.11.dcnn.proj.norm.bias",
      "layer3.12.mlp.conv_in.norm.weight",
      "layer3.12.mlp.conv_in.norm.bias",
      "layer3.12.mlp.dw.norm.weight",
      "layer3.12.mlp.dw.norm.bias",
      "layer3.12.mlp.re.region.1.weight",
      "layer3.12.mlp.re.region.1.bias",
      "layer3.12.mlp.re.region.3.bias",
      "layer3.12.mlp.proj.norm.weight",
      "layer3.12.mlp.proj.norm.bias",
      "layer3.12.dcnn.conv_in.norm.weight",
      "layer3.12.dcnn.conv_in.norm.bias",
      "layer3.12.dcnn.spe.norm.weight",
      "layer3.12.dcnn.spe.norm.bias",
      "layer3.12.dcnn.proj.norm.weight",
      "layer3.12.dcnn.proj.norm.bias",
      "layer3.13.mlp.conv_in.norm.weight",
      "layer3.13.mlp.conv_in.norm.bias",
      "layer3.13.mlp.dw.norm.weight",
      "layer3.13.mlp.dw.norm.bias",
      "layer3.13.mlp.re.region.1.weight",
      "layer3.13.mlp.re.region.1.bias",
      "layer3.13.mlp.re.region.3.bias",
      "layer3.13.mlp.proj.norm.weight",
      "layer3.13.mlp.proj.norm.bias",
      "layer3.13.dcnn.conv_in.norm.weight",
      "layer3.13.dcnn.conv_in.norm.bias",
      "layer3.13.dcnn.spe.norm.weight",
      "layer3.13.dcnn.spe.norm.bias",
      "layer3.13.dcnn.proj.norm.weight",
      "layer3.13.dcnn.proj.norm.bias",
      "layer3.14.mlp.conv_in.norm.weight",
      "layer3.14.mlp.conv_in.norm.bias",
      "layer3.14.mlp.dw.norm.weight",
      "layer3.14.mlp.dw.norm.bias",
      "layer3.14.mlp.re.region.1.weight",
      "layer3.14.mlp.re.region.1.bias",
      "layer3.14.mlp.re.region.3.bias",
      "layer3.14.mlp.proj.norm.weight",
      "layer3.14.mlp.proj.norm.bias",
      "layer3.14.dcnn.conv_in.norm.weight",
      "layer3.14.dcnn.conv_in.norm.bias",
      "layer3.14.dcnn.spe.norm.weight",
      "layer3.14.dcnn.spe.norm.bias",
      "layer3.14.dcnn.proj.norm.weight",
      "layer3.14.dcnn.proj.norm.bias",
      "layer3.15.mlp.conv_in.norm.weight",
      "layer3.15.mlp.conv_in.norm.bias",
      "layer3.15.mlp.dw.norm.weight",
      "layer3.15.mlp.dw.norm.bias",
      "layer3.15.mlp.re.region.1.weight",
      "layer3.15.mlp.re.region.1.bias",
      "layer3.15.mlp.re.region.3.bias",
      "layer3.15.mlp.proj.norm.weight",
      "layer3.15.mlp.proj.norm.bias",
      "layer3.15.dcnn.conv_in.norm.weight",
      "layer3.15.dcnn.conv_in.norm.bias",
      "layer3.15.dcnn.spe.norm.weight",
      "layer3.15.dcnn.spe.norm.bias",
      "layer3.15.dcnn.proj.norm.weight",
      "layer3.15.dcnn.proj.norm.bias",
      "layer4.0.mlp.0.norm.weight",
      "layer4.0.mlp.0.norm.bias",
      "layer4.0.mlp.1.norm.weight",
      "layer4.0.mlp.1.norm.bias",
      "layer4.0.mlp.2.norm.weight",
      "layer4.0.mlp.2.norm.bias",
      "layer4.0.skip.0.norm.weight",
      "layer4.0.skip.0.norm.bias",
      "layer4.0.skip.1.norm.weight",
      "layer4.0.skip.1.norm.bias",
      "layer4.1.mlp.conv_in.norm.weight",
      "layer4.1.mlp.conv_in.norm.bias",
      "layer4.1.mlp.dw.norm.weight",
      "layer4.1.mlp.dw.norm.bias",
      "layer4.1.mlp.re.region.1.weight",
      "layer4.1.mlp.re.region.1.bias",
      "layer4.1.mlp.re.region.3.bias",
      "layer4.1.mlp.proj.norm.weight",
      "layer4.1.mlp.proj.norm.bias",
      "layer4.1.dcnn.conv_in.norm.weight",
      "layer4.1.dcnn.conv_in.norm.bias",
      "layer4.1.dcnn.spe.norm.weight",
      "layer4.1.dcnn.spe.norm.bias",
      "layer4.1.dcnn.proj.norm.weight",
      "layer4.1.dcnn.proj.norm.bias",
      "layer4.2.mlp.conv_in.norm.weight",
      "layer4.2.mlp.conv_in.norm.bias",
      "layer4.2.mlp.dw.norm.weight",
      "layer4.2.mlp.dw.norm.bias",
      "layer4.2.mlp.re.region.1.weight",
      "layer4.2.mlp.re.region.1.bias",
      "layer4.2.mlp.re.region.3.bias",
      "layer4.2.mlp.proj.norm.weight",
      "layer4.2.mlp.proj.norm.bias",
      "layer4.2.dcnn.conv_in.norm.weight",
      "layer4.2.dcnn.conv_in.norm.bias",
      "layer4.2.dcnn.spe.norm.weight",
      "layer4.2.dcnn.spe.norm.bias",
      "layer4.2.dcnn.proj.norm.weight",
      "layer4.2.dcnn.proj.norm.bias",
      "layer4.3.mlp.conv_in.norm.weight",
      "layer4.3.mlp.conv_in.norm.bias",
      "layer4.3.mlp.dw.norm.weight",
      "layer4.3.mlp.dw.norm.bias",
      "layer4.3.mlp.re.region.1.weight",
      "layer4.3.mlp.re.region.1.bias",
      "layer4.3.mlp.re.region.3.bias",
      "layer4.3.mlp.proj.norm.weight",
      "layer4.3.mlp.proj.norm.bias",
      "layer4.3.dcnn.conv_in.norm.weight",
      "layer4.3.dcnn.conv_in.norm.bias",
      "layer4.3.dcnn.spe.norm.weight",
      "layer4.3.dcnn.spe.norm.bias",
      "layer4.3.dcnn.proj.norm.weight",
      "layer4.3.dcnn.proj.norm.bias",
      "head.norm.weight",
      "head.norm.bias",
      "classifier.norm.weight",
      "classifier.norm.bias"
    ],
    "lr_scale": 1.0
  }
}
Use Cosine LR scheduler
Set warmup steps = 6240
Set warmup steps = 0
Max WD = 0.0500000, Min WD = 0.0500000
criterion = SoftTargetCrossEntropy()
Auto resume checkpoint: checkpoint_base_256_11.4G/checkpoint-260.pth
Resume checkpoint checkpoint_base_256_11.4G/checkpoint-260.pth
With optim & sched!
Start training for 300 epochs
Epoch: [261]  [   0/1251]  eta: 5:06:39  lr: 0.000189  min_lr: 0.000189  loss: 2.8042 (2.8042)  weight_decay: 0.0500 (0.0500)  time: 14.7077  data: 2.6668  max mem: 54633
Epoch: [261]  [ 200/1251]  eta: 0:12:23  lr: 0.000188  min_lr: 0.000188  loss: 2.5836 (2.5810)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1868 (1.2750)  time: 0.6352  data: 0.0006  max mem: 54633
Epoch: [261]  [ 400/1251]  eta: 0:09:31  lr: 0.000186  min_lr: 0.000186  loss: 2.6642 (2.5631)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.6351  data: 0.0006  max mem: 54633
Epoch: [261]  [ 600/1251]  eta: 0:07:09  lr: 0.000185  min_lr: 0.000185  loss: 2.5575 (2.5819)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0012 (nan)  time: 0.6344  data: 0.0007  max mem: 54633
Epoch: [261]  [ 800/1251]  eta: 0:04:54  lr: 0.000183  min_lr: 0.000183  loss: 2.6676 (2.5775)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0832 (nan)  time: 0.6346  data: 0.0006  max mem: 54633
Epoch: [261]  [1000/1251]  eta: 0:02:43  lr: 0.000182  min_lr: 0.000182  loss: 2.6919 (2.5687)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0500 (nan)  time: 0.6357  data: 0.0007  max mem: 54633
Epoch: [261]  [1200/1251]  eta: 0:00:33  lr: 0.000180  min_lr: 0.000180  loss: 2.7721 (2.5769)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3004 (nan)  time: 0.6348  data: 0.0007  max mem: 54633
Epoch: [261]  [1250/1251]  eta: 0:00:00  lr: 0.000180  min_lr: 0.000180  loss: 2.3652 (2.5731)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3042 (nan)  time: 0.5384  data: 0.0006  max mem: 54633
Epoch: [261] Total time: 0:13:28 (0.6461 s / it)
Averaged stats: lr: 0.000180  min_lr: 0.000180  loss: 2.3652 (2.5812)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3042 (nan)
Test:  [ 0/25]  eta: 0:05:03  loss: 0.5035 (0.5035)  acc1: 91.6000 (91.6000)  acc5: 99.2000 (99.2000)  time: 12.1427  data: 7.5289  max mem: 54633
Test:  [10/25]  eta: 0:00:20  loss: 0.6187 (0.6308)  acc1: 87.6000 (87.7818)  acc5: 98.0000 (98.0000)  time: 1.3766  data: 0.6847  max mem: 54633
Test:  [20/25]  eta: 0:00:04  loss: 0.7958 (0.7447)  acc1: 83.2000 (84.7619)  acc5: 96.8000 (96.9143)  time: 0.3003  data: 0.0002  max mem: 54633
Test:  [24/25]  eta: 0:00:00  loss: 0.8220 (0.7591)  acc1: 82.8000 (84.4800)  acc5: 96.4000 (96.8800)  time: 0.3004  data: 0.0001  max mem: 54633
Test: Total time: 0:00:19 (0.7770 s / it)
* Acc@1 84.806 Acc@5 97.072 loss 0.756
Accuracy of the model on the 50000 test images: 84.8%
Max accuracy: 84.81%
Epoch: [262]  [   0/1251]  eta: 1:14:29  lr: 0.000180  min_lr: 0.000180  loss: 1.7709 (1.7709)  weight_decay: 0.0500 (0.0500)  time: 3.5731  data: 2.8475  max mem: 54633
Epoch: [262]  [ 200/1251]  eta: 0:11:17  lr: 0.000179  min_lr: 0.000179  loss: 2.6175 (2.5347)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1611 (1.3766)  time: 0.6282  data: 0.0005  max mem: 54633
Epoch: [262]  [ 400/1251]  eta: 0:09:01  lr: 0.000177  min_lr: 0.000177  loss: 2.5275 (2.5616)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2124 (1.3673)  time: 0.6285  data: 0.0005  max mem: 54633
Epoch: [262]  [ 600/1251]  eta: 0:06:52  lr: 0.000176  min_lr: 0.000176  loss: 2.5843 (2.5656)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1129 (1.2906)  time: 0.6281  data: 0.0005  max mem: 54633
Epoch: [262]  [ 800/1251]  eta: 0:04:45  lr: 0.000174  min_lr: 0.000174  loss: 2.7194 (2.5780)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0567 (1.3036)  time: 0.6285  data: 0.0005  max mem: 54633
Epoch: [262]  [1000/1251]  eta: 0:02:38  lr: 0.000173  min_lr: 0.000173  loss: 2.6688 (2.5728)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1692 (1.2970)  time: 0.6284  data: 0.0005  max mem: 54633
Epoch: [262]  [1200/1251]  eta: 0:00:32  lr: 0.000171  min_lr: 0.000171  loss: 2.6661 (2.5684)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0033 (1.2752)  time: 0.6281  data: 0.0005  max mem: 54633
Epoch: [262]  [1250/1251]  eta: 0:00:00  lr: 0.000171  min_lr: 0.000171  loss: 2.8406 (2.5731)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1132 (1.2831)  time: 0.5331  data: 0.0006  max mem: 54633
Epoch: [262] Total time: 0:13:09 (0.6308 s / it)
Averaged stats: lr: 0.000171  min_lr: 0.000171  loss: 2.8406 (2.5770)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1132 (1.2831)
Test:  [ 0/25]  eta: 0:02:40  loss: 0.6549 (0.6549)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 6.4227  data: 6.0727  max mem: 54633
Test:  [10/25]  eta: 0:00:12  loss: 0.7614 (0.7765)  acc1: 86.8000 (87.5273)  acc5: 98.0000 (98.0727)  time: 0.8566  data: 0.5524  max mem: 54633
Test:  [20/25]  eta: 0:00:02  loss: 0.9631 (0.8949)  acc1: 82.4000 (84.4571)  acc5: 96.0000 (96.8952)  time: 0.2998  data: 0.0002  max mem: 54633
Test:  [24/25]  eta: 0:00:00  loss: 0.9631 (0.9064)  acc1: 82.4000 (84.1120)  acc5: 96.0000 (96.8480)  time: 0.2997  data: 0.0001  max mem: 54633
Test: Total time: 0:00:13 (0.5480 s / it)
* Acc@1 84.636 Acc@5 96.988 loss 0.901
Accuracy of the model on the 50000 test images: 84.6%
Max accuracy: 84.81%
Epoch: [263]  [   0/1251]  eta: 1:30:08  lr: 0.000171  min_lr: 0.000171  loss: 3.0072 (3.0072)  weight_decay: 0.0500 (0.0500)  time: 4.3234  data: 1.9001  max mem: 54633
Epoch: [263]  [ 200/1251]  eta: 0:11:19  lr: 0.000169  min_lr: 0.000169  loss: 2.7286 (2.5849)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2821 (1.1644)  time: 0.6284  data: 0.0005  max mem: 54633
Epoch: [263]  [ 400/1251]  eta: 0:09:03  lr: 0.000168  min_lr: 0.000168  loss: 2.6808 (2.5917)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0703 (1.2088)  time: 0.6293  data: 0.0005  max mem: 54633
Epoch: [263]  [ 600/1251]  eta: 0:06:53  lr: 0.000167  min_lr: 0.000167  loss: 2.2794 (2.5566)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1653 (1.2182)  time: 0.6284  data: 0.0005  max mem: 54633
Epoch: [263]  [ 800/1251]  eta: 0:04:45  lr: 0.000165  min_lr: 0.000165  loss: 2.6283 (2.5413)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2868 (1.2133)  time: 0.6281  data: 0.0005  max mem: 54633
Epoch: [263]  [1000/1251]  eta: 0:02:38  lr: 0.000164  min_lr: 0.000164  loss: 2.7400 (2.5457)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4126 (1.2374)  time: 0.6287  data: 0.0004  max mem: 54633
Epoch: [263]  [1200/1251]  eta: 0:00:32  lr: 0.000162  min_lr: 0.000162  loss: 2.6506 (2.5555)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2067 (1.2406)  time: 0.6283  data: 0.0004  max mem: 54633
Epoch: [263]  [1250/1251]  eta: 0:00:00  lr: 0.000162  min_lr: 0.000162  loss: 2.6703 (2.5546)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1253 (1.2360)  time: 0.5331  data: 0.0006  max mem: 54633
Epoch: [263] Total time: 0:13:09 (0.6311 s / it)
Averaged stats: lr: 0.000162  min_lr: 0.000162  loss: 2.6703 (2.5723)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1253 (1.2360)
Test:  [ 0/25]  eta: 0:02:09  loss: 0.5436 (0.5436)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 5.1647  data: 4.8009  max mem: 54633
Test:  [10/25]  eta: 0:00:12  loss: 0.6379 (0.6505)  acc1: 87.2000 (87.5273)  acc5: 98.0000 (98.1091)  time: 0.8275  data: 0.5214  max mem: 54633
Test:  [20/25]  eta: 0:00:02  loss: 0.7996 (0.7682)  acc1: 82.8000 (84.5905)  acc5: 96.4000 (96.9333)  time: 0.3473  data: 0.0467  max mem: 54633
Test:  [24/25]  eta: 0:00:00  loss: 0.8479 (0.7825)  acc1: 82.8000 (84.3680)  acc5: 96.4000 (96.8640)  time: 0.3008  data: 0.0001  max mem: 54633
Test: Total time: 0:00:13 (0.5373 s / it)
* Acc@1 84.742 Acc@5 97.112 loss 0.777
Accuracy of the model on the 50000 test images: 84.7%
Max accuracy: 84.81%
Epoch: [264]  [   0/1251]  eta: 1:26:04  lr: 0.000162  min_lr: 0.000162  loss: 2.5464 (2.5464)  weight_decay: 0.0500 (0.0500)  time: 4.1286  data: 1.7887  max mem: 54633
Epoch: [264]  [ 200/1251]  eta: 0:11:19  lr: 0.000160  min_lr: 0.000160  loss: 2.5254 (2.5554)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0289 (1.2779)  time: 0.6281  data: 0.0004  max mem: 54633
Epoch: [264]  [ 400/1251]  eta: 0:09:02  lr: 0.000159  min_lr: 0.000159  loss: 2.7460 (2.5463)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0965 (1.2512)  time: 0.6278  data: 0.0005  max mem: 54633
Epoch: [264]  [ 600/1251]  eta: 0:06:53  lr: 0.000158  min_lr: 0.000158  loss: 2.4152 (2.5405)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2804 (1.2567)  time: 0.6281  data: 0.0005  max mem: 54633
Epoch: [264]  [ 800/1251]  eta: 0:04:45  lr: 0.000156  min_lr: 0.000156  loss: 2.3955 (2.5343)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7919 (1.3097)  time: 0.6288  data: 0.0004  max mem: 54633
Epoch: [264]  [1000/1251]  eta: 0:02:38  lr: 0.000155  min_lr: 0.000155  loss: 2.7212 (2.5478)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0797 (1.2854)  time: 0.6278  data: 0.0005  max mem: 54633
Epoch: [264]  [1200/1251]  eta: 0:00:32  lr: 0.000154  min_lr: 0.000154  loss: 2.5968 (2.5523)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0712 (1.2676)  time: 0.6285  data: 0.0005  max mem: 54633
Epoch: [264]  [1250/1251]  eta: 0:00:00  lr: 0.000153  min_lr: 0.000153  loss: 2.7291 (2.5530)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1333 (1.2613)  time: 0.5335  data: 0.0007  max mem: 54633
Epoch: [264] Total time: 0:13:09 (0.6309 s / it)
Averaged stats: lr: 0.000153  min_lr: 0.000153  loss: 2.7291 (2.5727)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1333 (1.2613)
Test:  [ 0/25]  eta: 0:02:42  loss: 0.5808 (0.5808)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 6.4980  data: 6.1762  max mem: 54633
Test:  [10/25]  eta: 0:00:12  loss: 0.7032 (0.7114)  acc1: 87.6000 (87.7455)  acc5: 98.0000 (98.1818)  time: 0.8636  data: 0.5618  max mem: 54633
Test:  [20/25]  eta: 0:00:02  loss: 0.8873 (0.8273)  acc1: 82.0000 (84.6667)  acc5: 96.4000 (96.9333)  time: 0.3004  data: 0.0002  max mem: 54633
Test:  [24/25]  eta: 0:00:00  loss: 0.8994 (0.8402)  acc1: 82.0000 (84.3040)  acc5: 96.4000 (96.9440)  time: 0.3005  data: 0.0001  max mem: 54633
Test: Total time: 0:00:13 (0.5518 s / it)
* Acc@1 84.752 Acc@5 97.048 loss 0.835
Accuracy of the model on the 50000 test images: 84.8%
Max accuracy: 84.81%
Epoch: [265]  [   0/1251]  eta: 1:24:51  lr: 0.000153  min_lr: 0.000153  loss: 2.6066 (2.6066)  weight_decay: 0.0500 (0.0500)  time: 4.0701  data: 2.9981  max mem: 54633
Epoch: [265]  [ 200/1251]  eta: 0:11:18  lr: 0.000152  min_lr: 0.000152  loss: 2.5827 (2.5820)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1343 (1.1724)  time: 0.6368  data: 0.0004  max mem: 54633
Epoch: [265]  [ 400/1251]  eta: 0:09:02  lr: 0.000150  min_lr: 0.000150  loss: 2.7147 (2.5942)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1865 (1.2173)  time: 0.6278  data: 0.0004  max mem: 54633
Epoch: [265]  [ 600/1251]  eta: 0:06:52  lr: 0.000149  min_lr: 0.000149  loss: 2.6204 (2.5809)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0264 (1.2768)  time: 0.6285  data: 0.0004  max mem: 54633
Epoch: [265]  [ 800/1251]  eta: 0:04:45  lr: 0.000148  min_lr: 0.000148  loss: 2.6553 (2.5807)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1807 (1.3039)  time: 0.6282  data: 0.0004  max mem: 54633
Epoch: [265]  [1000/1251]  eta: 0:02:38  lr: 0.000146  min_lr: 0.000146  loss: 2.6234 (2.5761)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0320 (1.2850)  time: 0.6281  data: 0.0004  max mem: 54633
Epoch: [265]  [1200/1251]  eta: 0:00:32  lr: 0.000145  min_lr: 0.000145  loss: 2.6233 (2.5757)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2859 (1.2707)  time: 0.6285  data: 0.0004  max mem: 54633
Epoch: [265]  [1250/1251]  eta: 0:00:00  lr: 0.000145  min_lr: 0.000145  loss: 2.6099 (2.5713)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1515 (1.2761)  time: 0.5331  data: 0.0005  max mem: 54633
Epoch: [265] Total time: 0:13:08 (0.6303 s / it)
Averaged stats: lr: 0.000145  min_lr: 0.000145  loss: 2.6099 (2.5686)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1515 (1.2761)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.5472 (0.5472)  acc1: 91.6000 (91.6000)  acc5: 99.2000 (99.2000)  time: 5.7449  data: 5.4285  max mem: 54633
Test:  [10/25]  eta: 0:00:11  loss: 0.6621 (0.6775)  acc1: 88.0000 (87.8909)  acc5: 98.0000 (98.0727)  time: 0.7952  data: 0.4939  max mem: 54633
Test:  [20/25]  eta: 0:00:02  loss: 0.8452 (0.7932)  acc1: 82.8000 (84.7238)  acc5: 96.4000 (96.8952)  time: 0.3004  data: 0.0002  max mem: 54633
Test:  [24/25]  eta: 0:00:00  loss: 0.8819 (0.8069)  acc1: 82.4000 (84.3840)  acc5: 96.4000 (96.8480)  time: 0.3004  data: 0.0002  max mem: 54633
Test: Total time: 0:00:13 (0.5220 s / it)
* Acc@1 84.746 Acc@5 97.064 loss 0.804
Accuracy of the model on the 50000 test images: 84.7%
Max accuracy: 84.81%
Epoch: [266]  [   0/1251]  eta: 1:20:29  lr: 0.000145  min_lr: 0.000145  loss: 2.0841 (2.0841)  weight_decay: 0.0500 (0.0500)  time: 3.8606  data: 2.8533  max mem: 54633
Epoch: [266]  [ 200/1251]  eta: 0:11:19  lr: 0.000143  min_lr: 0.000143  loss: 2.7022 (2.6298)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1125 (1.2275)  time: 0.6280  data: 0.0005  max mem: 54633
Epoch: [266]  [ 400/1251]  eta: 0:09:02  lr: 0.000142  min_lr: 0.000142  loss: 2.6429 (2.6004)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0921 (1.2245)  time: 0.6282  data: 0.0005  max mem: 54633
Epoch: [266]  [ 600/1251]  eta: 0:06:53  lr: 0.000141  min_lr: 0.000141  loss: 2.6860 (2.5881)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1621 (1.2513)  time: 0.6284  data: 0.0005  max mem: 54633
Epoch: [266]  [ 800/1251]  eta: 0:04:45  lr: 0.000139  min_lr: 0.000139  loss: 2.6178 (2.5792)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4017 (1.3059)  time: 0.6278  data: 0.0005  max mem: 54633
Epoch: [266]  [1000/1251]  eta: 0:02:38  lr: 0.000138  min_lr: 0.000138  loss: 2.8333 (2.5812)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1152 (1.2853)  time: 0.6279  data: 0.0005  max mem: 54633
Epoch: [266]  [1200/1251]  eta: 0:00:32  lr: 0.000137  min_lr: 0.000137  loss: 2.7398 (2.5802)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3249 (1.3048)  time: 0.6275  data: 0.0004  max mem: 54633
Epoch: [266]  [1250/1251]  eta: 0:00:00  lr: 0.000137  min_lr: 0.000137  loss: 2.4255 (2.5752)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2812 (1.3030)  time: 0.5367  data: 0.0007  max mem: 54633
Epoch: [266] Total time: 0:13:08 (0.6307 s / it)
Averaged stats: lr: 0.000137  min_lr: 0.000137  loss: 2.4255 (2.5632)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2812 (1.3030)
Test:  [ 0/25]  eta: 0:02:44  loss: 0.5344 (0.5344)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 6.5969  data: 6.2683  max mem: 54633
Test:  [10/25]  eta: 0:00:13  loss: 0.6402 (0.6474)  acc1: 87.6000 (87.6364)  acc5: 98.0000 (98.0000)  time: 0.8734  data: 0.5701  max mem: 54633
Test:  [20/25]  eta: 0:00:03  loss: 0.8080 (0.7627)  acc1: 82.8000 (84.8191)  acc5: 96.8000 (96.8191)  time: 0.3010  data: 0.0002  max mem: 54633
Test:  [24/25]  eta: 0:00:00  loss: 0.8468 (0.7776)  acc1: 82.8000 (84.3520)  acc5: 96.8000 (96.7680)  time: 0.3011  data: 0.0001  max mem: 54633
Test: Total time: 0:00:13 (0.5563 s / it)
* Acc@1 84.726 Acc@5 97.082 loss 0.771
Accuracy of the model on the 50000 test images: 84.7%
Max accuracy: 84.81%
Epoch: [267]  [   0/1251]  eta: 1:27:19  lr: 0.000136  min_lr: 0.000136  loss: 1.6438 (1.6438)  weight_decay: 0.0500 (0.0500)  time: 4.1882  data: 2.5451  max mem: 54633
Epoch: [267]  [ 200/1251]  eta: 0:11:18  lr: 0.000135  min_lr: 0.000135  loss: 2.5012 (2.5922)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9992 (1.1364)  time: 0.6282  data: 0.0004  max mem: 54633
Epoch: [267]  [ 400/1251]  eta: 0:09:02  lr: 0.000134  min_lr: 0.000134  loss: 2.5970 (2.5622)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1946 (1.1976)  time: 0.6276  data: 0.0004  max mem: 54633
Epoch: [267]  [ 600/1251]  eta: 0:06:53  lr: 0.000133  min_lr: 0.000133  loss: 2.4202 (2.5679)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0652 (1.2100)  time: 0.6278  data: 0.0004  max mem: 54633
Epoch: [267]  [ 800/1251]  eta: 0:04:45  lr: 0.000131  min_lr: 0.000131  loss: 2.6150 (2.5703)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2785 (1.2218)  time: 0.6277  data: 0.0004  max mem: 54633
Epoch: [267]  [1000/1251]  eta: 0:02:38  lr: 0.000130  min_lr: 0.000130  loss: 2.5583 (2.5673)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0612 (1.2245)  time: 0.6333  data: 0.0005  max mem: 54633
Epoch: [267]  [1200/1251]  eta: 0:00:32  lr: 0.000129  min_lr: 0.000129  loss: 2.7138 (2.5756)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1510 (1.2187)  time: 0.6278  data: 0.0004  max mem: 54633
Epoch: [267]  [1250/1251]  eta: 0:00:00  lr: 0.000129  min_lr: 0.000129  loss: 2.6986 (2.5727)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1529 (1.2206)  time: 0.5329  data: 0.0005  max mem: 54633
Epoch: [267] Total time: 0:13:08 (0.6304 s / it)
Averaged stats: lr: 0.000129  min_lr: 0.000129  loss: 2.6986 (2.5600)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1529 (1.2206)
Test:  [ 0/25]  eta: 0:01:52  loss: 0.5479 (0.5479)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 4.5056  data: 4.1348  max mem: 54633
Test:  [10/25]  eta: 0:00:12  loss: 0.6683 (0.6757)  acc1: 86.8000 (87.5273)  acc5: 98.0000 (98.0000)  time: 0.8017  data: 0.4965  max mem: 54633
Test:  [20/25]  eta: 0:00:02  loss: 0.8464 (0.7922)  acc1: 82.8000 (84.6095)  acc5: 96.8000 (96.8571)  time: 0.3650  data: 0.0664  max mem: 54633
Test:  [24/25]  eta: 0:00:00  loss: 0.8660 (0.8047)  acc1: 82.4000 (84.2240)  acc5: 96.8000 (96.8000)  time: 0.2987  data: 0.0002  max mem: 54633
Test: Total time: 0:00:13 (0.5244 s / it)
* Acc@1 84.804 Acc@5 97.056 loss 0.796
Accuracy of the model on the 50000 test images: 84.8%
Max accuracy: 84.81%
Epoch: [268]  [   0/1251]  eta: 1:25:15  lr: 0.000128  min_lr: 0.000128  loss: 1.4741 (1.4741)  weight_decay: 0.0500 (0.0500)  time: 4.0889  data: 2.9941  max mem: 54633
Epoch: [268]  [ 200/1251]  eta: 0:11:18  lr: 0.000127  min_lr: 0.000127  loss: 2.6721 (2.6002)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1144 (1.3039)  time: 0.6293  data: 0.0005  max mem: 54633
Epoch: [268]  [ 400/1251]  eta: 0:09:02  lr: 0.000126  min_lr: 0.000126  loss: 2.6955 (2.5727)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3046 (1.2509)  time: 0.6277  data: 0.0005  max mem: 54633
Epoch: [268]  [ 600/1251]  eta: 0:06:53  lr: 0.000125  min_lr: 0.000125  loss: 2.6137 (2.5683)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0221 (1.2185)  time: 0.6278  data: 0.0005  max mem: 54633
Epoch: [268]  [ 800/1251]  eta: 0:04:45  lr: 0.000123  min_lr: 0.000123  loss: 2.7538 (2.5564)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0446 (1.2229)  time: 0.6418  data: 0.0005  max mem: 54633
Epoch: [268]  [1000/1251]  eta: 0:02:38  lr: 0.000122  min_lr: 0.000122  loss: 2.5936 (2.5503)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9553 (1.2186)  time: 0.6278  data: 0.0005  max mem: 54633
Epoch: [268]  [1200/1251]  eta: 0:00:32  lr: 0.000121  min_lr: 0.000121  loss: 2.7066 (2.5504)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1315 (1.2290)  time: 0.6280  data: 0.0005  max mem: 54633
Epoch: [268]  [1250/1251]  eta: 0:00:00  lr: 0.000121  min_lr: 0.000121  loss: 2.7227 (2.5492)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0543 (1.2267)  time: 0.5330  data: 0.0007  max mem: 54633
Epoch: [268] Total time: 0:13:08 (0.6304 s / it)
Averaged stats: lr: 0.000121  min_lr: 0.000121  loss: 2.7227 (2.5573)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0543 (1.2267)
Test:  [ 0/25]  eta: 0:02:43  loss: 0.5849 (0.5849)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 6.5500  data: 6.2360  max mem: 54633
Test:  [10/25]  eta: 0:00:13  loss: 0.6829 (0.6982)  acc1: 88.0000 (87.3455)  acc5: 98.4000 (98.1455)  time: 0.8694  data: 0.5672  max mem: 54633
Test:  [20/25]  eta: 0:00:02  loss: 0.8609 (0.8147)  acc1: 83.2000 (84.4000)  acc5: 96.8000 (96.9714)  time: 0.3010  data: 0.0002  max mem: 54633
Test:  [24/25]  eta: 0:00:00  loss: 0.8886 (0.8280)  acc1: 82.0000 (84.0960)  acc5: 96.0000 (96.8640)  time: 0.3006  data: 0.0001  max mem: 54633
Test: Total time: 0:00:13 (0.5554 s / it)
* Acc@1 84.754 Acc@5 97.070 loss 0.820
Accuracy of the model on the 50000 test images: 84.8%
Max accuracy: 84.81%
Epoch: [269]  [   0/1251]  eta: 1:28:53  lr: 0.000121  min_lr: 0.000121  loss: 2.5948 (2.5948)  weight_decay: 0.0500 (0.0500)  time: 4.2635  data: 3.3676  max mem: 54633
Epoch: [269]  [ 200/1251]  eta: 0:11:21  lr: 0.000120  min_lr: 0.000120  loss: 2.6285 (2.5715)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9945 (1.1944)  time: 0.6284  data: 0.0004  max mem: 54633
Epoch: [269]  [ 400/1251]  eta: 0:09:03  lr: 0.000118  min_lr: 0.000118  loss: 2.6864 (2.5668)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3326 (1.2583)  time: 0.6285  data: 0.0005  max mem: 54633
Epoch: [269]  [ 600/1251]  eta: 0:06:53  lr: 0.000117  min_lr: 0.000117  loss: 2.6875 (2.5784)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9712 (1.2658)  time: 0.6285  data: 0.0004  max mem: 54633
Epoch: [269]  [ 800/1251]  eta: 0:04:45  lr: 0.000116  min_lr: 0.000116  loss: 2.6453 (2.5722)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2745 (1.2713)  time: 0.6280  data: 0.0004  max mem: 54633
Epoch: [269]  [1000/1251]  eta: 0:02:38  lr: 0.000115  min_lr: 0.000115  loss: 2.4217 (2.5680)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0591 (1.2868)  time: 0.6277  data: 0.0004  max mem: 54633
Epoch: [269]  [1200/1251]  eta: 0:00:32  lr: 0.000113  min_lr: 0.000113  loss: 2.6614 (2.5702)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.6468  data: 0.0004  max mem: 54633
Epoch: [269]  [1250/1251]  eta: 0:00:00  lr: 0.000113  min_lr: 0.000113  loss: 2.5594 (2.5690)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4151 (nan)  time: 0.5328  data: 0.0005  max mem: 54633
Epoch: [269] Total time: 0:13:09 (0.6310 s / it)
Averaged stats: lr: 0.000113  min_lr: 0.000113  loss: 2.5594 (2.5559)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4151 (nan)
Test:  [ 0/25]  eta: 0:02:44  loss: 0.5540 (0.5540)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 6.5715  data: 6.2354  max mem: 54633
Test:  [10/25]  eta: 0:00:13  loss: 0.6683 (0.6734)  acc1: 87.6000 (87.4182)  acc5: 98.0000 (98.0000)  time: 0.8703  data: 0.5672  max mem: 54633
Test:  [20/25]  eta: 0:00:02  loss: 0.8283 (0.7859)  acc1: 83.2000 (84.4762)  acc5: 96.8000 (96.9333)  time: 0.3004  data: 0.0002  max mem: 54633
Test:  [24/25]  eta: 0:00:00  loss: 0.8506 (0.7989)  acc1: 82.8000 (84.2720)  acc5: 96.4000 (96.8480)  time: 0.3005  data: 0.0001  max mem: 54633
Test: Total time: 0:00:13 (0.5544 s / it)
* Acc@1 84.738 Acc@5 96.992 loss 0.790
Accuracy of the model on the 50000 test images: 84.7%
Max accuracy: 84.81%
Epoch: [270]  [   0/1251]  eta: 1:27:00  lr: 0.000113  min_lr: 0.000113  loss: 1.9004 (1.9004)  weight_decay: 0.0500 (0.0500)  time: 4.1733  data: 1.9784  max mem: 54633
Epoch: [270]  [ 200/1251]  eta: 0:11:18  lr: 0.000112  min_lr: 0.000112  loss: 2.6956 (2.5583)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1454 (1.2306)  time: 0.6281  data: 0.0004  max mem: 54633
Epoch: [270]  [ 400/1251]  eta: 0:09:02  lr: 0.000111  min_lr: 0.000111  loss: 2.5521 (2.5599)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0881 (1.2237)  time: 0.6284  data: 0.0004  max mem: 54633
Epoch: [270]  [ 600/1251]  eta: 0:06:53  lr: 0.000110  min_lr: 0.000110  loss: 2.5705 (2.5603)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1975 (1.2131)  time: 0.6287  data: 0.0004  max mem: 54633
Epoch: [270]  [ 800/1251]  eta: 0:04:45  lr: 0.000109  min_lr: 0.000109  loss: 2.5602 (2.5560)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1840 (1.2368)  time: 0.6275  data: 0.0004  max mem: 54633
Epoch: [270]  [1000/1251]  eta: 0:02:38  lr: 0.000107  min_lr: 0.000107  loss: 2.6846 (2.5496)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9205 (1.2041)  time: 0.6278  data: 0.0004  max mem: 54633
Epoch: [270]  [1200/1251]  eta: 0:00:32  lr: 0.000106  min_lr: 0.000106  loss: 2.6617 (2.5444)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1577 (1.1957)  time: 0.6279  data: 0.0004  max mem: 54633
Epoch: [270]  [1250/1251]  eta: 0:00:00  lr: 0.000106  min_lr: 0.000106  loss: 2.6386 (2.5465)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0804 (1.1903)  time: 0.5327  data: 0.0005  max mem: 54633
Epoch: [270] Total time: 0:13:09 (0.6310 s / it)
Averaged stats: lr: 0.000106  min_lr: 0.000106  loss: 2.6386 (2.5596)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0804 (1.1903)
Test:  [ 0/25]  eta: 0:02:27  loss: 0.6046 (0.6046)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 5.8903  data: 5.5237  max mem: 54633
Test:  [10/25]  eta: 0:00:12  loss: 0.7351 (0.7321)  acc1: 87.2000 (87.3091)  acc5: 98.0000 (98.0727)  time: 0.8090  data: 0.5025  max mem: 54633
Test:  [20/25]  eta: 0:00:02  loss: 0.9008 (0.8509)  acc1: 83.2000 (84.5524)  acc5: 96.8000 (96.9905)  time: 0.3008  data: 0.0002  max mem: 54633
Test:  [24/25]  eta: 0:00:00  loss: 0.9078 (0.8629)  acc1: 82.4000 (84.2560)  acc5: 96.4000 (96.8800)  time: 0.3008  data: 0.0002  max mem: 54633
Test: Total time: 0:00:13 (0.5285 s / it)
* Acc@1 84.848 Acc@5 97.048 loss 0.854
Accuracy of the model on the 50000 test images: 84.8%
Max accuracy: 84.85%
Epoch: [271]  [   0/1251]  eta: 1:20:04  lr: 0.000106  min_lr: 0.000106  loss: 2.8953 (2.8953)  weight_decay: 0.0500 (0.0500)  time: 3.8407  data: 3.1917  max mem: 54633
Epoch: [271]  [ 200/1251]  eta: 0:11:17  lr: 0.000105  min_lr: 0.000105  loss: 2.7097 (2.5453)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2964 (1.2818)  time: 0.6279  data: 0.0004  max mem: 54633
Epoch: [271]  [ 400/1251]  eta: 0:09:02  lr: 0.000104  min_lr: 0.000104  loss: 2.7671 (2.5665)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1384 (1.2556)  time: 0.6282  data: 0.0004  max mem: 54633
Epoch: [271]  [ 600/1251]  eta: 0:06:52  lr: 0.000102  min_lr: 0.000102  loss: 2.5462 (2.5682)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3173 (nan)  time: 0.6284  data: 0.0004  max mem: 54633
Epoch: [271]  [ 800/1251]  eta: 0:04:45  lr: 0.000101  min_lr: 0.000101  loss: 2.7520 (2.5736)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4690 (nan)  time: 0.6287  data: 0.0005  max mem: 54633
Epoch: [271]  [1000/1251]  eta: 0:02:38  lr: 0.000100  min_lr: 0.000100  loss: 2.6834 (2.5660)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9914 (nan)  time: 0.6283  data: 0.0004  max mem: 54633
Epoch: [271]  [1200/1251]  eta: 0:00:32  lr: 0.000099  min_lr: 0.000099  loss: 2.3963 (2.5657)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1105 (nan)  time: 0.6281  data: 0.0004  max mem: 54633
Epoch: [271]  [1250/1251]  eta: 0:00:00  lr: 0.000099  min_lr: 0.000099  loss: 2.4543 (2.5647)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2784 (nan)  time: 0.5333  data: 0.0006  max mem: 54633
Epoch: [271] Total time: 0:13:08 (0.6307 s / it)
Averaged stats: lr: 0.000099  min_lr: 0.000099  loss: 2.4543 (2.5475)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2784 (nan)
Test:  [ 0/25]  eta: 0:02:44  loss: 0.5408 (0.5408)  acc1: 91.6000 (91.6000)  acc5: 99.2000 (99.2000)  time: 6.5809  data: 6.2395  max mem: 54633
Test:  [10/25]  eta: 0:00:13  loss: 0.6678 (0.6709)  acc1: 86.8000 (87.5273)  acc5: 98.0000 (98.0727)  time: 0.8711  data: 0.5675  max mem: 54633
Test:  [20/25]  eta: 0:00:02  loss: 0.8117 (0.7860)  acc1: 83.6000 (84.7810)  acc5: 96.8000 (96.9905)  time: 0.3004  data: 0.0002  max mem: 54633
Test:  [24/25]  eta: 0:00:00  loss: 0.8543 (0.7978)  acc1: 83.6000 (84.3840)  acc5: 96.4000 (96.8960)  time: 0.3006  data: 0.0001  max mem: 54633
Test: Total time: 0:00:13 (0.5556 s / it)
* Acc@1 84.810 Acc@5 97.054 loss 0.789
Accuracy of the model on the 50000 test images: 84.8%
Max accuracy: 84.85%
Epoch: [272]  [   0/1251]  eta: 1:22:03  lr: 0.000099  min_lr: 0.000099  loss: 3.0073 (3.0073)  weight_decay: 0.0500 (0.0500)  time: 3.9355  data: 3.1909  max mem: 54633
Epoch: [272]  [ 200/1251]  eta: 0:11:18  lr: 0.000098  min_lr: 0.000098  loss: 2.6765 (2.5346)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3433 (1.3554)  time: 0.6288  data: 0.0004  max mem: 54633
Epoch: [272]  [ 400/1251]  eta: 0:09:02  lr: 0.000097  min_lr: 0.000097  loss: 2.5743 (2.5506)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2515 (1.4262)  time: 0.6285  data: 0.0005  max mem: 54633
Epoch: [272]  [ 600/1251]  eta: 0:06:52  lr: 0.000096  min_lr: 0.000096  loss: 2.5184 (2.5564)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0731 (1.3264)  time: 0.6284  data: 0.0005  max mem: 54633
Epoch: [272]  [ 800/1251]  eta: 0:04:45  lr: 0.000094  min_lr: 0.000094  loss: 2.5435 (2.5482)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1494 (1.3069)  time: 0.6298  data: 0.0004  max mem: 54633
Epoch: [272]  [1000/1251]  eta: 0:02:38  lr: 0.000093  min_lr: 0.000093  loss: 2.7091 (2.5514)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0676 (1.2932)  time: 0.6290  data: 0.0005  max mem: 54633
Epoch: [272]  [1200/1251]  eta: 0:00:32  lr: 0.000092  min_lr: 0.000092  loss: 2.5494 (2.5500)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1711 (1.2733)  time: 0.6285  data: 0.0005  max mem: 54633
Epoch: [272]  [1250/1251]  eta: 0:00:00  lr: 0.000092  min_lr: 0.000092  loss: 2.6732 (2.5530)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0381 (1.2675)  time: 0.5390  data: 0.0007  max mem: 54633
Epoch: [272] Total time: 0:13:09 (0.6310 s / it)
Averaged stats: lr: 0.000092  min_lr: 0.000092  loss: 2.6732 (2.5457)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0381 (1.2675)
Test:  [ 0/25]  eta: 0:02:45  loss: 0.6019 (0.6019)  acc1: 92.0000 (92.0000)  acc5: 99.2000 (99.2000)  time: 6.6273  data: 6.2897  max mem: 54633
Test:  [10/25]  eta: 0:00:13  loss: 0.7390 (0.7366)  acc1: 86.8000 (87.4546)  acc5: 97.6000 (97.8909)  time: 0.8750  data: 0.5721  max mem: 54633
Test:  [20/25]  eta: 0:00:03  loss: 0.9055 (0.8524)  acc1: 82.4000 (84.6857)  acc5: 96.8000 (96.9905)  time: 0.3001  data: 0.0002  max mem: 54633
Test:  [24/25]  eta: 0:00:00  loss: 0.9102 (0.8644)  acc1: 82.4000 (84.4000)  acc5: 96.8000 (96.9440)  time: 0.3002  data: 0.0001  max mem: 54633
Test: Total time: 0:00:13 (0.5567 s / it)
* Acc@1 84.774 Acc@5 96.998 loss 0.856
Accuracy of the model on the 50000 test images: 84.8%
Max accuracy: 84.85%
Epoch: [273]  [   0/1251]  eta: 1:22:25  lr: 0.000092  min_lr: 0.000092  loss: 2.5582 (2.5582)  weight_decay: 0.0500 (0.0500)  time: 3.9530  data: 3.0724  max mem: 54633
Epoch: [273]  [ 200/1251]  eta: 0:11:18  lr: 0.000091  min_lr: 0.000091  loss: 2.6614 (2.5618)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0702 (1.1860)  time: 0.6278  data: 0.0005  max mem: 54633
Epoch: [273]  [ 400/1251]  eta: 0:09:01  lr: 0.000090  min_lr: 0.000090  loss: 2.4899 (2.5444)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1255 (1.2043)  time: 0.6274  data: 0.0005  max mem: 54633
Epoch: [273]  [ 600/1251]  eta: 0:06:52  lr: 0.000089  min_lr: 0.000089  loss: 2.4711 (2.5422)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2814 (1.2589)  time: 0.6277  data: 0.0005  max mem: 54633
Epoch: [273]  [ 800/1251]  eta: 0:04:45  lr: 0.000088  min_lr: 0.000088  loss: 2.6395 (2.5483)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0483 (1.2523)  time: 0.6275  data: 0.0004  max mem: 54633
Epoch: [273]  [1000/1251]  eta: 0:02:38  lr: 0.000087  min_lr: 0.000087  loss: 2.6610 (2.5389)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1329 (1.2375)  time: 0.6274  data: 0.0005  max mem: 54633
Epoch: [273]  [1200/1251]  eta: 0:00:32  lr: 0.000086  min_lr: 0.000086  loss: 2.5873 (2.5387)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0744 (1.2189)  time: 0.6276  data: 0.0005  max mem: 54633
Epoch: [273]  [1250/1251]  eta: 0:00:00  lr: 0.000085  min_lr: 0.000085  loss: 2.4292 (2.5356)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0884 (1.2147)  time: 0.5327  data: 0.0007  max mem: 54633
Epoch: [273] Total time: 0:13:08 (0.6303 s / it)
Averaged stats: lr: 0.000085  min_lr: 0.000085  loss: 2.4292 (2.5388)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0884 (1.2147)
Test:  [ 0/25]  eta: 0:02:39  loss: 0.5350 (0.5350)  acc1: 91.6000 (91.6000)  acc5: 99.2000 (99.2000)  time: 6.3617  data: 6.0170  max mem: 54633
Test:  [10/25]  eta: 0:00:12  loss: 0.6642 (0.6656)  acc1: 86.8000 (87.5636)  acc5: 98.0000 (97.9273)  time: 0.8512  data: 0.5473  max mem: 54633
Test:  [20/25]  eta: 0:00:02  loss: 0.8369 (0.7834)  acc1: 82.8000 (84.5905)  acc5: 96.8000 (97.0667)  time: 0.3004  data: 0.0002  max mem: 54633
Test:  [24/25]  eta: 0:00:00  loss: 0.8623 (0.7965)  acc1: 82.4000 (84.3520)  acc5: 96.8000 (96.9920)  time: 0.3006  data: 0.0001  max mem: 54633
Test: Total time: 0:00:13 (0.5472 s / it)
* Acc@1 84.916 Acc@5 97.106 loss 0.788
Accuracy of the model on the 50000 test images: 84.9%
Max accuracy: 84.92%
Epoch: [274]  [   0/1251]  eta: 1:05:49  lr: 0.000085  min_lr: 0.000085  loss: 2.1026 (2.1026)  weight_decay: 0.0500 (0.0500)  time: 3.1571  data: 2.5209  max mem: 54633
Epoch: [274]  [ 200/1251]  eta: 0:11:12  lr: 0.000084  min_lr: 0.000084  loss: 2.5878 (2.5302)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0881 (1.1461)  time: 0.6275  data: 0.0004  max mem: 54633
Epoch: [274]  [ 400/1251]  eta: 0:09:00  lr: 0.000083  min_lr: 0.000083  loss: 2.4512 (2.5247)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0221 (1.0839)  time: 0.6378  data: 0.0004  max mem: 54633
Epoch: [274]  [ 600/1251]  eta: 0:06:52  lr: 0.000082  min_lr: 0.000082  loss: 2.5041 (2.5385)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3151 (1.1376)  time: 0.6278  data: 0.0004  max mem: 54633
Epoch: [274]  [ 800/1251]  eta: 0:04:44  lr: 0.000081  min_lr: 0.000081  loss: 2.6732 (2.5576)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0454 (1.1463)  time: 0.6280  data: 0.0005  max mem: 54633
Epoch: [274]  [1000/1251]  eta: 0:02:38  lr: 0.000080  min_lr: 0.000080  loss: 2.5412 (2.5474)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5177 (1.2183)  time: 0.6276  data: 0.0004  max mem: 54633
Epoch: [274]  [1200/1251]  eta: 0:00:32  lr: 0.000079  min_lr: 0.000079  loss: 2.7441 (2.5548)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0490 (1.2224)  time: 0.6277  data: 0.0004  max mem: 54633
Epoch: [274]  [1250/1251]  eta: 0:00:00  lr: 0.000079  min_lr: 0.000079  loss: 2.6160 (2.5554)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1074 (1.2187)  time: 0.5330  data: 0.0006  max mem: 54633
Epoch: [274] Total time: 0:13:07 (0.6295 s / it)
Averaged stats: lr: 0.000079  min_lr: 0.000079  loss: 2.6160 (2.5460)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1074 (1.2187)
Test:  [ 0/25]  eta: 0:02:35  loss: 0.5425 (0.5425)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 6.2267  data: 5.9083  max mem: 54633
Test:  [10/25]  eta: 0:00:12  loss: 0.6715 (0.6742)  acc1: 86.8000 (87.2727)  acc5: 97.6000 (97.8545)  time: 0.8393  data: 0.5375  max mem: 54633
Test:  [20/25]  eta: 0:00:02  loss: 0.8232 (0.7897)  acc1: 82.8000 (84.5524)  acc5: 96.8000 (96.9905)  time: 0.3005  data: 0.0003  max mem: 54633
Test:  [24/25]  eta: 0:00:00  loss: 0.8617 (0.8023)  acc1: 82.4000 (84.2720)  acc5: 96.8000 (96.8640)  time: 0.3006  data: 0.0001  max mem: 54633
Test: Total time: 0:00:13 (0.5416 s / it)
* Acc@1 84.832 Acc@5 97.076 loss 0.796
Accuracy of the model on the 50000 test images: 84.8%
Max accuracy: 84.92%
Epoch: [275]  [   0/1251]  eta: 1:31:29  lr: 0.000079  min_lr: 0.000079  loss: 2.3569 (2.3569)  weight_decay: 0.0500 (0.0500)  time: 4.3883  data: 1.8846  max mem: 54633
Epoch: [275]  [ 200/1251]  eta: 0:11:20  lr: 0.000078  min_lr: 0.000078  loss: 2.5372 (2.5054)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2702 (1.3103)  time: 0.6288  data: 0.0004  max mem: 54633
Epoch: [275]  [ 400/1251]  eta: 0:09:03  lr: 0.000077  min_lr: 0.000077  loss: 2.7589 (2.5159)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1618 (1.3071)  time: 0.6281  data: 0.0004  max mem: 54633
Epoch: [275]  [ 600/1251]  eta: 0:06:53  lr: 0.000076  min_lr: 0.000076  loss: 2.4968 (2.5258)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0182 (1.2693)  time: 0.6284  data: 0.0005  max mem: 54633
Epoch: [275]  [ 800/1251]  eta: 0:04:45  lr: 0.000075  min_lr: 0.000075  loss: 2.6665 (2.5382)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0389 (1.2469)  time: 0.6276  data: 0.0005  max mem: 54633
Epoch: [275]  [1000/1251]  eta: 0:02:38  lr: 0.000074  min_lr: 0.000074  loss: 2.5733 (2.5365)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3710 (1.2622)  time: 0.6290  data: 0.0005  max mem: 54633
Epoch: [275]  [1200/1251]  eta: 0:00:32  lr: 0.000073  min_lr: 0.000073  loss: 2.4637 (2.5357)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1761 (1.2703)  time: 0.6277  data: 0.0005  max mem: 54633
Epoch: [275]  [1250/1251]  eta: 0:00:00  lr: 0.000073  min_lr: 0.000073  loss: 2.7406 (2.5323)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1698 (1.2653)  time: 0.5328  data: 0.0005  max mem: 54633
Epoch: [275] Total time: 0:13:09 (0.6309 s / it)
Averaged stats: lr: 0.000073  min_lr: 0.000073  loss: 2.7406 (2.5345)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1698 (1.2653)
Test:  [ 0/25]  eta: 0:02:30  loss: 0.5629 (0.5629)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 6.0162  data: 5.6777  max mem: 54633
Test:  [10/25]  eta: 0:00:12  loss: 0.6822 (0.6872)  acc1: 87.2000 (87.6000)  acc5: 98.0000 (97.8909)  time: 0.8188  data: 0.5165  max mem: 54633
Test:  [20/25]  eta: 0:00:02  loss: 0.8527 (0.8036)  acc1: 83.2000 (84.7810)  acc5: 96.8000 (96.9333)  time: 0.2989  data: 0.0002  max mem: 54633
Test:  [24/25]  eta: 0:00:00  loss: 0.8779 (0.8164)  acc1: 83.2000 (84.3840)  acc5: 96.4000 (96.8480)  time: 0.2987  data: 0.0001  max mem: 54633
Test: Total time: 0:00:13 (0.5310 s / it)
* Acc@1 84.874 Acc@5 97.082 loss 0.809
Accuracy of the model on the 50000 test images: 84.9%
Max accuracy: 84.92%
Epoch: [276]  [   0/1251]  eta: 1:24:32  lr: 0.000073  min_lr: 0.000073  loss: 2.4177 (2.4177)  weight_decay: 0.0500 (0.0500)  time: 4.0546  data: 1.9693  max mem: 54633
Epoch: [276]  [ 200/1251]  eta: 0:11:19  lr: 0.000072  min_lr: 0.000072  loss: 2.5155 (2.5540)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1767 (1.2796)  time: 0.6274  data: 0.0004  max mem: 54633
Epoch: [276]  [ 400/1251]  eta: 0:09:02  lr: 0.000071  min_lr: 0.000071  loss: 2.4971 (2.5385)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4386 (1.3036)  time: 0.6274  data: 0.0004  max mem: 54633
Epoch: [276]  [ 600/1251]  eta: 0:06:53  lr: 0.000070  min_lr: 0.000070  loss: 2.5214 (2.5315)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0240 (1.2799)  time: 0.6278  data: 0.0004  max mem: 54633
Epoch: [276]  [ 800/1251]  eta: 0:04:45  lr: 0.000069  min_lr: 0.000069  loss: 2.7713 (2.5333)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0704 (1.2383)  time: 0.6276  data: 0.0005  max mem: 54633
Epoch: [276]  [1000/1251]  eta: 0:02:38  lr: 0.000068  min_lr: 0.000068  loss: 2.7070 (2.5450)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2061 (1.2525)  time: 0.6276  data: 0.0005  max mem: 54633
Epoch: [276]  [1200/1251]  eta: 0:00:32  lr: 0.000067  min_lr: 0.000067  loss: 2.4484 (2.5439)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0867 (1.2442)  time: 0.6281  data: 0.0005  max mem: 54633
Epoch: [276]  [1250/1251]  eta: 0:00:00  lr: 0.000067  min_lr: 0.000067  loss: 2.6281 (2.5447)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0898 (1.2446)  time: 0.5332  data: 0.0007  max mem: 54633
Epoch: [276] Total time: 0:13:08 (0.6303 s / it)
Averaged stats: lr: 0.000067  min_lr: 0.000067  loss: 2.6281 (2.5394)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0898 (1.2446)
Test:  [ 0/25]  eta: 0:02:30  loss: 0.5840 (0.5840)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 6.0075  data: 5.6735  max mem: 54633
Test:  [10/25]  eta: 0:00:12  loss: 0.6956 (0.7044)  acc1: 86.8000 (87.6000)  acc5: 98.0000 (97.8909)  time: 0.8195  data: 0.5160  max mem: 54633
Test:  [20/25]  eta: 0:00:02  loss: 0.8570 (0.8188)  acc1: 83.6000 (84.8381)  acc5: 96.8000 (96.9524)  time: 0.3007  data: 0.0002  max mem: 54633
Test:  [24/25]  eta: 0:00:00  loss: 0.9017 (0.8327)  acc1: 83.2000 (84.4800)  acc5: 96.8000 (96.8800)  time: 0.3007  data: 0.0001  max mem: 54633
Test: Total time: 0:00:13 (0.5331 s / it)
* Acc@1 84.896 Acc@5 97.086 loss 0.825
Accuracy of the model on the 50000 test images: 84.9%
Max accuracy: 84.92%
Epoch: [277]  [   0/1251]  eta: 1:33:07  lr: 0.000067  min_lr: 0.000067  loss: 2.6086 (2.6086)  weight_decay: 0.0500 (0.0500)  time: 4.4665  data: 1.9264  max mem: 54633
Epoch: [277]  [ 200/1251]  eta: 0:11:19  lr: 0.000066  min_lr: 0.000066  loss: 2.6892 (2.5453)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0678 (1.1889)  time: 0.6282  data: 0.0004  max mem: 54633
Epoch: [277]  [ 400/1251]  eta: 0:09:02  lr: 0.000065  min_lr: 0.000065  loss: 2.4709 (2.5485)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1929 (1.2149)  time: 0.6274  data: 0.0004  max mem: 54633
Epoch: [277]  [ 600/1251]  eta: 0:06:53  lr: 0.000064  min_lr: 0.000064  loss: 2.6375 (2.5324)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1059 (1.2460)  time: 0.6274  data: 0.0004  max mem: 54633
Epoch: [277]  [ 800/1251]  eta: 0:04:45  lr: 0.000064  min_lr: 0.000064  loss: 2.7790 (2.5509)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0869 (1.2330)  time: 0.6272  data: 0.0004  max mem: 54633
Epoch: [277]  [1000/1251]  eta: 0:02:38  lr: 0.000063  min_lr: 0.000063  loss: 2.6079 (2.5490)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1944 (1.2428)  time: 0.6275  data: 0.0004  max mem: 54633
Epoch: [277]  [1200/1251]  eta: 0:00:32  lr: 0.000062  min_lr: 0.000062  loss: 2.7183 (2.5487)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0036 (1.2242)  time: 0.6280  data: 0.0004  max mem: 54633
Epoch: [277]  [1250/1251]  eta: 0:00:00  lr: 0.000062  min_lr: 0.000062  loss: 2.6740 (2.5483)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1356 (1.2426)  time: 0.5327  data: 0.0005  max mem: 54633
Epoch: [277] Total time: 0:13:08 (0.6301 s / it)
Averaged stats: lr: 0.000062  min_lr: 0.000062  loss: 2.6740 (2.5427)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1356 (1.2426)
Test:  [ 0/25]  eta: 0:02:35  loss: 0.5702 (0.5702)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 6.2178  data: 5.8800  max mem: 54633
Test:  [10/25]  eta: 0:00:12  loss: 0.6859 (0.6949)  acc1: 87.6000 (87.5636)  acc5: 98.0000 (97.9636)  time: 0.8384  data: 0.5349  max mem: 54633
Test:  [20/25]  eta: 0:00:02  loss: 0.8461 (0.8096)  acc1: 83.6000 (84.8952)  acc5: 96.8000 (97.0095)  time: 0.3006  data: 0.0002  max mem: 54633
Test:  [24/25]  eta: 0:00:00  loss: 0.8715 (0.8222)  acc1: 83.2000 (84.5600)  acc5: 96.4000 (96.9280)  time: 0.3006  data: 0.0001  max mem: 54633
Test: Total time: 0:00:13 (0.5411 s / it)
* Acc@1 84.930 Acc@5 97.116 loss 0.814
Accuracy of the model on the 50000 test images: 84.9%
Max accuracy: 84.93%
Epoch: [278]  [   0/1251]  eta: 1:00:56  lr: 0.000062  min_lr: 0.000062  loss: 2.8500 (2.8500)  weight_decay: 0.0500 (0.0500)  time: 2.9232  data: 2.2816  max mem: 54633
Epoch: [278]  [ 200/1251]  eta: 0:11:11  lr: 0.000061  min_lr: 0.000061  loss: 2.4906 (2.5447)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1528 (1.1778)  time: 0.6282  data: 0.0005  max mem: 54633
Epoch: [278]  [ 400/1251]  eta: 0:08:59  lr: 0.000060  min_lr: 0.000060  loss: 2.7224 (2.5359)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1280 (1.1394)  time: 0.6277  data: 0.0005  max mem: 54633
Epoch: [278]  [ 600/1251]  eta: 0:06:51  lr: 0.000059  min_lr: 0.000059  loss: 2.6599 (2.5281)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2997 (1.1930)  time: 0.6286  data: 0.0005  max mem: 54633
Epoch: [278]  [ 800/1251]  eta: 0:04:44  lr: 0.000058  min_lr: 0.000058  loss: 2.7215 (2.5275)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1685 (1.1867)  time: 0.6282  data: 0.0005  max mem: 54633
Epoch: [278]  [1000/1251]  eta: 0:02:38  lr: 0.000057  min_lr: 0.000057  loss: 2.5298 (2.5368)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9854 (1.1657)  time: 0.6278  data: 0.0005  max mem: 54633
Epoch: [278]  [1200/1251]  eta: 0:00:32  lr: 0.000056  min_lr: 0.000056  loss: 2.7084 (2.5396)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0394 (1.1576)  time: 0.6279  data: 0.0005  max mem: 54633
Epoch: [278]  [1250/1251]  eta: 0:00:00  lr: 0.000056  min_lr: 0.000056  loss: 2.5907 (2.5376)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1975 (1.1620)  time: 0.5332  data: 0.0006  max mem: 54633
Epoch: [278] Total time: 0:13:07 (0.6295 s / it)
Averaged stats: lr: 0.000056  min_lr: 0.000056  loss: 2.5907 (2.5389)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1975 (1.1620)
Test:  [ 0/25]  eta: 0:02:48  loss: 0.5602 (0.5602)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 6.7382  data: 6.3935  max mem: 54633
Test:  [10/25]  eta: 0:00:13  loss: 0.6734 (0.6782)  acc1: 86.8000 (87.2727)  acc5: 98.0000 (97.8909)  time: 0.8855  data: 0.5815  max mem: 54633
Test:  [20/25]  eta: 0:00:03  loss: 0.8361 (0.7922)  acc1: 83.6000 (84.7048)  acc5: 96.8000 (96.9143)  time: 0.3004  data: 0.0002  max mem: 54633
Test:  [24/25]  eta: 0:00:00  loss: 0.8600 (0.8044)  acc1: 82.8000 (84.4160)  acc5: 96.4000 (96.8480)  time: 0.3005  data: 0.0001  max mem: 54633
Test: Total time: 0:00:14 (0.5612 s / it)
* Acc@1 84.948 Acc@5 97.076 loss 0.796
Accuracy of the model on the 50000 test images: 84.9%
Max accuracy: 84.95%
Epoch: [279]  [   0/1251]  eta: 1:16:57  lr: 0.000056  min_lr: 0.000056  loss: 2.7874 (2.7874)  weight_decay: 0.0500 (0.0500)  time: 3.6913  data: 3.0498  max mem: 54633
Epoch: [279]  [ 200/1251]  eta: 0:11:16  lr: 0.000055  min_lr: 0.000055  loss: 2.6406 (2.5603)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0568 (1.1373)  time: 0.6277  data: 0.0004  max mem: 54633
Epoch: [279]  [ 400/1251]  eta: 0:09:01  lr: 0.000055  min_lr: 0.000055  loss: 2.8145 (2.5490)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3530 (1.3396)  time: 0.6280  data: 0.0004  max mem: 54633
Epoch: [279]  [ 600/1251]  eta: 0:06:52  lr: 0.000054  min_lr: 0.000054  loss: 2.6567 (2.5475)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2052 (1.3369)  time: 0.6283  data: 0.0004  max mem: 54633
Epoch: [279]  [ 800/1251]  eta: 0:04:45  lr: 0.000053  min_lr: 0.000053  loss: 2.6909 (2.5508)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0575 (1.3064)  time: 0.6286  data: 0.0004  max mem: 54633
Epoch: [279]  [1000/1251]  eta: 0:02:38  lr: 0.000052  min_lr: 0.000052  loss: 2.6404 (2.5387)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1841 (1.3226)  time: 0.6283  data: 0.0004  max mem: 54633
Epoch: [279]  [1200/1251]  eta: 0:00:32  lr: 0.000051  min_lr: 0.000051  loss: 2.7245 (2.5418)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1218 (1.2936)  time: 0.6367  data: 0.0004  max mem: 54633
Epoch: [279]  [1250/1251]  eta: 0:00:00  lr: 0.000051  min_lr: 0.000051  loss: 2.3169 (2.5385)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1735 (1.2903)  time: 0.5333  data: 0.0006  max mem: 54633
Epoch: [279] Total time: 0:13:08 (0.6305 s / it)
Averaged stats: lr: 0.000051  min_lr: 0.000051  loss: 2.3169 (2.5302)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1735 (1.2903)
Test:  [ 0/25]  eta: 0:02:32  loss: 0.5094 (0.5094)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 6.1106  data: 5.7882  max mem: 54633
Test:  [10/25]  eta: 0:00:12  loss: 0.6276 (0.6343)  acc1: 87.6000 (87.9636)  acc5: 98.0000 (97.9273)  time: 0.8281  data: 0.5266  max mem: 54633
Test:  [20/25]  eta: 0:00:02  loss: 0.7765 (0.7488)  acc1: 83.6000 (85.0667)  acc5: 96.8000 (96.9905)  time: 0.2997  data: 0.0002  max mem: 54633
Test:  [24/25]  eta: 0:00:00  loss: 0.8202 (0.7611)  acc1: 83.6000 (84.7840)  acc5: 96.4000 (96.9120)  time: 0.2995  data: 0.0001  max mem: 54633
Test: Total time: 0:00:13 (0.5358 s / it)
* Acc@1 84.922 Acc@5 97.116 loss 0.757
Accuracy of the model on the 50000 test images: 84.9%
Max accuracy: 84.95%
Epoch: [280]  [   0/1251]  eta: 1:18:43  lr: 0.000051  min_lr: 0.000051  loss: 3.0249 (3.0249)  weight_decay: 0.0500 (0.0500)  time: 3.7758  data: 1.8339  max mem: 54633
Epoch: [280]  [ 200/1251]  eta: 0:11:17  lr: 0.000050  min_lr: 0.000050  loss: 2.6934 (2.5473)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0747 (1.3712)  time: 0.6287  data: 0.0005  max mem: 54633
Epoch: [280]  [ 400/1251]  eta: 0:09:01  lr: 0.000050  min_lr: 0.000050  loss: 2.6680 (2.5311)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1455 (1.2713)  time: 0.6286  data: 0.0004  max mem: 54633
Epoch: [280]  [ 600/1251]  eta: 0:06:53  lr: 0.000049  min_lr: 0.000049  loss: 2.7154 (2.5392)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0460 (1.2653)  time: 0.6286  data: 0.0004  max mem: 54633
Epoch: [280]  [ 800/1251]  eta: 0:04:45  lr: 0.000048  min_lr: 0.000048  loss: 2.5632 (2.5293)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0087 (1.2322)  time: 0.6290  data: 0.0005  max mem: 54633
Epoch: [280]  [1000/1251]  eta: 0:02:38  lr: 0.000047  min_lr: 0.000047  loss: 2.1807 (2.5268)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0311 (1.2082)  time: 0.6284  data: 0.0005  max mem: 54633
Epoch: [280]  [1200/1251]  eta: 0:00:32  lr: 0.000046  min_lr: 0.000046  loss: 2.4283 (2.5259)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0123 (1.2044)  time: 0.6288  data: 0.0005  max mem: 54633
Epoch: [280]  [1250/1251]  eta: 0:00:00  lr: 0.000046  min_lr: 0.000046  loss: 2.6483 (2.5267)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0389 (1.2069)  time: 0.5329  data: 0.0006  max mem: 54633
Epoch: [280] Total time: 0:13:09 (0.6311 s / it)
Averaged stats: lr: 0.000046  min_lr: 0.000046  loss: 2.6483 (2.5356)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0389 (1.2069)
Test:  [ 0/25]  eta: 0:02:38  loss: 0.5763 (0.5763)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 6.3519  data: 6.0199  max mem: 54633
Test:  [10/25]  eta: 0:00:12  loss: 0.6892 (0.6962)  acc1: 86.8000 (87.6727)  acc5: 98.0000 (97.8546)  time: 0.8491  data: 0.5476  max mem: 54633
Test:  [20/25]  eta: 0:00:02  loss: 0.8598 (0.8144)  acc1: 83.2000 (84.6667)  acc5: 96.8000 (97.0286)  time: 0.2988  data: 0.0002  max mem: 54633
Test:  [24/25]  eta: 0:00:00  loss: 0.8908 (0.8272)  acc1: 82.8000 (84.3680)  acc5: 96.8000 (96.9440)  time: 0.2987  data: 0.0001  max mem: 54633
Test: Total time: 0:00:13 (0.5448 s / it)
* Acc@1 84.852 Acc@5 97.116 loss 0.819
Accuracy of the model on the 50000 test images: 84.9%
Max accuracy: 84.95%
Epoch: [281]  [   0/1251]  eta: 1:21:49  lr: 0.000046  min_lr: 0.000046  loss: 2.8499 (2.8499)  weight_decay: 0.0500 (0.0500)  time: 3.9244  data: 3.2355  max mem: 54633
Epoch: [281]  [ 200/1251]  eta: 0:11:17  lr: 0.000046  min_lr: 0.000046  loss: 2.5916 (2.5515)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0626 (1.2381)  time: 0.6277  data: 0.0004  max mem: 54633
Epoch: [281]  [ 400/1251]  eta: 0:09:02  lr: 0.000045  min_lr: 0.000045  loss: 2.5322 (2.5632)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9140 (1.2011)  time: 0.6296  data: 0.0004  max mem: 54633
Epoch: [281]  [ 600/1251]  eta: 0:06:53  lr: 0.000044  min_lr: 0.000044  loss: 2.6999 (2.5555)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2710 (1.2245)  time: 0.6278  data: 0.0004  max mem: 54633
Epoch: [281]  [ 800/1251]  eta: 0:04:45  lr: 0.000043  min_lr: 0.000043  loss: 2.7274 (2.5406)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0852 (nan)  time: 0.6280  data: 0.0004  max mem: 54633
Epoch: [281]  [1000/1251]  eta: 0:02:38  lr: 0.000043  min_lr: 0.000043  loss: 2.3579 (2.5363)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0099 (nan)  time: 0.6279  data: 0.0005  max mem: 54633
Epoch: [281]  [1200/1251]  eta: 0:00:32  lr: 0.000042  min_lr: 0.000042  loss: 2.4640 (2.5310)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2112 (nan)  time: 0.6274  data: 0.0004  max mem: 54633
Epoch: [281]  [1250/1251]  eta: 0:00:00  lr: 0.000042  min_lr: 0.000042  loss: 2.6915 (2.5332)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0421 (nan)  time: 0.5330  data: 0.0005  max mem: 54633
Epoch: [281] Total time: 0:13:08 (0.6302 s / it)
Averaged stats: lr: 0.000042  min_lr: 0.000042  loss: 2.6915 (2.5320)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0421 (nan)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.5825 (0.5825)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 5.3939  data: 5.0462  max mem: 54633
Test:  [10/25]  eta: 0:00:12  loss: 0.7059 (0.7101)  acc1: 87.2000 (87.6727)  acc5: 98.0000 (97.8909)  time: 0.8520  data: 0.5355  max mem: 54633
Test:  [20/25]  eta: 0:00:02  loss: 0.8664 (0.8263)  acc1: 83.6000 (84.8000)  acc5: 96.8000 (96.9524)  time: 0.3518  data: 0.0423  max mem: 54633
Test:  [24/25]  eta: 0:00:00  loss: 0.9081 (0.8395)  acc1: 83.2000 (84.4480)  acc5: 96.4000 (96.8960)  time: 0.3077  data: 0.0001  max mem: 54633
Test: Total time: 0:00:13 (0.5504 s / it)
* Acc@1 84.960 Acc@5 97.110 loss 0.830
Accuracy of the model on the 50000 test images: 85.0%
Max accuracy: 84.96%
Epoch: [282]  [   0/1251]  eta: 1:09:12  lr: 0.000042  min_lr: 0.000042  loss: 1.6671 (1.6671)  weight_decay: 0.0500 (0.0500)  time: 3.3190  data: 2.6785  max mem: 54633
Epoch: [282]  [ 200/1251]  eta: 0:11:15  lr: 0.000041  min_lr: 0.000041  loss: 2.4880 (2.4859)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9903 (1.2001)  time: 0.6280  data: 0.0004  max mem: 54633
Epoch: [282]  [ 400/1251]  eta: 0:09:00  lr: 0.000040  min_lr: 0.000040  loss: 2.6636 (2.5311)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1719 (1.1784)  time: 0.6275  data: 0.0004  max mem: 54633
Epoch: [282]  [ 600/1251]  eta: 0:06:52  lr: 0.000040  min_lr: 0.000040  loss: 2.7334 (2.5506)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0281 (1.1512)  time: 0.6330  data: 0.0005  max mem: 54633
Epoch: [282]  [ 800/1251]  eta: 0:04:44  lr: 0.000039  min_lr: 0.000039  loss: 2.5333 (2.5462)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1390 (1.1915)  time: 0.6276  data: 0.0005  max mem: 54633
Epoch: [282]  [1000/1251]  eta: 0:02:38  lr: 0.000038  min_lr: 0.000038  loss: 2.3920 (2.5407)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0726 (1.1789)  time: 0.6274  data: 0.0005  max mem: 54633
Epoch: [282]  [1200/1251]  eta: 0:00:32  lr: 0.000037  min_lr: 0.000037  loss: 2.5801 (2.5371)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0289 (1.1816)  time: 0.6276  data: 0.0005  max mem: 54633
Epoch: [282]  [1250/1251]  eta: 0:00:00  lr: 0.000037  min_lr: 0.000037  loss: 2.7140 (2.5368)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0138 (1.1775)  time: 0.5328  data: 0.0008  max mem: 54633
Epoch: [282] Total time: 0:13:07 (0.6295 s / it)
Averaged stats: lr: 0.000037  min_lr: 0.000037  loss: 2.7140 (2.5270)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0138 (1.1775)
Test:  [ 0/25]  eta: 0:02:51  loss: 0.5808 (0.5808)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 6.8521  data: 6.5250  max mem: 54633
Test:  [10/25]  eta: 0:00:13  loss: 0.6988 (0.7064)  acc1: 87.2000 (87.4182)  acc5: 98.0000 (97.8909)  time: 0.8961  data: 0.5935  max mem: 54633
Test:  [20/25]  eta: 0:00:03  loss: 0.8687 (0.8238)  acc1: 83.2000 (84.7048)  acc5: 96.8000 (96.9524)  time: 0.3005  data: 0.0002  max mem: 54633
Test:  [24/25]  eta: 0:00:00  loss: 0.8987 (0.8372)  acc1: 82.8000 (84.4000)  acc5: 96.4000 (96.8640)  time: 0.3006  data: 0.0001  max mem: 54633
Test: Total time: 0:00:14 (0.5664 s / it)
* Acc@1 84.940 Acc@5 97.084 loss 0.829
Accuracy of the model on the 50000 test images: 84.9%
Max accuracy: 84.96%
Epoch: [283]  [   0/1251]  eta: 1:31:26  lr: 0.000037  min_lr: 0.000037  loss: 2.8611 (2.8611)  weight_decay: 0.0500 (0.0500)  time: 4.3854  data: 1.9136  max mem: 54633
Epoch: [283]  [ 200/1251]  eta: 0:11:20  lr: 0.000037  min_lr: 0.000037  loss: 2.1172 (2.4874)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2176 (1.3374)  time: 0.6284  data: 0.0005  max mem: 54633
Epoch: [283]  [ 400/1251]  eta: 0:09:03  lr: 0.000036  min_lr: 0.000036  loss: 2.6404 (2.4888)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1260 (1.2648)  time: 0.6278  data: 0.0005  max mem: 54633
Epoch: [283]  [ 600/1251]  eta: 0:06:53  lr: 0.000035  min_lr: 0.000035  loss: 2.5531 (2.4886)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0527 (1.2394)  time: 0.6282  data: 0.0005  max mem: 54633
Epoch: [283]  [ 800/1251]  eta: 0:04:45  lr: 0.000035  min_lr: 0.000035  loss: 2.4609 (2.4956)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1202 (1.2831)  time: 0.6288  data: 0.0005  max mem: 54633
Epoch: [283]  [1000/1251]  eta: 0:02:38  lr: 0.000034  min_lr: 0.000034  loss: 2.5961 (2.5022)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1510 (1.2715)  time: 0.6283  data: 0.0005  max mem: 54633
Epoch: [283]  [1200/1251]  eta: 0:00:32  lr: 0.000033  min_lr: 0.000033  loss: 2.6414 (2.5096)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9474 (1.2560)  time: 0.6279  data: 0.0006  max mem: 54633
Epoch: [283]  [1250/1251]  eta: 0:00:00  lr: 0.000033  min_lr: 0.000033  loss: 2.6765 (2.5134)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2175 (1.2513)  time: 0.5331  data: 0.0007  max mem: 54633
Epoch: [283] Total time: 0:13:09 (0.6312 s / it)
Averaged stats: lr: 0.000033  min_lr: 0.000033  loss: 2.6765 (2.5242)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2175 (1.2513)
Test:  [ 0/25]  eta: 0:02:37  loss: 0.6132 (0.6132)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 6.3046  data: 5.9624  max mem: 54633
Test:  [10/25]  eta: 0:00:12  loss: 0.7286 (0.7373)  acc1: 86.8000 (87.5636)  acc5: 98.0000 (97.9273)  time: 0.8461  data: 0.5423  max mem: 54633
Test:  [20/25]  eta: 0:00:02  loss: 0.8971 (0.8509)  acc1: 82.8000 (84.8571)  acc5: 96.8000 (97.0476)  time: 0.3003  data: 0.0002  max mem: 54633
Test:  [24/25]  eta: 0:00:00  loss: 0.9363 (0.8645)  acc1: 82.4000 (84.4480)  acc5: 96.8000 (96.9760)  time: 0.3004  data: 0.0001  max mem: 54633
Test: Total time: 0:00:13 (0.5447 s / it)
* Acc@1 84.900 Acc@5 97.096 loss 0.857
Accuracy of the model on the 50000 test images: 84.9%
Max accuracy: 84.96%
Epoch: [284]  [   0/1251]  eta: 1:23:12  lr: 0.000033  min_lr: 0.000033  loss: 2.6944 (2.6944)  weight_decay: 0.0500 (0.0500)  time: 3.9905  data: 1.8104  max mem: 54633
Epoch: [284]  [ 200/1251]  eta: 0:11:19  lr: 0.000032  min_lr: 0.000032  loss: 2.4715 (2.5129)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0326 (1.1572)  time: 0.6276  data: 0.0004  max mem: 54633
Epoch: [284]  [ 400/1251]  eta: 0:09:02  lr: 0.000032  min_lr: 0.000032  loss: 2.7547 (2.5228)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1549 (1.2302)  time: 0.6361  data: 0.0004  max mem: 54633
Epoch: [284]  [ 600/1251]  eta: 0:06:52  lr: 0.000031  min_lr: 0.000031  loss: 2.6226 (2.5143)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2186 (1.2241)  time: 0.6275  data: 0.0004  max mem: 54633
Epoch: [284]  [ 800/1251]  eta: 0:04:45  lr: 0.000031  min_lr: 0.000031  loss: 2.4314 (2.5099)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2433 (1.2105)  time: 0.6281  data: 0.0004  max mem: 54633
Epoch: [284]  [1000/1251]  eta: 0:02:38  lr: 0.000030  min_lr: 0.000030  loss: 2.5177 (2.5147)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0087 (1.2102)  time: 0.6278  data: 0.0004  max mem: 54633
Epoch: [284]  [1200/1251]  eta: 0:00:32  lr: 0.000029  min_lr: 0.000029  loss: 2.5560 (2.5140)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3390 (1.2173)  time: 0.6281  data: 0.0004  max mem: 54633
Epoch: [284]  [1250/1251]  eta: 0:00:00  lr: 0.000029  min_lr: 0.000029  loss: 2.7326 (2.5172)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1753 (1.2142)  time: 0.5367  data: 0.0005  max mem: 54633
Epoch: [284] Total time: 0:13:08 (0.6304 s / it)
Averaged stats: lr: 0.000029  min_lr: 0.000029  loss: 2.7326 (2.5258)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1753 (1.2142)
Test:  [ 0/25]  eta: 0:02:48  loss: 0.6217 (0.6217)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 6.7393  data: 6.3973  max mem: 54633
Test:  [10/25]  eta: 0:00:13  loss: 0.7365 (0.7457)  acc1: 87.2000 (87.6364)  acc5: 98.0000 (97.8909)  time: 0.8858  data: 0.5820  max mem: 54633
Test:  [20/25]  eta: 0:00:03  loss: 0.9117 (0.8628)  acc1: 83.6000 (84.8000)  acc5: 96.4000 (96.8952)  time: 0.3005  data: 0.0002  max mem: 54633
Test:  [24/25]  eta: 0:00:00  loss: 0.9282 (0.8753)  acc1: 83.2000 (84.4480)  acc5: 96.4000 (96.8640)  time: 0.3005  data: 0.0001  max mem: 54633
Test: Total time: 0:00:14 (0.5615 s / it)
* Acc@1 84.928 Acc@5 97.054 loss 0.869
Accuracy of the model on the 50000 test images: 84.9%
Max accuracy: 84.96%
Epoch: [285]  [   0/1251]  eta: 1:28:33  lr: 0.000029  min_lr: 0.000029  loss: 2.6600 (2.6600)  weight_decay: 0.0500 (0.0500)  time: 4.2475  data: 1.9264  max mem: 54633
Epoch: [285]  [ 200/1251]  eta: 0:11:18  lr: 0.000029  min_lr: 0.000029  loss: 2.6576 (2.6085)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3023 (1.2960)  time: 0.6284  data: 0.0004  max mem: 54633
Epoch: [285]  [ 400/1251]  eta: 0:09:02  lr: 0.000028  min_lr: 0.000028  loss: 2.7191 (2.5883)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0791 (1.2420)  time: 0.6273  data: 0.0004  max mem: 54633
Epoch: [285]  [ 600/1251]  eta: 0:06:52  lr: 0.000027  min_lr: 0.000027  loss: 2.6253 (2.5727)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0938 (1.2349)  time: 0.6271  data: 0.0004  max mem: 54633
Epoch: [285]  [ 800/1251]  eta: 0:04:45  lr: 0.000027  min_lr: 0.000027  loss: 2.5054 (2.5714)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9862 (1.2576)  time: 0.6278  data: 0.0004  max mem: 54633
Epoch: [285]  [1000/1251]  eta: 0:02:38  lr: 0.000026  min_lr: 0.000026  loss: 2.6677 (2.5698)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1286 (1.2410)  time: 0.6276  data: 0.0004  max mem: 54633
Epoch: [285]  [1200/1251]  eta: 0:00:32  lr: 0.000026  min_lr: 0.000026  loss: 2.6701 (2.5677)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1346 (1.2280)  time: 0.6271  data: 0.0004  max mem: 54633
Epoch: [285]  [1250/1251]  eta: 0:00:00  lr: 0.000026  min_lr: 0.000026  loss: 2.7963 (2.5690)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1346 (1.2278)  time: 0.5324  data: 0.0005  max mem: 54633
Epoch: [285] Total time: 0:13:08 (0.6299 s / it)
Averaged stats: lr: 0.000026  min_lr: 0.000026  loss: 2.7963 (2.5289)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1346 (1.2278)
Test:  [ 0/25]  eta: 0:02:30  loss: 0.5567 (0.5567)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 6.0297  data: 5.6853  max mem: 54633
Test:  [10/25]  eta: 0:00:12  loss: 0.6840 (0.6893)  acc1: 87.2000 (87.3455)  acc5: 98.0000 (97.9273)  time: 0.8200  data: 0.5171  max mem: 54633
Test:  [20/25]  eta: 0:00:02  loss: 0.8493 (0.8074)  acc1: 83.2000 (84.6857)  acc5: 96.8000 (96.9905)  time: 0.2990  data: 0.0002  max mem: 54633
Test:  [24/25]  eta: 0:00:00  loss: 0.8892 (0.8205)  acc1: 82.8000 (84.3840)  acc5: 96.4000 (96.9120)  time: 0.2989  data: 0.0001  max mem: 54633
Test: Total time: 0:00:13 (0.5315 s / it)
* Acc@1 84.888 Acc@5 97.090 loss 0.813
Accuracy of the model on the 50000 test images: 84.9%
Max accuracy: 84.96%
Epoch: [286]  [   0/1251]  eta: 1:29:29  lr: 0.000026  min_lr: 0.000026  loss: 2.7676 (2.7676)  weight_decay: 0.0500 (0.0500)  time: 4.2921  data: 2.5067  max mem: 54633
Epoch: [286]  [ 200/1251]  eta: 0:11:20  lr: 0.000025  min_lr: 0.000025  loss: 2.7082 (2.5412)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0983 (1.1865)  time: 0.6275  data: 0.0004  max mem: 54633
Epoch: [286]  [ 400/1251]  eta: 0:09:02  lr: 0.000025  min_lr: 0.000025  loss: 2.6084 (2.5563)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0445 (1.1873)  time: 0.6276  data: 0.0005  max mem: 54633
Epoch: [286]  [ 600/1251]  eta: 0:06:53  lr: 0.000024  min_lr: 0.000024  loss: 2.4715 (2.5575)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9605 (1.1685)  time: 0.6275  data: 0.0005  max mem: 54633
Epoch: [286]  [ 800/1251]  eta: 0:04:45  lr: 0.000023  min_lr: 0.000023  loss: 2.5068 (2.5424)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1093 (1.1718)  time: 0.6281  data: 0.0005  max mem: 54633
Epoch: [286]  [1000/1251]  eta: 0:02:38  lr: 0.000023  min_lr: 0.000023  loss: 2.5409 (2.5315)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1152 (1.1770)  time: 0.6275  data: 0.0004  max mem: 54633
Epoch: [286]  [1200/1251]  eta: 0:00:32  lr: 0.000022  min_lr: 0.000022  loss: 2.6418 (2.5252)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0854 (1.1768)  time: 0.6273  data: 0.0005  max mem: 54633
Epoch: [286]  [1250/1251]  eta: 0:00:00  lr: 0.000022  min_lr: 0.000022  loss: 2.6099 (2.5257)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0946 (1.1765)  time: 0.5322  data: 0.0005  max mem: 54633
Epoch: [286] Total time: 0:13:08 (0.6302 s / it)
Averaged stats: lr: 0.000022  min_lr: 0.000022  loss: 2.6099 (2.5267)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0946 (1.1765)
Test:  [ 0/25]  eta: 0:02:39  loss: 0.5664 (0.5664)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 6.3609  data: 6.0176  max mem: 54633
Test:  [10/25]  eta: 0:00:12  loss: 0.6887 (0.6941)  acc1: 86.8000 (87.5273)  acc5: 98.0000 (97.8546)  time: 0.8501  data: 0.5473  max mem: 54633
Test:  [20/25]  eta: 0:00:02  loss: 0.8441 (0.8100)  acc1: 83.2000 (84.8762)  acc5: 96.4000 (96.9714)  time: 0.2989  data: 0.0002  max mem: 54633
Test:  [24/25]  eta: 0:00:00  loss: 0.8870 (0.8236)  acc1: 83.2000 (84.5120)  acc5: 96.4000 (96.9120)  time: 0.2989  data: 0.0001  max mem: 54633
Test: Total time: 0:00:13 (0.5452 s / it)
* Acc@1 84.880 Acc@5 97.068 loss 0.816
Accuracy of the model on the 50000 test images: 84.9%
Max accuracy: 84.96%
Epoch: [287]  [   0/1251]  eta: 1:26:14  lr: 0.000022  min_lr: 0.000022  loss: 2.6881 (2.6881)  weight_decay: 0.0500 (0.0500)  time: 4.1359  data: 2.4211  max mem: 54633
Epoch: [287]  [ 200/1251]  eta: 0:11:18  lr: 0.000022  min_lr: 0.000022  loss: 2.5580 (2.5624)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1097 (1.2160)  time: 0.6281  data: 0.0005  max mem: 54633
Epoch: [287]  [ 400/1251]  eta: 0:09:02  lr: 0.000021  min_lr: 0.000021  loss: 2.6183 (2.5535)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1177 (1.2254)  time: 0.6291  data: 0.0004  max mem: 54633
Epoch: [287]  [ 600/1251]  eta: 0:06:53  lr: 0.000021  min_lr: 0.000021  loss: 2.4444 (2.5436)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0825 (1.2053)  time: 0.6282  data: 0.0005  max mem: 54633
Epoch: [287]  [ 800/1251]  eta: 0:04:45  lr: 0.000020  min_lr: 0.000020  loss: 2.5357 (2.5366)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2415 (1.2173)  time: 0.6275  data: 0.0004  max mem: 54633
Epoch: [287]  [1000/1251]  eta: 0:02:38  lr: 0.000020  min_lr: 0.000020  loss: 2.5606 (2.5406)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0917 (1.1990)  time: 0.6273  data: 0.0004  max mem: 54633
Epoch: [287]  [1200/1251]  eta: 0:00:32  lr: 0.000019  min_lr: 0.000019  loss: 2.6222 (2.5363)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1450 (1.1993)  time: 0.6277  data: 0.0004  max mem: 54633
Epoch: [287]  [1250/1251]  eta: 0:00:00  lr: 0.000019  min_lr: 0.000019  loss: 2.4701 (2.5353)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1450 (1.2028)  time: 0.5325  data: 0.0006  max mem: 54633
Epoch: [287] Total time: 0:13:08 (0.6304 s / it)
Averaged stats: lr: 0.000019  min_lr: 0.000019  loss: 2.4701 (2.5294)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1450 (1.2028)
Test:  [ 0/25]  eta: 0:02:39  loss: 0.5157 (0.5157)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 6.3624  data: 6.0382  max mem: 54633
Test:  [10/25]  eta: 0:00:12  loss: 0.6374 (0.6450)  acc1: 88.0000 (87.7091)  acc5: 98.0000 (97.8909)  time: 0.8502  data: 0.5493  max mem: 54633
Test:  [20/25]  eta: 0:00:02  loss: 0.7960 (0.7616)  acc1: 83.6000 (84.9143)  acc5: 96.4000 (96.9143)  time: 0.2988  data: 0.0002  max mem: 54633
Test:  [24/25]  eta: 0:00:00  loss: 0.8400 (0.7746)  acc1: 82.8000 (84.5600)  acc5: 96.4000 (96.8480)  time: 0.2987  data: 0.0001  max mem: 54633
Test: Total time: 0:00:13 (0.5455 s / it)
* Acc@1 84.960 Acc@5 97.070 loss 0.767
Accuracy of the model on the 50000 test images: 85.0%
Max accuracy: 84.96%
Epoch: [288]  [   0/1251]  eta: 1:18:35  lr: 0.000019  min_lr: 0.000019  loss: 1.6664 (1.6664)  weight_decay: 0.0500 (0.0500)  time: 3.7693  data: 3.1396  max mem: 54633
Epoch: [288]  [ 200/1251]  eta: 0:11:15  lr: 0.000019  min_lr: 0.000019  loss: 2.7364 (2.5174)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0447 (1.1498)  time: 0.6272  data: 0.0004  max mem: 54633
Epoch: [288]  [ 400/1251]  eta: 0:09:01  lr: 0.000018  min_lr: 0.000018  loss: 2.4962 (2.4931)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0107 (1.1542)  time: 0.6285  data: 0.0004  max mem: 54633
Epoch: [288]  [ 600/1251]  eta: 0:06:52  lr: 0.000018  min_lr: 0.000018  loss: 2.6619 (2.5040)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0078 (1.1584)  time: 0.6276  data: 0.0005  max mem: 54633
Epoch: [288]  [ 800/1251]  eta: 0:04:45  lr: 0.000017  min_lr: 0.000017  loss: 2.6032 (2.5007)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0606 (1.1736)  time: 0.6346  data: 0.0004  max mem: 54633
| distributed init (rank 0): env://, gpu 0
| distributed init (rank 6): env://, gpu 6
| distributed init (rank 3): env://, gpu 3
| distributed init (rank 7): env://, gpu 7
| distributed init (rank 5): env://, gpu 5
| distributed init (rank 4): env://, gpu 4
| distributed init (rank 2): env://, gpu 2
| distributed init (rank 1): env://, gpu 1
Namespace(batch_size=128, epochs=300, update_freq=4, model='base', drop_path=0, input_size=256, layer_scale_init_value=1e-06, model_ema=False, model_ema_decay=0.9999, model_ema_force_cpu=False, model_ema_eval=False, opt='adamw', opt_eps=1e-08, opt_betas=None, clip_grad=5.0, momentum=0.9, weight_decay=0.05, weight_decay_end=None, lr=0.004, layer_decay=1.0, min_lr=1e-06, warmup_epochs=20, warmup_steps=-1, color_jitter=0.4, aa='rand-m9-mstd0.5-inc1', smoothing=0.1, train_interpolation='bicubic', crop_pct=None, reprob=0.25, remode='pixel', recount=1, resplit=False, mixup=0.8, cutmix=1.0, cutmix_minmax=None, mixup_prob=1.0, mixup_switch_prob=0.5, mixup_mode='batch', finetune='', head_init_scale=1.0, model_key='model|module', model_prefix='', data_path='/dev/shm/imagenet', eval_data_path=None, nb_classes=1000, imagenet_default_mean_and_std=True, data_set='IMNET', output_dir='./checkpoint_base_256_11.4G', log_dir=None, device='cuda', seed=0, resume='', auto_resume=True, save_ckpt=True, save_ckpt_freq=1, save_ckpt_num=3, start_epoch=0, eval=False, dist_eval=True, disable_eval=False, num_workers=10, pin_mem=True, world_size=8, local_rank=-1, dist_on_itp=False, dist_url='env://', use_amp=True, enable_wandb=False, project='convnext', wandb_ckpt=False, rank=0, gpu=0, distributed=True, dist_backend='nccl')
Transform = 
RandomResizedCropAndInterpolation(size=(256, 256), scale=(0.08, 1.0), ratio=(0.75, 1.3333), interpolation=bicubic)
RandomHorizontalFlip(p=0.5)
RandAugment(n=2, ops=
	AugmentOp(name=AutoContrast, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=Equalize, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=Invert, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=Rotate, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=PosterizeIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=SolarizeIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=SolarizeAdd, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ColorIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ContrastIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=BrightnessIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=SharpnessIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ShearX, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ShearY, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=TranslateXRel, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=TranslateYRel, p=0.5, m=9, mstd=0.5))
ToTensor()
Normalize(mean=tensor([0.4850, 0.4560, 0.4060]), std=tensor([0.2290, 0.2240, 0.2250]))
RandomErasing(p=0.25, mode=pixel, count=(1, 1))
---------------------------
reading from datapath /dev/shm/imagenet
Number of the class = 1000
Transform = 
Resize(size=292, interpolation=bicubic, max_size=None, antialias=True)
CenterCrop(size=(256, 256))
ToTensor()
Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
---------------------------
reading from datapath /dev/shm/imagenet
Number of the class = 1000
Sampler_train = <torch.utils.data.distributed.DistributedSampler object at 0x7f5174d7d8d0>
Mixup is activated!
Model = RaCNN(
  (first_conv): ConvX(
    (conv): Conv2d(3, 48, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): Act(
      (act): GELU(approximate='none')
    )
  )
  (layer1): Sequential(
    (0): DownBlock(
      (mlp): Sequential(
        (0): ConvX(
          (conv): Conv2d(48, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (1): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (2): ConvX(
          (conv): Conv2d(192, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(48, 48, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=48, bias=False)
          (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (1): ConvX(
          (conv): Conv2d(48, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (drop_path): Identity()
    )
    (1): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(96, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(192, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(24, 192, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(192, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(96, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=96, bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.011)
    )
    (2): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(96, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(192, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(24, 192, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(192, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(96, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=96, bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.023)
    )
    (3): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(96, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(192, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(24, 192, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(192, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(96, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=96, bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.034)
    )
  )
  (layer2): Sequential(
    (0): DownBlock(
      (mlp): Sequential(
        (0): ConvX(
          (conv): Conv2d(96, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (1): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (2): ConvX(
          (conv): Conv2d(384, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=96, bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (1): ConvX(
          (conv): Conv2d(96, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (drop_path): DropPath(drop_prob=0.045)
    )
    (1): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.056)
    )
    (2): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.068)
    )
    (3): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.079)
    )
    (4): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.090)
    )
    (5): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.102)
    )
    (6): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.113)
    )
    (7): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.124)
    )
  )
  (layer3): Sequential(
    (0): DownBlock(
      (mlp): Sequential(
        (0): ConvX(
          (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (1): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (2): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (1): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (drop_path): DropPath(drop_prob=0.135)
    )
    (1): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.147)
    )
    (2): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.158)
    )
    (3): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.169)
    )
    (4): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.181)
    )
    (5): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.192)
    )
    (6): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.203)
    )
    (7): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.215)
    )
    (8): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.226)
    )
    (9): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.237)
    )
    (10): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.248)
    )
    (11): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.260)
    )
    (12): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.271)
    )
    (13): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.282)
    )
    (14): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.294)
    )
    (15): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.305)
    )
  )
  (layer4): Sequential(
    (0): DownBlock(
      (mlp): Sequential(
        (0): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (1): ConvX(
          (conv): Conv2d(1536, 1536, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=1536, bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (2): ConvX(
          (conv): Conv2d(1536, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (1): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (drop_path): DropPath(drop_prob=0.316)
    )
    (1): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(768, 3072, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(3072, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(1536, 1536, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1536, bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(1536, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(192, 1536, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(1536, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(768, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.327)
    )
    (2): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(768, 3072, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(3072, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(1536, 1536, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1536, bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(1536, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(192, 1536, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(1536, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(768, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.339)
    )
    (3): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(768, 3072, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(3072, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(1536, 1536, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1536, bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(1536, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(192, 1536, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(1536, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(768, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.350)
    )
  )
  (head): ConvX(
    (conv): Conv2d(768, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (norm): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): Act(
      (act): GELU(approximate='none')
    )
  )
  (gap): AdaptiveAvgPool2d(output_size=1)
  (classifier): MlpHead(
    (fc1): Linear(in_features=1024, out_features=2048, bias=False)
    (norm): BatchNorm1d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): Act(
      (act): GELU(approximate='none')
    )
    (drop): Dropout(p=0.2, inplace=False)
    (fc2): Linear(in_features=2048, out_features=1000, bias=False)
  )
)
number of params: 50901626
LR = 0.00400000
Batch size = 4096
Update frequent = 4
Number of training examples = 1281167
Number of training training per epoch = 312
Param groups = {
  "decay": {
    "weight_decay": 0.05,
    "params": [
      "first_conv.conv.weight",
      "layer1.0.mlp.0.conv.weight",
      "layer1.0.mlp.1.conv.weight",
      "layer1.0.mlp.2.conv.weight",
      "layer1.0.skip.0.conv.weight",
      "layer1.0.skip.1.conv.weight",
      "layer1.1.mlp.conv_in.conv.weight",
      "layer1.1.mlp.dw.conv.weight",
      "layer1.1.mlp.re.region.0.weight",
      "layer1.1.mlp.re.region.3.weight",
      "layer1.1.mlp.proj.conv.weight",
      "layer1.1.dcnn.conv_in.conv.weight",
      "layer1.1.dcnn.spe.conv.weight",
      "layer1.1.dcnn.att.logit_scale",
      "layer1.1.dcnn.proj.conv.weight",
      "layer1.2.mlp.conv_in.conv.weight",
      "layer1.2.mlp.dw.conv.weight",
      "layer1.2.mlp.re.region.0.weight",
      "layer1.2.mlp.re.region.3.weight",
      "layer1.2.mlp.proj.conv.weight",
      "layer1.2.dcnn.conv_in.conv.weight",
      "layer1.2.dcnn.spe.conv.weight",
      "layer1.2.dcnn.att.logit_scale",
      "layer1.2.dcnn.proj.conv.weight",
      "layer1.3.mlp.conv_in.conv.weight",
      "layer1.3.mlp.dw.conv.weight",
      "layer1.3.mlp.re.region.0.weight",
      "layer1.3.mlp.re.region.3.weight",
      "layer1.3.mlp.proj.conv.weight",
      "layer1.3.dcnn.conv_in.conv.weight",
      "layer1.3.dcnn.spe.conv.weight",
      "layer1.3.dcnn.att.logit_scale",
      "layer1.3.dcnn.proj.conv.weight",
      "layer2.0.mlp.0.conv.weight",
      "layer2.0.mlp.1.conv.weight",
      "layer2.0.mlp.2.conv.weight",
      "layer2.0.skip.0.conv.weight",
      "layer2.0.skip.1.conv.weight",
      "layer2.1.mlp.conv_in.conv.weight",
      "layer2.1.mlp.dw.conv.weight",
      "layer2.1.mlp.re.region.0.weight",
      "layer2.1.mlp.re.region.3.weight",
      "layer2.1.mlp.proj.conv.weight",
      "layer2.1.dcnn.conv_in.conv.weight",
      "layer2.1.dcnn.spe.conv.weight",
      "layer2.1.dcnn.att.logit_scale",
      "layer2.1.dcnn.proj.conv.weight",
      "layer2.2.mlp.conv_in.conv.weight",
      "layer2.2.mlp.dw.conv.weight",
      "layer2.2.mlp.re.region.0.weight",
      "layer2.2.mlp.re.region.3.weight",
      "layer2.2.mlp.proj.conv.weight",
      "layer2.2.dcnn.conv_in.conv.weight",
      "layer2.2.dcnn.spe.conv.weight",
      "layer2.2.dcnn.att.logit_scale",
      "layer2.2.dcnn.proj.conv.weight",
      "layer2.3.mlp.conv_in.conv.weight",
      "layer2.3.mlp.dw.conv.weight",
      "layer2.3.mlp.re.region.0.weight",
      "layer2.3.mlp.re.region.3.weight",
      "layer2.3.mlp.proj.conv.weight",
      "layer2.3.dcnn.conv_in.conv.weight",
      "layer2.3.dcnn.spe.conv.weight",
      "layer2.3.dcnn.att.logit_scale",
      "layer2.3.dcnn.proj.conv.weight",
      "layer2.4.mlp.conv_in.conv.weight",
      "layer2.4.mlp.dw.conv.weight",
      "layer2.4.mlp.re.region.0.weight",
      "layer2.4.mlp.re.region.3.weight",
      "layer2.4.mlp.proj.conv.weight",
      "layer2.4.dcnn.conv_in.conv.weight",
      "layer2.4.dcnn.spe.conv.weight",
      "layer2.4.dcnn.att.logit_scale",
      "layer2.4.dcnn.proj.conv.weight",
      "layer2.5.mlp.conv_in.conv.weight",
      "layer2.5.mlp.dw.conv.weight",
      "layer2.5.mlp.re.region.0.weight",
      "layer2.5.mlp.re.region.3.weight",
      "layer2.5.mlp.proj.conv.weight",
      "layer2.5.dcnn.conv_in.conv.weight",
      "layer2.5.dcnn.spe.conv.weight",
      "layer2.5.dcnn.att.logit_scale",
      "layer2.5.dcnn.proj.conv.weight",
      "layer2.6.mlp.conv_in.conv.weight",
      "layer2.6.mlp.dw.conv.weight",
      "layer2.6.mlp.re.region.0.weight",
      "layer2.6.mlp.re.region.3.weight",
      "layer2.6.mlp.proj.conv.weight",
      "layer2.6.dcnn.conv_in.conv.weight",
      "layer2.6.dcnn.spe.conv.weight",
      "layer2.6.dcnn.att.logit_scale",
      "layer2.6.dcnn.proj.conv.weight",
      "layer2.7.mlp.conv_in.conv.weight",
      "layer2.7.mlp.dw.conv.weight",
      "layer2.7.mlp.re.region.0.weight",
      "layer2.7.mlp.re.region.3.weight",
      "layer2.7.mlp.proj.conv.weight",
      "layer2.7.dcnn.conv_in.conv.weight",
      "layer2.7.dcnn.spe.conv.weight",
      "layer2.7.dcnn.att.logit_scale",
      "layer2.7.dcnn.proj.conv.weight",
      "layer3.0.mlp.0.conv.weight",
      "layer3.0.mlp.1.conv.weight",
      "layer3.0.mlp.2.conv.weight",
      "layer3.0.skip.0.conv.weight",
      "layer3.0.skip.1.conv.weight",
      "layer3.1.mlp.conv_in.conv.weight",
      "layer3.1.mlp.dw.conv.weight",
      "layer3.1.mlp.re.region.0.weight",
      "layer3.1.mlp.re.region.3.weight",
      "layer3.1.mlp.proj.conv.weight",
      "layer3.1.dcnn.conv_in.conv.weight",
      "layer3.1.dcnn.spe.conv.weight",
      "layer3.1.dcnn.att.logit_scale",
      "layer3.1.dcnn.proj.conv.weight",
      "layer3.2.mlp.conv_in.conv.weight",
      "layer3.2.mlp.dw.conv.weight",
      "layer3.2.mlp.re.region.0.weight",
      "layer3.2.mlp.re.region.3.weight",
      "layer3.2.mlp.proj.conv.weight",
      "layer3.2.dcnn.conv_in.conv.weight",
      "layer3.2.dcnn.spe.conv.weight",
      "layer3.2.dcnn.att.logit_scale",
      "layer3.2.dcnn.proj.conv.weight",
      "layer3.3.mlp.conv_in.conv.weight",
      "layer3.3.mlp.dw.conv.weight",
      "layer3.3.mlp.re.region.0.weight",
      "layer3.3.mlp.re.region.3.weight",
      "layer3.3.mlp.proj.conv.weight",
      "layer3.3.dcnn.conv_in.conv.weight",
      "layer3.3.dcnn.spe.conv.weight",
      "layer3.3.dcnn.att.logit_scale",
      "layer3.3.dcnn.proj.conv.weight",
      "layer3.4.mlp.conv_in.conv.weight",
      "layer3.4.mlp.dw.conv.weight",
      "layer3.4.mlp.re.region.0.weight",
      "layer3.4.mlp.re.region.3.weight",
      "layer3.4.mlp.proj.conv.weight",
      "layer3.4.dcnn.conv_in.conv.weight",
      "layer3.4.dcnn.spe.conv.weight",
      "layer3.4.dcnn.att.logit_scale",
      "layer3.4.dcnn.proj.conv.weight",
      "layer3.5.mlp.conv_in.conv.weight",
      "layer3.5.mlp.dw.conv.weight",
      "layer3.5.mlp.re.region.0.weight",
      "layer3.5.mlp.re.region.3.weight",
      "layer3.5.mlp.proj.conv.weight",
      "layer3.5.dcnn.conv_in.conv.weight",
      "layer3.5.dcnn.spe.conv.weight",
      "layer3.5.dcnn.att.logit_scale",
      "layer3.5.dcnn.proj.conv.weight",
      "layer3.6.mlp.conv_in.conv.weight",
      "layer3.6.mlp.dw.conv.weight",
      "layer3.6.mlp.re.region.0.weight",
      "layer3.6.mlp.re.region.3.weight",
      "layer3.6.mlp.proj.conv.weight",
      "layer3.6.dcnn.conv_in.conv.weight",
      "layer3.6.dcnn.spe.conv.weight",
      "layer3.6.dcnn.att.logit_scale",
      "layer3.6.dcnn.proj.conv.weight",
      "layer3.7.mlp.conv_in.conv.weight",
      "layer3.7.mlp.dw.conv.weight",
      "layer3.7.mlp.re.region.0.weight",
      "layer3.7.mlp.re.region.3.weight",
      "layer3.7.mlp.proj.conv.weight",
      "layer3.7.dcnn.conv_in.conv.weight",
      "layer3.7.dcnn.spe.conv.weight",
      "layer3.7.dcnn.att.logit_scale",
      "layer3.7.dcnn.proj.conv.weight",
      "layer3.8.mlp.conv_in.conv.weight",
      "layer3.8.mlp.dw.conv.weight",
      "layer3.8.mlp.re.region.0.weight",
      "layer3.8.mlp.re.region.3.weight",
      "layer3.8.mlp.proj.conv.weight",
      "layer3.8.dcnn.conv_in.conv.weight",
      "layer3.8.dcnn.spe.conv.weight",
      "layer3.8.dcnn.att.logit_scale",
      "layer3.8.dcnn.proj.conv.weight",
      "layer3.9.mlp.conv_in.conv.weight",
      "layer3.9.mlp.dw.conv.weight",
      "layer3.9.mlp.re.region.0.weight",
      "layer3.9.mlp.re.region.3.weight",
      "layer3.9.mlp.proj.conv.weight",
      "layer3.9.dcnn.conv_in.conv.weight",
      "layer3.9.dcnn.spe.conv.weight",
      "layer3.9.dcnn.att.logit_scale",
      "layer3.9.dcnn.proj.conv.weight",
      "layer3.10.mlp.conv_in.conv.weight",
      "layer3.10.mlp.dw.conv.weight",
      "layer3.10.mlp.re.region.0.weight",
      "layer3.10.mlp.re.region.3.weight",
      "layer3.10.mlp.proj.conv.weight",
      "layer3.10.dcnn.conv_in.conv.weight",
      "layer3.10.dcnn.spe.conv.weight",
      "layer3.10.dcnn.att.logit_scale",
      "layer3.10.dcnn.proj.conv.weight",
      "layer3.11.mlp.conv_in.conv.weight",
      "layer3.11.mlp.dw.conv.weight",
      "layer3.11.mlp.re.region.0.weight",
      "layer3.11.mlp.re.region.3.weight",
      "layer3.11.mlp.proj.conv.weight",
      "layer3.11.dcnn.conv_in.conv.weight",
      "layer3.11.dcnn.spe.conv.weight",
      "layer3.11.dcnn.att.logit_scale",
      "layer3.11.dcnn.proj.conv.weight",
      "layer3.12.mlp.conv_in.conv.weight",
      "layer3.12.mlp.dw.conv.weight",
      "layer3.12.mlp.re.region.0.weight",
      "layer3.12.mlp.re.region.3.weight",
      "layer3.12.mlp.proj.conv.weight",
      "layer3.12.dcnn.conv_in.conv.weight",
      "layer3.12.dcnn.spe.conv.weight",
      "layer3.12.dcnn.att.logit_scale",
      "layer3.12.dcnn.proj.conv.weight",
      "layer3.13.mlp.conv_in.conv.weight",
      "layer3.13.mlp.dw.conv.weight",
      "layer3.13.mlp.re.region.0.weight",
      "layer3.13.mlp.re.region.3.weight",
      "layer3.13.mlp.proj.conv.weight",
      "layer3.13.dcnn.conv_in.conv.weight",
      "layer3.13.dcnn.spe.conv.weight",
      "layer3.13.dcnn.att.logit_scale",
      "layer3.13.dcnn.proj.conv.weight",
      "layer3.14.mlp.conv_in.conv.weight",
      "layer3.14.mlp.dw.conv.weight",
      "layer3.14.mlp.re.region.0.weight",
      "layer3.14.mlp.re.region.3.weight",
      "layer3.14.mlp.proj.conv.weight",
      "layer3.14.dcnn.conv_in.conv.weight",
      "layer3.14.dcnn.spe.conv.weight",
      "layer3.14.dcnn.att.logit_scale",
      "layer3.14.dcnn.proj.conv.weight",
      "layer3.15.mlp.conv_in.conv.weight",
      "layer3.15.mlp.dw.conv.weight",
      "layer3.15.mlp.re.region.0.weight",
      "layer3.15.mlp.re.region.3.weight",
      "layer3.15.mlp.proj.conv.weight",
      "layer3.15.dcnn.conv_in.conv.weight",
      "layer3.15.dcnn.spe.conv.weight",
      "layer3.15.dcnn.att.logit_scale",
      "layer3.15.dcnn.proj.conv.weight",
      "layer4.0.mlp.0.conv.weight",
      "layer4.0.mlp.1.conv.weight",
      "layer4.0.mlp.2.conv.weight",
      "layer4.0.skip.0.conv.weight",
      "layer4.0.skip.1.conv.weight",
      "layer4.1.mlp.conv_in.conv.weight",
      "layer4.1.mlp.dw.conv.weight",
      "layer4.1.mlp.re.region.0.weight",
      "layer4.1.mlp.re.region.3.weight",
      "layer4.1.mlp.proj.conv.weight",
      "layer4.1.dcnn.conv_in.conv.weight",
      "layer4.1.dcnn.spe.conv.weight",
      "layer4.1.dcnn.att.logit_scale",
      "layer4.1.dcnn.proj.conv.weight",
      "layer4.2.mlp.conv_in.conv.weight",
      "layer4.2.mlp.dw.conv.weight",
      "layer4.2.mlp.re.region.0.weight",
      "layer4.2.mlp.re.region.3.weight",
      "layer4.2.mlp.proj.conv.weight",
      "layer4.2.dcnn.conv_in.conv.weight",
      "layer4.2.dcnn.spe.conv.weight",
      "layer4.2.dcnn.att.logit_scale",
      "layer4.2.dcnn.proj.conv.weight",
      "layer4.3.mlp.conv_in.conv.weight",
      "layer4.3.mlp.dw.conv.weight",
      "layer4.3.mlp.re.region.0.weight",
      "layer4.3.mlp.re.region.3.weight",
      "layer4.3.mlp.proj.conv.weight",
      "layer4.3.dcnn.conv_in.conv.weight",
      "layer4.3.dcnn.spe.conv.weight",
      "layer4.3.dcnn.att.logit_scale",
      "layer4.3.dcnn.proj.conv.weight",
      "head.conv.weight",
      "classifier.fc1.weight",
      "classifier.fc2.weight"
    ],
    "lr_scale": 1.0
  },
  "no_decay": {
    "weight_decay": 0.0,
    "params": [
      "first_conv.norm.weight",
      "first_conv.norm.bias",
      "layer1.0.mlp.0.norm.weight",
      "layer1.0.mlp.0.norm.bias",
      "layer1.0.mlp.1.norm.weight",
      "layer1.0.mlp.1.norm.bias",
      "layer1.0.mlp.2.norm.weight",
      "layer1.0.mlp.2.norm.bias",
      "layer1.0.skip.0.norm.weight",
      "layer1.0.skip.0.norm.bias",
      "layer1.0.skip.1.norm.weight",
      "layer1.0.skip.1.norm.bias",
      "layer1.1.mlp.conv_in.norm.weight",
      "layer1.1.mlp.conv_in.norm.bias",
      "layer1.1.mlp.dw.norm.weight",
      "layer1.1.mlp.dw.norm.bias",
      "layer1.1.mlp.re.region.1.weight",
      "layer1.1.mlp.re.region.1.bias",
      "layer1.1.mlp.re.region.3.bias",
      "layer1.1.mlp.proj.norm.weight",
      "layer1.1.mlp.proj.norm.bias",
      "layer1.1.dcnn.conv_in.norm.weight",
      "layer1.1.dcnn.conv_in.norm.bias",
      "layer1.1.dcnn.spe.norm.weight",
      "layer1.1.dcnn.spe.norm.bias",
      "layer1.1.dcnn.proj.norm.weight",
      "layer1.1.dcnn.proj.norm.bias",
      "layer1.2.mlp.conv_in.norm.weight",
      "layer1.2.mlp.conv_in.norm.bias",
      "layer1.2.mlp.dw.norm.weight",
      "layer1.2.mlp.dw.norm.bias",
      "layer1.2.mlp.re.region.1.weight",
      "layer1.2.mlp.re.region.1.bias",
      "layer1.2.mlp.re.region.3.bias",
      "layer1.2.mlp.proj.norm.weight",
      "layer1.2.mlp.proj.norm.bias",
      "layer1.2.dcnn.conv_in.norm.weight",
      "layer1.2.dcnn.conv_in.norm.bias",
      "layer1.2.dcnn.spe.norm.weight",
      "layer1.2.dcnn.spe.norm.bias",
      "layer1.2.dcnn.proj.norm.weight",
      "layer1.2.dcnn.proj.norm.bias",
      "layer1.3.mlp.conv_in.norm.weight",
      "layer1.3.mlp.conv_in.norm.bias",
      "layer1.3.mlp.dw.norm.weight",
      "layer1.3.mlp.dw.norm.bias",
      "layer1.3.mlp.re.region.1.weight",
      "layer1.3.mlp.re.region.1.bias",
      "layer1.3.mlp.re.region.3.bias",
      "layer1.3.mlp.proj.norm.weight",
      "layer1.3.mlp.proj.norm.bias",
      "layer1.3.dcnn.conv_in.norm.weight",
      "layer1.3.dcnn.conv_in.norm.bias",
      "layer1.3.dcnn.spe.norm.weight",
      "layer1.3.dcnn.spe.norm.bias",
      "layer1.3.dcnn.proj.norm.weight",
      "layer1.3.dcnn.proj.norm.bias",
      "layer2.0.mlp.0.norm.weight",
      "layer2.0.mlp.0.norm.bias",
      "layer2.0.mlp.1.norm.weight",
      "layer2.0.mlp.1.norm.bias",
      "layer2.0.mlp.2.norm.weight",
      "layer2.0.mlp.2.norm.bias",
      "layer2.0.skip.0.norm.weight",
      "layer2.0.skip.0.norm.bias",
      "layer2.0.skip.1.norm.weight",
      "layer2.0.skip.1.norm.bias",
      "layer2.1.mlp.conv_in.norm.weight",
      "layer2.1.mlp.conv_in.norm.bias",
      "layer2.1.mlp.dw.norm.weight",
      "layer2.1.mlp.dw.norm.bias",
      "layer2.1.mlp.re.region.1.weight",
      "layer2.1.mlp.re.region.1.bias",
      "layer2.1.mlp.re.region.3.bias",
      "layer2.1.mlp.proj.norm.weight",
      "layer2.1.mlp.proj.norm.bias",
      "layer2.1.dcnn.conv_in.norm.weight",
      "layer2.1.dcnn.conv_in.norm.bias",
      "layer2.1.dcnn.spe.norm.weight",
      "layer2.1.dcnn.spe.norm.bias",
      "layer2.1.dcnn.proj.norm.weight",
      "layer2.1.dcnn.proj.norm.bias",
      "layer2.2.mlp.conv_in.norm.weight",
      "layer2.2.mlp.conv_in.norm.bias",
      "layer2.2.mlp.dw.norm.weight",
      "layer2.2.mlp.dw.norm.bias",
      "layer2.2.mlp.re.region.1.weight",
      "layer2.2.mlp.re.region.1.bias",
      "layer2.2.mlp.re.region.3.bias",
      "layer2.2.mlp.proj.norm.weight",
      "layer2.2.mlp.proj.norm.bias",
      "layer2.2.dcnn.conv_in.norm.weight",
      "layer2.2.dcnn.conv_in.norm.bias",
      "layer2.2.dcnn.spe.norm.weight",
      "layer2.2.dcnn.spe.norm.bias",
      "layer2.2.dcnn.proj.norm.weight",
      "layer2.2.dcnn.proj.norm.bias",
      "layer2.3.mlp.conv_in.norm.weight",
      "layer2.3.mlp.conv_in.norm.bias",
      "layer2.3.mlp.dw.norm.weight",
      "layer2.3.mlp.dw.norm.bias",
      "layer2.3.mlp.re.region.1.weight",
      "layer2.3.mlp.re.region.1.bias",
      "layer2.3.mlp.re.region.3.bias",
      "layer2.3.mlp.proj.norm.weight",
      "layer2.3.mlp.proj.norm.bias",
      "layer2.3.dcnn.conv_in.norm.weight",
      "layer2.3.dcnn.conv_in.norm.bias",
      "layer2.3.dcnn.spe.norm.weight",
      "layer2.3.dcnn.spe.norm.bias",
      "layer2.3.dcnn.proj.norm.weight",
      "layer2.3.dcnn.proj.norm.bias",
      "layer2.4.mlp.conv_in.norm.weight",
      "layer2.4.mlp.conv_in.norm.bias",
      "layer2.4.mlp.dw.norm.weight",
      "layer2.4.mlp.dw.norm.bias",
      "layer2.4.mlp.re.region.1.weight",
      "layer2.4.mlp.re.region.1.bias",
      "layer2.4.mlp.re.region.3.bias",
      "layer2.4.mlp.proj.norm.weight",
      "layer2.4.mlp.proj.norm.bias",
      "layer2.4.dcnn.conv_in.norm.weight",
      "layer2.4.dcnn.conv_in.norm.bias",
      "layer2.4.dcnn.spe.norm.weight",
      "layer2.4.dcnn.spe.norm.bias",
      "layer2.4.dcnn.proj.norm.weight",
      "layer2.4.dcnn.proj.norm.bias",
      "layer2.5.mlp.conv_in.norm.weight",
      "layer2.5.mlp.conv_in.norm.bias",
      "layer2.5.mlp.dw.norm.weight",
      "layer2.5.mlp.dw.norm.bias",
      "layer2.5.mlp.re.region.1.weight",
      "layer2.5.mlp.re.region.1.bias",
      "layer2.5.mlp.re.region.3.bias",
      "layer2.5.mlp.proj.norm.weight",
      "layer2.5.mlp.proj.norm.bias",
      "layer2.5.dcnn.conv_in.norm.weight",
      "layer2.5.dcnn.conv_in.norm.bias",
      "layer2.5.dcnn.spe.norm.weight",
      "layer2.5.dcnn.spe.norm.bias",
      "layer2.5.dcnn.proj.norm.weight",
      "layer2.5.dcnn.proj.norm.bias",
      "layer2.6.mlp.conv_in.norm.weight",
      "layer2.6.mlp.conv_in.norm.bias",
      "layer2.6.mlp.dw.norm.weight",
      "layer2.6.mlp.dw.norm.bias",
      "layer2.6.mlp.re.region.1.weight",
      "layer2.6.mlp.re.region.1.bias",
      "layer2.6.mlp.re.region.3.bias",
      "layer2.6.mlp.proj.norm.weight",
      "layer2.6.mlp.proj.norm.bias",
      "layer2.6.dcnn.conv_in.norm.weight",
      "layer2.6.dcnn.conv_in.norm.bias",
      "layer2.6.dcnn.spe.norm.weight",
      "layer2.6.dcnn.spe.norm.bias",
      "layer2.6.dcnn.proj.norm.weight",
      "layer2.6.dcnn.proj.norm.bias",
      "layer2.7.mlp.conv_in.norm.weight",
      "layer2.7.mlp.conv_in.norm.bias",
      "layer2.7.mlp.dw.norm.weight",
      "layer2.7.mlp.dw.norm.bias",
      "layer2.7.mlp.re.region.1.weight",
      "layer2.7.mlp.re.region.1.bias",
      "layer2.7.mlp.re.region.3.bias",
      "layer2.7.mlp.proj.norm.weight",
      "layer2.7.mlp.proj.norm.bias",
      "layer2.7.dcnn.conv_in.norm.weight",
      "layer2.7.dcnn.conv_in.norm.bias",
      "layer2.7.dcnn.spe.norm.weight",
      "layer2.7.dcnn.spe.norm.bias",
      "layer2.7.dcnn.proj.norm.weight",
      "layer2.7.dcnn.proj.norm.bias",
      "layer3.0.mlp.0.norm.weight",
      "layer3.0.mlp.0.norm.bias",
      "layer3.0.mlp.1.norm.weight",
      "layer3.0.mlp.1.norm.bias",
      "layer3.0.mlp.2.norm.weight",
      "layer3.0.mlp.2.norm.bias",
      "layer3.0.skip.0.norm.weight",
      "layer3.0.skip.0.norm.bias",
      "layer3.0.skip.1.norm.weight",
      "layer3.0.skip.1.norm.bias",
      "layer3.1.mlp.conv_in.norm.weight",
      "layer3.1.mlp.conv_in.norm.bias",
      "layer3.1.mlp.dw.norm.weight",
      "layer3.1.mlp.dw.norm.bias",
      "layer3.1.mlp.re.region.1.weight",
      "layer3.1.mlp.re.region.1.bias",
      "layer3.1.mlp.re.region.3.bias",
      "layer3.1.mlp.proj.norm.weight",
      "layer3.1.mlp.proj.norm.bias",
      "layer3.1.dcnn.conv_in.norm.weight",
      "layer3.1.dcnn.conv_in.norm.bias",
      "layer3.1.dcnn.spe.norm.weight",
      "layer3.1.dcnn.spe.norm.bias",
      "layer3.1.dcnn.proj.norm.weight",
      "layer3.1.dcnn.proj.norm.bias",
      "layer3.2.mlp.conv_in.norm.weight",
      "layer3.2.mlp.conv_in.norm.bias",
      "layer3.2.mlp.dw.norm.weight",
      "layer3.2.mlp.dw.norm.bias",
      "layer3.2.mlp.re.region.1.weight",
      "layer3.2.mlp.re.region.1.bias",
      "layer3.2.mlp.re.region.3.bias",
      "layer3.2.mlp.proj.norm.weight",
      "layer3.2.mlp.proj.norm.bias",
      "layer3.2.dcnn.conv_in.norm.weight",
      "layer3.2.dcnn.conv_in.norm.bias",
      "layer3.2.dcnn.spe.norm.weight",
      "layer3.2.dcnn.spe.norm.bias",
      "layer3.2.dcnn.proj.norm.weight",
      "layer3.2.dcnn.proj.norm.bias",
      "layer3.3.mlp.conv_in.norm.weight",
      "layer3.3.mlp.conv_in.norm.bias",
      "layer3.3.mlp.dw.norm.weight",
      "layer3.3.mlp.dw.norm.bias",
      "layer3.3.mlp.re.region.1.weight",
      "layer3.3.mlp.re.region.1.bias",
      "layer3.3.mlp.re.region.3.bias",
      "layer3.3.mlp.proj.norm.weight",
      "layer3.3.mlp.proj.norm.bias",
      "layer3.3.dcnn.conv_in.norm.weight",
      "layer3.3.dcnn.conv_in.norm.bias",
      "layer3.3.dcnn.spe.norm.weight",
      "layer3.3.dcnn.spe.norm.bias",
      "layer3.3.dcnn.proj.norm.weight",
      "layer3.3.dcnn.proj.norm.bias",
      "layer3.4.mlp.conv_in.norm.weight",
      "layer3.4.mlp.conv_in.norm.bias",
      "layer3.4.mlp.dw.norm.weight",
      "layer3.4.mlp.dw.norm.bias",
      "layer3.4.mlp.re.region.1.weight",
      "layer3.4.mlp.re.region.1.bias",
      "layer3.4.mlp.re.region.3.bias",
      "layer3.4.mlp.proj.norm.weight",
      "layer3.4.mlp.proj.norm.bias",
      "layer3.4.dcnn.conv_in.norm.weight",
      "layer3.4.dcnn.conv_in.norm.bias",
      "layer3.4.dcnn.spe.norm.weight",
      "layer3.4.dcnn.spe.norm.bias",
      "layer3.4.dcnn.proj.norm.weight",
      "layer3.4.dcnn.proj.norm.bias",
      "layer3.5.mlp.conv_in.norm.weight",
      "layer3.5.mlp.conv_in.norm.bias",
      "layer3.5.mlp.dw.norm.weight",
      "layer3.5.mlp.dw.norm.bias",
      "layer3.5.mlp.re.region.1.weight",
      "layer3.5.mlp.re.region.1.bias",
      "layer3.5.mlp.re.region.3.bias",
      "layer3.5.mlp.proj.norm.weight",
      "layer3.5.mlp.proj.norm.bias",
      "layer3.5.dcnn.conv_in.norm.weight",
      "layer3.5.dcnn.conv_in.norm.bias",
      "layer3.5.dcnn.spe.norm.weight",
      "layer3.5.dcnn.spe.norm.bias",
      "layer3.5.dcnn.proj.norm.weight",
      "layer3.5.dcnn.proj.norm.bias",
      "layer3.6.mlp.conv_in.norm.weight",
      "layer3.6.mlp.conv_in.norm.bias",
      "layer3.6.mlp.dw.norm.weight",
      "layer3.6.mlp.dw.norm.bias",
      "layer3.6.mlp.re.region.1.weight",
      "layer3.6.mlp.re.region.1.bias",
      "layer3.6.mlp.re.region.3.bias",
      "layer3.6.mlp.proj.norm.weight",
      "layer3.6.mlp.proj.norm.bias",
      "layer3.6.dcnn.conv_in.norm.weight",
      "layer3.6.dcnn.conv_in.norm.bias",
      "layer3.6.dcnn.spe.norm.weight",
      "layer3.6.dcnn.spe.norm.bias",
      "layer3.6.dcnn.proj.norm.weight",
      "layer3.6.dcnn.proj.norm.bias",
      "layer3.7.mlp.conv_in.norm.weight",
      "layer3.7.mlp.conv_in.norm.bias",
      "layer3.7.mlp.dw.norm.weight",
      "layer3.7.mlp.dw.norm.bias",
      "layer3.7.mlp.re.region.1.weight",
      "layer3.7.mlp.re.region.1.bias",
      "layer3.7.mlp.re.region.3.bias",
      "layer3.7.mlp.proj.norm.weight",
      "layer3.7.mlp.proj.norm.bias",
      "layer3.7.dcnn.conv_in.norm.weight",
      "layer3.7.dcnn.conv_in.norm.bias",
      "layer3.7.dcnn.spe.norm.weight",
      "layer3.7.dcnn.spe.norm.bias",
      "layer3.7.dcnn.proj.norm.weight",
      "layer3.7.dcnn.proj.norm.bias",
      "layer3.8.mlp.conv_in.norm.weight",
      "layer3.8.mlp.conv_in.norm.bias",
      "layer3.8.mlp.dw.norm.weight",
      "layer3.8.mlp.dw.norm.bias",
      "layer3.8.mlp.re.region.1.weight",
      "layer3.8.mlp.re.region.1.bias",
      "layer3.8.mlp.re.region.3.bias",
      "layer3.8.mlp.proj.norm.weight",
      "layer3.8.mlp.proj.norm.bias",
      "layer3.8.dcnn.conv_in.norm.weight",
      "layer3.8.dcnn.conv_in.norm.bias",
      "layer3.8.dcnn.spe.norm.weight",
      "layer3.8.dcnn.spe.norm.bias",
      "layer3.8.dcnn.proj.norm.weight",
      "layer3.8.dcnn.proj.norm.bias",
      "layer3.9.mlp.conv_in.norm.weight",
      "layer3.9.mlp.conv_in.norm.bias",
      "layer3.9.mlp.dw.norm.weight",
      "layer3.9.mlp.dw.norm.bias",
      "layer3.9.mlp.re.region.1.weight",
      "layer3.9.mlp.re.region.1.bias",
      "layer3.9.mlp.re.region.3.bias",
      "layer3.9.mlp.proj.norm.weight",
      "layer3.9.mlp.proj.norm.bias",
      "layer3.9.dcnn.conv_in.norm.weight",
      "layer3.9.dcnn.conv_in.norm.bias",
      "layer3.9.dcnn.spe.norm.weight",
      "layer3.9.dcnn.spe.norm.bias",
      "layer3.9.dcnn.proj.norm.weight",
      "layer3.9.dcnn.proj.norm.bias",
      "layer3.10.mlp.conv_in.norm.weight",
      "layer3.10.mlp.conv_in.norm.bias",
      "layer3.10.mlp.dw.norm.weight",
      "layer3.10.mlp.dw.norm.bias",
      "layer3.10.mlp.re.region.1.weight",
      "layer3.10.mlp.re.region.1.bias",
      "layer3.10.mlp.re.region.3.bias",
      "layer3.10.mlp.proj.norm.weight",
      "layer3.10.mlp.proj.norm.bias",
      "layer3.10.dcnn.conv_in.norm.weight",
      "layer3.10.dcnn.conv_in.norm.bias",
      "layer3.10.dcnn.spe.norm.weight",
      "layer3.10.dcnn.spe.norm.bias",
      "layer3.10.dcnn.proj.norm.weight",
      "layer3.10.dcnn.proj.norm.bias",
      "layer3.11.mlp.conv_in.norm.weight",
      "layer3.11.mlp.conv_in.norm.bias",
      "layer3.11.mlp.dw.norm.weight",
      "layer3.11.mlp.dw.norm.bias",
      "layer3.11.mlp.re.region.1.weight",
      "layer3.11.mlp.re.region.1.bias",
      "layer3.11.mlp.re.region.3.bias",
      "layer3.11.mlp.proj.norm.weight",
      "layer3.11.mlp.proj.norm.bias",
      "layer3.11.dcnn.conv_in.norm.weight",
      "layer3.11.dcnn.conv_in.norm.bias",
      "layer3.11.dcnn.spe.norm.weight",
      "layer3.11.dcnn.spe.norm.bias",
      "layer3.11.dcnn.proj.norm.weight",
      "layer3.11.dcnn.proj.norm.bias",
      "layer3.12.mlp.conv_in.norm.weight",
      "layer3.12.mlp.conv_in.norm.bias",
      "layer3.12.mlp.dw.norm.weight",
      "layer3.12.mlp.dw.norm.bias",
      "layer3.12.mlp.re.region.1.weight",
      "layer3.12.mlp.re.region.1.bias",
      "layer3.12.mlp.re.region.3.bias",
      "layer3.12.mlp.proj.norm.weight",
      "layer3.12.mlp.proj.norm.bias",
      "layer3.12.dcnn.conv_in.norm.weight",
      "layer3.12.dcnn.conv_in.norm.bias",
      "layer3.12.dcnn.spe.norm.weight",
      "layer3.12.dcnn.spe.norm.bias",
      "layer3.12.dcnn.proj.norm.weight",
      "layer3.12.dcnn.proj.norm.bias",
      "layer3.13.mlp.conv_in.norm.weight",
      "layer3.13.mlp.conv_in.norm.bias",
      "layer3.13.mlp.dw.norm.weight",
      "layer3.13.mlp.dw.norm.bias",
      "layer3.13.mlp.re.region.1.weight",
      "layer3.13.mlp.re.region.1.bias",
      "layer3.13.mlp.re.region.3.bias",
      "layer3.13.mlp.proj.norm.weight",
      "layer3.13.mlp.proj.norm.bias",
      "layer3.13.dcnn.conv_in.norm.weight",
      "layer3.13.dcnn.conv_in.norm.bias",
      "layer3.13.dcnn.spe.norm.weight",
      "layer3.13.dcnn.spe.norm.bias",
      "layer3.13.dcnn.proj.norm.weight",
      "layer3.13.dcnn.proj.norm.bias",
      "layer3.14.mlp.conv_in.norm.weight",
      "layer3.14.mlp.conv_in.norm.bias",
      "layer3.14.mlp.dw.norm.weight",
      "layer3.14.mlp.dw.norm.bias",
      "layer3.14.mlp.re.region.1.weight",
      "layer3.14.mlp.re.region.1.bias",
      "layer3.14.mlp.re.region.3.bias",
      "layer3.14.mlp.proj.norm.weight",
      "layer3.14.mlp.proj.norm.bias",
      "layer3.14.dcnn.conv_in.norm.weight",
      "layer3.14.dcnn.conv_in.norm.bias",
      "layer3.14.dcnn.spe.norm.weight",
      "layer3.14.dcnn.spe.norm.bias",
      "layer3.14.dcnn.proj.norm.weight",
      "layer3.14.dcnn.proj.norm.bias",
      "layer3.15.mlp.conv_in.norm.weight",
      "layer3.15.mlp.conv_in.norm.bias",
      "layer3.15.mlp.dw.norm.weight",
      "layer3.15.mlp.dw.norm.bias",
      "layer3.15.mlp.re.region.1.weight",
      "layer3.15.mlp.re.region.1.bias",
      "layer3.15.mlp.re.region.3.bias",
      "layer3.15.mlp.proj.norm.weight",
      "layer3.15.mlp.proj.norm.bias",
      "layer3.15.dcnn.conv_in.norm.weight",
      "layer3.15.dcnn.conv_in.norm.bias",
      "layer3.15.dcnn.spe.norm.weight",
      "layer3.15.dcnn.spe.norm.bias",
      "layer3.15.dcnn.proj.norm.weight",
      "layer3.15.dcnn.proj.norm.bias",
      "layer4.0.mlp.0.norm.weight",
      "layer4.0.mlp.0.norm.bias",
      "layer4.0.mlp.1.norm.weight",
      "layer4.0.mlp.1.norm.bias",
      "layer4.0.mlp.2.norm.weight",
      "layer4.0.mlp.2.norm.bias",
      "layer4.0.skip.0.norm.weight",
      "layer4.0.skip.0.norm.bias",
      "layer4.0.skip.1.norm.weight",
      "layer4.0.skip.1.norm.bias",
      "layer4.1.mlp.conv_in.norm.weight",
      "layer4.1.mlp.conv_in.norm.bias",
      "layer4.1.mlp.dw.norm.weight",
      "layer4.1.mlp.dw.norm.bias",
      "layer4.1.mlp.re.region.1.weight",
      "layer4.1.mlp.re.region.1.bias",
      "layer4.1.mlp.re.region.3.bias",
      "layer4.1.mlp.proj.norm.weight",
      "layer4.1.mlp.proj.norm.bias",
      "layer4.1.dcnn.conv_in.norm.weight",
      "layer4.1.dcnn.conv_in.norm.bias",
      "layer4.1.dcnn.spe.norm.weight",
      "layer4.1.dcnn.spe.norm.bias",
      "layer4.1.dcnn.proj.norm.weight",
      "layer4.1.dcnn.proj.norm.bias",
      "layer4.2.mlp.conv_in.norm.weight",
      "layer4.2.mlp.conv_in.norm.bias",
      "layer4.2.mlp.dw.norm.weight",
      "layer4.2.mlp.dw.norm.bias",
      "layer4.2.mlp.re.region.1.weight",
      "layer4.2.mlp.re.region.1.bias",
      "layer4.2.mlp.re.region.3.bias",
      "layer4.2.mlp.proj.norm.weight",
      "layer4.2.mlp.proj.norm.bias",
      "layer4.2.dcnn.conv_in.norm.weight",
      "layer4.2.dcnn.conv_in.norm.bias",
      "layer4.2.dcnn.spe.norm.weight",
      "layer4.2.dcnn.spe.norm.bias",
      "layer4.2.dcnn.proj.norm.weight",
      "layer4.2.dcnn.proj.norm.bias",
      "layer4.3.mlp.conv_in.norm.weight",
      "layer4.3.mlp.conv_in.norm.bias",
      "layer4.3.mlp.dw.norm.weight",
      "layer4.3.mlp.dw.norm.bias",
      "layer4.3.mlp.re.region.1.weight",
      "layer4.3.mlp.re.region.1.bias",
      "layer4.3.mlp.re.region.3.bias",
      "layer4.3.mlp.proj.norm.weight",
      "layer4.3.mlp.proj.norm.bias",
      "layer4.3.dcnn.conv_in.norm.weight",
      "layer4.3.dcnn.conv_in.norm.bias",
      "layer4.3.dcnn.spe.norm.weight",
      "layer4.3.dcnn.spe.norm.bias",
      "layer4.3.dcnn.proj.norm.weight",
      "layer4.3.dcnn.proj.norm.bias",
      "head.norm.weight",
      "head.norm.bias",
      "classifier.norm.weight",
      "classifier.norm.bias"
    ],
    "lr_scale": 1.0
  }
}
Use Cosine LR scheduler
Set warmup steps = 6240
Set warmup steps = 0
Max WD = 0.0500000, Min WD = 0.0500000
criterion = SoftTargetCrossEntropy()
Auto resume checkpoint: checkpoint_base_256_11.4G/checkpoint-287.pth
Resume checkpoint checkpoint_base_256_11.4G/checkpoint-287.pth
With optim & sched!
Start training for 300 epochs
Epoch: [288]  [   0/1251]  eta: 5:10:14  lr: 0.000019  min_lr: 0.000019  loss: 2.5948 (2.5948)  weight_decay: 0.0500 (0.0500)  time: 14.8797  data: 2.5437  max mem: 54644
Epoch: [288]  [ 200/1251]  eta: 0:12:24  lr: 0.000019  min_lr: 0.000019  loss: 2.4682 (2.4963)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0162 (1.1552)  time: 0.6351  data: 0.0005  max mem: 54644
Epoch: [288]  [ 400/1251]  eta: 0:09:31  lr: 0.000018  min_lr: 0.000018  loss: 2.5641 (2.4802)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0731 (1.1134)  time: 0.6349  data: 0.0005  max mem: 54644
Epoch: [288]  [ 600/1251]  eta: 0:07:09  lr: 0.000018  min_lr: 0.000018  loss: 2.5980 (2.4969)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0432 (1.1082)  time: 0.6349  data: 0.0005  max mem: 54644
Epoch: [288]  [ 800/1251]  eta: 0:04:54  lr: 0.000017  min_lr: 0.000017  loss: 2.5945 (2.4912)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0545 (1.1226)  time: 0.6351  data: 0.0005  max mem: 54644
Epoch: [288]  [1000/1251]  eta: 0:02:43  lr: 0.000017  min_lr: 0.000017  loss: 2.5150 (2.4794)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1839 (1.1887)  time: 0.6349  data: 0.0005  max mem: 54644
Epoch: [288]  [1200/1251]  eta: 0:00:33  lr: 0.000016  min_lr: 0.000016  loss: 2.6599 (2.4829)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2386 (1.1983)  time: 0.6350  data: 0.0006  max mem: 54644
Epoch: [288]  [1250/1251]  eta: 0:00:00  lr: 0.000016  min_lr: 0.000016  loss: 2.3949 (2.4793)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1258 (1.1979)  time: 0.5389  data: 0.0006  max mem: 54644
Epoch: [288] Total time: 0:13:28 (0.6462 s / it)
Averaged stats: lr: 0.000016  min_lr: 0.000016  loss: 2.3949 (2.4876)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1258 (1.1979)
Test:  [ 0/25]  eta: 0:05:11  loss: 0.5192 (0.5192)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 12.4432  data: 7.8615  max mem: 54644
Test:  [10/25]  eta: 0:00:21  loss: 0.6391 (0.6456)  acc1: 87.6000 (87.6000)  acc5: 98.0000 (97.8546)  time: 1.4025  data: 0.7149  max mem: 54644
Test:  [20/25]  eta: 0:00:04  loss: 0.7954 (0.7612)  acc1: 83.6000 (84.8571)  acc5: 96.8000 (96.9524)  time: 0.2983  data: 0.0001  max mem: 54644
Test:  [24/25]  eta: 0:00:00  loss: 0.8454 (0.7743)  acc1: 82.8000 (84.5440)  acc5: 96.8000 (96.8960)  time: 0.2982  data: 0.0001  max mem: 54644
Test: Total time: 0:00:19 (0.7875 s / it)
* Acc@1 85.014 Acc@5 97.064 loss 0.768
Accuracy of the model on the 50000 test images: 85.0%
Max accuracy: 85.01%
Epoch: [289]  [   0/1251]  eta: 1:12:20  lr: 0.000016  min_lr: 0.000016  loss: 1.9433 (1.9433)  weight_decay: 0.0500 (0.0500)  time: 3.4697  data: 2.6177  max mem: 54644
Epoch: [289]  [ 200/1251]  eta: 0:11:15  lr: 0.000016  min_lr: 0.000016  loss: 2.4743 (2.4715)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1122 (1.2016)  time: 0.6284  data: 0.0005  max mem: 54644
Epoch: [289]  [ 400/1251]  eta: 0:09:00  lr: 0.000015  min_lr: 0.000015  loss: 2.5431 (2.4758)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0091 (1.1337)  time: 0.6282  data: 0.0005  max mem: 54644
Epoch: [289]  [ 600/1251]  eta: 0:06:52  lr: 0.000015  min_lr: 0.000015  loss: 2.5160 (2.4826)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1975 (1.1558)  time: 0.6286  data: 0.0005  max mem: 54644
Epoch: [289]  [ 800/1251]  eta: 0:04:45  lr: 0.000014  min_lr: 0.000014  loss: 2.5361 (2.4920)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0032 (1.1547)  time: 0.6279  data: 0.0005  max mem: 54644
Epoch: [289]  [1000/1251]  eta: 0:02:38  lr: 0.000014  min_lr: 0.000014  loss: 2.5105 (2.4865)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1057 (1.1507)  time: 0.6281  data: 0.0005  max mem: 54644
Epoch: [289]  [1200/1251]  eta: 0:00:32  lr: 0.000014  min_lr: 0.000014  loss: 2.4492 (2.4793)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0338 (1.1367)  time: 0.6282  data: 0.0005  max mem: 54644
Epoch: [289]  [1250/1251]  eta: 0:00:00  lr: 0.000014  min_lr: 0.000014  loss: 2.7196 (2.4837)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1702 (1.1436)  time: 0.5331  data: 0.0006  max mem: 54644
Epoch: [289] Total time: 0:13:08 (0.6303 s / it)
Averaged stats: lr: 0.000014  min_lr: 0.000014  loss: 2.7196 (2.4875)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1702 (1.1436)
Test:  [ 0/25]  eta: 0:02:46  loss: 0.6698 (0.6698)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 6.6596  data: 6.3324  max mem: 54644
Test:  [10/25]  eta: 0:00:13  loss: 0.7987 (0.7989)  acc1: 86.8000 (87.4182)  acc5: 97.6000 (97.8182)  time: 0.8785  data: 0.5759  max mem: 54644
Test:  [20/25]  eta: 0:00:03  loss: 0.9613 (0.9163)  acc1: 83.2000 (84.7238)  acc5: 96.8000 (96.9714)  time: 0.3004  data: 0.0002  max mem: 54644
Test:  [24/25]  eta: 0:00:00  loss: 0.9916 (0.9292)  acc1: 82.4000 (84.3200)  acc5: 96.4000 (96.9280)  time: 0.3004  data: 0.0001  max mem: 54644
Test: Total time: 0:00:13 (0.5585 s / it)
* Acc@1 84.912 Acc@5 97.018 loss 0.921
Accuracy of the model on the 50000 test images: 84.9%
Max accuracy: 85.01%
Epoch: [290]  [   0/1251]  eta: 1:26:28  lr: 0.000014  min_lr: 0.000014  loss: 2.9622 (2.9622)  weight_decay: 0.0500 (0.0500)  time: 4.1476  data: 2.0345  max mem: 54644
Epoch: [290]  [ 200/1251]  eta: 0:11:19  lr: 0.000013  min_lr: 0.000013  loss: 2.6048 (2.5025)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9464 (1.1109)  time: 0.6294  data: 0.0005  max mem: 54644
Epoch: [290]  [ 400/1251]  eta: 0:09:02  lr: 0.000013  min_lr: 0.000013  loss: 2.5419 (2.4987)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0662 (1.0780)  time: 0.6287  data: 0.0004  max mem: 54644
Epoch: [290]  [ 600/1251]  eta: 0:06:53  lr: 0.000012  min_lr: 0.000012  loss: 2.2058 (2.4638)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0776 (1.1195)  time: 0.6281  data: 0.0005  max mem: 54644
Epoch: [290]  [ 800/1251]  eta: 0:04:45  lr: 0.000012  min_lr: 0.000012  loss: 2.5735 (2.4532)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1357 (1.1252)  time: 0.6280  data: 0.0005  max mem: 54644
Epoch: [290]  [1000/1251]  eta: 0:02:38  lr: 0.000012  min_lr: 0.000012  loss: 2.4910 (2.4577)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0922 (1.1370)  time: 0.6281  data: 0.0005  max mem: 54644
Epoch: [290]  [1200/1251]  eta: 0:00:32  lr: 0.000011  min_lr: 0.000011  loss: 2.5515 (2.4669)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0338 (1.1234)  time: 0.6284  data: 0.0005  max mem: 54644
Epoch: [290]  [1250/1251]  eta: 0:00:00  lr: 0.000011  min_lr: 0.000011  loss: 2.4779 (2.4657)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0335 (1.1212)  time: 0.5332  data: 0.0007  max mem: 54644
Epoch: [290] Total time: 0:13:09 (0.6308 s / it)
Averaged stats: lr: 0.000011  min_lr: 0.000011  loss: 2.4779 (2.4848)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0335 (1.1212)
Test:  [ 0/25]  eta: 0:02:47  loss: 0.5353 (0.5353)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 6.7027  data: 6.3861  max mem: 54644
Test:  [10/25]  eta: 0:00:13  loss: 0.6615 (0.6662)  acc1: 87.6000 (87.5636)  acc5: 98.0000 (97.8909)  time: 0.8816  data: 0.5808  max mem: 54644
Test:  [20/25]  eta: 0:00:03  loss: 0.8107 (0.7814)  acc1: 83.6000 (84.8762)  acc5: 96.8000 (96.9714)  time: 0.2991  data: 0.0002  max mem: 54644
Test:  [24/25]  eta: 0:00:00  loss: 0.8581 (0.7945)  acc1: 83.6000 (84.6240)  acc5: 96.8000 (96.9120)  time: 0.2988  data: 0.0001  max mem: 54644
Test: Total time: 0:00:14 (0.5601 s / it)
* Acc@1 85.006 Acc@5 97.044 loss 0.788
Accuracy of the model on the 50000 test images: 85.0%
Max accuracy: 85.01%
Epoch: [291]  [   0/1251]  eta: 1:24:31  lr: 0.000011  min_lr: 0.000011  loss: 2.7821 (2.7821)  weight_decay: 0.0500 (0.0500)  time: 4.0543  data: 2.8048  max mem: 54644
Epoch: [291]  [ 200/1251]  eta: 0:11:18  lr: 0.000011  min_lr: 0.000011  loss: 2.4640 (2.4869)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1059 (1.2075)  time: 0.6278  data: 0.0005  max mem: 54644
Epoch: [291]  [ 400/1251]  eta: 0:09:02  lr: 0.000010  min_lr: 0.000010  loss: 2.6378 (2.4743)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3142 (1.2425)  time: 0.6279  data: 0.0005  max mem: 54644
Epoch: [291]  [ 600/1251]  eta: 0:06:52  lr: 0.000010  min_lr: 0.000010  loss: 2.3638 (2.4731)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1152 (1.2082)  time: 0.6286  data: 0.0004  max mem: 54644
Epoch: [291]  [ 800/1251]  eta: 0:04:45  lr: 0.000010  min_lr: 0.000010  loss: 2.3730 (2.4670)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1380 (1.1929)  time: 0.6292  data: 0.0005  max mem: 54644
Epoch: [291]  [1000/1251]  eta: 0:02:38  lr: 0.000009  min_lr: 0.000009  loss: 2.6545 (2.4760)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0579 (1.1871)  time: 0.6286  data: 0.0004  max mem: 54644
Epoch: [291]  [1200/1251]  eta: 0:00:32  lr: 0.000009  min_lr: 0.000009  loss: 2.5626 (2.4767)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0351 (1.1661)  time: 0.6290  data: 0.0005  max mem: 54644
Epoch: [291]  [1250/1251]  eta: 0:00:00  lr: 0.000009  min_lr: 0.000009  loss: 2.7161 (2.4781)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0127 (1.1629)  time: 0.5335  data: 0.0006  max mem: 54644
Epoch: [291] Total time: 0:13:09 (0.6312 s / it)
Averaged stats: lr: 0.000009  min_lr: 0.000009  loss: 2.7161 (2.4881)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0127 (1.1629)
Test:  [ 0/25]  eta: 0:02:43  loss: 0.5918 (0.5918)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 6.5250  data: 6.1989  max mem: 54644
Test:  [10/25]  eta: 0:00:12  loss: 0.7174 (0.7249)  acc1: 86.8000 (87.4546)  acc5: 98.0000 (97.7455)  time: 0.8660  data: 0.5639  max mem: 54644
Test:  [20/25]  eta: 0:00:02  loss: 0.8775 (0.8405)  acc1: 83.6000 (84.7810)  acc5: 96.8000 (96.9333)  time: 0.3003  data: 0.0002  max mem: 54644
Test:  [24/25]  eta: 0:00:00  loss: 0.9246 (0.8538)  acc1: 83.2000 (84.5120)  acc5: 96.8000 (96.8480)  time: 0.3003  data: 0.0001  max mem: 54644
Test: Total time: 0:00:13 (0.5528 s / it)
* Acc@1 84.984 Acc@5 97.054 loss 0.847
Accuracy of the model on the 50000 test images: 85.0%
Max accuracy: 85.01%
Epoch: [292]  [   0/1251]  eta: 1:23:17  lr: 0.000009  min_lr: 0.000009  loss: 2.8122 (2.8122)  weight_decay: 0.0500 (0.0500)  time: 3.9949  data: 2.8156  max mem: 54644
Epoch: [292]  [ 200/1251]  eta: 0:11:19  lr: 0.000009  min_lr: 0.000009  loss: 2.5059 (2.5194)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1135 (1.1659)  time: 0.6386  data: 0.0005  max mem: 54644
Epoch: [292]  [ 400/1251]  eta: 0:09:03  lr: 0.000008  min_lr: 0.000008  loss: 2.5551 (2.5131)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0411 (1.1882)  time: 0.6284  data: 0.0004  max mem: 54644
Epoch: [292]  [ 600/1251]  eta: 0:06:53  lr: 0.000008  min_lr: 0.000008  loss: 2.6036 (2.5050)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1718 (1.1697)  time: 0.6286  data: 0.0004  max mem: 54644
Epoch: [292]  [ 800/1251]  eta: 0:04:45  lr: 0.000008  min_lr: 0.000008  loss: 2.6067 (2.5002)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1717 (1.2048)  time: 0.6283  data: 0.0005  max mem: 54644
Epoch: [292]  [1000/1251]  eta: 0:02:38  lr: 0.000008  min_lr: 0.000008  loss: 2.5971 (2.4912)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0119 (1.1990)  time: 0.6281  data: 0.0005  max mem: 54644
Epoch: [292]  [1200/1251]  eta: 0:00:32  lr: 0.000007  min_lr: 0.000007  loss: 2.4870 (2.4919)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2176 (1.1978)  time: 0.6282  data: 0.0005  max mem: 54644
Epoch: [292]  [1250/1251]  eta: 0:00:00  lr: 0.000007  min_lr: 0.000007  loss: 2.5107 (2.4875)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0859 (1.2010)  time: 0.5334  data: 0.0007  max mem: 54644
Epoch: [292] Total time: 0:13:09 (0.6308 s / it)
Averaged stats: lr: 0.000007  min_lr: 0.000007  loss: 2.5107 (2.4887)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0859 (1.2010)
Test:  [ 0/25]  eta: 0:02:43  loss: 0.5569 (0.5569)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 6.5329  data: 6.1944  max mem: 54644
Test:  [10/25]  eta: 0:00:13  loss: 0.6804 (0.6867)  acc1: 87.2000 (87.4909)  acc5: 98.0000 (97.8909)  time: 0.8669  data: 0.5634  max mem: 54644
Test:  [20/25]  eta: 0:00:02  loss: 0.8357 (0.8019)  acc1: 83.2000 (84.7810)  acc5: 96.8000 (96.9714)  time: 0.3005  data: 0.0002  max mem: 54644
Test:  [24/25]  eta: 0:00:00  loss: 0.8807 (0.8149)  acc1: 83.2000 (84.5280)  acc5: 96.8000 (96.9120)  time: 0.3005  data: 0.0001  max mem: 54644
Test: Total time: 0:00:13 (0.5535 s / it)
* Acc@1 84.996 Acc@5 97.046 loss 0.809
Accuracy of the model on the 50000 test images: 85.0%
Max accuracy: 85.01%
Epoch: [293]  [   0/1251]  eta: 1:27:12  lr: 0.000007  min_lr: 0.000007  loss: 1.9636 (1.9636)  weight_decay: 0.0500 (0.0500)  time: 4.1830  data: 3.4872  max mem: 54644
Epoch: [293]  [ 200/1251]  eta: 0:11:21  lr: 0.000007  min_lr: 0.000007  loss: 2.5537 (2.5496)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9701 (1.2104)  time: 0.6285  data: 0.0005  max mem: 54644
Epoch: [293]  [ 400/1251]  eta: 0:09:03  lr: 0.000007  min_lr: 0.000007  loss: 2.6112 (2.5073)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1023 (1.1975)  time: 0.6284  data: 0.0005  max mem: 54644
Epoch: [293]  [ 600/1251]  eta: 0:06:53  lr: 0.000006  min_lr: 0.000006  loss: 2.4862 (2.4983)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0409 (1.2238)  time: 0.6294  data: 0.0005  max mem: 54644
Epoch: [293]  [ 800/1251]  eta: 0:04:45  lr: 0.000006  min_lr: 0.000006  loss: 2.4535 (2.4938)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1231 (1.2178)  time: 0.6286  data: 0.0005  max mem: 54644
Epoch: [293]  [1000/1251]  eta: 0:02:38  lr: 0.000006  min_lr: 0.000006  loss: 2.7120 (2.4989)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0334 (1.2133)  time: 0.6289  data: 0.0005  max mem: 54644
Epoch: [293]  [1200/1251]  eta: 0:00:32  lr: 0.000006  min_lr: 0.000006  loss: 2.6711 (2.5015)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2275 (1.2019)  time: 0.6296  data: 0.0004  max mem: 54644
Epoch: [293]  [1250/1251]  eta: 0:00:00  lr: 0.000006  min_lr: 0.000006  loss: 2.2474 (2.4972)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1978 (1.2019)  time: 0.5396  data: 0.0006  max mem: 54644
Epoch: [293] Total time: 0:13:09 (0.6314 s / it)
Averaged stats: lr: 0.000006  min_lr: 0.000006  loss: 2.2474 (2.4830)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1978 (1.2019)
Test:  [ 0/25]  eta: 0:02:38  loss: 0.5282 (0.5282)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 6.3389  data: 6.0151  max mem: 54644
Test:  [10/25]  eta: 0:00:12  loss: 0.6530 (0.6588)  acc1: 87.6000 (87.7818)  acc5: 98.0000 (98.0000)  time: 0.8545  data: 0.5525  max mem: 54644
Test:  [20/25]  eta: 0:00:02  loss: 0.8125 (0.7758)  acc1: 83.2000 (84.8762)  acc5: 96.8000 (96.9714)  time: 0.3032  data: 0.0031  max mem: 54644
Test:  [24/25]  eta: 0:00:00  loss: 0.8567 (0.7889)  acc1: 83.2000 (84.6080)  acc5: 96.4000 (96.8960)  time: 0.3004  data: 0.0002  max mem: 54644
Test: Total time: 0:00:13 (0.5500 s / it)
* Acc@1 84.994 Acc@5 97.084 loss 0.783
Accuracy of the model on the 50000 test images: 85.0%
Max accuracy: 85.01%
Epoch: [294]  [   0/1251]  eta: 1:27:54  lr: 0.000006  min_lr: 0.000006  loss: 1.5883 (1.5883)  weight_decay: 0.0500 (0.0500)  time: 4.2163  data: 2.7918  max mem: 54644
Epoch: [294]  [ 200/1251]  eta: 0:11:18  lr: 0.000005  min_lr: 0.000005  loss: 2.5446 (2.5119)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1785 (1.1874)  time: 0.6284  data: 0.0004  max mem: 54644
Epoch: [294]  [ 400/1251]  eta: 0:09:02  lr: 0.000005  min_lr: 0.000005  loss: 2.5611 (2.4849)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0113 (1.1373)  time: 0.6284  data: 0.0005  max mem: 54644
Epoch: [294]  [ 600/1251]  eta: 0:06:53  lr: 0.000005  min_lr: 0.000005  loss: 2.3607 (2.4891)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0887 (1.1577)  time: 0.6282  data: 0.0005  max mem: 54644
Epoch: [294]  [ 800/1251]  eta: 0:04:45  lr: 0.000005  min_lr: 0.000005  loss: 2.5828 (2.4941)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0856 (1.1516)  time: 0.6283  data: 0.0005  max mem: 54644
Epoch: [294]  [1000/1251]  eta: 0:02:38  lr: 0.000004  min_lr: 0.000004  loss: 2.4465 (2.4924)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0484 (1.1473)  time: 0.6359  data: 0.0004  max mem: 54644
Epoch: [294]  [1200/1251]  eta: 0:00:32  lr: 0.000004  min_lr: 0.000004  loss: 2.7150 (2.4985)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9687 (1.1326)  time: 0.6281  data: 0.0004  max mem: 54644
Epoch: [294]  [1250/1251]  eta: 0:00:00  lr: 0.000004  min_lr: 0.000004  loss: 2.6564 (2.4964)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1166 (1.1361)  time: 0.5332  data: 0.0005  max mem: 54644
Epoch: [294] Total time: 0:13:09 (0.6307 s / it)
Averaged stats: lr: 0.000004  min_lr: 0.000004  loss: 2.6564 (2.4865)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1166 (1.1361)
Test:  [ 0/25]  eta: 0:02:44  loss: 0.5514 (0.5514)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 6.5832  data: 6.2523  max mem: 54644
Test:  [10/25]  eta: 0:00:13  loss: 0.6785 (0.6833)  acc1: 87.2000 (87.5273)  acc5: 98.0000 (97.9273)  time: 0.8714  data: 0.5687  max mem: 54644
Test:  [20/25]  eta: 0:00:02  loss: 0.8296 (0.7976)  acc1: 84.4000 (84.8762)  acc5: 96.8000 (97.0667)  time: 0.3004  data: 0.0002  max mem: 54644
Test:  [24/25]  eta: 0:00:00  loss: 0.8752 (0.8110)  acc1: 83.2000 (84.5760)  acc5: 96.4000 (96.9760)  time: 0.3005  data: 0.0001  max mem: 54644
Test: Total time: 0:00:13 (0.5559 s / it)
* Acc@1 84.970 Acc@5 97.064 loss 0.805
Accuracy of the model on the 50000 test images: 85.0%
Max accuracy: 85.01%
Epoch: [295]  [   0/1251]  eta: 1:29:51  lr: 0.000004  min_lr: 0.000004  loss: 1.5374 (1.5374)  weight_decay: 0.0500 (0.0500)  time: 4.3094  data: 3.1129  max mem: 54644
Epoch: [295]  [ 200/1251]  eta: 0:11:20  lr: 0.000004  min_lr: 0.000004  loss: 2.7414 (2.5312)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0817 (1.1283)  time: 0.6292  data: 0.0004  max mem: 54644
Epoch: [295]  [ 400/1251]  eta: 0:09:03  lr: 0.000004  min_lr: 0.000004  loss: 2.5957 (2.4992)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0707 (1.1398)  time: 0.6279  data: 0.0005  max mem: 54644
Epoch: [295]  [ 600/1251]  eta: 0:06:53  lr: 0.000004  min_lr: 0.000004  loss: 2.5385 (2.4898)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0478 (1.1086)  time: 0.6285  data: 0.0004  max mem: 54644
Epoch: [295]  [ 800/1251]  eta: 0:04:45  lr: 0.000003  min_lr: 0.000003  loss: 2.6075 (2.4801)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9831 (1.1049)  time: 0.6426  data: 0.0005  max mem: 54644
Epoch: [295]  [1000/1251]  eta: 0:02:38  lr: 0.000003  min_lr: 0.000003  loss: 2.3939 (2.4770)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0078 (1.1171)  time: 0.6284  data: 0.0004  max mem: 54644
Epoch: [295]  [1200/1251]  eta: 0:00:32  lr: 0.000003  min_lr: 0.000003  loss: 2.6011 (2.4733)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1360 (1.1135)  time: 0.6285  data: 0.0004  max mem: 54644
Epoch: [295]  [1250/1251]  eta: 0:00:00  lr: 0.000003  min_lr: 0.000003  loss: 2.5912 (2.4729)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1418 (1.1166)  time: 0.5335  data: 0.0005  max mem: 54644
Epoch: [295] Total time: 0:13:09 (0.6310 s / it)
Averaged stats: lr: 0.000003  min_lr: 0.000003  loss: 2.5912 (2.4859)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1418 (1.1166)
Test:  [ 0/25]  eta: 0:02:50  loss: 0.5857 (0.5857)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 6.8323  data: 6.5049  max mem: 54644
Test:  [10/25]  eta: 0:00:13  loss: 0.7101 (0.7175)  acc1: 87.2000 (87.4909)  acc5: 98.0000 (97.8546)  time: 0.8941  data: 0.5916  max mem: 54644
Test:  [20/25]  eta: 0:00:03  loss: 0.8656 (0.8325)  acc1: 83.6000 (84.8000)  acc5: 96.8000 (96.9143)  time: 0.3004  data: 0.0002  max mem: 54644
Test:  [24/25]  eta: 0:00:00  loss: 0.9127 (0.8459)  acc1: 83.2000 (84.4800)  acc5: 96.4000 (96.8480)  time: 0.3003  data: 0.0001  max mem: 54644
Test: Total time: 0:00:14 (0.5660 s / it)
* Acc@1 84.978 Acc@5 97.048 loss 0.838
Accuracy of the model on the 50000 test images: 85.0%
Max accuracy: 85.01%
Epoch: [296]  [   0/1251]  eta: 1:27:37  lr: 0.000003  min_lr: 0.000003  loss: 2.5755 (2.5755)  weight_decay: 0.0500 (0.0500)  time: 4.2025  data: 3.5171  max mem: 54644
Epoch: [296]  [ 200/1251]  eta: 0:11:21  lr: 0.000003  min_lr: 0.000003  loss: 2.5595 (2.4863)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9849 (1.0868)  time: 0.6308  data: 0.0004  max mem: 54644
Epoch: [296]  [ 400/1251]  eta: 0:09:03  lr: 0.000003  min_lr: 0.000003  loss: 2.4888 (2.4786)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.6282  data: 0.0004  max mem: 54644
Epoch: [296]  [ 600/1251]  eta: 0:06:53  lr: 0.000003  min_lr: 0.000003  loss: 2.6372 (2.4944)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0534 (nan)  time: 0.6288  data: 0.0004  max mem: 54644
Epoch: [296]  [ 800/1251]  eta: 0:04:45  lr: 0.000002  min_lr: 0.000002  loss: 2.5545 (2.4921)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0726 (nan)  time: 0.6293  data: 0.0004  max mem: 54644
Epoch: [296]  [1000/1251]  eta: 0:02:38  lr: 0.000002  min_lr: 0.000002  loss: 2.3601 (2.4898)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2358 (nan)  time: 0.6286  data: 0.0004  max mem: 54644
Epoch: [296]  [1200/1251]  eta: 0:00:32  lr: 0.000002  min_lr: 0.000002  loss: 2.6336 (2.4943)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0508 (nan)  time: 0.6424  data: 0.0004  max mem: 54644
Epoch: [296]  [1250/1251]  eta: 0:00:00  lr: 0.000002  min_lr: 0.000002  loss: 2.4501 (2.4934)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0789 (nan)  time: 0.5332  data: 0.0006  max mem: 54644
Epoch: [296] Total time: 0:13:09 (0.6314 s / it)
Averaged stats: lr: 0.000002  min_lr: 0.000002  loss: 2.4501 (2.4854)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0789 (nan)
Test:  [ 0/25]  eta: 0:02:47  loss: 0.5468 (0.5468)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 6.7127  data: 6.3932  max mem: 54644
Test:  [10/25]  eta: 0:00:13  loss: 0.6723 (0.6780)  acc1: 86.8000 (87.4909)  acc5: 98.0000 (98.0364)  time: 0.8821  data: 0.5815  max mem: 54644
Test:  [20/25]  eta: 0:00:03  loss: 0.8314 (0.7937)  acc1: 83.6000 (84.7619)  acc5: 96.8000 (97.1048)  time: 0.2988  data: 0.0002  max mem: 54644
Test:  [24/25]  eta: 0:00:00  loss: 0.8745 (0.8072)  acc1: 82.8000 (84.3680)  acc5: 96.4000 (97.0400)  time: 0.2988  data: 0.0001  max mem: 54644
Test: Total time: 0:00:13 (0.5596 s / it)
* Acc@1 84.932 Acc@5 97.068 loss 0.800
Accuracy of the model on the 50000 test images: 84.9%
Max accuracy: 85.01%
Epoch: [297]  [   0/1251]  eta: 1:25:38  lr: 0.000002  min_lr: 0.000002  loss: 1.7790 (1.7790)  weight_decay: 0.0500 (0.0500)  time: 4.1073  data: 2.7568  max mem: 54644
Epoch: [297]  [ 200/1251]  eta: 0:11:18  lr: 0.000002  min_lr: 0.000002  loss: 2.5609 (2.4931)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0644 (1.2436)  time: 0.6290  data: 0.0004  max mem: 54644
Epoch: [297]  [ 400/1251]  eta: 0:09:02  lr: 0.000002  min_lr: 0.000002  loss: 2.4723 (2.4907)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9295 (1.1867)  time: 0.6295  data: 0.0005  max mem: 54644
Epoch: [297]  [ 600/1251]  eta: 0:06:53  lr: 0.000002  min_lr: 0.000002  loss: 2.4784 (2.4929)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1737 (1.1887)  time: 0.6283  data: 0.0004  max mem: 54644
Epoch: [297]  [ 800/1251]  eta: 0:04:45  lr: 0.000002  min_lr: 0.000002  loss: 2.5387 (2.4888)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0807 (1.1788)  time: 0.6285  data: 0.0004  max mem: 54644
Epoch: [297]  [1000/1251]  eta: 0:02:38  lr: 0.000002  min_lr: 0.000002  loss: 2.6282 (2.4834)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0700 (1.1628)  time: 0.6287  data: 0.0004  max mem: 54644
Epoch: [297]  [1200/1251]  eta: 0:00:32  lr: 0.000002  min_lr: 0.000002  loss: 2.5737 (2.4759)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0807 (1.1578)  time: 0.6290  data: 0.0004  max mem: 54644
Epoch: [297]  [1250/1251]  eta: 0:00:00  lr: 0.000002  min_lr: 0.000002  loss: 2.5705 (2.4774)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0146 (1.1559)  time: 0.5333  data: 0.0005  max mem: 54644
Epoch: [297] Total time: 0:13:09 (0.6311 s / it)
Averaged stats: lr: 0.000002  min_lr: 0.000002  loss: 2.5705 (2.4892)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0146 (1.1559)
Test:  [ 0/25]  eta: 0:02:38  loss: 0.6103 (0.6103)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 6.3547  data: 6.0030  max mem: 54644
Test:  [10/25]  eta: 0:00:12  loss: 0.7358 (0.7417)  acc1: 87.6000 (87.3818)  acc5: 98.0000 (97.8546)  time: 0.8508  data: 0.5461  max mem: 54644
Test:  [20/25]  eta: 0:00:02  loss: 0.8990 (0.8569)  acc1: 83.2000 (84.6476)  acc5: 96.8000 (97.0476)  time: 0.3005  data: 0.0002  max mem: 54644
Test:  [24/25]  eta: 0:00:00  loss: 0.9340 (0.8702)  acc1: 82.8000 (84.3200)  acc5: 96.8000 (96.9600)  time: 0.3006  data: 0.0001  max mem: 54644
Test: Total time: 0:00:13 (0.5464 s / it)
* Acc@1 84.948 Acc@5 97.092 loss 0.863
Accuracy of the model on the 50000 test images: 84.9%
Max accuracy: 85.01%
Epoch: [298]  [   0/1251]  eta: 1:20:56  lr: 0.000002  min_lr: 0.000002  loss: 2.8572 (2.8572)  weight_decay: 0.0500 (0.0500)  time: 3.8820  data: 3.1247  max mem: 54644
Epoch: [298]  [ 200/1251]  eta: 0:11:18  lr: 0.000001  min_lr: 0.000001  loss: 2.6966 (2.4690)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0694 (1.1688)  time: 0.6282  data: 0.0005  max mem: 54644
Epoch: [298]  [ 400/1251]  eta: 0:09:02  lr: 0.000001  min_lr: 0.000001  loss: 2.6139 (2.4964)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0786 (1.1783)  time: 0.6286  data: 0.0005  max mem: 54644
Epoch: [298]  [ 600/1251]  eta: 0:06:53  lr: 0.000001  min_lr: 0.000001  loss: 2.5508 (2.4953)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0805 (1.1638)  time: 0.6298  data: 0.0005  max mem: 54644
Epoch: [298]  [ 800/1251]  eta: 0:04:45  lr: 0.000001  min_lr: 0.000001  loss: 2.6276 (2.5011)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9693 (1.1421)  time: 0.6302  data: 0.0005  max mem: 54644
Epoch: [298]  [1000/1251]  eta: 0:02:39  lr: 0.000001  min_lr: 0.000001  loss: 2.6290 (2.4951)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0785 (1.1509)  time: 0.6297  data: 0.0005  max mem: 54644
Epoch: [298]  [1200/1251]  eta: 0:00:32  lr: 0.000001  min_lr: 0.000001  loss: 2.3648 (2.4945)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2214 (1.1486)  time: 0.6343  data: 0.0005  max mem: 54644
Epoch: [298]  [1250/1251]  eta: 0:00:00  lr: 0.000001  min_lr: 0.000001  loss: 2.3929 (2.4942)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0798 (1.1477)  time: 0.5335  data: 0.0005  max mem: 54644
Epoch: [298] Total time: 0:13:10 (0.6319 s / it)
Averaged stats: lr: 0.000001  min_lr: 0.000001  loss: 2.3929 (2.4837)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0798 (1.1477)
Test:  [ 0/25]  eta: 0:02:45  loss: 0.5492 (0.5492)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 6.6176  data: 6.2951  max mem: 54644
Test:  [10/25]  eta: 0:00:13  loss: 0.6762 (0.6801)  acc1: 87.2000 (87.5636)  acc5: 98.0000 (97.9273)  time: 0.8744  data: 0.5725  max mem: 54644
Test:  [20/25]  eta: 0:00:03  loss: 0.8304 (0.7947)  acc1: 83.6000 (84.8191)  acc5: 96.8000 (97.0286)  time: 0.3002  data: 0.0002  max mem: 54644
Test:  [24/25]  eta: 0:00:00  loss: 0.8784 (0.8085)  acc1: 82.8000 (84.5440)  acc5: 96.8000 (96.9280)  time: 0.3002  data: 0.0001  max mem: 54644
Test: Total time: 0:00:13 (0.5570 s / it)
* Acc@1 84.990 Acc@5 97.058 loss 0.802
Accuracy of the model on the 50000 test images: 85.0%
Max accuracy: 85.01%
Epoch: [299]  [   0/1251]  eta: 1:24:48  lr: 0.000001  min_lr: 0.000001  loss: 2.7277 (2.7277)  weight_decay: 0.0500 (0.0500)  time: 4.0673  data: 3.3657  max mem: 54644
Epoch: [299]  [ 200/1251]  eta: 0:11:18  lr: 0.000001  min_lr: 0.000001  loss: 2.7637 (2.4684)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0423 (1.2895)  time: 0.6277  data: 0.0005  max mem: 54644
Epoch: [299]  [ 400/1251]  eta: 0:09:01  lr: 0.000001  min_lr: 0.000001  loss: 2.5306 (2.4852)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9152 (1.2297)  time: 0.6276  data: 0.0005  max mem: 54644
Epoch: [299]  [ 600/1251]  eta: 0:06:52  lr: 0.000001  min_lr: 0.000001  loss: 2.4441 (2.4987)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2475 (1.2251)  time: 0.6281  data: 0.0005  max mem: 54644
Epoch: [299]  [ 800/1251]  eta: 0:04:45  lr: 0.000001  min_lr: 0.000001  loss: 2.5943 (2.4921)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0714 (1.2032)  time: 0.6283  data: 0.0005  max mem: 54644
Epoch: [299]  [1000/1251]  eta: 0:02:38  lr: 0.000001  min_lr: 0.000001  loss: 2.6644 (2.4999)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0324 (1.2123)  time: 0.6328  data: 0.0005  max mem: 54644
Epoch: [299]  [1200/1251]  eta: 0:00:32  lr: 0.000001  min_lr: 0.000001  loss: 2.4230 (2.4951)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0393 (1.2157)  time: 0.6279  data: 0.0005  max mem: 54644
Epoch: [299]  [1250/1251]  eta: 0:00:00  lr: 0.000001  min_lr: 0.000001  loss: 2.6306 (2.4964)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9913 (1.2090)  time: 0.5381  data: 0.0007  max mem: 54644
Epoch: [299] Total time: 0:13:08 (0.6303 s / it)
Averaged stats: lr: 0.000001  min_lr: 0.000001  loss: 2.6306 (2.4834)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9913 (1.2090)
Test:  [ 0/25]  eta: 0:02:37  loss: 0.6251 (0.6251)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 6.3013  data: 5.9726  max mem: 54644
Test:  [10/25]  eta: 0:00:12  loss: 0.7527 (0.7560)  acc1: 86.8000 (87.3455)  acc5: 97.6000 (97.8546)  time: 0.8463  data: 0.5434  max mem: 54644
Test:  [20/25]  eta: 0:00:02  loss: 0.9078 (0.8710)  acc1: 82.8000 (84.5524)  acc5: 96.4000 (96.8952)  time: 0.3007  data: 0.0003  max mem: 54644
Test:  [24/25]  eta: 0:00:00  loss: 0.9482 (0.8840)  acc1: 82.4000 (84.2560)  acc5: 96.4000 (96.8480)  time: 0.3007  data: 0.0001  max mem: 54644
Test: Total time: 0:00:13 (0.5465 s / it)
* Acc@1 84.882 Acc@5 96.962 loss 0.877
Accuracy of the model on the 50000 test images: 84.9%
Max accuracy: 85.01%
Training time 2:41:28
