| distributed init (rank 0): env://, gpu 0
| distributed init (rank 5): env://, gpu 5
| distributed init (rank 1): env://, gpu 1
| distributed init (rank 2): env://, gpu 2
| distributed init (rank 7): env://, gpu 7
| distributed init (rank 4): env://, gpu 4
| distributed init (rank 6): env://, gpu 6
| distributed init (rank 3): env://, gpu 3
Namespace(batch_size=128, epochs=300, update_freq=4, model='base', drop_path=0, input_size=224, layer_scale_init_value=1e-06, model_ema=False, model_ema_decay=0.9999, model_ema_force_cpu=False, model_ema_eval=False, opt='adamw', opt_eps=1e-08, opt_betas=None, clip_grad=5.0, momentum=0.9, weight_decay=0.05, weight_decay_end=None, lr=0.004, layer_decay=1.0, min_lr=1e-06, warmup_epochs=20, warmup_steps=-1, color_jitter=0.4, aa='rand-m9-mstd0.5-inc1', smoothing=0.1, train_interpolation='bicubic', crop_pct=None, reprob=0.25, remode='pixel', recount=1, resplit=False, mixup=0.8, cutmix=1.0, cutmix_minmax=None, mixup_prob=1.0, mixup_switch_prob=0.5, mixup_mode='batch', finetune='', head_init_scale=1.0, model_key='model|module', model_prefix='', data_path='/dev/shm/imagenet', eval_data_path=None, nb_classes=1000, imagenet_default_mean_and_std=True, data_set='IMNET', output_dir='./checkpoint_base_8.7G', log_dir=None, device='cuda', seed=0, resume='', auto_resume=True, save_ckpt=True, save_ckpt_freq=1, save_ckpt_num=3, start_epoch=0, eval=False, dist_eval=True, disable_eval=False, num_workers=10, pin_mem=True, world_size=8, local_rank=-1, dist_on_itp=False, dist_url='env://', use_amp=True, enable_wandb=False, project='convnext', wandb_ckpt=False, rank=0, gpu=0, distributed=True, dist_backend='nccl')
Transform = 
RandomResizedCropAndInterpolation(size=(224, 224), scale=(0.08, 1.0), ratio=(0.75, 1.3333), interpolation=bicubic)
RandomHorizontalFlip(p=0.5)
RandAugment(n=2, ops=
	AugmentOp(name=AutoContrast, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=Equalize, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=Invert, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=Rotate, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=PosterizeIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=SolarizeIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=SolarizeAdd, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ColorIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ContrastIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=BrightnessIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=SharpnessIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ShearX, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ShearY, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=TranslateXRel, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=TranslateYRel, p=0.5, m=9, mstd=0.5))
ToTensor()
Normalize(mean=tensor([0.4850, 0.4560, 0.4060]), std=tensor([0.2290, 0.2240, 0.2250]))
RandomErasing(p=0.25, mode=pixel, count=(1, 1))
---------------------------
reading from datapath /dev/shm/imagenet
Number of the class = 1000
Transform = 
Resize(size=256, interpolation=bicubic, max_size=None, antialias=True)
CenterCrop(size=(224, 224))
ToTensor()
Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
---------------------------
reading from datapath /dev/shm/imagenet
Number of the class = 1000
Sampler_train = <torch.utils.data.distributed.DistributedSampler object at 0x7f9ef9f52b90>
Mixup is activated!
Model = RaCNN(
  (first_conv): ConvX(
    (conv): Conv2d(3, 48, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): Act(
      (act): GELU(approximate='none')
    )
  )
  (layer1): Sequential(
    (0): DownBlock(
      (mlp): Sequential(
        (0): ConvX(
          (conv): Conv2d(48, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (1): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (2): ConvX(
          (conv): Conv2d(192, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(48, 48, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=48, bias=False)
          (norm): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (1): ConvX(
          (conv): Conv2d(48, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (drop_path): Identity()
    )
    (1): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(96, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(192, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(24, 192, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(192, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(96, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=96, bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.011)
    )
    (2): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(96, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(192, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(24, 192, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(192, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(96, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=96, bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.023)
    )
    (3): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(96, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(192, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(24, 192, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(192, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(96, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=96, bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.034)
    )
  )
  (layer2): Sequential(
    (0): DownBlock(
      (mlp): Sequential(
        (0): ConvX(
          (conv): Conv2d(96, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (1): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (2): ConvX(
          (conv): Conv2d(384, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=96, bias=False)
          (norm): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (1): ConvX(
          (conv): Conv2d(96, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (drop_path): DropPath(drop_prob=0.045)
    )
    (1): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.056)
    )
    (2): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.068)
    )
    (3): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.079)
    )
    (4): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.090)
    )
    (5): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.102)
    )
    (6): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.113)
    )
    (7): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.124)
    )
  )
  (layer3): Sequential(
    (0): DownBlock(
      (mlp): Sequential(
        (0): ConvX(
          (conv): Conv2d(192, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (1): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (2): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (1): ConvX(
          (conv): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (drop_path): DropPath(drop_prob=0.135)
    )
    (1): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.147)
    )
    (2): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.158)
    )
    (3): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.169)
    )
    (4): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.181)
    )
    (5): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.192)
    )
    (6): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.203)
    )
    (7): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.215)
    )
    (8): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.226)
    )
    (9): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.237)
    )
    (10): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.248)
    )
    (11): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.260)
    )
    (12): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.271)
    )
    (13): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.282)
    )
    (14): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.294)
    )
    (15): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.305)
    )
  )
  (layer4): Sequential(
    (0): DownBlock(
      (mlp): Sequential(
        (0): ConvX(
          (conv): Conv2d(384, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (1): ConvX(
          (conv): Conv2d(1536, 1536, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=1536, bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (2): ConvX(
          (conv): Conv2d(1536, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (1): ConvX(
          (conv): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (drop_path): DropPath(drop_prob=0.316)
    )
    (1): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(768, 3072, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(3072, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(1536, 1536, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1536, bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(1536, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(192, 1536, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(1536, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(768, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.327)
    )
    (2): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(768, 3072, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(3072, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(1536, 1536, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1536, bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(1536, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(192, 1536, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(1536, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(768, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.339)
    )
    (3): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(768, 3072, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(3072, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(1536, 1536, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1536, bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(1536, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(192, 1536, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(1536, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(768, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.350)
    )
  )
  (head): ConvX(
    (conv): Conv2d(768, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (norm): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): Act(
      (act): GELU(approximate='none')
    )
  )
  (gap): AdaptiveAvgPool2d(output_size=1)
  (classifier): MlpHead(
    (fc1): Linear(in_features=1024, out_features=2048, bias=False)
    (norm): BatchNorm1d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): Act(
      (act): GELU(approximate='none')
    )
    (drop): Dropout(p=0.2, inplace=False)
    (fc2): Linear(in_features=2048, out_features=1000, bias=False)
  )
)
number of params: 50901626
LR = 0.00400000
Batch size = 4096
Update frequent = 4
Number of training examples = 1281167
Number of training training per epoch = 312
Param groups = {
  "decay": {
    "weight_decay": 0.05,
    "params": [
      "first_conv.conv.weight",
      "layer1.0.mlp.0.conv.weight",
      "layer1.0.mlp.1.conv.weight",
      "layer1.0.mlp.2.conv.weight",
      "layer1.0.skip.0.conv.weight",
      "layer1.0.skip.1.conv.weight",
      "layer1.1.mlp.conv_in.conv.weight",
      "layer1.1.mlp.dw.conv.weight",
      "layer1.1.mlp.re.region.0.weight",
      "layer1.1.mlp.re.region.3.weight",
      "layer1.1.mlp.proj.conv.weight",
      "layer1.1.dcnn.conv_in.conv.weight",
      "layer1.1.dcnn.spe.conv.weight",
      "layer1.1.dcnn.att.logit_scale",
      "layer1.1.dcnn.proj.conv.weight",
      "layer1.2.mlp.conv_in.conv.weight",
      "layer1.2.mlp.dw.conv.weight",
      "layer1.2.mlp.re.region.0.weight",
      "layer1.2.mlp.re.region.3.weight",
      "layer1.2.mlp.proj.conv.weight",
      "layer1.2.dcnn.conv_in.conv.weight",
      "layer1.2.dcnn.spe.conv.weight",
      "layer1.2.dcnn.att.logit_scale",
      "layer1.2.dcnn.proj.conv.weight",
      "layer1.3.mlp.conv_in.conv.weight",
      "layer1.3.mlp.dw.conv.weight",
      "layer1.3.mlp.re.region.0.weight",
      "layer1.3.mlp.re.region.3.weight",
      "layer1.3.mlp.proj.conv.weight",
      "layer1.3.dcnn.conv_in.conv.weight",
      "layer1.3.dcnn.spe.conv.weight",
      "layer1.3.dcnn.att.logit_scale",
      "layer1.3.dcnn.proj.conv.weight",
      "layer2.0.mlp.0.conv.weight",
      "layer2.0.mlp.1.conv.weight",
      "layer2.0.mlp.2.conv.weight",
      "layer2.0.skip.0.conv.weight",
      "layer2.0.skip.1.conv.weight",
      "layer2.1.mlp.conv_in.conv.weight",
      "layer2.1.mlp.dw.conv.weight",
      "layer2.1.mlp.re.region.0.weight",
      "layer2.1.mlp.re.region.3.weight",
      "layer2.1.mlp.proj.conv.weight",
      "layer2.1.dcnn.conv_in.conv.weight",
      "layer2.1.dcnn.spe.conv.weight",
      "layer2.1.dcnn.att.logit_scale",
      "layer2.1.dcnn.proj.conv.weight",
      "layer2.2.mlp.conv_in.conv.weight",
      "layer2.2.mlp.dw.conv.weight",
      "layer2.2.mlp.re.region.0.weight",
      "layer2.2.mlp.re.region.3.weight",
      "layer2.2.mlp.proj.conv.weight",
      "layer2.2.dcnn.conv_in.conv.weight",
      "layer2.2.dcnn.spe.conv.weight",
      "layer2.2.dcnn.att.logit_scale",
      "layer2.2.dcnn.proj.conv.weight",
      "layer2.3.mlp.conv_in.conv.weight",
      "layer2.3.mlp.dw.conv.weight",
      "layer2.3.mlp.re.region.0.weight",
      "layer2.3.mlp.re.region.3.weight",
      "layer2.3.mlp.proj.conv.weight",
      "layer2.3.dcnn.conv_in.conv.weight",
      "layer2.3.dcnn.spe.conv.weight",
      "layer2.3.dcnn.att.logit_scale",
      "layer2.3.dcnn.proj.conv.weight",
      "layer2.4.mlp.conv_in.conv.weight",
      "layer2.4.mlp.dw.conv.weight",
      "layer2.4.mlp.re.region.0.weight",
      "layer2.4.mlp.re.region.3.weight",
      "layer2.4.mlp.proj.conv.weight",
      "layer2.4.dcnn.conv_in.conv.weight",
      "layer2.4.dcnn.spe.conv.weight",
      "layer2.4.dcnn.att.logit_scale",
      "layer2.4.dcnn.proj.conv.weight",
      "layer2.5.mlp.conv_in.conv.weight",
      "layer2.5.mlp.dw.conv.weight",
      "layer2.5.mlp.re.region.0.weight",
      "layer2.5.mlp.re.region.3.weight",
      "layer2.5.mlp.proj.conv.weight",
      "layer2.5.dcnn.conv_in.conv.weight",
      "layer2.5.dcnn.spe.conv.weight",
      "layer2.5.dcnn.att.logit_scale",
      "layer2.5.dcnn.proj.conv.weight",
      "layer2.6.mlp.conv_in.conv.weight",
      "layer2.6.mlp.dw.conv.weight",
      "layer2.6.mlp.re.region.0.weight",
      "layer2.6.mlp.re.region.3.weight",
      "layer2.6.mlp.proj.conv.weight",
      "layer2.6.dcnn.conv_in.conv.weight",
      "layer2.6.dcnn.spe.conv.weight",
      "layer2.6.dcnn.att.logit_scale",
      "layer2.6.dcnn.proj.conv.weight",
      "layer2.7.mlp.conv_in.conv.weight",
      "layer2.7.mlp.dw.conv.weight",
      "layer2.7.mlp.re.region.0.weight",
      "layer2.7.mlp.re.region.3.weight",
      "layer2.7.mlp.proj.conv.weight",
      "layer2.7.dcnn.conv_in.conv.weight",
      "layer2.7.dcnn.spe.conv.weight",
      "layer2.7.dcnn.att.logit_scale",
      "layer2.7.dcnn.proj.conv.weight",
      "layer3.0.mlp.0.conv.weight",
      "layer3.0.mlp.1.conv.weight",
      "layer3.0.mlp.2.conv.weight",
      "layer3.0.skip.0.conv.weight",
      "layer3.0.skip.1.conv.weight",
      "layer3.1.mlp.conv_in.conv.weight",
      "layer3.1.mlp.dw.conv.weight",
      "layer3.1.mlp.re.region.0.weight",
      "layer3.1.mlp.re.region.3.weight",
      "layer3.1.mlp.proj.conv.weight",
      "layer3.1.dcnn.conv_in.conv.weight",
      "layer3.1.dcnn.spe.conv.weight",
      "layer3.1.dcnn.att.logit_scale",
      "layer3.1.dcnn.proj.conv.weight",
      "layer3.2.mlp.conv_in.conv.weight",
      "layer3.2.mlp.dw.conv.weight",
      "layer3.2.mlp.re.region.0.weight",
      "layer3.2.mlp.re.region.3.weight",
      "layer3.2.mlp.proj.conv.weight",
      "layer3.2.dcnn.conv_in.conv.weight",
      "layer3.2.dcnn.spe.conv.weight",
      "layer3.2.dcnn.att.logit_scale",
      "layer3.2.dcnn.proj.conv.weight",
      "layer3.3.mlp.conv_in.conv.weight",
      "layer3.3.mlp.dw.conv.weight",
      "layer3.3.mlp.re.region.0.weight",
      "layer3.3.mlp.re.region.3.weight",
      "layer3.3.mlp.proj.conv.weight",
      "layer3.3.dcnn.conv_in.conv.weight",
      "layer3.3.dcnn.spe.conv.weight",
      "layer3.3.dcnn.att.logit_scale",
      "layer3.3.dcnn.proj.conv.weight",
      "layer3.4.mlp.conv_in.conv.weight",
      "layer3.4.mlp.dw.conv.weight",
      "layer3.4.mlp.re.region.0.weight",
      "layer3.4.mlp.re.region.3.weight",
      "layer3.4.mlp.proj.conv.weight",
      "layer3.4.dcnn.conv_in.conv.weight",
      "layer3.4.dcnn.spe.conv.weight",
      "layer3.4.dcnn.att.logit_scale",
      "layer3.4.dcnn.proj.conv.weight",
      "layer3.5.mlp.conv_in.conv.weight",
      "layer3.5.mlp.dw.conv.weight",
      "layer3.5.mlp.re.region.0.weight",
      "layer3.5.mlp.re.region.3.weight",
      "layer3.5.mlp.proj.conv.weight",
      "layer3.5.dcnn.conv_in.conv.weight",
      "layer3.5.dcnn.spe.conv.weight",
      "layer3.5.dcnn.att.logit_scale",
      "layer3.5.dcnn.proj.conv.weight",
      "layer3.6.mlp.conv_in.conv.weight",
      "layer3.6.mlp.dw.conv.weight",
      "layer3.6.mlp.re.region.0.weight",
      "layer3.6.mlp.re.region.3.weight",
      "layer3.6.mlp.proj.conv.weight",
      "layer3.6.dcnn.conv_in.conv.weight",
      "layer3.6.dcnn.spe.conv.weight",
      "layer3.6.dcnn.att.logit_scale",
      "layer3.6.dcnn.proj.conv.weight",
      "layer3.7.mlp.conv_in.conv.weight",
      "layer3.7.mlp.dw.conv.weight",
      "layer3.7.mlp.re.region.0.weight",
      "layer3.7.mlp.re.region.3.weight",
      "layer3.7.mlp.proj.conv.weight",
      "layer3.7.dcnn.conv_in.conv.weight",
      "layer3.7.dcnn.spe.conv.weight",
      "layer3.7.dcnn.att.logit_scale",
      "layer3.7.dcnn.proj.conv.weight",
      "layer3.8.mlp.conv_in.conv.weight",
      "layer3.8.mlp.dw.conv.weight",
      "layer3.8.mlp.re.region.0.weight",
      "layer3.8.mlp.re.region.3.weight",
      "layer3.8.mlp.proj.conv.weight",
      "layer3.8.dcnn.conv_in.conv.weight",
      "layer3.8.dcnn.spe.conv.weight",
      "layer3.8.dcnn.att.logit_scale",
      "layer3.8.dcnn.proj.conv.weight",
      "layer3.9.mlp.conv_in.conv.weight",
      "layer3.9.mlp.dw.conv.weight",
      "layer3.9.mlp.re.region.0.weight",
      "layer3.9.mlp.re.region.3.weight",
      "layer3.9.mlp.proj.conv.weight",
      "layer3.9.dcnn.conv_in.conv.weight",
      "layer3.9.dcnn.spe.conv.weight",
      "layer3.9.dcnn.att.logit_scale",
      "layer3.9.dcnn.proj.conv.weight",
      "layer3.10.mlp.conv_in.conv.weight",
      "layer3.10.mlp.dw.conv.weight",
      "layer3.10.mlp.re.region.0.weight",
      "layer3.10.mlp.re.region.3.weight",
      "layer3.10.mlp.proj.conv.weight",
      "layer3.10.dcnn.conv_in.conv.weight",
      "layer3.10.dcnn.spe.conv.weight",
      "layer3.10.dcnn.att.logit_scale",
      "layer3.10.dcnn.proj.conv.weight",
      "layer3.11.mlp.conv_in.conv.weight",
      "layer3.11.mlp.dw.conv.weight",
      "layer3.11.mlp.re.region.0.weight",
      "layer3.11.mlp.re.region.3.weight",
      "layer3.11.mlp.proj.conv.weight",
      "layer3.11.dcnn.conv_in.conv.weight",
      "layer3.11.dcnn.spe.conv.weight",
      "layer3.11.dcnn.att.logit_scale",
      "layer3.11.dcnn.proj.conv.weight",
      "layer3.12.mlp.conv_in.conv.weight",
      "layer3.12.mlp.dw.conv.weight",
      "layer3.12.mlp.re.region.0.weight",
      "layer3.12.mlp.re.region.3.weight",
      "layer3.12.mlp.proj.conv.weight",
      "layer3.12.dcnn.conv_in.conv.weight",
      "layer3.12.dcnn.spe.conv.weight",
      "layer3.12.dcnn.att.logit_scale",
      "layer3.12.dcnn.proj.conv.weight",
      "layer3.13.mlp.conv_in.conv.weight",
      "layer3.13.mlp.dw.conv.weight",
      "layer3.13.mlp.re.region.0.weight",
      "layer3.13.mlp.re.region.3.weight",
      "layer3.13.mlp.proj.conv.weight",
      "layer3.13.dcnn.conv_in.conv.weight",
      "layer3.13.dcnn.spe.conv.weight",
      "layer3.13.dcnn.att.logit_scale",
      "layer3.13.dcnn.proj.conv.weight",
      "layer3.14.mlp.conv_in.conv.weight",
      "layer3.14.mlp.dw.conv.weight",
      "layer3.14.mlp.re.region.0.weight",
      "layer3.14.mlp.re.region.3.weight",
      "layer3.14.mlp.proj.conv.weight",
      "layer3.14.dcnn.conv_in.conv.weight",
      "layer3.14.dcnn.spe.conv.weight",
      "layer3.14.dcnn.att.logit_scale",
      "layer3.14.dcnn.proj.conv.weight",
      "layer3.15.mlp.conv_in.conv.weight",
      "layer3.15.mlp.dw.conv.weight",
      "layer3.15.mlp.re.region.0.weight",
      "layer3.15.mlp.re.region.3.weight",
      "layer3.15.mlp.proj.conv.weight",
      "layer3.15.dcnn.conv_in.conv.weight",
      "layer3.15.dcnn.spe.conv.weight",
      "layer3.15.dcnn.att.logit_scale",
      "layer3.15.dcnn.proj.conv.weight",
      "layer4.0.mlp.0.conv.weight",
      "layer4.0.mlp.1.conv.weight",
      "layer4.0.mlp.2.conv.weight",
      "layer4.0.skip.0.conv.weight",
      "layer4.0.skip.1.conv.weight",
      "layer4.1.mlp.conv_in.conv.weight",
      "layer4.1.mlp.dw.conv.weight",
      "layer4.1.mlp.re.region.0.weight",
      "layer4.1.mlp.re.region.3.weight",
      "layer4.1.mlp.proj.conv.weight",
      "layer4.1.dcnn.conv_in.conv.weight",
      "layer4.1.dcnn.spe.conv.weight",
      "layer4.1.dcnn.att.logit_scale",
      "layer4.1.dcnn.proj.conv.weight",
      "layer4.2.mlp.conv_in.conv.weight",
      "layer4.2.mlp.dw.conv.weight",
      "layer4.2.mlp.re.region.0.weight",
      "layer4.2.mlp.re.region.3.weight",
      "layer4.2.mlp.proj.conv.weight",
      "layer4.2.dcnn.conv_in.conv.weight",
      "layer4.2.dcnn.spe.conv.weight",
      "layer4.2.dcnn.att.logit_scale",
      "layer4.2.dcnn.proj.conv.weight",
      "layer4.3.mlp.conv_in.conv.weight",
      "layer4.3.mlp.dw.conv.weight",
      "layer4.3.mlp.re.region.0.weight",
      "layer4.3.mlp.re.region.3.weight",
      "layer4.3.mlp.proj.conv.weight",
      "layer4.3.dcnn.conv_in.conv.weight",
      "layer4.3.dcnn.spe.conv.weight",
      "layer4.3.dcnn.att.logit_scale",
      "layer4.3.dcnn.proj.conv.weight",
      "head.conv.weight",
      "classifier.fc1.weight",
      "classifier.fc2.weight"
    ],
    "lr_scale": 1.0
  },
  "no_decay": {
    "weight_decay": 0.0,
    "params": [
      "first_conv.norm.weight",
      "first_conv.norm.bias",
      "layer1.0.mlp.0.norm.weight",
      "layer1.0.mlp.0.norm.bias",
      "layer1.0.mlp.1.norm.weight",
      "layer1.0.mlp.1.norm.bias",
      "layer1.0.mlp.2.norm.weight",
      "layer1.0.mlp.2.norm.bias",
      "layer1.0.skip.0.norm.weight",
      "layer1.0.skip.0.norm.bias",
      "layer1.0.skip.1.norm.weight",
      "layer1.0.skip.1.norm.bias",
      "layer1.1.mlp.conv_in.norm.weight",
      "layer1.1.mlp.conv_in.norm.bias",
      "layer1.1.mlp.dw.norm.weight",
      "layer1.1.mlp.dw.norm.bias",
      "layer1.1.mlp.re.region.1.weight",
      "layer1.1.mlp.re.region.1.bias",
      "layer1.1.mlp.re.region.3.bias",
      "layer1.1.mlp.proj.norm.weight",
      "layer1.1.mlp.proj.norm.bias",
      "layer1.1.dcnn.conv_in.norm.weight",
      "layer1.1.dcnn.conv_in.norm.bias",
      "layer1.1.dcnn.spe.norm.weight",
      "layer1.1.dcnn.spe.norm.bias",
      "layer1.1.dcnn.proj.norm.weight",
      "layer1.1.dcnn.proj.norm.bias",
      "layer1.2.mlp.conv_in.norm.weight",
      "layer1.2.mlp.conv_in.norm.bias",
      "layer1.2.mlp.dw.norm.weight",
      "layer1.2.mlp.dw.norm.bias",
      "layer1.2.mlp.re.region.1.weight",
      "layer1.2.mlp.re.region.1.bias",
      "layer1.2.mlp.re.region.3.bias",
      "layer1.2.mlp.proj.norm.weight",
      "layer1.2.mlp.proj.norm.bias",
      "layer1.2.dcnn.conv_in.norm.weight",
      "layer1.2.dcnn.conv_in.norm.bias",
      "layer1.2.dcnn.spe.norm.weight",
      "layer1.2.dcnn.spe.norm.bias",
      "layer1.2.dcnn.proj.norm.weight",
      "layer1.2.dcnn.proj.norm.bias",
      "layer1.3.mlp.conv_in.norm.weight",
      "layer1.3.mlp.conv_in.norm.bias",
      "layer1.3.mlp.dw.norm.weight",
      "layer1.3.mlp.dw.norm.bias",
      "layer1.3.mlp.re.region.1.weight",
      "layer1.3.mlp.re.region.1.bias",
      "layer1.3.mlp.re.region.3.bias",
      "layer1.3.mlp.proj.norm.weight",
      "layer1.3.mlp.proj.norm.bias",
      "layer1.3.dcnn.conv_in.norm.weight",
      "layer1.3.dcnn.conv_in.norm.bias",
      "layer1.3.dcnn.spe.norm.weight",
      "layer1.3.dcnn.spe.norm.bias",
      "layer1.3.dcnn.proj.norm.weight",
      "layer1.3.dcnn.proj.norm.bias",
      "layer2.0.mlp.0.norm.weight",
      "layer2.0.mlp.0.norm.bias",
      "layer2.0.mlp.1.norm.weight",
      "layer2.0.mlp.1.norm.bias",
      "layer2.0.mlp.2.norm.weight",
      "layer2.0.mlp.2.norm.bias",
      "layer2.0.skip.0.norm.weight",
      "layer2.0.skip.0.norm.bias",
      "layer2.0.skip.1.norm.weight",
      "layer2.0.skip.1.norm.bias",
      "layer2.1.mlp.conv_in.norm.weight",
      "layer2.1.mlp.conv_in.norm.bias",
      "layer2.1.mlp.dw.norm.weight",
      "layer2.1.mlp.dw.norm.bias",
      "layer2.1.mlp.re.region.1.weight",
      "layer2.1.mlp.re.region.1.bias",
      "layer2.1.mlp.re.region.3.bias",
      "layer2.1.mlp.proj.norm.weight",
      "layer2.1.mlp.proj.norm.bias",
      "layer2.1.dcnn.conv_in.norm.weight",
      "layer2.1.dcnn.conv_in.norm.bias",
      "layer2.1.dcnn.spe.norm.weight",
      "layer2.1.dcnn.spe.norm.bias",
      "layer2.1.dcnn.proj.norm.weight",
      "layer2.1.dcnn.proj.norm.bias",
      "layer2.2.mlp.conv_in.norm.weight",
      "layer2.2.mlp.conv_in.norm.bias",
      "layer2.2.mlp.dw.norm.weight",
      "layer2.2.mlp.dw.norm.bias",
      "layer2.2.mlp.re.region.1.weight",
      "layer2.2.mlp.re.region.1.bias",
      "layer2.2.mlp.re.region.3.bias",
      "layer2.2.mlp.proj.norm.weight",
      "layer2.2.mlp.proj.norm.bias",
      "layer2.2.dcnn.conv_in.norm.weight",
      "layer2.2.dcnn.conv_in.norm.bias",
      "layer2.2.dcnn.spe.norm.weight",
      "layer2.2.dcnn.spe.norm.bias",
      "layer2.2.dcnn.proj.norm.weight",
      "layer2.2.dcnn.proj.norm.bias",
      "layer2.3.mlp.conv_in.norm.weight",
      "layer2.3.mlp.conv_in.norm.bias",
      "layer2.3.mlp.dw.norm.weight",
      "layer2.3.mlp.dw.norm.bias",
      "layer2.3.mlp.re.region.1.weight",
      "layer2.3.mlp.re.region.1.bias",
      "layer2.3.mlp.re.region.3.bias",
      "layer2.3.mlp.proj.norm.weight",
      "layer2.3.mlp.proj.norm.bias",
      "layer2.3.dcnn.conv_in.norm.weight",
      "layer2.3.dcnn.conv_in.norm.bias",
      "layer2.3.dcnn.spe.norm.weight",
      "layer2.3.dcnn.spe.norm.bias",
      "layer2.3.dcnn.proj.norm.weight",
      "layer2.3.dcnn.proj.norm.bias",
      "layer2.4.mlp.conv_in.norm.weight",
      "layer2.4.mlp.conv_in.norm.bias",
      "layer2.4.mlp.dw.norm.weight",
      "layer2.4.mlp.dw.norm.bias",
      "layer2.4.mlp.re.region.1.weight",
      "layer2.4.mlp.re.region.1.bias",
      "layer2.4.mlp.re.region.3.bias",
      "layer2.4.mlp.proj.norm.weight",
      "layer2.4.mlp.proj.norm.bias",
      "layer2.4.dcnn.conv_in.norm.weight",
      "layer2.4.dcnn.conv_in.norm.bias",
      "layer2.4.dcnn.spe.norm.weight",
      "layer2.4.dcnn.spe.norm.bias",
      "layer2.4.dcnn.proj.norm.weight",
      "layer2.4.dcnn.proj.norm.bias",
      "layer2.5.mlp.conv_in.norm.weight",
      "layer2.5.mlp.conv_in.norm.bias",
      "layer2.5.mlp.dw.norm.weight",
      "layer2.5.mlp.dw.norm.bias",
      "layer2.5.mlp.re.region.1.weight",
      "layer2.5.mlp.re.region.1.bias",
      "layer2.5.mlp.re.region.3.bias",
      "layer2.5.mlp.proj.norm.weight",
      "layer2.5.mlp.proj.norm.bias",
      "layer2.5.dcnn.conv_in.norm.weight",
      "layer2.5.dcnn.conv_in.norm.bias",
      "layer2.5.dcnn.spe.norm.weight",
      "layer2.5.dcnn.spe.norm.bias",
      "layer2.5.dcnn.proj.norm.weight",
      "layer2.5.dcnn.proj.norm.bias",
      "layer2.6.mlp.conv_in.norm.weight",
      "layer2.6.mlp.conv_in.norm.bias",
      "layer2.6.mlp.dw.norm.weight",
      "layer2.6.mlp.dw.norm.bias",
      "layer2.6.mlp.re.region.1.weight",
      "layer2.6.mlp.re.region.1.bias",
      "layer2.6.mlp.re.region.3.bias",
      "layer2.6.mlp.proj.norm.weight",
      "layer2.6.mlp.proj.norm.bias",
      "layer2.6.dcnn.conv_in.norm.weight",
      "layer2.6.dcnn.conv_in.norm.bias",
      "layer2.6.dcnn.spe.norm.weight",
      "layer2.6.dcnn.spe.norm.bias",
      "layer2.6.dcnn.proj.norm.weight",
      "layer2.6.dcnn.proj.norm.bias",
      "layer2.7.mlp.conv_in.norm.weight",
      "layer2.7.mlp.conv_in.norm.bias",
      "layer2.7.mlp.dw.norm.weight",
      "layer2.7.mlp.dw.norm.bias",
      "layer2.7.mlp.re.region.1.weight",
      "layer2.7.mlp.re.region.1.bias",
      "layer2.7.mlp.re.region.3.bias",
      "layer2.7.mlp.proj.norm.weight",
      "layer2.7.mlp.proj.norm.bias",
      "layer2.7.dcnn.conv_in.norm.weight",
      "layer2.7.dcnn.conv_in.norm.bias",
      "layer2.7.dcnn.spe.norm.weight",
      "layer2.7.dcnn.spe.norm.bias",
      "layer2.7.dcnn.proj.norm.weight",
      "layer2.7.dcnn.proj.norm.bias",
      "layer3.0.mlp.0.norm.weight",
      "layer3.0.mlp.0.norm.bias",
      "layer3.0.mlp.1.norm.weight",
      "layer3.0.mlp.1.norm.bias",
      "layer3.0.mlp.2.norm.weight",
      "layer3.0.mlp.2.norm.bias",
      "layer3.0.skip.0.norm.weight",
      "layer3.0.skip.0.norm.bias",
      "layer3.0.skip.1.norm.weight",
      "layer3.0.skip.1.norm.bias",
      "layer3.1.mlp.conv_in.norm.weight",
      "layer3.1.mlp.conv_in.norm.bias",
      "layer3.1.mlp.dw.norm.weight",
      "layer3.1.mlp.dw.norm.bias",
      "layer3.1.mlp.re.region.1.weight",
      "layer3.1.mlp.re.region.1.bias",
      "layer3.1.mlp.re.region.3.bias",
      "layer3.1.mlp.proj.norm.weight",
      "layer3.1.mlp.proj.norm.bias",
      "layer3.1.dcnn.conv_in.norm.weight",
      "layer3.1.dcnn.conv_in.norm.bias",
      "layer3.1.dcnn.spe.norm.weight",
      "layer3.1.dcnn.spe.norm.bias",
      "layer3.1.dcnn.proj.norm.weight",
      "layer3.1.dcnn.proj.norm.bias",
      "layer3.2.mlp.conv_in.norm.weight",
      "layer3.2.mlp.conv_in.norm.bias",
      "layer3.2.mlp.dw.norm.weight",
      "layer3.2.mlp.dw.norm.bias",
      "layer3.2.mlp.re.region.1.weight",
      "layer3.2.mlp.re.region.1.bias",
      "layer3.2.mlp.re.region.3.bias",
      "layer3.2.mlp.proj.norm.weight",
      "layer3.2.mlp.proj.norm.bias",
      "layer3.2.dcnn.conv_in.norm.weight",
      "layer3.2.dcnn.conv_in.norm.bias",
      "layer3.2.dcnn.spe.norm.weight",
      "layer3.2.dcnn.spe.norm.bias",
      "layer3.2.dcnn.proj.norm.weight",
      "layer3.2.dcnn.proj.norm.bias",
      "layer3.3.mlp.conv_in.norm.weight",
      "layer3.3.mlp.conv_in.norm.bias",
      "layer3.3.mlp.dw.norm.weight",
      "layer3.3.mlp.dw.norm.bias",
      "layer3.3.mlp.re.region.1.weight",
      "layer3.3.mlp.re.region.1.bias",
      "layer3.3.mlp.re.region.3.bias",
      "layer3.3.mlp.proj.norm.weight",
      "layer3.3.mlp.proj.norm.bias",
      "layer3.3.dcnn.conv_in.norm.weight",
      "layer3.3.dcnn.conv_in.norm.bias",
      "layer3.3.dcnn.spe.norm.weight",
      "layer3.3.dcnn.spe.norm.bias",
      "layer3.3.dcnn.proj.norm.weight",
      "layer3.3.dcnn.proj.norm.bias",
      "layer3.4.mlp.conv_in.norm.weight",
      "layer3.4.mlp.conv_in.norm.bias",
      "layer3.4.mlp.dw.norm.weight",
      "layer3.4.mlp.dw.norm.bias",
      "layer3.4.mlp.re.region.1.weight",
      "layer3.4.mlp.re.region.1.bias",
      "layer3.4.mlp.re.region.3.bias",
      "layer3.4.mlp.proj.norm.weight",
      "layer3.4.mlp.proj.norm.bias",
      "layer3.4.dcnn.conv_in.norm.weight",
      "layer3.4.dcnn.conv_in.norm.bias",
      "layer3.4.dcnn.spe.norm.weight",
      "layer3.4.dcnn.spe.norm.bias",
      "layer3.4.dcnn.proj.norm.weight",
      "layer3.4.dcnn.proj.norm.bias",
      "layer3.5.mlp.conv_in.norm.weight",
      "layer3.5.mlp.conv_in.norm.bias",
      "layer3.5.mlp.dw.norm.weight",
      "layer3.5.mlp.dw.norm.bias",
      "layer3.5.mlp.re.region.1.weight",
      "layer3.5.mlp.re.region.1.bias",
      "layer3.5.mlp.re.region.3.bias",
      "layer3.5.mlp.proj.norm.weight",
      "layer3.5.mlp.proj.norm.bias",
      "layer3.5.dcnn.conv_in.norm.weight",
      "layer3.5.dcnn.conv_in.norm.bias",
      "layer3.5.dcnn.spe.norm.weight",
      "layer3.5.dcnn.spe.norm.bias",
      "layer3.5.dcnn.proj.norm.weight",
      "layer3.5.dcnn.proj.norm.bias",
      "layer3.6.mlp.conv_in.norm.weight",
      "layer3.6.mlp.conv_in.norm.bias",
      "layer3.6.mlp.dw.norm.weight",
      "layer3.6.mlp.dw.norm.bias",
      "layer3.6.mlp.re.region.1.weight",
      "layer3.6.mlp.re.region.1.bias",
      "layer3.6.mlp.re.region.3.bias",
      "layer3.6.mlp.proj.norm.weight",
      "layer3.6.mlp.proj.norm.bias",
      "layer3.6.dcnn.conv_in.norm.weight",
      "layer3.6.dcnn.conv_in.norm.bias",
      "layer3.6.dcnn.spe.norm.weight",
      "layer3.6.dcnn.spe.norm.bias",
      "layer3.6.dcnn.proj.norm.weight",
      "layer3.6.dcnn.proj.norm.bias",
      "layer3.7.mlp.conv_in.norm.weight",
      "layer3.7.mlp.conv_in.norm.bias",
      "layer3.7.mlp.dw.norm.weight",
      "layer3.7.mlp.dw.norm.bias",
      "layer3.7.mlp.re.region.1.weight",
      "layer3.7.mlp.re.region.1.bias",
      "layer3.7.mlp.re.region.3.bias",
      "layer3.7.mlp.proj.norm.weight",
      "layer3.7.mlp.proj.norm.bias",
      "layer3.7.dcnn.conv_in.norm.weight",
      "layer3.7.dcnn.conv_in.norm.bias",
      "layer3.7.dcnn.spe.norm.weight",
      "layer3.7.dcnn.spe.norm.bias",
      "layer3.7.dcnn.proj.norm.weight",
      "layer3.7.dcnn.proj.norm.bias",
      "layer3.8.mlp.conv_in.norm.weight",
      "layer3.8.mlp.conv_in.norm.bias",
      "layer3.8.mlp.dw.norm.weight",
      "layer3.8.mlp.dw.norm.bias",
      "layer3.8.mlp.re.region.1.weight",
      "layer3.8.mlp.re.region.1.bias",
      "layer3.8.mlp.re.region.3.bias",
      "layer3.8.mlp.proj.norm.weight",
      "layer3.8.mlp.proj.norm.bias",
      "layer3.8.dcnn.conv_in.norm.weight",
      "layer3.8.dcnn.conv_in.norm.bias",
      "layer3.8.dcnn.spe.norm.weight",
      "layer3.8.dcnn.spe.norm.bias",
      "layer3.8.dcnn.proj.norm.weight",
      "layer3.8.dcnn.proj.norm.bias",
      "layer3.9.mlp.conv_in.norm.weight",
      "layer3.9.mlp.conv_in.norm.bias",
      "layer3.9.mlp.dw.norm.weight",
      "layer3.9.mlp.dw.norm.bias",
      "layer3.9.mlp.re.region.1.weight",
      "layer3.9.mlp.re.region.1.bias",
      "layer3.9.mlp.re.region.3.bias",
      "layer3.9.mlp.proj.norm.weight",
      "layer3.9.mlp.proj.norm.bias",
      "layer3.9.dcnn.conv_in.norm.weight",
      "layer3.9.dcnn.conv_in.norm.bias",
      "layer3.9.dcnn.spe.norm.weight",
      "layer3.9.dcnn.spe.norm.bias",
      "layer3.9.dcnn.proj.norm.weight",
      "layer3.9.dcnn.proj.norm.bias",
      "layer3.10.mlp.conv_in.norm.weight",
      "layer3.10.mlp.conv_in.norm.bias",
      "layer3.10.mlp.dw.norm.weight",
      "layer3.10.mlp.dw.norm.bias",
      "layer3.10.mlp.re.region.1.weight",
      "layer3.10.mlp.re.region.1.bias",
      "layer3.10.mlp.re.region.3.bias",
      "layer3.10.mlp.proj.norm.weight",
      "layer3.10.mlp.proj.norm.bias",
      "layer3.10.dcnn.conv_in.norm.weight",
      "layer3.10.dcnn.conv_in.norm.bias",
      "layer3.10.dcnn.spe.norm.weight",
      "layer3.10.dcnn.spe.norm.bias",
      "layer3.10.dcnn.proj.norm.weight",
      "layer3.10.dcnn.proj.norm.bias",
      "layer3.11.mlp.conv_in.norm.weight",
      "layer3.11.mlp.conv_in.norm.bias",
      "layer3.11.mlp.dw.norm.weight",
      "layer3.11.mlp.dw.norm.bias",
      "layer3.11.mlp.re.region.1.weight",
      "layer3.11.mlp.re.region.1.bias",
      "layer3.11.mlp.re.region.3.bias",
      "layer3.11.mlp.proj.norm.weight",
      "layer3.11.mlp.proj.norm.bias",
      "layer3.11.dcnn.conv_in.norm.weight",
      "layer3.11.dcnn.conv_in.norm.bias",
      "layer3.11.dcnn.spe.norm.weight",
      "layer3.11.dcnn.spe.norm.bias",
      "layer3.11.dcnn.proj.norm.weight",
      "layer3.11.dcnn.proj.norm.bias",
      "layer3.12.mlp.conv_in.norm.weight",
      "layer3.12.mlp.conv_in.norm.bias",
      "layer3.12.mlp.dw.norm.weight",
      "layer3.12.mlp.dw.norm.bias",
      "layer3.12.mlp.re.region.1.weight",
      "layer3.12.mlp.re.region.1.bias",
      "layer3.12.mlp.re.region.3.bias",
      "layer3.12.mlp.proj.norm.weight",
      "layer3.12.mlp.proj.norm.bias",
      "layer3.12.dcnn.conv_in.norm.weight",
      "layer3.12.dcnn.conv_in.norm.bias",
      "layer3.12.dcnn.spe.norm.weight",
      "layer3.12.dcnn.spe.norm.bias",
      "layer3.12.dcnn.proj.norm.weight",
      "layer3.12.dcnn.proj.norm.bias",
      "layer3.13.mlp.conv_in.norm.weight",
      "layer3.13.mlp.conv_in.norm.bias",
      "layer3.13.mlp.dw.norm.weight",
      "layer3.13.mlp.dw.norm.bias",
      "layer3.13.mlp.re.region.1.weight",
      "layer3.13.mlp.re.region.1.bias",
      "layer3.13.mlp.re.region.3.bias",
      "layer3.13.mlp.proj.norm.weight",
      "layer3.13.mlp.proj.norm.bias",
      "layer3.13.dcnn.conv_in.norm.weight",
      "layer3.13.dcnn.conv_in.norm.bias",
      "layer3.13.dcnn.spe.norm.weight",
      "layer3.13.dcnn.spe.norm.bias",
      "layer3.13.dcnn.proj.norm.weight",
      "layer3.13.dcnn.proj.norm.bias",
      "layer3.14.mlp.conv_in.norm.weight",
      "layer3.14.mlp.conv_in.norm.bias",
      "layer3.14.mlp.dw.norm.weight",
      "layer3.14.mlp.dw.norm.bias",
      "layer3.14.mlp.re.region.1.weight",
      "layer3.14.mlp.re.region.1.bias",
      "layer3.14.mlp.re.region.3.bias",
      "layer3.14.mlp.proj.norm.weight",
      "layer3.14.mlp.proj.norm.bias",
      "layer3.14.dcnn.conv_in.norm.weight",
      "layer3.14.dcnn.conv_in.norm.bias",
      "layer3.14.dcnn.spe.norm.weight",
      "layer3.14.dcnn.spe.norm.bias",
      "layer3.14.dcnn.proj.norm.weight",
      "layer3.14.dcnn.proj.norm.bias",
      "layer3.15.mlp.conv_in.norm.weight",
      "layer3.15.mlp.conv_in.norm.bias",
      "layer3.15.mlp.dw.norm.weight",
      "layer3.15.mlp.dw.norm.bias",
      "layer3.15.mlp.re.region.1.weight",
      "layer3.15.mlp.re.region.1.bias",
      "layer3.15.mlp.re.region.3.bias",
      "layer3.15.mlp.proj.norm.weight",
      "layer3.15.mlp.proj.norm.bias",
      "layer3.15.dcnn.conv_in.norm.weight",
      "layer3.15.dcnn.conv_in.norm.bias",
      "layer3.15.dcnn.spe.norm.weight",
      "layer3.15.dcnn.spe.norm.bias",
      "layer3.15.dcnn.proj.norm.weight",
      "layer3.15.dcnn.proj.norm.bias",
      "layer4.0.mlp.0.norm.weight",
      "layer4.0.mlp.0.norm.bias",
      "layer4.0.mlp.1.norm.weight",
      "layer4.0.mlp.1.norm.bias",
      "layer4.0.mlp.2.norm.weight",
      "layer4.0.mlp.2.norm.bias",
      "layer4.0.skip.0.norm.weight",
      "layer4.0.skip.0.norm.bias",
      "layer4.0.skip.1.norm.weight",
      "layer4.0.skip.1.norm.bias",
      "layer4.1.mlp.conv_in.norm.weight",
      "layer4.1.mlp.conv_in.norm.bias",
      "layer4.1.mlp.dw.norm.weight",
      "layer4.1.mlp.dw.norm.bias",
      "layer4.1.mlp.re.region.1.weight",
      "layer4.1.mlp.re.region.1.bias",
      "layer4.1.mlp.re.region.3.bias",
      "layer4.1.mlp.proj.norm.weight",
      "layer4.1.mlp.proj.norm.bias",
      "layer4.1.dcnn.conv_in.norm.weight",
      "layer4.1.dcnn.conv_in.norm.bias",
      "layer4.1.dcnn.spe.norm.weight",
      "layer4.1.dcnn.spe.norm.bias",
      "layer4.1.dcnn.proj.norm.weight",
      "layer4.1.dcnn.proj.norm.bias",
      "layer4.2.mlp.conv_in.norm.weight",
      "layer4.2.mlp.conv_in.norm.bias",
      "layer4.2.mlp.dw.norm.weight",
      "layer4.2.mlp.dw.norm.bias",
      "layer4.2.mlp.re.region.1.weight",
      "layer4.2.mlp.re.region.1.bias",
      "layer4.2.mlp.re.region.3.bias",
      "layer4.2.mlp.proj.norm.weight",
      "layer4.2.mlp.proj.norm.bias",
      "layer4.2.dcnn.conv_in.norm.weight",
      "layer4.2.dcnn.conv_in.norm.bias",
      "layer4.2.dcnn.spe.norm.weight",
      "layer4.2.dcnn.spe.norm.bias",
      "layer4.2.dcnn.proj.norm.weight",
      "layer4.2.dcnn.proj.norm.bias",
      "layer4.3.mlp.conv_in.norm.weight",
      "layer4.3.mlp.conv_in.norm.bias",
      "layer4.3.mlp.dw.norm.weight",
      "layer4.3.mlp.dw.norm.bias",
      "layer4.3.mlp.re.region.1.weight",
      "layer4.3.mlp.re.region.1.bias",
      "layer4.3.mlp.re.region.3.bias",
      "layer4.3.mlp.proj.norm.weight",
      "layer4.3.mlp.proj.norm.bias",
      "layer4.3.dcnn.conv_in.norm.weight",
      "layer4.3.dcnn.conv_in.norm.bias",
      "layer4.3.dcnn.spe.norm.weight",
      "layer4.3.dcnn.spe.norm.bias",
      "layer4.3.dcnn.proj.norm.weight",
      "layer4.3.dcnn.proj.norm.bias",
      "head.norm.weight",
      "head.norm.bias",
      "classifier.norm.weight",
      "classifier.norm.bias"
    ],
    "lr_scale": 1.0
  }
}
Use Cosine LR scheduler
Set warmup steps = 6240
Set warmup steps = 0
Max WD = 0.0500000, Min WD = 0.0500000
criterion = SoftTargetCrossEntropy()
Auto resume checkpoint: 
Start training for 300 epochs
Epoch: [0]  [   0/1251]  eta: 4:43:03  lr: 0.000000  min_lr: 0.000000  loss: 7.0080 (7.0080)  weight_decay: 0.0500 (0.0500)  time: 13.5760  data: 3.0439  max mem: 43713
Epoch: [0]  [ 200/1251]  eta: 0:10:26  lr: 0.000032  min_lr: 0.000032  loss: 6.9343 (6.9575)  weight_decay: 0.0500 (0.0500)  grad_norm: 49.3027 (nan)  time: 0.5294  data: 0.0005  max mem: 43713
Epoch: [0]  [ 400/1251]  eta: 0:07:58  lr: 0.000064  min_lr: 0.000064  loss: 6.8700 (6.9344)  weight_decay: 0.0500 (0.0500)  grad_norm: 6.9513 (nan)  time: 0.5289  data: 0.0005  max mem: 43713
Epoch: [0]  [ 600/1251]  eta: 0:05:59  lr: 0.000096  min_lr: 0.000096  loss: 6.7970 (6.8963)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.8524 (nan)  time: 0.5284  data: 0.0005  max mem: 43713
Epoch: [0]  [ 800/1251]  eta: 0:04:06  lr: 0.000128  min_lr: 0.000128  loss: 6.6889 (6.8535)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.3540 (nan)  time: 0.5282  data: 0.0005  max mem: 43713
Epoch: [0]  [1000/1251]  eta: 0:02:16  lr: 0.000160  min_lr: 0.000160  loss: 6.6199 (6.8089)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.2822 (nan)  time: 0.5284  data: 0.0005  max mem: 43713
Epoch: [0]  [1200/1251]  eta: 0:00:27  lr: 0.000192  min_lr: 0.000192  loss: 6.6012 (6.7695)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.2137 (nan)  time: 0.5286  data: 0.0005  max mem: 43713
Epoch: [0]  [1250/1251]  eta: 0:00:00  lr: 0.000199  min_lr: 0.000199  loss: 6.5215 (6.7597)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.1954 (nan)  time: 0.4484  data: 0.0005  max mem: 43713
Epoch: [0] Total time: 0:11:14 (0.5389 s / it)
Averaged stats: lr: 0.000199  min_lr: 0.000199  loss: 6.5215 (6.7573)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.1954 (nan)
Test:  [ 0/25]  eta: 0:04:34  loss: 5.6729 (5.6729)  acc1: 3.2000 (3.2000)  acc5: 13.6000 (13.6000)  time: 10.9919  data: 6.8878  max mem: 43713
Test:  [10/25]  eta: 0:00:18  loss: 5.6812 (5.7105)  acc1: 2.0000 (2.6545)  acc5: 10.8000 (10.4727)  time: 1.2409  data: 0.6264  max mem: 43713
Test:  [20/25]  eta: 0:00:03  loss: 5.8384 (5.8013)  acc1: 2.4000 (2.8190)  acc5: 10.4000 (10.6476)  time: 0.2691  data: 0.0001  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 5.8384 (5.7542)  acc1: 2.4000 (3.4560)  acc5: 10.8000 (11.6960)  time: 0.2703  data: 0.0001  max mem: 43713
Test: Total time: 0:00:17 (0.7011 s / it)
* Acc@1 3.180 Acc@5 11.438 loss 5.773
Accuracy of the model on the 50000 test images: 3.2%
Max accuracy: 3.18%
Epoch: [1]  [   0/1251]  eta: 1:01:55  lr: 0.000200  min_lr: 0.000200  loss: 6.1143 (6.1143)  weight_decay: 0.0500 (0.0500)  time: 2.9702  data: 2.3531  max mem: 43713
Epoch: [1]  [ 200/1251]  eta: 0:09:24  lr: 0.000232  min_lr: 0.000232  loss: 6.4851 (6.4884)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.3906 (3.3242)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [1]  [ 400/1251]  eta: 0:07:31  lr: 0.000264  min_lr: 0.000264  loss: 6.3671 (6.4438)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.1788 (3.2523)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [1]  [ 600/1251]  eta: 0:05:43  lr: 0.000296  min_lr: 0.000296  loss: 6.3320 (6.4161)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.1412 (3.2165)  time: 0.5284  data: 0.0004  max mem: 43713
Epoch: [1]  [ 800/1251]  eta: 0:03:57  lr: 0.000328  min_lr: 0.000328  loss: 6.3686 (6.3819)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.1688 (3.2068)  time: 0.5220  data: 0.0004  max mem: 43713
Epoch: [1]  [1000/1251]  eta: 0:02:11  lr: 0.000360  min_lr: 0.000360  loss: 6.1364 (6.3513)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.9385 (3.1803)  time: 0.5220  data: 0.0004  max mem: 43713
Epoch: [1]  [1200/1251]  eta: 0:00:26  lr: 0.000392  min_lr: 0.000392  loss: 6.1717 (6.3248)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.0475 (3.1709)  time: 0.5236  data: 0.0004  max mem: 43713
Epoch: [1]  [1250/1251]  eta: 0:00:00  lr: 0.000399  min_lr: 0.000399  loss: 6.0815 (6.3172)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.1258 (3.1678)  time: 0.4436  data: 0.0005  max mem: 43713
Epoch: [1] Total time: 0:10:56 (0.5245 s / it)
Averaged stats: lr: 0.000399  min_lr: 0.000399  loss: 6.0815 (6.3170)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.1258 (3.1678)
Test:  [ 0/25]  eta: 0:01:55  loss: 4.7263 (4.7263)  acc1: 10.8000 (10.8000)  acc5: 29.6000 (29.6000)  time: 4.6270  data: 4.3074  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 4.5314 (4.5653)  acc1: 11.6000 (12.2182)  acc5: 29.6000 (31.0545)  time: 0.6770  data: 0.4080  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 4.8501 (4.7608)  acc1: 10.8000 (11.4476)  acc5: 27.2000 (28.9524)  time: 0.2730  data: 0.0091  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 4.8955 (4.7386)  acc1: 11.6000 (11.9520)  acc5: 27.2000 (29.7760)  time: 0.2640  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4516 s / it)
* Acc@1 12.130 Acc@5 30.086 loss 4.728
Accuracy of the model on the 50000 test images: 12.1%
Max accuracy: 12.13%
Epoch: [2]  [   0/1251]  eta: 1:02:25  lr: 0.000400  min_lr: 0.000400  loss: 5.8285 (5.8285)  weight_decay: 0.0500 (0.0500)  time: 2.9937  data: 2.4547  max mem: 43713
Epoch: [2]  [ 200/1251]  eta: 0:09:23  lr: 0.000432  min_lr: 0.000432  loss: 6.2786 (6.1431)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.0700 (3.0577)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [2]  [ 400/1251]  eta: 0:07:31  lr: 0.000464  min_lr: 0.000464  loss: 5.9748 (6.0797)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.3825 (3.1623)  time: 0.5236  data: 0.0004  max mem: 43713
Epoch: [2]  [ 600/1251]  eta: 0:05:43  lr: 0.000496  min_lr: 0.000496  loss: 6.1205 (6.0432)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.9566 (3.1086)  time: 0.5231  data: 0.0004  max mem: 43713
Epoch: [2]  [ 800/1251]  eta: 0:03:57  lr: 0.000528  min_lr: 0.000528  loss: 5.9345 (6.0187)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.0319 (3.1446)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [2]  [1000/1251]  eta: 0:02:12  lr: 0.000560  min_lr: 0.000560  loss: 5.7329 (5.9852)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.8120 (3.1197)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [2]  [1200/1251]  eta: 0:00:26  lr: 0.000592  min_lr: 0.000592  loss: 5.8872 (5.9635)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.5770 (3.1121)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [2]  [1250/1251]  eta: 0:00:00  lr: 0.000599  min_lr: 0.000599  loss: 5.9393 (5.9598)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.6885 (3.1018)  time: 0.4437  data: 0.0004  max mem: 43713
Epoch: [2] Total time: 0:10:56 (0.5251 s / it)
Averaged stats: lr: 0.000599  min_lr: 0.000599  loss: 5.9393 (5.9487)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.6885 (3.1018)
Test:  [ 0/25]  eta: 0:02:21  loss: 3.7331 (3.7331)  acc1: 23.2000 (23.2000)  acc5: 54.0000 (54.0000)  time: 5.6723  data: 5.3532  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 3.7433 (3.8374)  acc1: 21.6000 (22.6909)  acc5: 52.4000 (48.7636)  time: 0.7562  data: 0.4869  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 4.1777 (4.0425)  acc1: 19.6000 (20.9333)  acc5: 41.2000 (44.8000)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 4.1777 (3.9991)  acc1: 20.4000 (21.9040)  acc5: 41.6000 (45.6320)  time: 0.2644  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4863 s / it)
* Acc@1 21.744 Acc@5 45.880 loss 3.995
Accuracy of the model on the 50000 test images: 21.7%
Max accuracy: 21.74%
Epoch: [3]  [   0/1251]  eta: 0:51:37  lr: 0.000600  min_lr: 0.000600  loss: 5.8464 (5.8464)  weight_decay: 0.0500 (0.0500)  time: 2.4762  data: 1.9354  max mem: 43713
Epoch: [3]  [ 200/1251]  eta: 0:09:20  lr: 0.000632  min_lr: 0.000632  loss: 5.6306 (5.7170)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.7748 (3.2752)  time: 0.5239  data: 0.0004  max mem: 43713
Epoch: [3]  [ 400/1251]  eta: 0:07:30  lr: 0.000664  min_lr: 0.000664  loss: 5.6546 (5.6939)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.5897 (2.9814)  time: 0.5248  data: 0.0005  max mem: 43713
Epoch: [3]  [ 600/1251]  eta: 0:05:43  lr: 0.000696  min_lr: 0.000696  loss: 5.2518 (5.6740)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.7662 (2.9682)  time: 0.5233  data: 0.0004  max mem: 43713
Epoch: [3]  [ 800/1251]  eta: 0:03:57  lr: 0.000728  min_lr: 0.000728  loss: 5.6535 (5.6585)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.5795 (2.9486)  time: 0.5226  data: 0.0005  max mem: 43713
Epoch: [3]  [1000/1251]  eta: 0:02:12  lr: 0.000760  min_lr: 0.000760  loss: 5.6165 (5.6532)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.5132 (2.8847)  time: 0.5327  data: 0.0004  max mem: 43713
Epoch: [3]  [1200/1251]  eta: 0:00:26  lr: 0.000792  min_lr: 0.000792  loss: 5.7705 (5.6242)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.5889 (2.8374)  time: 0.5231  data: 0.0005  max mem: 43713
Epoch: [3]  [1250/1251]  eta: 0:00:00  lr: 0.000799  min_lr: 0.000799  loss: 5.7084 (5.6208)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.5889 (2.8222)  time: 0.4436  data: 0.0005  max mem: 43713
Epoch: [3] Total time: 0:10:56 (0.5249 s / it)
Averaged stats: lr: 0.000799  min_lr: 0.000799  loss: 5.7084 (5.6275)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.5889 (2.8222)
Test:  [ 0/25]  eta: 0:02:29  loss: 3.0662 (3.0662)  acc1: 39.6000 (39.6000)  acc5: 64.8000 (64.8000)  time: 5.9763  data: 5.6722  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 3.0144 (3.0682)  acc1: 38.8000 (34.8000)  acc5: 64.4000 (63.6364)  time: 0.7840  data: 0.5159  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 3.4312 (3.3503)  acc1: 28.8000 (31.5238)  acc5: 54.8000 (58.4000)  time: 0.2651  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 3.4830 (3.3227)  acc1: 29.6000 (31.9520)  acc5: 54.8000 (58.8320)  time: 0.2652  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4989 s / it)
* Acc@1 31.960 Acc@5 58.822 loss 3.324
Accuracy of the model on the 50000 test images: 32.0%
Max accuracy: 31.96%
Epoch: [4]  [   0/1251]  eta: 0:57:45  lr: 0.000800  min_lr: 0.000800  loss: 5.3947 (5.3947)  weight_decay: 0.0500 (0.0500)  time: 2.7705  data: 2.2268  max mem: 43713
Epoch: [4]  [ 200/1251]  eta: 0:09:21  lr: 0.000832  min_lr: 0.000832  loss: 5.4770 (5.4365)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.4077 (2.7002)  time: 0.5237  data: 0.0005  max mem: 43713
Epoch: [4]  [ 400/1251]  eta: 0:07:30  lr: 0.000864  min_lr: 0.000864  loss: 5.3216 (5.4418)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.3472 (2.5982)  time: 0.5224  data: 0.0005  max mem: 43713
Epoch: [4]  [ 600/1251]  eta: 0:05:43  lr: 0.000896  min_lr: 0.000896  loss: 5.5312 (5.4227)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.2737 (2.6399)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [4]  [ 800/1251]  eta: 0:03:57  lr: 0.000928  min_lr: 0.000928  loss: 5.3484 (5.3990)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.0450 (2.5731)  time: 0.5276  data: 0.0005  max mem: 43713
Epoch: [4]  [1000/1251]  eta: 0:02:11  lr: 0.000960  min_lr: 0.000960  loss: 5.5669 (5.3954)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.1905 (2.5136)  time: 0.5224  data: 0.0005  max mem: 43713
Epoch: [4]  [1200/1251]  eta: 0:00:26  lr: 0.000992  min_lr: 0.000992  loss: 5.6696 (5.3727)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.6062 (2.4911)  time: 0.5251  data: 0.0005  max mem: 43713
Epoch: [4]  [1250/1251]  eta: 0:00:00  lr: 0.001000  min_lr: 0.001000  loss: 5.3405 (5.3671)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.0906 (2.4744)  time: 0.4434  data: 0.0005  max mem: 43713
Epoch: [4] Total time: 0:10:56 (0.5245 s / it)
Averaged stats: lr: 0.001000  min_lr: 0.001000  loss: 5.3405 (5.3608)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.0906 (2.4744)
Test:  [ 0/25]  eta: 0:02:20  loss: 2.4927 (2.4927)  acc1: 50.8000 (50.8000)  acc5: 75.2000 (75.2000)  time: 5.6062  data: 5.2996  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 2.4927 (2.6290)  acc1: 49.6000 (44.8364)  acc5: 74.4000 (72.5091)  time: 0.7508  data: 0.4822  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 3.0146 (2.8979)  acc1: 38.0000 (40.0762)  acc5: 64.0000 (66.3429)  time: 0.2651  data: 0.0003  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 3.0498 (2.8855)  acc1: 37.6000 (40.4160)  acc5: 62.4000 (66.5120)  time: 0.2650  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4835 s / it)
* Acc@1 40.132 Acc@5 67.208 loss 2.874
Accuracy of the model on the 50000 test images: 40.1%
Max accuracy: 40.13%
Epoch: [5]  [   0/1251]  eta: 0:51:54  lr: 0.001000  min_lr: 0.001000  loss: 5.5419 (5.5419)  weight_decay: 0.0500 (0.0500)  time: 2.4893  data: 1.9462  max mem: 43713
Epoch: [5]  [ 200/1251]  eta: 0:09:21  lr: 0.001032  min_lr: 0.001032  loss: 5.1025 (5.3065)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7991 (1.9689)  time: 0.5234  data: 0.0005  max mem: 43713
Epoch: [5]  [ 400/1251]  eta: 0:07:30  lr: 0.001064  min_lr: 0.001064  loss: 5.3727 (5.2333)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.8202 (2.0741)  time: 0.5254  data: 0.0005  max mem: 43713
Epoch: [5]  [ 600/1251]  eta: 0:05:43  lr: 0.001096  min_lr: 0.001096  loss: 5.0739 (5.2034)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.8218 (2.1074)  time: 0.5228  data: 0.0005  max mem: 43713
Epoch: [5]  [ 800/1251]  eta: 0:03:57  lr: 0.001128  min_lr: 0.001128  loss: 5.0642 (5.1610)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.8350 (2.0853)  time: 0.5224  data: 0.0005  max mem: 43713
Epoch: [5]  [1000/1251]  eta: 0:02:11  lr: 0.001160  min_lr: 0.001160  loss: 5.1035 (5.1541)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.8235 (2.0401)  time: 0.5244  data: 0.0005  max mem: 43713
Epoch: [5]  [1200/1251]  eta: 0:00:26  lr: 0.001192  min_lr: 0.001192  loss: 5.0747 (5.1364)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6992 (2.0069)  time: 0.5221  data: 0.0005  max mem: 43713
Epoch: [5]  [1250/1251]  eta: 0:00:00  lr: 0.001200  min_lr: 0.001200  loss: 4.8764 (5.1312)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7416 (1.9983)  time: 0.4434  data: 0.0007  max mem: 43713
Epoch: [5] Total time: 0:10:56 (0.5248 s / it)
Averaged stats: lr: 0.001200  min_lr: 0.001200  loss: 4.8764 (5.1404)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7416 (1.9983)
Test:  [ 0/25]  eta: 0:02:15  loss: 2.0235 (2.0235)  acc1: 61.2000 (61.2000)  acc5: 81.2000 (81.2000)  time: 5.4127  data: 5.1183  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 2.0842 (2.1761)  acc1: 51.6000 (51.9273)  acc5: 80.4000 (78.4364)  time: 0.7326  data: 0.4655  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 2.6166 (2.5186)  acc1: 43.6000 (46.3429)  acc5: 69.6000 (72.0952)  time: 0.2647  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 2.7496 (2.5139)  acc1: 41.2000 (46.5600)  acc5: 65.6000 (72.0320)  time: 0.2647  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4758 s / it)
* Acc@1 46.246 Acc@5 72.716 loss 2.504
Accuracy of the model on the 50000 test images: 46.2%
Max accuracy: 46.25%
Epoch: [6]  [   0/1251]  eta: 1:01:21  lr: 0.001200  min_lr: 0.001200  loss: 5.6722 (5.6722)  weight_decay: 0.0500 (0.0500)  time: 2.9429  data: 2.3992  max mem: 43713
Epoch: [6]  [ 200/1251]  eta: 0:09:23  lr: 0.001232  min_lr: 0.001232  loss: 5.0304 (5.0156)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7709 (1.8537)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [6]  [ 400/1251]  eta: 0:07:30  lr: 0.001264  min_lr: 0.001264  loss: 5.1118 (4.9984)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6110 (1.7939)  time: 0.5236  data: 0.0004  max mem: 43713
Epoch: [6]  [ 600/1251]  eta: 0:05:43  lr: 0.001296  min_lr: 0.001296  loss: 5.2587 (4.9837)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5419 (1.7419)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [6]  [ 800/1251]  eta: 0:03:57  lr: 0.001328  min_lr: 0.001328  loss: 5.0037 (4.9794)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4568 (1.7476)  time: 0.5287  data: 0.0005  max mem: 43713
Epoch: [6]  [1000/1251]  eta: 0:02:12  lr: 0.001360  min_lr: 0.001360  loss: 4.6851 (4.9704)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5443 (1.7140)  time: 0.5222  data: 0.0004  max mem: 43713
Epoch: [6]  [1200/1251]  eta: 0:00:26  lr: 0.001393  min_lr: 0.001393  loss: 4.7197 (4.9650)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5532 (1.6928)  time: 0.5223  data: 0.0004  max mem: 43713
Epoch: [6]  [1250/1251]  eta: 0:00:00  lr: 0.001400  min_lr: 0.001400  loss: 5.1529 (4.9637)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4608 (1.6820)  time: 0.4433  data: 0.0005  max mem: 43713
Epoch: [6] Total time: 0:10:56 (0.5247 s / it)
Averaged stats: lr: 0.001400  min_lr: 0.001400  loss: 5.1529 (4.9758)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4608 (1.6820)
Test:  [ 0/25]  eta: 0:02:10  loss: 2.0116 (2.0116)  acc1: 60.8000 (60.8000)  acc5: 81.2000 (81.2000)  time: 5.2395  data: 4.9540  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 1.9983 (2.0566)  acc1: 58.0000 (56.8727)  acc5: 84.4000 (82.1455)  time: 0.7165  data: 0.4507  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 2.5380 (2.3693)  acc1: 43.6000 (50.6095)  acc5: 72.4000 (76.3238)  time: 0.2640  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 2.5779 (2.3726)  acc1: 43.6000 (50.6720)  acc5: 71.2000 (76.2240)  time: 0.2640  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4661 s / it)
* Acc@1 50.776 Acc@5 76.668 loss 2.354
Accuracy of the model on the 50000 test images: 50.8%
Max accuracy: 50.78%
Epoch: [7]  [   0/1251]  eta: 0:58:56  lr: 0.001400  min_lr: 0.001400  loss: 5.7262 (5.7262)  weight_decay: 0.0500 (0.0500)  time: 2.8267  data: 2.2835  max mem: 43713
Epoch: [7]  [ 200/1251]  eta: 0:09:21  lr: 0.001432  min_lr: 0.001432  loss: 4.9590 (4.9113)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4113 (1.4637)  time: 0.5226  data: 0.0005  max mem: 43713
Epoch: [7]  [ 400/1251]  eta: 0:07:30  lr: 0.001464  min_lr: 0.001464  loss: 4.7853 (4.8667)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6772 (1.5221)  time: 0.5225  data: 0.0005  max mem: 43713
Epoch: [7]  [ 600/1251]  eta: 0:05:43  lr: 0.001496  min_lr: 0.001496  loss: 4.8029 (4.8430)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3626 (1.5134)  time: 0.5221  data: 0.0005  max mem: 43713
Epoch: [7]  [ 800/1251]  eta: 0:03:57  lr: 0.001528  min_lr: 0.001528  loss: 5.1081 (4.8486)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3619 (1.4938)  time: 0.5231  data: 0.0005  max mem: 43713
Epoch: [7]  [1000/1251]  eta: 0:02:12  lr: 0.001561  min_lr: 0.001561  loss: 4.7931 (4.8416)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3262 (1.4679)  time: 0.5239  data: 0.0005  max mem: 43713
Epoch: [7]  [1200/1251]  eta: 0:00:26  lr: 0.001593  min_lr: 0.001593  loss: 4.8614 (4.8259)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2187 (1.4410)  time: 0.5244  data: 0.0005  max mem: 43713
Epoch: [7]  [1250/1251]  eta: 0:00:00  lr: 0.001600  min_lr: 0.001600  loss: 4.9870 (4.8234)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2484 (1.4363)  time: 0.4435  data: 0.0006  max mem: 43713
Epoch: [7] Total time: 0:10:56 (0.5248 s / it)
Averaged stats: lr: 0.001600  min_lr: 0.001600  loss: 4.9870 (4.8165)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2484 (1.4363)
Test:  [ 0/25]  eta: 0:02:19  loss: 1.7187 (1.7187)  acc1: 65.2000 (65.2000)  acc5: 86.4000 (86.4000)  time: 5.5959  data: 5.3014  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 1.7187 (1.7981)  acc1: 60.8000 (61.7818)  acc5: 87.2000 (85.3818)  time: 0.7495  data: 0.4822  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 2.2040 (2.1429)  acc1: 52.8000 (54.8571)  acc5: 75.6000 (79.3714)  time: 0.2649  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 2.3932 (2.1451)  acc1: 50.4000 (54.7040)  acc5: 73.6000 (79.2320)  time: 0.2649  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4842 s / it)
* Acc@1 54.560 Acc@5 79.800 loss 2.140
Accuracy of the model on the 50000 test images: 54.6%
Max accuracy: 54.56%
Epoch: [8]  [   0/1251]  eta: 0:57:53  lr: 0.001600  min_lr: 0.001600  loss: 4.4485 (4.4485)  weight_decay: 0.0500 (0.0500)  time: 2.7764  data: 2.2523  max mem: 43713
Epoch: [8]  [ 200/1251]  eta: 0:09:22  lr: 0.001632  min_lr: 0.001632  loss: 4.9626 (4.7989)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3279 (1.4181)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [8]  [ 400/1251]  eta: 0:07:31  lr: 0.001664  min_lr: 0.001664  loss: 4.4007 (4.7854)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2346 (1.3744)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [8]  [ 600/1251]  eta: 0:05:43  lr: 0.001696  min_lr: 0.001696  loss: 4.8960 (4.7478)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1204 (1.3285)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [8]  [ 800/1251]  eta: 0:03:57  lr: 0.001728  min_lr: 0.001728  loss: 4.7752 (4.7295)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1561 (1.3125)  time: 0.5310  data: 0.0004  max mem: 43713
Epoch: [8]  [1000/1251]  eta: 0:02:12  lr: 0.001761  min_lr: 0.001761  loss: 4.8911 (4.7238)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2486 (1.2955)  time: 0.5231  data: 0.0004  max mem: 43713
Epoch: [8]  [1200/1251]  eta: 0:00:26  lr: 0.001793  min_lr: 0.001793  loss: 4.6668 (4.7227)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1482 (1.2783)  time: 0.5232  data: 0.0004  max mem: 43713
Epoch: [8]  [1250/1251]  eta: 0:00:00  lr: 0.001800  min_lr: 0.001800  loss: 4.8809 (4.7189)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1746 (1.2754)  time: 0.4434  data: 0.0005  max mem: 43713
Epoch: [8] Total time: 0:10:56 (0.5247 s / it)
Averaged stats: lr: 0.001800  min_lr: 0.001800  loss: 4.8809 (4.7066)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1746 (1.2754)
Test:  [ 0/25]  eta: 0:02:17  loss: 1.7158 (1.7158)  acc1: 67.6000 (67.6000)  acc5: 85.2000 (85.2000)  time: 5.5062  data: 5.2080  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 1.7158 (1.7821)  acc1: 65.2000 (62.8364)  acc5: 88.4000 (86.5091)  time: 0.7411  data: 0.4738  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 2.2498 (2.0961)  acc1: 55.6000 (56.8191)  acc5: 77.6000 (80.9905)  time: 0.2647  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 2.2838 (2.0993)  acc1: 52.0000 (56.6400)  acc5: 76.8000 (81.0080)  time: 0.2647  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4787 s / it)
* Acc@1 56.640 Acc@5 81.350 loss 2.093
Accuracy of the model on the 50000 test images: 56.6%
Max accuracy: 56.64%
Epoch: [9]  [   0/1251]  eta: 0:59:19  lr: 0.001800  min_lr: 0.001800  loss: 5.1415 (5.1415)  weight_decay: 0.0500 (0.0500)  time: 2.8451  data: 2.3157  max mem: 43713
Epoch: [9]  [ 200/1251]  eta: 0:09:22  lr: 0.001832  min_lr: 0.001832  loss: 4.8425 (4.6568)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9944 (1.1209)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [9]  [ 400/1251]  eta: 0:07:30  lr: 0.001864  min_lr: 0.001864  loss: 4.5971 (4.6584)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1251 (1.1538)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [9]  [ 600/1251]  eta: 0:05:43  lr: 0.001896  min_lr: 0.001896  loss: 4.7475 (4.6353)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1835 (1.1662)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [9]  [ 800/1251]  eta: 0:03:57  lr: 0.001929  min_lr: 0.001929  loss: 4.7080 (4.6333)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0099 (1.1579)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [9]  [1000/1251]  eta: 0:02:11  lr: 0.001961  min_lr: 0.001961  loss: 4.5903 (4.6073)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1860 (1.1659)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [9]  [1200/1251]  eta: 0:00:26  lr: 0.001993  min_lr: 0.001993  loss: 4.3734 (4.5962)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1140 (1.1445)  time: 0.5225  data: 0.0005  max mem: 43713
Epoch: [9]  [1250/1251]  eta: 0:00:00  lr: 0.002000  min_lr: 0.002000  loss: 4.6773 (4.5943)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1149 (1.1424)  time: 0.4438  data: 0.0007  max mem: 43713
Epoch: [9] Total time: 0:10:56 (0.5247 s / it)
Averaged stats: lr: 0.002000  min_lr: 0.002000  loss: 4.6773 (4.6245)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1149 (1.1424)
Test:  [ 0/25]  eta: 0:01:59  loss: 1.4672 (1.4672)  acc1: 70.0000 (70.0000)  acc5: 89.2000 (89.2000)  time: 4.7628  data: 4.4403  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 1.5260 (1.6220)  acc1: 65.6000 (66.5818)  acc5: 89.2000 (88.5455)  time: 0.6918  data: 0.4215  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.9765 (1.9190)  acc1: 57.2000 (60.2476)  acc5: 80.0000 (83.2952)  time: 0.2794  data: 0.0098  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 2.0757 (1.9223)  acc1: 54.8000 (59.9840)  acc5: 79.6000 (83.3120)  time: 0.2711  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4604 s / it)
* Acc@1 59.384 Acc@5 83.304 loss 1.926
Accuracy of the model on the 50000 test images: 59.4%
Max accuracy: 59.38%
Epoch: [10]  [   0/1251]  eta: 1:03:08  lr: 0.002000  min_lr: 0.002000  loss: 3.9646 (3.9646)  weight_decay: 0.0500 (0.0500)  time: 3.0282  data: 2.5038  max mem: 43713
Epoch: [10]  [ 200/1251]  eta: 0:09:23  lr: 0.002032  min_lr: 0.002032  loss: 4.8950 (4.6224)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0177 (1.0214)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [10]  [ 400/1251]  eta: 0:07:31  lr: 0.002064  min_lr: 0.002064  loss: 4.5957 (4.5913)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0027 (1.0541)  time: 0.5346  data: 0.0005  max mem: 43713
Epoch: [10]  [ 600/1251]  eta: 0:05:43  lr: 0.002096  min_lr: 0.002096  loss: 4.3451 (4.5797)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0016 (1.0549)  time: 0.5222  data: 0.0004  max mem: 43713
Epoch: [10]  [ 800/1251]  eta: 0:03:57  lr: 0.002129  min_lr: 0.002129  loss: 4.4291 (4.5702)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9018 (1.0435)  time: 0.5313  data: 0.0005  max mem: 43713
Epoch: [10]  [1000/1251]  eta: 0:02:12  lr: 0.002161  min_lr: 0.002161  loss: 4.7987 (4.5577)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8828 (1.0317)  time: 0.5232  data: 0.0005  max mem: 43713
Epoch: [10]  [1200/1251]  eta: 0:00:26  lr: 0.002193  min_lr: 0.002193  loss: 4.8481 (4.5303)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8328 (1.0120)  time: 0.5235  data: 0.0004  max mem: 43713
Epoch: [10]  [1250/1251]  eta: 0:00:00  lr: 0.002200  min_lr: 0.002200  loss: 4.5022 (4.5262)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9082 (1.0084)  time: 0.4437  data: 0.0005  max mem: 43713
Epoch: [10] Total time: 0:10:56 (0.5251 s / it)
Averaged stats: lr: 0.002200  min_lr: 0.002200  loss: 4.5022 (4.5337)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9082 (1.0084)
Test:  [ 0/25]  eta: 0:02:18  loss: 1.4393 (1.4393)  acc1: 69.6000 (69.6000)  acc5: 89.6000 (89.6000)  time: 5.5581  data: 5.2662  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 1.4393 (1.5178)  acc1: 68.0000 (66.2182)  acc5: 89.6000 (89.3091)  time: 0.7462  data: 0.4790  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.8491 (1.8067)  acc1: 57.2000 (60.7619)  acc5: 82.8000 (84.9524)  time: 0.2650  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.9855 (1.8138)  acc1: 56.8000 (60.7360)  acc5: 81.6000 (84.9120)  time: 0.2650  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4812 s / it)
* Acc@1 61.074 Acc@5 84.314 loss 1.821
Accuracy of the model on the 50000 test images: 61.1%
Max accuracy: 61.07%
Epoch: [11]  [   0/1251]  eta: 0:55:12  lr: 0.002200  min_lr: 0.002200  loss: 4.6339 (4.6339)  weight_decay: 0.0500 (0.0500)  time: 2.6478  data: 2.1199  max mem: 43713
Epoch: [11]  [ 200/1251]  eta: 0:09:21  lr: 0.002232  min_lr: 0.002232  loss: 4.4251 (4.4806)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9116 (1.0358)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [11]  [ 400/1251]  eta: 0:07:30  lr: 0.002264  min_lr: 0.002264  loss: 4.5337 (4.4989)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9326 (1.0210)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [11]  [ 600/1251]  eta: 0:05:43  lr: 0.002297  min_lr: 0.002297  loss: 4.5786 (4.4872)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0050 (1.0004)  time: 0.5223  data: 0.0004  max mem: 43713
Epoch: [11]  [ 800/1251]  eta: 0:03:57  lr: 0.002329  min_lr: 0.002329  loss: 4.8002 (4.5021)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8839 (0.9801)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [11]  [1000/1251]  eta: 0:02:11  lr: 0.002361  min_lr: 0.002361  loss: 4.4757 (4.4851)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9951 (0.9661)  time: 0.5231  data: 0.0005  max mem: 43713
Epoch: [11]  [1200/1251]  eta: 0:00:26  lr: 0.002393  min_lr: 0.002393  loss: 4.5982 (4.4742)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9244 (0.9609)  time: 0.5242  data: 0.0005  max mem: 43713
Epoch: [11]  [1250/1251]  eta: 0:00:00  lr: 0.002400  min_lr: 0.002400  loss: 4.2988 (4.4706)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8930 (0.9579)  time: 0.4437  data: 0.0006  max mem: 43713
Epoch: [11] Total time: 0:10:56 (0.5245 s / it)
Averaged stats: lr: 0.002400  min_lr: 0.002400  loss: 4.2988 (4.4478)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8930 (0.9579)
Test:  [ 0/25]  eta: 0:02:22  loss: 1.3364 (1.3364)  acc1: 74.4000 (74.4000)  acc5: 90.4000 (90.4000)  time: 5.7170  data: 5.4115  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 1.3364 (1.4586)  acc1: 70.4000 (68.5091)  acc5: 92.0000 (90.4727)  time: 0.7600  data: 0.4923  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.8254 (1.7539)  acc1: 60.4000 (62.8381)  acc5: 83.2000 (85.8476)  time: 0.2642  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.9376 (1.7632)  acc1: 57.2000 (62.5120)  acc5: 82.4000 (85.8880)  time: 0.2640  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4872 s / it)
* Acc@1 62.610 Acc@5 85.510 loss 1.762
Accuracy of the model on the 50000 test images: 62.6%
Max accuracy: 62.61%
Epoch: [12]  [   0/1251]  eta: 0:58:06  lr: 0.002400  min_lr: 0.002400  loss: 4.2413 (4.2413)  weight_decay: 0.0500 (0.0500)  time: 2.7873  data: 2.2413  max mem: 43713
Epoch: [12]  [ 200/1251]  eta: 0:09:22  lr: 0.002432  min_lr: 0.002432  loss: 4.5516 (4.4062)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9580 (0.9236)  time: 0.5230  data: 0.0005  max mem: 43713
Epoch: [12]  [ 400/1251]  eta: 0:07:30  lr: 0.002464  min_lr: 0.002464  loss: 4.6087 (4.4166)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8636 (0.9099)  time: 0.5233  data: 0.0004  max mem: 43713
Epoch: [12]  [ 600/1251]  eta: 0:05:43  lr: 0.002497  min_lr: 0.002497  loss: 4.5503 (4.4120)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0480 (0.9271)  time: 0.5322  data: 0.0005  max mem: 43713
Epoch: [12]  [ 800/1251]  eta: 0:03:57  lr: 0.002529  min_lr: 0.002529  loss: 4.5707 (4.4000)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8299 (0.9148)  time: 0.5319  data: 0.0005  max mem: 43713
Epoch: [12]  [1000/1251]  eta: 0:02:12  lr: 0.002561  min_lr: 0.002561  loss: 4.6961 (4.3976)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8649 (0.9046)  time: 0.5233  data: 0.0005  max mem: 43713
Epoch: [12]  [1200/1251]  eta: 0:00:26  lr: 0.002593  min_lr: 0.002593  loss: 4.4709 (4.3879)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9725 (0.8970)  time: 0.5230  data: 0.0005  max mem: 43713
Epoch: [12]  [1250/1251]  eta: 0:00:00  lr: 0.002600  min_lr: 0.002600  loss: 4.5769 (4.3884)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8968 (0.8973)  time: 0.4436  data: 0.0006  max mem: 43713
Epoch: [12] Total time: 0:10:56 (0.5251 s / it)
Averaged stats: lr: 0.002600  min_lr: 0.002600  loss: 4.5769 (4.3999)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8968 (0.8973)
Test:  [ 0/25]  eta: 0:01:42  loss: 1.1880 (1.1880)  acc1: 78.4000 (78.4000)  acc5: 94.0000 (94.0000)  time: 4.0845  data: 3.7770  max mem: 43713
Test:  [10/25]  eta: 0:00:09  loss: 1.2537 (1.3814)  acc1: 69.6000 (69.7091)  acc5: 93.6000 (91.9636)  time: 0.6492  data: 0.3791  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.7584 (1.6985)  acc1: 60.8000 (64.1333)  acc5: 86.0000 (87.3333)  time: 0.2865  data: 0.0211  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.9358 (1.7098)  acc1: 60.4000 (63.9040)  acc5: 84.0000 (87.1200)  time: 0.2658  data: 0.0016  max mem: 43713
Test: Total time: 0:00:10 (0.4380 s / it)
* Acc@1 64.090 Acc@5 86.500 loss 1.724
Accuracy of the model on the 50000 test images: 64.1%
Max accuracy: 64.09%
Epoch: [13]  [   0/1251]  eta: 0:59:09  lr: 0.002600  min_lr: 0.002600  loss: 4.6605 (4.6605)  weight_decay: 0.0500 (0.0500)  time: 2.8370  data: 2.3096  max mem: 43713
Epoch: [13]  [ 200/1251]  eta: 0:09:21  lr: 0.002632  min_lr: 0.002632  loss: 4.2719 (4.2786)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7521 (0.7898)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [13]  [ 400/1251]  eta: 0:07:29  lr: 0.002665  min_lr: 0.002665  loss: 4.2391 (4.2971)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7440 (0.8037)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [13]  [ 600/1251]  eta: 0:05:43  lr: 0.002697  min_lr: 0.002697  loss: 4.1459 (4.3105)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8387 (0.8230)  time: 0.5233  data: 0.0004  max mem: 43713
Epoch: [13]  [ 800/1251]  eta: 0:03:57  lr: 0.002729  min_lr: 0.002729  loss: 3.8369 (4.3114)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8120 (0.8161)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [13]  [1000/1251]  eta: 0:02:11  lr: 0.002761  min_lr: 0.002761  loss: 4.3602 (4.3104)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7110 (0.8004)  time: 0.5225  data: 0.0005  max mem: 43713
Epoch: [13]  [1200/1251]  eta: 0:00:26  lr: 0.002793  min_lr: 0.002793  loss: 4.4465 (4.3125)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8778 (0.8006)  time: 0.5225  data: 0.0005  max mem: 43713
Epoch: [13]  [1250/1251]  eta: 0:00:00  lr: 0.002800  min_lr: 0.002800  loss: 4.5261 (4.3075)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7432 (0.8003)  time: 0.4438  data: 0.0004  max mem: 43713
Epoch: [13] Total time: 0:10:56 (0.5244 s / it)
Averaged stats: lr: 0.002800  min_lr: 0.002800  loss: 4.5261 (4.3256)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7432 (0.8003)
Test:  [ 0/25]  eta: 0:02:19  loss: 1.2361 (1.2361)  acc1: 76.0000 (76.0000)  acc5: 92.4000 (92.4000)  time: 5.5691  data: 5.2563  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 1.2752 (1.3790)  acc1: 73.2000 (70.4364)  acc5: 93.2000 (91.4545)  time: 0.7468  data: 0.4782  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.7826 (1.6572)  acc1: 59.2000 (64.6095)  acc5: 86.0000 (87.0667)  time: 0.2643  data: 0.0003  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.7929 (1.6675)  acc1: 60.0000 (64.4640)  acc5: 84.0000 (86.9920)  time: 0.2641  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4844 s / it)
* Acc@1 64.776 Acc@5 86.936 loss 1.658
Accuracy of the model on the 50000 test images: 64.8%
Max accuracy: 64.78%
Epoch: [14]  [   0/1251]  eta: 1:02:12  lr: 0.002800  min_lr: 0.002800  loss: 4.1222 (4.1222)  weight_decay: 0.0500 (0.0500)  time: 2.9836  data: 2.4453  max mem: 43713
Epoch: [14]  [ 200/1251]  eta: 0:09:23  lr: 0.002833  min_lr: 0.002833  loss: 4.5211 (4.2982)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7510 (0.8097)  time: 0.5253  data: 0.0005  max mem: 43713
Epoch: [14]  [ 400/1251]  eta: 0:07:31  lr: 0.002865  min_lr: 0.002865  loss: 4.2863 (4.2686)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7311 (0.7833)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [14]  [ 600/1251]  eta: 0:05:43  lr: 0.002897  min_lr: 0.002897  loss: 4.4746 (4.2633)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6699 (0.7570)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [14]  [ 800/1251]  eta: 0:03:57  lr: 0.002929  min_lr: 0.002929  loss: 4.6799 (4.2931)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8465 (0.7558)  time: 0.5277  data: 0.0004  max mem: 43713
Epoch: [14]  [1000/1251]  eta: 0:02:12  lr: 0.002961  min_lr: 0.002961  loss: 4.3892 (4.2865)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6581 (0.7692)  time: 0.5263  data: 0.0004  max mem: 43713
Epoch: [14]  [1200/1251]  eta: 0:00:26  lr: 0.002993  min_lr: 0.002993  loss: 4.3512 (4.2810)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7408 (0.7612)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [14]  [1250/1251]  eta: 0:00:00  lr: 0.003000  min_lr: 0.003000  loss: 3.9201 (4.2769)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7017 (0.7590)  time: 0.4433  data: 0.0005  max mem: 43713
Epoch: [14] Total time: 0:10:56 (0.5248 s / it)
Averaged stats: lr: 0.003000  min_lr: 0.003000  loss: 3.9201 (4.2794)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7017 (0.7590)
Test:  [ 0/25]  eta: 0:02:25  loss: 1.1434 (1.1434)  acc1: 77.6000 (77.6000)  acc5: 94.0000 (94.0000)  time: 5.8161  data: 5.5089  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 1.2646 (1.3597)  acc1: 72.0000 (71.8909)  acc5: 93.2000 (91.8182)  time: 0.7689  data: 0.5011  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.7881 (1.6044)  acc1: 61.2000 (66.1143)  acc5: 84.4000 (87.8476)  time: 0.2641  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.8107 (1.6095)  acc1: 61.2000 (65.7280)  acc5: 84.0000 (87.7920)  time: 0.2640  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4893 s / it)
* Acc@1 65.726 Acc@5 87.638 loss 1.608
Accuracy of the model on the 50000 test images: 65.7%
Max accuracy: 65.73%
Epoch: [15]  [   0/1251]  eta: 0:52:02  lr: 0.003000  min_lr: 0.003000  loss: 4.0759 (4.0759)  weight_decay: 0.0500 (0.0500)  time: 2.4964  data: 1.9532  max mem: 43713
Epoch: [15]  [ 200/1251]  eta: 0:09:21  lr: 0.003033  min_lr: 0.003033  loss: 4.2173 (4.2216)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7546 (0.7630)  time: 0.5240  data: 0.0005  max mem: 43713
Epoch: [15]  [ 400/1251]  eta: 0:07:29  lr: 0.003065  min_lr: 0.003065  loss: 4.5677 (4.2358)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7769 (0.7794)  time: 0.5234  data: 0.0005  max mem: 43713
Epoch: [15]  [ 600/1251]  eta: 0:05:43  lr: 0.003097  min_lr: 0.003097  loss: 4.1590 (4.2334)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6265 (0.7557)  time: 0.5235  data: 0.0005  max mem: 43713
Epoch: [15]  [ 800/1251]  eta: 0:03:57  lr: 0.003129  min_lr: 0.003129  loss: 4.3020 (4.2403)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7001 (0.7448)  time: 0.5232  data: 0.0004  max mem: 43713
Epoch: [15]  [1000/1251]  eta: 0:02:11  lr: 0.003161  min_lr: 0.003161  loss: 4.2718 (4.2403)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6581 (0.7367)  time: 0.5240  data: 0.0004  max mem: 43713
Epoch: [15]  [1200/1251]  eta: 0:00:26  lr: 0.003193  min_lr: 0.003193  loss: 4.0658 (4.2323)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6456 (0.7239)  time: 0.5325  data: 0.0005  max mem: 43713
Epoch: [15]  [1250/1251]  eta: 0:00:00  lr: 0.003200  min_lr: 0.003200  loss: 4.3269 (4.2348)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6676 (0.7226)  time: 0.4436  data: 0.0005  max mem: 43713
Epoch: [15] Total time: 0:10:56 (0.5249 s / it)
Averaged stats: lr: 0.003200  min_lr: 0.003200  loss: 4.3269 (4.2398)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6676 (0.7226)
Test:  [ 0/25]  eta: 0:02:22  loss: 1.0810 (1.0810)  acc1: 80.0000 (80.0000)  acc5: 93.6000 (93.6000)  time: 5.7163  data: 5.4221  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 1.2017 (1.2921)  acc1: 72.4000 (72.0727)  acc5: 92.8000 (91.8182)  time: 0.7602  data: 0.4932  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.6479 (1.5774)  acc1: 62.4000 (66.3238)  acc5: 86.4000 (87.9429)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.7715 (1.5909)  acc1: 61.6000 (65.7600)  acc5: 85.2000 (87.9840)  time: 0.2644  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4870 s / it)
* Acc@1 66.490 Acc@5 88.100 loss 1.578
Accuracy of the model on the 50000 test images: 66.5%
Max accuracy: 66.49%
Epoch: [16]  [   0/1251]  eta: 1:03:18  lr: 0.003201  min_lr: 0.003201  loss: 4.0442 (4.0442)  weight_decay: 0.0500 (0.0500)  time: 3.0365  data: 2.5015  max mem: 43713
Epoch: [16]  [ 200/1251]  eta: 0:09:23  lr: 0.003233  min_lr: 0.003233  loss: 4.2636 (4.2172)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6375 (0.7081)  time: 0.5231  data: 0.0004  max mem: 43713
Epoch: [16]  [ 400/1251]  eta: 0:07:31  lr: 0.003265  min_lr: 0.003265  loss: 3.8890 (4.1995)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6414 (0.6931)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [16]  [ 600/1251]  eta: 0:05:43  lr: 0.003297  min_lr: 0.003297  loss: 4.2474 (4.2083)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6527 (0.6771)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [16]  [ 800/1251]  eta: 0:03:57  lr: 0.003329  min_lr: 0.003329  loss: 4.2186 (4.2171)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5981 (0.6833)  time: 0.5280  data: 0.0004  max mem: 43713
Epoch: [16]  [1000/1251]  eta: 0:02:12  lr: 0.003361  min_lr: 0.003361  loss: 4.4484 (4.2093)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7575 (0.7024)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [16]  [1200/1251]  eta: 0:00:26  lr: 0.003393  min_lr: 0.003393  loss: 4.2914 (4.1937)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5983 (0.6937)  time: 0.5223  data: 0.0004  max mem: 43713
Epoch: [16]  [1250/1251]  eta: 0:00:00  lr: 0.003400  min_lr: 0.003400  loss: 4.3646 (4.1963)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6007 (0.6904)  time: 0.4433  data: 0.0005  max mem: 43713
Epoch: [16] Total time: 0:10:56 (0.5248 s / it)
Averaged stats: lr: 0.003400  min_lr: 0.003400  loss: 4.3646 (4.1972)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6007 (0.6904)
Test:  [ 0/25]  eta: 0:02:13  loss: 1.1144 (1.1144)  acc1: 77.2000 (77.2000)  acc5: 94.0000 (94.0000)  time: 5.3522  data: 5.0456  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 1.2786 (1.3089)  acc1: 72.0000 (72.0364)  acc5: 93.6000 (92.4364)  time: 0.7268  data: 0.4590  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.6548 (1.5951)  acc1: 63.2000 (66.7048)  acc5: 85.2000 (88.0571)  time: 0.2640  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.7105 (1.5978)  acc1: 65.2000 (66.7520)  acc5: 85.2000 (88.1120)  time: 0.2638  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4763 s / it)
* Acc@1 66.756 Acc@5 88.258 loss 1.594
Accuracy of the model on the 50000 test images: 66.8%
Max accuracy: 66.76%
Epoch: [17]  [   0/1251]  eta: 0:58:48  lr: 0.003401  min_lr: 0.003401  loss: 4.2799 (4.2799)  weight_decay: 0.0500 (0.0500)  time: 2.8205  data: 2.2868  max mem: 43713
Epoch: [17]  [ 200/1251]  eta: 0:09:22  lr: 0.003433  min_lr: 0.003433  loss: 4.0175 (4.1468)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5806 (0.6137)  time: 0.5236  data: 0.0005  max mem: 43713
Epoch: [17]  [ 400/1251]  eta: 0:07:30  lr: 0.003465  min_lr: 0.003465  loss: 3.6400 (4.0926)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6490 (0.6470)  time: 0.5234  data: 0.0004  max mem: 43713
Epoch: [17]  [ 600/1251]  eta: 0:05:43  lr: 0.003497  min_lr: 0.003497  loss: 4.5265 (4.1313)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5881 (0.6494)  time: 0.5235  data: 0.0005  max mem: 43713
Epoch: [17]  [ 800/1251]  eta: 0:03:57  lr: 0.003529  min_lr: 0.003529  loss: 4.5037 (4.1461)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5774 (0.6414)  time: 0.5241  data: 0.0005  max mem: 43713
Epoch: [17]  [1000/1251]  eta: 0:02:12  lr: 0.003561  min_lr: 0.003561  loss: 4.4711 (4.1411)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6903 (0.6491)  time: 0.5226  data: 0.0005  max mem: 43713
Epoch: [17]  [1200/1251]  eta: 0:00:26  lr: 0.003593  min_lr: 0.003593  loss: 4.2898 (4.1605)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6157 (0.6503)  time: 0.5240  data: 0.0004  max mem: 43713
Epoch: [17]  [1250/1251]  eta: 0:00:00  lr: 0.003600  min_lr: 0.003600  loss: 4.4443 (4.1663)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6139 (0.6484)  time: 0.4435  data: 0.0007  max mem: 43713
Epoch: [17] Total time: 0:10:56 (0.5251 s / it)
Averaged stats: lr: 0.003600  min_lr: 0.003600  loss: 4.4443 (4.1703)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6139 (0.6484)
Test:  [ 0/25]  eta: 0:02:22  loss: 1.1276 (1.1276)  acc1: 80.8000 (80.8000)  acc5: 94.4000 (94.4000)  time: 5.6853  data: 5.3915  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 1.3285 (1.4073)  acc1: 72.8000 (73.3455)  acc5: 93.6000 (92.8727)  time: 0.7574  data: 0.4904  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.6762 (1.6730)  acc1: 64.4000 (67.3524)  acc5: 88.8000 (88.8000)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.8689 (1.6772)  acc1: 62.4000 (66.9920)  acc5: 86.8000 (88.8480)  time: 0.2644  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4862 s / it)
* Acc@1 67.416 Acc@5 88.568 loss 1.674
Accuracy of the model on the 50000 test images: 67.4%
Max accuracy: 67.42%
Epoch: [18]  [   0/1251]  eta: 0:59:00  lr: 0.003601  min_lr: 0.003601  loss: 3.4355 (3.4355)  weight_decay: 0.0500 (0.0500)  time: 2.8304  data: 2.2863  max mem: 43713
Epoch: [18]  [ 200/1251]  eta: 0:09:22  lr: 0.003633  min_lr: 0.003633  loss: 4.4335 (4.1417)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5663 (0.6276)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [18]  [ 400/1251]  eta: 0:07:30  lr: 0.003665  min_lr: 0.003665  loss: 4.0841 (4.1127)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5510 (0.6330)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [18]  [ 600/1251]  eta: 0:05:43  lr: 0.003697  min_lr: 0.003697  loss: 3.7776 (4.1079)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5781 (0.6215)  time: 0.5234  data: 0.0004  max mem: 43713
Epoch: [18]  [ 800/1251]  eta: 0:03:57  lr: 0.003729  min_lr: 0.003729  loss: 4.3589 (4.1167)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7181 (0.6289)  time: 0.5279  data: 0.0004  max mem: 43713
Epoch: [18]  [1000/1251]  eta: 0:02:11  lr: 0.003761  min_lr: 0.003761  loss: 4.2092 (4.1136)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6538 (0.6259)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [18]  [1200/1251]  eta: 0:00:26  lr: 0.003793  min_lr: 0.003793  loss: 4.2415 (4.1196)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6672 (0.6283)  time: 0.5261  data: 0.0004  max mem: 43713
Epoch: [18]  [1250/1251]  eta: 0:00:00  lr: 0.003800  min_lr: 0.003800  loss: 4.2282 (4.1205)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5360 (0.6261)  time: 0.4530  data: 0.0004  max mem: 43713
Epoch: [18] Total time: 0:10:56 (0.5249 s / it)
Averaged stats: lr: 0.003800  min_lr: 0.003800  loss: 4.2282 (4.1327)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5360 (0.6261)
Test:  [ 0/25]  eta: 0:02:17  loss: 1.0656 (1.0656)  acc1: 78.0000 (78.0000)  acc5: 94.0000 (94.0000)  time: 5.5109  data: 5.2269  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 1.1694 (1.2322)  acc1: 78.0000 (74.1818)  acc5: 93.6000 (93.4909)  time: 0.7416  data: 0.4754  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.5668 (1.4969)  acc1: 65.6000 (69.0857)  acc5: 89.6000 (89.7524)  time: 0.2648  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.6631 (1.5051)  acc1: 65.6000 (68.8480)  acc5: 87.6000 (89.5360)  time: 0.2648  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4796 s / it)
* Acc@1 68.340 Acc@5 89.170 loss 1.514
Accuracy of the model on the 50000 test images: 68.3%
Max accuracy: 68.34%
Epoch: [19]  [   0/1251]  eta: 0:58:58  lr: 0.003801  min_lr: 0.003801  loss: 4.5363 (4.5363)  weight_decay: 0.0500 (0.0500)  time: 2.8283  data: 2.2864  max mem: 43713
Epoch: [19]  [ 200/1251]  eta: 0:09:22  lr: 0.003833  min_lr: 0.003833  loss: 4.1068 (4.0789)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5859 (0.6406)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [19]  [ 400/1251]  eta: 0:07:30  lr: 0.003865  min_lr: 0.003865  loss: 4.3335 (4.1013)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5685 (0.6258)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [19]  [ 600/1251]  eta: 0:05:43  lr: 0.003897  min_lr: 0.003897  loss: 4.1521 (4.1183)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6397 (0.6339)  time: 0.5239  data: 0.0005  max mem: 43713
Epoch: [19]  [ 800/1251]  eta: 0:03:57  lr: 0.003929  min_lr: 0.003929  loss: 4.2182 (4.1003)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4963 (0.6201)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [19]  [1000/1251]  eta: 0:02:12  lr: 0.003961  min_lr: 0.003961  loss: 4.1994 (4.1120)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5520 (0.6200)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [19]  [1200/1251]  eta: 0:00:26  lr: 0.003993  min_lr: 0.003993  loss: 3.9017 (4.1091)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5796 (0.6177)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [19]  [1250/1251]  eta: 0:00:00  lr: 0.004000  min_lr: 0.004000  loss: 4.3061 (4.1132)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5026 (0.6145)  time: 0.4438  data: 0.0005  max mem: 43713
Epoch: [19] Total time: 0:10:56 (0.5248 s / it)
Averaged stats: lr: 0.004000  min_lr: 0.004000  loss: 4.3061 (4.1045)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5026 (0.6145)
Test:  [ 0/25]  eta: 0:02:20  loss: 1.0485 (1.0485)  acc1: 80.4000 (80.4000)  acc5: 95.6000 (95.6000)  time: 5.6036  data: 5.3176  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 1.2933 (1.2703)  acc1: 75.6000 (74.0364)  acc5: 94.4000 (93.6000)  time: 0.7500  data: 0.4837  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.6156 (1.5478)  acc1: 64.8000 (68.7810)  acc5: 88.8000 (89.8667)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.7265 (1.5598)  acc1: 64.8000 (68.5920)  acc5: 87.6000 (89.7600)  time: 0.2644  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4827 s / it)
* Acc@1 68.448 Acc@5 89.374 loss 1.569
Accuracy of the model on the 50000 test images: 68.4%
Max accuracy: 68.45%
Epoch: [20]  [   0/1251]  eta: 1:02:01  lr: 0.004000  min_lr: 0.004000  loss: 4.4569 (4.4569)  weight_decay: 0.0500 (0.0500)  time: 2.9748  data: 2.4283  max mem: 43713
Epoch: [20]  [ 200/1251]  eta: 0:09:22  lr: 0.004000  min_lr: 0.004000  loss: 3.9458 (4.0221)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5617 (0.6105)  time: 0.5244  data: 0.0004  max mem: 43713
Epoch: [20]  [ 400/1251]  eta: 0:07:31  lr: 0.004000  min_lr: 0.004000  loss: 4.4293 (4.0881)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5421 (0.6072)  time: 0.5250  data: 0.0004  max mem: 43713
Epoch: [20]  [ 600/1251]  eta: 0:05:43  lr: 0.004000  min_lr: 0.004000  loss: 4.1929 (4.0478)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5164 (0.5908)  time: 0.5228  data: 0.0005  max mem: 43713
Epoch: [20]  [ 800/1251]  eta: 0:03:57  lr: 0.004000  min_lr: 0.004000  loss: 4.2363 (4.0525)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5038 (0.5890)  time: 0.5294  data: 0.0004  max mem: 43713
Epoch: [20]  [1000/1251]  eta: 0:02:12  lr: 0.004000  min_lr: 0.004000  loss: 3.7467 (4.0401)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5252 (0.5925)  time: 0.5222  data: 0.0004  max mem: 43713
Epoch: [20]  [1200/1251]  eta: 0:00:26  lr: 0.004000  min_lr: 0.004000  loss: 3.5306 (4.0280)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6291 (0.5894)  time: 0.5244  data: 0.0005  max mem: 43713
Epoch: [20]  [1250/1251]  eta: 0:00:00  lr: 0.004000  min_lr: 0.004000  loss: 4.1854 (4.0317)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5706 (0.5893)  time: 0.4434  data: 0.0006  max mem: 43713
Epoch: [20] Total time: 0:10:57 (0.5253 s / it)
Averaged stats: lr: 0.004000  min_lr: 0.004000  loss: 4.1854 (4.0681)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5706 (0.5893)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.9836 (0.9836)  acc1: 80.0000 (80.0000)  acc5: 94.8000 (94.8000)  time: 5.5908  data: 5.2766  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 1.1266 (1.2023)  acc1: 77.6000 (75.4545)  acc5: 94.0000 (93.7455)  time: 0.7488  data: 0.4800  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.5512 (1.4572)  acc1: 66.0000 (69.6000)  acc5: 88.8000 (89.7714)  time: 0.2646  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.6352 (1.4724)  acc1: 64.8000 (69.0080)  acc5: 87.2000 (89.6640)  time: 0.2647  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4823 s / it)
* Acc@1 68.736 Acc@5 89.640 loss 1.475
Accuracy of the model on the 50000 test images: 68.7%
Max accuracy: 68.74%
Epoch: [21]  [   0/1251]  eta: 0:57:57  lr: 0.004000  min_lr: 0.004000  loss: 3.8761 (3.8761)  weight_decay: 0.0500 (0.0500)  time: 2.7800  data: 2.2486  max mem: 43713
Epoch: [21]  [ 200/1251]  eta: 0:09:23  lr: 0.004000  min_lr: 0.004000  loss: 3.7672 (4.0922)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5138 (0.6175)  time: 0.5323  data: 0.0004  max mem: 43713
Epoch: [21]  [ 400/1251]  eta: 0:07:31  lr: 0.004000  min_lr: 0.004000  loss: 4.2547 (4.0716)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5894 (0.6052)  time: 0.5234  data: 0.0004  max mem: 43713
Epoch: [21]  [ 600/1251]  eta: 0:05:43  lr: 0.004000  min_lr: 0.004000  loss: 4.2546 (4.0670)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5531 (0.6050)  time: 0.5238  data: 0.0004  max mem: 43713
Epoch: [21]  [ 800/1251]  eta: 0:03:57  lr: 0.004000  min_lr: 0.004000  loss: 3.9014 (4.0608)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6075 (0.5952)  time: 0.5235  data: 0.0005  max mem: 43713
Epoch: [21]  [1000/1251]  eta: 0:02:12  lr: 0.004000  min_lr: 0.004000  loss: 4.2413 (4.0635)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5955 (0.5929)  time: 0.5228  data: 0.0005  max mem: 43713
Epoch: [21]  [1200/1251]  eta: 0:00:26  lr: 0.004000  min_lr: 0.004000  loss: 4.1186 (4.0513)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5813 (0.5936)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [21]  [1250/1251]  eta: 0:00:00  lr: 0.003999  min_lr: 0.003999  loss: 4.1607 (4.0502)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5640 (0.5917)  time: 0.4439  data: 0.0006  max mem: 43713
Epoch: [21] Total time: 0:10:57 (0.5252 s / it)
Averaged stats: lr: 0.003999  min_lr: 0.003999  loss: 4.1607 (4.0283)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5640 (0.5917)
Test:  [ 0/25]  eta: 0:02:24  loss: 1.0735 (1.0735)  acc1: 80.0000 (80.0000)  acc5: 94.4000 (94.4000)  time: 5.7629  data: 5.4768  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 1.1961 (1.2139)  acc1: 75.2000 (74.6182)  acc5: 94.8000 (93.9636)  time: 0.7647  data: 0.4982  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.5405 (1.4573)  acc1: 66.0000 (69.6762)  acc5: 88.8000 (90.2857)  time: 0.2649  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.6439 (1.4636)  acc1: 64.4000 (69.3760)  acc5: 87.2000 (90.3360)  time: 0.2649  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4889 s / it)
* Acc@1 69.718 Acc@5 90.058 loss 1.457
Accuracy of the model on the 50000 test images: 69.7%
Max accuracy: 69.72%
Epoch: [22]  [   0/1251]  eta: 0:56:59  lr: 0.003999  min_lr: 0.003999  loss: 2.9056 (2.9056)  weight_decay: 0.0500 (0.0500)  time: 2.7335  data: 2.1976  max mem: 43713
Epoch: [22]  [ 200/1251]  eta: 0:09:22  lr: 0.003999  min_lr: 0.003999  loss: 3.8908 (3.9849)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5296 (nan)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [22]  [ 400/1251]  eta: 0:07:30  lr: 0.003999  min_lr: 0.003999  loss: 4.2487 (3.9961)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5950 (nan)  time: 0.5231  data: 0.0004  max mem: 43713
Epoch: [22]  [ 600/1251]  eta: 0:05:43  lr: 0.003999  min_lr: 0.003999  loss: 3.9042 (3.9893)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4707 (nan)  time: 0.5232  data: 0.0004  max mem: 43713
Epoch: [22]  [ 800/1251]  eta: 0:03:57  lr: 0.003999  min_lr: 0.003999  loss: 4.0631 (3.9852)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5332 (nan)  time: 0.5292  data: 0.0004  max mem: 43713
Epoch: [22]  [1000/1251]  eta: 0:02:12  lr: 0.003999  min_lr: 0.003999  loss: 4.0829 (3.9771)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5527 (nan)  time: 0.5232  data: 0.0004  max mem: 43713
Epoch: [22]  [1200/1251]  eta: 0:00:26  lr: 0.003999  min_lr: 0.003999  loss: 4.2228 (3.9870)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5595 (nan)  time: 0.5231  data: 0.0004  max mem: 43713
Epoch: [22]  [1250/1251]  eta: 0:00:00  lr: 0.003999  min_lr: 0.003999  loss: 4.2253 (3.9893)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5352 (nan)  time: 0.4442  data: 0.0005  max mem: 43713
Epoch: [22] Total time: 0:10:56 (0.5250 s / it)
Averaged stats: lr: 0.003999  min_lr: 0.003999  loss: 4.2253 (3.9934)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5352 (nan)
Test:  [ 0/25]  eta: 0:02:27  loss: 1.1801 (1.1801)  acc1: 79.2000 (79.2000)  acc5: 95.6000 (95.6000)  time: 5.9166  data: 5.6294  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 1.2090 (1.2515)  acc1: 76.4000 (75.8909)  acc5: 94.8000 (94.1818)  time: 0.7786  data: 0.5120  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.5629 (1.5133)  acc1: 68.0000 (70.0381)  acc5: 90.0000 (90.3810)  time: 0.2649  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.6938 (1.5231)  acc1: 66.8000 (69.7280)  acc5: 87.6000 (90.2720)  time: 0.2648  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4960 s / it)
* Acc@1 70.024 Acc@5 90.104 loss 1.524
Accuracy of the model on the 50000 test images: 70.0%
Max accuracy: 70.02%
Epoch: [23]  [   0/1251]  eta: 0:49:55  lr: 0.003999  min_lr: 0.003999  loss: 4.6070 (4.6070)  weight_decay: 0.0500 (0.0500)  time: 2.3945  data: 1.8697  max mem: 43713
Epoch: [23]  [ 200/1251]  eta: 0:09:19  lr: 0.003999  min_lr: 0.003999  loss: 3.6716 (3.9377)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5511 (0.5881)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [23]  [ 400/1251]  eta: 0:07:29  lr: 0.003999  min_lr: 0.003999  loss: 3.9770 (3.9448)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4812 (0.5484)  time: 0.5233  data: 0.0004  max mem: 43713
Epoch: [23]  [ 600/1251]  eta: 0:05:42  lr: 0.003998  min_lr: 0.003998  loss: 4.1858 (3.9539)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5359 (0.5680)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [23]  [ 800/1251]  eta: 0:03:57  lr: 0.003998  min_lr: 0.003998  loss: 4.3341 (3.9613)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5751 (0.5789)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [23]  [1000/1251]  eta: 0:02:11  lr: 0.003998  min_lr: 0.003998  loss: 4.1537 (3.9596)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5272 (0.5742)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [23]  [1200/1251]  eta: 0:00:26  lr: 0.003998  min_lr: 0.003998  loss: 4.0303 (3.9535)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4533 (0.5699)  time: 0.5223  data: 0.0004  max mem: 43713
Epoch: [23]  [1250/1251]  eta: 0:00:00  lr: 0.003998  min_lr: 0.003998  loss: 4.1487 (3.9529)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4870 (0.5680)  time: 0.4435  data: 0.0006  max mem: 43713
Epoch: [23] Total time: 0:10:56 (0.5244 s / it)
Averaged stats: lr: 0.003998  min_lr: 0.003998  loss: 4.1487 (3.9560)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4870 (0.5680)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.9962 (0.9962)  acc1: 83.6000 (83.6000)  acc5: 95.6000 (95.6000)  time: 5.7746  data: 5.4844  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 1.2013 (1.2224)  acc1: 76.0000 (75.6727)  acc5: 94.0000 (94.1455)  time: 0.7656  data: 0.4989  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.4956 (1.4559)  acc1: 66.8000 (70.7619)  acc5: 90.4000 (90.7619)  time: 0.2646  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.5807 (1.4624)  acc1: 66.8000 (70.3360)  acc5: 88.4000 (90.7040)  time: 0.2645  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4897 s / it)
* Acc@1 70.378 Acc@5 90.678 loss 1.467
Accuracy of the model on the 50000 test images: 70.4%
Max accuracy: 70.38%
Epoch: [24]  [   0/1251]  eta: 0:59:38  lr: 0.003998  min_lr: 0.003998  loss: 4.7296 (4.7296)  weight_decay: 0.0500 (0.0500)  time: 2.8603  data: 2.3345  max mem: 43713
Epoch: [24]  [ 200/1251]  eta: 0:09:22  lr: 0.003998  min_lr: 0.003998  loss: 4.0598 (3.9731)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5427 (0.6152)  time: 0.5233  data: 0.0004  max mem: 43713
Epoch: [24]  [ 400/1251]  eta: 0:07:31  lr: 0.003998  min_lr: 0.003998  loss: 3.9327 (3.9616)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5648 (0.5808)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [24]  [ 600/1251]  eta: 0:05:43  lr: 0.003997  min_lr: 0.003997  loss: 4.0240 (3.9379)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5160 (0.5752)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [24]  [ 800/1251]  eta: 0:03:57  lr: 0.003997  min_lr: 0.003997  loss: 4.0559 (3.9296)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6013 (0.5768)  time: 0.5389  data: 0.0005  max mem: 43713
Epoch: [24]  [1000/1251]  eta: 0:02:12  lr: 0.003997  min_lr: 0.003997  loss: 3.8678 (3.9458)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5702 (0.5720)  time: 0.5233  data: 0.0004  max mem: 43713
Epoch: [24]  [1200/1251]  eta: 0:00:26  lr: 0.003997  min_lr: 0.003997  loss: 4.1408 (3.9564)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4671 (0.5674)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [24]  [1250/1251]  eta: 0:00:00  lr: 0.003997  min_lr: 0.003997  loss: 3.6093 (3.9527)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4723 (0.5650)  time: 0.4438  data: 0.0005  max mem: 43713
Epoch: [24] Total time: 0:10:56 (0.5248 s / it)
Averaged stats: lr: 0.003997  min_lr: 0.003997  loss: 3.6093 (3.9428)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4723 (0.5650)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.8949 (0.8949)  acc1: 81.6000 (81.6000)  acc5: 96.0000 (96.0000)  time: 5.7416  data: 5.4466  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 1.0867 (1.1096)  acc1: 76.4000 (76.9455)  acc5: 95.2000 (94.6909)  time: 0.7631  data: 0.4954  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.4294 (1.3620)  acc1: 68.8000 (71.4286)  acc5: 89.6000 (91.1238)  time: 0.2653  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.5349 (1.3724)  acc1: 68.8000 (71.3440)  acc5: 89.6000 (91.2000)  time: 0.2653  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4888 s / it)
* Acc@1 71.426 Acc@5 91.006 loss 1.365
Accuracy of the model on the 50000 test images: 71.4%
Max accuracy: 71.43%
Epoch: [25]  [   0/1251]  eta: 0:59:14  lr: 0.003997  min_lr: 0.003997  loss: 2.9554 (2.9554)  weight_decay: 0.0500 (0.0500)  time: 2.8416  data: 2.3069  max mem: 43713
Epoch: [25]  [ 200/1251]  eta: 0:09:23  lr: 0.003997  min_lr: 0.003997  loss: 4.0634 (3.8996)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4836 (0.5764)  time: 0.5259  data: 0.0004  max mem: 43713
Epoch: [25]  [ 400/1251]  eta: 0:07:30  lr: 0.003996  min_lr: 0.003996  loss: 4.1020 (3.9322)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5839 (0.5819)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [25]  [ 600/1251]  eta: 0:05:43  lr: 0.003996  min_lr: 0.003996  loss: 4.2528 (3.9406)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5459 (0.5808)  time: 0.5236  data: 0.0004  max mem: 43713
Epoch: [25]  [ 800/1251]  eta: 0:03:57  lr: 0.003996  min_lr: 0.003996  loss: 3.8641 (3.9261)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5192 (0.5651)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [25]  [1000/1251]  eta: 0:02:12  lr: 0.003996  min_lr: 0.003996  loss: 4.1437 (3.9333)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5567 (0.5821)  time: 0.5238  data: 0.0005  max mem: 43713
Epoch: [25]  [1200/1251]  eta: 0:00:26  lr: 0.003996  min_lr: 0.003996  loss: 4.0206 (3.9349)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5262 (0.5857)  time: 0.5236  data: 0.0005  max mem: 43713
Epoch: [25]  [1250/1251]  eta: 0:00:00  lr: 0.003995  min_lr: 0.003995  loss: 4.0370 (3.9348)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5318 (0.5888)  time: 0.4434  data: 0.0005  max mem: 43713
Epoch: [25] Total time: 0:10:57 (0.5252 s / it)
Averaged stats: lr: 0.003995  min_lr: 0.003995  loss: 4.0370 (3.9108)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5318 (0.5888)
Test:  [ 0/25]  eta: 0:01:42  loss: 0.9808 (0.9808)  acc1: 80.0000 (80.0000)  acc5: 95.6000 (95.6000)  time: 4.0927  data: 3.7813  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 1.0601 (1.1811)  acc1: 77.6000 (74.9091)  acc5: 94.4000 (94.2545)  time: 0.7093  data: 0.4364  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.4676 (1.4196)  acc1: 66.4000 (69.8667)  acc5: 90.0000 (90.5333)  time: 0.3177  data: 0.0510  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.5850 (1.4288)  acc1: 66.8000 (69.9200)  acc5: 88.0000 (90.4480)  time: 0.2653  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4634 s / it)
* Acc@1 71.088 Acc@5 90.912 loss 1.410
Accuracy of the model on the 50000 test images: 71.1%
Max accuracy: 71.43%
Epoch: [26]  [   0/1251]  eta: 1:10:49  lr: 0.003995  min_lr: 0.003995  loss: 3.0214 (3.0214)  weight_decay: 0.0500 (0.0500)  time: 3.3971  data: 2.6034  max mem: 43713
Epoch: [26]  [ 200/1251]  eta: 0:09:26  lr: 0.003995  min_lr: 0.003995  loss: 4.2185 (3.8483)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4604 (0.5180)  time: 0.5233  data: 0.0004  max mem: 43713
Epoch: [26]  [ 400/1251]  eta: 0:07:32  lr: 0.003995  min_lr: 0.003995  loss: 4.1478 (3.8845)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5158 (0.5599)  time: 0.5238  data: 0.0004  max mem: 43713
Epoch: [26]  [ 600/1251]  eta: 0:05:44  lr: 0.003995  min_lr: 0.003995  loss: 4.1376 (3.8814)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5463 (0.5474)  time: 0.5231  data: 0.0005  max mem: 43713
Epoch: [26]  [ 800/1251]  eta: 0:03:57  lr: 0.003994  min_lr: 0.003994  loss: 4.1307 (3.8874)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5112 (0.5482)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [26]  [1000/1251]  eta: 0:02:12  lr: 0.003994  min_lr: 0.003994  loss: 3.7790 (3.8850)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4878 (0.5520)  time: 0.5300  data: 0.0004  max mem: 43713
Epoch: [26]  [1200/1251]  eta: 0:00:26  lr: 0.003994  min_lr: 0.003994  loss: 3.6289 (3.8984)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5066 (0.5490)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [26]  [1250/1251]  eta: 0:00:00  lr: 0.003994  min_lr: 0.003994  loss: 3.8818 (3.8949)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4934 (0.5471)  time: 0.4435  data: 0.0007  max mem: 43713
Epoch: [26] Total time: 0:10:57 (0.5252 s / it)
Averaged stats: lr: 0.003994  min_lr: 0.003994  loss: 3.8818 (3.8870)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4934 (0.5471)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.8668 (0.8668)  acc1: 84.0000 (84.0000)  acc5: 96.0000 (96.0000)  time: 5.4731  data: 5.1841  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 1.1107 (1.0951)  acc1: 78.8000 (76.6182)  acc5: 94.8000 (94.4727)  time: 0.7381  data: 0.4715  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.3840 (1.3347)  acc1: 70.4000 (71.9429)  acc5: 90.0000 (91.0476)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.4305 (1.3359)  acc1: 70.4000 (71.9680)  acc5: 89.2000 (91.0240)  time: 0.2644  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4780 s / it)
* Acc@1 71.756 Acc@5 91.388 loss 1.328
Accuracy of the model on the 50000 test images: 71.8%
Max accuracy: 71.76%
Epoch: [27]  [   0/1251]  eta: 1:03:50  lr: 0.003994  min_lr: 0.003994  loss: 4.2446 (4.2446)  weight_decay: 0.0500 (0.0500)  time: 3.0623  data: 2.5213  max mem: 43713
Epoch: [27]  [ 200/1251]  eta: 0:09:23  lr: 0.003994  min_lr: 0.003994  loss: 3.8984 (3.8755)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5956 (0.6263)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [27]  [ 400/1251]  eta: 0:07:31  lr: 0.003993  min_lr: 0.003993  loss: 3.7317 (3.8280)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4630 (0.5949)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [27]  [ 600/1251]  eta: 0:05:43  lr: 0.003993  min_lr: 0.003993  loss: 3.8466 (3.8167)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5543 (0.5827)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [27]  [ 800/1251]  eta: 0:03:57  lr: 0.003993  min_lr: 0.003993  loss: 4.1238 (3.8384)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5357 (0.5705)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [27]  [1000/1251]  eta: 0:02:12  lr: 0.003992  min_lr: 0.003992  loss: 4.1100 (3.8305)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6239 (0.5656)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [27]  [1200/1251]  eta: 0:00:26  lr: 0.003992  min_lr: 0.003992  loss: 3.6650 (3.8423)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4594 (0.5585)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [27]  [1250/1251]  eta: 0:00:00  lr: 0.003992  min_lr: 0.003992  loss: 3.8527 (3.8450)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6393 (0.5693)  time: 0.4437  data: 0.0005  max mem: 43713
Epoch: [27] Total time: 0:10:56 (0.5248 s / it)
Averaged stats: lr: 0.003992  min_lr: 0.003992  loss: 3.8527 (3.8552)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6393 (0.5693)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.9607 (0.9607)  acc1: 82.8000 (82.8000)  acc5: 96.0000 (96.0000)  time: 5.5127  data: 5.1994  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 1.0983 (1.1547)  acc1: 77.6000 (76.5818)  acc5: 94.8000 (94.4364)  time: 0.7417  data: 0.4730  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.4102 (1.3859)  acc1: 69.2000 (71.6381)  acc5: 90.4000 (91.2762)  time: 0.2646  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.5623 (1.3935)  acc1: 69.2000 (71.4880)  acc5: 88.8000 (91.2480)  time: 0.2647  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4788 s / it)
* Acc@1 71.942 Acc@5 91.422 loss 1.399
Accuracy of the model on the 50000 test images: 71.9%
Max accuracy: 71.94%
Epoch: [28]  [   0/1251]  eta: 1:02:57  lr: 0.003992  min_lr: 0.003992  loss: 4.1168 (4.1168)  weight_decay: 0.0500 (0.0500)  time: 3.0195  data: 2.4750  max mem: 43713
Epoch: [28]  [ 200/1251]  eta: 0:09:23  lr: 0.003992  min_lr: 0.003992  loss: 4.0670 (3.8523)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5089 (0.5988)  time: 0.5238  data: 0.0005  max mem: 43713
Epoch: [28]  [ 400/1251]  eta: 0:07:31  lr: 0.003991  min_lr: 0.003991  loss: 3.9051 (3.8722)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5295 (0.5758)  time: 0.5228  data: 0.0005  max mem: 43713
Epoch: [28]  [ 600/1251]  eta: 0:05:43  lr: 0.003991  min_lr: 0.003991  loss: 3.8321 (3.8398)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6465 (0.5931)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [28]  [ 800/1251]  eta: 0:03:57  lr: 0.003991  min_lr: 0.003991  loss: 4.0352 (3.8342)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4650 (nan)  time: 0.5232  data: 0.0004  max mem: 43713
Epoch: [28]  [1000/1251]  eta: 0:02:12  lr: 0.003990  min_lr: 0.003990  loss: 3.9173 (3.8242)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5502 (nan)  time: 0.5245  data: 0.0004  max mem: 43713
Epoch: [28]  [1200/1251]  eta: 0:00:26  lr: 0.003990  min_lr: 0.003990  loss: 3.7201 (3.8107)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6321 (nan)  time: 0.5241  data: 0.0005  max mem: 43713
Epoch: [28]  [1250/1251]  eta: 0:00:00  lr: 0.003990  min_lr: 0.003990  loss: 3.8103 (3.8131)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5870 (nan)  time: 0.4436  data: 0.0005  max mem: 43713
Epoch: [28] Total time: 0:10:56 (0.5250 s / it)
Averaged stats: lr: 0.003990  min_lr: 0.003990  loss: 3.8103 (3.8383)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5870 (nan)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.9448 (0.9448)  acc1: 82.4000 (82.4000)  acc5: 95.6000 (95.6000)  time: 5.5281  data: 5.2383  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 1.0654 (1.1153)  acc1: 76.8000 (77.7818)  acc5: 94.8000 (94.5455)  time: 0.7431  data: 0.4765  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.3979 (1.3428)  acc1: 68.8000 (72.4762)  acc5: 90.0000 (91.4476)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.5518 (1.3571)  acc1: 68.8000 (72.2080)  acc5: 89.2000 (91.2640)  time: 0.2644  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4808 s / it)
* Acc@1 72.242 Acc@5 91.490 loss 1.346
Accuracy of the model on the 50000 test images: 72.2%
Max accuracy: 72.24%
Epoch: [29]  [   0/1251]  eta: 0:54:47  lr: 0.003990  min_lr: 0.003990  loss: 3.8448 (3.8448)  weight_decay: 0.0500 (0.0500)  time: 2.6278  data: 2.0986  max mem: 43713
Epoch: [29]  [ 200/1251]  eta: 0:09:22  lr: 0.003989  min_lr: 0.003989  loss: 3.7911 (3.7988)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4422 (0.5994)  time: 0.5238  data: 0.0004  max mem: 43713
Epoch: [29]  [ 400/1251]  eta: 0:07:30  lr: 0.003989  min_lr: 0.003989  loss: 4.1233 (3.8083)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5226 (0.5575)  time: 0.5249  data: 0.0004  max mem: 43713
Epoch: [29]  [ 600/1251]  eta: 0:05:43  lr: 0.003989  min_lr: 0.003989  loss: 3.6359 (3.8105)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4558 (0.5765)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [29]  [ 800/1251]  eta: 0:03:57  lr: 0.003988  min_lr: 0.003988  loss: 4.1121 (3.8157)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5863 (0.5892)  time: 0.5237  data: 0.0004  max mem: 43713
Epoch: [29]  [1000/1251]  eta: 0:02:12  lr: 0.003988  min_lr: 0.003988  loss: 3.5892 (3.8058)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5263 (0.5907)  time: 0.5238  data: 0.0005  max mem: 43713
Epoch: [29]  [1200/1251]  eta: 0:00:26  lr: 0.003988  min_lr: 0.003988  loss: 3.9710 (3.8093)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4669 (0.5929)  time: 0.5262  data: 0.0005  max mem: 43713
Epoch: [29]  [1250/1251]  eta: 0:00:00  lr: 0.003987  min_lr: 0.003987  loss: 3.6919 (3.8094)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5387 (0.5940)  time: 0.4438  data: 0.0006  max mem: 43713
Epoch: [29] Total time: 0:10:57 (0.5256 s / it)
Averaged stats: lr: 0.003987  min_lr: 0.003987  loss: 3.6919 (3.8136)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5387 (0.5940)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.9207 (0.9207)  acc1: 82.8000 (82.8000)  acc5: 97.2000 (97.2000)  time: 5.7961  data: 5.4907  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 1.0601 (1.0881)  acc1: 78.4000 (76.7273)  acc5: 95.6000 (94.7636)  time: 0.7675  data: 0.4994  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.3948 (1.3092)  acc1: 68.4000 (72.0191)  acc5: 90.0000 (91.5238)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.4181 (1.3211)  acc1: 68.8000 (71.7760)  acc5: 89.2000 (91.4080)  time: 0.2644  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4901 s / it)
* Acc@1 72.462 Acc@5 91.650 loss 1.309
Accuracy of the model on the 50000 test images: 72.5%
Max accuracy: 72.46%
Epoch: [30]  [   0/1251]  eta: 1:01:46  lr: 0.003987  min_lr: 0.003987  loss: 4.2784 (4.2784)  weight_decay: 0.0500 (0.0500)  time: 2.9625  data: 2.4206  max mem: 43713
Epoch: [30]  [ 200/1251]  eta: 0:09:24  lr: 0.003987  min_lr: 0.003987  loss: 4.1581 (3.7654)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5761 (0.6156)  time: 0.5246  data: 0.0005  max mem: 43713
Epoch: [30]  [ 400/1251]  eta: 0:07:32  lr: 0.003987  min_lr: 0.003987  loss: 3.7131 (3.7692)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4325 (0.5851)  time: 0.5232  data: 0.0005  max mem: 43713
Epoch: [30]  [ 600/1251]  eta: 0:05:44  lr: 0.003986  min_lr: 0.003986  loss: 3.8068 (3.7667)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5616 (0.5881)  time: 0.5236  data: 0.0005  max mem: 43713
Epoch: [30]  [ 800/1251]  eta: 0:03:57  lr: 0.003986  min_lr: 0.003986  loss: 3.9669 (3.7869)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5282 (0.5951)  time: 0.5230  data: 0.0005  max mem: 43713
Epoch: [30]  [1000/1251]  eta: 0:02:12  lr: 0.003985  min_lr: 0.003985  loss: 4.0885 (3.7815)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6153 (0.5932)  time: 0.5226  data: 0.0005  max mem: 43713
Epoch: [30]  [1200/1251]  eta: 0:00:26  lr: 0.003985  min_lr: 0.003985  loss: 3.5239 (3.7758)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5339 (0.5919)  time: 0.5261  data: 0.0006  max mem: 43713
Epoch: [30]  [1250/1251]  eta: 0:00:00  lr: 0.003985  min_lr: 0.003985  loss: 3.8069 (3.7744)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6512 (0.5959)  time: 0.4437  data: 0.0006  max mem: 43713
Epoch: [30] Total time: 0:10:57 (0.5258 s / it)
Averaged stats: lr: 0.003985  min_lr: 0.003985  loss: 3.8069 (3.7904)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6512 (0.5959)
Test:  [ 0/25]  eta: 0:02:10  loss: 0.9275 (0.9275)  acc1: 82.4000 (82.4000)  acc5: 98.0000 (98.0000)  time: 5.2381  data: 4.9283  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.9996 (1.0916)  acc1: 78.8000 (77.4545)  acc5: 95.6000 (95.2000)  time: 0.7169  data: 0.4484  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.3495 (1.2874)  acc1: 69.2000 (73.2000)  acc5: 92.4000 (92.4000)  time: 0.2647  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.4740 (1.3058)  acc1: 69.2000 (72.5920)  acc5: 90.0000 (92.1760)  time: 0.2646  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4683 s / it)
* Acc@1 73.020 Acc@5 92.030 loss 1.298
Accuracy of the model on the 50000 test images: 73.0%
Max accuracy: 73.02%
Epoch: [31]  [   0/1251]  eta: 0:50:22  lr: 0.003985  min_lr: 0.003985  loss: 2.8831 (2.8831)  weight_decay: 0.0500 (0.0500)  time: 2.4158  data: 1.8738  max mem: 43713
Epoch: [31]  [ 200/1251]  eta: 0:09:22  lr: 0.003984  min_lr: 0.003984  loss: 4.0804 (3.7282)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5951 (0.5700)  time: 0.5338  data: 0.0004  max mem: 43713
Epoch: [31]  [ 400/1251]  eta: 0:07:30  lr: 0.003984  min_lr: 0.003984  loss: 4.0228 (3.7733)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5398 (0.5987)  time: 0.5239  data: 0.0005  max mem: 43713
Epoch: [31]  [ 600/1251]  eta: 0:05:43  lr: 0.003983  min_lr: 0.003983  loss: 3.6095 (3.7737)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6301 (0.5954)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [31]  [ 800/1251]  eta: 0:03:57  lr: 0.003983  min_lr: 0.003983  loss: 3.9073 (3.7791)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5922 (0.6063)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [31]  [1000/1251]  eta: 0:02:11  lr: 0.003982  min_lr: 0.003982  loss: 3.9947 (3.7879)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5636 (0.6000)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [31]  [1200/1251]  eta: 0:00:26  lr: 0.003982  min_lr: 0.003982  loss: 3.6158 (3.7836)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6216 (0.6014)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [31]  [1250/1251]  eta: 0:00:00  lr: 0.003982  min_lr: 0.003982  loss: 3.8369 (3.7809)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5523 (0.6008)  time: 0.4437  data: 0.0005  max mem: 43713
Epoch: [31] Total time: 0:10:56 (0.5246 s / it)
Averaged stats: lr: 0.003982  min_lr: 0.003982  loss: 3.8369 (3.7812)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5523 (0.6008)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.8281 (0.8281)  acc1: 81.6000 (81.6000)  acc5: 96.0000 (96.0000)  time: 5.3914  data: 5.1035  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.9824 (1.0158)  acc1: 78.4000 (77.8182)  acc5: 95.6000 (94.7273)  time: 0.7308  data: 0.4642  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.2858 (1.2303)  acc1: 70.0000 (73.3524)  acc5: 91.2000 (92.0952)  time: 0.2646  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.4422 (1.2441)  acc1: 70.0000 (73.0240)  acc5: 90.4000 (91.9040)  time: 0.2645  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4746 s / it)
* Acc@1 73.356 Acc@5 92.100 loss 1.236
Accuracy of the model on the 50000 test images: 73.4%
Max accuracy: 73.36%
Epoch: [32]  [   0/1251]  eta: 0:58:40  lr: 0.003982  min_lr: 0.003982  loss: 4.0257 (4.0257)  weight_decay: 0.0500 (0.0500)  time: 2.8145  data: 2.2827  max mem: 43713
Epoch: [32]  [ 200/1251]  eta: 0:09:23  lr: 0.003981  min_lr: 0.003981  loss: 3.6425 (3.7384)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5262 (0.6155)  time: 0.5238  data: 0.0004  max mem: 43713
Epoch: [32]  [ 400/1251]  eta: 0:07:31  lr: 0.003981  min_lr: 0.003981  loss: 3.6139 (3.7659)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5396 (0.6286)  time: 0.5235  data: 0.0004  max mem: 43713
Epoch: [32]  [ 600/1251]  eta: 0:05:43  lr: 0.003980  min_lr: 0.003980  loss: 3.6812 (3.7768)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5428 (0.6396)  time: 0.5222  data: 0.0004  max mem: 43713
Epoch: [32]  [ 800/1251]  eta: 0:03:57  lr: 0.003980  min_lr: 0.003980  loss: 3.6830 (3.7652)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6507 (0.6289)  time: 0.5222  data: 0.0005  max mem: 43713
Epoch: [32]  [1000/1251]  eta: 0:02:12  lr: 0.003979  min_lr: 0.003979  loss: 3.3619 (3.7482)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5841 (0.6346)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [32]  [1200/1251]  eta: 0:00:26  lr: 0.003979  min_lr: 0.003979  loss: 3.9551 (3.7622)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6479 (0.6407)  time: 0.5236  data: 0.0004  max mem: 43713
Epoch: [32]  [1250/1251]  eta: 0:00:00  lr: 0.003979  min_lr: 0.003979  loss: 3.7380 (3.7655)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5390 (0.6361)  time: 0.4435  data: 0.0005  max mem: 43713
Epoch: [32] Total time: 0:10:56 (0.5249 s / it)
Averaged stats: lr: 0.003979  min_lr: 0.003979  loss: 3.7380 (3.7561)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5390 (0.6361)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.8246 (0.8246)  acc1: 84.0000 (84.0000)  acc5: 98.0000 (98.0000)  time: 5.4485  data: 5.1439  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 1.0924 (1.1106)  acc1: 78.8000 (77.6727)  acc5: 96.0000 (95.0546)  time: 0.7359  data: 0.4679  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.4382 (1.3187)  acc1: 70.4000 (73.0286)  acc5: 92.0000 (92.1333)  time: 0.2647  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.4694 (1.3277)  acc1: 68.8000 (72.7680)  acc5: 90.4000 (92.0480)  time: 0.2647  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4771 s / it)
* Acc@1 73.570 Acc@5 92.266 loss 1.314
Accuracy of the model on the 50000 test images: 73.6%
Max accuracy: 73.57%
Epoch: [33]  [   0/1251]  eta: 0:53:31  lr: 0.003979  min_lr: 0.003979  loss: 2.6740 (2.6740)  weight_decay: 0.0500 (0.0500)  time: 2.5674  data: 2.0323  max mem: 43713
Epoch: [33]  [ 200/1251]  eta: 0:09:20  lr: 0.003978  min_lr: 0.003978  loss: 3.7761 (3.7121)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5208 (0.6378)  time: 0.5231  data: 0.0004  max mem: 43713
Epoch: [33]  [ 400/1251]  eta: 0:07:30  lr: 0.003978  min_lr: 0.003978  loss: 3.9127 (3.7065)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5190 (0.6139)  time: 0.5234  data: 0.0004  max mem: 43713
Epoch: [33]  [ 600/1251]  eta: 0:05:43  lr: 0.003977  min_lr: 0.003977  loss: 3.8933 (3.6904)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4631 (0.6076)  time: 0.5238  data: 0.0005  max mem: 43713
Epoch: [33]  [ 800/1251]  eta: 0:03:57  lr: 0.003977  min_lr: 0.003977  loss: 3.8664 (3.7023)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6623 (0.6149)  time: 0.5233  data: 0.0005  max mem: 43713
Epoch: [33]  [1000/1251]  eta: 0:02:12  lr: 0.003976  min_lr: 0.003976  loss: 3.9867 (3.7088)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5817 (0.6312)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [33]  [1200/1251]  eta: 0:00:26  lr: 0.003976  min_lr: 0.003976  loss: 3.6303 (3.7110)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6050 (0.6251)  time: 0.5221  data: 0.0004  max mem: 43713
Epoch: [33]  [1250/1251]  eta: 0:00:00  lr: 0.003975  min_lr: 0.003975  loss: 3.8459 (3.7160)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5355 (0.6219)  time: 0.4433  data: 0.0005  max mem: 43713
Epoch: [33] Total time: 0:10:56 (0.5250 s / it)
Averaged stats: lr: 0.003975  min_lr: 0.003975  loss: 3.8459 (3.7371)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5355 (0.6219)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.8693 (0.8693)  acc1: 83.6000 (83.6000)  acc5: 96.4000 (96.4000)  time: 5.3701  data: 5.0620  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.9813 (1.0461)  acc1: 80.4000 (78.4727)  acc5: 96.0000 (95.1273)  time: 0.7284  data: 0.4604  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.3129 (1.2666)  acc1: 70.4000 (74.2095)  acc5: 90.8000 (92.1143)  time: 0.2641  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.4465 (1.2792)  acc1: 70.0000 (73.9200)  acc5: 90.0000 (91.9680)  time: 0.2639  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4743 s / it)
* Acc@1 74.120 Acc@5 92.364 loss 1.265
Accuracy of the model on the 50000 test images: 74.1%
Max accuracy: 74.12%
Epoch: [34]  [   0/1251]  eta: 1:04:25  lr: 0.003975  min_lr: 0.003975  loss: 4.0973 (4.0973)  weight_decay: 0.0500 (0.0500)  time: 3.0902  data: 2.5457  max mem: 43713
Epoch: [34]  [ 200/1251]  eta: 0:09:23  lr: 0.003975  min_lr: 0.003975  loss: 3.7053 (3.6706)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5762 (0.6388)  time: 0.5228  data: 0.0005  max mem: 43713
Epoch: [34]  [ 400/1251]  eta: 0:07:31  lr: 0.003974  min_lr: 0.003974  loss: 3.9341 (3.7025)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8009 (0.6284)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [34]  [ 600/1251]  eta: 0:05:44  lr: 0.003974  min_lr: 0.003974  loss: 3.8964 (3.6916)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4785 (0.6278)  time: 0.5234  data: 0.0005  max mem: 43713
Epoch: [34]  [ 800/1251]  eta: 0:03:58  lr: 0.003973  min_lr: 0.003973  loss: 3.8704 (3.7087)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6653 (0.6256)  time: 0.5315  data: 0.0005  max mem: 43713
Epoch: [34]  [1000/1251]  eta: 0:02:12  lr: 0.003972  min_lr: 0.003972  loss: 3.8422 (3.7197)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5452 (0.6316)  time: 0.5228  data: 0.0005  max mem: 43713
Epoch: [34]  [1200/1251]  eta: 0:00:26  lr: 0.003972  min_lr: 0.003972  loss: 3.6233 (3.7124)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5713 (0.6442)  time: 0.5231  data: 0.0005  max mem: 43713
Epoch: [34]  [1250/1251]  eta: 0:00:00  lr: 0.003972  min_lr: 0.003972  loss: 3.6485 (3.7124)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6033 (0.6486)  time: 0.4432  data: 0.0005  max mem: 43713
Epoch: [34] Total time: 0:10:57 (0.5255 s / it)
Averaged stats: lr: 0.003972  min_lr: 0.003972  loss: 3.6485 (3.7237)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6033 (0.6486)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.8406 (0.8406)  acc1: 83.6000 (83.6000)  acc5: 96.4000 (96.4000)  time: 5.5228  data: 5.2106  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.9710 (1.0230)  acc1: 78.0000 (77.9636)  acc5: 96.0000 (95.1636)  time: 0.7421  data: 0.4740  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.2729 (1.2413)  acc1: 70.8000 (73.6571)  acc5: 92.0000 (92.3429)  time: 0.2639  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.3715 (1.2535)  acc1: 70.8000 (73.5680)  acc5: 89.6000 (92.2720)  time: 0.2637  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4807 s / it)
* Acc@1 73.670 Acc@5 92.338 loss 1.247
Accuracy of the model on the 50000 test images: 73.7%
Max accuracy: 74.12%
Epoch: [35]  [   0/1251]  eta: 1:15:53  lr: 0.003972  min_lr: 0.003972  loss: 3.0122 (3.0122)  weight_decay: 0.0500 (0.0500)  time: 3.6401  data: 1.5979  max mem: 43713
Epoch: [35]  [ 200/1251]  eta: 0:09:29  lr: 0.003971  min_lr: 0.003971  loss: 3.5893 (3.6756)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6438 (0.6616)  time: 0.5235  data: 0.0004  max mem: 43713
Epoch: [35]  [ 400/1251]  eta: 0:07:33  lr: 0.003971  min_lr: 0.003971  loss: 3.7206 (3.7052)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6640 (0.6371)  time: 0.5236  data: 0.0004  max mem: 43713
Epoch: [35]  [ 600/1251]  eta: 0:05:44  lr: 0.003970  min_lr: 0.003970  loss: 3.5721 (3.7325)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6846 (0.6455)  time: 0.5292  data: 0.0005  max mem: 43713
Epoch: [35]  [ 800/1251]  eta: 0:03:58  lr: 0.003969  min_lr: 0.003969  loss: 3.8616 (3.7207)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5582 (0.6448)  time: 0.5252  data: 0.0004  max mem: 43713
Epoch: [35]  [1000/1251]  eta: 0:02:12  lr: 0.003969  min_lr: 0.003969  loss: 3.5887 (3.7154)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7479 (0.6592)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [35]  [1200/1251]  eta: 0:00:26  lr: 0.003968  min_lr: 0.003968  loss: 3.6428 (3.7105)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6988 (0.6645)  time: 0.5235  data: 0.0004  max mem: 43713
Epoch: [35]  [1250/1251]  eta: 0:00:00  lr: 0.003968  min_lr: 0.003968  loss: 3.6408 (3.7100)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5707 (0.6637)  time: 0.4441  data: 0.0008  max mem: 43713
Epoch: [35] Total time: 0:10:58 (0.5267 s / it)
Averaged stats: lr: 0.003968  min_lr: 0.003968  loss: 3.6408 (3.7139)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5707 (0.6637)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.7815 (0.7815)  acc1: 83.6000 (83.6000)  acc5: 96.8000 (96.8000)  time: 5.5252  data: 5.2084  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.9178 (0.9751)  acc1: 80.8000 (79.0182)  acc5: 96.0000 (95.2727)  time: 0.7429  data: 0.4738  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.2705 (1.1933)  acc1: 70.0000 (74.6095)  acc5: 91.6000 (92.6095)  time: 0.2647  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.3628 (1.2028)  acc1: 71.2000 (74.3840)  acc5: 90.4000 (92.4320)  time: 0.2647  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4796 s / it)
* Acc@1 74.148 Acc@5 92.604 loss 1.202
Accuracy of the model on the 50000 test images: 74.1%
Max accuracy: 74.15%
Epoch: [36]  [   0/1251]  eta: 0:56:21  lr: 0.003968  min_lr: 0.003968  loss: 4.1106 (4.1106)  weight_decay: 0.0500 (0.0500)  time: 2.7028  data: 2.1778  max mem: 43713
Epoch: [36]  [ 200/1251]  eta: 0:09:22  lr: 0.003967  min_lr: 0.003967  loss: 3.7890 (3.7480)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5058 (0.6507)  time: 0.5239  data: 0.0004  max mem: 43713
Epoch: [36]  [ 400/1251]  eta: 0:07:31  lr: 0.003967  min_lr: 0.003967  loss: 3.7791 (3.7278)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7256 (0.6743)  time: 0.5243  data: 0.0004  max mem: 43713
Epoch: [36]  [ 600/1251]  eta: 0:05:44  lr: 0.003966  min_lr: 0.003966  loss: 3.2303 (3.7045)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5800 (0.6507)  time: 0.5233  data: 0.0004  max mem: 43713
Epoch: [36]  [ 800/1251]  eta: 0:03:57  lr: 0.003965  min_lr: 0.003965  loss: 3.8000 (3.7007)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5543 (nan)  time: 0.5231  data: 0.0004  max mem: 43713
Epoch: [36]  [1000/1251]  eta: 0:02:12  lr: 0.003965  min_lr: 0.003965  loss: 3.7539 (3.6834)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6351 (nan)  time: 0.5316  data: 0.0004  max mem: 43713
Epoch: [36]  [1200/1251]  eta: 0:00:26  lr: 0.003964  min_lr: 0.003964  loss: 4.0202 (3.6951)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7233 (nan)  time: 0.5236  data: 0.0004  max mem: 43713
Epoch: [36]  [1250/1251]  eta: 0:00:00  lr: 0.003964  min_lr: 0.003964  loss: 3.9830 (3.6955)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6781 (nan)  time: 0.4436  data: 0.0005  max mem: 43713
Epoch: [36] Total time: 0:10:57 (0.5257 s / it)
Averaged stats: lr: 0.003964  min_lr: 0.003964  loss: 3.9830 (3.6939)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6781 (nan)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.9678 (0.9678)  acc1: 82.0000 (82.0000)  acc5: 95.2000 (95.2000)  time: 5.5307  data: 5.2367  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 1.1074 (1.0952)  acc1: 78.8000 (77.6364)  acc5: 95.2000 (95.0182)  time: 0.7435  data: 0.4763  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.3523 (1.3121)  acc1: 68.8000 (73.0857)  acc5: 91.6000 (92.0000)  time: 0.2648  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.4360 (1.3194)  acc1: 69.2000 (72.9920)  acc5: 90.8000 (92.1120)  time: 0.2647  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4800 s / it)
* Acc@1 74.140 Acc@5 92.566 loss 1.301
Accuracy of the model on the 50000 test images: 74.1%
Max accuracy: 74.15%
Epoch: [37]  [   0/1251]  eta: 1:15:00  lr: 0.003964  min_lr: 0.003964  loss: 3.8450 (3.8450)  weight_decay: 0.0500 (0.0500)  time: 3.5978  data: 2.9814  max mem: 43713
Epoch: [37]  [ 200/1251]  eta: 0:09:27  lr: 0.003963  min_lr: 0.003963  loss: 3.8668 (3.7098)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5645 (0.6442)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [37]  [ 400/1251]  eta: 0:07:33  lr: 0.003962  min_lr: 0.003962  loss: 3.9371 (3.6896)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0186 (0.7075)  time: 0.5238  data: 0.0005  max mem: 43713
Epoch: [37]  [ 600/1251]  eta: 0:05:44  lr: 0.003962  min_lr: 0.003962  loss: 3.9545 (3.6732)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5482 (0.6755)  time: 0.5290  data: 0.0004  max mem: 43713
Epoch: [37]  [ 800/1251]  eta: 0:03:58  lr: 0.003961  min_lr: 0.003961  loss: 3.8940 (3.6894)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6444 (0.6714)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [37]  [1000/1251]  eta: 0:02:12  lr: 0.003960  min_lr: 0.003960  loss: 3.6445 (3.6868)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5504 (0.6783)  time: 0.5234  data: 0.0004  max mem: 43713
Epoch: [37]  [1200/1251]  eta: 0:00:26  lr: 0.003960  min_lr: 0.003960  loss: 3.8559 (3.6915)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6241 (0.6789)  time: 0.5233  data: 0.0004  max mem: 43713
Epoch: [37]  [1250/1251]  eta: 0:00:00  lr: 0.003959  min_lr: 0.003959  loss: 3.7672 (3.6946)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5886 (0.6765)  time: 0.4437  data: 0.0007  max mem: 43713
Epoch: [37] Total time: 0:10:58 (0.5261 s / it)
Averaged stats: lr: 0.003959  min_lr: 0.003959  loss: 3.7672 (3.6845)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5886 (0.6765)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.8763 (0.8763)  acc1: 84.4000 (84.4000)  acc5: 98.4000 (98.4000)  time: 5.4550  data: 5.1452  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 1.0701 (1.0667)  acc1: 78.4000 (78.8727)  acc5: 95.6000 (95.2727)  time: 0.7365  data: 0.4681  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.2631 (1.2594)  acc1: 72.0000 (74.5143)  acc5: 91.6000 (92.7429)  time: 0.2646  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.3734 (1.2631)  acc1: 72.0000 (74.3840)  acc5: 90.8000 (92.6560)  time: 0.2646  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4766 s / it)
* Acc@1 74.472 Acc@5 92.722 loss 1.257
Accuracy of the model on the 50000 test images: 74.5%
Max accuracy: 74.47%
Epoch: [38]  [   0/1251]  eta: 0:58:02  lr: 0.003959  min_lr: 0.003959  loss: 4.0576 (4.0576)  weight_decay: 0.0500 (0.0500)  time: 2.7837  data: 2.2427  max mem: 43713
Epoch: [38]  [ 200/1251]  eta: 0:09:23  lr: 0.003959  min_lr: 0.003959  loss: 3.8099 (3.6810)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6609 (0.6178)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [38]  [ 400/1251]  eta: 0:07:30  lr: 0.003958  min_lr: 0.003958  loss: 3.6647 (3.7110)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7289 (0.6848)  time: 0.5239  data: 0.0004  max mem: 43713
Epoch: [38]  [ 600/1251]  eta: 0:05:43  lr: 0.003957  min_lr: 0.003957  loss: 3.7944 (3.7069)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5786 (0.6821)  time: 0.5256  data: 0.0004  max mem: 43713
Epoch: [38]  [ 800/1251]  eta: 0:03:57  lr: 0.003956  min_lr: 0.003956  loss: 3.8396 (3.6917)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4347 (0.6971)  time: 0.5230  data: 0.0005  max mem: 43713
Epoch: [38]  [1000/1251]  eta: 0:02:12  lr: 0.003956  min_lr: 0.003956  loss: 3.8153 (3.6955)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5584 (0.6883)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [38]  [1200/1251]  eta: 0:00:26  lr: 0.003955  min_lr: 0.003955  loss: 3.8183 (3.6903)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5231 (0.6823)  time: 0.5329  data: 0.0004  max mem: 43713
Epoch: [38]  [1250/1251]  eta: 0:00:00  lr: 0.003955  min_lr: 0.003955  loss: 3.4272 (3.6855)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4808 (0.6779)  time: 0.4445  data: 0.0005  max mem: 43713
Epoch: [38] Total time: 0:10:57 (0.5256 s / it)
Averaged stats: lr: 0.003955  min_lr: 0.003955  loss: 3.4272 (3.6690)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4808 (0.6779)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.8084 (0.8084)  acc1: 83.2000 (83.2000)  acc5: 97.2000 (97.2000)  time: 5.4028  data: 5.1043  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.9200 (0.9781)  acc1: 78.4000 (78.7636)  acc5: 96.0000 (95.6000)  time: 0.7317  data: 0.4643  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.2050 (1.1835)  acc1: 71.6000 (74.6286)  acc5: 92.4000 (92.6667)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.2965 (1.1889)  acc1: 72.4000 (74.4800)  acc5: 90.8000 (92.6400)  time: 0.2644  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4740 s / it)
* Acc@1 74.644 Acc@5 92.826 loss 1.184
Accuracy of the model on the 50000 test images: 74.6%
Max accuracy: 74.64%
Epoch: [39]  [   0/1251]  eta: 0:47:53  lr: 0.003955  min_lr: 0.003955  loss: 4.0724 (4.0724)  weight_decay: 0.0500 (0.0500)  time: 2.2968  data: 1.7507  max mem: 43713
Epoch: [39]  [ 200/1251]  eta: 0:09:19  lr: 0.003954  min_lr: 0.003954  loss: 3.6855 (3.6388)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5448 (0.7518)  time: 0.5234  data: 0.0004  max mem: 43713
Epoch: [39]  [ 400/1251]  eta: 0:07:28  lr: 0.003953  min_lr: 0.003953  loss: 3.7072 (3.6748)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6313 (0.6812)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [39]  [ 600/1251]  eta: 0:05:43  lr: 0.003952  min_lr: 0.003952  loss: 3.8545 (3.6778)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7899 (0.7156)  time: 0.5241  data: 0.0004  max mem: 43713
Epoch: [39]  [ 800/1251]  eta: 0:03:57  lr: 0.003952  min_lr: 0.003952  loss: 3.6568 (3.6835)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6623 (0.7139)  time: 0.5239  data: 0.0004  max mem: 43713
Epoch: [39]  [1000/1251]  eta: 0:02:11  lr: 0.003951  min_lr: 0.003951  loss: 3.6612 (3.6681)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6659 (0.6952)  time: 0.5243  data: 0.0004  max mem: 43713
Epoch: [39]  [1200/1251]  eta: 0:00:26  lr: 0.003950  min_lr: 0.003950  loss: 3.7226 (3.6657)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7164 (0.7100)  time: 0.5232  data: 0.0004  max mem: 43713
Epoch: [39]  [1250/1251]  eta: 0:00:00  lr: 0.003950  min_lr: 0.003950  loss: 3.8941 (3.6688)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5270 (0.7070)  time: 0.4439  data: 0.0006  max mem: 43713
Epoch: [39] Total time: 0:10:56 (0.5251 s / it)
Averaged stats: lr: 0.003950  min_lr: 0.003950  loss: 3.8941 (3.6483)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5270 (0.7070)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.8497 (0.8497)  acc1: 84.4000 (84.4000)  acc5: 97.2000 (97.2000)  time: 5.8360  data: 5.5313  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.9793 (1.0232)  acc1: 79.6000 (79.8545)  acc5: 96.4000 (95.8546)  time: 0.7724  data: 0.5044  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.2922 (1.2395)  acc1: 72.8000 (75.1810)  acc5: 91.6000 (92.5905)  time: 0.2651  data: 0.0009  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.4103 (1.2481)  acc1: 70.8000 (74.8160)  acc5: 90.4000 (92.4960)  time: 0.2650  data: 0.0008  max mem: 43713
Test: Total time: 0:00:12 (0.4931 s / it)
* Acc@1 74.976 Acc@5 92.812 loss 1.244
Accuracy of the model on the 50000 test images: 75.0%
Max accuracy: 74.98%
Epoch: [40]  [   0/1251]  eta: 0:58:16  lr: 0.003950  min_lr: 0.003950  loss: 3.0650 (3.0650)  weight_decay: 0.0500 (0.0500)  time: 2.7947  data: 2.2703  max mem: 43713
Epoch: [40]  [ 200/1251]  eta: 0:09:22  lr: 0.003949  min_lr: 0.003949  loss: 3.5753 (3.6446)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4624 (0.6446)  time: 0.5239  data: 0.0004  max mem: 43713
Epoch: [40]  [ 400/1251]  eta: 0:07:31  lr: 0.003948  min_lr: 0.003948  loss: 3.1546 (3.6273)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6383 (0.6637)  time: 0.5235  data: 0.0004  max mem: 43713
Epoch: [40]  [ 600/1251]  eta: 0:05:43  lr: 0.003947  min_lr: 0.003947  loss: 3.7195 (3.6652)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8832 (0.7365)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [40]  [ 800/1251]  eta: 0:03:57  lr: 0.003947  min_lr: 0.003947  loss: 3.9589 (3.6656)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7231 (0.7396)  time: 0.5240  data: 0.0004  max mem: 43713
Epoch: [40]  [1000/1251]  eta: 0:02:12  lr: 0.003946  min_lr: 0.003946  loss: 3.7002 (3.6703)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5062 (0.7312)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [40]  [1200/1251]  eta: 0:00:26  lr: 0.003945  min_lr: 0.003945  loss: 3.8606 (3.6705)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6839 (0.7382)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [40]  [1250/1251]  eta: 0:00:00  lr: 0.003945  min_lr: 0.003945  loss: 3.8074 (3.6673)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6541 (0.7375)  time: 0.4434  data: 0.0005  max mem: 43713
Epoch: [40] Total time: 0:10:56 (0.5250 s / it)
Averaged stats: lr: 0.003945  min_lr: 0.003945  loss: 3.8074 (3.6386)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6541 (0.7375)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.7911 (0.7911)  acc1: 86.0000 (86.0000)  acc5: 97.2000 (97.2000)  time: 5.2952  data: 4.9990  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 1.0251 (1.0434)  acc1: 78.8000 (79.3818)  acc5: 95.6000 (95.7091)  time: 0.7223  data: 0.4547  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.2514 (1.2381)  acc1: 72.4000 (75.1619)  acc5: 92.4000 (92.6857)  time: 0.2649  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.3704 (1.2381)  acc1: 72.0000 (74.9120)  acc5: 90.8000 (92.6880)  time: 0.2649  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4706 s / it)
* Acc@1 74.978 Acc@5 93.022 loss 1.237
Accuracy of the model on the 50000 test images: 75.0%
Max accuracy: 74.98%
Epoch: [41]  [   0/1251]  eta: 0:45:13  lr: 0.003945  min_lr: 0.003945  loss: 3.7381 (3.7381)  weight_decay: 0.0500 (0.0500)  time: 2.1689  data: 1.6259  max mem: 43713
Epoch: [41]  [ 200/1251]  eta: 0:09:20  lr: 0.003944  min_lr: 0.003944  loss: 3.7242 (3.5365)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6318 (0.7220)  time: 0.5226  data: 0.0005  max mem: 43713
Epoch: [41]  [ 400/1251]  eta: 0:07:29  lr: 0.003943  min_lr: 0.003943  loss: 3.6933 (3.5866)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5776 (0.7069)  time: 0.5238  data: 0.0005  max mem: 43713
Epoch: [41]  [ 600/1251]  eta: 0:05:43  lr: 0.003942  min_lr: 0.003942  loss: 3.5879 (3.5987)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7194 (0.7332)  time: 0.5236  data: 0.0005  max mem: 43713
Epoch: [41]  [ 800/1251]  eta: 0:03:57  lr: 0.003941  min_lr: 0.003941  loss: 3.7135 (3.6146)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5543 (0.7280)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [41]  [1000/1251]  eta: 0:02:11  lr: 0.003940  min_lr: 0.003940  loss: 3.7645 (3.6271)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5521 (0.7233)  time: 0.5234  data: 0.0004  max mem: 43713
Epoch: [41]  [1200/1251]  eta: 0:00:26  lr: 0.003940  min_lr: 0.003940  loss: 3.7055 (3.6309)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7732 (0.7385)  time: 0.5245  data: 0.0005  max mem: 43713
Epoch: [41]  [1250/1251]  eta: 0:00:00  lr: 0.003939  min_lr: 0.003939  loss: 3.8500 (3.6360)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7227 (0.7388)  time: 0.4515  data: 0.0006  max mem: 43713
Epoch: [41] Total time: 0:10:56 (0.5248 s / it)
Averaged stats: lr: 0.003939  min_lr: 0.003939  loss: 3.8500 (3.6362)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7227 (0.7388)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.9322 (0.9322)  acc1: 84.4000 (84.4000)  acc5: 96.8000 (96.8000)  time: 5.5450  data: 5.2468  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 1.0556 (1.0898)  acc1: 78.8000 (78.8727)  acc5: 96.4000 (95.5273)  time: 0.7448  data: 0.4773  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.3196 (1.2831)  acc1: 71.2000 (74.3810)  acc5: 92.0000 (92.8000)  time: 0.2647  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.4338 (1.2906)  acc1: 70.8000 (74.2560)  acc5: 90.8000 (92.6560)  time: 0.2647  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4804 s / it)
* Acc@1 74.824 Acc@5 93.132 loss 1.271
Accuracy of the model on the 50000 test images: 74.8%
Max accuracy: 74.98%
Epoch: [42]  [   0/1251]  eta: 1:12:31  lr: 0.003939  min_lr: 0.003939  loss: 3.8424 (3.8424)  weight_decay: 0.0500 (0.0500)  time: 3.4783  data: 2.1585  max mem: 43713
Epoch: [42]  [ 200/1251]  eta: 0:09:27  lr: 0.003939  min_lr: 0.003939  loss: 3.7823 (3.6664)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6371 (0.7407)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [42]  [ 400/1251]  eta: 0:07:32  lr: 0.003938  min_lr: 0.003938  loss: 3.4908 (3.6085)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7612 (0.7702)  time: 0.5231  data: 0.0004  max mem: 43713
Epoch: [42]  [ 600/1251]  eta: 0:05:45  lr: 0.003937  min_lr: 0.003937  loss: 3.3885 (3.6020)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5990 (0.7494)  time: 0.5240  data: 0.0004  max mem: 43713
Epoch: [42]  [ 800/1251]  eta: 0:03:58  lr: 0.003936  min_lr: 0.003936  loss: 3.2319 (3.6042)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5597 (0.7341)  time: 0.5233  data: 0.0005  max mem: 43713
Epoch: [42]  [1000/1251]  eta: 0:02:12  lr: 0.003935  min_lr: 0.003935  loss: 3.9035 (3.6029)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8419 (0.7617)  time: 0.5244  data: 0.0004  max mem: 43713
Epoch: [42]  [1200/1251]  eta: 0:00:26  lr: 0.003934  min_lr: 0.003934  loss: 3.6942 (3.6113)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6363 (0.7484)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [42]  [1250/1251]  eta: 0:00:00  lr: 0.003934  min_lr: 0.003934  loss: 3.5057 (3.6052)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6666 (0.7477)  time: 0.4432  data: 0.0005  max mem: 43713
Epoch: [42] Total time: 0:10:58 (0.5264 s / it)
Averaged stats: lr: 0.003934  min_lr: 0.003934  loss: 3.5057 (3.6161)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6666 (0.7477)
Test:  [ 0/25]  eta: 0:02:04  loss: 0.7442 (0.7442)  acc1: 86.8000 (86.8000)  acc5: 97.6000 (97.6000)  time: 4.9968  data: 4.6762  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.9282 (0.9385)  acc1: 81.6000 (80.2545)  acc5: 95.6000 (95.3455)  time: 0.7200  data: 0.4513  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.1659 (1.1364)  acc1: 73.2000 (75.3714)  acc5: 92.4000 (92.8381)  time: 0.2779  data: 0.0144  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.2682 (1.1450)  acc1: 72.0000 (75.0240)  acc5: 92.0000 (92.8960)  time: 0.2635  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4682 s / it)
* Acc@1 75.368 Acc@5 93.304 loss 1.133
Accuracy of the model on the 50000 test images: 75.4%
Max accuracy: 75.37%
Epoch: [43]  [   0/1251]  eta: 1:06:10  lr: 0.003934  min_lr: 0.003934  loss: 3.6586 (3.6586)  weight_decay: 0.0500 (0.0500)  time: 3.1736  data: 2.6309  max mem: 43713
Epoch: [43]  [ 200/1251]  eta: 0:09:24  lr: 0.003933  min_lr: 0.003933  loss: 3.6347 (3.6085)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6305 (0.6227)  time: 0.5236  data: 0.0004  max mem: 43713
Epoch: [43]  [ 400/1251]  eta: 0:07:31  lr: 0.003932  min_lr: 0.003932  loss: 3.7936 (3.6200)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7873 (0.6723)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [43]  [ 600/1251]  eta: 0:05:43  lr: 0.003931  min_lr: 0.003931  loss: 3.7589 (3.6168)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9833 (0.7583)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [43]  [ 800/1251]  eta: 0:03:57  lr: 0.003930  min_lr: 0.003930  loss: 3.3918 (3.5986)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5935 (0.7547)  time: 0.5223  data: 0.0004  max mem: 43713
Epoch: [43]  [1000/1251]  eta: 0:02:12  lr: 0.003929  min_lr: 0.003929  loss: 3.4800 (3.6019)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9703 (0.7773)  time: 0.5223  data: 0.0004  max mem: 43713
Epoch: [43]  [1200/1251]  eta: 0:00:26  lr: 0.003928  min_lr: 0.003928  loss: 3.8876 (3.5906)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7751 (0.7592)  time: 0.5223  data: 0.0004  max mem: 43713
Epoch: [43]  [1250/1251]  eta: 0:00:00  lr: 0.003928  min_lr: 0.003928  loss: 3.8141 (3.5966)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5847 (0.7563)  time: 0.4431  data: 0.0005  max mem: 43713
Epoch: [43] Total time: 0:10:56 (0.5251 s / it)
Averaged stats: lr: 0.003928  min_lr: 0.003928  loss: 3.8141 (3.6041)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5847 (0.7563)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.8598 (0.8598)  acc1: 85.2000 (85.2000)  acc5: 97.6000 (97.6000)  time: 5.4966  data: 5.2032  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.9907 (1.0099)  acc1: 80.4000 (80.5455)  acc5: 96.4000 (95.8182)  time: 0.7397  data: 0.4733  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.1962 (1.2017)  acc1: 72.4000 (75.3333)  acc5: 93.2000 (93.0286)  time: 0.2638  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.3605 (1.2094)  acc1: 71.6000 (75.0240)  acc5: 90.4000 (92.9440)  time: 0.2637  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4764 s / it)
* Acc@1 75.530 Acc@5 93.236 loss 1.191
Accuracy of the model on the 50000 test images: 75.5%
Max accuracy: 75.53%
Epoch: [44]  [   0/1251]  eta: 0:55:27  lr: 0.003928  min_lr: 0.003928  loss: 4.4309 (4.4309)  weight_decay: 0.0500 (0.0500)  time: 2.6599  data: 2.1235  max mem: 43713
Epoch: [44]  [ 200/1251]  eta: 0:09:22  lr: 0.003927  min_lr: 0.003927  loss: 3.6658 (3.5457)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7204 (0.9034)  time: 0.5246  data: 0.0006  max mem: 43713
Epoch: [44]  [ 400/1251]  eta: 0:07:31  lr: 0.003926  min_lr: 0.003926  loss: 3.5738 (3.5667)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5347 (0.8586)  time: 0.5228  data: 0.0005  max mem: 43713
Epoch: [44]  [ 600/1251]  eta: 0:05:43  lr: 0.003925  min_lr: 0.003925  loss: 3.5822 (3.5933)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6622 (0.7876)  time: 0.5246  data: 0.0006  max mem: 43713
Epoch: [44]  [ 800/1251]  eta: 0:03:57  lr: 0.003924  min_lr: 0.003924  loss: 3.7602 (3.5950)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8269 (0.8056)  time: 0.5250  data: 0.0006  max mem: 43713
Epoch: [44]  [1000/1251]  eta: 0:02:12  lr: 0.003923  min_lr: 0.003923  loss: 3.6216 (3.5838)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5921 (0.7991)  time: 0.5236  data: 0.0005  max mem: 43713
Epoch: [44]  [1200/1251]  eta: 0:00:26  lr: 0.003922  min_lr: 0.003922  loss: 3.6341 (3.5906)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7831 (0.7827)  time: 0.5228  data: 0.0006  max mem: 43713
Epoch: [44]  [1250/1251]  eta: 0:00:00  lr: 0.003922  min_lr: 0.003922  loss: 3.3649 (3.5863)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9402 (0.7887)  time: 0.4434  data: 0.0006  max mem: 43713
Epoch: [44] Total time: 0:10:56 (0.5251 s / it)
Averaged stats: lr: 0.003922  min_lr: 0.003922  loss: 3.3649 (3.5904)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9402 (0.7887)
Test:  [ 0/25]  eta: 0:01:41  loss: 0.7615 (0.7615)  acc1: 85.2000 (85.2000)  acc5: 97.2000 (97.2000)  time: 4.0568  data: 3.7215  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.9815 (0.9657)  acc1: 80.0000 (80.1091)  acc5: 96.4000 (95.8545)  time: 0.7032  data: 0.4321  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.2045 (1.1492)  acc1: 73.2000 (75.4476)  acc5: 92.4000 (93.4476)  time: 0.3160  data: 0.0516  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.3048 (1.1585)  acc1: 72.0000 (75.2640)  acc5: 92.0000 (93.2320)  time: 0.2642  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4605 s / it)
* Acc@1 75.816 Acc@5 93.396 loss 1.142
Accuracy of the model on the 50000 test images: 75.8%
Max accuracy: 75.82%
Epoch: [45]  [   0/1251]  eta: 0:59:22  lr: 0.003922  min_lr: 0.003922  loss: 3.8869 (3.8869)  weight_decay: 0.0500 (0.0500)  time: 2.8480  data: 2.3093  max mem: 43713
Epoch: [45]  [ 200/1251]  eta: 0:09:24  lr: 0.003921  min_lr: 0.003921  loss: 3.6053 (3.6240)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6369 (0.8038)  time: 0.5228  data: 0.0005  max mem: 43713
Epoch: [45]  [ 400/1251]  eta: 0:07:31  lr: 0.003920  min_lr: 0.003920  loss: 3.6680 (3.6167)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6982 (0.7990)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [45]  [ 600/1251]  eta: 0:05:43  lr: 0.003919  min_lr: 0.003919  loss: 3.7365 (3.6057)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7205 (0.8132)  time: 0.5328  data: 0.0005  max mem: 43713
Epoch: [45]  [ 800/1251]  eta: 0:03:57  lr: 0.003918  min_lr: 0.003918  loss: 3.8179 (3.5971)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8533 (0.8085)  time: 0.5231  data: 0.0005  max mem: 43713
Epoch: [45]  [1000/1251]  eta: 0:02:12  lr: 0.003917  min_lr: 0.003917  loss: 3.7151 (3.5950)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7344 (0.8114)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [45]  [1200/1251]  eta: 0:00:26  lr: 0.003916  min_lr: 0.003916  loss: 3.6330 (3.5863)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5611 (0.7917)  time: 0.5232  data: 0.0005  max mem: 43713
Epoch: [45]  [1250/1251]  eta: 0:00:00  lr: 0.003916  min_lr: 0.003916  loss: 3.4909 (3.5861)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6284 (0.7922)  time: 0.4435  data: 0.0006  max mem: 43713
Epoch: [45] Total time: 0:10:56 (0.5251 s / it)
Averaged stats: lr: 0.003916  min_lr: 0.003916  loss: 3.4909 (3.5831)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6284 (0.7922)
Test:  [ 0/25]  eta: 0:02:11  loss: 0.7879 (0.7879)  acc1: 86.0000 (86.0000)  acc5: 96.8000 (96.8000)  time: 5.2702  data: 4.9576  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.8581 (0.9418)  acc1: 80.4000 (79.6000)  acc5: 96.4000 (95.7818)  time: 0.7190  data: 0.4510  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.1832 (1.1336)  acc1: 73.6000 (75.3333)  acc5: 92.8000 (93.1429)  time: 0.2637  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.2612 (1.1342)  acc1: 72.0000 (75.3440)  acc5: 91.6000 (93.1520)  time: 0.2636  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4672 s / it)
* Acc@1 75.876 Acc@5 93.506 loss 1.122
Accuracy of the model on the 50000 test images: 75.9%
Max accuracy: 75.88%
Epoch: [46]  [   0/1251]  eta: 0:58:57  lr: 0.003916  min_lr: 0.003916  loss: 4.2023 (4.2023)  weight_decay: 0.0500 (0.0500)  time: 2.8280  data: 2.2962  max mem: 43713
Epoch: [46]  [ 200/1251]  eta: 0:09:22  lr: 0.003914  min_lr: 0.003914  loss: 3.8286 (3.5733)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7016 (0.8113)  time: 0.5228  data: 0.0005  max mem: 43713
Epoch: [46]  [ 400/1251]  eta: 0:07:31  lr: 0.003913  min_lr: 0.003913  loss: 3.8268 (3.5885)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6685 (0.8048)  time: 0.5344  data: 0.0004  max mem: 43713
Epoch: [46]  [ 600/1251]  eta: 0:05:43  lr: 0.003912  min_lr: 0.003912  loss: 3.6259 (3.5601)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6323 (0.7711)  time: 0.5245  data: 0.0004  max mem: 43713
Epoch: [46]  [ 800/1251]  eta: 0:03:57  lr: 0.003911  min_lr: 0.003911  loss: 3.8093 (3.5689)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0752 (0.8180)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [46]  [1000/1251]  eta: 0:02:12  lr: 0.003910  min_lr: 0.003910  loss: 3.2742 (3.5642)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6594 (0.8068)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [46]  [1200/1251]  eta: 0:00:26  lr: 0.003909  min_lr: 0.003909  loss: 3.3155 (3.5629)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8536 (0.8217)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [46]  [1250/1251]  eta: 0:00:00  lr: 0.003909  min_lr: 0.003909  loss: 3.6188 (3.5585)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0822 (0.8304)  time: 0.4435  data: 0.0006  max mem: 43713
Epoch: [46] Total time: 0:10:56 (0.5251 s / it)
Averaged stats: lr: 0.003909  min_lr: 0.003909  loss: 3.6188 (3.5719)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0822 (0.8304)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.7293 (0.7293)  acc1: 85.6000 (85.6000)  acc5: 98.0000 (98.0000)  time: 5.3552  data: 5.0384  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.9095 (0.9311)  acc1: 81.6000 (80.8727)  acc5: 96.4000 (96.1091)  time: 0.7274  data: 0.4583  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.1528 (1.1399)  acc1: 74.4000 (76.2095)  acc5: 93.2000 (93.2762)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.2720 (1.1524)  acc1: 73.6000 (75.7920)  acc5: 91.2000 (93.1840)  time: 0.2645  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4734 s / it)
* Acc@1 75.914 Acc@5 93.454 loss 1.146
Accuracy of the model on the 50000 test images: 75.9%
Max accuracy: 75.91%
Epoch: [47]  [   0/1251]  eta: 0:58:02  lr: 0.003909  min_lr: 0.003909  loss: 3.7156 (3.7156)  weight_decay: 0.0500 (0.0500)  time: 2.7840  data: 2.2498  max mem: 43713
Epoch: [47]  [ 200/1251]  eta: 0:09:22  lr: 0.003908  min_lr: 0.003908  loss: 3.5716 (3.5714)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7026 (0.6700)  time: 0.5240  data: 0.0004  max mem: 43713
Epoch: [47]  [ 400/1251]  eta: 0:07:31  lr: 0.003907  min_lr: 0.003907  loss: 3.6328 (3.5697)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7692 (0.7436)  time: 0.5232  data: 0.0004  max mem: 43713
Epoch: [47]  [ 600/1251]  eta: 0:05:44  lr: 0.003906  min_lr: 0.003906  loss: 3.7081 (3.5746)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8333 (0.7788)  time: 0.5237  data: 0.0004  max mem: 43713
Epoch: [47]  [ 800/1251]  eta: 0:03:57  lr: 0.003905  min_lr: 0.003905  loss: 3.5304 (3.5825)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6278 (0.7653)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [47]  [1000/1251]  eta: 0:02:12  lr: 0.003904  min_lr: 0.003904  loss: 3.4124 (3.5802)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7375 (0.7586)  time: 0.5241  data: 0.0004  max mem: 43713
Epoch: [47]  [1200/1251]  eta: 0:00:26  lr: 0.003902  min_lr: 0.003902  loss: 3.4813 (3.5658)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6335 (0.7810)  time: 0.5241  data: 0.0004  max mem: 43713
Epoch: [47]  [1250/1251]  eta: 0:00:00  lr: 0.003902  min_lr: 0.003902  loss: 3.7640 (3.5678)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6625 (0.7833)  time: 0.4438  data: 0.0005  max mem: 43713
Epoch: [47] Total time: 0:10:57 (0.5255 s / it)
Averaged stats: lr: 0.003902  min_lr: 0.003902  loss: 3.7640 (3.5700)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6625 (0.7833)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.8949 (0.8949)  acc1: 84.8000 (84.8000)  acc5: 98.4000 (98.4000)  time: 5.5812  data: 5.2771  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 1.0189 (1.0734)  acc1: 81.6000 (79.3091)  acc5: 96.0000 (95.8909)  time: 0.7480  data: 0.4800  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.2947 (1.2698)  acc1: 72.8000 (75.4095)  acc5: 93.2000 (93.3333)  time: 0.2647  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.4011 (1.2772)  acc1: 72.8000 (75.2640)  acc5: 91.6000 (93.2000)  time: 0.2646  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4830 s / it)
* Acc@1 75.594 Acc@5 93.462 loss 1.274
Accuracy of the model on the 50000 test images: 75.6%
Max accuracy: 75.91%
Epoch: [48]  [   0/1251]  eta: 1:12:00  lr: 0.003902  min_lr: 0.003902  loss: 2.4869 (2.4869)  weight_decay: 0.0500 (0.0500)  time: 3.4536  data: 2.4672  max mem: 43713
Epoch: [48]  [ 200/1251]  eta: 0:09:27  lr: 0.003901  min_lr: 0.003901  loss: 3.6672 (3.5787)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6674 (0.8739)  time: 0.5251  data: 0.0004  max mem: 43713
Epoch: [48]  [ 400/1251]  eta: 0:07:32  lr: 0.003900  min_lr: 0.003900  loss: 3.7785 (3.5669)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0040 (0.8655)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [48]  [ 600/1251]  eta: 0:05:44  lr: 0.003899  min_lr: 0.003899  loss: 3.3661 (3.5512)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5685 (0.8323)  time: 0.5248  data: 0.0005  max mem: 43713
Epoch: [48]  [ 800/1251]  eta: 0:03:58  lr: 0.003898  min_lr: 0.003898  loss: 3.7166 (3.5710)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8322 (0.8608)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [48]  [1000/1251]  eta: 0:02:12  lr: 0.003897  min_lr: 0.003897  loss: 3.6194 (3.5718)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5869 (0.8314)  time: 0.5248  data: 0.0004  max mem: 43713
Epoch: [48]  [1200/1251]  eta: 0:00:26  lr: 0.003895  min_lr: 0.003895  loss: 3.3441 (3.5661)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7665 (0.8359)  time: 0.5306  data: 0.0005  max mem: 43713
Epoch: [48]  [1250/1251]  eta: 0:00:00  lr: 0.003895  min_lr: 0.003895  loss: 3.8033 (3.5680)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6886 (0.8335)  time: 0.4436  data: 0.0007  max mem: 43713
Epoch: [48] Total time: 0:10:58 (0.5263 s / it)
Averaged stats: lr: 0.003895  min_lr: 0.003895  loss: 3.8033 (3.5612)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6886 (0.8335)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.7682 (0.7682)  acc1: 86.0000 (86.0000)  acc5: 98.4000 (98.4000)  time: 5.2898  data: 4.9855  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.9343 (0.9617)  acc1: 80.0000 (79.9636)  acc5: 96.4000 (96.2545)  time: 0.7215  data: 0.4535  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.2010 (1.1607)  acc1: 72.4000 (75.7333)  acc5: 94.0000 (93.9238)  time: 0.2648  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.3047 (1.1655)  acc1: 71.6000 (75.6320)  acc5: 92.4000 (93.7760)  time: 0.2648  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4707 s / it)
* Acc@1 76.010 Acc@5 93.626 loss 1.158
Accuracy of the model on the 50000 test images: 76.0%
Max accuracy: 76.01%
Epoch: [49]  [   0/1251]  eta: 0:54:32  lr: 0.003895  min_lr: 0.003895  loss: 2.8738 (2.8738)  weight_decay: 0.0500 (0.0500)  time: 2.6156  data: 2.0872  max mem: 43713
Epoch: [49]  [ 200/1251]  eta: 0:09:21  lr: 0.003894  min_lr: 0.003894  loss: 3.6691 (3.5201)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5477 (0.7697)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [49]  [ 400/1251]  eta: 0:07:29  lr: 0.003893  min_lr: 0.003893  loss: 3.6033 (3.5065)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9016 (0.8549)  time: 0.5237  data: 0.0004  max mem: 43713
Epoch: [49]  [ 600/1251]  eta: 0:05:43  lr: 0.003892  min_lr: 0.003892  loss: 3.7526 (3.5248)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6997 (0.8550)  time: 0.5232  data: 0.0004  max mem: 43713
Epoch: [49]  [ 800/1251]  eta: 0:03:57  lr: 0.003890  min_lr: 0.003890  loss: 3.7340 (3.5309)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8916 (0.8553)  time: 0.5231  data: 0.0004  max mem: 43713
Epoch: [49]  [1000/1251]  eta: 0:02:11  lr: 0.003889  min_lr: 0.003889  loss: 3.5552 (3.5273)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9300 (0.8438)  time: 0.5285  data: 0.0004  max mem: 43713
Epoch: [49]  [1200/1251]  eta: 0:00:26  lr: 0.003888  min_lr: 0.003888  loss: 3.6450 (3.5252)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8480 (0.8489)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [49]  [1250/1251]  eta: 0:00:00  lr: 0.003888  min_lr: 0.003888  loss: 3.7109 (3.5269)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7871 (0.8426)  time: 0.4436  data: 0.0006  max mem: 43713
Epoch: [49] Total time: 0:10:56 (0.5247 s / it)
Averaged stats: lr: 0.003888  min_lr: 0.003888  loss: 3.7109 (3.5464)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7871 (0.8426)
Test:  [ 0/25]  eta: 0:01:52  loss: 0.7474 (0.7474)  acc1: 87.6000 (87.6000)  acc5: 98.0000 (98.0000)  time: 4.4927  data: 4.1804  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.9329 (0.9638)  acc1: 79.6000 (80.0364)  acc5: 96.4000 (96.0727)  time: 0.6767  data: 0.4083  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.1991 (1.1584)  acc1: 73.2000 (76.4952)  acc5: 92.8000 (93.4857)  time: 0.2795  data: 0.0156  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.3162 (1.1725)  acc1: 73.2000 (75.9680)  acc5: 92.4000 (93.4720)  time: 0.2641  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4493 s / it)
* Acc@1 76.184 Acc@5 93.676 loss 1.163
Accuracy of the model on the 50000 test images: 76.2%
Max accuracy: 76.18%
Epoch: [50]  [   0/1251]  eta: 0:55:42  lr: 0.003888  min_lr: 0.003888  loss: 4.1574 (4.1574)  weight_decay: 0.0500 (0.0500)  time: 2.6720  data: 2.1284  max mem: 43713
Epoch: [50]  [ 200/1251]  eta: 0:09:21  lr: 0.003887  min_lr: 0.003887  loss: 3.4283 (3.4924)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6857 (0.7948)  time: 0.5236  data: 0.0005  max mem: 43713
Epoch: [50]  [ 400/1251]  eta: 0:07:30  lr: 0.003885  min_lr: 0.003885  loss: 3.6346 (3.5268)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8814 (0.8929)  time: 0.5235  data: 0.0005  max mem: 43713
Epoch: [50]  [ 600/1251]  eta: 0:05:43  lr: 0.003884  min_lr: 0.003884  loss: 3.5348 (3.5460)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6538 (0.8635)  time: 0.5234  data: 0.0005  max mem: 43713
Epoch: [50]  [ 800/1251]  eta: 0:03:57  lr: 0.003883  min_lr: 0.003883  loss: 3.7077 (3.5461)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8717 (0.8835)  time: 0.5246  data: 0.0005  max mem: 43713
Epoch: [50]  [1000/1251]  eta: 0:02:12  lr: 0.003882  min_lr: 0.003882  loss: 3.8159 (3.5525)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5765 (0.8631)  time: 0.5225  data: 0.0005  max mem: 43713
Epoch: [50]  [1200/1251]  eta: 0:00:26  lr: 0.003881  min_lr: 0.003881  loss: 3.5683 (3.5510)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9242 (0.8686)  time: 0.5222  data: 0.0005  max mem: 43713
Epoch: [50]  [1250/1251]  eta: 0:00:00  lr: 0.003880  min_lr: 0.003880  loss: 3.7603 (3.5489)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7823 (0.8698)  time: 0.4433  data: 0.0006  max mem: 43713
Epoch: [50] Total time: 0:10:56 (0.5248 s / it)
Averaged stats: lr: 0.003880  min_lr: 0.003880  loss: 3.7603 (3.5401)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7823 (0.8698)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.7458 (0.7458)  acc1: 87.6000 (87.6000)  acc5: 99.2000 (99.2000)  time: 5.7340  data: 5.4240  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.9368 (0.9282)  acc1: 81.2000 (80.3636)  acc5: 96.8000 (96.5091)  time: 0.7618  data: 0.4934  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.1714 (1.1282)  acc1: 74.0000 (75.9619)  acc5: 94.0000 (93.7714)  time: 0.2646  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.3132 (1.1383)  acc1: 74.0000 (75.8240)  acc5: 92.0000 (93.6320)  time: 0.2647  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4882 s / it)
* Acc@1 76.348 Acc@5 93.726 loss 1.127
Accuracy of the model on the 50000 test images: 76.3%
Max accuracy: 76.35%
Epoch: [51]  [   0/1251]  eta: 0:55:27  lr: 0.003880  min_lr: 0.003880  loss: 2.8671 (2.8671)  weight_decay: 0.0500 (0.0500)  time: 2.6598  data: 2.1268  max mem: 43713
Epoch: [51]  [ 200/1251]  eta: 0:09:23  lr: 0.003879  min_lr: 0.003879  loss: 3.7114 (3.5656)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7690 (0.8951)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [51]  [ 400/1251]  eta: 0:07:30  lr: 0.003878  min_lr: 0.003878  loss: 3.5739 (3.5621)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8757 (0.8862)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [51]  [ 600/1251]  eta: 0:05:43  lr: 0.003877  min_lr: 0.003877  loss: 3.6567 (3.5407)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8228 (0.8757)  time: 0.5237  data: 0.0004  max mem: 43713
Epoch: [51]  [ 800/1251]  eta: 0:03:57  lr: 0.003875  min_lr: 0.003875  loss: 3.5750 (3.5383)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7556 (0.8609)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [51]  [1000/1251]  eta: 0:02:12  lr: 0.003874  min_lr: 0.003874  loss: 3.2866 (3.5371)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9282 (0.8514)  time: 0.5222  data: 0.0004  max mem: 43713
Epoch: [51]  [1200/1251]  eta: 0:00:26  lr: 0.003873  min_lr: 0.003873  loss: 3.5483 (3.5214)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7185 (0.8311)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [51]  [1250/1251]  eta: 0:00:00  lr: 0.003873  min_lr: 0.003873  loss: 3.8138 (3.5256)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7178 (0.8267)  time: 0.4436  data: 0.0006  max mem: 43713
Epoch: [51] Total time: 0:10:56 (0.5250 s / it)
Averaged stats: lr: 0.003873  min_lr: 0.003873  loss: 3.8138 (3.5258)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7178 (0.8267)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.8476 (0.8476)  acc1: 85.2000 (85.2000)  acc5: 97.2000 (97.2000)  time: 5.3409  data: 5.0542  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.9414 (0.9928)  acc1: 80.8000 (80.1091)  acc5: 96.4000 (96.0364)  time: 0.7262  data: 0.4597  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.2069 (1.1648)  acc1: 74.0000 (75.8095)  acc5: 93.2000 (93.5619)  time: 0.2646  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.2558 (1.1744)  acc1: 74.0000 (75.6640)  acc5: 92.4000 (93.3760)  time: 0.2645  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4721 s / it)
* Acc@1 76.090 Acc@5 93.744 loss 1.170
Accuracy of the model on the 50000 test images: 76.1%
Max accuracy: 76.35%
Epoch: [52]  [   0/1251]  eta: 1:13:33  lr: 0.003873  min_lr: 0.003873  loss: 3.3406 (3.3406)  weight_decay: 0.0500 (0.0500)  time: 3.5280  data: 2.7242  max mem: 43713
Epoch: [52]  [ 200/1251]  eta: 0:09:26  lr: 0.003871  min_lr: 0.003871  loss: 3.7303 (3.5262)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6229 (0.8548)  time: 0.5241  data: 0.0004  max mem: 43713
Epoch: [52]  [ 400/1251]  eta: 0:07:32  lr: 0.003870  min_lr: 0.003870  loss: 3.6655 (3.5504)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7047 (nan)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [52]  [ 600/1251]  eta: 0:05:44  lr: 0.003869  min_lr: 0.003869  loss: 3.8702 (3.5582)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7741 (nan)  time: 0.5244  data: 0.0004  max mem: 43713
Epoch: [52]  [ 800/1251]  eta: 0:03:58  lr: 0.003867  min_lr: 0.003867  loss: 3.4833 (3.5530)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5906 (nan)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [52]  [1000/1251]  eta: 0:02:12  lr: 0.003866  min_lr: 0.003866  loss: 3.5832 (3.5391)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6026 (nan)  time: 0.5237  data: 0.0004  max mem: 43713
Epoch: [52]  [1200/1251]  eta: 0:00:26  lr: 0.003865  min_lr: 0.003865  loss: 3.8556 (3.5438)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6439 (nan)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [52]  [1250/1251]  eta: 0:00:00  lr: 0.003865  min_lr: 0.003865  loss: 3.5920 (3.5458)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7743 (nan)  time: 0.4435  data: 0.0005  max mem: 43713
Epoch: [52] Total time: 0:10:57 (0.5259 s / it)
Averaged stats: lr: 0.003865  min_lr: 0.003865  loss: 3.5920 (3.5220)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7743 (nan)
Test:  [ 0/25]  eta: 0:01:49  loss: 0.7505 (0.7505)  acc1: 86.8000 (86.8000)  acc5: 96.8000 (96.8000)  time: 4.3712  data: 4.0692  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.9558 (0.9742)  acc1: 79.6000 (80.1455)  acc5: 96.4000 (96.0364)  time: 0.6873  data: 0.4189  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.1686 (1.1586)  acc1: 74.0000 (76.0952)  acc5: 93.2000 (93.6571)  time: 0.2917  data: 0.0270  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.2719 (1.1636)  acc1: 74.0000 (75.9840)  acc5: 92.8000 (93.5840)  time: 0.2645  data: 0.0002  max mem: 43713
Test: Total time: 0:00:11 (0.4541 s / it)
* Acc@1 76.490 Acc@5 93.756 loss 1.148
Accuracy of the model on the 50000 test images: 76.5%
Max accuracy: 76.49%
Epoch: [53]  [   0/1251]  eta: 0:58:07  lr: 0.003865  min_lr: 0.003865  loss: 3.9257 (3.9257)  weight_decay: 0.0500 (0.0500)  time: 2.7878  data: 2.2548  max mem: 43713
Epoch: [53]  [ 200/1251]  eta: 0:09:22  lr: 0.003863  min_lr: 0.003863  loss: 3.7036 (3.5076)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6992 (0.8460)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [53]  [ 400/1251]  eta: 0:07:31  lr: 0.003862  min_lr: 0.003862  loss: 3.4879 (3.5194)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1692 (0.9092)  time: 0.5232  data: 0.0004  max mem: 43713
Epoch: [53]  [ 600/1251]  eta: 0:05:43  lr: 0.003861  min_lr: 0.003861  loss: 3.5948 (3.5102)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7039 (0.8841)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [53]  [ 800/1251]  eta: 0:03:57  lr: 0.003859  min_lr: 0.003859  loss: 3.6026 (3.5213)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6815 (0.8825)  time: 0.5240  data: 0.0004  max mem: 43713
Epoch: [53]  [1000/1251]  eta: 0:02:12  lr: 0.003858  min_lr: 0.003858  loss: 3.3215 (3.5052)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8542 (0.8932)  time: 0.5239  data: 0.0005  max mem: 43713
Epoch: [53]  [1200/1251]  eta: 0:00:26  lr: 0.003857  min_lr: 0.003857  loss: 3.6274 (3.5062)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7742 (0.8945)  time: 0.5240  data: 0.0006  max mem: 43713
Epoch: [53]  [1250/1251]  eta: 0:00:00  lr: 0.003856  min_lr: 0.003856  loss: 3.3839 (3.5063)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9948 (0.8986)  time: 0.4436  data: 0.0007  max mem: 43713
Epoch: [53] Total time: 0:10:57 (0.5255 s / it)
Averaged stats: lr: 0.003856  min_lr: 0.003856  loss: 3.3839 (3.5052)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9948 (0.8986)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.8000 (0.8000)  acc1: 85.6000 (85.6000)  acc5: 97.2000 (97.2000)  time: 5.5338  data: 5.2202  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8801 (0.9332)  acc1: 82.4000 (81.5636)  acc5: 97.2000 (96.2545)  time: 0.7437  data: 0.4749  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.1550 (1.1420)  acc1: 74.4000 (76.7810)  acc5: 94.0000 (93.6571)  time: 0.2647  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.1975 (1.1488)  acc1: 74.0000 (76.5440)  acc5: 91.6000 (93.5520)  time: 0.2647  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4801 s / it)
* Acc@1 76.664 Acc@5 93.788 loss 1.141
Accuracy of the model on the 50000 test images: 76.7%
Max accuracy: 76.66%
Epoch: [54]  [   0/1251]  eta: 0:53:27  lr: 0.003856  min_lr: 0.003856  loss: 3.9234 (3.9234)  weight_decay: 0.0500 (0.0500)  time: 2.5637  data: 2.0335  max mem: 43713
Epoch: [54]  [ 200/1251]  eta: 0:09:22  lr: 0.003855  min_lr: 0.003855  loss: 3.7505 (3.4526)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9424 (0.9801)  time: 0.5311  data: 0.0005  max mem: 43713
Epoch: [54]  [ 400/1251]  eta: 0:07:30  lr: 0.003854  min_lr: 0.003854  loss: 3.4864 (3.4811)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9927 (0.9454)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [54]  [ 600/1251]  eta: 0:05:43  lr: 0.003852  min_lr: 0.003852  loss: 3.4735 (3.4886)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9454 (0.9306)  time: 0.5242  data: 0.0005  max mem: 43713
Epoch: [54]  [ 800/1251]  eta: 0:03:57  lr: 0.003851  min_lr: 0.003851  loss: 3.6670 (3.4937)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7027 (0.9258)  time: 0.5241  data: 0.0005  max mem: 43713
Epoch: [54]  [1000/1251]  eta: 0:02:12  lr: 0.003849  min_lr: 0.003849  loss: 3.5677 (3.5119)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8166 (0.9061)  time: 0.5244  data: 0.0005  max mem: 43713
Epoch: [54]  [1200/1251]  eta: 0:00:26  lr: 0.003848  min_lr: 0.003848  loss: 3.4374 (3.5232)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7442 (0.9066)  time: 0.5240  data: 0.0005  max mem: 43713
Epoch: [54]  [1250/1251]  eta: 0:00:00  lr: 0.003848  min_lr: 0.003848  loss: 3.4020 (3.5223)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7442 (0.9061)  time: 0.4444  data: 0.0005  max mem: 43713
Epoch: [54] Total time: 0:10:57 (0.5252 s / it)
Averaged stats: lr: 0.003848  min_lr: 0.003848  loss: 3.4020 (3.5047)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7442 (0.9061)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.6890 (0.6890)  acc1: 87.6000 (87.6000)  acc5: 97.6000 (97.6000)  time: 5.3096  data: 5.0187  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.8641 (0.9058)  acc1: 80.4000 (80.9818)  acc5: 96.0000 (95.8909)  time: 0.7226  data: 0.4565  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.1702 (1.0987)  acc1: 74.0000 (76.3238)  acc5: 93.6000 (93.5429)  time: 0.2637  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.2210 (1.1012)  acc1: 72.8000 (76.0640)  acc5: 91.6000 (93.6320)  time: 0.2636  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4703 s / it)
* Acc@1 76.510 Acc@5 93.884 loss 1.092
Accuracy of the model on the 50000 test images: 76.5%
Max accuracy: 76.66%
Epoch: [55]  [   0/1251]  eta: 1:13:53  lr: 0.003848  min_lr: 0.003848  loss: 3.7991 (3.7991)  weight_decay: 0.0500 (0.0500)  time: 3.5439  data: 2.8523  max mem: 43713
Epoch: [55]  [ 200/1251]  eta: 0:09:28  lr: 0.003846  min_lr: 0.003846  loss: 3.7415 (3.5060)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3023 (1.1234)  time: 0.5231  data: 0.0004  max mem: 43713
Epoch: [55]  [ 400/1251]  eta: 0:07:32  lr: 0.003845  min_lr: 0.003845  loss: 3.3938 (3.4881)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8135 (0.9659)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [55]  [ 600/1251]  eta: 0:05:44  lr: 0.003844  min_lr: 0.003844  loss: 3.6559 (3.4853)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7492 (0.9374)  time: 0.5355  data: 0.0004  max mem: 43713
Epoch: [55]  [ 800/1251]  eta: 0:03:58  lr: 0.003842  min_lr: 0.003842  loss: 3.7600 (3.4940)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6940 (0.9171)  time: 0.5235  data: 0.0004  max mem: 43713
Epoch: [55]  [1000/1251]  eta: 0:02:12  lr: 0.003841  min_lr: 0.003841  loss: 3.5329 (3.4815)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7715 (0.9063)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [55]  [1200/1251]  eta: 0:00:26  lr: 0.003839  min_lr: 0.003839  loss: 3.6727 (3.4884)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.5245  data: 0.0005  max mem: 43713
Epoch: [55]  [1250/1251]  eta: 0:00:00  lr: 0.003839  min_lr: 0.003839  loss: 2.8903 (3.4842)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.4434  data: 0.0005  max mem: 43713
Epoch: [55] Total time: 0:10:58 (0.5262 s / it)
Averaged stats: lr: 0.003839  min_lr: 0.003839  loss: 2.8903 (3.5050)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.7276 (0.7276)  acc1: 86.0000 (86.0000)  acc5: 96.8000 (96.8000)  time: 5.4838  data: 5.1929  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.9067 (0.8850)  acc1: 80.8000 (81.4909)  acc5: 96.4000 (96.3636)  time: 0.7388  data: 0.4723  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0765 (1.0693)  acc1: 74.8000 (76.9333)  acc5: 93.6000 (93.7524)  time: 0.2639  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.1692 (1.0757)  acc1: 74.8000 (76.7200)  acc5: 92.4000 (93.8080)  time: 0.2637  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4766 s / it)
* Acc@1 76.752 Acc@5 93.984 loss 1.071
Accuracy of the model on the 50000 test images: 76.8%
Max accuracy: 76.75%
Epoch: [56]  [   0/1251]  eta: 1:00:44  lr: 0.003839  min_lr: 0.003839  loss: 3.8203 (3.8203)  weight_decay: 0.0500 (0.0500)  time: 2.9131  data: 2.3829  max mem: 43713
Epoch: [56]  [ 200/1251]  eta: 0:09:22  lr: 0.003838  min_lr: 0.003838  loss: 3.7364 (3.4742)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6796 (0.8744)  time: 0.5239  data: 0.0005  max mem: 43713
Epoch: [56]  [ 400/1251]  eta: 0:07:31  lr: 0.003836  min_lr: 0.003836  loss: 3.6204 (3.4582)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0645 (0.9503)  time: 0.5328  data: 0.0004  max mem: 43713
Epoch: [56]  [ 600/1251]  eta: 0:05:44  lr: 0.003835  min_lr: 0.003835  loss: 3.6376 (3.4811)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7916 (0.9169)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [56]  [ 800/1251]  eta: 0:03:57  lr: 0.003833  min_lr: 0.003833  loss: 3.7391 (3.5000)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8884 (0.9025)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [56]  [1000/1251]  eta: 0:02:12  lr: 0.003832  min_lr: 0.003832  loss: 3.3063 (3.4940)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6749 (0.9054)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [56]  [1200/1251]  eta: 0:00:26  lr: 0.003831  min_lr: 0.003831  loss: 3.1765 (3.5014)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7629 (0.8943)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [56]  [1250/1251]  eta: 0:00:00  lr: 0.003830  min_lr: 0.003830  loss: 3.8271 (3.5018)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2006 (0.9023)  time: 0.4435  data: 0.0007  max mem: 43713
Epoch: [56] Total time: 0:10:57 (0.5254 s / it)
Averaged stats: lr: 0.003830  min_lr: 0.003830  loss: 3.8271 (3.4991)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2006 (0.9023)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.7855 (0.7855)  acc1: 86.0000 (86.0000)  acc5: 98.4000 (98.4000)  time: 5.5360  data: 5.2412  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.9849 (0.9434)  acc1: 81.6000 (82.1818)  acc5: 96.8000 (96.4000)  time: 0.7440  data: 0.4768  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0901 (1.1242)  acc1: 77.2000 (77.6381)  acc5: 93.2000 (94.0762)  time: 0.2649  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.2097 (1.1340)  acc1: 74.8000 (77.1840)  acc5: 92.8000 (93.9200)  time: 0.2649  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4802 s / it)
* Acc@1 76.828 Acc@5 94.004 loss 1.128
Accuracy of the model on the 50000 test images: 76.8%
Max accuracy: 76.83%
Epoch: [57]  [   0/1251]  eta: 0:58:41  lr: 0.003830  min_lr: 0.003830  loss: 3.8646 (3.8646)  weight_decay: 0.0500 (0.0500)  time: 2.8149  data: 2.2731  max mem: 43713
Epoch: [57]  [ 200/1251]  eta: 0:09:22  lr: 0.003829  min_lr: 0.003829  loss: 3.1657 (3.4504)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7894 (0.8888)  time: 0.5342  data: 0.0004  max mem: 43713
Epoch: [57]  [ 400/1251]  eta: 0:07:31  lr: 0.003827  min_lr: 0.003827  loss: 3.3192 (3.4637)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7846 (0.9609)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [57]  [ 600/1251]  eta: 0:05:43  lr: 0.003826  min_lr: 0.003826  loss: 3.7946 (3.4793)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2498 (0.9520)  time: 0.5223  data: 0.0004  max mem: 43713
Epoch: [57]  [ 800/1251]  eta: 0:03:57  lr: 0.003824  min_lr: 0.003824  loss: 3.5727 (3.4722)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8984 (0.9543)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [57]  [1000/1251]  eta: 0:02:12  lr: 0.003823  min_lr: 0.003823  loss: 3.5804 (3.4824)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7832 (0.9391)  time: 0.5235  data: 0.0004  max mem: 43713
Epoch: [57]  [1200/1251]  eta: 0:00:26  lr: 0.003821  min_lr: 0.003821  loss: 3.3280 (3.4864)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8894 (0.9479)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [57]  [1250/1251]  eta: 0:00:00  lr: 0.003821  min_lr: 0.003821  loss: 3.5319 (3.4827)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9964 (0.9553)  time: 0.4434  data: 0.0005  max mem: 43713
Epoch: [57] Total time: 0:10:57 (0.5253 s / it)
Averaged stats: lr: 0.003821  min_lr: 0.003821  loss: 3.5319 (3.4821)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9964 (0.9553)
Test:  [ 0/25]  eta: 0:02:28  loss: 0.7367 (0.7367)  acc1: 84.0000 (84.0000)  acc5: 97.6000 (97.6000)  time: 5.9398  data: 5.6429  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.9001 (0.9098)  acc1: 80.0000 (80.1455)  acc5: 96.8000 (96.2545)  time: 0.7803  data: 0.5133  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0746 (1.1050)  acc1: 74.0000 (76.3810)  acc5: 92.8000 (93.8857)  time: 0.2642  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.2506 (1.1133)  acc1: 73.2000 (76.2400)  acc5: 92.4000 (93.7440)  time: 0.2641  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4956 s / it)
* Acc@1 76.948 Acc@5 93.994 loss 1.097
Accuracy of the model on the 50000 test images: 76.9%
Max accuracy: 76.95%
Epoch: [58]  [   0/1251]  eta: 0:56:10  lr: 0.003821  min_lr: 0.003821  loss: 3.7780 (3.7780)  weight_decay: 0.0500 (0.0500)  time: 2.6941  data: 2.1467  max mem: 43713
Epoch: [58]  [ 200/1251]  eta: 0:09:23  lr: 0.003820  min_lr: 0.003820  loss: 3.5832 (3.4008)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8144 (0.8596)  time: 0.5240  data: 0.0004  max mem: 43713
Epoch: [58]  [ 400/1251]  eta: 0:07:30  lr: 0.003818  min_lr: 0.003818  loss: 3.6102 (3.4312)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6931 (0.8247)  time: 0.5223  data: 0.0004  max mem: 43713
Epoch: [58]  [ 600/1251]  eta: 0:05:43  lr: 0.003817  min_lr: 0.003817  loss: 3.7839 (3.4650)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8264 (0.8486)  time: 0.5236  data: 0.0005  max mem: 43713
Epoch: [58]  [ 800/1251]  eta: 0:03:57  lr: 0.003815  min_lr: 0.003815  loss: 3.4506 (3.4830)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0160 (0.9195)  time: 0.5233  data: 0.0004  max mem: 43713
Epoch: [58]  [1000/1251]  eta: 0:02:12  lr: 0.003813  min_lr: 0.003813  loss: 3.6079 (3.4861)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7809 (0.9026)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [58]  [1200/1251]  eta: 0:00:26  lr: 0.003812  min_lr: 0.003812  loss: 3.6621 (3.4830)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8295 (0.9041)  time: 0.5301  data: 0.0004  max mem: 43713
Epoch: [58]  [1250/1251]  eta: 0:00:00  lr: 0.003812  min_lr: 0.003812  loss: 3.5974 (3.4827)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6664 (0.9056)  time: 0.4443  data: 0.0006  max mem: 43713
Epoch: [58] Total time: 0:10:57 (0.5256 s / it)
Averaged stats: lr: 0.003812  min_lr: 0.003812  loss: 3.5974 (3.4826)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6664 (0.9056)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.7633 (0.7633)  acc1: 85.6000 (85.6000)  acc5: 98.4000 (98.4000)  time: 5.5587  data: 5.2643  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8864 (0.9587)  acc1: 82.0000 (80.6909)  acc5: 96.8000 (96.4727)  time: 0.7455  data: 0.4788  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.2369 (1.1397)  acc1: 73.6000 (76.6286)  acc5: 93.6000 (94.2286)  time: 0.2640  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.2436 (1.1497)  acc1: 73.6000 (76.5440)  acc5: 93.2000 (94.0960)  time: 0.2639  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4789 s / it)
* Acc@1 76.972 Acc@5 94.016 loss 1.138
Accuracy of the model on the 50000 test images: 77.0%
Max accuracy: 76.97%
Epoch: [59]  [   0/1251]  eta: 0:57:51  lr: 0.003812  min_lr: 0.003812  loss: 3.7146 (3.7146)  weight_decay: 0.0500 (0.0500)  time: 2.7748  data: 2.2423  max mem: 43713
Epoch: [59]  [ 200/1251]  eta: 0:09:21  lr: 0.003810  min_lr: 0.003810  loss: 3.5076 (3.4049)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6337 (0.8973)  time: 0.5231  data: 0.0004  max mem: 43713
Epoch: [59]  [ 400/1251]  eta: 0:07:29  lr: 0.003809  min_lr: 0.003809  loss: 3.7308 (3.4414)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1390 (0.9774)  time: 0.5235  data: 0.0004  max mem: 43713
Epoch: [59]  [ 600/1251]  eta: 0:05:43  lr: 0.003807  min_lr: 0.003807  loss: 2.8870 (3.4458)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6104 (0.9096)  time: 0.5232  data: 0.0004  max mem: 43713
Epoch: [59]  [ 800/1251]  eta: 0:03:57  lr: 0.003805  min_lr: 0.003805  loss: 3.5359 (3.4595)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8592 (0.9214)  time: 0.5269  data: 0.0004  max mem: 43713
Epoch: [59]  [1000/1251]  eta: 0:02:12  lr: 0.003804  min_lr: 0.003804  loss: 3.2532 (3.4619)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1024 (0.9406)  time: 0.5302  data: 0.0004  max mem: 43713
Epoch: [59]  [1200/1251]  eta: 0:00:26  lr: 0.003802  min_lr: 0.003802  loss: 3.3733 (3.4753)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7573 (0.9227)  time: 0.5237  data: 0.0004  max mem: 43713
Epoch: [59]  [1250/1251]  eta: 0:00:00  lr: 0.003802  min_lr: 0.003802  loss: 3.6245 (3.4766)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8762 (0.9340)  time: 0.4438  data: 0.0005  max mem: 43713
Epoch: [59] Total time: 0:10:57 (0.5257 s / it)
Averaged stats: lr: 0.003802  min_lr: 0.003802  loss: 3.6245 (3.4735)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8762 (0.9340)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.6750 (0.6750)  acc1: 84.4000 (84.4000)  acc5: 98.4000 (98.4000)  time: 5.8283  data: 5.5270  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.9218 (0.8952)  acc1: 80.4000 (80.9091)  acc5: 96.8000 (96.2909)  time: 0.7705  data: 0.5028  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0983 (1.0807)  acc1: 75.2000 (76.7810)  acc5: 92.8000 (93.9810)  time: 0.2648  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.1590 (1.0926)  acc1: 74.4000 (76.2400)  acc5: 92.8000 (93.9200)  time: 0.2647  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4920 s / it)
* Acc@1 77.312 Acc@5 94.148 loss 1.081
Accuracy of the model on the 50000 test images: 77.3%
Max accuracy: 77.31%
Epoch: [60]  [   0/1251]  eta: 0:51:27  lr: 0.003802  min_lr: 0.003802  loss: 3.0217 (3.0217)  weight_decay: 0.0500 (0.0500)  time: 2.4679  data: 1.9365  max mem: 43713
Epoch: [60]  [ 200/1251]  eta: 0:09:21  lr: 0.003800  min_lr: 0.003800  loss: 3.2571 (3.4622)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5748 (0.9505)  time: 0.5247  data: 0.0006  max mem: 43713
Epoch: [60]  [ 400/1251]  eta: 0:07:31  lr: 0.003799  min_lr: 0.003799  loss: 3.4820 (3.4721)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0125 (0.9871)  time: 0.5235  data: 0.0005  max mem: 43713
Epoch: [60]  [ 600/1251]  eta: 0:05:44  lr: 0.003797  min_lr: 0.003797  loss: 3.3994 (3.4828)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0626 (1.0057)  time: 0.5244  data: 0.0006  max mem: 43713
Epoch: [60]  [ 800/1251]  eta: 0:03:58  lr: 0.003796  min_lr: 0.003796  loss: 3.7838 (3.4966)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5859 (0.9352)  time: 0.5313  data: 0.0005  max mem: 43713
Epoch: [60]  [1000/1251]  eta: 0:02:12  lr: 0.003794  min_lr: 0.003794  loss: 3.6046 (3.4964)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9470 (0.9418)  time: 0.5234  data: 0.0005  max mem: 43713
Epoch: [60]  [1200/1251]  eta: 0:00:26  lr: 0.003793  min_lr: 0.003793  loss: 3.7585 (3.4856)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7181 (0.9186)  time: 0.5237  data: 0.0005  max mem: 43713
Epoch: [60]  [1250/1251]  eta: 0:00:00  lr: 0.003792  min_lr: 0.003792  loss: 3.6823 (3.4895)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6646 (0.9088)  time: 0.4438  data: 0.0005  max mem: 43713
Epoch: [60] Total time: 0:10:57 (0.5260 s / it)
Averaged stats: lr: 0.003792  min_lr: 0.003792  loss: 3.6823 (3.4658)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6646 (0.9088)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.7885 (0.7885)  acc1: 86.0000 (86.0000)  acc5: 96.8000 (96.8000)  time: 5.7068  data: 5.4013  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.9794 (0.9724)  acc1: 82.0000 (81.0545)  acc5: 96.8000 (96.2182)  time: 0.7591  data: 0.4913  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.1608 (1.1517)  acc1: 75.2000 (76.9524)  acc5: 93.6000 (94.0762)  time: 0.2641  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.2621 (1.1667)  acc1: 74.4000 (76.3680)  acc5: 92.4000 (93.8240)  time: 0.2640  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4853 s / it)
* Acc@1 76.750 Acc@5 93.956 loss 1.160
Accuracy of the model on the 50000 test images: 76.8%
Max accuracy: 77.31%
Epoch: [61]  [   0/1251]  eta: 1:13:06  lr: 0.003792  min_lr: 0.003792  loss: 4.3739 (4.3739)  weight_decay: 0.0500 (0.0500)  time: 3.5068  data: 2.2511  max mem: 43713
Epoch: [61]  [ 200/1251]  eta: 0:09:30  lr: 0.003791  min_lr: 0.003791  loss: 3.6058 (3.4342)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0403 (0.9159)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [61]  [ 400/1251]  eta: 0:07:33  lr: 0.003789  min_lr: 0.003789  loss: 3.3812 (3.4366)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8509 (0.9412)  time: 0.5234  data: 0.0004  max mem: 43713
Epoch: [61]  [ 600/1251]  eta: 0:05:45  lr: 0.003787  min_lr: 0.003787  loss: 3.5472 (3.4591)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9026 (0.9618)  time: 0.5247  data: 0.0005  max mem: 43713
Epoch: [61]  [ 800/1251]  eta: 0:03:58  lr: 0.003786  min_lr: 0.003786  loss: 3.5273 (3.4694)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9463 (nan)  time: 0.5235  data: 0.0006  max mem: 43713
Epoch: [61]  [1000/1251]  eta: 0:02:12  lr: 0.003784  min_lr: 0.003784  loss: 3.5812 (3.4715)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8439 (nan)  time: 0.5234  data: 0.0005  max mem: 43713
Epoch: [61]  [1200/1251]  eta: 0:00:26  lr: 0.003782  min_lr: 0.003782  loss: 3.6474 (3.4803)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9152 (nan)  time: 0.5318  data: 0.0006  max mem: 43713
Epoch: [61]  [1250/1251]  eta: 0:00:00  lr: 0.003782  min_lr: 0.003782  loss: 3.4673 (3.4814)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7110 (nan)  time: 0.4438  data: 0.0007  max mem: 43713
Epoch: [61] Total time: 0:10:59 (0.5268 s / it)
Averaged stats: lr: 0.003782  min_lr: 0.003782  loss: 3.4673 (3.4615)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7110 (nan)
Test:  [ 0/25]  eta: 0:02:00  loss: 0.7438 (0.7438)  acc1: 88.0000 (88.0000)  acc5: 97.2000 (97.2000)  time: 4.8112  data: 4.4813  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.8490 (0.8861)  acc1: 82.4000 (82.0000)  acc5: 96.8000 (96.1091)  time: 0.6781  data: 0.4077  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.1158 (1.0598)  acc1: 74.4000 (77.9238)  acc5: 94.0000 (93.9048)  time: 0.2646  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.2061 (1.0720)  acc1: 73.6000 (77.3600)  acc5: 93.6000 (93.8720)  time: 0.2645  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4509 s / it)
* Acc@1 77.218 Acc@5 94.160 loss 1.058
Accuracy of the model on the 50000 test images: 77.2%
Max accuracy: 77.31%
Epoch: [62]  [   0/1251]  eta: 1:13:19  lr: 0.003782  min_lr: 0.003782  loss: 2.9653 (2.9653)  weight_decay: 0.0500 (0.0500)  time: 3.5167  data: 2.5810  max mem: 43713
Epoch: [62]  [ 200/1251]  eta: 0:09:26  lr: 0.003780  min_lr: 0.003780  loss: 3.2665 (3.4173)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9485 (0.9104)  time: 0.5236  data: 0.0005  max mem: 43713
Epoch: [62]  [ 400/1251]  eta: 0:07:32  lr: 0.003779  min_lr: 0.003779  loss: 3.7481 (3.4145)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8899 (0.9516)  time: 0.5223  data: 0.0004  max mem: 43713
Epoch: [62]  [ 600/1251]  eta: 0:05:44  lr: 0.003777  min_lr: 0.003777  loss: 3.5120 (3.4220)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7304 (0.9058)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [62]  [ 800/1251]  eta: 0:03:58  lr: 0.003775  min_lr: 0.003775  loss: 3.6002 (3.4348)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9582 (0.9091)  time: 0.5233  data: 0.0004  max mem: 43713
Epoch: [62]  [1000/1251]  eta: 0:02:12  lr: 0.003774  min_lr: 0.003774  loss: 3.5460 (3.4349)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0630 (0.9417)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [62]  [1200/1251]  eta: 0:00:26  lr: 0.003772  min_lr: 0.003772  loss: 3.4708 (3.4322)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8456 (0.9495)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [62]  [1250/1251]  eta: 0:00:00  lr: 0.003772  min_lr: 0.003772  loss: 3.5059 (3.4357)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8456 (0.9441)  time: 0.4435  data: 0.0006  max mem: 43713
Epoch: [62] Total time: 0:10:57 (0.5258 s / it)
Averaged stats: lr: 0.003772  min_lr: 0.003772  loss: 3.5059 (3.4510)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8456 (0.9441)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.6564 (0.6564)  acc1: 86.4000 (86.4000)  acc5: 98.4000 (98.4000)  time: 5.3556  data: 5.0638  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.8643 (0.9005)  acc1: 81.6000 (81.0909)  acc5: 97.2000 (96.5091)  time: 0.7277  data: 0.4607  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.1011 (1.0852)  acc1: 74.0000 (76.9714)  acc5: 93.6000 (94.1333)  time: 0.2649  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.2366 (1.0994)  acc1: 74.4000 (76.7200)  acc5: 92.8000 (94.1440)  time: 0.2649  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4744 s / it)
* Acc@1 77.240 Acc@5 94.254 loss 1.082
Accuracy of the model on the 50000 test images: 77.2%
Max accuracy: 77.31%
Epoch: [63]  [   0/1251]  eta: 1:13:36  lr: 0.003772  min_lr: 0.003772  loss: 2.8101 (2.8101)  weight_decay: 0.0500 (0.0500)  time: 3.5303  data: 2.1972  max mem: 43713
Epoch: [63]  [ 200/1251]  eta: 0:09:26  lr: 0.003770  min_lr: 0.003770  loss: 3.6280 (3.4454)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6360 (0.7920)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [63]  [ 400/1251]  eta: 0:07:33  lr: 0.003768  min_lr: 0.003768  loss: 3.5371 (3.4364)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7424 (0.9205)  time: 0.5251  data: 0.0004  max mem: 43713
Epoch: [63]  [ 600/1251]  eta: 0:05:44  lr: 0.003767  min_lr: 0.003767  loss: 3.6268 (3.4429)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7590 (0.8812)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [63]  [ 800/1251]  eta: 0:03:58  lr: 0.003765  min_lr: 0.003765  loss: 3.7290 (3.4550)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6878 (0.9026)  time: 0.5238  data: 0.0005  max mem: 43713
Epoch: [63]  [1000/1251]  eta: 0:02:12  lr: 0.003763  min_lr: 0.003763  loss: 3.6155 (3.4561)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6615 (0.9083)  time: 0.5240  data: 0.0005  max mem: 43713
Epoch: [63]  [1200/1251]  eta: 0:00:26  lr: 0.003762  min_lr: 0.003762  loss: 3.7299 (3.4473)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0644 (0.9086)  time: 0.5258  data: 0.0005  max mem: 43713
Epoch: [63]  [1250/1251]  eta: 0:00:00  lr: 0.003761  min_lr: 0.003761  loss: 3.3901 (3.4467)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0257 (0.9092)  time: 0.4441  data: 0.0007  max mem: 43713
Epoch: [63] Total time: 0:10:59 (0.5270 s / it)
Averaged stats: lr: 0.003761  min_lr: 0.003761  loss: 3.3901 (3.4477)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0257 (0.9092)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.7077 (0.7077)  acc1: 87.2000 (87.2000)  acc5: 97.6000 (97.6000)  time: 5.4430  data: 5.1275  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.9244 (0.9084)  acc1: 79.6000 (80.6182)  acc5: 96.8000 (96.2545)  time: 0.7354  data: 0.4664  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0786 (1.0953)  acc1: 73.6000 (76.9905)  acc5: 93.2000 (94.0000)  time: 0.2648  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.2184 (1.0974)  acc1: 74.8000 (76.9920)  acc5: 92.8000 (93.9840)  time: 0.2648  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4771 s / it)
* Acc@1 77.262 Acc@5 94.136 loss 1.092
Accuracy of the model on the 50000 test images: 77.3%
Max accuracy: 77.31%
Epoch: [64]  [   0/1251]  eta: 1:17:57  lr: 0.003761  min_lr: 0.003761  loss: 3.3421 (3.3421)  weight_decay: 0.0500 (0.0500)  time: 3.7392  data: 2.3223  max mem: 43713
Epoch: [64]  [ 200/1251]  eta: 0:09:28  lr: 0.003760  min_lr: 0.003760  loss: 3.3707 (3.4593)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8116 (0.8724)  time: 0.5230  data: 0.0005  max mem: 43713
Epoch: [64]  [ 400/1251]  eta: 0:07:33  lr: 0.003758  min_lr: 0.003758  loss: 3.5583 (3.4533)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6832 (0.9328)  time: 0.5239  data: 0.0005  max mem: 43713
Epoch: [64]  [ 600/1251]  eta: 0:05:44  lr: 0.003756  min_lr: 0.003756  loss: 3.6210 (3.4585)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8596 (0.9295)  time: 0.5223  data: 0.0005  max mem: 43713
Epoch: [64]  [ 800/1251]  eta: 0:03:58  lr: 0.003754  min_lr: 0.003754  loss: 3.5015 (3.4587)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8187 (0.9664)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [64]  [1000/1251]  eta: 0:02:12  lr: 0.003753  min_lr: 0.003753  loss: 3.5106 (3.4510)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6724 (0.9397)  time: 0.5224  data: 0.0005  max mem: 43713
Epoch: [64]  [1200/1251]  eta: 0:00:26  lr: 0.003751  min_lr: 0.003751  loss: 3.5346 (3.4610)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7388 (0.9293)  time: 0.5225  data: 0.0005  max mem: 43713
Epoch: [64]  [1250/1251]  eta: 0:00:00  lr: 0.003751  min_lr: 0.003751  loss: 3.3643 (3.4566)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7388 (0.9219)  time: 0.4495  data: 0.0007  max mem: 43713
Epoch: [64] Total time: 0:10:57 (0.5260 s / it)
Averaged stats: lr: 0.003751  min_lr: 0.003751  loss: 3.3643 (3.4542)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7388 (0.9219)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.6410 (0.6410)  acc1: 87.2000 (87.2000)  acc5: 98.0000 (98.0000)  time: 5.3706  data: 5.0842  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.8755 (0.8369)  acc1: 82.8000 (81.7818)  acc5: 96.0000 (96.4364)  time: 0.7286  data: 0.4625  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0668 (1.0262)  acc1: 75.6000 (77.5810)  acc5: 93.2000 (94.1524)  time: 0.2644  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.1612 (1.0373)  acc1: 75.6000 (77.3760)  acc5: 92.8000 (94.0160)  time: 0.2643  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4743 s / it)
* Acc@1 77.734 Acc@5 94.302 loss 1.025
Accuracy of the model on the 50000 test images: 77.7%
Max accuracy: 77.73%
Epoch: [65]  [   0/1251]  eta: 1:01:42  lr: 0.003751  min_lr: 0.003751  loss: 3.5514 (3.5514)  weight_decay: 0.0500 (0.0500)  time: 2.9598  data: 2.4154  max mem: 43713
Epoch: [65]  [ 200/1251]  eta: 0:09:25  lr: 0.003749  min_lr: 0.003749  loss: 3.4111 (3.4034)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7718 (0.9618)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [65]  [ 400/1251]  eta: 0:07:31  lr: 0.003747  min_lr: 0.003747  loss: 3.4768 (3.4051)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7014 (0.9479)  time: 0.5239  data: 0.0004  max mem: 43713
Epoch: [65]  [ 600/1251]  eta: 0:05:44  lr: 0.003745  min_lr: 0.003745  loss: 3.4392 (3.4200)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7466 (0.9199)  time: 0.5279  data: 0.0004  max mem: 43713
Epoch: [65]  [ 800/1251]  eta: 0:03:57  lr: 0.003744  min_lr: 0.003744  loss: 3.5899 (3.4291)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7082 (0.9145)  time: 0.5230  data: 0.0005  max mem: 43713
Epoch: [65]  [1000/1251]  eta: 0:02:12  lr: 0.003742  min_lr: 0.003742  loss: 3.5909 (3.4548)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8696 (0.9125)  time: 0.5235  data: 0.0004  max mem: 43713
Epoch: [65]  [1200/1251]  eta: 0:00:26  lr: 0.003740  min_lr: 0.003740  loss: 3.3945 (3.4482)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7525 (0.9265)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [65]  [1250/1251]  eta: 0:00:00  lr: 0.003740  min_lr: 0.003740  loss: 3.5979 (3.4457)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6690 (0.9216)  time: 0.4475  data: 0.0006  max mem: 43713
Epoch: [65] Total time: 0:10:57 (0.5255 s / it)
Averaged stats: lr: 0.003740  min_lr: 0.003740  loss: 3.5979 (3.4389)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6690 (0.9216)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.7092 (0.7092)  acc1: 84.4000 (84.4000)  acc5: 98.8000 (98.8000)  time: 5.8459  data: 5.5512  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8996 (0.8732)  acc1: 80.4000 (81.8909)  acc5: 96.4000 (96.3636)  time: 0.7720  data: 0.5049  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0363 (1.0530)  acc1: 76.0000 (77.5810)  acc5: 94.0000 (94.1714)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.1437 (1.0649)  acc1: 75.2000 (77.1680)  acc5: 92.8000 (94.1600)  time: 0.2644  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4930 s / it)
* Acc@1 77.516 Acc@5 94.308 loss 1.057
Accuracy of the model on the 50000 test images: 77.5%
Max accuracy: 77.73%
Epoch: [66]  [   0/1251]  eta: 1:12:05  lr: 0.003740  min_lr: 0.003740  loss: 3.0217 (3.0217)  weight_decay: 0.0500 (0.0500)  time: 3.4577  data: 1.5862  max mem: 43713
Epoch: [66]  [ 200/1251]  eta: 0:09:25  lr: 0.003738  min_lr: 0.003738  loss: 3.4512 (3.4618)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8062 (0.8879)  time: 0.5231  data: 0.0004  max mem: 43713
Epoch: [66]  [ 400/1251]  eta: 0:07:32  lr: 0.003736  min_lr: 0.003736  loss: 3.6645 (3.4592)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7042 (0.8930)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [66]  [ 600/1251]  eta: 0:05:44  lr: 0.003734  min_lr: 0.003734  loss: 3.5872 (3.4582)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9949 (0.9659)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [66]  [ 800/1251]  eta: 0:03:57  lr: 0.003732  min_lr: 0.003732  loss: 3.4459 (3.4498)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7816 (0.9315)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [66]  [1000/1251]  eta: 0:02:12  lr: 0.003731  min_lr: 0.003731  loss: 3.3957 (3.4504)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6560 (0.9510)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [66]  [1200/1251]  eta: 0:00:26  lr: 0.003729  min_lr: 0.003729  loss: 3.7026 (3.4466)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8651 (0.9301)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [66]  [1250/1251]  eta: 0:00:00  lr: 0.003728  min_lr: 0.003728  loss: 3.6204 (3.4516)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7072 (0.9196)  time: 0.4433  data: 0.0005  max mem: 43713
Epoch: [66] Total time: 0:10:57 (0.5255 s / it)
Averaged stats: lr: 0.003728  min_lr: 0.003728  loss: 3.6204 (3.4349)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7072 (0.9196)
Test:  [ 0/25]  eta: 0:01:44  loss: 0.7769 (0.7769)  acc1: 87.2000 (87.2000)  acc5: 97.6000 (97.6000)  time: 4.1649  data: 3.8673  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.9538 (0.9438)  acc1: 81.6000 (81.4182)  acc5: 97.2000 (96.6909)  time: 0.7292  data: 0.4621  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.1867 (1.1080)  acc1: 75.6000 (77.7524)  acc5: 94.0000 (94.6476)  time: 0.3250  data: 0.0608  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.2364 (1.1203)  acc1: 75.2000 (77.4400)  acc5: 93.2000 (94.4800)  time: 0.2643  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4727 s / it)
* Acc@1 77.506 Acc@5 94.384 loss 1.116
Accuracy of the model on the 50000 test images: 77.5%
Max accuracy: 77.73%
Epoch: [67]  [   0/1251]  eta: 1:11:31  lr: 0.003728  min_lr: 0.003728  loss: 3.3842 (3.3842)  weight_decay: 0.0500 (0.0500)  time: 3.4303  data: 1.6440  max mem: 43713
Epoch: [67]  [ 200/1251]  eta: 0:09:26  lr: 0.003727  min_lr: 0.003727  loss: 3.7247 (3.4688)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6960 (0.8767)  time: 0.5334  data: 0.0004  max mem: 43713
Epoch: [67]  [ 400/1251]  eta: 0:07:33  lr: 0.003725  min_lr: 0.003725  loss: 3.6986 (3.4336)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7229 (0.8306)  time: 0.5226  data: 0.0005  max mem: 43713
Epoch: [67]  [ 600/1251]  eta: 0:05:44  lr: 0.003723  min_lr: 0.003723  loss: 3.5719 (3.4377)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9117 (0.8657)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [67]  [ 800/1251]  eta: 0:03:58  lr: 0.003721  min_lr: 0.003721  loss: 3.6173 (3.4467)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8216 (0.8741)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [67]  [1000/1251]  eta: 0:02:12  lr: 0.003719  min_lr: 0.003719  loss: 3.6331 (3.4451)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8922 (0.8842)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [67]  [1200/1251]  eta: 0:00:26  lr: 0.003717  min_lr: 0.003717  loss: 3.7196 (3.4518)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9964 (0.8812)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [67]  [1250/1251]  eta: 0:00:00  lr: 0.003717  min_lr: 0.003717  loss: 3.5304 (3.4557)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7832 (0.8809)  time: 0.4437  data: 0.0007  max mem: 43713
Epoch: [67] Total time: 0:10:57 (0.5259 s / it)
Averaged stats: lr: 0.003717  min_lr: 0.003717  loss: 3.5304 (3.4328)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7832 (0.8809)
Test:  [ 0/25]  eta: 0:01:53  loss: 0.8015 (0.8015)  acc1: 86.4000 (86.4000)  acc5: 98.8000 (98.8000)  time: 4.5517  data: 4.2090  max mem: 43713
Test:  [10/25]  eta: 0:00:09  loss: 0.9822 (1.0023)  acc1: 82.4000 (81.3818)  acc5: 96.8000 (96.4000)  time: 0.6611  data: 0.3877  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.2392 (1.1743)  acc1: 76.0000 (77.6762)  acc5: 93.6000 (94.1524)  time: 0.2685  data: 0.0028  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.2857 (1.1791)  acc1: 75.2000 (77.4240)  acc5: 93.6000 (94.0640)  time: 0.2650  data: 0.0002  max mem: 43713
Test: Total time: 0:00:11 (0.4430 s / it)
* Acc@1 77.564 Acc@5 94.208 loss 1.165
Accuracy of the model on the 50000 test images: 77.6%
Max accuracy: 77.73%
Epoch: [68]  [   0/1251]  eta: 1:14:42  lr: 0.003717  min_lr: 0.003717  loss: 3.0872 (3.0872)  weight_decay: 0.0500 (0.0500)  time: 3.5834  data: 2.9772  max mem: 43713
Epoch: [68]  [ 200/1251]  eta: 0:09:29  lr: 0.003715  min_lr: 0.003715  loss: 3.2306 (3.4046)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6656 (0.8025)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [68]  [ 400/1251]  eta: 0:07:33  lr: 0.003713  min_lr: 0.003713  loss: 3.4135 (3.4058)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7048 (0.8586)  time: 0.5241  data: 0.0004  max mem: 43713
Epoch: [68]  [ 600/1251]  eta: 0:05:45  lr: 0.003711  min_lr: 0.003711  loss: 3.6521 (3.4137)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7094 (0.8826)  time: 0.5318  data: 0.0004  max mem: 43713
Epoch: [68]  [ 800/1251]  eta: 0:03:58  lr: 0.003710  min_lr: 0.003710  loss: 3.6304 (3.4216)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6734 (0.8629)  time: 0.5236  data: 0.0004  max mem: 43713
Epoch: [68]  [1000/1251]  eta: 0:02:12  lr: 0.003708  min_lr: 0.003708  loss: 3.5444 (3.4292)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6731 (0.8759)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [68]  [1200/1251]  eta: 0:00:26  lr: 0.003706  min_lr: 0.003706  loss: 3.2849 (3.4222)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8378 (0.8904)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [68]  [1250/1251]  eta: 0:00:00  lr: 0.003705  min_lr: 0.003705  loss: 3.4294 (3.4207)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9052 (0.8907)  time: 0.4435  data: 0.0006  max mem: 43713
Epoch: [68] Total time: 0:10:58 (0.5267 s / it)
Averaged stats: lr: 0.003705  min_lr: 0.003705  loss: 3.4294 (3.4203)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9052 (0.8907)
Test:  [ 0/25]  eta: 0:02:09  loss: 0.6743 (0.6743)  acc1: 87.6000 (87.6000)  acc5: 98.0000 (98.0000)  time: 5.1807  data: 4.8680  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.8296 (0.8649)  acc1: 80.8000 (81.2000)  acc5: 96.8000 (96.5091)  time: 0.7117  data: 0.4429  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.1167 (1.0401)  acc1: 74.8000 (77.4476)  acc5: 93.6000 (94.5333)  time: 0.2646  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.1525 (1.0458)  acc1: 74.8000 (77.2480)  acc5: 93.6000 (94.5440)  time: 0.2645  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4656 s / it)
* Acc@1 77.770 Acc@5 94.608 loss 1.034
Accuracy of the model on the 50000 test images: 77.8%
Max accuracy: 77.77%
Epoch: [69]  [   0/1251]  eta: 0:57:50  lr: 0.003705  min_lr: 0.003705  loss: 3.7340 (3.7340)  weight_decay: 0.0500 (0.0500)  time: 2.7745  data: 2.2406  max mem: 43713
Epoch: [69]  [ 200/1251]  eta: 0:09:22  lr: 0.003703  min_lr: 0.003703  loss: 3.6278 (3.3743)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9573 (0.9752)  time: 0.5231  data: 0.0005  max mem: 43713
Epoch: [69]  [ 400/1251]  eta: 0:07:31  lr: 0.003702  min_lr: 0.003702  loss: 3.5105 (3.3861)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6943 (0.9115)  time: 0.5347  data: 0.0004  max mem: 43713
Epoch: [69]  [ 600/1251]  eta: 0:05:44  lr: 0.003700  min_lr: 0.003700  loss: 3.3516 (3.3852)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8574 (0.8986)  time: 0.5231  data: 0.0004  max mem: 43713
Epoch: [69]  [ 800/1251]  eta: 0:03:57  lr: 0.003698  min_lr: 0.003698  loss: 3.6438 (3.3977)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7481 (0.8702)  time: 0.5289  data: 0.0005  max mem: 43713
Epoch: [69]  [1000/1251]  eta: 0:02:12  lr: 0.003696  min_lr: 0.003696  loss: 3.6481 (3.3977)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0137 (0.9210)  time: 0.5239  data: 0.0005  max mem: 43713
Epoch: [69]  [1200/1251]  eta: 0:00:26  lr: 0.003694  min_lr: 0.003694  loss: 3.4591 (3.4074)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5690 (0.9203)  time: 0.5245  data: 0.0004  max mem: 43713
Epoch: [69]  [1250/1251]  eta: 0:00:00  lr: 0.003694  min_lr: 0.003694  loss: 3.5218 (3.4122)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6489 (0.9269)  time: 0.4444  data: 0.0006  max mem: 43713
Epoch: [69] Total time: 0:10:57 (0.5258 s / it)
Averaged stats: lr: 0.003694  min_lr: 0.003694  loss: 3.5218 (3.4308)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6489 (0.9269)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.7437 (0.7437)  acc1: 86.0000 (86.0000)  acc5: 98.4000 (98.4000)  time: 5.7500  data: 5.4445  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.9689 (0.9276)  acc1: 80.0000 (81.2364)  acc5: 96.4000 (96.2909)  time: 0.7633  data: 0.4952  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.1009 (1.1041)  acc1: 76.0000 (77.2952)  acc5: 93.6000 (94.0762)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.2412 (1.1135)  acc1: 76.0000 (77.1200)  acc5: 92.4000 (93.9520)  time: 0.2645  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4899 s / it)
* Acc@1 77.620 Acc@5 94.314 loss 1.100
Accuracy of the model on the 50000 test images: 77.6%
Max accuracy: 77.77%
Epoch: [70]  [   0/1251]  eta: 1:07:35  lr: 0.003694  min_lr: 0.003694  loss: 4.0833 (4.0833)  weight_decay: 0.0500 (0.0500)  time: 3.2415  data: 1.6789  max mem: 43713
Epoch: [70]  [ 200/1251]  eta: 0:09:25  lr: 0.003692  min_lr: 0.003692  loss: 3.3819 (3.3850)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6653 (0.7278)  time: 0.5233  data: 0.0004  max mem: 43713
Epoch: [70]  [ 400/1251]  eta: 0:07:32  lr: 0.003690  min_lr: 0.003690  loss: 3.6080 (3.4036)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7931 (0.8089)  time: 0.5233  data: 0.0005  max mem: 43713
Epoch: [70]  [ 600/1251]  eta: 0:05:44  lr: 0.003688  min_lr: 0.003688  loss: 3.5255 (3.4082)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9323 (0.8600)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [70]  [ 800/1251]  eta: 0:03:58  lr: 0.003686  min_lr: 0.003686  loss: 3.6210 (3.4001)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6697 (0.8471)  time: 0.5293  data: 0.0004  max mem: 43713
Epoch: [70]  [1000/1251]  eta: 0:02:12  lr: 0.003684  min_lr: 0.003684  loss: 3.6056 (3.4002)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7711 (0.8760)  time: 0.5243  data: 0.0004  max mem: 43713
Epoch: [70]  [1200/1251]  eta: 0:00:26  lr: 0.003682  min_lr: 0.003682  loss: 3.3971 (3.4127)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6563 (0.8682)  time: 0.5244  data: 0.0005  max mem: 43713
Epoch: [70]  [1250/1251]  eta: 0:00:00  lr: 0.003682  min_lr: 0.003682  loss: 3.7148 (3.4159)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7046 (0.8664)  time: 0.4437  data: 0.0007  max mem: 43713
Epoch: [70] Total time: 0:10:58 (0.5260 s / it)
Averaged stats: lr: 0.003682  min_lr: 0.003682  loss: 3.7148 (3.4127)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7046 (0.8664)
Test:  [ 0/25]  eta: 0:02:09  loss: 0.6535 (0.6535)  acc1: 88.8000 (88.8000)  acc5: 99.2000 (99.2000)  time: 5.1999  data: 4.9134  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.8468 (0.8814)  acc1: 82.0000 (82.1818)  acc5: 97.2000 (96.6546)  time: 0.7131  data: 0.4470  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.1326 (1.0730)  acc1: 74.8000 (77.7714)  acc5: 93.2000 (94.2667)  time: 0.2643  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.2346 (1.0873)  acc1: 74.0000 (77.3920)  acc5: 92.4000 (94.0800)  time: 0.2643  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4667 s / it)
* Acc@1 77.708 Acc@5 94.324 loss 1.073
Accuracy of the model on the 50000 test images: 77.7%
Max accuracy: 77.77%
Epoch: [71]  [   0/1251]  eta: 1:15:09  lr: 0.003681  min_lr: 0.003681  loss: 3.3811 (3.3811)  weight_decay: 0.0500 (0.0500)  time: 3.6044  data: 2.2684  max mem: 43713
Epoch: [71]  [ 200/1251]  eta: 0:09:28  lr: 0.003680  min_lr: 0.003680  loss: 3.4707 (3.3986)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0131 (0.9580)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [71]  [ 400/1251]  eta: 0:07:33  lr: 0.003678  min_lr: 0.003678  loss: 3.6292 (3.4074)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9567 (0.9201)  time: 0.5224  data: 0.0005  max mem: 43713
Epoch: [71]  [ 600/1251]  eta: 0:05:44  lr: 0.003676  min_lr: 0.003676  loss: 3.2221 (3.3800)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3152 (0.9452)  time: 0.5223  data: 0.0005  max mem: 43713
Epoch: [71]  [ 800/1251]  eta: 0:03:58  lr: 0.003674  min_lr: 0.003674  loss: 3.2775 (3.3786)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5690 (0.9110)  time: 0.5224  data: 0.0005  max mem: 43713
Epoch: [71]  [1000/1251]  eta: 0:02:12  lr: 0.003672  min_lr: 0.003672  loss: 3.5590 (3.3938)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7485 (0.9305)  time: 0.5224  data: 0.0005  max mem: 43713
Epoch: [71]  [1200/1251]  eta: 0:00:26  lr: 0.003670  min_lr: 0.003670  loss: 3.7007 (3.4023)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8142 (0.9093)  time: 0.5285  data: 0.0005  max mem: 43713
Epoch: [71]  [1250/1251]  eta: 0:00:00  lr: 0.003669  min_lr: 0.003669  loss: 3.4919 (3.4043)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0389 (0.9148)  time: 0.4438  data: 0.0006  max mem: 43713
Epoch: [71] Total time: 0:10:57 (0.5258 s / it)
Averaged stats: lr: 0.003669  min_lr: 0.003669  loss: 3.4919 (3.4071)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0389 (0.9148)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.7058 (0.7058)  acc1: 86.0000 (86.0000)  acc5: 98.0000 (98.0000)  time: 5.4733  data: 5.1750  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8819 (0.8824)  acc1: 81.2000 (81.4545)  acc5: 96.8000 (96.5455)  time: 0.7382  data: 0.4707  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.1474 (1.0491)  acc1: 75.6000 (77.7143)  acc5: 93.6000 (94.4952)  time: 0.2648  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.1532 (1.0592)  acc1: 75.6000 (77.2480)  acc5: 92.8000 (94.4480)  time: 0.2647  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4782 s / it)
* Acc@1 77.998 Acc@5 94.528 loss 1.038
Accuracy of the model on the 50000 test images: 78.0%
Max accuracy: 78.00%
Epoch: [72]  [   0/1251]  eta: 1:04:13  lr: 0.003669  min_lr: 0.003669  loss: 2.4096 (2.4096)  weight_decay: 0.0500 (0.0500)  time: 3.0800  data: 2.5438  max mem: 43713
Epoch: [72]  [ 200/1251]  eta: 0:09:23  lr: 0.003667  min_lr: 0.003667  loss: 3.5306 (3.3434)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8978 (1.0220)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [72]  [ 400/1251]  eta: 0:07:30  lr: 0.003665  min_lr: 0.003665  loss: 3.6415 (3.3496)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7960 (nan)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [72]  [ 600/1251]  eta: 0:05:44  lr: 0.003663  min_lr: 0.003663  loss: 3.4255 (3.3689)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7487 (nan)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [72]  [ 800/1251]  eta: 0:03:57  lr: 0.003661  min_lr: 0.003661  loss: 3.4924 (3.3707)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6748 (nan)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [72]  [1000/1251]  eta: 0:02:12  lr: 0.003659  min_lr: 0.003659  loss: 3.7084 (3.3782)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7204 (nan)  time: 0.5312  data: 0.0004  max mem: 43713
Epoch: [72]  [1200/1251]  eta: 0:00:26  lr: 0.003657  min_lr: 0.003657  loss: 3.6507 (3.3859)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9012 (nan)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [72]  [1250/1251]  eta: 0:00:00  lr: 0.003657  min_lr: 0.003657  loss: 3.5957 (3.3873)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6408 (nan)  time: 0.4439  data: 0.0005  max mem: 43713
Epoch: [72] Total time: 0:10:57 (0.5254 s / it)
Averaged stats: lr: 0.003657  min_lr: 0.003657  loss: 3.5957 (3.4027)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6408 (nan)
Test:  [ 0/25]  eta: 0:02:06  loss: 0.7131 (0.7131)  acc1: 88.0000 (88.0000)  acc5: 98.4000 (98.4000)  time: 5.0569  data: 4.7586  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.8953 (0.8788)  acc1: 80.0000 (81.2000)  acc5: 97.2000 (96.2909)  time: 0.7003  data: 0.4329  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0575 (1.0721)  acc1: 74.4000 (77.0286)  acc5: 93.2000 (94.2476)  time: 0.2646  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.2110 (1.0802)  acc1: 74.0000 (76.9600)  acc5: 93.2000 (94.2400)  time: 0.2645  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4612 s / it)
* Acc@1 77.808 Acc@5 94.342 loss 1.062
Accuracy of the model on the 50000 test images: 77.8%
Max accuracy: 78.00%
Epoch: [73]  [   0/1251]  eta: 1:15:27  lr: 0.003657  min_lr: 0.003657  loss: 3.9915 (3.9915)  weight_decay: 0.0500 (0.0500)  time: 3.6194  data: 2.9755  max mem: 43713
Epoch: [73]  [ 200/1251]  eta: 0:09:28  lr: 0.003655  min_lr: 0.003655  loss: 3.4822 (3.3995)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7752 (0.9307)  time: 0.5238  data: 0.0005  max mem: 43713
Epoch: [73]  [ 400/1251]  eta: 0:07:34  lr: 0.003653  min_lr: 0.003653  loss: 3.3805 (3.3664)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8392 (0.9558)  time: 0.5230  data: 0.0005  max mem: 43713
Epoch: [73]  [ 600/1251]  eta: 0:05:45  lr: 0.003651  min_lr: 0.003651  loss: 3.4932 (3.3821)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8835 (0.9525)  time: 0.5254  data: 0.0004  max mem: 43713
Epoch: [73]  [ 800/1251]  eta: 0:03:58  lr: 0.003649  min_lr: 0.003649  loss: 3.6320 (3.3635)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7214 (0.9237)  time: 0.5231  data: 0.0004  max mem: 43713
Epoch: [73]  [1000/1251]  eta: 0:02:12  lr: 0.003647  min_lr: 0.003647  loss: 3.4783 (3.3676)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5821 (0.8895)  time: 0.5246  data: 0.0004  max mem: 43713
Epoch: [73]  [1200/1251]  eta: 0:00:26  lr: 0.003645  min_lr: 0.003645  loss: 3.2130 (3.3695)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7900 (0.8850)  time: 0.5240  data: 0.0004  max mem: 43713
Epoch: [73]  [1250/1251]  eta: 0:00:00  lr: 0.003644  min_lr: 0.003644  loss: 3.5192 (3.3735)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1449 (0.9123)  time: 0.4441  data: 0.0005  max mem: 43713
Epoch: [73] Total time: 0:10:58 (0.5267 s / it)
Averaged stats: lr: 0.003644  min_lr: 0.003644  loss: 3.5192 (3.3964)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1449 (0.9123)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.8052 (0.8052)  acc1: 87.2000 (87.2000)  acc5: 98.0000 (98.0000)  time: 5.5899  data: 5.3047  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.9846 (0.9195)  acc1: 82.0000 (82.5818)  acc5: 96.8000 (96.6182)  time: 0.7487  data: 0.4825  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0510 (1.0740)  acc1: 76.0000 (78.2095)  acc5: 94.0000 (94.5143)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.1966 (1.0890)  acc1: 74.8000 (77.6480)  acc5: 93.2000 (94.2560)  time: 0.2644  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4819 s / it)
* Acc@1 78.002 Acc@5 94.524 loss 1.078
Accuracy of the model on the 50000 test images: 78.0%
Max accuracy: 78.00%
Epoch: [74]  [   0/1251]  eta: 0:55:26  lr: 0.003644  min_lr: 0.003644  loss: 2.5744 (2.5744)  weight_decay: 0.0500 (0.0500)  time: 2.6593  data: 2.1181  max mem: 43713
Epoch: [74]  [ 200/1251]  eta: 0:09:24  lr: 0.003642  min_lr: 0.003642  loss: 3.3527 (3.3253)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6703 (0.7894)  time: 0.5306  data: 0.0004  max mem: 43713
Epoch: [74]  [ 400/1251]  eta: 0:07:31  lr: 0.003640  min_lr: 0.003640  loss: 3.3610 (3.3432)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6968 (0.8656)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [74]  [ 600/1251]  eta: 0:05:43  lr: 0.003638  min_lr: 0.003638  loss: 3.7034 (3.3594)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8490 (0.8678)  time: 0.5286  data: 0.0005  max mem: 43713
Epoch: [74]  [ 800/1251]  eta: 0:03:57  lr: 0.003636  min_lr: 0.003636  loss: 3.6208 (3.3837)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9547 (0.8920)  time: 0.5241  data: 0.0004  max mem: 43713
Epoch: [74]  [1000/1251]  eta: 0:02:12  lr: 0.003634  min_lr: 0.003634  loss: 3.5370 (3.3740)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6268 (0.8784)  time: 0.5247  data: 0.0004  max mem: 43713
Epoch: [74]  [1200/1251]  eta: 0:00:26  lr: 0.003632  min_lr: 0.003632  loss: 3.3773 (3.3775)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8095 (0.8699)  time: 0.5234  data: 0.0004  max mem: 43713
Epoch: [74]  [1250/1251]  eta: 0:00:00  lr: 0.003631  min_lr: 0.003631  loss: 3.2495 (3.3746)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6925 (0.8656)  time: 0.4507  data: 0.0005  max mem: 43713
Epoch: [74] Total time: 0:10:57 (0.5255 s / it)
Averaged stats: lr: 0.003631  min_lr: 0.003631  loss: 3.2495 (3.3951)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6925 (0.8656)
Test:  [ 0/25]  eta: 0:01:38  loss: 0.6555 (0.6555)  acc1: 87.6000 (87.6000)  acc5: 99.2000 (99.2000)  time: 3.9502  data: 3.6371  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.8689 (0.8503)  acc1: 81.6000 (81.9273)  acc5: 96.0000 (96.3636)  time: 0.6860  data: 0.4173  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0341 (1.0208)  acc1: 75.6000 (78.1714)  acc5: 93.6000 (94.3810)  time: 0.3121  data: 0.0477  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0846 (1.0275)  acc1: 75.2000 (77.8880)  acc5: 93.2000 (94.3200)  time: 0.2646  data: 0.0002  max mem: 43713
Test: Total time: 0:00:11 (0.4534 s / it)
* Acc@1 77.950 Acc@5 94.526 loss 1.015
Accuracy of the model on the 50000 test images: 78.0%
Max accuracy: 78.00%
Epoch: [75]  [   0/1251]  eta: 1:15:45  lr: 0.003631  min_lr: 0.003631  loss: 2.9758 (2.9758)  weight_decay: 0.0500 (0.0500)  time: 3.6337  data: 1.7164  max mem: 43713
Epoch: [75]  [ 200/1251]  eta: 0:09:27  lr: 0.003629  min_lr: 0.003629  loss: 3.5356 (3.4028)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8646 (0.9975)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [75]  [ 400/1251]  eta: 0:07:32  lr: 0.003627  min_lr: 0.003627  loss: 3.6421 (3.4080)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8929 (0.9562)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [75]  [ 600/1251]  eta: 0:05:44  lr: 0.003625  min_lr: 0.003625  loss: 3.3500 (3.4087)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6567 (0.9153)  time: 0.5238  data: 0.0005  max mem: 43713
Epoch: [75]  [ 800/1251]  eta: 0:03:58  lr: 0.003623  min_lr: 0.003623  loss: 3.6351 (3.4094)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6378 (0.8908)  time: 0.5222  data: 0.0005  max mem: 43713
Epoch: [75]  [1000/1251]  eta: 0:02:12  lr: 0.003621  min_lr: 0.003621  loss: 3.5509 (3.4081)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9848 (0.9057)  time: 0.5222  data: 0.0004  max mem: 43713
Epoch: [75]  [1200/1251]  eta: 0:00:26  lr: 0.003619  min_lr: 0.003619  loss: 3.4604 (3.4123)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7306 (0.9071)  time: 0.5228  data: 0.0005  max mem: 43713
Epoch: [75]  [1250/1251]  eta: 0:00:00  lr: 0.003618  min_lr: 0.003618  loss: 3.6409 (3.4129)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5898 (0.8960)  time: 0.4434  data: 0.0005  max mem: 43713
Epoch: [75] Total time: 0:10:57 (0.5257 s / it)
Averaged stats: lr: 0.003618  min_lr: 0.003618  loss: 3.6409 (3.3900)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5898 (0.8960)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.7532 (0.7532)  acc1: 87.2000 (87.2000)  acc5: 98.4000 (98.4000)  time: 5.5885  data: 5.2786  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.9336 (0.9815)  acc1: 81.2000 (81.0909)  acc5: 96.8000 (96.6182)  time: 0.7487  data: 0.4801  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.1745 (1.1279)  acc1: 76.4000 (77.6762)  acc5: 93.6000 (94.2857)  time: 0.2647  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.1790 (1.1379)  acc1: 76.4000 (77.4400)  acc5: 92.8000 (94.2240)  time: 0.2648  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4829 s / it)
* Acc@1 77.860 Acc@5 94.462 loss 1.132
Accuracy of the model on the 50000 test images: 77.9%
Max accuracy: 78.00%
Epoch: [76]  [   0/1251]  eta: 1:08:56  lr: 0.003618  min_lr: 0.003618  loss: 3.7445 (3.7445)  weight_decay: 0.0500 (0.0500)  time: 3.3063  data: 1.9814  max mem: 43713
Epoch: [76]  [ 200/1251]  eta: 0:09:26  lr: 0.003616  min_lr: 0.003616  loss: 3.6426 (3.3602)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1575 (0.9709)  time: 0.5233  data: 0.0005  max mem: 43713
Epoch: [76]  [ 400/1251]  eta: 0:07:32  lr: 0.003614  min_lr: 0.003614  loss: 3.4130 (3.3874)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8583 (0.9496)  time: 0.5234  data: 0.0005  max mem: 43713
Epoch: [76]  [ 600/1251]  eta: 0:05:44  lr: 0.003612  min_lr: 0.003612  loss: 3.4924 (3.3740)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6402 (0.8732)  time: 0.5228  data: 0.0005  max mem: 43713
Epoch: [76]  [ 800/1251]  eta: 0:03:58  lr: 0.003610  min_lr: 0.003610  loss: 3.1702 (3.3732)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7757 (0.8734)  time: 0.5226  data: 0.0005  max mem: 43713
Epoch: [76]  [1000/1251]  eta: 0:02:12  lr: 0.003607  min_lr: 0.003607  loss: 3.4443 (3.3878)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9186 (0.8793)  time: 0.5234  data: 0.0004  max mem: 43713
Epoch: [76]  [1200/1251]  eta: 0:00:26  lr: 0.003605  min_lr: 0.003605  loss: 3.5160 (3.3802)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7645 (0.8725)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [76]  [1250/1251]  eta: 0:00:00  lr: 0.003605  min_lr: 0.003605  loss: 3.5282 (3.3792)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7460 (0.8667)  time: 0.4437  data: 0.0007  max mem: 43713
Epoch: [76] Total time: 0:10:57 (0.5258 s / it)
Averaged stats: lr: 0.003605  min_lr: 0.003605  loss: 3.5282 (3.3869)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7460 (0.8667)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.7525 (0.7525)  acc1: 87.2000 (87.2000)  acc5: 98.0000 (98.0000)  time: 5.6748  data: 5.3727  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8989 (0.8936)  acc1: 82.4000 (81.8545)  acc5: 96.8000 (96.5818)  time: 0.7565  data: 0.4887  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0922 (1.0681)  acc1: 76.4000 (78.0952)  acc5: 94.0000 (94.5714)  time: 0.2647  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.1675 (1.0748)  acc1: 76.0000 (77.8720)  acc5: 94.0000 (94.5120)  time: 0.2646  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4856 s / it)
* Acc@1 78.136 Acc@5 94.610 loss 1.066
Accuracy of the model on the 50000 test images: 78.1%
Max accuracy: 78.14%
Epoch: [77]  [   0/1251]  eta: 1:03:54  lr: 0.003605  min_lr: 0.003605  loss: 2.8234 (2.8234)  weight_decay: 0.0500 (0.0500)  time: 3.0648  data: 2.5180  max mem: 43713
Epoch: [77]  [ 200/1251]  eta: 0:09:26  lr: 0.003603  min_lr: 0.003603  loss: 3.3497 (3.3278)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8062 (0.8671)  time: 0.5350  data: 0.0004  max mem: 43713
Epoch: [77]  [ 400/1251]  eta: 0:07:32  lr: 0.003601  min_lr: 0.003601  loss: 3.4663 (3.3682)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8727 (0.8685)  time: 0.5239  data: 0.0005  max mem: 43713
Epoch: [77]  [ 600/1251]  eta: 0:05:44  lr: 0.003598  min_lr: 0.003598  loss: 3.4720 (3.3846)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6920 (0.8646)  time: 0.5241  data: 0.0004  max mem: 43713
Epoch: [77]  [ 800/1251]  eta: 0:03:58  lr: 0.003596  min_lr: 0.003596  loss: 3.5177 (3.3869)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7734 (0.8826)  time: 0.5287  data: 0.0004  max mem: 43713
Epoch: [77]  [1000/1251]  eta: 0:02:12  lr: 0.003594  min_lr: 0.003594  loss: 3.5505 (3.3882)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7788 (0.8877)  time: 0.5236  data: 0.0004  max mem: 43713
Epoch: [77]  [1200/1251]  eta: 0:00:26  lr: 0.003592  min_lr: 0.003592  loss: 3.5729 (3.3935)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7848 (0.8935)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [77]  [1250/1251]  eta: 0:00:00  lr: 0.003591  min_lr: 0.003591  loss: 3.6955 (3.3966)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6759 (0.8894)  time: 0.4438  data: 0.0006  max mem: 43713
Epoch: [77] Total time: 0:10:58 (0.5266 s / it)
Averaged stats: lr: 0.003591  min_lr: 0.003591  loss: 3.6955 (3.3837)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6759 (0.8894)
Test:  [ 0/25]  eta: 0:02:28  loss: 0.6963 (0.6963)  acc1: 88.4000 (88.4000)  acc5: 99.2000 (99.2000)  time: 5.9247  data: 5.6137  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8970 (0.9182)  acc1: 82.8000 (82.4000)  acc5: 97.2000 (96.7273)  time: 0.7792  data: 0.5106  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.1497 (1.0866)  acc1: 76.4000 (78.1905)  acc5: 94.0000 (94.7429)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.2140 (1.0965)  acc1: 76.4000 (77.8080)  acc5: 93.2000 (94.6080)  time: 0.2646  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4964 s / it)
* Acc@1 78.236 Acc@5 94.624 loss 1.092
Accuracy of the model on the 50000 test images: 78.2%
Max accuracy: 78.24%
Epoch: [78]  [   0/1251]  eta: 0:52:55  lr: 0.003591  min_lr: 0.003591  loss: 3.5722 (3.5722)  weight_decay: 0.0500 (0.0500)  time: 2.5385  data: 1.9972  max mem: 43713
Epoch: [78]  [ 200/1251]  eta: 0:09:23  lr: 0.003589  min_lr: 0.003589  loss: 3.5816 (3.3135)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7293 (0.8917)  time: 0.5232  data: 0.0004  max mem: 43713
Epoch: [78]  [ 400/1251]  eta: 0:07:30  lr: 0.003587  min_lr: 0.003587  loss: 3.5566 (3.3602)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9052 (0.8991)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [78]  [ 600/1251]  eta: 0:05:43  lr: 0.003585  min_lr: 0.003585  loss: 3.5615 (3.3464)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7831 (0.8827)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [78]  [ 800/1251]  eta: 0:03:57  lr: 0.003583  min_lr: 0.003583  loss: 3.2104 (3.3530)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7274 (0.8916)  time: 0.5226  data: 0.0005  max mem: 43713
Epoch: [78]  [1000/1251]  eta: 0:02:12  lr: 0.003580  min_lr: 0.003580  loss: 3.4258 (3.3573)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0874 (0.8969)  time: 0.5232  data: 0.0005  max mem: 43713
Epoch: [78]  [1200/1251]  eta: 0:00:26  lr: 0.003578  min_lr: 0.003578  loss: 3.6005 (3.3696)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9203 (0.9008)  time: 0.5231  data: 0.0005  max mem: 43713
Epoch: [78]  [1250/1251]  eta: 0:00:00  lr: 0.003578  min_lr: 0.003578  loss: 3.0978 (3.3713)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6050 (0.8927)  time: 0.4439  data: 0.0007  max mem: 43713
Epoch: [78] Total time: 0:10:57 (0.5255 s / it)
Averaged stats: lr: 0.003578  min_lr: 0.003578  loss: 3.0978 (3.3715)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6050 (0.8927)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6518 (0.6518)  acc1: 87.6000 (87.6000)  acc5: 98.8000 (98.8000)  time: 5.5362  data: 5.2419  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8844 (0.8484)  acc1: 82.8000 (82.1455)  acc5: 97.6000 (96.9818)  time: 0.7441  data: 0.4769  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0568 (1.0141)  acc1: 76.0000 (78.6476)  acc5: 94.4000 (94.8381)  time: 0.2648  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.1498 (1.0247)  acc1: 76.0000 (78.3840)  acc5: 94.0000 (94.6560)  time: 0.2649  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4808 s / it)
* Acc@1 78.388 Acc@5 94.722 loss 1.018
Accuracy of the model on the 50000 test images: 78.4%
Max accuracy: 78.39%
Epoch: [79]  [   0/1251]  eta: 0:56:48  lr: 0.003578  min_lr: 0.003578  loss: 3.1568 (3.1568)  weight_decay: 0.0500 (0.0500)  time: 2.7244  data: 2.1937  max mem: 43713
Epoch: [79]  [ 200/1251]  eta: 0:09:22  lr: 0.003575  min_lr: 0.003575  loss: 3.3709 (3.4053)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6547 (0.8844)  time: 0.5241  data: 0.0005  max mem: 43713
Epoch: [79]  [ 400/1251]  eta: 0:07:31  lr: 0.003573  min_lr: 0.003573  loss: 3.5535 (3.3790)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7396 (0.9376)  time: 0.5322  data: 0.0004  max mem: 43713
Epoch: [79]  [ 600/1251]  eta: 0:05:44  lr: 0.003571  min_lr: 0.003571  loss: 3.4775 (3.3737)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7765 (0.9114)  time: 0.5241  data: 0.0004  max mem: 43713
Epoch: [79]  [ 800/1251]  eta: 0:03:58  lr: 0.003569  min_lr: 0.003569  loss: 3.5762 (3.3793)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2316 (0.9251)  time: 0.5257  data: 0.0004  max mem: 43713
Epoch: [79]  [1000/1251]  eta: 0:02:12  lr: 0.003567  min_lr: 0.003567  loss: 3.6132 (3.3858)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7066 (0.9032)  time: 0.5284  data: 0.0004  max mem: 43713
Epoch: [79]  [1200/1251]  eta: 0:00:26  lr: 0.003564  min_lr: 0.003564  loss: 3.5926 (3.3771)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7282 (0.9280)  time: 0.5239  data: 0.0004  max mem: 43713
Epoch: [79]  [1250/1251]  eta: 0:00:00  lr: 0.003564  min_lr: 0.003564  loss: 3.5945 (3.3789)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6194 (0.9178)  time: 0.4441  data: 0.0007  max mem: 43713
Epoch: [79] Total time: 0:10:58 (0.5261 s / it)
Averaged stats: lr: 0.003564  min_lr: 0.003564  loss: 3.5945 (3.3725)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6194 (0.9178)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.7129 (0.7129)  acc1: 87.2000 (87.2000)  acc5: 98.8000 (98.8000)  time: 5.4137  data: 5.1221  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.8962 (0.9041)  acc1: 81.2000 (81.8545)  acc5: 96.8000 (96.7273)  time: 0.7328  data: 0.4659  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0655 (1.0702)  acc1: 76.0000 (78.5524)  acc5: 94.4000 (94.7619)  time: 0.2647  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.2236 (1.0839)  acc1: 76.0000 (78.1120)  acc5: 93.2000 (94.6080)  time: 0.2648  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4760 s / it)
* Acc@1 78.240 Acc@5 94.606 loss 1.081
Accuracy of the model on the 50000 test images: 78.2%
Max accuracy: 78.39%
Epoch: [80]  [   0/1251]  eta: 1:14:07  lr: 0.003564  min_lr: 0.003564  loss: 3.5392 (3.5392)  weight_decay: 0.0500 (0.0500)  time: 3.5551  data: 1.6067  max mem: 43713
Epoch: [80]  [ 200/1251]  eta: 0:09:27  lr: 0.003562  min_lr: 0.003562  loss: 3.3781 (3.3613)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9709 (0.9248)  time: 0.5245  data: 0.0004  max mem: 43713
Epoch: [80]  [ 400/1251]  eta: 0:07:33  lr: 0.003559  min_lr: 0.003559  loss: 3.2840 (3.3491)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7876 (0.8572)  time: 0.5238  data: 0.0004  max mem: 43713
Epoch: [80]  [ 600/1251]  eta: 0:05:44  lr: 0.003557  min_lr: 0.003557  loss: 3.5060 (3.3567)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6020 (0.8497)  time: 0.5235  data: 0.0004  max mem: 43713
Epoch: [80]  [ 800/1251]  eta: 0:03:58  lr: 0.003555  min_lr: 0.003555  loss: 3.3800 (3.3491)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7268 (0.8285)  time: 0.5312  data: 0.0004  max mem: 43713
Epoch: [80]  [1000/1251]  eta: 0:02:12  lr: 0.003553  min_lr: 0.003553  loss: 3.4835 (3.3538)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7614 (0.8594)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [80]  [1200/1251]  eta: 0:00:26  lr: 0.003550  min_lr: 0.003550  loss: 3.3817 (3.3480)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.5280  data: 0.0004  max mem: 43713
Epoch: [80]  [1250/1251]  eta: 0:00:00  lr: 0.003550  min_lr: 0.003550  loss: 3.2963 (3.3451)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.4436  data: 0.0006  max mem: 43713
Epoch: [80] Total time: 0:10:57 (0.5259 s / it)
Averaged stats: lr: 0.003550  min_lr: 0.003550  loss: 3.2963 (3.3564)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.6182 (0.6182)  acc1: 88.0000 (88.0000)  acc5: 98.8000 (98.8000)  time: 5.2999  data: 5.0061  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7925 (0.7978)  acc1: 84.0000 (82.7636)  acc5: 97.6000 (96.8727)  time: 0.7231  data: 0.4554  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0105 (0.9689)  acc1: 76.8000 (78.3048)  acc5: 94.0000 (94.9333)  time: 0.2654  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.1133 (0.9851)  acc1: 74.4000 (78.0160)  acc5: 93.2000 (94.7360)  time: 0.2654  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4717 s / it)
* Acc@1 78.528 Acc@5 94.790 loss 0.976
Accuracy of the model on the 50000 test images: 78.5%
Max accuracy: 78.53%
Epoch: [81]  [   0/1251]  eta: 0:53:08  lr: 0.003550  min_lr: 0.003550  loss: 3.8606 (3.8606)  weight_decay: 0.0500 (0.0500)  time: 2.5486  data: 2.0007  max mem: 43713
Epoch: [81]  [ 200/1251]  eta: 0:09:23  lr: 0.003547  min_lr: 0.003547  loss: 3.5971 (3.3490)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6327 (nan)  time: 0.5236  data: 0.0005  max mem: 43713
Epoch: [81]  [ 400/1251]  eta: 0:07:31  lr: 0.003545  min_lr: 0.003545  loss: 3.4806 (3.3639)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9582 (nan)  time: 0.5225  data: 0.0005  max mem: 43713
Epoch: [81]  [ 600/1251]  eta: 0:05:43  lr: 0.003543  min_lr: 0.003543  loss: 3.5371 (3.3617)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7983 (nan)  time: 0.5286  data: 0.0005  max mem: 43713
Epoch: [81]  [ 800/1251]  eta: 0:03:57  lr: 0.003541  min_lr: 0.003541  loss: 3.5278 (3.3669)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6458 (nan)  time: 0.5230  data: 0.0005  max mem: 43713
Epoch: [81]  [1000/1251]  eta: 0:02:12  lr: 0.003538  min_lr: 0.003538  loss: 3.3317 (3.3731)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6391 (nan)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [81]  [1200/1251]  eta: 0:00:26  lr: 0.003536  min_lr: 0.003536  loss: 3.5061 (3.3709)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8368 (nan)  time: 0.5240  data: 0.0005  max mem: 43713
Epoch: [81]  [1250/1251]  eta: 0:00:00  lr: 0.003535  min_lr: 0.003535  loss: 3.4219 (3.3723)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9264 (nan)  time: 0.4438  data: 0.0005  max mem: 43713
Epoch: [81] Total time: 0:10:57 (0.5253 s / it)
Averaged stats: lr: 0.003535  min_lr: 0.003535  loss: 3.4219 (3.3582)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9264 (nan)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.7350 (0.7350)  acc1: 88.4000 (88.4000)  acc5: 98.4000 (98.4000)  time: 5.3207  data: 5.0276  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.9064 (0.8680)  acc1: 82.4000 (82.5455)  acc5: 96.8000 (96.9091)  time: 0.7243  data: 0.4574  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.1021 (1.0381)  acc1: 77.2000 (78.5905)  acc5: 93.6000 (94.7429)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.1519 (1.0513)  acc1: 77.2000 (78.3040)  acc5: 93.6000 (94.5600)  time: 0.2644  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4715 s / it)
* Acc@1 78.430 Acc@5 94.712 loss 1.045
Accuracy of the model on the 50000 test images: 78.4%
Max accuracy: 78.53%
Epoch: [82]  [   0/1251]  eta: 1:16:16  lr: 0.003535  min_lr: 0.003535  loss: 3.0813 (3.0813)  weight_decay: 0.0500 (0.0500)  time: 3.6584  data: 1.5996  max mem: 43713
Epoch: [82]  [ 200/1251]  eta: 0:09:27  lr: 0.003533  min_lr: 0.003533  loss: 3.4422 (3.2872)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8105 (0.7585)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [82]  [ 400/1251]  eta: 0:07:32  lr: 0.003531  min_lr: 0.003531  loss: 3.4674 (3.3315)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6209 (0.7722)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [82]  [ 600/1251]  eta: 0:05:44  lr: 0.003528  min_lr: 0.003528  loss: 3.5609 (3.3469)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8275 (0.8267)  time: 0.5250  data: 0.0005  max mem: 43713
Epoch: [82]  [ 800/1251]  eta: 0:03:58  lr: 0.003526  min_lr: 0.003526  loss: 3.5075 (3.3505)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8920 (0.8306)  time: 0.5247  data: 0.0005  max mem: 43713
Epoch: [82]  [1000/1251]  eta: 0:02:12  lr: 0.003524  min_lr: 0.003524  loss: 3.5192 (3.3555)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7114 (0.8264)  time: 0.5312  data: 0.0005  max mem: 43713
Epoch: [82]  [1200/1251]  eta: 0:00:26  lr: 0.003521  min_lr: 0.003521  loss: 3.5253 (3.3670)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9165 (0.8482)  time: 0.5295  data: 0.0005  max mem: 43713
Epoch: [82]  [1250/1251]  eta: 0:00:00  lr: 0.003521  min_lr: 0.003521  loss: 3.5448 (3.3650)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6512 (0.8470)  time: 0.4434  data: 0.0005  max mem: 43713
Epoch: [82] Total time: 0:10:58 (0.5265 s / it)
Averaged stats: lr: 0.003521  min_lr: 0.003521  loss: 3.5448 (3.3579)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6512 (0.8470)
Test:  [ 0/25]  eta: 0:01:46  loss: 0.6919 (0.6919)  acc1: 86.4000 (86.4000)  acc5: 98.4000 (98.4000)  time: 4.2726  data: 3.9671  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.8175 (0.8351)  acc1: 84.0000 (83.1273)  acc5: 96.8000 (97.0545)  time: 0.6847  data: 0.4167  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0009 (1.0173)  acc1: 78.0000 (78.6095)  acc5: 95.2000 (94.9905)  time: 0.2954  data: 0.0309  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.1691 (1.0274)  acc1: 76.0000 (78.4160)  acc5: 93.2000 (94.7360)  time: 0.2647  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4532 s / it)
* Acc@1 78.728 Acc@5 94.698 loss 1.019
Accuracy of the model on the 50000 test images: 78.7%
Max accuracy: 78.73%
Epoch: [83]  [   0/1251]  eta: 0:57:11  lr: 0.003521  min_lr: 0.003521  loss: 2.9167 (2.9167)  weight_decay: 0.0500 (0.0500)  time: 2.7431  data: 2.2035  max mem: 43713
Epoch: [83]  [ 200/1251]  eta: 0:09:21  lr: 0.003519  min_lr: 0.003519  loss: 3.5435 (3.2878)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8465 (0.8646)  time: 0.5245  data: 0.0004  max mem: 43713
Epoch: [83]  [ 400/1251]  eta: 0:07:31  lr: 0.003516  min_lr: 0.003516  loss: 3.5599 (3.3390)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7963 (0.9381)  time: 0.5222  data: 0.0004  max mem: 43713
Epoch: [83]  [ 600/1251]  eta: 0:05:43  lr: 0.003514  min_lr: 0.003514  loss: 3.4859 (3.3365)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8077 (0.9445)  time: 0.5244  data: 0.0004  max mem: 43713
Epoch: [83]  [ 800/1251]  eta: 0:03:57  lr: 0.003512  min_lr: 0.003512  loss: 3.5224 (3.3293)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6586 (0.9179)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [83]  [1000/1251]  eta: 0:02:12  lr: 0.003509  min_lr: 0.003509  loss: 3.2715 (3.3323)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9242 (0.9274)  time: 0.5235  data: 0.0004  max mem: 43713
Epoch: [83]  [1200/1251]  eta: 0:00:26  lr: 0.003507  min_lr: 0.003507  loss: 3.5768 (3.3344)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8138 (0.9110)  time: 0.5232  data: 0.0004  max mem: 43713
Epoch: [83]  [1250/1251]  eta: 0:00:00  lr: 0.003506  min_lr: 0.003506  loss: 3.3722 (3.3350)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7990 (0.9119)  time: 0.4439  data: 0.0006  max mem: 43713
Epoch: [83] Total time: 0:10:57 (0.5255 s / it)
Averaged stats: lr: 0.003506  min_lr: 0.003506  loss: 3.3722 (3.3478)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7990 (0.9119)
Test:  [ 0/25]  eta: 0:02:11  loss: 0.6218 (0.6218)  acc1: 86.8000 (86.8000)  acc5: 98.4000 (98.4000)  time: 5.2578  data: 4.9475  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.8219 (0.8055)  acc1: 84.0000 (82.1455)  acc5: 97.6000 (96.8364)  time: 0.7180  data: 0.4501  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0013 (0.9890)  acc1: 75.6000 (78.0381)  acc5: 94.4000 (94.7238)  time: 0.2639  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.1260 (0.9970)  acc1: 76.0000 (77.9040)  acc5: 93.2000 (94.5600)  time: 0.2638  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4680 s / it)
* Acc@1 78.722 Acc@5 94.828 loss 0.985
Accuracy of the model on the 50000 test images: 78.7%
Max accuracy: 78.73%
Epoch: [84]  [   0/1251]  eta: 1:11:05  lr: 0.003506  min_lr: 0.003506  loss: 3.7526 (3.7526)  weight_decay: 0.0500 (0.0500)  time: 3.4099  data: 2.5905  max mem: 43713
Epoch: [84]  [ 200/1251]  eta: 0:09:29  lr: 0.003504  min_lr: 0.003504  loss: 3.4985 (3.3055)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7568 (0.9183)  time: 0.5236  data: 0.0005  max mem: 43713
Epoch: [84]  [ 400/1251]  eta: 0:07:33  lr: 0.003502  min_lr: 0.003502  loss: 3.5370 (3.3248)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8807 (0.8952)  time: 0.5236  data: 0.0004  max mem: 43713
Epoch: [84]  [ 600/1251]  eta: 0:05:44  lr: 0.003499  min_lr: 0.003499  loss: 3.3667 (3.3255)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7624 (0.8732)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [84]  [ 800/1251]  eta: 0:03:58  lr: 0.003497  min_lr: 0.003497  loss: 3.3749 (3.3367)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7167 (0.9084)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [84]  [1000/1251]  eta: 0:02:12  lr: 0.003494  min_lr: 0.003494  loss: 3.6556 (3.3445)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7314 (0.8889)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [84]  [1200/1251]  eta: 0:00:26  lr: 0.003492  min_lr: 0.003492  loss: 3.3628 (3.3507)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8625 (0.8883)  time: 0.5327  data: 0.0004  max mem: 43713
Epoch: [84]  [1250/1251]  eta: 0:00:00  lr: 0.003491  min_lr: 0.003491  loss: 3.4810 (3.3505)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6692 (0.8767)  time: 0.4438  data: 0.0005  max mem: 43713
Epoch: [84] Total time: 0:10:58 (0.5261 s / it)
Averaged stats: lr: 0.003491  min_lr: 0.003491  loss: 3.4810 (3.3557)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6692 (0.8767)
Test:  [ 0/25]  eta: 0:02:11  loss: 0.7186 (0.7186)  acc1: 87.2000 (87.2000)  acc5: 98.4000 (98.4000)  time: 5.2423  data: 4.9358  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.8612 (0.8587)  acc1: 82.0000 (82.2545)  acc5: 97.6000 (96.9818)  time: 0.7175  data: 0.4491  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0737 (1.0377)  acc1: 76.4000 (78.3238)  acc5: 94.0000 (94.9905)  time: 0.2650  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.1877 (1.0501)  acc1: 75.6000 (77.9040)  acc5: 93.6000 (94.9280)  time: 0.2649  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4681 s / it)
* Acc@1 78.630 Acc@5 94.716 loss 1.037
Accuracy of the model on the 50000 test images: 78.6%
Max accuracy: 78.73%
Epoch: [85]  [   0/1251]  eta: 1:10:45  lr: 0.003491  min_lr: 0.003491  loss: 2.8649 (2.8649)  weight_decay: 0.0500 (0.0500)  time: 3.3936  data: 2.4425  max mem: 43713
Epoch: [85]  [ 200/1251]  eta: 0:09:25  lr: 0.003489  min_lr: 0.003489  loss: 3.0869 (3.3306)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9076 (0.9536)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [85]  [ 400/1251]  eta: 0:07:31  lr: 0.003487  min_lr: 0.003487  loss: 3.0320 (3.3259)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8937 (0.9753)  time: 0.5234  data: 0.0004  max mem: 43713
Epoch: [85]  [ 600/1251]  eta: 0:05:44  lr: 0.003484  min_lr: 0.003484  loss: 3.6261 (3.3407)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6918 (0.9008)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [85]  [ 800/1251]  eta: 0:03:58  lr: 0.003482  min_lr: 0.003482  loss: 3.1817 (3.3307)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9490 (0.8936)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [85]  [1000/1251]  eta: 0:02:12  lr: 0.003479  min_lr: 0.003479  loss: 3.5143 (3.3396)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7224 (0.8974)  time: 0.5342  data: 0.0006  max mem: 43713
Epoch: [85]  [1200/1251]  eta: 0:00:26  lr: 0.003477  min_lr: 0.003477  loss: 3.4501 (3.3374)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0631 (0.9045)  time: 0.5222  data: 0.0004  max mem: 43713
Epoch: [85]  [1250/1251]  eta: 0:00:00  lr: 0.003476  min_lr: 0.003476  loss: 3.4197 (3.3398)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7362 (0.9022)  time: 0.4435  data: 0.0006  max mem: 43713
Epoch: [85] Total time: 0:10:57 (0.5259 s / it)
Averaged stats: lr: 0.003476  min_lr: 0.003476  loss: 3.4197 (3.3388)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7362 (0.9022)
Test:  [ 0/25]  eta: 0:02:11  loss: 0.6644 (0.6644)  acc1: 88.4000 (88.4000)  acc5: 98.0000 (98.0000)  time: 5.2483  data: 4.9560  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.8667 (0.8596)  acc1: 82.8000 (81.8545)  acc5: 97.6000 (96.8000)  time: 0.7181  data: 0.4508  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0840 (1.0270)  acc1: 74.8000 (78.4191)  acc5: 94.4000 (94.5524)  time: 0.2650  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.1725 (1.0416)  acc1: 74.8000 (78.2720)  acc5: 93.2000 (94.4480)  time: 0.2650  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4708 s / it)
* Acc@1 78.426 Acc@5 94.736 loss 1.035
Accuracy of the model on the 50000 test images: 78.4%
Max accuracy: 78.73%
Epoch: [86]  [   0/1251]  eta: 1:13:17  lr: 0.003476  min_lr: 0.003476  loss: 3.5160 (3.5160)  weight_decay: 0.0500 (0.0500)  time: 3.5153  data: 2.6939  max mem: 43713
Epoch: [86]  [ 200/1251]  eta: 0:09:25  lr: 0.003474  min_lr: 0.003474  loss: 3.3686 (3.3351)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6299 (0.9004)  time: 0.5226  data: 0.0005  max mem: 43713
Epoch: [86]  [ 400/1251]  eta: 0:07:33  lr: 0.003472  min_lr: 0.003472  loss: 3.4984 (3.3683)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8431 (0.8926)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [86]  [ 600/1251]  eta: 0:05:44  lr: 0.003469  min_lr: 0.003469  loss: 3.5026 (3.3904)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7156 (0.9042)  time: 0.5223  data: 0.0004  max mem: 43713
Epoch: [86]  [ 800/1251]  eta: 0:03:58  lr: 0.003467  min_lr: 0.003467  loss: 3.3754 (3.3763)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7847 (0.8844)  time: 0.5232  data: 0.0004  max mem: 43713
Epoch: [86]  [1000/1251]  eta: 0:02:12  lr: 0.003464  min_lr: 0.003464  loss: 3.4506 (3.3746)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7058 (0.8774)  time: 0.5240  data: 0.0005  max mem: 43713
Epoch: [86]  [1200/1251]  eta: 0:00:26  lr: 0.003462  min_lr: 0.003462  loss: 3.4973 (3.3739)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8743 (0.8849)  time: 0.5230  data: 0.0005  max mem: 43713
Epoch: [86]  [1250/1251]  eta: 0:00:00  lr: 0.003461  min_lr: 0.003461  loss: 2.8640 (3.3687)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7416 (0.8793)  time: 0.4436  data: 0.0007  max mem: 43713
Epoch: [86] Total time: 0:10:57 (0.5259 s / it)
Averaged stats: lr: 0.003461  min_lr: 0.003461  loss: 2.8640 (3.3404)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7416 (0.8793)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.6196 (0.6196)  acc1: 85.6000 (85.6000)  acc5: 98.4000 (98.4000)  time: 5.7287  data: 5.4401  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7818 (0.7943)  acc1: 83.6000 (82.4364)  acc5: 97.2000 (96.7273)  time: 0.7615  data: 0.4948  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0143 (0.9640)  acc1: 76.8000 (79.1810)  acc5: 93.6000 (94.7810)  time: 0.2649  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0452 (0.9776)  acc1: 76.8000 (78.7520)  acc5: 93.6000 (94.7040)  time: 0.2649  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4883 s / it)
* Acc@1 78.726 Acc@5 94.846 loss 0.968
Accuracy of the model on the 50000 test images: 78.7%
Max accuracy: 78.73%
Epoch: [87]  [   0/1251]  eta: 1:11:34  lr: 0.003461  min_lr: 0.003461  loss: 2.7017 (2.7017)  weight_decay: 0.0500 (0.0500)  time: 3.4326  data: 2.8022  max mem: 43713
Epoch: [87]  [ 200/1251]  eta: 0:09:28  lr: 0.003459  min_lr: 0.003459  loss: 3.1926 (3.2736)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8500 (0.9434)  time: 0.5307  data: 0.0004  max mem: 43713
Epoch: [87]  [ 400/1251]  eta: 0:07:32  lr: 0.003456  min_lr: 0.003456  loss: 3.1870 (3.2999)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8223 (0.9552)  time: 0.5222  data: 0.0004  max mem: 43713
Epoch: [87]  [ 600/1251]  eta: 0:05:44  lr: 0.003454  min_lr: 0.003454  loss: 3.3243 (3.2872)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9552 (0.9421)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [87]  [ 800/1251]  eta: 0:03:58  lr: 0.003451  min_lr: 0.003451  loss: 3.5219 (3.3012)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8727 (0.9315)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [87]  [1000/1251]  eta: 0:02:12  lr: 0.003449  min_lr: 0.003449  loss: 3.3727 (3.3115)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8994 (0.9198)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [87]  [1200/1251]  eta: 0:00:26  lr: 0.003446  min_lr: 0.003446  loss: 3.3667 (3.3063)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7889 (0.9137)  time: 0.5222  data: 0.0004  max mem: 43713
Epoch: [87]  [1250/1251]  eta: 0:00:00  lr: 0.003446  min_lr: 0.003446  loss: 3.4777 (3.3083)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7391 (0.9142)  time: 0.4433  data: 0.0005  max mem: 43713
Epoch: [87] Total time: 0:10:57 (0.5256 s / it)
Averaged stats: lr: 0.003446  min_lr: 0.003446  loss: 3.4777 (3.3293)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7391 (0.9142)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6532 (0.6532)  acc1: 87.2000 (87.2000)  acc5: 98.4000 (98.4000)  time: 5.6879  data: 5.3835  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8328 (0.8570)  acc1: 84.8000 (82.7273)  acc5: 96.8000 (96.6546)  time: 0.7578  data: 0.4897  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0436 (1.0262)  acc1: 76.4000 (78.9714)  acc5: 94.4000 (94.4381)  time: 0.2646  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.1487 (1.0360)  acc1: 76.4000 (78.5760)  acc5: 92.8000 (94.4160)  time: 0.2644  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4863 s / it)
* Acc@1 78.878 Acc@5 94.874 loss 1.021
Accuracy of the model on the 50000 test images: 78.9%
Max accuracy: 78.88%
Epoch: [88]  [   0/1251]  eta: 0:59:52  lr: 0.003446  min_lr: 0.003446  loss: 3.8853 (3.8853)  weight_decay: 0.0500 (0.0500)  time: 2.8720  data: 2.3387  max mem: 43713
Epoch: [88]  [ 200/1251]  eta: 0:09:25  lr: 0.003443  min_lr: 0.003443  loss: 3.2447 (3.3509)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7921 (0.9025)  time: 0.5224  data: 0.0005  max mem: 43713
Epoch: [88]  [ 400/1251]  eta: 0:07:32  lr: 0.003441  min_lr: 0.003441  loss: 3.3672 (3.3394)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0617 (0.9755)  time: 0.5233  data: 0.0005  max mem: 43713
Epoch: [88]  [ 600/1251]  eta: 0:05:44  lr: 0.003438  min_lr: 0.003438  loss: 3.3847 (3.3284)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8567 (0.9842)  time: 0.5296  data: 0.0005  max mem: 43713
Epoch: [88]  [ 800/1251]  eta: 0:03:58  lr: 0.003436  min_lr: 0.003436  loss: 3.3678 (3.3298)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6398 (0.9077)  time: 0.5225  data: 0.0005  max mem: 43713
Epoch: [88]  [1000/1251]  eta: 0:02:12  lr: 0.003433  min_lr: 0.003433  loss: 3.3072 (3.3228)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8403 (0.8898)  time: 0.5224  data: 0.0005  max mem: 43713
Epoch: [88]  [1200/1251]  eta: 0:00:26  lr: 0.003431  min_lr: 0.003431  loss: 3.3186 (3.3355)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6849 (0.9022)  time: 0.5233  data: 0.0006  max mem: 43713
Epoch: [88]  [1250/1251]  eta: 0:00:00  lr: 0.003430  min_lr: 0.003430  loss: 3.6190 (3.3399)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6849 (0.8999)  time: 0.4437  data: 0.0006  max mem: 43713
Epoch: [88] Total time: 0:10:58 (0.5262 s / it)
Averaged stats: lr: 0.003430  min_lr: 0.003430  loss: 3.6190 (3.3371)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6849 (0.8999)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.7953 (0.7953)  acc1: 87.2000 (87.2000)  acc5: 98.0000 (98.0000)  time: 5.4663  data: 5.1759  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.9124 (0.9386)  acc1: 81.6000 (82.8364)  acc5: 97.6000 (96.8000)  time: 0.7376  data: 0.4708  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0963 (1.0984)  acc1: 76.4000 (78.8191)  acc5: 94.4000 (94.9905)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.2348 (1.1129)  acc1: 75.6000 (78.4960)  acc5: 94.0000 (94.8960)  time: 0.2644  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4781 s / it)
* Acc@1 78.558 Acc@5 94.968 loss 1.112
Accuracy of the model on the 50000 test images: 78.6%
Max accuracy: 78.88%
Epoch: [89]  [   0/1251]  eta: 1:11:15  lr: 0.003430  min_lr: 0.003430  loss: 3.5805 (3.5805)  weight_decay: 0.0500 (0.0500)  time: 3.4174  data: 2.2555  max mem: 43713
Epoch: [89]  [ 200/1251]  eta: 0:09:27  lr: 0.003428  min_lr: 0.003428  loss: 3.3778 (3.3661)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9588 (0.9125)  time: 0.5233  data: 0.0004  max mem: 43713
Epoch: [89]  [ 400/1251]  eta: 0:07:33  lr: 0.003425  min_lr: 0.003425  loss: 3.3989 (3.3178)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7858 (0.9083)  time: 0.5410  data: 0.0005  max mem: 43713
Epoch: [89]  [ 600/1251]  eta: 0:05:45  lr: 0.003423  min_lr: 0.003423  loss: 3.0288 (3.3289)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8372 (0.8941)  time: 0.5281  data: 0.0005  max mem: 43713
Epoch: [89]  [ 800/1251]  eta: 0:03:58  lr: 0.003420  min_lr: 0.003420  loss: 3.4786 (3.3164)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7066 (0.8616)  time: 0.5232  data: 0.0005  max mem: 43713
Epoch: [89]  [1000/1251]  eta: 0:02:12  lr: 0.003418  min_lr: 0.003418  loss: 3.5154 (3.3328)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0388 (0.8756)  time: 0.5231  data: 0.0005  max mem: 43713
Epoch: [89]  [1200/1251]  eta: 0:00:26  lr: 0.003415  min_lr: 0.003415  loss: 3.6538 (3.3372)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9640 (0.8757)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [89]  [1250/1251]  eta: 0:00:00  lr: 0.003414  min_lr: 0.003414  loss: 3.5671 (3.3376)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8831 (0.8754)  time: 0.4433  data: 0.0007  max mem: 43713
Epoch: [89] Total time: 0:10:59 (0.5268 s / it)
Averaged stats: lr: 0.003414  min_lr: 0.003414  loss: 3.5671 (3.3286)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8831 (0.8754)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6772 (0.6772)  acc1: 88.4000 (88.4000)  acc5: 98.8000 (98.8000)  time: 5.7005  data: 5.4043  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8935 (0.8491)  acc1: 82.4000 (83.5273)  acc5: 96.8000 (96.7636)  time: 0.7589  data: 0.4916  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0844 (1.0364)  acc1: 77.2000 (79.0476)  acc5: 94.4000 (95.0095)  time: 0.2648  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.1634 (1.0463)  acc1: 76.4000 (78.7520)  acc5: 93.6000 (94.9120)  time: 0.2648  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4867 s / it)
* Acc@1 78.722 Acc@5 94.772 loss 1.043
Accuracy of the model on the 50000 test images: 78.7%
Max accuracy: 78.88%
Epoch: [90]  [   0/1251]  eta: 1:14:09  lr: 0.003414  min_lr: 0.003414  loss: 3.9587 (3.9587)  weight_decay: 0.0500 (0.0500)  time: 3.5568  data: 2.4984  max mem: 43713
Epoch: [90]  [ 200/1251]  eta: 0:09:26  lr: 0.003412  min_lr: 0.003412  loss: 3.4097 (3.2837)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8773 (0.9359)  time: 0.5309  data: 0.0004  max mem: 43713
Epoch: [90]  [ 400/1251]  eta: 0:07:33  lr: 0.003409  min_lr: 0.003409  loss: 3.3932 (3.2926)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6568 (0.8130)  time: 0.5242  data: 0.0004  max mem: 43713
Epoch: [90]  [ 600/1251]  eta: 0:05:44  lr: 0.003407  min_lr: 0.003407  loss: 3.4116 (3.3023)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8786 (0.8315)  time: 0.5237  data: 0.0004  max mem: 43713
Epoch: [90]  [ 800/1251]  eta: 0:03:58  lr: 0.003404  min_lr: 0.003404  loss: 3.4958 (3.3085)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9069 (0.8842)  time: 0.5245  data: 0.0004  max mem: 43713
Epoch: [90]  [1000/1251]  eta: 0:02:12  lr: 0.003402  min_lr: 0.003402  loss: 3.1347 (3.3089)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8887 (0.8906)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [90]  [1200/1251]  eta: 0:00:26  lr: 0.003399  min_lr: 0.003399  loss: 3.3857 (3.3107)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8676 (0.8943)  time: 0.5231  data: 0.0005  max mem: 43713
Epoch: [90]  [1250/1251]  eta: 0:00:00  lr: 0.003398  min_lr: 0.003398  loss: 3.4458 (3.3118)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6643 (0.8843)  time: 0.4436  data: 0.0007  max mem: 43713
Epoch: [90] Total time: 0:10:58 (0.5264 s / it)
Averaged stats: lr: 0.003398  min_lr: 0.003398  loss: 3.4458 (3.3295)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6643 (0.8843)
Test:  [ 0/25]  eta: 0:02:03  loss: 0.6760 (0.6760)  acc1: 85.2000 (85.2000)  acc5: 97.6000 (97.6000)  time: 4.9473  data: 4.6370  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.8156 (0.8450)  acc1: 81.6000 (82.4364)  acc5: 97.2000 (96.8727)  time: 0.6905  data: 0.4219  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0341 (1.0123)  acc1: 77.2000 (79.5619)  acc5: 94.8000 (94.7238)  time: 0.2650  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.1685 (1.0217)  acc1: 77.2000 (79.1200)  acc5: 93.6000 (94.5920)  time: 0.2651  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4576 s / it)
* Acc@1 79.144 Acc@5 94.928 loss 1.014
Accuracy of the model on the 50000 test images: 79.1%
Max accuracy: 79.14%
Epoch: [91]  [   0/1251]  eta: 0:53:31  lr: 0.003398  min_lr: 0.003398  loss: 3.0451 (3.0451)  weight_decay: 0.0500 (0.0500)  time: 2.5669  data: 2.0296  max mem: 43713
Epoch: [91]  [ 200/1251]  eta: 0:09:25  lr: 0.003396  min_lr: 0.003396  loss: 3.3169 (3.2675)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7157 (0.8988)  time: 0.5239  data: 0.0004  max mem: 43713
Epoch: [91]  [ 400/1251]  eta: 0:07:31  lr: 0.003393  min_lr: 0.003393  loss: 3.3467 (3.3015)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8912 (0.8714)  time: 0.5226  data: 0.0005  max mem: 43713
Epoch: [91]  [ 600/1251]  eta: 0:05:44  lr: 0.003391  min_lr: 0.003391  loss: 3.4455 (3.3144)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6532 (0.8828)  time: 0.5309  data: 0.0005  max mem: 43713
Epoch: [91]  [ 800/1251]  eta: 0:03:58  lr: 0.003388  min_lr: 0.003388  loss: 3.3693 (3.3253)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8984 (0.8855)  time: 0.5238  data: 0.0005  max mem: 43713
Epoch: [91]  [1000/1251]  eta: 0:02:12  lr: 0.003385  min_lr: 0.003385  loss: 3.5657 (3.3311)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6991 (0.8849)  time: 0.5237  data: 0.0004  max mem: 43713
Epoch: [91]  [1200/1251]  eta: 0:00:26  lr: 0.003383  min_lr: 0.003383  loss: 3.4318 (3.3384)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8597 (0.9144)  time: 0.5322  data: 0.0004  max mem: 43713
Epoch: [91]  [1250/1251]  eta: 0:00:00  lr: 0.003382  min_lr: 0.003382  loss: 3.3790 (3.3422)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7578 (0.9056)  time: 0.4435  data: 0.0005  max mem: 43713
Epoch: [91] Total time: 0:10:58 (0.5261 s / it)
Averaged stats: lr: 0.003382  min_lr: 0.003382  loss: 3.3790 (3.3293)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7578 (0.9056)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.6583 (0.6583)  acc1: 86.8000 (86.8000)  acc5: 98.0000 (98.0000)  time: 5.7325  data: 5.4273  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8462 (0.8589)  acc1: 82.8000 (82.7636)  acc5: 97.2000 (96.9091)  time: 0.7619  data: 0.4937  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0878 (1.0165)  acc1: 76.4000 (79.2000)  acc5: 94.8000 (95.1429)  time: 0.2650  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.1370 (1.0356)  acc1: 76.0000 (78.4800)  acc5: 94.4000 (94.9280)  time: 0.2651  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4889 s / it)
* Acc@1 78.904 Acc@5 94.908 loss 1.035
Accuracy of the model on the 50000 test images: 78.9%
Max accuracy: 79.14%
Epoch: [92]  [   0/1251]  eta: 1:13:53  lr: 0.003382  min_lr: 0.003382  loss: 2.7058 (2.7058)  weight_decay: 0.0500 (0.0500)  time: 3.5437  data: 2.3251  max mem: 43713
Epoch: [92]  [ 200/1251]  eta: 0:09:26  lr: 0.003380  min_lr: 0.003380  loss: 3.3190 (3.3224)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5994 (0.7737)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [92]  [ 400/1251]  eta: 0:07:32  lr: 0.003377  min_lr: 0.003377  loss: 3.6135 (3.2884)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8244 (0.8847)  time: 0.5234  data: 0.0005  max mem: 43713
Epoch: [92]  [ 600/1251]  eta: 0:05:45  lr: 0.003374  min_lr: 0.003374  loss: 3.3778 (3.3056)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8280 (0.9152)  time: 0.5254  data: 0.0005  max mem: 43713
Epoch: [92]  [ 800/1251]  eta: 0:03:58  lr: 0.003372  min_lr: 0.003372  loss: 3.5659 (3.3240)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8179 (0.8960)  time: 0.5251  data: 0.0005  max mem: 43713
Epoch: [92]  [1000/1251]  eta: 0:02:12  lr: 0.003369  min_lr: 0.003369  loss: 3.6066 (3.3338)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0701 (0.9050)  time: 0.5417  data: 0.0005  max mem: 43713
Epoch: [92]  [1200/1251]  eta: 0:00:26  lr: 0.003367  min_lr: 0.003367  loss: 3.4258 (3.3299)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7749 (0.9047)  time: 0.5231  data: 0.0005  max mem: 43713
Epoch: [92]  [1250/1251]  eta: 0:00:00  lr: 0.003366  min_lr: 0.003366  loss: 3.1910 (3.3254)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7671 (0.9021)  time: 0.4436  data: 0.0007  max mem: 43713
Epoch: [92] Total time: 0:10:59 (0.5269 s / it)
Averaged stats: lr: 0.003366  min_lr: 0.003366  loss: 3.1910 (3.3191)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7671 (0.9021)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6547 (0.6547)  acc1: 86.8000 (86.8000)  acc5: 98.0000 (98.0000)  time: 5.5245  data: 5.2249  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7919 (0.8219)  acc1: 83.6000 (83.3818)  acc5: 98.0000 (97.2000)  time: 0.7432  data: 0.4754  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0099 (0.9940)  acc1: 77.6000 (79.3905)  acc5: 94.8000 (95.0667)  time: 0.2650  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.1170 (1.0010)  acc1: 76.0000 (78.8160)  acc5: 93.6000 (94.9600)  time: 0.2649  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4798 s / it)
* Acc@1 78.968 Acc@5 95.130 loss 0.990
Accuracy of the model on the 50000 test images: 79.0%
Max accuracy: 79.14%
Epoch: [93]  [   0/1251]  eta: 1:12:12  lr: 0.003366  min_lr: 0.003366  loss: 3.5301 (3.5301)  weight_decay: 0.0500 (0.0500)  time: 3.4632  data: 2.3633  max mem: 43713
Epoch: [93]  [ 200/1251]  eta: 0:09:27  lr: 0.003363  min_lr: 0.003363  loss: 3.3706 (3.3084)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8501 (0.8473)  time: 0.5251  data: 0.0004  max mem: 43713
Epoch: [93]  [ 400/1251]  eta: 0:07:34  lr: 0.003361  min_lr: 0.003361  loss: 3.4894 (3.3279)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9609 (0.9373)  time: 0.5239  data: 0.0005  max mem: 43713
Epoch: [93]  [ 600/1251]  eta: 0:05:45  lr: 0.003358  min_lr: 0.003358  loss: 3.5187 (3.3320)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7512 (0.8975)  time: 0.5240  data: 0.0005  max mem: 43713
Epoch: [93]  [ 800/1251]  eta: 0:03:58  lr: 0.003355  min_lr: 0.003355  loss: 3.4378 (3.3266)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7326 (0.8873)  time: 0.5347  data: 0.0004  max mem: 43713
Epoch: [93]  [1000/1251]  eta: 0:02:12  lr: 0.003353  min_lr: 0.003353  loss: 3.3326 (3.3261)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7944 (0.8860)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [93]  [1200/1251]  eta: 0:00:26  lr: 0.003350  min_lr: 0.003350  loss: 3.3767 (3.3261)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8260 (0.8889)  time: 0.5243  data: 0.0004  max mem: 43713
Epoch: [93]  [1250/1251]  eta: 0:00:00  lr: 0.003350  min_lr: 0.003350  loss: 3.1938 (3.3260)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8235 (0.8862)  time: 0.4492  data: 0.0006  max mem: 43713
Epoch: [93] Total time: 0:10:59 (0.5270 s / it)
Averaged stats: lr: 0.003350  min_lr: 0.003350  loss: 3.1938 (3.3183)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8235 (0.8862)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.7352 (0.7352)  acc1: 86.0000 (86.0000)  acc5: 98.4000 (98.4000)  time: 5.5149  data: 5.2247  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8640 (0.8776)  acc1: 83.2000 (83.0909)  acc5: 96.8000 (96.9091)  time: 0.7420  data: 0.4753  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0953 (1.0316)  acc1: 77.2000 (79.1238)  acc5: 94.4000 (95.1619)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.1549 (1.0488)  acc1: 76.4000 (78.6720)  acc5: 94.0000 (94.9920)  time: 0.2644  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4792 s / it)
* Acc@1 79.096 Acc@5 95.108 loss 1.038
Accuracy of the model on the 50000 test images: 79.1%
Max accuracy: 79.14%
Epoch: [94]  [   0/1251]  eta: 1:15:03  lr: 0.003350  min_lr: 0.003350  loss: 2.2718 (2.2718)  weight_decay: 0.0500 (0.0500)  time: 3.5999  data: 1.5507  max mem: 43713
Epoch: [94]  [ 200/1251]  eta: 0:09:30  lr: 0.003347  min_lr: 0.003347  loss: 3.6836 (3.2926)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8342 (1.1188)  time: 0.5228  data: 0.0005  max mem: 43713
Epoch: [94]  [ 400/1251]  eta: 0:07:33  lr: 0.003344  min_lr: 0.003344  loss: 3.1754 (3.2958)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.5228  data: 0.0005  max mem: 43713
Epoch: [94]  [ 600/1251]  eta: 0:05:44  lr: 0.003342  min_lr: 0.003342  loss: 3.5275 (3.3088)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8609 (nan)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [94]  [ 800/1251]  eta: 0:03:58  lr: 0.003339  min_lr: 0.003339  loss: 3.5171 (3.3133)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8828 (nan)  time: 0.5225  data: 0.0005  max mem: 43713
Epoch: [94]  [1000/1251]  eta: 0:02:12  lr: 0.003336  min_lr: 0.003336  loss: 3.5814 (3.3200)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1199 (nan)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [94]  [1200/1251]  eta: 0:00:26  lr: 0.003334  min_lr: 0.003334  loss: 3.1800 (3.3182)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6421 (nan)  time: 0.5289  data: 0.0005  max mem: 43713
Epoch: [94]  [1250/1251]  eta: 0:00:00  lr: 0.003333  min_lr: 0.003333  loss: 3.3348 (3.3169)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6400 (nan)  time: 0.4484  data: 0.0006  max mem: 43713
Epoch: [94] Total time: 0:10:58 (0.5264 s / it)
Averaged stats: lr: 0.003333  min_lr: 0.003333  loss: 3.3348 (3.3035)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6400 (nan)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.7265 (0.7265)  acc1: 88.0000 (88.0000)  acc5: 98.0000 (98.0000)  time: 5.5019  data: 5.1981  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8802 (0.8750)  acc1: 84.4000 (83.2364)  acc5: 97.2000 (97.2727)  time: 0.7408  data: 0.4729  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0591 (1.0327)  acc1: 77.6000 (79.5810)  acc5: 94.4000 (95.2381)  time: 0.2650  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.1276 (1.0404)  acc1: 77.6000 (79.2800)  acc5: 94.4000 (95.1520)  time: 0.2650  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4797 s / it)
* Acc@1 79.188 Acc@5 94.974 loss 1.043
Accuracy of the model on the 50000 test images: 79.2%
Max accuracy: 79.19%
Epoch: [95]  [   0/1251]  eta: 0:57:10  lr: 0.003333  min_lr: 0.003333  loss: 2.2271 (2.2271)  weight_decay: 0.0500 (0.0500)  time: 2.7425  data: 2.2129  max mem: 43713
Epoch: [95]  [ 200/1251]  eta: 0:09:22  lr: 0.003330  min_lr: 0.003330  loss: 3.2561 (3.2629)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8297 (0.9095)  time: 0.5231  data: 0.0005  max mem: 43713
Epoch: [95]  [ 400/1251]  eta: 0:07:30  lr: 0.003327  min_lr: 0.003327  loss: 3.5754 (3.2784)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7761 (0.8722)  time: 0.5243  data: 0.0005  max mem: 43713
Epoch: [95]  [ 600/1251]  eta: 0:05:43  lr: 0.003325  min_lr: 0.003325  loss: 3.5488 (3.2995)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7306 (0.8665)  time: 0.5235  data: 0.0004  max mem: 43713
Epoch: [95]  [ 800/1251]  eta: 0:03:57  lr: 0.003322  min_lr: 0.003322  loss: 3.2967 (3.2977)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7583 (0.8775)  time: 0.5222  data: 0.0004  max mem: 43713
Epoch: [95]  [1000/1251]  eta: 0:02:12  lr: 0.003319  min_lr: 0.003319  loss: 3.3448 (3.2977)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9380 (0.9147)  time: 0.5279  data: 0.0005  max mem: 43713
Epoch: [95]  [1200/1251]  eta: 0:00:26  lr: 0.003317  min_lr: 0.003317  loss: 3.4937 (3.3018)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6465 (0.9123)  time: 0.5221  data: 0.0004  max mem: 43713
Epoch: [95]  [1250/1251]  eta: 0:00:00  lr: 0.003316  min_lr: 0.003316  loss: 3.4201 (3.3048)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6931 (0.9063)  time: 0.4434  data: 0.0005  max mem: 43713
Epoch: [95] Total time: 0:10:56 (0.5250 s / it)
Averaged stats: lr: 0.003316  min_lr: 0.003316  loss: 3.4201 (3.3114)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6931 (0.9063)
Test:  [ 0/25]  eta: 0:02:09  loss: 0.6423 (0.6423)  acc1: 88.0000 (88.0000)  acc5: 98.4000 (98.4000)  time: 5.1638  data: 4.8691  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.8554 (0.8046)  acc1: 83.2000 (82.7636)  acc5: 98.0000 (97.4182)  time: 0.7194  data: 0.4523  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0073 (0.9709)  acc1: 76.8000 (79.2191)  acc5: 94.4000 (95.1810)  time: 0.2699  data: 0.0053  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0663 (0.9840)  acc1: 76.4000 (78.7520)  acc5: 93.6000 (95.0080)  time: 0.2648  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4717 s / it)
* Acc@1 79.026 Acc@5 94.974 loss 0.979
Accuracy of the model on the 50000 test images: 79.0%
Max accuracy: 79.19%
Epoch: [96]  [   0/1251]  eta: 1:13:09  lr: 0.003316  min_lr: 0.003316  loss: 3.1652 (3.1652)  weight_decay: 0.0500 (0.0500)  time: 3.5087  data: 2.3044  max mem: 43713
Epoch: [96]  [ 200/1251]  eta: 0:09:25  lr: 0.003313  min_lr: 0.003313  loss: 2.9552 (3.2400)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7369 (0.7516)  time: 0.5242  data: 0.0005  max mem: 43713
Epoch: [96]  [ 400/1251]  eta: 0:07:33  lr: 0.003311  min_lr: 0.003311  loss: 3.2622 (3.2549)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0687 (0.8687)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [96]  [ 600/1251]  eta: 0:05:44  lr: 0.003308  min_lr: 0.003308  loss: 3.5408 (3.2819)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8265 (0.8601)  time: 0.5226  data: 0.0005  max mem: 43713
Epoch: [96]  [ 800/1251]  eta: 0:03:58  lr: 0.003305  min_lr: 0.003305  loss: 3.4208 (3.2851)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9178 (0.8619)  time: 0.5236  data: 0.0005  max mem: 43713
Epoch: [96]  [1000/1251]  eta: 0:02:12  lr: 0.003302  min_lr: 0.003302  loss: 3.5351 (3.3006)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6644 (0.8954)  time: 0.5240  data: 0.0004  max mem: 43713
Epoch: [96]  [1200/1251]  eta: 0:00:26  lr: 0.003300  min_lr: 0.003300  loss: 3.5089 (3.3130)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6613 (0.8849)  time: 0.5231  data: 0.0004  max mem: 43713
Epoch: [96]  [1250/1251]  eta: 0:00:00  lr: 0.003299  min_lr: 0.003299  loss: 3.3248 (3.3126)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7385 (0.8888)  time: 0.4438  data: 0.0007  max mem: 43713
Epoch: [96] Total time: 0:10:58 (0.5262 s / it)
Averaged stats: lr: 0.003299  min_lr: 0.003299  loss: 3.3248 (3.3024)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7385 (0.8888)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.7207 (0.7207)  acc1: 88.0000 (88.0000)  acc5: 98.4000 (98.4000)  time: 5.7388  data: 5.4455  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8329 (0.8636)  acc1: 82.8000 (83.5273)  acc5: 97.2000 (97.1636)  time: 0.7624  data: 0.4953  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0724 (1.0207)  acc1: 78.0000 (79.8857)  acc5: 94.8000 (95.2191)  time: 0.2646  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.1271 (1.0367)  acc1: 77.6000 (79.6160)  acc5: 94.0000 (95.0080)  time: 0.2647  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4894 s / it)
* Acc@1 79.146 Acc@5 95.122 loss 1.040
Accuracy of the model on the 50000 test images: 79.1%
Max accuracy: 79.19%
Epoch: [97]  [   0/1251]  eta: 1:13:25  lr: 0.003299  min_lr: 0.003299  loss: 2.5308 (2.5308)  weight_decay: 0.0500 (0.0500)  time: 3.5214  data: 2.6237  max mem: 43713
Epoch: [97]  [ 200/1251]  eta: 0:09:28  lr: 0.003296  min_lr: 0.003296  loss: 3.3736 (3.3069)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9531 (0.9823)  time: 0.5232  data: 0.0004  max mem: 43713
Epoch: [97]  [ 400/1251]  eta: 0:07:33  lr: 0.003294  min_lr: 0.003294  loss: 3.6108 (3.3120)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8763 (0.9459)  time: 0.5224  data: 0.0005  max mem: 43713
Epoch: [97]  [ 600/1251]  eta: 0:05:44  lr: 0.003291  min_lr: 0.003291  loss: 3.5521 (3.3333)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7874 (0.9346)  time: 0.5225  data: 0.0005  max mem: 43713
Epoch: [97]  [ 800/1251]  eta: 0:03:58  lr: 0.003288  min_lr: 0.003288  loss: 3.2755 (3.3231)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5957 (0.9234)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [97]  [1000/1251]  eta: 0:02:12  lr: 0.003285  min_lr: 0.003285  loss: 2.9427 (3.3183)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9708 (0.9132)  time: 0.5226  data: 0.0005  max mem: 43713
Epoch: [97]  [1200/1251]  eta: 0:00:26  lr: 0.003283  min_lr: 0.003283  loss: 3.4709 (3.3225)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7949 (0.9338)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [97]  [1250/1251]  eta: 0:00:00  lr: 0.003282  min_lr: 0.003282  loss: 3.2630 (3.3143)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6525 (0.9220)  time: 0.4434  data: 0.0005  max mem: 43713
Epoch: [97] Total time: 0:10:57 (0.5257 s / it)
Averaged stats: lr: 0.003282  min_lr: 0.003282  loss: 3.2630 (3.2997)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6525 (0.9220)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.6152 (0.6152)  acc1: 87.6000 (87.6000)  acc5: 98.4000 (98.4000)  time: 5.3983  data: 5.0629  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7874 (0.8279)  acc1: 83.6000 (83.1636)  acc5: 97.2000 (96.9455)  time: 0.7314  data: 0.4606  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0508 (0.9768)  acc1: 76.8000 (79.5238)  acc5: 94.4000 (95.1619)  time: 0.2649  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0508 (0.9933)  acc1: 76.8000 (78.9440)  acc5: 94.0000 (94.9760)  time: 0.2649  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4748 s / it)
* Acc@1 79.276 Acc@5 95.038 loss 0.984
Accuracy of the model on the 50000 test images: 79.3%
Max accuracy: 79.28%
Epoch: [98]  [   0/1251]  eta: 0:58:23  lr: 0.003282  min_lr: 0.003282  loss: 3.3082 (3.3082)  weight_decay: 0.0500 (0.0500)  time: 2.8002  data: 2.2684  max mem: 43713
Epoch: [98]  [ 200/1251]  eta: 0:09:26  lr: 0.003279  min_lr: 0.003279  loss: 3.2031 (3.2485)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9643 (0.9884)  time: 0.5250  data: 0.0004  max mem: 43713
Epoch: [98]  [ 400/1251]  eta: 0:07:31  lr: 0.003276  min_lr: 0.003276  loss: 3.4350 (3.2663)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6122 (0.8410)  time: 0.5234  data: 0.0004  max mem: 43713
Epoch: [98]  [ 600/1251]  eta: 0:05:45  lr: 0.003274  min_lr: 0.003274  loss: 3.2607 (3.2766)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7352 (0.8799)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [98]  [ 800/1251]  eta: 0:03:58  lr: 0.003271  min_lr: 0.003271  loss: 3.1803 (3.2800)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7032 (0.8665)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [98]  [1000/1251]  eta: 0:02:12  lr: 0.003268  min_lr: 0.003268  loss: 3.4538 (3.2805)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9166 (0.9000)  time: 0.5243  data: 0.0004  max mem: 43713
Epoch: [98]  [1200/1251]  eta: 0:00:26  lr: 0.003265  min_lr: 0.003265  loss: 3.5904 (3.2835)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5879 (0.8692)  time: 0.5255  data: 0.0004  max mem: 43713
Epoch: [98]  [1250/1251]  eta: 0:00:00  lr: 0.003265  min_lr: 0.003265  loss: 3.6320 (3.2892)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5664 (0.8701)  time: 0.4444  data: 0.0005  max mem: 43713
Epoch: [98] Total time: 0:10:59 (0.5270 s / it)
Averaged stats: lr: 0.003265  min_lr: 0.003265  loss: 3.6320 (3.2974)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5664 (0.8701)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6851 (0.6851)  acc1: 86.8000 (86.8000)  acc5: 98.0000 (98.0000)  time: 5.5250  data: 5.2174  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8213 (0.8592)  acc1: 83.6000 (82.8727)  acc5: 97.6000 (97.2364)  time: 0.7429  data: 0.4746  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0916 (1.0508)  acc1: 77.2000 (78.9714)  acc5: 93.2000 (94.9143)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.1420 (1.0620)  acc1: 77.2000 (78.6880)  acc5: 93.2000 (94.8320)  time: 0.2646  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4802 s / it)
* Acc@1 79.042 Acc@5 94.974 loss 1.062
Accuracy of the model on the 50000 test images: 79.0%
Max accuracy: 79.28%
Epoch: [99]  [   0/1251]  eta: 1:10:23  lr: 0.003265  min_lr: 0.003265  loss: 3.5908 (3.5908)  weight_decay: 0.0500 (0.0500)  time: 3.3759  data: 2.7268  max mem: 43713
Epoch: [99]  [ 200/1251]  eta: 0:09:25  lr: 0.003262  min_lr: 0.003262  loss: 3.3800 (3.2974)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8741 (0.9454)  time: 0.5250  data: 0.0004  max mem: 43713
Epoch: [99]  [ 400/1251]  eta: 0:07:33  lr: 0.003259  min_lr: 0.003259  loss: 2.9676 (3.2821)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5889 (0.8823)  time: 0.5400  data: 0.0004  max mem: 43713
Epoch: [99]  [ 600/1251]  eta: 0:05:44  lr: 0.003256  min_lr: 0.003256  loss: 3.0368 (3.2705)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6832 (0.9141)  time: 0.5239  data: 0.0005  max mem: 43713
Epoch: [99]  [ 800/1251]  eta: 0:03:58  lr: 0.003253  min_lr: 0.003253  loss: 3.4222 (3.2875)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6903 (0.8844)  time: 0.5240  data: 0.0004  max mem: 43713
Epoch: [99]  [1000/1251]  eta: 0:02:12  lr: 0.003251  min_lr: 0.003251  loss: 3.2021 (3.2808)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7474 (0.8949)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [99]  [1200/1251]  eta: 0:00:26  lr: 0.003248  min_lr: 0.003248  loss: 3.4504 (3.2934)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8984 (0.9195)  time: 0.5228  data: 0.0005  max mem: 43713
Epoch: [99]  [1250/1251]  eta: 0:00:00  lr: 0.003247  min_lr: 0.003247  loss: 3.6394 (3.2964)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0812 (0.9293)  time: 0.4440  data: 0.0006  max mem: 43713
Epoch: [99] Total time: 0:10:58 (0.5267 s / it)
Averaged stats: lr: 0.003247  min_lr: 0.003247  loss: 3.6394 (3.2941)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0812 (0.9293)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.7716 (0.7716)  acc1: 86.0000 (86.0000)  acc5: 96.8000 (96.8000)  time: 5.7327  data: 5.4284  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8884 (0.9134)  acc1: 84.8000 (82.2909)  acc5: 96.8000 (96.9455)  time: 0.7614  data: 0.4938  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0900 (1.0822)  acc1: 76.4000 (78.8000)  acc5: 94.8000 (94.8571)  time: 0.2641  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.1860 (1.0882)  acc1: 76.4000 (78.4800)  acc5: 93.2000 (94.7200)  time: 0.2640  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4889 s / it)
* Acc@1 79.226 Acc@5 95.004 loss 1.078
Accuracy of the model on the 50000 test images: 79.2%
Max accuracy: 79.28%
Epoch: [100]  [   0/1251]  eta: 1:12:14  lr: 0.003247  min_lr: 0.003247  loss: 3.2528 (3.2528)  weight_decay: 0.0500 (0.0500)  time: 3.4651  data: 2.2244  max mem: 43713
Epoch: [100]  [ 200/1251]  eta: 0:09:28  lr: 0.003244  min_lr: 0.003244  loss: 3.4906 (3.2817)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0182 (0.8668)  time: 0.5316  data: 0.0005  max mem: 43713
Epoch: [100]  [ 400/1251]  eta: 0:07:33  lr: 0.003242  min_lr: 0.003242  loss: 3.5960 (3.3102)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9417 (0.8524)  time: 0.5231  data: 0.0005  max mem: 43713
Epoch: [100]  [ 600/1251]  eta: 0:05:44  lr: 0.003239  min_lr: 0.003239  loss: 3.4289 (3.2776)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7896 (0.8790)  time: 0.5226  data: 0.0005  max mem: 43713
Epoch: [100]  [ 800/1251]  eta: 0:03:58  lr: 0.003236  min_lr: 0.003236  loss: 3.6089 (3.2902)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9350 (0.9019)  time: 0.5293  data: 0.0005  max mem: 43713
Epoch: [100]  [1000/1251]  eta: 0:02:12  lr: 0.003233  min_lr: 0.003233  loss: 3.4248 (3.3048)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8388 (0.8869)  time: 0.5237  data: 0.0005  max mem: 43713
Epoch: [100]  [1200/1251]  eta: 0:00:26  lr: 0.003230  min_lr: 0.003230  loss: 3.4032 (3.2962)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7023 (0.8883)  time: 0.5234  data: 0.0005  max mem: 43713
Epoch: [100]  [1250/1251]  eta: 0:00:00  lr: 0.003230  min_lr: 0.003230  loss: 3.3927 (3.2974)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7423 (0.8837)  time: 0.4437  data: 0.0006  max mem: 43713
Epoch: [100] Total time: 0:10:58 (0.5263 s / it)
Averaged stats: lr: 0.003230  min_lr: 0.003230  loss: 3.3927 (3.2910)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7423 (0.8837)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.6927 (0.6927)  acc1: 87.6000 (87.6000)  acc5: 98.0000 (98.0000)  time: 5.3856  data: 5.0824  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.8764 (0.8733)  acc1: 83.2000 (82.6546)  acc5: 97.2000 (97.0909)  time: 0.7300  data: 0.4624  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0813 (1.0224)  acc1: 77.6000 (78.8381)  acc5: 94.8000 (95.0857)  time: 0.2643  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.1082 (1.0285)  acc1: 77.6000 (78.6720)  acc5: 94.0000 (95.0400)  time: 0.2641  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4736 s / it)
* Acc@1 79.088 Acc@5 95.120 loss 1.023
Accuracy of the model on the 50000 test images: 79.1%
Max accuracy: 79.28%
Epoch: [101]  [   0/1251]  eta: 1:14:04  lr: 0.003230  min_lr: 0.003230  loss: 2.8433 (2.8433)  weight_decay: 0.0500 (0.0500)  time: 3.5529  data: 2.3031  max mem: 43713
Epoch: [101]  [ 200/1251]  eta: 0:09:31  lr: 0.003227  min_lr: 0.003227  loss: 3.5034 (3.2886)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8738 (0.9298)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [101]  [ 400/1251]  eta: 0:07:33  lr: 0.003224  min_lr: 0.003224  loss: 3.4260 (3.2737)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.5314  data: 0.0005  max mem: 43713
Epoch: [101]  [ 600/1251]  eta: 0:05:45  lr: 0.003221  min_lr: 0.003221  loss: 3.0916 (3.2714)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7751 (nan)  time: 0.5298  data: 0.0004  max mem: 43713
Epoch: [101]  [ 800/1251]  eta: 0:03:58  lr: 0.003218  min_lr: 0.003218  loss: 3.2926 (3.2791)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7950 (nan)  time: 0.5224  data: 0.0005  max mem: 43713
Epoch: [101]  [1000/1251]  eta: 0:02:12  lr: 0.003215  min_lr: 0.003215  loss: 3.5160 (3.2822)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7615 (nan)  time: 0.5223  data: 0.0005  max mem: 43713
Epoch: [101]  [1200/1251]  eta: 0:00:26  lr: 0.003212  min_lr: 0.003212  loss: 3.2682 (3.2910)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8610 (nan)  time: 0.5222  data: 0.0004  max mem: 43713
Epoch: [101]  [1250/1251]  eta: 0:00:00  lr: 0.003212  min_lr: 0.003212  loss: 3.3880 (3.2910)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9162 (nan)  time: 0.4439  data: 0.0005  max mem: 43713
Epoch: [101] Total time: 0:10:58 (0.5266 s / it)
Averaged stats: lr: 0.003212  min_lr: 0.003212  loss: 3.3880 (3.2890)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9162 (nan)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.6782 (0.6782)  acc1: 88.0000 (88.0000)  acc5: 98.4000 (98.4000)  time: 5.7890  data: 5.4963  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8537 (0.8553)  acc1: 84.8000 (83.1273)  acc5: 97.6000 (97.1636)  time: 0.7676  data: 0.4999  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0787 (1.0062)  acc1: 77.6000 (79.2000)  acc5: 94.0000 (95.1810)  time: 0.2654  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.1084 (1.0194)  acc1: 76.8000 (78.7360)  acc5: 94.0000 (95.0560)  time: 0.2653  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4908 s / it)
* Acc@1 79.364 Acc@5 95.272 loss 1.011
Accuracy of the model on the 50000 test images: 79.4%
Max accuracy: 79.36%
Epoch: [102]  [   0/1251]  eta: 1:06:44  lr: 0.003212  min_lr: 0.003212  loss: 3.0345 (3.0345)  weight_decay: 0.0500 (0.0500)  time: 3.2012  data: 2.6665  max mem: 43713
Epoch: [102]  [ 200/1251]  eta: 0:09:24  lr: 0.003209  min_lr: 0.003209  loss: 3.2922 (3.2696)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7538 (0.7867)  time: 0.5232  data: 0.0005  max mem: 43713
Epoch: [102]  [ 400/1251]  eta: 0:07:31  lr: 0.003206  min_lr: 0.003206  loss: 3.3015 (3.2706)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7326 (0.8551)  time: 0.5336  data: 0.0004  max mem: 43713
Epoch: [102]  [ 600/1251]  eta: 0:05:44  lr: 0.003203  min_lr: 0.003203  loss: 3.4430 (3.2766)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7042 (0.8451)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [102]  [ 800/1251]  eta: 0:03:58  lr: 0.003200  min_lr: 0.003200  loss: 3.1172 (3.2847)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7239 (0.8365)  time: 0.5222  data: 0.0004  max mem: 43713
Epoch: [102]  [1000/1251]  eta: 0:02:12  lr: 0.003197  min_lr: 0.003197  loss: 3.3399 (3.2844)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1864 (0.8624)  time: 0.5284  data: 0.0004  max mem: 43713
Epoch: [102]  [1200/1251]  eta: 0:00:26  lr: 0.003195  min_lr: 0.003195  loss: 3.4110 (3.2869)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7415 (0.8518)  time: 0.5290  data: 0.0004  max mem: 43713
Epoch: [102]  [1250/1251]  eta: 0:00:00  lr: 0.003194  min_lr: 0.003194  loss: 3.3374 (3.2923)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0593 (0.8638)  time: 0.4442  data: 0.0005  max mem: 43713
Epoch: [102] Total time: 0:10:58 (0.5261 s / it)
Averaged stats: lr: 0.003194  min_lr: 0.003194  loss: 3.3374 (3.2814)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0593 (0.8638)
Test:  [ 0/25]  eta: 0:02:03  loss: 0.7697 (0.7697)  acc1: 88.4000 (88.4000)  acc5: 98.4000 (98.4000)  time: 4.9435  data: 4.6239  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.9533 (0.9343)  acc1: 82.0000 (82.5455)  acc5: 97.2000 (97.0909)  time: 0.7432  data: 0.4641  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.1552 (1.0836)  acc1: 77.2000 (79.5619)  acc5: 94.0000 (95.1048)  time: 0.2965  data: 0.0241  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.1777 (1.0969)  acc1: 77.6000 (79.2640)  acc5: 93.2000 (95.0240)  time: 0.2702  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4811 s / it)
* Acc@1 79.050 Acc@5 94.992 loss 1.100
Accuracy of the model on the 50000 test images: 79.1%
Max accuracy: 79.36%
Epoch: [103]  [   0/1251]  eta: 1:12:27  lr: 0.003194  min_lr: 0.003194  loss: 3.2857 (3.2857)  weight_decay: 0.0500 (0.0500)  time: 3.4756  data: 2.8049  max mem: 43713
Epoch: [103]  [ 200/1251]  eta: 0:09:25  lr: 0.003191  min_lr: 0.003191  loss: 3.2578 (3.2438)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0532 (nan)  time: 0.5230  data: 0.0005  max mem: 43713
Epoch: [103]  [ 400/1251]  eta: 0:07:33  lr: 0.003188  min_lr: 0.003188  loss: 3.4453 (3.2556)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5015 (nan)  time: 0.5244  data: 0.0005  max mem: 43713
Epoch: [103]  [ 600/1251]  eta: 0:05:45  lr: 0.003185  min_lr: 0.003185  loss: 3.4121 (3.2560)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7162 (nan)  time: 0.5238  data: 0.0005  max mem: 43713
Epoch: [103]  [ 800/1251]  eta: 0:03:58  lr: 0.003182  min_lr: 0.003182  loss: 3.1884 (3.2507)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7750 (nan)  time: 0.5306  data: 0.0005  max mem: 43713
Epoch: [103]  [1000/1251]  eta: 0:02:12  lr: 0.003179  min_lr: 0.003179  loss: 3.4822 (3.2587)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7711 (nan)  time: 0.5232  data: 0.0005  max mem: 43713
Epoch: [103]  [1200/1251]  eta: 0:00:26  lr: 0.003176  min_lr: 0.003176  loss: 3.4223 (3.2541)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7259 (nan)  time: 0.5228  data: 0.0005  max mem: 43713
Epoch: [103]  [1250/1251]  eta: 0:00:00  lr: 0.003176  min_lr: 0.003176  loss: 3.5571 (3.2567)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7693 (nan)  time: 0.4438  data: 0.0006  max mem: 43713
Epoch: [103] Total time: 0:10:58 (0.5267 s / it)
Averaged stats: lr: 0.003176  min_lr: 0.003176  loss: 3.5571 (3.2765)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7693 (nan)
Test:  [ 0/25]  eta: 0:01:47  loss: 0.6606 (0.6606)  acc1: 88.0000 (88.0000)  acc5: 97.2000 (97.2000)  time: 4.2986  data: 4.0039  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.8359 (0.8220)  acc1: 82.4000 (82.6909)  acc5: 97.2000 (97.2364)  time: 0.7007  data: 0.4339  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0823 (0.9735)  acc1: 77.6000 (79.5238)  acc5: 94.0000 (95.1619)  time: 0.3043  data: 0.0385  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0810 (0.9846)  acc1: 77.6000 (79.4240)  acc5: 94.0000 (95.0400)  time: 0.2664  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4612 s / it)
* Acc@1 79.286 Acc@5 95.216 loss 0.984
Accuracy of the model on the 50000 test images: 79.3%
Max accuracy: 79.36%
Epoch: [104]  [   0/1251]  eta: 1:09:49  lr: 0.003176  min_lr: 0.003176  loss: 3.5677 (3.5677)  weight_decay: 0.0500 (0.0500)  time: 3.3486  data: 2.1783  max mem: 43713
Epoch: [104]  [ 200/1251]  eta: 0:09:30  lr: 0.003173  min_lr: 0.003173  loss: 3.2471 (3.3037)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8497 (0.9909)  time: 0.5325  data: 0.0005  max mem: 43713
Epoch: [104]  [ 400/1251]  eta: 0:07:33  lr: 0.003170  min_lr: 0.003170  loss: 3.4402 (3.2821)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8697 (0.9460)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [104]  [ 600/1251]  eta: 0:05:44  lr: 0.003167  min_lr: 0.003167  loss: 3.5396 (3.2932)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6684 (0.9268)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [104]  [ 800/1251]  eta: 0:03:58  lr: 0.003164  min_lr: 0.003164  loss: 3.5768 (3.2839)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7584 (0.8938)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [104]  [1000/1251]  eta: 0:02:12  lr: 0.003161  min_lr: 0.003161  loss: 3.4112 (3.2896)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8467 (0.9185)  time: 0.5235  data: 0.0005  max mem: 43713
Epoch: [104]  [1200/1251]  eta: 0:00:26  lr: 0.003158  min_lr: 0.003158  loss: 3.5274 (3.2872)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1960 (0.9340)  time: 0.5315  data: 0.0005  max mem: 43713
Epoch: [104]  [1250/1251]  eta: 0:00:00  lr: 0.003158  min_lr: 0.003158  loss: 3.2488 (3.2848)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7986 (0.9286)  time: 0.4538  data: 0.0006  max mem: 43713
Epoch: [104] Total time: 0:10:59 (0.5269 s / it)
Averaged stats: lr: 0.003158  min_lr: 0.003158  loss: 3.2488 (3.2840)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7986 (0.9286)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.6620 (0.6620)  acc1: 88.4000 (88.4000)  acc5: 98.0000 (98.0000)  time: 5.3868  data: 5.0915  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7846 (0.8050)  acc1: 83.6000 (82.5091)  acc5: 97.2000 (96.9818)  time: 0.7306  data: 0.4632  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0151 (0.9569)  acc1: 77.6000 (79.7143)  acc5: 95.2000 (95.3333)  time: 0.2650  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0731 (0.9664)  acc1: 77.6000 (79.3600)  acc5: 94.8000 (95.2640)  time: 0.2650  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4745 s / it)
* Acc@1 79.418 Acc@5 95.196 loss 0.968
Accuracy of the model on the 50000 test images: 79.4%
Max accuracy: 79.42%
Epoch: [105]  [   0/1251]  eta: 1:07:25  lr: 0.003158  min_lr: 0.003158  loss: 3.2182 (3.2182)  weight_decay: 0.0500 (0.0500)  time: 3.2337  data: 2.7008  max mem: 43713
Epoch: [105]  [ 200/1251]  eta: 0:09:24  lr: 0.003155  min_lr: 0.003155  loss: 3.2285 (3.2217)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7667 (0.9031)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [105]  [ 400/1251]  eta: 0:07:31  lr: 0.003152  min_lr: 0.003152  loss: 2.9183 (3.2353)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0283 (nan)  time: 0.5234  data: 0.0004  max mem: 43713
Epoch: [105]  [ 600/1251]  eta: 0:05:44  lr: 0.003149  min_lr: 0.003149  loss: 3.4527 (3.2471)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6684 (nan)  time: 0.5232  data: 0.0004  max mem: 43713
Epoch: [105]  [ 800/1251]  eta: 0:03:58  lr: 0.003146  min_lr: 0.003146  loss: 3.3992 (3.2485)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8154 (nan)  time: 0.5239  data: 0.0005  max mem: 43713
Epoch: [105]  [1000/1251]  eta: 0:02:12  lr: 0.003143  min_lr: 0.003143  loss: 3.3094 (3.2471)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7147 (nan)  time: 0.5242  data: 0.0005  max mem: 43713
Epoch: [105]  [1200/1251]  eta: 0:00:26  lr: 0.003140  min_lr: 0.003140  loss: 3.3494 (3.2544)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8087 (nan)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [105]  [1250/1251]  eta: 0:00:00  lr: 0.003139  min_lr: 0.003139  loss: 3.2298 (3.2548)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7258 (nan)  time: 0.4434  data: 0.0005  max mem: 43713
Epoch: [105] Total time: 0:10:58 (0.5265 s / it)
Averaged stats: lr: 0.003139  min_lr: 0.003139  loss: 3.2298 (3.2679)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7258 (nan)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.7133 (0.7133)  acc1: 87.2000 (87.2000)  acc5: 98.0000 (98.0000)  time: 5.6269  data: 5.3134  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8233 (0.8176)  acc1: 83.2000 (82.6909)  acc5: 97.2000 (97.1636)  time: 0.7522  data: 0.4834  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0174 (0.9647)  acc1: 77.6000 (79.5619)  acc5: 94.4000 (95.2952)  time: 0.2649  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0699 (0.9801)  acc1: 77.6000 (79.0240)  acc5: 93.6000 (95.2480)  time: 0.2650  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4837 s / it)
* Acc@1 79.356 Acc@5 95.238 loss 0.982
Accuracy of the model on the 50000 test images: 79.4%
Max accuracy: 79.42%
Epoch: [106]  [   0/1251]  eta: 1:14:09  lr: 0.003139  min_lr: 0.003139  loss: 3.8209 (3.8209)  weight_decay: 0.0500 (0.0500)  time: 3.5566  data: 2.0975  max mem: 43713
Epoch: [106]  [ 200/1251]  eta: 0:09:26  lr: 0.003136  min_lr: 0.003136  loss: 3.3402 (3.2639)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9817 (0.9428)  time: 0.5248  data: 0.0004  max mem: 43713
Epoch: [106]  [ 400/1251]  eta: 0:07:34  lr: 0.003133  min_lr: 0.003133  loss: 3.0426 (3.2657)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7790 (0.9788)  time: 0.5232  data: 0.0004  max mem: 43713
Epoch: [106]  [ 600/1251]  eta: 0:05:45  lr: 0.003130  min_lr: 0.003130  loss: 3.4593 (3.2716)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7139 (0.9157)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [106]  [ 800/1251]  eta: 0:03:58  lr: 0.003127  min_lr: 0.003127  loss: 3.2640 (3.2776)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8513 (0.8903)  time: 0.5248  data: 0.0004  max mem: 43713
Epoch: [106]  [1000/1251]  eta: 0:02:12  lr: 0.003124  min_lr: 0.003124  loss: 3.5714 (3.2890)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8619 (0.9304)  time: 0.5231  data: 0.0005  max mem: 43713
Epoch: [106]  [1200/1251]  eta: 0:00:26  lr: 0.003121  min_lr: 0.003121  loss: 3.5533 (3.2883)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0470 (0.9333)  time: 0.5233  data: 0.0005  max mem: 43713
Epoch: [106]  [1250/1251]  eta: 0:00:00  lr: 0.003121  min_lr: 0.003121  loss: 3.2596 (3.2881)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7672 (0.9246)  time: 0.4437  data: 0.0006  max mem: 43713
Epoch: [106] Total time: 0:10:59 (0.5269 s / it)
Averaged stats: lr: 0.003121  min_lr: 0.003121  loss: 3.2596 (3.2766)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7672 (0.9246)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6435 (0.6435)  acc1: 88.4000 (88.4000)  acc5: 98.8000 (98.8000)  time: 5.5558  data: 5.2652  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7934 (0.8043)  acc1: 84.0000 (83.3818)  acc5: 97.6000 (97.0546)  time: 0.7456  data: 0.4789  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9783 (0.9674)  acc1: 76.8000 (79.9810)  acc5: 94.8000 (95.2381)  time: 0.2647  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0554 (0.9789)  acc1: 76.4000 (79.4720)  acc5: 94.0000 (95.1520)  time: 0.2647  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4815 s / it)
* Acc@1 79.492 Acc@5 95.216 loss 0.975
Accuracy of the model on the 50000 test images: 79.5%
Max accuracy: 79.49%
Epoch: [107]  [   0/1251]  eta: 1:00:38  lr: 0.003121  min_lr: 0.003121  loss: 1.9679 (1.9679)  weight_decay: 0.0500 (0.0500)  time: 2.9082  data: 2.3749  max mem: 43713
Epoch: [107]  [ 200/1251]  eta: 0:09:27  lr: 0.003118  min_lr: 0.003118  loss: 3.2870 (3.2615)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7399 (0.8751)  time: 0.5472  data: 0.0004  max mem: 43713
Epoch: [107]  [ 400/1251]  eta: 0:07:32  lr: 0.003115  min_lr: 0.003115  loss: 3.3160 (3.2341)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6855 (0.9251)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [107]  [ 600/1251]  eta: 0:05:44  lr: 0.003112  min_lr: 0.003112  loss: 3.2123 (3.2300)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7647 (0.9020)  time: 0.5240  data: 0.0004  max mem: 43713
Epoch: [107]  [ 800/1251]  eta: 0:03:58  lr: 0.003109  min_lr: 0.003109  loss: 3.3721 (3.2446)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7403 (0.9254)  time: 0.5234  data: 0.0005  max mem: 43713
Epoch: [107]  [1000/1251]  eta: 0:02:12  lr: 0.003106  min_lr: 0.003106  loss: 3.3548 (3.2546)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8015 (0.9455)  time: 0.5233  data: 0.0004  max mem: 43713
Epoch: [107]  [1200/1251]  eta: 0:00:26  lr: 0.003103  min_lr: 0.003103  loss: 3.3483 (3.2512)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6584 (0.9344)  time: 0.5291  data: 0.0005  max mem: 43713
Epoch: [107]  [1250/1251]  eta: 0:00:00  lr: 0.003102  min_lr: 0.003102  loss: 3.4391 (3.2546)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7014 (0.9347)  time: 0.4500  data: 0.0006  max mem: 43713
Epoch: [107] Total time: 0:10:58 (0.5261 s / it)
Averaged stats: lr: 0.003102  min_lr: 0.003102  loss: 3.4391 (3.2709)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7014 (0.9347)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.7196 (0.7196)  acc1: 88.4000 (88.4000)  acc5: 98.8000 (98.8000)  time: 5.3346  data: 5.0376  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.8999 (0.9033)  acc1: 83.6000 (82.8000)  acc5: 97.6000 (97.3818)  time: 0.7255  data: 0.4583  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.1263 (1.0618)  acc1: 76.8000 (79.3714)  acc5: 94.4000 (95.2571)  time: 0.2643  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.1680 (1.0722)  acc1: 76.4000 (78.9120)  acc5: 93.6000 (95.0720)  time: 0.2642  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4706 s / it)
* Acc@1 79.420 Acc@5 95.256 loss 1.060
Accuracy of the model on the 50000 test images: 79.4%
Max accuracy: 79.49%
Epoch: [108]  [   0/1251]  eta: 1:10:49  lr: 0.003102  min_lr: 0.003102  loss: 3.8199 (3.8199)  weight_decay: 0.0500 (0.0500)  time: 3.3970  data: 2.4544  max mem: 43713
Epoch: [108]  [ 200/1251]  eta: 0:09:28  lr: 0.003099  min_lr: 0.003099  loss: 3.0417 (3.2577)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9201 (0.8548)  time: 0.5240  data: 0.0004  max mem: 43713
Epoch: [108]  [ 400/1251]  eta: 0:07:33  lr: 0.003096  min_lr: 0.003096  loss: 3.2500 (3.2621)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7131 (0.8320)  time: 0.5247  data: 0.0005  max mem: 43713
Epoch: [108]  [ 600/1251]  eta: 0:05:45  lr: 0.003093  min_lr: 0.003093  loss: 3.2757 (3.2721)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8736 (0.8509)  time: 0.5232  data: 0.0005  max mem: 43713
Epoch: [108]  [ 800/1251]  eta: 0:03:58  lr: 0.003090  min_lr: 0.003090  loss: 3.0668 (3.2713)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7460 (0.8930)  time: 0.5237  data: 0.0005  max mem: 43713
Epoch: [108]  [1000/1251]  eta: 0:02:12  lr: 0.003087  min_lr: 0.003087  loss: 3.5152 (3.2826)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9513 (0.8875)  time: 0.5228  data: 0.0005  max mem: 43713
Epoch: [108]  [1200/1251]  eta: 0:00:26  lr: 0.003084  min_lr: 0.003084  loss: 3.3239 (3.2859)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6436 (0.9053)  time: 0.5249  data: 0.0005  max mem: 43713
Epoch: [108]  [1250/1251]  eta: 0:00:00  lr: 0.003083  min_lr: 0.003083  loss: 3.2185 (3.2836)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6176 (0.8975)  time: 0.4436  data: 0.0006  max mem: 43713
Epoch: [108] Total time: 0:10:58 (0.5267 s / it)
Averaged stats: lr: 0.003083  min_lr: 0.003083  loss: 3.2185 (3.2656)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6176 (0.8975)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.6874 (0.6874)  acc1: 86.0000 (86.0000)  acc5: 98.4000 (98.4000)  time: 5.3377  data: 5.0189  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7899 (0.8125)  acc1: 84.8000 (83.6000)  acc5: 97.2000 (97.3091)  time: 0.7263  data: 0.4567  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0480 (0.9712)  acc1: 78.4000 (79.5238)  acc5: 95.6000 (95.3905)  time: 0.2648  data: 0.0003  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0632 (0.9835)  acc1: 77.6000 (79.1200)  acc5: 94.8000 (95.3600)  time: 0.2645  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4727 s / it)
* Acc@1 79.788 Acc@5 95.294 loss 0.967
Accuracy of the model on the 50000 test images: 79.8%
Max accuracy: 79.79%
Epoch: [109]  [   0/1251]  eta: 0:50:38  lr: 0.003083  min_lr: 0.003083  loss: 3.2194 (3.2194)  weight_decay: 0.0500 (0.0500)  time: 2.4292  data: 1.8890  max mem: 43713
Epoch: [109]  [ 200/1251]  eta: 0:09:19  lr: 0.003080  min_lr: 0.003080  loss: 3.4477 (3.2724)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7101 (0.8551)  time: 0.5225  data: 0.0005  max mem: 43713
Epoch: [109]  [ 400/1251]  eta: 0:07:29  lr: 0.003077  min_lr: 0.003077  loss: 3.4043 (3.2845)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7445 (0.8372)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [109]  [ 600/1251]  eta: 0:05:43  lr: 0.003074  min_lr: 0.003074  loss: 3.1949 (3.2681)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2756 (0.9112)  time: 0.5238  data: 0.0004  max mem: 43713
Epoch: [109]  [ 800/1251]  eta: 0:03:57  lr: 0.003071  min_lr: 0.003071  loss: 3.4126 (3.2598)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7452 (0.8783)  time: 0.5221  data: 0.0005  max mem: 43713
Epoch: [109]  [1000/1251]  eta: 0:02:12  lr: 0.003068  min_lr: 0.003068  loss: 3.3382 (3.2678)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0246 (0.9063)  time: 0.5221  data: 0.0004  max mem: 43713
Epoch: [109]  [1200/1251]  eta: 0:00:26  lr: 0.003065  min_lr: 0.003065  loss: 3.5252 (3.2658)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9556 (0.9070)  time: 0.5223  data: 0.0004  max mem: 43713
Epoch: [109]  [1250/1251]  eta: 0:00:00  lr: 0.003064  min_lr: 0.003064  loss: 3.4825 (3.2672)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7582 (0.9058)  time: 0.4435  data: 0.0008  max mem: 43713
Epoch: [109] Total time: 0:10:56 (0.5251 s / it)
Averaged stats: lr: 0.003064  min_lr: 0.003064  loss: 3.4825 (3.2642)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7582 (0.9058)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.7038 (0.7038)  acc1: 87.2000 (87.2000)  acc5: 98.8000 (98.8000)  time: 5.6038  data: 5.3205  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.9277 (0.9040)  acc1: 82.0000 (82.6545)  acc5: 97.2000 (97.1273)  time: 0.7500  data: 0.4840  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0770 (1.0554)  acc1: 78.0000 (79.0476)  acc5: 94.8000 (95.2191)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.1442 (1.0682)  acc1: 76.4000 (78.7040)  acc5: 94.0000 (95.0880)  time: 0.2644  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4834 s / it)
* Acc@1 79.498 Acc@5 95.230 loss 1.052
Accuracy of the model on the 50000 test images: 79.5%
Max accuracy: 79.79%
Epoch: [110]  [   0/1251]  eta: 1:13:09  lr: 0.003064  min_lr: 0.003064  loss: 3.6311 (3.6311)  weight_decay: 0.0500 (0.0500)  time: 3.5087  data: 2.6944  max mem: 43713
Epoch: [110]  [ 200/1251]  eta: 0:09:27  lr: 0.003061  min_lr: 0.003061  loss: 3.4902 (3.2319)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7393 (0.8068)  time: 0.5314  data: 0.0004  max mem: 43713
Epoch: [110]  [ 400/1251]  eta: 0:07:33  lr: 0.003058  min_lr: 0.003058  loss: 3.2497 (3.2548)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8699 (0.8582)  time: 0.5221  data: 0.0004  max mem: 43713
Epoch: [110]  [ 600/1251]  eta: 0:05:44  lr: 0.003055  min_lr: 0.003055  loss: 3.3030 (3.2542)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7816 (0.8904)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [110]  [ 800/1251]  eta: 0:03:58  lr: 0.003052  min_lr: 0.003052  loss: 3.3540 (3.2525)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7887 (0.8969)  time: 0.5329  data: 0.0005  max mem: 43713
Epoch: [110]  [1000/1251]  eta: 0:02:12  lr: 0.003049  min_lr: 0.003049  loss: 3.2482 (3.2574)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9964 (0.9302)  time: 0.5230  data: 0.0005  max mem: 43713
Epoch: [110]  [1200/1251]  eta: 0:00:26  lr: 0.003046  min_lr: 0.003046  loss: 3.3745 (3.2492)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7036 (0.9126)  time: 0.5230  data: 0.0005  max mem: 43713
Epoch: [110]  [1250/1251]  eta: 0:00:00  lr: 0.003045  min_lr: 0.003045  loss: 3.5147 (3.2474)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8908 (0.9187)  time: 0.4495  data: 0.0007  max mem: 43713
Epoch: [110] Total time: 0:10:58 (0.5265 s / it)
Averaged stats: lr: 0.003045  min_lr: 0.003045  loss: 3.5147 (3.2530)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8908 (0.9187)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.6279 (0.6279)  acc1: 86.0000 (86.0000)  acc5: 98.4000 (98.4000)  time: 5.4735  data: 5.1651  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8356 (0.7991)  acc1: 83.6000 (82.6182)  acc5: 96.8000 (97.0546)  time: 0.7383  data: 0.4698  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9732 (0.9557)  acc1: 77.6000 (79.3714)  acc5: 95.2000 (95.0667)  time: 0.2650  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0290 (0.9617)  acc1: 77.6000 (79.2000)  acc5: 94.0000 (94.9760)  time: 0.2651  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4789 s / it)
* Acc@1 79.580 Acc@5 95.372 loss 0.950
Accuracy of the model on the 50000 test images: 79.6%
Max accuracy: 79.79%
Epoch: [111]  [   0/1251]  eta: 1:07:59  lr: 0.003045  min_lr: 0.003045  loss: 3.3411 (3.3411)  weight_decay: 0.0500 (0.0500)  time: 3.2607  data: 1.7860  max mem: 43713
Epoch: [111]  [ 200/1251]  eta: 0:09:28  lr: 0.003042  min_lr: 0.003042  loss: 3.3365 (3.1724)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0570 (0.8439)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [111]  [ 400/1251]  eta: 0:07:32  lr: 0.003039  min_lr: 0.003039  loss: 3.4278 (3.2021)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9868 (0.9069)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [111]  [ 600/1251]  eta: 0:05:44  lr: 0.003036  min_lr: 0.003036  loss: 3.4683 (3.2237)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8314 (0.9067)  time: 0.5235  data: 0.0004  max mem: 43713
Epoch: [111]  [ 800/1251]  eta: 0:03:58  lr: 0.003033  min_lr: 0.003033  loss: 3.3042 (3.2325)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7409 (0.8930)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [111]  [1000/1251]  eta: 0:02:12  lr: 0.003030  min_lr: 0.003030  loss: 3.4234 (3.2422)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7584 (0.8844)  time: 0.5243  data: 0.0004  max mem: 43713
Epoch: [111]  [1200/1251]  eta: 0:00:26  lr: 0.003027  min_lr: 0.003027  loss: 3.4450 (3.2450)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9056 (0.8874)  time: 0.5222  data: 0.0004  max mem: 43713
Epoch: [111]  [1250/1251]  eta: 0:00:00  lr: 0.003026  min_lr: 0.003026  loss: 3.4652 (3.2447)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7423 (0.8836)  time: 0.4432  data: 0.0005  max mem: 43713
Epoch: [111] Total time: 0:10:58 (0.5262 s / it)
Averaged stats: lr: 0.003026  min_lr: 0.003026  loss: 3.4652 (3.2558)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7423 (0.8836)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.6961 (0.6961)  acc1: 86.4000 (86.4000)  acc5: 98.4000 (98.4000)  time: 5.6410  data: 5.3580  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8450 (0.8650)  acc1: 84.0000 (82.9818)  acc5: 97.6000 (97.2364)  time: 0.7536  data: 0.4873  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0949 (1.0130)  acc1: 77.2000 (79.6381)  acc5: 94.8000 (95.2571)  time: 0.2648  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.1386 (1.0309)  acc1: 77.2000 (79.0880)  acc5: 93.6000 (95.0560)  time: 0.2648  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4854 s / it)
* Acc@1 79.612 Acc@5 95.262 loss 1.025
Accuracy of the model on the 50000 test images: 79.6%
Max accuracy: 79.79%
Epoch: [112]  [   0/1251]  eta: 1:17:09  lr: 0.003026  min_lr: 0.003026  loss: 3.5290 (3.5290)  weight_decay: 0.0500 (0.0500)  time: 3.7006  data: 1.8407  max mem: 43713
Epoch: [112]  [ 200/1251]  eta: 0:09:26  lr: 0.003023  min_lr: 0.003023  loss: 3.0422 (3.2593)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8979 (0.9371)  time: 0.5223  data: 0.0004  max mem: 43713
Epoch: [112]  [ 400/1251]  eta: 0:07:32  lr: 0.003020  min_lr: 0.003020  loss: 3.3836 (3.2573)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7571 (0.9689)  time: 0.5231  data: 0.0004  max mem: 43713
Epoch: [112]  [ 600/1251]  eta: 0:05:44  lr: 0.003017  min_lr: 0.003017  loss: 3.0566 (3.2398)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9158 (0.9512)  time: 0.5243  data: 0.0004  max mem: 43713
Epoch: [112]  [ 800/1251]  eta: 0:03:58  lr: 0.003014  min_lr: 0.003014  loss: 3.2040 (3.2526)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7940 (0.9563)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [112]  [1000/1251]  eta: 0:02:12  lr: 0.003011  min_lr: 0.003011  loss: 3.2698 (3.2499)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7419 (0.9444)  time: 0.5314  data: 0.0004  max mem: 43713
Epoch: [112]  [1200/1251]  eta: 0:00:26  lr: 0.003007  min_lr: 0.003007  loss: 3.3848 (3.2545)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8071 (0.9326)  time: 0.5238  data: 0.0004  max mem: 43713
Epoch: [112]  [1250/1251]  eta: 0:00:00  lr: 0.003007  min_lr: 0.003007  loss: 3.2485 (3.2518)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7700 (0.9272)  time: 0.4434  data: 0.0005  max mem: 43713
Epoch: [112] Total time: 0:10:58 (0.5262 s / it)
Averaged stats: lr: 0.003007  min_lr: 0.003007  loss: 3.2485 (3.2536)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7700 (0.9272)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.6723 (0.6723)  acc1: 86.8000 (86.8000)  acc5: 99.2000 (99.2000)  time: 5.3929  data: 5.0954  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.8397 (0.8174)  acc1: 83.6000 (83.2727)  acc5: 97.2000 (97.1636)  time: 0.7309  data: 0.4635  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0022 (0.9660)  acc1: 78.4000 (79.7905)  acc5: 94.4000 (95.2381)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0801 (0.9762)  acc1: 78.0000 (79.4240)  acc5: 94.4000 (95.1840)  time: 0.2645  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4741 s / it)
* Acc@1 79.476 Acc@5 95.234 loss 0.975
Accuracy of the model on the 50000 test images: 79.5%
Max accuracy: 79.79%
Epoch: [113]  [   0/1251]  eta: 1:14:10  lr: 0.003007  min_lr: 0.003007  loss: 3.4843 (3.4843)  weight_decay: 0.0500 (0.0500)  time: 3.5572  data: 2.3257  max mem: 43713
Epoch: [113]  [ 200/1251]  eta: 0:09:27  lr: 0.003004  min_lr: 0.003004  loss: 2.9017 (3.2094)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0267 (0.9563)  time: 0.5303  data: 0.0004  max mem: 43713
Epoch: [113]  [ 400/1251]  eta: 0:07:33  lr: 0.003000  min_lr: 0.003000  loss: 3.3084 (3.1862)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6754 (0.9831)  time: 0.5220  data: 0.0004  max mem: 43713
Epoch: [113]  [ 600/1251]  eta: 0:05:44  lr: 0.002997  min_lr: 0.002997  loss: 3.3165 (3.2062)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7070 (0.9418)  time: 0.5221  data: 0.0004  max mem: 43713
Epoch: [113]  [ 800/1251]  eta: 0:03:58  lr: 0.002994  min_lr: 0.002994  loss: 3.3223 (3.2245)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9777 (0.9394)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [113]  [1000/1251]  eta: 0:02:12  lr: 0.002991  min_lr: 0.002991  loss: 3.1777 (3.2223)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7592 (0.9263)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [113]  [1200/1251]  eta: 0:00:26  lr: 0.002988  min_lr: 0.002988  loss: 3.3516 (3.2288)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8636 (0.9248)  time: 0.5231  data: 0.0005  max mem: 43713
Epoch: [113]  [1250/1251]  eta: 0:00:00  lr: 0.002987  min_lr: 0.002987  loss: 3.2443 (3.2277)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6360 (0.9146)  time: 0.4434  data: 0.0007  max mem: 43713
Epoch: [113] Total time: 0:10:57 (0.5257 s / it)
Averaged stats: lr: 0.002987  min_lr: 0.002987  loss: 3.2443 (3.2503)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6360 (0.9146)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6665 (0.6665)  acc1: 87.6000 (87.6000)  acc5: 98.8000 (98.8000)  time: 5.5680  data: 5.2657  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8621 (0.8390)  acc1: 83.6000 (83.4182)  acc5: 97.6000 (97.3091)  time: 0.7468  data: 0.4790  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0714 (1.0006)  acc1: 78.0000 (79.5238)  acc5: 94.8000 (95.4476)  time: 0.2647  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0999 (1.0070)  acc1: 76.8000 (79.4080)  acc5: 94.8000 (95.3600)  time: 0.2647  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4830 s / it)
* Acc@1 79.692 Acc@5 95.326 loss 0.990
Accuracy of the model on the 50000 test images: 79.7%
Max accuracy: 79.79%
Epoch: [114]  [   0/1251]  eta: 1:13:13  lr: 0.002987  min_lr: 0.002987  loss: 2.9995 (2.9995)  weight_decay: 0.0500 (0.0500)  time: 3.5120  data: 2.8988  max mem: 43713
Epoch: [114]  [ 200/1251]  eta: 0:09:30  lr: 0.002984  min_lr: 0.002984  loss: 2.9011 (3.2196)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8156 (0.9521)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [114]  [ 400/1251]  eta: 0:07:33  lr: 0.002981  min_lr: 0.002981  loss: 3.2201 (3.2336)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7142 (0.9154)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [114]  [ 600/1251]  eta: 0:05:44  lr: 0.002978  min_lr: 0.002978  loss: 3.3245 (3.2390)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8550 (0.9207)  time: 0.5281  data: 0.0004  max mem: 43713
Epoch: [114]  [ 800/1251]  eta: 0:03:58  lr: 0.002975  min_lr: 0.002975  loss: 3.2341 (3.2401)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8083 (0.9395)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [114]  [1000/1251]  eta: 0:02:12  lr: 0.002972  min_lr: 0.002972  loss: 3.2528 (3.2342)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7062 (0.9286)  time: 0.5219  data: 0.0004  max mem: 43713
Epoch: [114]  [1200/1251]  eta: 0:00:26  lr: 0.002968  min_lr: 0.002968  loss: 3.4327 (3.2390)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9642 (0.9370)  time: 0.5233  data: 0.0004  max mem: 43713
Epoch: [114]  [1250/1251]  eta: 0:00:00  lr: 0.002968  min_lr: 0.002968  loss: 3.1325 (3.2383)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0693 (0.9433)  time: 0.4492  data: 0.0005  max mem: 43713
Epoch: [114] Total time: 0:10:58 (0.5263 s / it)
Averaged stats: lr: 0.002968  min_lr: 0.002968  loss: 3.1325 (3.2505)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0693 (0.9433)
Test:  [ 0/25]  eta: 0:01:57  loss: 0.6294 (0.6294)  acc1: 88.0000 (88.0000)  acc5: 98.0000 (98.0000)  time: 4.6903  data: 4.3573  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7986 (0.7766)  acc1: 85.2000 (83.9636)  acc5: 97.2000 (97.2727)  time: 0.6743  data: 0.4039  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9318 (0.9398)  acc1: 78.0000 (80.2095)  acc5: 95.2000 (95.2762)  time: 0.2684  data: 0.0043  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0177 (0.9527)  acc1: 78.0000 (79.9200)  acc5: 94.8000 (95.2000)  time: 0.2642  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4514 s / it)
* Acc@1 79.820 Acc@5 95.368 loss 0.943
Accuracy of the model on the 50000 test images: 79.8%
Max accuracy: 79.82%
Epoch: [115]  [   0/1251]  eta: 1:03:08  lr: 0.002968  min_lr: 0.002968  loss: 3.2346 (3.2346)  weight_decay: 0.0500 (0.0500)  time: 3.0283  data: 2.4939  max mem: 43713
Epoch: [115]  [ 200/1251]  eta: 0:09:23  lr: 0.002965  min_lr: 0.002965  loss: 3.3127 (3.1942)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8594 (0.9127)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [115]  [ 400/1251]  eta: 0:07:31  lr: 0.002961  min_lr: 0.002961  loss: 3.1902 (3.1964)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8212 (0.8983)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [115]  [ 600/1251]  eta: 0:05:44  lr: 0.002958  min_lr: 0.002958  loss: 3.3318 (3.2136)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8077 (0.9296)  time: 0.5221  data: 0.0005  max mem: 43713
Epoch: [115]  [ 800/1251]  eta: 0:03:58  lr: 0.002955  min_lr: 0.002955  loss: 3.4105 (3.2091)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9883 (0.9260)  time: 0.5230  data: 0.0005  max mem: 43713
Epoch: [115]  [1000/1251]  eta: 0:02:12  lr: 0.002952  min_lr: 0.002952  loss: 3.0725 (3.2065)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0033 (0.9612)  time: 0.5281  data: 0.0005  max mem: 43713
Epoch: [115]  [1200/1251]  eta: 0:00:26  lr: 0.002949  min_lr: 0.002949  loss: 3.1242 (3.2144)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7870 (0.9376)  time: 0.5224  data: 0.0005  max mem: 43713
Epoch: [115]  [1250/1251]  eta: 0:00:00  lr: 0.002948  min_lr: 0.002948  loss: 3.3571 (3.2125)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7870 (0.9397)  time: 0.4432  data: 0.0006  max mem: 43713
Epoch: [115] Total time: 0:10:58 (0.5261 s / it)
Averaged stats: lr: 0.002948  min_lr: 0.002948  loss: 3.3571 (3.2371)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7870 (0.9397)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6214 (0.6214)  acc1: 87.6000 (87.6000)  acc5: 98.8000 (98.8000)  time: 5.5177  data: 5.2135  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7803 (0.7851)  acc1: 83.2000 (83.7091)  acc5: 97.2000 (97.2364)  time: 0.7422  data: 0.4742  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9398 (0.9468)  acc1: 78.8000 (80.2476)  acc5: 94.8000 (95.0667)  time: 0.2647  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0746 (0.9591)  acc1: 77.2000 (79.8720)  acc5: 94.0000 (94.9600)  time: 0.2647  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4801 s / it)
* Acc@1 79.828 Acc@5 95.366 loss 0.953
Accuracy of the model on the 50000 test images: 79.8%
Max accuracy: 79.83%
Epoch: [116]  [   0/1251]  eta: 0:52:52  lr: 0.002948  min_lr: 0.002948  loss: 3.6754 (3.6754)  weight_decay: 0.0500 (0.0500)  time: 2.5362  data: 2.0004  max mem: 43713
Epoch: [116]  [ 200/1251]  eta: 0:09:20  lr: 0.002945  min_lr: 0.002945  loss: 3.1547 (3.2082)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7573 (0.8112)  time: 0.5235  data: 0.0005  max mem: 43713
Epoch: [116]  [ 400/1251]  eta: 0:07:31  lr: 0.002942  min_lr: 0.002942  loss: 3.4108 (3.2090)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7309 (0.8997)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [116]  [ 600/1251]  eta: 0:05:44  lr: 0.002938  min_lr: 0.002938  loss: 3.0493 (3.2148)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8625 (0.9262)  time: 0.5237  data: 0.0005  max mem: 43713
Epoch: [116]  [ 800/1251]  eta: 0:03:57  lr: 0.002935  min_lr: 0.002935  loss: 3.3436 (3.2231)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9429 (0.9289)  time: 0.5233  data: 0.0005  max mem: 43713
Epoch: [116]  [1000/1251]  eta: 0:02:12  lr: 0.002932  min_lr: 0.002932  loss: 3.2522 (3.2278)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8595 (0.9136)  time: 0.5252  data: 0.0005  max mem: 43713
Epoch: [116]  [1200/1251]  eta: 0:00:26  lr: 0.002929  min_lr: 0.002929  loss: 3.3310 (3.2297)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7138 (0.9177)  time: 0.5228  data: 0.0005  max mem: 43713
Epoch: [116]  [1250/1251]  eta: 0:00:00  lr: 0.002928  min_lr: 0.002928  loss: 3.5221 (3.2332)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6502 (0.9119)  time: 0.4436  data: 0.0006  max mem: 43713
Epoch: [116] Total time: 0:10:58 (0.5261 s / it)
Averaged stats: lr: 0.002928  min_lr: 0.002928  loss: 3.5221 (3.2432)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6502 (0.9119)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.7339 (0.7339)  acc1: 86.0000 (86.0000)  acc5: 99.2000 (99.2000)  time: 5.4156  data: 5.0977  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.8515 (0.8479)  acc1: 82.4000 (82.6182)  acc5: 97.2000 (97.3091)  time: 0.7330  data: 0.4637  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0163 (1.0156)  acc1: 77.2000 (79.5429)  acc5: 94.8000 (95.2000)  time: 0.2648  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.1102 (1.0212)  acc1: 77.2000 (79.3760)  acc5: 94.0000 (95.1680)  time: 0.2647  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4756 s / it)
* Acc@1 79.928 Acc@5 95.446 loss 1.005
Accuracy of the model on the 50000 test images: 79.9%
Max accuracy: 79.93%
Epoch: [117]  [   0/1251]  eta: 0:53:22  lr: 0.002928  min_lr: 0.002928  loss: 3.5666 (3.5666)  weight_decay: 0.0500 (0.0500)  time: 2.5601  data: 2.0177  max mem: 43713
Epoch: [117]  [ 200/1251]  eta: 0:09:28  lr: 0.002925  min_lr: 0.002925  loss: 3.4193 (3.2456)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0575 (0.9526)  time: 0.5394  data: 0.0005  max mem: 43713
Epoch: [117]  [ 400/1251]  eta: 0:07:32  lr: 0.002922  min_lr: 0.002922  loss: 3.3983 (3.2336)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6200 (0.8634)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [117]  [ 600/1251]  eta: 0:05:44  lr: 0.002919  min_lr: 0.002919  loss: 3.3198 (3.2352)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7030 (0.8507)  time: 0.5232  data: 0.0004  max mem: 43713
Epoch: [117]  [ 800/1251]  eta: 0:03:58  lr: 0.002915  min_lr: 0.002915  loss: 3.3829 (3.2320)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7771 (0.8532)  time: 0.5221  data: 0.0004  max mem: 43713
Epoch: [117]  [1000/1251]  eta: 0:02:12  lr: 0.002912  min_lr: 0.002912  loss: 3.2916 (3.2368)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8720 (0.8667)  time: 0.5232  data: 0.0004  max mem: 43713
Epoch: [117]  [1200/1251]  eta: 0:00:26  lr: 0.002909  min_lr: 0.002909  loss: 3.4943 (3.2348)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7662 (0.9050)  time: 0.5298  data: 0.0005  max mem: 43713
Epoch: [117]  [1250/1251]  eta: 0:00:00  lr: 0.002908  min_lr: 0.002908  loss: 3.0147 (3.2332)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7836 (0.9011)  time: 0.4476  data: 0.0005  max mem: 43713
Epoch: [117] Total time: 0:10:57 (0.5260 s / it)
Averaged stats: lr: 0.002908  min_lr: 0.002908  loss: 3.0147 (3.2308)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7836 (0.9011)
Test:  [ 0/25]  eta: 0:01:37  loss: 0.6108 (0.6108)  acc1: 86.8000 (86.8000)  acc5: 98.0000 (98.0000)  time: 3.9155  data: 3.5764  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7523 (0.7694)  acc1: 84.4000 (83.3455)  acc5: 98.0000 (97.5636)  time: 0.6936  data: 0.4227  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9406 (0.9334)  acc1: 78.0000 (80.0571)  acc5: 94.8000 (95.5238)  time: 0.3177  data: 0.0537  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0819 (0.9446)  acc1: 76.8000 (79.6640)  acc5: 94.0000 (95.3280)  time: 0.2641  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4570 s / it)
* Acc@1 79.800 Acc@5 95.502 loss 0.942
Accuracy of the model on the 50000 test images: 79.8%
Max accuracy: 79.93%
Epoch: [118]  [   0/1251]  eta: 1:18:15  lr: 0.002908  min_lr: 0.002908  loss: 2.3934 (2.3934)  weight_decay: 0.0500 (0.0500)  time: 3.7536  data: 3.1152  max mem: 43713
Epoch: [118]  [ 200/1251]  eta: 0:09:30  lr: 0.002905  min_lr: 0.002905  loss: 3.4865 (3.2222)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7682 (0.8457)  time: 0.5233  data: 0.0004  max mem: 43713
Epoch: [118]  [ 400/1251]  eta: 0:07:33  lr: 0.002902  min_lr: 0.002902  loss: 3.5104 (3.2495)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6707 (0.8668)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [118]  [ 600/1251]  eta: 0:05:46  lr: 0.002899  min_lr: 0.002899  loss: 3.4510 (3.2503)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9613 (0.9078)  time: 0.5241  data: 0.0004  max mem: 43713
Epoch: [118]  [ 800/1251]  eta: 0:03:58  lr: 0.002895  min_lr: 0.002895  loss: 3.2918 (3.2411)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8038 (0.9040)  time: 0.5231  data: 0.0004  max mem: 43713
Epoch: [118]  [1000/1251]  eta: 0:02:12  lr: 0.002892  min_lr: 0.002892  loss: 3.3359 (3.2449)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7305 (0.9100)  time: 0.5225  data: 0.0005  max mem: 43713
Epoch: [118]  [1200/1251]  eta: 0:00:26  lr: 0.002889  min_lr: 0.002889  loss: 3.3111 (3.2494)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7367 (0.9134)  time: 0.5235  data: 0.0004  max mem: 43713
Epoch: [118]  [1250/1251]  eta: 0:00:00  lr: 0.002888  min_lr: 0.002888  loss: 2.8579 (3.2450)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7564 (0.9133)  time: 0.4434  data: 0.0005  max mem: 43713
Epoch: [118] Total time: 0:10:59 (0.5274 s / it)
Averaged stats: lr: 0.002888  min_lr: 0.002888  loss: 2.8579 (3.2403)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7564 (0.9133)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6335 (0.6335)  acc1: 86.0000 (86.0000)  acc5: 98.0000 (98.0000)  time: 5.5564  data: 5.2727  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7822 (0.7679)  acc1: 83.2000 (83.6727)  acc5: 98.0000 (97.4546)  time: 0.7457  data: 0.4796  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9474 (0.9298)  acc1: 78.8000 (79.7524)  acc5: 94.8000 (95.3714)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0478 (0.9412)  acc1: 76.8000 (79.3600)  acc5: 94.0000 (95.2480)  time: 0.2644  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4820 s / it)
* Acc@1 79.978 Acc@5 95.416 loss 0.929
Accuracy of the model on the 50000 test images: 80.0%
Max accuracy: 79.98%
Epoch: [119]  [   0/1251]  eta: 0:58:52  lr: 0.002888  min_lr: 0.002888  loss: 3.6583 (3.6583)  weight_decay: 0.0500 (0.0500)  time: 2.8237  data: 2.2979  max mem: 43713
Epoch: [119]  [ 200/1251]  eta: 0:09:23  lr: 0.002885  min_lr: 0.002885  loss: 3.0135 (3.2026)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7042 (0.8985)  time: 0.5232  data: 0.0004  max mem: 43713
Epoch: [119]  [ 400/1251]  eta: 0:07:32  lr: 0.002882  min_lr: 0.002882  loss: 3.3562 (3.2153)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0201 (0.9538)  time: 0.5397  data: 0.0005  max mem: 43713
Epoch: [119]  [ 600/1251]  eta: 0:05:44  lr: 0.002879  min_lr: 0.002879  loss: 3.2873 (3.2099)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [119]  [ 800/1251]  eta: 0:03:58  lr: 0.002875  min_lr: 0.002875  loss: 3.1896 (3.2007)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8447 (nan)  time: 0.5238  data: 0.0004  max mem: 43713
Epoch: [119]  [1000/1251]  eta: 0:02:12  lr: 0.002872  min_lr: 0.002872  loss: 3.4028 (3.2153)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7234 (nan)  time: 0.5242  data: 0.0005  max mem: 43713
Epoch: [119]  [1200/1251]  eta: 0:00:26  lr: 0.002869  min_lr: 0.002869  loss: 3.3225 (3.2183)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7386 (nan)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [119]  [1250/1251]  eta: 0:00:00  lr: 0.002868  min_lr: 0.002868  loss: 3.3845 (3.2216)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7577 (nan)  time: 0.4435  data: 0.0005  max mem: 43713
Epoch: [119] Total time: 0:10:58 (0.5264 s / it)
Averaged stats: lr: 0.002868  min_lr: 0.002868  loss: 3.3845 (3.2293)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7577 (nan)
Test:  [ 0/25]  eta: 0:02:05  loss: 0.6919 (0.6919)  acc1: 88.0000 (88.0000)  acc5: 98.0000 (98.0000)  time: 5.0115  data: 4.6961  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.8377 (0.8463)  acc1: 84.8000 (83.6000)  acc5: 97.6000 (97.3818)  time: 0.6962  data: 0.4272  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0074 (1.0083)  acc1: 78.0000 (80.1524)  acc5: 94.8000 (95.3333)  time: 0.2646  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0668 (1.0137)  acc1: 77.6000 (80.0000)  acc5: 93.6000 (95.2480)  time: 0.2647  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4597 s / it)
* Acc@1 79.842 Acc@5 95.296 loss 1.009
Accuracy of the model on the 50000 test images: 79.8%
Max accuracy: 79.98%
Epoch: [120]  [   0/1251]  eta: 1:12:46  lr: 0.002868  min_lr: 0.002868  loss: 3.4598 (3.4598)  weight_decay: 0.0500 (0.0500)  time: 3.4907  data: 2.8369  max mem: 43713
Epoch: [120]  [ 200/1251]  eta: 0:09:28  lr: 0.002865  min_lr: 0.002865  loss: 3.5098 (3.2380)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8292 (0.9129)  time: 0.5248  data: 0.0005  max mem: 43713
Epoch: [120]  [ 400/1251]  eta: 0:07:34  lr: 0.002862  min_lr: 0.002862  loss: 3.2702 (3.2542)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8410 (0.9343)  time: 0.5228  data: 0.0005  max mem: 43713
Epoch: [120]  [ 600/1251]  eta: 0:05:45  lr: 0.002858  min_lr: 0.002858  loss: 3.3462 (3.2413)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7981 (0.9179)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [120]  [ 800/1251]  eta: 0:03:58  lr: 0.002855  min_lr: 0.002855  loss: 3.4238 (3.2530)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7596 (0.9361)  time: 0.5309  data: 0.0004  max mem: 43713
Epoch: [120]  [1000/1251]  eta: 0:02:12  lr: 0.002852  min_lr: 0.002852  loss: 3.3634 (3.2522)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9081 (0.9273)  time: 0.5232  data: 0.0004  max mem: 43713
Epoch: [120]  [1200/1251]  eta: 0:00:26  lr: 0.002849  min_lr: 0.002849  loss: 3.4717 (3.2503)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7567 (0.9237)  time: 0.5235  data: 0.0004  max mem: 43713
Epoch: [120]  [1250/1251]  eta: 0:00:00  lr: 0.002848  min_lr: 0.002848  loss: 3.2921 (3.2500)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8363 (0.9355)  time: 0.4436  data: 0.0007  max mem: 43713
Epoch: [120] Total time: 0:10:59 (0.5268 s / it)
Averaged stats: lr: 0.002848  min_lr: 0.002848  loss: 3.2921 (3.2295)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8363 (0.9355)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.7161 (0.7161)  acc1: 87.6000 (87.6000)  acc5: 98.4000 (98.4000)  time: 5.4913  data: 5.1902  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8192 (0.8200)  acc1: 84.0000 (83.0545)  acc5: 98.0000 (97.3091)  time: 0.7400  data: 0.4721  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9992 (0.9625)  acc1: 77.6000 (80.1524)  acc5: 95.2000 (95.4857)  time: 0.2649  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0925 (0.9768)  acc1: 77.2000 (79.7120)  acc5: 94.4000 (95.3280)  time: 0.2649  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4784 s / it)
* Acc@1 79.952 Acc@5 95.352 loss 0.965
Accuracy of the model on the 50000 test images: 80.0%
Max accuracy: 79.98%
Epoch: [121]  [   0/1251]  eta: 1:15:31  lr: 0.002848  min_lr: 0.002848  loss: 3.1418 (3.1418)  weight_decay: 0.0500 (0.0500)  time: 3.6222  data: 3.0233  max mem: 43713
Epoch: [121]  [ 200/1251]  eta: 0:09:30  lr: 0.002845  min_lr: 0.002845  loss: 3.0740 (3.2105)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8043 (0.9123)  time: 0.5236  data: 0.0004  max mem: 43713
Epoch: [121]  [ 400/1251]  eta: 0:07:34  lr: 0.002841  min_lr: 0.002841  loss: 3.3690 (3.2329)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7442 (0.9296)  time: 0.5233  data: 0.0004  max mem: 43713
Epoch: [121]  [ 600/1251]  eta: 0:05:45  lr: 0.002838  min_lr: 0.002838  loss: 3.4124 (3.1987)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6659 (0.9169)  time: 0.5420  data: 0.0005  max mem: 43713
Epoch: [121]  [ 800/1251]  eta: 0:03:58  lr: 0.002835  min_lr: 0.002835  loss: 3.3093 (3.2080)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8048 (0.9215)  time: 0.5236  data: 0.0004  max mem: 43713
Epoch: [121]  [1000/1251]  eta: 0:02:12  lr: 0.002831  min_lr: 0.002831  loss: 3.1044 (3.2146)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1128 (0.9256)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [121]  [1200/1251]  eta: 0:00:26  lr: 0.002828  min_lr: 0.002828  loss: 3.1267 (3.2175)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7104 (0.9066)  time: 0.5230  data: 0.0005  max mem: 43713
Epoch: [121]  [1250/1251]  eta: 0:00:00  lr: 0.002827  min_lr: 0.002827  loss: 3.2707 (3.2197)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6527 (0.9019)  time: 0.4437  data: 0.0007  max mem: 43713
Epoch: [121] Total time: 0:10:59 (0.5274 s / it)
Averaged stats: lr: 0.002827  min_lr: 0.002827  loss: 3.2707 (3.2162)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6527 (0.9019)
Test:  [ 0/25]  eta: 0:02:11  loss: 0.7072 (0.7072)  acc1: 87.6000 (87.6000)  acc5: 98.8000 (98.8000)  time: 5.2646  data: 4.9631  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.8891 (0.8740)  acc1: 84.0000 (83.2364)  acc5: 97.6000 (97.6000)  time: 0.7193  data: 0.4515  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0557 (1.0298)  acc1: 76.8000 (79.5238)  acc5: 95.2000 (95.5048)  time: 0.2648  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.1195 (1.0410)  acc1: 76.8000 (79.2800)  acc5: 94.0000 (95.3120)  time: 0.2648  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4711 s / it)
* Acc@1 79.794 Acc@5 95.362 loss 1.028
Accuracy of the model on the 50000 test images: 79.8%
Max accuracy: 79.98%
Epoch: [122]  [   0/1251]  eta: 1:12:45  lr: 0.002827  min_lr: 0.002827  loss: 2.5256 (2.5256)  weight_decay: 0.0500 (0.0500)  time: 3.4898  data: 2.4942  max mem: 43713
Epoch: [122]  [ 200/1251]  eta: 0:09:26  lr: 0.002824  min_lr: 0.002824  loss: 3.1464 (3.2027)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9149 (0.9684)  time: 0.5234  data: 0.0005  max mem: 43713
Epoch: [122]  [ 400/1251]  eta: 0:07:32  lr: 0.002821  min_lr: 0.002821  loss: 3.3951 (3.2110)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7773 (0.9867)  time: 0.5308  data: 0.0004  max mem: 43713
Epoch: [122]  [ 600/1251]  eta: 0:05:45  lr: 0.002818  min_lr: 0.002818  loss: 3.1687 (3.1853)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8209 (0.9474)  time: 0.5298  data: 0.0004  max mem: 43713
Epoch: [122]  [ 800/1251]  eta: 0:03:58  lr: 0.002814  min_lr: 0.002814  loss: 3.4194 (3.1897)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7800 (0.9580)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [122]  [1000/1251]  eta: 0:02:12  lr: 0.002811  min_lr: 0.002811  loss: 3.3960 (3.1843)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7082 (0.9501)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [122]  [1200/1251]  eta: 0:00:26  lr: 0.002808  min_lr: 0.002808  loss: 3.3765 (3.2088)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8204 (0.9359)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [122]  [1250/1251]  eta: 0:00:00  lr: 0.002807  min_lr: 0.002807  loss: 3.2717 (3.2089)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0235 (0.9410)  time: 0.4434  data: 0.0005  max mem: 43713
Epoch: [122] Total time: 0:10:58 (0.5267 s / it)
Averaged stats: lr: 0.002807  min_lr: 0.002807  loss: 3.2717 (3.2266)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0235 (0.9410)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.7399 (0.7399)  acc1: 85.6000 (85.6000)  acc5: 98.4000 (98.4000)  time: 5.5471  data: 5.2591  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8308 (0.8597)  acc1: 84.4000 (82.7636)  acc5: 97.2000 (97.2000)  time: 0.7452  data: 0.4784  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0586 (0.9948)  acc1: 76.8000 (79.8286)  acc5: 95.2000 (95.5238)  time: 0.2649  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0795 (1.0057)  acc1: 76.8000 (79.3600)  acc5: 94.8000 (95.3920)  time: 0.2650  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4810 s / it)
* Acc@1 79.996 Acc@5 95.492 loss 0.996
Accuracy of the model on the 50000 test images: 80.0%
Max accuracy: 80.00%
Epoch: [123]  [   0/1251]  eta: 0:55:17  lr: 0.002807  min_lr: 0.002807  loss: 3.4080 (3.4080)  weight_decay: 0.0500 (0.0500)  time: 2.6520  data: 2.1094  max mem: 43713
Epoch: [123]  [ 200/1251]  eta: 0:09:22  lr: 0.002804  min_lr: 0.002804  loss: 3.0928 (3.1668)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9225 (0.9095)  time: 0.5346  data: 0.0005  max mem: 43713
Epoch: [123]  [ 400/1251]  eta: 0:07:32  lr: 0.002800  min_lr: 0.002800  loss: 3.4115 (3.1923)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8984 (0.9367)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [123]  [ 600/1251]  eta: 0:05:44  lr: 0.002797  min_lr: 0.002797  loss: 3.3569 (3.2116)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1270 (0.9503)  time: 0.5233  data: 0.0004  max mem: 43713
Epoch: [123]  [ 800/1251]  eta: 0:03:58  lr: 0.002794  min_lr: 0.002794  loss: 3.2121 (3.2082)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7490 (0.9580)  time: 0.5303  data: 0.0004  max mem: 43713
Epoch: [123]  [1000/1251]  eta: 0:02:12  lr: 0.002790  min_lr: 0.002790  loss: 3.4768 (3.2191)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8933 (0.9575)  time: 0.5240  data: 0.0004  max mem: 43713
Epoch: [123]  [1200/1251]  eta: 0:00:26  lr: 0.002787  min_lr: 0.002787  loss: 3.2634 (3.2265)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6689 (0.9276)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [123]  [1250/1251]  eta: 0:00:00  lr: 0.002786  min_lr: 0.002786  loss: 3.1447 (3.2255)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7418 (0.9216)  time: 0.4433  data: 0.0005  max mem: 43713
Epoch: [123] Total time: 0:10:57 (0.5255 s / it)
Averaged stats: lr: 0.002786  min_lr: 0.002786  loss: 3.1447 (3.2236)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7418 (0.9216)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.6374 (0.6374)  acc1: 88.0000 (88.0000)  acc5: 98.0000 (98.0000)  time: 5.2969  data: 4.9886  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7908 (0.8035)  acc1: 85.2000 (83.5273)  acc5: 96.4000 (96.6909)  time: 0.7221  data: 0.4538  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0454 (0.9407)  acc1: 77.6000 (79.8286)  acc5: 95.2000 (95.2952)  time: 0.2646  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0195 (0.9501)  acc1: 77.6000 (79.7120)  acc5: 95.2000 (95.2640)  time: 0.2646  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4706 s / it)
* Acc@1 80.170 Acc@5 95.526 loss 0.941
Accuracy of the model on the 50000 test images: 80.2%
Max accuracy: 80.17%
Epoch: [124]  [   0/1251]  eta: 0:51:01  lr: 0.002786  min_lr: 0.002786  loss: 3.7333 (3.7333)  weight_decay: 0.0500 (0.0500)  time: 2.4474  data: 1.9169  max mem: 43713
Epoch: [124]  [ 200/1251]  eta: 0:09:26  lr: 0.002783  min_lr: 0.002783  loss: 3.3546 (3.2056)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8389 (0.9336)  time: 0.5237  data: 0.0004  max mem: 43713
Epoch: [124]  [ 400/1251]  eta: 0:07:31  lr: 0.002780  min_lr: 0.002780  loss: 3.3030 (3.1941)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7220 (nan)  time: 0.5222  data: 0.0004  max mem: 43713
Epoch: [124]  [ 600/1251]  eta: 0:05:44  lr: 0.002776  min_lr: 0.002776  loss: 3.4147 (3.2015)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1158 (nan)  time: 0.5230  data: 0.0005  max mem: 43713
Epoch: [124]  [ 800/1251]  eta: 0:03:58  lr: 0.002773  min_lr: 0.002773  loss: 3.3753 (3.2118)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9503 (nan)  time: 0.5237  data: 0.0004  max mem: 43713
Epoch: [124]  [1000/1251]  eta: 0:02:12  lr: 0.002770  min_lr: 0.002770  loss: 3.3428 (3.2208)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6088 (nan)  time: 0.5232  data: 0.0004  max mem: 43713
Epoch: [124]  [1200/1251]  eta: 0:00:26  lr: 0.002766  min_lr: 0.002766  loss: 3.3501 (3.2232)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9744 (nan)  time: 0.5368  data: 0.0004  max mem: 43713
Epoch: [124]  [1250/1251]  eta: 0:00:00  lr: 0.002766  min_lr: 0.002766  loss: 3.3205 (3.2181)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9298 (nan)  time: 0.4436  data: 0.0005  max mem: 43713
Epoch: [124] Total time: 0:10:59 (0.5271 s / it)
Averaged stats: lr: 0.002766  min_lr: 0.002766  loss: 3.3205 (3.2176)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9298 (nan)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.6081 (0.6081)  acc1: 87.6000 (87.6000)  acc5: 98.0000 (98.0000)  time: 5.4316  data: 5.1121  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7388 (0.7718)  acc1: 84.4000 (83.6000)  acc5: 98.0000 (97.2000)  time: 0.7338  data: 0.4651  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9737 (0.9164)  acc1: 78.4000 (80.0952)  acc5: 94.0000 (95.4476)  time: 0.2638  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9849 (0.9283)  acc1: 77.2000 (79.5840)  acc5: 95.2000 (95.4880)  time: 0.2637  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4738 s / it)
* Acc@1 79.904 Acc@5 95.544 loss 0.918
Accuracy of the model on the 50000 test images: 79.9%
Max accuracy: 80.17%
Epoch: [125]  [   0/1251]  eta: 1:13:57  lr: 0.002766  min_lr: 0.002766  loss: 3.3992 (3.3992)  weight_decay: 0.0500 (0.0500)  time: 3.5470  data: 1.6913  max mem: 43713
Epoch: [125]  [ 200/1251]  eta: 0:09:26  lr: 0.002762  min_lr: 0.002762  loss: 3.5038 (3.2363)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0674 (1.0695)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [125]  [ 400/1251]  eta: 0:07:32  lr: 0.002759  min_lr: 0.002759  loss: 3.3005 (3.2309)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8296 (0.9818)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [125]  [ 600/1251]  eta: 0:05:45  lr: 0.002756  min_lr: 0.002756  loss: 3.3653 (3.2336)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0610 (0.9630)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [125]  [ 800/1251]  eta: 0:03:58  lr: 0.002752  min_lr: 0.002752  loss: 3.0474 (3.2354)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7716 (0.9521)  time: 0.5249  data: 0.0004  max mem: 43713
Epoch: [125]  [1000/1251]  eta: 0:02:12  lr: 0.002749  min_lr: 0.002749  loss: 3.5071 (3.2300)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8207 (0.9305)  time: 0.5331  data: 0.0005  max mem: 43713
Epoch: [125]  [1200/1251]  eta: 0:00:26  lr: 0.002746  min_lr: 0.002746  loss: 3.2294 (3.2189)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6744 (0.9096)  time: 0.5238  data: 0.0005  max mem: 43713
Epoch: [125]  [1250/1251]  eta: 0:00:00  lr: 0.002745  min_lr: 0.002745  loss: 3.1065 (3.2175)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6719 (0.9006)  time: 0.4435  data: 0.0006  max mem: 43713
Epoch: [125] Total time: 0:10:59 (0.5270 s / it)
Averaged stats: lr: 0.002745  min_lr: 0.002745  loss: 3.1065 (3.2119)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6719 (0.9006)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.6104 (0.6104)  acc1: 87.6000 (87.6000)  acc5: 99.6000 (99.6000)  time: 5.8196  data: 5.5242  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7794 (0.7820)  acc1: 84.0000 (83.2727)  acc5: 98.0000 (97.3455)  time: 0.7695  data: 0.5025  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9976 (0.9305)  acc1: 78.4000 (80.5143)  acc5: 94.8000 (95.4095)  time: 0.2646  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0193 (0.9420)  acc1: 78.4000 (80.2560)  acc5: 94.8000 (95.4400)  time: 0.2646  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4928 s / it)
* Acc@1 80.370 Acc@5 95.610 loss 0.933
Accuracy of the model on the 50000 test images: 80.4%
Max accuracy: 80.37%
Epoch: [126]  [   0/1251]  eta: 0:58:18  lr: 0.002745  min_lr: 0.002745  loss: 3.2451 (3.2451)  weight_decay: 0.0500 (0.0500)  time: 2.7967  data: 2.2563  max mem: 43713
Epoch: [126]  [ 200/1251]  eta: 0:09:22  lr: 0.002742  min_lr: 0.002742  loss: 3.1558 (3.1960)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6733 (0.8661)  time: 0.5230  data: 0.0005  max mem: 43713
Epoch: [126]  [ 400/1251]  eta: 0:07:32  lr: 0.002738  min_lr: 0.002738  loss: 3.2966 (3.2235)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4792 (0.9620)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [126]  [ 600/1251]  eta: 0:05:44  lr: 0.002735  min_lr: 0.002735  loss: 3.5489 (3.2339)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9370 (1.0013)  time: 0.5220  data: 0.0004  max mem: 43713
Epoch: [126]  [ 800/1251]  eta: 0:03:57  lr: 0.002732  min_lr: 0.002732  loss: 3.2254 (3.2340)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8465 (0.9596)  time: 0.5280  data: 0.0005  max mem: 43713
Epoch: [126]  [1000/1251]  eta: 0:02:12  lr: 0.002728  min_lr: 0.002728  loss: 3.3194 (3.2275)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9810 (0.9310)  time: 0.5241  data: 0.0004  max mem: 43713
Epoch: [126]  [1200/1251]  eta: 0:00:26  lr: 0.002725  min_lr: 0.002725  loss: 3.2552 (3.2241)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8795 (0.9447)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [126]  [1250/1251]  eta: 0:00:00  lr: 0.002724  min_lr: 0.002724  loss: 3.2076 (3.2278)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7339 (0.9385)  time: 0.4433  data: 0.0006  max mem: 43713
Epoch: [126] Total time: 0:10:57 (0.5256 s / it)
Averaged stats: lr: 0.002724  min_lr: 0.002724  loss: 3.2076 (3.2112)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7339 (0.9385)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.6815 (0.6815)  acc1: 88.0000 (88.0000)  acc5: 98.0000 (98.0000)  time: 5.7267  data: 5.4307  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8135 (0.8342)  acc1: 84.4000 (83.7818)  acc5: 97.2000 (97.3818)  time: 0.7615  data: 0.4941  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0117 (0.9837)  acc1: 78.4000 (80.2095)  acc5: 94.8000 (95.4286)  time: 0.2649  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0917 (0.9972)  acc1: 76.8000 (79.7760)  acc5: 94.4000 (95.3280)  time: 0.2648  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4881 s / it)
* Acc@1 80.234 Acc@5 95.438 loss 0.979
Accuracy of the model on the 50000 test images: 80.2%
Max accuracy: 80.37%
Epoch: [127]  [   0/1251]  eta: 1:12:36  lr: 0.002724  min_lr: 0.002724  loss: 3.1658 (3.1658)  weight_decay: 0.0500 (0.0500)  time: 3.4820  data: 1.5213  max mem: 43713
Epoch: [127]  [ 200/1251]  eta: 0:09:30  lr: 0.002721  min_lr: 0.002721  loss: 3.1840 (3.1668)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7847 (0.8168)  time: 0.5223  data: 0.0005  max mem: 43713
Epoch: [127]  [ 400/1251]  eta: 0:07:33  lr: 0.002717  min_lr: 0.002717  loss: 3.2942 (3.1949)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1672 (0.9328)  time: 0.5222  data: 0.0004  max mem: 43713
Epoch: [127]  [ 600/1251]  eta: 0:05:44  lr: 0.002714  min_lr: 0.002714  loss: 3.2378 (3.2002)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7759 (0.9291)  time: 0.5234  data: 0.0005  max mem: 43713
Epoch: [127]  [ 800/1251]  eta: 0:03:58  lr: 0.002711  min_lr: 0.002711  loss: 3.1269 (3.2061)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6281 (0.8879)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [127]  [1000/1251]  eta: 0:02:12  lr: 0.002707  min_lr: 0.002707  loss: 3.2634 (3.1982)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9305 (0.9002)  time: 0.5231  data: 0.0004  max mem: 43713
Epoch: [127]  [1200/1251]  eta: 0:00:26  lr: 0.002704  min_lr: 0.002704  loss: 3.2429 (3.1981)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8869 (0.8965)  time: 0.5241  data: 0.0004  max mem: 43713
Epoch: [127]  [1250/1251]  eta: 0:00:00  lr: 0.002703  min_lr: 0.002703  loss: 3.1367 (3.1946)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8847 (0.9002)  time: 0.4522  data: 0.0005  max mem: 43713
Epoch: [127] Total time: 0:10:59 (0.5269 s / it)
Averaged stats: lr: 0.002703  min_lr: 0.002703  loss: 3.1367 (3.2027)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8847 (0.9002)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.5796 (0.5796)  acc1: 88.4000 (88.4000)  acc5: 99.2000 (99.2000)  time: 5.5617  data: 5.2591  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8129 (0.7686)  acc1: 83.6000 (83.9636)  acc5: 97.6000 (97.2727)  time: 0.7460  data: 0.4784  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9691 (0.9132)  acc1: 79.6000 (80.5524)  acc5: 95.6000 (95.6762)  time: 0.2642  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0048 (0.9232)  acc1: 78.4000 (80.1440)  acc5: 95.2000 (95.6160)  time: 0.2641  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4812 s / it)
* Acc@1 80.342 Acc@5 95.554 loss 0.918
Accuracy of the model on the 50000 test images: 80.3%
Max accuracy: 80.37%
Epoch: [128]  [   0/1251]  eta: 1:14:29  lr: 0.002703  min_lr: 0.002703  loss: 3.0057 (3.0057)  weight_decay: 0.0500 (0.0500)  time: 3.5726  data: 2.7695  max mem: 43713
Epoch: [128]  [ 200/1251]  eta: 0:09:29  lr: 0.002700  min_lr: 0.002700  loss: 3.2604 (3.1701)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7448 (0.8083)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [128]  [ 400/1251]  eta: 0:07:33  lr: 0.002696  min_lr: 0.002696  loss: 3.3927 (3.1677)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0526 (0.8916)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [128]  [ 600/1251]  eta: 0:05:45  lr: 0.002693  min_lr: 0.002693  loss: 3.4164 (3.1875)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8880 (0.8923)  time: 0.5238  data: 0.0004  max mem: 43713
Epoch: [128]  [ 800/1251]  eta: 0:03:58  lr: 0.002690  min_lr: 0.002690  loss: 3.3298 (3.1825)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0172 (0.9172)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [128]  [1000/1251]  eta: 0:02:12  lr: 0.002686  min_lr: 0.002686  loss: 3.3571 (3.1830)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8012 (0.9162)  time: 0.5298  data: 0.0004  max mem: 43713
Epoch: [128]  [1200/1251]  eta: 0:00:26  lr: 0.002683  min_lr: 0.002683  loss: 3.1729 (3.1837)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7295 (0.9191)  time: 0.5285  data: 0.0004  max mem: 43713
Epoch: [128]  [1250/1251]  eta: 0:00:00  lr: 0.002682  min_lr: 0.002682  loss: 3.4432 (3.1872)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8128 (0.9143)  time: 0.4434  data: 0.0005  max mem: 43713
Epoch: [128] Total time: 0:10:58 (0.5266 s / it)
Averaged stats: lr: 0.002682  min_lr: 0.002682  loss: 3.4432 (3.2018)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8128 (0.9143)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.6458 (0.6458)  acc1: 88.4000 (88.4000)  acc5: 98.8000 (98.8000)  time: 5.4474  data: 5.1523  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8389 (0.8251)  acc1: 84.8000 (83.6364)  acc5: 97.6000 (97.4545)  time: 0.7355  data: 0.4687  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0397 (0.9764)  acc1: 78.4000 (79.9048)  acc5: 95.2000 (95.5810)  time: 0.2642  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0704 (0.9898)  acc1: 77.2000 (79.4880)  acc5: 94.8000 (95.4080)  time: 0.2641  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4746 s / it)
* Acc@1 80.162 Acc@5 95.580 loss 0.973
Accuracy of the model on the 50000 test images: 80.2%
Max accuracy: 80.37%
Epoch: [129]  [   0/1251]  eta: 1:12:48  lr: 0.002682  min_lr: 0.002682  loss: 2.9783 (2.9783)  weight_decay: 0.0500 (0.0500)  time: 3.4917  data: 2.6407  max mem: 43713
Epoch: [129]  [ 200/1251]  eta: 0:09:25  lr: 0.002679  min_lr: 0.002679  loss: 3.1589 (3.1771)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8253 (0.9611)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [129]  [ 400/1251]  eta: 0:07:33  lr: 0.002675  min_lr: 0.002675  loss: 3.3063 (3.1842)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9781 (0.9311)  time: 0.5311  data: 0.0004  max mem: 43713
Epoch: [129]  [ 600/1251]  eta: 0:05:44  lr: 0.002672  min_lr: 0.002672  loss: 3.1820 (3.1848)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8070 (0.8860)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [129]  [ 800/1251]  eta: 0:03:58  lr: 0.002668  min_lr: 0.002668  loss: 3.1802 (3.2034)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7372 (0.8981)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [129]  [1000/1251]  eta: 0:02:12  lr: 0.002665  min_lr: 0.002665  loss: 3.1987 (3.2022)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0032 (0.9157)  time: 0.5289  data: 0.0004  max mem: 43713
Epoch: [129]  [1200/1251]  eta: 0:00:26  lr: 0.002662  min_lr: 0.002662  loss: 3.0495 (3.2048)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7125 (0.9340)  time: 0.5228  data: 0.0005  max mem: 43713
Epoch: [129]  [1250/1251]  eta: 0:00:00  lr: 0.002661  min_lr: 0.002661  loss: 3.5190 (3.2085)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7060 (0.9267)  time: 0.4437  data: 0.0007  max mem: 43713
Epoch: [129] Total time: 0:10:58 (0.5261 s / it)
Averaged stats: lr: 0.002661  min_lr: 0.002661  loss: 3.5190 (3.1992)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7060 (0.9267)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.7023 (0.7023)  acc1: 88.0000 (88.0000)  acc5: 98.0000 (98.0000)  time: 5.5881  data: 5.2837  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8604 (0.8776)  acc1: 84.8000 (83.7818)  acc5: 97.6000 (97.3091)  time: 0.7485  data: 0.4806  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0795 (1.0277)  acc1: 78.0000 (80.0762)  acc5: 94.8000 (95.2952)  time: 0.2646  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.1220 (1.0365)  acc1: 78.4000 (79.8080)  acc5: 94.4000 (95.2800)  time: 0.2646  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4822 s / it)
* Acc@1 80.070 Acc@5 95.496 loss 1.032
Accuracy of the model on the 50000 test images: 80.1%
Max accuracy: 80.37%
Epoch: [130]  [   0/1251]  eta: 1:15:12  lr: 0.002661  min_lr: 0.002661  loss: 3.2148 (3.2148)  weight_decay: 0.0500 (0.0500)  time: 3.6068  data: 2.3708  max mem: 43713
Epoch: [130]  [ 200/1251]  eta: 0:09:28  lr: 0.002657  min_lr: 0.002657  loss: 3.2972 (3.2288)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0530 (1.0605)  time: 0.5248  data: 0.0005  max mem: 43713
Epoch: [130]  [ 400/1251]  eta: 0:07:33  lr: 0.002654  min_lr: 0.002654  loss: 3.3874 (3.1958)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7408 (0.9353)  time: 0.5232  data: 0.0005  max mem: 43713
Epoch: [130]  [ 600/1251]  eta: 0:05:44  lr: 0.002651  min_lr: 0.002651  loss: 3.2403 (3.2206)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8943 (0.9536)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [130]  [ 800/1251]  eta: 0:03:58  lr: 0.002647  min_lr: 0.002647  loss: 3.1319 (3.2192)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7849 (0.9409)  time: 0.5311  data: 0.0004  max mem: 43713
Epoch: [130]  [1000/1251]  eta: 0:02:12  lr: 0.002644  min_lr: 0.002644  loss: 3.1761 (3.2257)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7452 (0.9448)  time: 0.5226  data: 0.0005  max mem: 43713
Epoch: [130]  [1200/1251]  eta: 0:00:26  lr: 0.002640  min_lr: 0.002640  loss: 3.0792 (3.2224)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8888 (0.9422)  time: 0.5231  data: 0.0005  max mem: 43713
Epoch: [130]  [1250/1251]  eta: 0:00:00  lr: 0.002640  min_lr: 0.002640  loss: 3.2460 (3.2183)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7750 (0.9369)  time: 0.4433  data: 0.0007  max mem: 43713
Epoch: [130] Total time: 0:10:58 (0.5264 s / it)
Averaged stats: lr: 0.002640  min_lr: 0.002640  loss: 3.2460 (3.2000)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7750 (0.9369)
Test:  [ 0/25]  eta: 0:02:05  loss: 0.6812 (0.6812)  acc1: 87.6000 (87.6000)  acc5: 98.8000 (98.8000)  time: 5.0205  data: 4.7187  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.8101 (0.8066)  acc1: 84.4000 (83.3455)  acc5: 98.0000 (97.4909)  time: 0.6970  data: 0.4292  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0152 (0.9484)  acc1: 78.0000 (80.3238)  acc5: 94.8000 (95.5048)  time: 0.2647  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0152 (0.9619)  acc1: 78.0000 (79.8720)  acc5: 94.4000 (95.4240)  time: 0.2647  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4600 s / it)
* Acc@1 80.334 Acc@5 95.518 loss 0.949
Accuracy of the model on the 50000 test images: 80.3%
Max accuracy: 80.37%
Epoch: [131]  [   0/1251]  eta: 1:10:20  lr: 0.002640  min_lr: 0.002640  loss: 3.7214 (3.7214)  weight_decay: 0.0500 (0.0500)  time: 3.3737  data: 2.6721  max mem: 43713
Epoch: [131]  [ 200/1251]  eta: 0:09:29  lr: 0.002636  min_lr: 0.002636  loss: 3.1545 (3.2255)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8283 (0.9386)  time: 0.5237  data: 0.0004  max mem: 43713
Epoch: [131]  [ 400/1251]  eta: 0:07:32  lr: 0.002633  min_lr: 0.002633  loss: 3.3117 (3.2343)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7898 (0.8772)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [131]  [ 600/1251]  eta: 0:05:44  lr: 0.002629  min_lr: 0.002629  loss: 3.1254 (3.2119)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9232 (0.9048)  time: 0.5302  data: 0.0004  max mem: 43713
Epoch: [131]  [ 800/1251]  eta: 0:03:58  lr: 0.002626  min_lr: 0.002626  loss: 3.3127 (3.2105)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7826 (0.9365)  time: 0.5220  data: 0.0004  max mem: 43713
Epoch: [131]  [1000/1251]  eta: 0:02:12  lr: 0.002623  min_lr: 0.002623  loss: 3.4538 (3.1998)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8478 (0.9368)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [131]  [1200/1251]  eta: 0:00:26  lr: 0.002619  min_lr: 0.002619  loss: 3.3652 (3.2004)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8630 (0.9310)  time: 0.5230  data: 0.0005  max mem: 43713
Epoch: [131]  [1250/1251]  eta: 0:00:00  lr: 0.002618  min_lr: 0.002618  loss: 3.2747 (3.2027)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0584 (0.9490)  time: 0.4435  data: 0.0007  max mem: 43713
Epoch: [131] Total time: 0:10:58 (0.5265 s / it)
Averaged stats: lr: 0.002618  min_lr: 0.002618  loss: 3.2747 (3.1951)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0584 (0.9490)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.6298 (0.6298)  acc1: 87.2000 (87.2000)  acc5: 99.6000 (99.6000)  time: 5.7263  data: 5.4383  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8390 (0.8020)  acc1: 82.8000 (83.4182)  acc5: 97.6000 (97.3091)  time: 0.7612  data: 0.4947  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0237 (0.9361)  acc1: 78.8000 (80.2857)  acc5: 95.2000 (95.6952)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0237 (0.9470)  acc1: 78.4000 (80.0000)  acc5: 95.2000 (95.4720)  time: 0.2644  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4881 s / it)
* Acc@1 80.414 Acc@5 95.716 loss 0.941
Accuracy of the model on the 50000 test images: 80.4%
Max accuracy: 80.41%
Epoch: [132]  [   0/1251]  eta: 1:08:46  lr: 0.002618  min_lr: 0.002618  loss: 2.7792 (2.7792)  weight_decay: 0.0500 (0.0500)  time: 3.2982  data: 2.7670  max mem: 43713
Epoch: [132]  [ 200/1251]  eta: 0:09:24  lr: 0.002615  min_lr: 0.002615  loss: 3.3469 (3.1802)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7425 (0.8405)  time: 0.5236  data: 0.0004  max mem: 43713
Epoch: [132]  [ 400/1251]  eta: 0:07:32  lr: 0.002612  min_lr: 0.002612  loss: 3.0815 (3.1841)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9741 (0.9697)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [132]  [ 600/1251]  eta: 0:05:44  lr: 0.002608  min_lr: 0.002608  loss: 3.1288 (3.1793)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7318 (0.9212)  time: 0.5233  data: 0.0004  max mem: 43713
Epoch: [132]  [ 800/1251]  eta: 0:03:58  lr: 0.002605  min_lr: 0.002605  loss: 3.4798 (3.1961)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6886 (0.9415)  time: 0.5237  data: 0.0004  max mem: 43713
Epoch: [132]  [1000/1251]  eta: 0:02:12  lr: 0.002601  min_lr: 0.002601  loss: 3.2336 (3.2058)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9248 (0.9340)  time: 0.5316  data: 0.0004  max mem: 43713
Epoch: [132]  [1200/1251]  eta: 0:00:26  lr: 0.002598  min_lr: 0.002598  loss: 3.4011 (3.1976)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9416 (nan)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [132]  [1250/1251]  eta: 0:00:00  lr: 0.002597  min_lr: 0.002597  loss: 3.0930 (3.1987)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7946 (nan)  time: 0.4436  data: 0.0005  max mem: 43713
Epoch: [132] Total time: 0:10:58 (0.5265 s / it)
Averaged stats: lr: 0.002597  min_lr: 0.002597  loss: 3.0930 (3.1897)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7946 (nan)
Test:  [ 0/25]  eta: 0:01:47  loss: 0.5776 (0.5776)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 4.2928  data: 3.9974  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7928 (0.7907)  acc1: 84.8000 (83.9273)  acc5: 97.6000 (97.2727)  time: 0.6820  data: 0.4119  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0071 (0.9280)  acc1: 78.4000 (80.5524)  acc5: 95.2000 (95.8476)  time: 0.2973  data: 0.0267  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0318 (0.9413)  acc1: 78.0000 (80.0320)  acc5: 95.2000 (95.7120)  time: 0.2705  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4552 s / it)
* Acc@1 80.338 Acc@5 95.680 loss 0.937
Accuracy of the model on the 50000 test images: 80.3%
Max accuracy: 80.41%
Epoch: [133]  [   0/1251]  eta: 1:14:53  lr: 0.002597  min_lr: 0.002597  loss: 2.4553 (2.4553)  weight_decay: 0.0500 (0.0500)  time: 3.5916  data: 2.9823  max mem: 43713
Epoch: [133]  [ 200/1251]  eta: 0:09:28  lr: 0.002594  min_lr: 0.002594  loss: 3.1612 (3.1802)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9847 (1.0980)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [133]  [ 400/1251]  eta: 0:07:34  lr: 0.002590  min_lr: 0.002590  loss: 3.2609 (3.1826)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8922 (1.0020)  time: 0.5234  data: 0.0005  max mem: 43713
Epoch: [133]  [ 600/1251]  eta: 0:05:45  lr: 0.002587  min_lr: 0.002587  loss: 3.2885 (3.1881)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6618 (0.9147)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [133]  [ 800/1251]  eta: 0:03:58  lr: 0.002583  min_lr: 0.002583  loss: 3.4610 (3.1836)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0068 (0.9213)  time: 0.5305  data: 0.0005  max mem: 43713
Epoch: [133]  [1000/1251]  eta: 0:02:12  lr: 0.002580  min_lr: 0.002580  loss: 3.2915 (3.1928)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7967 (0.9234)  time: 0.5232  data: 0.0005  max mem: 43713
Epoch: [133]  [1200/1251]  eta: 0:00:26  lr: 0.002576  min_lr: 0.002576  loss: 3.3742 (3.1864)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6897 (0.9054)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [133]  [1250/1251]  eta: 0:00:00  lr: 0.002576  min_lr: 0.002576  loss: 3.3016 (3.1863)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6897 (0.9049)  time: 0.4488  data: 0.0007  max mem: 43713
Epoch: [133] Total time: 0:10:58 (0.5268 s / it)
Averaged stats: lr: 0.002576  min_lr: 0.002576  loss: 3.3016 (3.1892)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6897 (0.9049)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6353 (0.6353)  acc1: 88.4000 (88.4000)  acc5: 98.8000 (98.8000)  time: 5.5339  data: 5.2462  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7730 (0.7873)  acc1: 84.0000 (83.9273)  acc5: 98.0000 (97.4546)  time: 0.7440  data: 0.4772  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0167 (0.9342)  acc1: 78.4000 (80.4191)  acc5: 95.2000 (95.5238)  time: 0.2651  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9797 (0.9381)  acc1: 78.0000 (80.2240)  acc5: 94.8000 (95.5520)  time: 0.2650  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4806 s / it)
* Acc@1 80.460 Acc@5 95.596 loss 0.932
Accuracy of the model on the 50000 test images: 80.5%
Max accuracy: 80.46%
Epoch: [134]  [   0/1251]  eta: 0:58:41  lr: 0.002576  min_lr: 0.002576  loss: 3.7171 (3.7171)  weight_decay: 0.0500 (0.0500)  time: 2.8147  data: 2.2697  max mem: 43713
Epoch: [134]  [ 200/1251]  eta: 0:09:26  lr: 0.002572  min_lr: 0.002572  loss: 3.2969 (3.1416)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7141 (0.9689)  time: 0.5233  data: 0.0005  max mem: 43713
Epoch: [134]  [ 400/1251]  eta: 0:07:32  lr: 0.002569  min_lr: 0.002569  loss: 3.3988 (3.1710)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0536 (0.9594)  time: 0.5225  data: 0.0005  max mem: 43713
Epoch: [134]  [ 600/1251]  eta: 0:05:44  lr: 0.002565  min_lr: 0.002565  loss: 3.2541 (3.1653)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6397 (0.9294)  time: 0.5311  data: 0.0006  max mem: 43713
Epoch: [134]  [ 800/1251]  eta: 0:03:58  lr: 0.002562  min_lr: 0.002562  loss: 3.4847 (3.1797)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9623 (0.9634)  time: 0.5297  data: 0.0005  max mem: 43713
Epoch: [134]  [1000/1251]  eta: 0:02:12  lr: 0.002558  min_lr: 0.002558  loss: 3.2088 (3.1746)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7908 (0.9637)  time: 0.5232  data: 0.0004  max mem: 43713
Epoch: [134]  [1200/1251]  eta: 0:00:26  lr: 0.002555  min_lr: 0.002555  loss: 3.2677 (3.1822)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7005 (0.9432)  time: 0.5327  data: 0.0004  max mem: 43713
Epoch: [134]  [1250/1251]  eta: 0:00:00  lr: 0.002554  min_lr: 0.002554  loss: 3.3173 (3.1849)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8731 (0.9392)  time: 0.4438  data: 0.0007  max mem: 43713
Epoch: [134] Total time: 0:10:58 (0.5266 s / it)
Averaged stats: lr: 0.002554  min_lr: 0.002554  loss: 3.3173 (3.1890)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8731 (0.9392)
Test:  [ 0/25]  eta: 0:02:06  loss: 0.6982 (0.6982)  acc1: 86.4000 (86.4000)  acc5: 98.4000 (98.4000)  time: 5.0579  data: 4.7333  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.8055 (0.8065)  acc1: 84.8000 (83.2364)  acc5: 97.2000 (97.1273)  time: 0.7006  data: 0.4307  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0237 (0.9443)  acc1: 79.6000 (80.4952)  acc5: 95.2000 (95.4667)  time: 0.2646  data: 0.0003  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0624 (0.9585)  acc1: 78.4000 (80.0640)  acc5: 94.8000 (95.4560)  time: 0.2645  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4615 s / it)
* Acc@1 80.290 Acc@5 95.574 loss 0.952
Accuracy of the model on the 50000 test images: 80.3%
Max accuracy: 80.46%
Epoch: [135]  [   0/1251]  eta: 1:10:57  lr: 0.002554  min_lr: 0.002554  loss: 2.7940 (2.7940)  weight_decay: 0.0500 (0.0500)  time: 3.4030  data: 2.2538  max mem: 43713
Epoch: [135]  [ 200/1251]  eta: 0:09:26  lr: 0.002551  min_lr: 0.002551  loss: 3.3083 (3.1578)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8583 (0.9222)  time: 0.5231  data: 0.0005  max mem: 43713
Epoch: [135]  [ 400/1251]  eta: 0:07:31  lr: 0.002547  min_lr: 0.002547  loss: 3.0557 (3.1749)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9192 (0.9758)  time: 0.5224  data: 0.0005  max mem: 43713
Epoch: [135]  [ 600/1251]  eta: 0:05:44  lr: 0.002544  min_lr: 0.002544  loss: 3.5064 (3.2077)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8122 (0.9997)  time: 0.5230  data: 0.0005  max mem: 43713
Epoch: [135]  [ 800/1251]  eta: 0:03:58  lr: 0.002540  min_lr: 0.002540  loss: 3.2487 (3.2042)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6924 (0.9793)  time: 0.5236  data: 0.0005  max mem: 43713
Epoch: [135]  [1000/1251]  eta: 0:02:12  lr: 0.002537  min_lr: 0.002537  loss: 3.2585 (3.2135)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6714 (0.9684)  time: 0.5221  data: 0.0005  max mem: 43713
Epoch: [135]  [1200/1251]  eta: 0:00:26  lr: 0.002533  min_lr: 0.002533  loss: 3.4781 (3.2140)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8999 (0.9840)  time: 0.5223  data: 0.0004  max mem: 43713
Epoch: [135]  [1250/1251]  eta: 0:00:00  lr: 0.002533  min_lr: 0.002533  loss: 3.1632 (3.2131)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1512 (0.9943)  time: 0.4434  data: 0.0005  max mem: 43713
Epoch: [135] Total time: 0:10:57 (0.5258 s / it)
Averaged stats: lr: 0.002533  min_lr: 0.002533  loss: 3.1632 (3.1825)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1512 (0.9943)
Test:  [ 0/25]  eta: 0:02:10  loss: 0.6397 (0.6397)  acc1: 87.2000 (87.2000)  acc5: 98.8000 (98.8000)  time: 5.2264  data: 4.8983  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7959 (0.7780)  acc1: 84.8000 (83.9273)  acc5: 97.6000 (97.3091)  time: 0.7158  data: 0.4456  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9842 (0.9222)  acc1: 78.4000 (80.5143)  acc5: 94.8000 (95.4667)  time: 0.2648  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0357 (0.9363)  acc1: 77.2000 (80.0160)  acc5: 94.8000 (95.3760)  time: 0.2648  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4684 s / it)
* Acc@1 80.346 Acc@5 95.616 loss 0.924
Accuracy of the model on the 50000 test images: 80.3%
Max accuracy: 80.46%
Epoch: [136]  [   0/1251]  eta: 1:11:34  lr: 0.002532  min_lr: 0.002532  loss: 2.8694 (2.8694)  weight_decay: 0.0500 (0.0500)  time: 3.4330  data: 2.2331  max mem: 43713
Epoch: [136]  [ 200/1251]  eta: 0:09:26  lr: 0.002529  min_lr: 0.002529  loss: 3.1932 (3.1389)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9261 (0.9625)  time: 0.5312  data: 0.0005  max mem: 43713
Epoch: [136]  [ 400/1251]  eta: 0:07:33  lr: 0.002526  min_lr: 0.002526  loss: 3.3733 (3.1493)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7933 (0.8985)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [136]  [ 600/1251]  eta: 0:05:44  lr: 0.002522  min_lr: 0.002522  loss: 3.2718 (3.1728)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9359 (0.8975)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [136]  [ 800/1251]  eta: 0:03:58  lr: 0.002519  min_lr: 0.002519  loss: 3.2809 (3.1855)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7258 (0.9269)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [136]  [1000/1251]  eta: 0:02:12  lr: 0.002515  min_lr: 0.002515  loss: 3.2398 (3.1855)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6904 (0.9182)  time: 0.5234  data: 0.0004  max mem: 43713
Epoch: [136]  [1200/1251]  eta: 0:00:26  lr: 0.002512  min_lr: 0.002512  loss: 3.3891 (3.1912)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9834 (0.9313)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [136]  [1250/1251]  eta: 0:00:00  lr: 0.002511  min_lr: 0.002511  loss: 3.2758 (3.1937)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9834 (0.9354)  time: 0.4433  data: 0.0005  max mem: 43713
Epoch: [136] Total time: 0:10:57 (0.5258 s / it)
Averaged stats: lr: 0.002511  min_lr: 0.002511  loss: 3.2758 (3.1830)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9834 (0.9354)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.7129 (0.7129)  acc1: 86.8000 (86.8000)  acc5: 98.4000 (98.4000)  time: 5.6548  data: 5.3537  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8049 (0.8354)  acc1: 86.4000 (83.8182)  acc5: 97.6000 (97.3818)  time: 0.7546  data: 0.4870  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0429 (0.9857)  acc1: 78.4000 (80.1524)  acc5: 95.2000 (95.5810)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0744 (0.9984)  acc1: 78.4000 (79.7920)  acc5: 94.8000 (95.4880)  time: 0.2645  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4844 s / it)
* Acc@1 80.294 Acc@5 95.586 loss 0.988
Accuracy of the model on the 50000 test images: 80.3%
Max accuracy: 80.46%
Epoch: [137]  [   0/1251]  eta: 1:11:53  lr: 0.002511  min_lr: 0.002511  loss: 3.3353 (3.3353)  weight_decay: 0.0500 (0.0500)  time: 3.4484  data: 2.6644  max mem: 43713
Epoch: [137]  [ 200/1251]  eta: 0:09:31  lr: 0.002507  min_lr: 0.002507  loss: 3.3832 (3.1080)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7879 (0.8959)  time: 0.5232  data: 0.0004  max mem: 43713
Epoch: [137]  [ 400/1251]  eta: 0:07:34  lr: 0.002504  min_lr: 0.002504  loss: 3.4239 (3.1433)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6856 (0.8411)  time: 0.5223  data: 0.0004  max mem: 43713
Epoch: [137]  [ 600/1251]  eta: 0:05:45  lr: 0.002500  min_lr: 0.002500  loss: 3.2896 (3.1560)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0114 (0.9211)  time: 0.5246  data: 0.0005  max mem: 43713
Epoch: [137]  [ 800/1251]  eta: 0:03:58  lr: 0.002497  min_lr: 0.002497  loss: 3.5090 (3.1527)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7780 (0.9095)  time: 0.5244  data: 0.0005  max mem: 43713
Epoch: [137]  [1000/1251]  eta: 0:02:12  lr: 0.002493  min_lr: 0.002493  loss: 3.1851 (3.1499)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7254 (0.8887)  time: 0.5233  data: 0.0005  max mem: 43713
Epoch: [137]  [1200/1251]  eta: 0:00:26  lr: 0.002490  min_lr: 0.002490  loss: 3.2680 (3.1447)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8715 (0.9322)  time: 0.5290  data: 0.0005  max mem: 43713
Epoch: [137]  [1250/1251]  eta: 0:00:00  lr: 0.002489  min_lr: 0.002489  loss: 2.9934 (3.1422)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9125 (0.9318)  time: 0.4494  data: 0.0007  max mem: 43713
Epoch: [137] Total time: 0:10:59 (0.5268 s / it)
Averaged stats: lr: 0.002489  min_lr: 0.002489  loss: 2.9934 (3.1714)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9125 (0.9318)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.6062 (0.6062)  acc1: 87.2000 (87.2000)  acc5: 98.0000 (98.0000)  time: 5.4368  data: 5.1220  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7114 (0.7428)  acc1: 84.4000 (83.6727)  acc5: 98.0000 (97.5273)  time: 0.7355  data: 0.4660  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9137 (0.8913)  acc1: 78.8000 (80.4571)  acc5: 95.2000 (95.6762)  time: 0.2651  data: 0.0003  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0002 (0.9041)  acc1: 77.6000 (80.0320)  acc5: 94.8000 (95.5840)  time: 0.2649  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4770 s / it)
* Acc@1 80.610 Acc@5 95.776 loss 0.891
Accuracy of the model on the 50000 test images: 80.6%
Max accuracy: 80.61%
Epoch: [138]  [   0/1251]  eta: 0:51:24  lr: 0.002489  min_lr: 0.002489  loss: 3.5247 (3.5247)  weight_decay: 0.0500 (0.0500)  time: 2.4660  data: 1.9246  max mem: 43713
Epoch: [138]  [ 200/1251]  eta: 0:09:20  lr: 0.002486  min_lr: 0.002486  loss: 3.3209 (3.2136)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8176 (0.9067)  time: 0.5236  data: 0.0004  max mem: 43713
Epoch: [138]  [ 400/1251]  eta: 0:07:30  lr: 0.002482  min_lr: 0.002482  loss: 3.2992 (3.2000)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6571 (0.8444)  time: 0.5240  data: 0.0004  max mem: 43713
Epoch: [138]  [ 600/1251]  eta: 0:05:43  lr: 0.002479  min_lr: 0.002479  loss: 3.1222 (3.1975)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6727 (0.8529)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [138]  [ 800/1251]  eta: 0:03:57  lr: 0.002475  min_lr: 0.002475  loss: 2.9924 (3.1826)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8569 (0.8458)  time: 0.5222  data: 0.0004  max mem: 43713
Epoch: [138]  [1000/1251]  eta: 0:02:12  lr: 0.002472  min_lr: 0.002472  loss: 2.9023 (3.1677)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9425 (0.8647)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [138]  [1200/1251]  eta: 0:00:26  lr: 0.002468  min_lr: 0.002468  loss: 3.3534 (3.1682)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9327 (0.8565)  time: 0.5222  data: 0.0004  max mem: 43713
Epoch: [138]  [1250/1251]  eta: 0:00:00  lr: 0.002467  min_lr: 0.002467  loss: 3.3149 (3.1681)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7856 (0.8507)  time: 0.4433  data: 0.0004  max mem: 43713
Epoch: [138] Total time: 0:10:57 (0.5253 s / it)
Averaged stats: lr: 0.002467  min_lr: 0.002467  loss: 3.3149 (3.1703)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7856 (0.8507)
Test:  [ 0/25]  eta: 0:02:03  loss: 0.6608 (0.6608)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 4.9386  data: 4.6297  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.8034 (0.8066)  acc1: 84.8000 (84.1455)  acc5: 97.6000 (97.3455)  time: 0.6894  data: 0.4212  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0010 (0.9530)  acc1: 78.0000 (80.7429)  acc5: 95.2000 (95.5429)  time: 0.2642  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0575 (0.9642)  acc1: 78.0000 (80.0480)  acc5: 94.8000 (95.5360)  time: 0.2641  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4543 s / it)
* Acc@1 80.652 Acc@5 95.616 loss 0.949
Accuracy of the model on the 50000 test images: 80.7%
Max accuracy: 80.65%
Epoch: [139]  [   0/1251]  eta: 0:56:23  lr: 0.002467  min_lr: 0.002467  loss: 3.5344 (3.5344)  weight_decay: 0.0500 (0.0500)  time: 2.7047  data: 2.1565  max mem: 43713
Epoch: [139]  [ 200/1251]  eta: 0:09:21  lr: 0.002464  min_lr: 0.002464  loss: 3.2682 (3.2541)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7624 (1.0652)  time: 0.5223  data: 0.0004  max mem: 43713
Epoch: [139]  [ 400/1251]  eta: 0:07:32  lr: 0.002460  min_lr: 0.002460  loss: 3.2902 (3.2156)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8347 (1.0068)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [139]  [ 600/1251]  eta: 0:05:44  lr: 0.002457  min_lr: 0.002457  loss: 3.1526 (3.2060)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8164 (0.9424)  time: 0.5236  data: 0.0004  max mem: 43713
Epoch: [139]  [ 800/1251]  eta: 0:03:57  lr: 0.002453  min_lr: 0.002453  loss: 3.4356 (3.2026)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6378 (0.9104)  time: 0.5283  data: 0.0004  max mem: 43713
Epoch: [139]  [1000/1251]  eta: 0:02:12  lr: 0.002450  min_lr: 0.002450  loss: 3.1434 (3.1986)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9252 (0.9279)  time: 0.5237  data: 0.0004  max mem: 43713
Epoch: [139]  [1200/1251]  eta: 0:00:26  lr: 0.002446  min_lr: 0.002446  loss: 3.2745 (3.1912)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8941 (0.9194)  time: 0.5248  data: 0.0005  max mem: 43713
Epoch: [139]  [1250/1251]  eta: 0:00:00  lr: 0.002446  min_lr: 0.002446  loss: 3.0803 (3.1887)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0735 (0.9332)  time: 0.4435  data: 0.0005  max mem: 43713
Epoch: [139] Total time: 0:10:58 (0.5262 s / it)
Averaged stats: lr: 0.002446  min_lr: 0.002446  loss: 3.0803 (3.1680)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0735 (0.9332)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.6348 (0.6348)  acc1: 87.6000 (87.6000)  acc5: 97.6000 (97.6000)  time: 5.4515  data: 5.1647  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7738 (0.7598)  acc1: 84.8000 (83.6364)  acc5: 97.6000 (97.1273)  time: 0.7365  data: 0.4698  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9890 (0.9028)  acc1: 78.4000 (80.8381)  acc5: 95.2000 (95.4857)  time: 0.2650  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0353 (0.9124)  acc1: 78.4000 (80.5280)  acc5: 94.8000 (95.3760)  time: 0.2650  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4772 s / it)
* Acc@1 80.630 Acc@5 95.708 loss 0.897
Accuracy of the model on the 50000 test images: 80.6%
Max accuracy: 80.65%
Epoch: [140]  [   0/1251]  eta: 1:09:20  lr: 0.002445  min_lr: 0.002445  loss: 2.8580 (2.8580)  weight_decay: 0.0500 (0.0500)  time: 3.3261  data: 1.6204  max mem: 43713
Epoch: [140]  [ 200/1251]  eta: 0:09:27  lr: 0.002442  min_lr: 0.002442  loss: 3.2923 (3.1260)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9626 (0.9727)  time: 0.5314  data: 0.0005  max mem: 43713
Epoch: [140]  [ 400/1251]  eta: 0:07:33  lr: 0.002438  min_lr: 0.002438  loss: 3.3833 (3.1595)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9588 (0.9966)  time: 0.5221  data: 0.0004  max mem: 43713
Epoch: [140]  [ 600/1251]  eta: 0:05:44  lr: 0.002435  min_lr: 0.002435  loss: 3.4571 (3.1690)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6945 (0.9484)  time: 0.5222  data: 0.0005  max mem: 43713
Epoch: [140]  [ 800/1251]  eta: 0:03:58  lr: 0.002431  min_lr: 0.002431  loss: 3.3756 (3.1735)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8616 (0.9276)  time: 0.5222  data: 0.0004  max mem: 43713
Epoch: [140]  [1000/1251]  eta: 0:02:12  lr: 0.002428  min_lr: 0.002428  loss: 3.3747 (3.1765)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9813 (0.9416)  time: 0.5220  data: 0.0005  max mem: 43713
Epoch: [140]  [1200/1251]  eta: 0:00:26  lr: 0.002424  min_lr: 0.002424  loss: 3.0993 (3.1787)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9433 (0.9620)  time: 0.5226  data: 0.0005  max mem: 43713
Epoch: [140]  [1250/1251]  eta: 0:00:00  lr: 0.002424  min_lr: 0.002424  loss: 3.4124 (3.1820)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9715 (0.9645)  time: 0.4433  data: 0.0007  max mem: 43713
Epoch: [140] Total time: 0:10:57 (0.5257 s / it)
Averaged stats: lr: 0.002424  min_lr: 0.002424  loss: 3.4124 (3.1708)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9715 (0.9645)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.7879 (0.7879)  acc1: 86.4000 (86.4000)  acc5: 98.8000 (98.8000)  time: 5.3035  data: 5.0022  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.8916 (0.8830)  acc1: 85.2000 (84.1091)  acc5: 97.6000 (97.2364)  time: 0.7223  data: 0.4551  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.1145 (1.0258)  acc1: 79.6000 (80.3238)  acc5: 94.8000 (95.5238)  time: 0.2640  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0983 (1.0321)  acc1: 79.6000 (79.8880)  acc5: 94.4000 (95.5360)  time: 0.2639  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4697 s / it)
* Acc@1 80.344 Acc@5 95.632 loss 1.026
Accuracy of the model on the 50000 test images: 80.3%
Max accuracy: 80.65%
Epoch: [141]  [   0/1251]  eta: 1:13:28  lr: 0.002424  min_lr: 0.002424  loss: 3.4470 (3.4470)  weight_decay: 0.0500 (0.0500)  time: 3.5238  data: 2.6937  max mem: 43713
Epoch: [141]  [ 200/1251]  eta: 0:09:31  lr: 0.002420  min_lr: 0.002420  loss: 3.2111 (3.1373)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8081 (0.9475)  time: 0.5244  data: 0.0005  max mem: 43713
Epoch: [141]  [ 400/1251]  eta: 0:07:34  lr: 0.002417  min_lr: 0.002417  loss: 3.4157 (3.1778)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6389 (0.9168)  time: 0.5240  data: 0.0005  max mem: 43713
Epoch: [141]  [ 600/1251]  eta: 0:05:46  lr: 0.002413  min_lr: 0.002413  loss: 3.0750 (3.1535)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9127 (1.0004)  time: 0.5348  data: 0.0004  max mem: 43713
Epoch: [141]  [ 800/1251]  eta: 0:03:59  lr: 0.002409  min_lr: 0.002409  loss: 3.0978 (3.1445)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9009 (0.9790)  time: 0.5245  data: 0.0004  max mem: 43713
Epoch: [141]  [1000/1251]  eta: 0:02:12  lr: 0.002406  min_lr: 0.002406  loss: 3.2369 (3.1514)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7997 (0.9585)  time: 0.5297  data: 0.0005  max mem: 43713
Epoch: [141]  [1200/1251]  eta: 0:00:26  lr: 0.002402  min_lr: 0.002402  loss: 3.3769 (3.1613)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5949 (0.9391)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [141]  [1250/1251]  eta: 0:00:00  lr: 0.002402  min_lr: 0.002402  loss: 3.2954 (3.1581)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8369 (0.9427)  time: 0.4438  data: 0.0005  max mem: 43713
Epoch: [141] Total time: 0:11:00 (0.5278 s / it)
Averaged stats: lr: 0.002402  min_lr: 0.002402  loss: 3.2954 (3.1741)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8369 (0.9427)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.6617 (0.6617)  acc1: 88.0000 (88.0000)  acc5: 98.4000 (98.4000)  time: 5.3606  data: 5.0569  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.8374 (0.8245)  acc1: 83.6000 (83.7818)  acc5: 98.0000 (97.3091)  time: 0.7279  data: 0.4600  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0370 (0.9659)  acc1: 78.8000 (80.4000)  acc5: 95.2000 (95.5619)  time: 0.2646  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0393 (0.9765)  acc1: 78.8000 (79.9840)  acc5: 94.8000 (95.5040)  time: 0.2647  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4734 s / it)
* Acc@1 80.598 Acc@5 95.676 loss 0.958
Accuracy of the model on the 50000 test images: 80.6%
Max accuracy: 80.65%
Epoch: [142]  [   0/1251]  eta: 1:13:24  lr: 0.002402  min_lr: 0.002402  loss: 3.2068 (3.2068)  weight_decay: 0.0500 (0.0500)  time: 3.5212  data: 2.8903  max mem: 43713
Epoch: [142]  [ 200/1251]  eta: 0:09:26  lr: 0.002398  min_lr: 0.002398  loss: 3.3326 (3.2223)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8605 (0.8840)  time: 0.5233  data: 0.0004  max mem: 43713
Epoch: [142]  [ 400/1251]  eta: 0:07:32  lr: 0.002395  min_lr: 0.002395  loss: 3.2851 (3.2227)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7636 (0.8874)  time: 0.5234  data: 0.0004  max mem: 43713
Epoch: [142]  [ 600/1251]  eta: 0:05:45  lr: 0.002391  min_lr: 0.002391  loss: 3.2451 (3.2048)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9695 (0.9038)  time: 0.5245  data: 0.0004  max mem: 43713
Epoch: [142]  [ 800/1251]  eta: 0:03:58  lr: 0.002387  min_lr: 0.002387  loss: 3.1571 (3.1925)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2001 (0.9213)  time: 0.5234  data: 0.0004  max mem: 43713
Epoch: [142]  [1000/1251]  eta: 0:02:12  lr: 0.002384  min_lr: 0.002384  loss: 3.2949 (3.1824)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1682 (0.9360)  time: 0.5299  data: 0.0004  max mem: 43713
Epoch: [142]  [1200/1251]  eta: 0:00:26  lr: 0.002380  min_lr: 0.002380  loss: 3.2646 (3.1783)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9138 (0.9300)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [142]  [1250/1251]  eta: 0:00:00  lr: 0.002380  min_lr: 0.002380  loss: 3.0922 (3.1762)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7375 (0.9200)  time: 0.4434  data: 0.0005  max mem: 43713
Epoch: [142] Total time: 0:10:58 (0.5268 s / it)
Averaged stats: lr: 0.002380  min_lr: 0.002380  loss: 3.0922 (3.1667)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7375 (0.9200)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.5609 (0.5609)  acc1: 88.0000 (88.0000)  acc5: 99.2000 (99.2000)  time: 5.8000  data: 5.4812  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7387 (0.7601)  acc1: 84.8000 (84.1455)  acc5: 98.0000 (97.4909)  time: 0.7679  data: 0.4986  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9625 (0.9116)  acc1: 79.2000 (80.6857)  acc5: 95.2000 (95.8476)  time: 0.2646  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0037 (0.9243)  acc1: 78.8000 (80.0960)  acc5: 95.2000 (95.7120)  time: 0.2647  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4918 s / it)
* Acc@1 80.664 Acc@5 95.790 loss 0.916
Accuracy of the model on the 50000 test images: 80.7%
Max accuracy: 80.66%
Epoch: [143]  [   0/1251]  eta: 0:58:26  lr: 0.002380  min_lr: 0.002380  loss: 2.7797 (2.7797)  weight_decay: 0.0500 (0.0500)  time: 2.8033  data: 2.2591  max mem: 43713
Epoch: [143]  [ 200/1251]  eta: 0:09:24  lr: 0.002376  min_lr: 0.002376  loss: 3.3586 (3.2555)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9489 (1.0073)  time: 0.5303  data: 0.0004  max mem: 43713
Epoch: [143]  [ 400/1251]  eta: 0:07:32  lr: 0.002373  min_lr: 0.002373  loss: 3.2818 (3.1970)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2699 (1.0075)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [143]  [ 600/1251]  eta: 0:05:44  lr: 0.002369  min_lr: 0.002369  loss: 3.2947 (3.1892)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9870 (1.0134)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [143]  [ 800/1251]  eta: 0:03:58  lr: 0.002365  min_lr: 0.002365  loss: 3.0630 (3.1782)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7196 (0.9681)  time: 0.5341  data: 0.0004  max mem: 43713
Epoch: [143]  [1000/1251]  eta: 0:02:12  lr: 0.002362  min_lr: 0.002362  loss: 3.4797 (3.1775)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8212 (0.9448)  time: 0.5233  data: 0.0004  max mem: 43713
Epoch: [143]  [1200/1251]  eta: 0:00:26  lr: 0.002358  min_lr: 0.002358  loss: 3.2869 (3.1621)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7519 (0.9348)  time: 0.5231  data: 0.0005  max mem: 43713
Epoch: [143]  [1250/1251]  eta: 0:00:00  lr: 0.002358  min_lr: 0.002358  loss: 3.2987 (3.1595)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8242 (0.9350)  time: 0.4437  data: 0.0005  max mem: 43713
Epoch: [143] Total time: 0:10:58 (0.5261 s / it)
Averaged stats: lr: 0.002358  min_lr: 0.002358  loss: 3.2987 (3.1631)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8242 (0.9350)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.6225 (0.6225)  acc1: 88.0000 (88.0000)  acc5: 98.4000 (98.4000)  time: 5.2804  data: 4.9865  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7600 (0.7571)  acc1: 84.4000 (84.4000)  acc5: 97.2000 (97.2364)  time: 0.7204  data: 0.4536  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9911 (0.9017)  acc1: 80.0000 (81.0476)  acc5: 95.6000 (95.7143)  time: 0.2642  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9911 (0.9225)  acc1: 78.8000 (80.3360)  acc5: 95.2000 (95.5680)  time: 0.2641  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4701 s / it)
* Acc@1 80.852 Acc@5 95.754 loss 0.911
Accuracy of the model on the 50000 test images: 80.9%
Max accuracy: 80.85%
Epoch: [144]  [   0/1251]  eta: 0:54:01  lr: 0.002358  min_lr: 0.002358  loss: 3.2602 (3.2602)  weight_decay: 0.0500 (0.0500)  time: 2.5912  data: 2.0661  max mem: 43713
Epoch: [144]  [ 200/1251]  eta: 0:09:24  lr: 0.002354  min_lr: 0.002354  loss: 3.0794 (3.1124)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9222 (1.1079)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [144]  [ 400/1251]  eta: 0:07:32  lr: 0.002350  min_lr: 0.002350  loss: 3.3759 (3.1494)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8608 (0.9919)  time: 0.5246  data: 0.0005  max mem: 43713
Epoch: [144]  [ 600/1251]  eta: 0:05:44  lr: 0.002347  min_lr: 0.002347  loss: 3.0933 (3.1317)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7296 (nan)  time: 0.5228  data: 0.0005  max mem: 43713
Epoch: [144]  [ 800/1251]  eta: 0:03:58  lr: 0.002343  min_lr: 0.002343  loss: 3.1210 (3.1383)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7389 (nan)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [144]  [1000/1251]  eta: 0:02:12  lr: 0.002340  min_lr: 0.002340  loss: 3.3128 (3.1475)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9790 (nan)  time: 0.5233  data: 0.0005  max mem: 43713
Epoch: [144]  [1200/1251]  eta: 0:00:26  lr: 0.002336  min_lr: 0.002336  loss: 3.2702 (3.1440)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9534 (nan)  time: 0.5387  data: 0.0005  max mem: 43713
Epoch: [144]  [1250/1251]  eta: 0:00:00  lr: 0.002335  min_lr: 0.002335  loss: 3.1821 (3.1448)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7634 (nan)  time: 0.4435  data: 0.0006  max mem: 43713
Epoch: [144] Total time: 0:10:59 (0.5268 s / it)
Averaged stats: lr: 0.002335  min_lr: 0.002335  loss: 3.1821 (3.1517)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7634 (nan)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.6497 (0.6497)  acc1: 86.0000 (86.0000)  acc5: 98.8000 (98.8000)  time: 5.8455  data: 5.5324  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7805 (0.7662)  acc1: 83.6000 (83.9273)  acc5: 98.0000 (97.7091)  time: 0.7719  data: 0.5032  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9533 (0.9186)  acc1: 78.8000 (80.6476)  acc5: 95.6000 (95.7333)  time: 0.2644  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0274 (0.9299)  acc1: 78.8000 (80.2720)  acc5: 94.4000 (95.5680)  time: 0.2643  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4924 s / it)
* Acc@1 80.580 Acc@5 95.702 loss 0.914
Accuracy of the model on the 50000 test images: 80.6%
Max accuracy: 80.85%
Epoch: [145]  [   0/1251]  eta: 1:15:56  lr: 0.002335  min_lr: 0.002335  loss: 3.0527 (3.0527)  weight_decay: 0.0500 (0.0500)  time: 3.6425  data: 1.7195  max mem: 43713
Epoch: [145]  [ 200/1251]  eta: 0:09:27  lr: 0.002332  min_lr: 0.002332  loss: 3.0681 (3.1042)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8828 (0.9698)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [145]  [ 400/1251]  eta: 0:07:32  lr: 0.002328  min_lr: 0.002328  loss: 3.1321 (3.1224)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7340 (0.9266)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [145]  [ 600/1251]  eta: 0:05:45  lr: 0.002325  min_lr: 0.002325  loss: 3.3150 (3.1356)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8394 (0.9174)  time: 0.5223  data: 0.0004  max mem: 43713
Epoch: [145]  [ 800/1251]  eta: 0:03:58  lr: 0.002321  min_lr: 0.002321  loss: 3.1243 (3.1383)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8117 (0.9274)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [145]  [1000/1251]  eta: 0:02:12  lr: 0.002318  min_lr: 0.002318  loss: 3.2201 (3.1439)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1773 (0.9472)  time: 0.5338  data: 0.0004  max mem: 43713
Epoch: [145]  [1200/1251]  eta: 0:00:26  lr: 0.002314  min_lr: 0.002314  loss: 3.3178 (3.1418)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8830 (0.9380)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [145]  [1250/1251]  eta: 0:00:00  lr: 0.002313  min_lr: 0.002313  loss: 3.2992 (3.1427)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7354 (0.9310)  time: 0.4481  data: 0.0005  max mem: 43713
Epoch: [145] Total time: 0:10:58 (0.5263 s / it)
Averaged stats: lr: 0.002313  min_lr: 0.002313  loss: 3.2992 (3.1395)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7354 (0.9310)
Test:  [ 0/25]  eta: 0:01:51  loss: 0.6746 (0.6746)  acc1: 88.0000 (88.0000)  acc5: 98.4000 (98.4000)  time: 4.4697  data: 4.1673  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.8324 (0.8252)  acc1: 85.6000 (84.8364)  acc5: 97.6000 (97.2727)  time: 0.7067  data: 0.4392  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0102 (0.9722)  acc1: 80.0000 (80.9143)  acc5: 95.2000 (95.5048)  time: 0.2974  data: 0.0333  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0794 (0.9814)  acc1: 79.2000 (80.5120)  acc5: 94.8000 (95.5360)  time: 0.2643  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4623 s / it)
* Acc@1 80.762 Acc@5 95.716 loss 0.970
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 80.85%
Epoch: [146]  [   0/1251]  eta: 1:13:47  lr: 0.002313  min_lr: 0.002313  loss: 2.3969 (2.3969)  weight_decay: 0.0500 (0.0500)  time: 3.5389  data: 2.6524  max mem: 43713
Epoch: [146]  [ 200/1251]  eta: 0:09:27  lr: 0.002310  min_lr: 0.002310  loss: 3.0856 (3.0896)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7731 (0.9769)  time: 0.5346  data: 0.0005  max mem: 43713
Epoch: [146]  [ 400/1251]  eta: 0:07:33  lr: 0.002306  min_lr: 0.002306  loss: 3.2838 (3.1076)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7346 (0.8773)  time: 0.5226  data: 0.0005  max mem: 43713
Epoch: [146]  [ 600/1251]  eta: 0:05:45  lr: 0.002303  min_lr: 0.002303  loss: 3.2061 (3.1014)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8565 (0.9211)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [146]  [ 800/1251]  eta: 0:03:58  lr: 0.002299  min_lr: 0.002299  loss: 3.0848 (3.1061)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7807 (0.9019)  time: 0.5224  data: 0.0005  max mem: 43713
Epoch: [146]  [1000/1251]  eta: 0:02:12  lr: 0.002296  min_lr: 0.002296  loss: 2.9895 (3.1145)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8156 (0.8959)  time: 0.5329  data: 0.0005  max mem: 43713
Epoch: [146]  [1200/1251]  eta: 0:00:26  lr: 0.002292  min_lr: 0.002292  loss: 3.2743 (3.1193)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6731 (0.8820)  time: 0.5237  data: 0.0005  max mem: 43713
Epoch: [146]  [1250/1251]  eta: 0:00:00  lr: 0.002291  min_lr: 0.002291  loss: 3.2372 (3.1199)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7664 (0.8826)  time: 0.4438  data: 0.0007  max mem: 43713
Epoch: [146] Total time: 0:10:59 (0.5268 s / it)
Averaged stats: lr: 0.002291  min_lr: 0.002291  loss: 3.2372 (3.1429)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7664 (0.8826)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6290 (0.6290)  acc1: 88.4000 (88.4000)  acc5: 97.6000 (97.6000)  time: 5.5102  data: 5.2243  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7850 (0.7854)  acc1: 85.2000 (83.5273)  acc5: 97.2000 (97.2727)  time: 0.7417  data: 0.4752  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9887 (0.9186)  acc1: 78.4000 (80.6286)  acc5: 94.4000 (95.6191)  time: 0.2648  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9757 (0.9259)  acc1: 78.4000 (80.2080)  acc5: 94.4000 (95.6960)  time: 0.2648  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4793 s / it)
* Acc@1 80.704 Acc@5 95.790 loss 0.920
Accuracy of the model on the 50000 test images: 80.7%
Max accuracy: 80.85%
Epoch: [147]  [   0/1251]  eta: 1:13:33  lr: 0.002291  min_lr: 0.002291  loss: 3.6304 (3.6304)  weight_decay: 0.0500 (0.0500)  time: 3.5280  data: 2.6474  max mem: 43713
Epoch: [147]  [ 200/1251]  eta: 0:09:30  lr: 0.002288  min_lr: 0.002288  loss: 3.3232 (3.1357)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8319 (1.0290)  time: 0.5232  data: 0.0005  max mem: 43713
Epoch: [147]  [ 400/1251]  eta: 0:07:34  lr: 0.002284  min_lr: 0.002284  loss: 3.0771 (3.1393)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0774 (1.0354)  time: 0.5232  data: 0.0005  max mem: 43713
Epoch: [147]  [ 600/1251]  eta: 0:05:45  lr: 0.002280  min_lr: 0.002280  loss: 3.2661 (3.1245)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9078 (1.0316)  time: 0.5231  data: 0.0005  max mem: 43713
Epoch: [147]  [ 800/1251]  eta: 0:03:58  lr: 0.002277  min_lr: 0.002277  loss: 3.1844 (3.1206)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8984 (1.0026)  time: 0.5299  data: 0.0005  max mem: 43713
Epoch: [147]  [1000/1251]  eta: 0:02:12  lr: 0.002273  min_lr: 0.002273  loss: 3.4105 (3.1283)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7764 (0.9820)  time: 0.5223  data: 0.0005  max mem: 43713
Epoch: [147]  [1200/1251]  eta: 0:00:26  lr: 0.002270  min_lr: 0.002270  loss: 3.4063 (3.1351)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8266 (0.9882)  time: 0.5223  data: 0.0005  max mem: 43713
Epoch: [147]  [1250/1251]  eta: 0:00:00  lr: 0.002269  min_lr: 0.002269  loss: 3.2321 (3.1360)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7480 (0.9793)  time: 0.4516  data: 0.0007  max mem: 43713
Epoch: [147] Total time: 0:10:59 (0.5268 s / it)
Averaged stats: lr: 0.002269  min_lr: 0.002269  loss: 3.2321 (3.1486)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7480 (0.9793)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.6472 (0.6472)  acc1: 88.8000 (88.8000)  acc5: 98.0000 (98.0000)  time: 5.4631  data: 5.1466  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8197 (0.8155)  acc1: 83.6000 (83.8182)  acc5: 97.2000 (97.0182)  time: 0.7430  data: 0.4740  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0447 (0.9530)  acc1: 79.6000 (81.0476)  acc5: 94.8000 (95.4857)  time: 0.2676  data: 0.0034  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0447 (0.9664)  acc1: 78.8000 (80.7360)  acc5: 94.4000 (95.5040)  time: 0.2643  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4783 s / it)
* Acc@1 80.680 Acc@5 95.746 loss 0.956
Accuracy of the model on the 50000 test images: 80.7%
Max accuracy: 80.85%
Epoch: [148]  [   0/1251]  eta: 1:14:36  lr: 0.002269  min_lr: 0.002269  loss: 3.5469 (3.5469)  weight_decay: 0.0500 (0.0500)  time: 3.5787  data: 2.6760  max mem: 43713
Epoch: [148]  [ 200/1251]  eta: 0:09:27  lr: 0.002265  min_lr: 0.002265  loss: 3.2331 (3.1370)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0023 (1.0377)  time: 0.5233  data: 0.0005  max mem: 43713
Epoch: [148]  [ 400/1251]  eta: 0:07:32  lr: 0.002262  min_lr: 0.002262  loss: 3.2993 (3.1240)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9568 (0.9427)  time: 0.5322  data: 0.0005  max mem: 43713
Epoch: [148]  [ 600/1251]  eta: 0:05:45  lr: 0.002258  min_lr: 0.002258  loss: 2.9632 (3.1320)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0371 (0.9314)  time: 0.5236  data: 0.0005  max mem: 43713
Epoch: [148]  [ 800/1251]  eta: 0:03:58  lr: 0.002255  min_lr: 0.002255  loss: 3.3352 (3.1248)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8407 (0.9364)  time: 0.5230  data: 0.0005  max mem: 43713
Epoch: [148]  [1000/1251]  eta: 0:02:12  lr: 0.002251  min_lr: 0.002251  loss: 3.2465 (3.1293)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7381 (0.9090)  time: 0.5236  data: 0.0005  max mem: 43713
Epoch: [148]  [1200/1251]  eta: 0:00:26  lr: 0.002248  min_lr: 0.002248  loss: 3.3704 (3.1336)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8250 (0.9049)  time: 0.5226  data: 0.0005  max mem: 43713
Epoch: [148]  [1250/1251]  eta: 0:00:00  lr: 0.002247  min_lr: 0.002247  loss: 3.1742 (3.1328)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7884 (0.9171)  time: 0.4431  data: 0.0005  max mem: 43713
Epoch: [148] Total time: 0:10:59 (0.5269 s / it)
Averaged stats: lr: 0.002247  min_lr: 0.002247  loss: 3.1742 (3.1412)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7884 (0.9171)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6732 (0.6732)  acc1: 87.6000 (87.6000)  acc5: 99.2000 (99.2000)  time: 5.6820  data: 5.3511  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8140 (0.8090)  acc1: 84.4000 (83.9636)  acc5: 97.6000 (97.3091)  time: 0.7572  data: 0.4868  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0000 (0.9496)  acc1: 79.2000 (80.6286)  acc5: 95.2000 (95.7905)  time: 0.2648  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0517 (0.9630)  acc1: 78.0000 (80.1280)  acc5: 95.2000 (95.7600)  time: 0.2648  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4868 s / it)
* Acc@1 80.676 Acc@5 95.678 loss 0.957
Accuracy of the model on the 50000 test images: 80.7%
Max accuracy: 80.85%
Epoch: [149]  [   0/1251]  eta: 1:16:14  lr: 0.002247  min_lr: 0.002247  loss: 3.2860 (3.2860)  weight_decay: 0.0500 (0.0500)  time: 3.6566  data: 2.7901  max mem: 43713
Epoch: [149]  [ 200/1251]  eta: 0:09:27  lr: 0.002243  min_lr: 0.002243  loss: 3.2840 (3.1851)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0163 (1.0761)  time: 0.5237  data: 0.0004  max mem: 43713
Epoch: [149]  [ 400/1251]  eta: 0:07:35  lr: 0.002240  min_lr: 0.002240  loss: 3.4238 (3.1682)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7772 (1.0201)  time: 0.5236  data: 0.0004  max mem: 43713
Epoch: [149]  [ 600/1251]  eta: 0:05:45  lr: 0.002236  min_lr: 0.002236  loss: 3.2776 (3.1687)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1494 (1.0869)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [149]  [ 800/1251]  eta: 0:03:58  lr: 0.002232  min_lr: 0.002232  loss: 3.2996 (3.1647)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8082 (1.0193)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [149]  [1000/1251]  eta: 0:02:12  lr: 0.002229  min_lr: 0.002229  loss: 3.3875 (3.1675)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9523 (1.0284)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [149]  [1200/1251]  eta: 0:00:26  lr: 0.002225  min_lr: 0.002225  loss: 3.1192 (3.1595)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7550 (0.9981)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [149]  [1250/1251]  eta: 0:00:00  lr: 0.002224  min_lr: 0.002224  loss: 3.0631 (3.1530)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9046 (1.0016)  time: 0.4435  data: 0.0005  max mem: 43713
Epoch: [149] Total time: 0:10:59 (0.5270 s / it)
Averaged stats: lr: 0.002224  min_lr: 0.002224  loss: 3.0631 (3.1497)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9046 (1.0016)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6040 (0.6040)  acc1: 88.8000 (88.8000)  acc5: 99.2000 (99.2000)  time: 5.5441  data: 5.2456  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8327 (0.7624)  acc1: 84.8000 (84.1818)  acc5: 97.6000 (97.4182)  time: 0.7449  data: 0.4771  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9293 (0.8957)  acc1: 79.6000 (80.7810)  acc5: 96.0000 (95.9429)  time: 0.2650  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0102 (0.9089)  acc1: 78.8000 (80.4480)  acc5: 95.2000 (95.8560)  time: 0.2649  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4820 s / it)
* Acc@1 80.846 Acc@5 95.864 loss 0.906
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 80.85%
Epoch: [150]  [   0/1251]  eta: 1:14:34  lr: 0.002224  min_lr: 0.002224  loss: 3.5462 (3.5462)  weight_decay: 0.0500 (0.0500)  time: 3.5763  data: 2.2719  max mem: 43713
Epoch: [150]  [ 200/1251]  eta: 0:09:31  lr: 0.002221  min_lr: 0.002221  loss: 3.1334 (3.1296)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8067 (0.8128)  time: 0.5353  data: 0.0004  max mem: 43713
Epoch: [150]  [ 400/1251]  eta: 0:07:34  lr: 0.002217  min_lr: 0.002217  loss: 3.0485 (3.1175)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9023 (0.8630)  time: 0.5221  data: 0.0004  max mem: 43713
Epoch: [150]  [ 600/1251]  eta: 0:05:45  lr: 0.002214  min_lr: 0.002214  loss: 3.1461 (3.1162)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6457 (0.8413)  time: 0.5297  data: 0.0004  max mem: 43713
Epoch: [150]  [ 800/1251]  eta: 0:03:58  lr: 0.002210  min_lr: 0.002210  loss: 3.3867 (3.1189)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8460 (0.8915)  time: 0.5223  data: 0.0004  max mem: 43713
Epoch: [150]  [1000/1251]  eta: 0:02:12  lr: 0.002207  min_lr: 0.002207  loss: 3.3142 (3.0998)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8077 (0.9040)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [150]  [1200/1251]  eta: 0:00:26  lr: 0.002203  min_lr: 0.002203  loss: 3.2253 (3.1078)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8596 (0.9066)  time: 0.5220  data: 0.0004  max mem: 43713
Epoch: [150]  [1250/1251]  eta: 0:00:00  lr: 0.002202  min_lr: 0.002202  loss: 3.2629 (3.1071)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7377 (0.9041)  time: 0.4432  data: 0.0005  max mem: 43713
Epoch: [150] Total time: 0:10:58 (0.5265 s / it)
Averaged stats: lr: 0.002202  min_lr: 0.002202  loss: 3.2629 (3.1368)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7377 (0.9041)
Test:  [ 0/25]  eta: 0:02:08  loss: 0.5924 (0.5924)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 5.1408  data: 4.8410  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7224 (0.7506)  acc1: 85.2000 (84.7273)  acc5: 98.0000 (97.7818)  time: 0.7076  data: 0.4403  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9471 (0.9086)  acc1: 78.8000 (81.1238)  acc5: 95.6000 (95.7905)  time: 0.2641  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0601 (0.9250)  acc1: 77.6000 (80.4480)  acc5: 94.0000 (95.7440)  time: 0.2640  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4628 s / it)
* Acc@1 80.868 Acc@5 95.890 loss 0.920
Accuracy of the model on the 50000 test images: 80.9%
Max accuracy: 80.87%
Epoch: [151]  [   0/1251]  eta: 1:03:13  lr: 0.002202  min_lr: 0.002202  loss: 3.3716 (3.3716)  weight_decay: 0.0500 (0.0500)  time: 3.0321  data: 2.4929  max mem: 43713
Epoch: [151]  [ 200/1251]  eta: 0:09:27  lr: 0.002198  min_lr: 0.002198  loss: 3.0392 (3.1377)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7191 (0.9735)  time: 0.5239  data: 0.0004  max mem: 43713
Epoch: [151]  [ 400/1251]  eta: 0:07:32  lr: 0.002195  min_lr: 0.002195  loss: 3.0787 (3.1296)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8384 (0.9855)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [151]  [ 600/1251]  eta: 0:05:45  lr: 0.002191  min_lr: 0.002191  loss: 3.2921 (3.1442)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8752 (nan)  time: 0.5319  data: 0.0004  max mem: 43713
Epoch: [151]  [ 800/1251]  eta: 0:03:58  lr: 0.002188  min_lr: 0.002188  loss: 3.1532 (3.1398)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7878 (nan)  time: 0.5237  data: 0.0005  max mem: 43713
Epoch: [151]  [1000/1251]  eta: 0:02:12  lr: 0.002184  min_lr: 0.002184  loss: 3.3908 (3.1478)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8019 (nan)  time: 0.5237  data: 0.0004  max mem: 43713
Epoch: [151]  [1200/1251]  eta: 0:00:26  lr: 0.002181  min_lr: 0.002181  loss: 3.4140 (3.1482)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0100 (nan)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [151]  [1250/1251]  eta: 0:00:00  lr: 0.002180  min_lr: 0.002180  loss: 3.2094 (3.1495)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8215 (nan)  time: 0.4438  data: 0.0006  max mem: 43713
Epoch: [151] Total time: 0:10:59 (0.5271 s / it)
Averaged stats: lr: 0.002180  min_lr: 0.002180  loss: 3.2094 (3.1419)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8215 (nan)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6198 (0.6198)  acc1: 87.2000 (87.2000)  acc5: 98.8000 (98.8000)  time: 5.4811  data: 5.1923  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7515 (0.7498)  acc1: 83.6000 (84.5818)  acc5: 97.6000 (97.2364)  time: 0.7391  data: 0.4723  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9245 (0.9096)  acc1: 79.6000 (81.4095)  acc5: 94.8000 (95.4095)  time: 0.2650  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0374 (0.9220)  acc1: 78.4000 (80.8800)  acc5: 94.4000 (95.3440)  time: 0.2650  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4797 s / it)
* Acc@1 80.816 Acc@5 95.712 loss 0.920
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 80.87%
Epoch: [152]  [   0/1251]  eta: 1:17:13  lr: 0.002180  min_lr: 0.002180  loss: 3.5980 (3.5980)  weight_decay: 0.0500 (0.0500)  time: 3.7040  data: 1.5507  max mem: 43713
Epoch: [152]  [ 200/1251]  eta: 0:09:28  lr: 0.002176  min_lr: 0.002176  loss: 3.1405 (3.1438)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0032 (0.9167)  time: 0.5248  data: 0.0004  max mem: 43713
Epoch: [152]  [ 400/1251]  eta: 0:07:34  lr: 0.002173  min_lr: 0.002173  loss: 3.2079 (3.1689)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7321 (0.9535)  time: 0.5391  data: 0.0004  max mem: 43713
Epoch: [152]  [ 600/1251]  eta: 0:05:45  lr: 0.002169  min_lr: 0.002169  loss: 3.3664 (3.1682)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8161 (0.9265)  time: 0.5239  data: 0.0004  max mem: 43713
Epoch: [152]  [ 800/1251]  eta: 0:03:58  lr: 0.002165  min_lr: 0.002165  loss: 3.2301 (3.1716)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9584 (0.9913)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [152]  [1000/1251]  eta: 0:02:12  lr: 0.002162  min_lr: 0.002162  loss: 3.2034 (3.1577)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7053 (0.9903)  time: 0.5222  data: 0.0004  max mem: 43713
Epoch: [152]  [1200/1251]  eta: 0:00:26  lr: 0.002158  min_lr: 0.002158  loss: 2.9991 (3.1467)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8949 (0.9780)  time: 0.5279  data: 0.0004  max mem: 43713
Epoch: [152]  [1250/1251]  eta: 0:00:00  lr: 0.002157  min_lr: 0.002157  loss: 2.9950 (3.1454)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7658 (0.9722)  time: 0.4433  data: 0.0005  max mem: 43713
Epoch: [152] Total time: 0:10:58 (0.5267 s / it)
Averaged stats: lr: 0.002157  min_lr: 0.002157  loss: 2.9950 (3.1290)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7658 (0.9722)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.5917 (0.5917)  acc1: 87.6000 (87.6000)  acc5: 98.4000 (98.4000)  time: 5.4972  data: 5.1764  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7302 (0.7507)  acc1: 85.6000 (83.8909)  acc5: 98.0000 (97.3455)  time: 0.7396  data: 0.4709  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9408 (0.8965)  acc1: 76.8000 (80.7619)  acc5: 96.0000 (95.6952)  time: 0.2637  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0001 (0.9079)  acc1: 76.8000 (80.4160)  acc5: 95.2000 (95.6000)  time: 0.2636  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4785 s / it)
* Acc@1 80.788 Acc@5 95.766 loss 0.894
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 80.87%
Epoch: [153]  [   0/1251]  eta: 1:11:36  lr: 0.002157  min_lr: 0.002157  loss: 2.7988 (2.7988)  weight_decay: 0.0500 (0.0500)  time: 3.4341  data: 2.8313  max mem: 43713
Epoch: [153]  [ 200/1251]  eta: 0:09:26  lr: 0.002154  min_lr: 0.002154  loss: 3.2030 (3.1693)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7250 (0.9095)  time: 0.5308  data: 0.0004  max mem: 43713
Epoch: [153]  [ 400/1251]  eta: 0:07:34  lr: 0.002150  min_lr: 0.002150  loss: 3.4004 (3.1496)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6800 (0.9182)  time: 0.5348  data: 0.0004  max mem: 43713
Epoch: [153]  [ 600/1251]  eta: 0:05:45  lr: 0.002147  min_lr: 0.002147  loss: 2.9411 (3.1220)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8481 (0.9382)  time: 0.5241  data: 0.0004  max mem: 43713
Epoch: [153]  [ 800/1251]  eta: 0:03:58  lr: 0.002143  min_lr: 0.002143  loss: 3.4200 (3.1265)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6899 (0.9215)  time: 0.5348  data: 0.0005  max mem: 43713
Epoch: [153]  [1000/1251]  eta: 0:02:12  lr: 0.002139  min_lr: 0.002139  loss: 3.1099 (3.1244)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0293 (0.9332)  time: 0.5230  data: 0.0005  max mem: 43713
Epoch: [153]  [1200/1251]  eta: 0:00:26  lr: 0.002136  min_lr: 0.002136  loss: 3.3613 (3.1276)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1280 (0.9493)  time: 0.5241  data: 0.0005  max mem: 43713
Epoch: [153]  [1250/1251]  eta: 0:00:00  lr: 0.002135  min_lr: 0.002135  loss: 3.1245 (3.1276)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0934 (0.9594)  time: 0.4437  data: 0.0005  max mem: 43713
Epoch: [153] Total time: 0:10:59 (0.5270 s / it)
Averaged stats: lr: 0.002135  min_lr: 0.002135  loss: 3.1245 (3.1259)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0934 (0.9594)
Test:  [ 0/25]  eta: 0:01:49  loss: 0.5833 (0.5833)  acc1: 86.8000 (86.8000)  acc5: 98.8000 (98.8000)  time: 4.3653  data: 4.0395  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7027 (0.7377)  acc1: 85.2000 (84.0364)  acc5: 97.6000 (97.5636)  time: 0.6954  data: 0.4255  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9365 (0.8811)  acc1: 79.2000 (81.0286)  acc5: 95.2000 (95.9619)  time: 0.2965  data: 0.0321  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9365 (0.8957)  acc1: 78.0000 (80.5920)  acc5: 95.2000 (95.9040)  time: 0.2646  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4575 s / it)
* Acc@1 81.156 Acc@5 95.912 loss 0.882
Accuracy of the model on the 50000 test images: 81.2%
Max accuracy: 81.16%
Epoch: [154]  [   0/1251]  eta: 1:00:32  lr: 0.002135  min_lr: 0.002135  loss: 2.8118 (2.8118)  weight_decay: 0.0500 (0.0500)  time: 2.9040  data: 2.3666  max mem: 43713
Epoch: [154]  [ 200/1251]  eta: 0:09:27  lr: 0.002131  min_lr: 0.002131  loss: 3.2824 (3.0849)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8697 (1.0172)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [154]  [ 400/1251]  eta: 0:07:32  lr: 0.002128  min_lr: 0.002128  loss: 3.0318 (3.0646)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8069 (0.9158)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [154]  [ 600/1251]  eta: 0:05:44  lr: 0.002124  min_lr: 0.002124  loss: 3.4095 (3.0842)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2221 (0.9808)  time: 0.5295  data: 0.0004  max mem: 43713
Epoch: [154]  [ 800/1251]  eta: 0:03:58  lr: 0.002121  min_lr: 0.002121  loss: 3.3337 (3.0855)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8106 (0.9601)  time: 0.5221  data: 0.0005  max mem: 43713
Epoch: [154]  [1000/1251]  eta: 0:02:12  lr: 0.002117  min_lr: 0.002117  loss: 3.2912 (3.0893)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6458 (0.9356)  time: 0.5218  data: 0.0005  max mem: 43713
Epoch: [154]  [1200/1251]  eta: 0:00:26  lr: 0.002113  min_lr: 0.002113  loss: 3.1948 (3.0996)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8637 (0.9304)  time: 0.5220  data: 0.0005  max mem: 43713
Epoch: [154]  [1250/1251]  eta: 0:00:00  lr: 0.002113  min_lr: 0.002113  loss: 3.0236 (3.0975)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7651 (0.9304)  time: 0.4426  data: 0.0005  max mem: 43713
Epoch: [154] Total time: 0:10:57 (0.5259 s / it)
Averaged stats: lr: 0.002113  min_lr: 0.002113  loss: 3.0236 (3.1109)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7651 (0.9304)
Test:  [ 0/25]  eta: 0:02:11  loss: 0.6126 (0.6126)  acc1: 88.4000 (88.4000)  acc5: 98.8000 (98.8000)  time: 5.2506  data: 4.9678  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7743 (0.7638)  acc1: 85.6000 (84.0000)  acc5: 98.4000 (97.7818)  time: 0.7171  data: 0.4519  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9754 (0.8992)  acc1: 78.8000 (81.2381)  acc5: 95.6000 (96.2286)  time: 0.2636  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9754 (0.9154)  acc1: 78.8000 (80.7040)  acc5: 95.6000 (96.1760)  time: 0.2635  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4669 s / it)
* Acc@1 81.042 Acc@5 95.960 loss 0.906
Accuracy of the model on the 50000 test images: 81.0%
Max accuracy: 81.16%
Epoch: [155]  [   0/1251]  eta: 1:10:19  lr: 0.002113  min_lr: 0.002113  loss: 2.2076 (2.2076)  weight_decay: 0.0500 (0.0500)  time: 3.3728  data: 2.3835  max mem: 43713
Epoch: [155]  [ 200/1251]  eta: 0:09:25  lr: 0.002109  min_lr: 0.002109  loss: 3.0563 (3.0357)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9629 (1.0074)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [155]  [ 400/1251]  eta: 0:07:32  lr: 0.002105  min_lr: 0.002105  loss: 3.1778 (3.0740)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8298 (0.9480)  time: 0.5309  data: 0.0004  max mem: 43713
Epoch: [155]  [ 600/1251]  eta: 0:05:44  lr: 0.002102  min_lr: 0.002102  loss: 3.2481 (3.0883)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8253 (0.9168)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [155]  [ 800/1251]  eta: 0:03:57  lr: 0.002098  min_lr: 0.002098  loss: 3.3059 (3.1026)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9446 (0.9213)  time: 0.5219  data: 0.0004  max mem: 43713
Epoch: [155]  [1000/1251]  eta: 0:02:12  lr: 0.002095  min_lr: 0.002095  loss: 3.2948 (3.0961)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9194 (0.9390)  time: 0.5299  data: 0.0004  max mem: 43713
Epoch: [155]  [1200/1251]  eta: 0:00:26  lr: 0.002091  min_lr: 0.002091  loss: 3.4410 (3.1074)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0364 (0.9553)  time: 0.5223  data: 0.0005  max mem: 43713
Epoch: [155]  [1250/1251]  eta: 0:00:00  lr: 0.002090  min_lr: 0.002090  loss: 3.1346 (3.1045)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1608 (0.9653)  time: 0.4433  data: 0.0005  max mem: 43713
Epoch: [155] Total time: 0:10:57 (0.5259 s / it)
Averaged stats: lr: 0.002090  min_lr: 0.002090  loss: 3.1346 (3.1122)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1608 (0.9653)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.5504 (0.5504)  acc1: 90.0000 (90.0000)  acc5: 98.8000 (98.8000)  time: 5.6410  data: 5.3508  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7707 (0.7443)  acc1: 84.8000 (84.5091)  acc5: 98.4000 (97.7818)  time: 0.7533  data: 0.4867  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9045 (0.8911)  acc1: 78.4000 (80.9143)  acc5: 95.2000 (96.0571)  time: 0.2647  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0101 (0.9065)  acc1: 78.0000 (80.3680)  acc5: 94.8000 (96.0480)  time: 0.2647  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4850 s / it)
* Acc@1 81.014 Acc@5 95.848 loss 0.899
Accuracy of the model on the 50000 test images: 81.0%
Max accuracy: 81.16%
Epoch: [156]  [   0/1251]  eta: 1:15:47  lr: 0.002090  min_lr: 0.002090  loss: 2.9301 (2.9301)  weight_decay: 0.0500 (0.0500)  time: 3.6351  data: 2.6470  max mem: 43713
Epoch: [156]  [ 200/1251]  eta: 0:09:27  lr: 0.002087  min_lr: 0.002087  loss: 3.1774 (3.1369)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7720 (0.9221)  time: 0.5300  data: 0.0005  max mem: 43713
Epoch: [156]  [ 400/1251]  eta: 0:07:34  lr: 0.002083  min_lr: 0.002083  loss: 3.2202 (3.1260)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6868 (0.9244)  time: 0.5233  data: 0.0005  max mem: 43713
Epoch: [156]  [ 600/1251]  eta: 0:05:45  lr: 0.002079  min_lr: 0.002079  loss: 3.2122 (3.1199)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9372 (0.9182)  time: 0.5233  data: 0.0005  max mem: 43713
Epoch: [156]  [ 800/1251]  eta: 0:03:58  lr: 0.002076  min_lr: 0.002076  loss: 3.4103 (3.1133)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0303 (0.9718)  time: 0.5230  data: 0.0005  max mem: 43713
Epoch: [156]  [1000/1251]  eta: 0:02:12  lr: 0.002072  min_lr: 0.002072  loss: 3.2238 (3.1120)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7145 (0.9668)  time: 0.5225  data: 0.0005  max mem: 43713
Epoch: [156]  [1200/1251]  eta: 0:00:26  lr: 0.002069  min_lr: 0.002069  loss: 3.2104 (3.1123)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9124 (0.9688)  time: 0.5224  data: 0.0005  max mem: 43713
Epoch: [156]  [1250/1251]  eta: 0:00:00  lr: 0.002068  min_lr: 0.002068  loss: 2.7780 (3.1105)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8585 (0.9631)  time: 0.4438  data: 0.0007  max mem: 43713
Epoch: [156] Total time: 0:10:58 (0.5266 s / it)
Averaged stats: lr: 0.002068  min_lr: 0.002068  loss: 2.7780 (3.1083)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8585 (0.9631)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6327 (0.6327)  acc1: 90.0000 (90.0000)  acc5: 98.0000 (98.0000)  time: 5.5258  data: 5.2168  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7887 (0.7900)  acc1: 84.0000 (84.4727)  acc5: 97.6000 (97.4182)  time: 0.7432  data: 0.4746  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9502 (0.9242)  acc1: 79.2000 (81.1619)  acc5: 96.0000 (95.9238)  time: 0.2648  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0164 (0.9365)  acc1: 78.4000 (80.8320)  acc5: 95.2000 (95.7760)  time: 0.2647  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4810 s / it)
* Acc@1 81.246 Acc@5 95.942 loss 0.916
Accuracy of the model on the 50000 test images: 81.2%
Max accuracy: 81.25%
Epoch: [157]  [   0/1251]  eta: 1:01:51  lr: 0.002068  min_lr: 0.002068  loss: 3.4000 (3.4000)  weight_decay: 0.0500 (0.0500)  time: 2.9671  data: 2.4234  max mem: 43713
Epoch: [157]  [ 200/1251]  eta: 0:09:27  lr: 0.002064  min_lr: 0.002064  loss: 3.0818 (3.1360)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0878 (1.0709)  time: 0.5237  data: 0.0004  max mem: 43713
Epoch: [157]  [ 400/1251]  eta: 0:07:32  lr: 0.002061  min_lr: 0.002061  loss: 3.2993 (3.1183)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8184 (1.0131)  time: 0.5247  data: 0.0004  max mem: 43713
Epoch: [157]  [ 600/1251]  eta: 0:05:44  lr: 0.002057  min_lr: 0.002057  loss: 2.8990 (3.1236)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8843 (0.9779)  time: 0.5284  data: 0.0005  max mem: 43713
Epoch: [157]  [ 800/1251]  eta: 0:03:58  lr: 0.002053  min_lr: 0.002053  loss: 3.2185 (3.1247)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0697 (0.9805)  time: 0.5237  data: 0.0004  max mem: 43713
Epoch: [157]  [1000/1251]  eta: 0:02:12  lr: 0.002050  min_lr: 0.002050  loss: 3.1693 (3.1172)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9472 (0.9722)  time: 0.5238  data: 0.0004  max mem: 43713
Epoch: [157]  [1200/1251]  eta: 0:00:26  lr: 0.002046  min_lr: 0.002046  loss: 3.3125 (3.1247)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8434 (nan)  time: 0.5371  data: 0.0004  max mem: 43713
Epoch: [157]  [1250/1251]  eta: 0:00:00  lr: 0.002045  min_lr: 0.002045  loss: 3.2665 (3.1251)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9586 (nan)  time: 0.4433  data: 0.0006  max mem: 43713
Epoch: [157] Total time: 0:10:58 (0.5267 s / it)
Averaged stats: lr: 0.002045  min_lr: 0.002045  loss: 3.2665 (3.1104)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9586 (nan)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.6727 (0.6727)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 5.4390  data: 5.1461  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7943 (0.8253)  acc1: 85.2000 (84.5455)  acc5: 98.0000 (97.5273)  time: 0.7345  data: 0.4681  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0294 (0.9608)  acc1: 78.8000 (81.2191)  acc5: 95.2000 (96.1333)  time: 0.2639  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0680 (0.9778)  acc1: 78.8000 (80.6880)  acc5: 94.8000 (95.8560)  time: 0.2638  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4741 s / it)
* Acc@1 81.166 Acc@5 96.016 loss 0.962
Accuracy of the model on the 50000 test images: 81.2%
Max accuracy: 81.25%
Epoch: [158]  [   0/1251]  eta: 1:13:33  lr: 0.002045  min_lr: 0.002045  loss: 3.5695 (3.5695)  weight_decay: 0.0500 (0.0500)  time: 3.5279  data: 2.9832  max mem: 43713
Epoch: [158]  [ 200/1251]  eta: 0:09:26  lr: 0.002042  min_lr: 0.002042  loss: 2.9218 (3.0844)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8635 (0.9187)  time: 0.5239  data: 0.0004  max mem: 43713
Epoch: [158]  [ 400/1251]  eta: 0:07:32  lr: 0.002038  min_lr: 0.002038  loss: 3.4370 (3.0828)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0241 (0.8930)  time: 0.5241  data: 0.0004  max mem: 43713
Epoch: [158]  [ 600/1251]  eta: 0:05:45  lr: 0.002035  min_lr: 0.002035  loss: 3.0777 (3.0943)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2465 (0.9831)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [158]  [ 800/1251]  eta: 0:03:58  lr: 0.002031  min_lr: 0.002031  loss: 3.3767 (3.0950)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8028 (0.9933)  time: 0.5277  data: 0.0004  max mem: 43713
Epoch: [158]  [1000/1251]  eta: 0:02:12  lr: 0.002027  min_lr: 0.002027  loss: 3.2312 (3.0999)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8041 (0.9547)  time: 0.5282  data: 0.0005  max mem: 43713
Epoch: [158]  [1200/1251]  eta: 0:00:26  lr: 0.002024  min_lr: 0.002024  loss: 3.3005 (3.1136)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0753 (0.9848)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [158]  [1250/1251]  eta: 0:00:00  lr: 0.002023  min_lr: 0.002023  loss: 3.1907 (3.1117)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2254 (0.9959)  time: 0.4433  data: 0.0005  max mem: 43713
Epoch: [158] Total time: 0:10:58 (0.5262 s / it)
Averaged stats: lr: 0.002023  min_lr: 0.002023  loss: 3.1907 (3.1156)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2254 (0.9959)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.5933 (0.5933)  acc1: 88.8000 (88.8000)  acc5: 99.2000 (99.2000)  time: 5.6203  data: 5.3199  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7761 (0.7711)  acc1: 85.6000 (84.4000)  acc5: 97.2000 (97.3455)  time: 0.7516  data: 0.4839  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9741 (0.9114)  acc1: 78.8000 (80.9905)  acc5: 94.8000 (95.7333)  time: 0.2646  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9908 (0.9225)  acc1: 78.4000 (80.5600)  acc5: 94.8000 (95.6160)  time: 0.2645  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4833 s / it)
* Acc@1 81.172 Acc@5 95.906 loss 0.907
Accuracy of the model on the 50000 test images: 81.2%
Max accuracy: 81.25%
Epoch: [159]  [   0/1251]  eta: 1:15:39  lr: 0.002023  min_lr: 0.002023  loss: 3.6739 (3.6739)  weight_decay: 0.0500 (0.0500)  time: 3.6286  data: 2.5398  max mem: 43713
Epoch: [159]  [ 200/1251]  eta: 0:09:27  lr: 0.002019  min_lr: 0.002019  loss: 2.9861 (3.1093)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8262 (0.8577)  time: 0.5305  data: 0.0005  max mem: 43713
Epoch: [159]  [ 400/1251]  eta: 0:07:34  lr: 0.002016  min_lr: 0.002016  loss: 3.0049 (3.1259)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7566 (0.8699)  time: 0.5234  data: 0.0005  max mem: 43713
Epoch: [159]  [ 600/1251]  eta: 0:05:45  lr: 0.002012  min_lr: 0.002012  loss: 3.0756 (3.1133)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0546 (0.9070)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [159]  [ 800/1251]  eta: 0:03:58  lr: 0.002009  min_lr: 0.002009  loss: 3.3632 (3.1201)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9346 (0.9089)  time: 0.5232  data: 0.0005  max mem: 43713
Epoch: [159]  [1000/1251]  eta: 0:02:12  lr: 0.002005  min_lr: 0.002005  loss: 2.9341 (3.1001)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7750 (0.9171)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [159]  [1200/1251]  eta: 0:00:26  lr: 0.002001  min_lr: 0.002001  loss: 3.1806 (3.1066)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7046 (0.9106)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [159]  [1250/1251]  eta: 0:00:00  lr: 0.002001  min_lr: 0.002001  loss: 3.2141 (3.1088)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7811 (0.9076)  time: 0.4435  data: 0.0007  max mem: 43713
Epoch: [159] Total time: 0:10:58 (0.5266 s / it)
Averaged stats: lr: 0.002001  min_lr: 0.002001  loss: 3.2141 (3.1033)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7811 (0.9076)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.5463 (0.5463)  acc1: 91.2000 (91.2000)  acc5: 98.8000 (98.8000)  time: 5.4930  data: 5.2067  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7366 (0.7435)  acc1: 86.0000 (84.8727)  acc5: 97.6000 (97.4545)  time: 0.7400  data: 0.4736  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9515 (0.8836)  acc1: 78.8000 (81.3524)  acc5: 95.6000 (95.9429)  time: 0.2646  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9656 (0.8987)  acc1: 78.8000 (80.8160)  acc5: 94.8000 (95.8080)  time: 0.2645  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4786 s / it)
* Acc@1 81.296 Acc@5 96.028 loss 0.887
Accuracy of the model on the 50000 test images: 81.3%
Max accuracy: 81.30%
Epoch: [160]  [   0/1251]  eta: 0:58:51  lr: 0.002001  min_lr: 0.002001  loss: 3.3208 (3.3208)  weight_decay: 0.0500 (0.0500)  time: 2.8226  data: 2.2953  max mem: 43713
Epoch: [160]  [ 200/1251]  eta: 0:09:27  lr: 0.001997  min_lr: 0.001997  loss: 3.2799 (3.0801)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8651 (0.9426)  time: 0.5299  data: 0.0004  max mem: 43713
Epoch: [160]  [ 400/1251]  eta: 0:07:32  lr: 0.001993  min_lr: 0.001993  loss: 3.1704 (3.0814)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8424 (0.9818)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [160]  [ 600/1251]  eta: 0:05:44  lr: 0.001990  min_lr: 0.001990  loss: 2.9314 (3.0825)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7078 (0.9437)  time: 0.5235  data: 0.0005  max mem: 43713
Epoch: [160]  [ 800/1251]  eta: 0:03:58  lr: 0.001986  min_lr: 0.001986  loss: 3.3444 (3.0875)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8505 (0.9305)  time: 0.5250  data: 0.0004  max mem: 43713
Epoch: [160]  [1000/1251]  eta: 0:02:12  lr: 0.001983  min_lr: 0.001983  loss: 3.2483 (3.0825)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7063 (0.9341)  time: 0.5237  data: 0.0005  max mem: 43713
Epoch: [160]  [1200/1251]  eta: 0:00:26  lr: 0.001979  min_lr: 0.001979  loss: 2.9960 (3.0769)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9885 (0.9384)  time: 0.5317  data: 0.0005  max mem: 43713
Epoch: [160]  [1250/1251]  eta: 0:00:00  lr: 0.001978  min_lr: 0.001978  loss: 3.3705 (3.0786)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9037 (0.9379)  time: 0.4472  data: 0.0006  max mem: 43713
Epoch: [160] Total time: 0:10:59 (0.5270 s / it)
Averaged stats: lr: 0.001978  min_lr: 0.001978  loss: 3.3705 (3.1019)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9037 (0.9379)
Test:  [ 0/25]  eta: 0:02:10  loss: 0.6640 (0.6640)  acc1: 88.0000 (88.0000)  acc5: 99.2000 (99.2000)  time: 5.2336  data: 4.9350  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.8218 (0.8125)  acc1: 84.4000 (84.6546)  acc5: 98.0000 (97.6000)  time: 0.7165  data: 0.4489  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0319 (0.9513)  acc1: 79.2000 (81.4286)  acc5: 96.0000 (96.0762)  time: 0.2647  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0402 (0.9697)  acc1: 79.2000 (80.8160)  acc5: 94.8000 (95.9360)  time: 0.2645  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4681 s / it)
* Acc@1 81.304 Acc@5 95.988 loss 0.959
Accuracy of the model on the 50000 test images: 81.3%
Max accuracy: 81.30%
Epoch: [161]  [   0/1251]  eta: 0:59:12  lr: 0.001978  min_lr: 0.001978  loss: 2.8200 (2.8200)  weight_decay: 0.0500 (0.0500)  time: 2.8393  data: 2.2954  max mem: 43713
Epoch: [161]  [ 200/1251]  eta: 0:09:23  lr: 0.001974  min_lr: 0.001974  loss: 3.1058 (3.0436)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1504 (1.1840)  time: 0.5223  data: 0.0004  max mem: 43713
Epoch: [161]  [ 400/1251]  eta: 0:07:30  lr: 0.001971  min_lr: 0.001971  loss: 2.8653 (3.0486)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8731 (1.0446)  time: 0.5222  data: 0.0004  max mem: 43713
Epoch: [161]  [ 600/1251]  eta: 0:05:44  lr: 0.001967  min_lr: 0.001967  loss: 3.2013 (3.0414)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9139 (1.0055)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [161]  [ 800/1251]  eta: 0:03:57  lr: 0.001964  min_lr: 0.001964  loss: 3.2080 (3.0461)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9780 (1.0149)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [161]  [1000/1251]  eta: 0:02:12  lr: 0.001960  min_lr: 0.001960  loss: 3.1307 (3.0618)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7387 (1.0010)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [161]  [1200/1251]  eta: 0:00:26  lr: 0.001956  min_lr: 0.001956  loss: 3.2967 (3.0670)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7076 (0.9905)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [161]  [1250/1251]  eta: 0:00:00  lr: 0.001956  min_lr: 0.001956  loss: 3.0292 (3.0702)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6936 (0.9803)  time: 0.4437  data: 0.0005  max mem: 43713
Epoch: [161] Total time: 0:10:57 (0.5255 s / it)
Averaged stats: lr: 0.001956  min_lr: 0.001956  loss: 3.0292 (3.0923)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6936 (0.9803)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.5914 (0.5914)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 5.5328  data: 5.2179  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7838 (0.7680)  acc1: 84.4000 (84.5455)  acc5: 97.6000 (97.5273)  time: 0.7433  data: 0.4747  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9847 (0.9121)  acc1: 78.8000 (81.2762)  acc5: 96.0000 (96.0762)  time: 0.2642  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0190 (0.9235)  acc1: 78.8000 (80.8000)  acc5: 95.2000 (96.0160)  time: 0.2641  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4814 s / it)
* Acc@1 81.280 Acc@5 95.948 loss 0.918
Accuracy of the model on the 50000 test images: 81.3%
Max accuracy: 81.30%
Epoch: [162]  [   0/1251]  eta: 1:13:52  lr: 0.001956  min_lr: 0.001956  loss: 2.4502 (2.4502)  weight_decay: 0.0500 (0.0500)  time: 3.5435  data: 1.6251  max mem: 43713
Epoch: [162]  [ 200/1251]  eta: 0:09:25  lr: 0.001952  min_lr: 0.001952  loss: 3.2981 (3.0934)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8032 (0.9466)  time: 0.5224  data: 0.0005  max mem: 43713
Epoch: [162]  [ 400/1251]  eta: 0:07:33  lr: 0.001948  min_lr: 0.001948  loss: 3.1327 (3.0879)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0559 (0.9503)  time: 0.5326  data: 0.0005  max mem: 43713
Epoch: [162]  [ 600/1251]  eta: 0:05:45  lr: 0.001945  min_lr: 0.001945  loss: 3.2764 (3.0834)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9479 (0.9586)  time: 0.5223  data: 0.0005  max mem: 43713
Epoch: [162]  [ 800/1251]  eta: 0:03:58  lr: 0.001941  min_lr: 0.001941  loss: 3.2981 (3.0801)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7719 (0.9430)  time: 0.5276  data: 0.0005  max mem: 43713
Epoch: [162]  [1000/1251]  eta: 0:02:12  lr: 0.001938  min_lr: 0.001938  loss: 3.3773 (3.0877)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0349 (0.9440)  time: 0.5220  data: 0.0005  max mem: 43713
Epoch: [162]  [1200/1251]  eta: 0:00:26  lr: 0.001934  min_lr: 0.001934  loss: 3.3770 (3.0942)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1127 (0.9465)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [162]  [1250/1251]  eta: 0:00:00  lr: 0.001933  min_lr: 0.001933  loss: 3.2523 (3.0946)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1828 (0.9511)  time: 0.4436  data: 0.0007  max mem: 43713
Epoch: [162] Total time: 0:10:58 (0.5261 s / it)
Averaged stats: lr: 0.001933  min_lr: 0.001933  loss: 3.2523 (3.0851)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1828 (0.9511)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.6411 (0.6411)  acc1: 88.4000 (88.4000)  acc5: 98.4000 (98.4000)  time: 5.6731  data: 5.3648  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7783 (0.7972)  acc1: 85.2000 (84.8364)  acc5: 98.4000 (97.8182)  time: 0.7555  data: 0.4880  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9852 (0.9353)  acc1: 79.2000 (81.5429)  acc5: 95.6000 (96.0381)  time: 0.2636  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0249 (0.9533)  acc1: 78.4000 (80.8000)  acc5: 95.2000 (95.8880)  time: 0.2635  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4828 s / it)
* Acc@1 81.174 Acc@5 95.964 loss 0.944
Accuracy of the model on the 50000 test images: 81.2%
Max accuracy: 81.30%
Epoch: [163]  [   0/1251]  eta: 1:14:33  lr: 0.001933  min_lr: 0.001933  loss: 3.5346 (3.5346)  weight_decay: 0.0500 (0.0500)  time: 3.5763  data: 2.9261  max mem: 43713
Epoch: [163]  [ 200/1251]  eta: 0:09:29  lr: 0.001930  min_lr: 0.001930  loss: 3.0531 (3.0826)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9625 (1.0427)  time: 0.5313  data: 0.0004  max mem: 43713
Epoch: [163]  [ 400/1251]  eta: 0:07:33  lr: 0.001926  min_lr: 0.001926  loss: 3.3928 (3.1014)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7390 (0.9385)  time: 0.5225  data: 0.0005  max mem: 43713
Epoch: [163]  [ 600/1251]  eta: 0:05:45  lr: 0.001922  min_lr: 0.001922  loss: 3.1997 (3.1049)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9473 (0.9594)  time: 0.5247  data: 0.0005  max mem: 43713
Epoch: [163]  [ 800/1251]  eta: 0:03:58  lr: 0.001919  min_lr: 0.001919  loss: 3.2692 (3.0969)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9394 (0.9797)  time: 0.5295  data: 0.0005  max mem: 43713
Epoch: [163]  [1000/1251]  eta: 0:02:12  lr: 0.001915  min_lr: 0.001915  loss: 3.0780 (3.0952)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9658 (1.0273)  time: 0.5232  data: 0.0005  max mem: 43713
Epoch: [163]  [1200/1251]  eta: 0:00:26  lr: 0.001912  min_lr: 0.001912  loss: 3.0532 (3.0961)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8759 (1.0098)  time: 0.5297  data: 0.0005  max mem: 43713
Epoch: [163]  [1250/1251]  eta: 0:00:00  lr: 0.001911  min_lr: 0.001911  loss: 3.2184 (3.0960)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0087 (1.0221)  time: 0.4524  data: 0.0007  max mem: 43713
Epoch: [163] Total time: 0:10:59 (0.5268 s / it)
Averaged stats: lr: 0.001911  min_lr: 0.001911  loss: 3.2184 (3.0828)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0087 (1.0221)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.6977 (0.6977)  acc1: 88.4000 (88.4000)  acc5: 98.4000 (98.4000)  time: 5.6451  data: 5.3265  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8233 (0.8342)  acc1: 85.2000 (84.3273)  acc5: 97.6000 (97.4546)  time: 0.7537  data: 0.4845  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0261 (0.9550)  acc1: 79.6000 (81.2381)  acc5: 95.6000 (95.8857)  time: 0.2647  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0466 (0.9695)  acc1: 78.8000 (81.0400)  acc5: 95.2000 (95.7600)  time: 0.2646  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4842 s / it)
* Acc@1 81.408 Acc@5 95.972 loss 0.955
Accuracy of the model on the 50000 test images: 81.4%
Max accuracy: 81.41%
Epoch: [164]  [   0/1251]  eta: 1:05:09  lr: 0.001911  min_lr: 0.001911  loss: 3.1434 (3.1434)  weight_decay: 0.0500 (0.0500)  time: 3.1249  data: 2.5843  max mem: 43713
Epoch: [164]  [ 200/1251]  eta: 0:09:26  lr: 0.001907  min_lr: 0.001907  loss: 3.2052 (3.0901)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7531 (0.9052)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [164]  [ 400/1251]  eta: 0:07:32  lr: 0.001904  min_lr: 0.001904  loss: 2.9651 (3.0807)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8928 (0.9382)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [164]  [ 600/1251]  eta: 0:05:44  lr: 0.001900  min_lr: 0.001900  loss: 3.0633 (3.1020)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7404 (0.9881)  time: 0.5237  data: 0.0004  max mem: 43713
Epoch: [164]  [ 800/1251]  eta: 0:03:58  lr: 0.001896  min_lr: 0.001896  loss: 2.9803 (3.1000)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0699 (0.9846)  time: 0.5239  data: 0.0004  max mem: 43713
Epoch: [164]  [1000/1251]  eta: 0:02:12  lr: 0.001893  min_lr: 0.001893  loss: 3.0440 (3.0962)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.5231  data: 0.0004  max mem: 43713
Epoch: [164]  [1200/1251]  eta: 0:00:26  lr: 0.001889  min_lr: 0.001889  loss: 3.3146 (3.0955)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8388 (nan)  time: 0.5228  data: 0.0005  max mem: 43713
Epoch: [164]  [1250/1251]  eta: 0:00:00  lr: 0.001888  min_lr: 0.001888  loss: 3.1745 (3.1009)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8641 (nan)  time: 0.4436  data: 0.0005  max mem: 43713
Epoch: [164] Total time: 0:10:58 (0.5265 s / it)
Averaged stats: lr: 0.001888  min_lr: 0.001888  loss: 3.1745 (3.0886)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8641 (nan)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.7111 (0.7111)  acc1: 88.0000 (88.0000)  acc5: 98.0000 (98.0000)  time: 5.3973  data: 5.1011  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.8132 (0.8439)  acc1: 86.0000 (84.5455)  acc5: 98.0000 (97.4546)  time: 0.7314  data: 0.4641  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0295 (0.9823)  acc1: 79.2000 (81.3714)  acc5: 95.2000 (95.9619)  time: 0.2647  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0295 (0.9947)  acc1: 79.2000 (81.0400)  acc5: 95.6000 (95.9200)  time: 0.2647  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4747 s / it)
* Acc@1 81.256 Acc@5 96.018 loss 0.987
Accuracy of the model on the 50000 test images: 81.3%
Max accuracy: 81.41%
Epoch: [165]  [   0/1251]  eta: 1:15:10  lr: 0.001888  min_lr: 0.001888  loss: 2.9819 (2.9819)  weight_decay: 0.0500 (0.0500)  time: 3.6056  data: 2.0766  max mem: 43713
Epoch: [165]  [ 200/1251]  eta: 0:09:27  lr: 0.001885  min_lr: 0.001885  loss: 2.8999 (3.0721)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1370 (1.0084)  time: 0.5234  data: 0.0005  max mem: 43713
Epoch: [165]  [ 400/1251]  eta: 0:07:33  lr: 0.001881  min_lr: 0.001881  loss: 3.3089 (3.0770)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8551 (1.0077)  time: 0.5331  data: 0.0005  max mem: 43713
Epoch: [165]  [ 600/1251]  eta: 0:05:45  lr: 0.001878  min_lr: 0.001878  loss: 3.2445 (3.0927)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7819 (0.9656)  time: 0.5304  data: 0.0005  max mem: 43713
Epoch: [165]  [ 800/1251]  eta: 0:03:58  lr: 0.001874  min_lr: 0.001874  loss: 3.1991 (3.0974)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8982 (0.9858)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [165]  [1000/1251]  eta: 0:02:12  lr: 0.001870  min_lr: 0.001870  loss: 3.0195 (3.0979)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7608 (0.9642)  time: 0.5386  data: 0.0005  max mem: 43713
Epoch: [165]  [1200/1251]  eta: 0:00:26  lr: 0.001867  min_lr: 0.001867  loss: 3.2710 (3.1072)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7196 (0.9808)  time: 0.5231  data: 0.0004  max mem: 43713
Epoch: [165]  [1250/1251]  eta: 0:00:00  lr: 0.001866  min_lr: 0.001866  loss: 3.2766 (3.1069)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8186 (0.9822)  time: 0.4434  data: 0.0005  max mem: 43713
Epoch: [165] Total time: 0:10:59 (0.5271 s / it)
Averaged stats: lr: 0.001866  min_lr: 0.001866  loss: 3.2766 (3.0864)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8186 (0.9822)
Test:  [ 0/25]  eta: 0:02:10  loss: 0.6565 (0.6565)  acc1: 88.4000 (88.4000)  acc5: 98.4000 (98.4000)  time: 5.2113  data: 4.9086  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7703 (0.7681)  acc1: 86.0000 (84.5818)  acc5: 98.0000 (97.6727)  time: 0.7149  data: 0.4466  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9684 (0.8985)  acc1: 80.0000 (81.3333)  acc5: 95.6000 (96.1714)  time: 0.2651  data: 0.0003  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9490 (0.9069)  acc1: 79.2000 (80.9280)  acc5: 96.0000 (96.2720)  time: 0.2650  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4678 s / it)
* Acc@1 81.452 Acc@5 96.068 loss 0.904
Accuracy of the model on the 50000 test images: 81.5%
Max accuracy: 81.45%
Epoch: [166]  [   0/1251]  eta: 1:01:23  lr: 0.001866  min_lr: 0.001866  loss: 2.8695 (2.8695)  weight_decay: 0.0500 (0.0500)  time: 2.9447  data: 2.4173  max mem: 43713
Epoch: [166]  [ 200/1251]  eta: 0:09:24  lr: 0.001862  min_lr: 0.001862  loss: 2.9222 (3.0214)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7308 (0.9428)  time: 0.5312  data: 0.0004  max mem: 43713
Epoch: [166]  [ 400/1251]  eta: 0:07:33  lr: 0.001859  min_lr: 0.001859  loss: 3.2478 (3.0303)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9166 (0.9229)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [166]  [ 600/1251]  eta: 0:05:44  lr: 0.001855  min_lr: 0.001855  loss: 3.2351 (3.0451)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9338 (0.9576)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [166]  [ 800/1251]  eta: 0:03:58  lr: 0.001852  min_lr: 0.001852  loss: 3.1855 (3.0367)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9139 (0.9715)  time: 0.5303  data: 0.0004  max mem: 43713
Epoch: [166]  [1000/1251]  eta: 0:02:12  lr: 0.001848  min_lr: 0.001848  loss: 3.1870 (3.0408)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0516 (0.9875)  time: 0.5241  data: 0.0004  max mem: 43713
Epoch: [166]  [1200/1251]  eta: 0:00:26  lr: 0.001844  min_lr: 0.001844  loss: 3.3252 (3.0509)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1194 (0.9826)  time: 0.5222  data: 0.0004  max mem: 43713
Epoch: [166]  [1250/1251]  eta: 0:00:00  lr: 0.001844  min_lr: 0.001844  loss: 3.1085 (3.0534)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9465 (0.9869)  time: 0.4479  data: 0.0005  max mem: 43713
Epoch: [166] Total time: 0:10:58 (0.5261 s / it)
Averaged stats: lr: 0.001844  min_lr: 0.001844  loss: 3.1085 (3.0726)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9465 (0.9869)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.5724 (0.5724)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 5.3444  data: 5.0448  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7618 (0.7719)  acc1: 86.0000 (84.7273)  acc5: 97.6000 (97.5273)  time: 0.7265  data: 0.4589  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9871 (0.9094)  acc1: 79.6000 (81.5048)  acc5: 95.6000 (95.9429)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0163 (0.9183)  acc1: 79.6000 (81.2320)  acc5: 95.6000 (95.9520)  time: 0.2644  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4730 s / it)
* Acc@1 81.388 Acc@5 95.964 loss 0.918
Accuracy of the model on the 50000 test images: 81.4%
Max accuracy: 81.45%
Epoch: [167]  [   0/1251]  eta: 1:12:55  lr: 0.001844  min_lr: 0.001844  loss: 3.4522 (3.4522)  weight_decay: 0.0500 (0.0500)  time: 3.4974  data: 2.8730  max mem: 43713
Epoch: [167]  [ 200/1251]  eta: 0:09:30  lr: 0.001840  min_lr: 0.001840  loss: 3.3308 (3.0620)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8709 (1.0147)  time: 0.5231  data: 0.0004  max mem: 43713
Epoch: [167]  [ 400/1251]  eta: 0:07:33  lr: 0.001836  min_lr: 0.001836  loss: 3.2828 (3.0664)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0638 (nan)  time: 0.5233  data: 0.0004  max mem: 43713
Epoch: [167]  [ 600/1251]  eta: 0:05:45  lr: 0.001833  min_lr: 0.001833  loss: 3.1893 (3.0897)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9966 (nan)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [167]  [ 800/1251]  eta: 0:03:58  lr: 0.001829  min_lr: 0.001829  loss: 3.2264 (3.1113)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7624 (nan)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [167]  [1000/1251]  eta: 0:02:12  lr: 0.001826  min_lr: 0.001826  loss: 3.2058 (3.1039)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7573 (nan)  time: 0.5249  data: 0.0005  max mem: 43713
Epoch: [167]  [1200/1251]  eta: 0:00:26  lr: 0.001822  min_lr: 0.001822  loss: 3.3348 (3.1079)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8329 (nan)  time: 0.5234  data: 0.0004  max mem: 43713
Epoch: [167]  [1250/1251]  eta: 0:00:00  lr: 0.001821  min_lr: 0.001821  loss: 3.0566 (3.1030)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9223 (nan)  time: 0.4440  data: 0.0006  max mem: 43713
Epoch: [167] Total time: 0:10:59 (0.5276 s / it)
Averaged stats: lr: 0.001821  min_lr: 0.001821  loss: 3.0566 (3.0857)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9223 (nan)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6156 (0.6156)  acc1: 88.4000 (88.4000)  acc5: 98.4000 (98.4000)  time: 5.5681  data: 5.2697  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7643 (0.7699)  acc1: 87.6000 (85.2364)  acc5: 97.6000 (97.6727)  time: 0.7464  data: 0.4794  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9950 (0.9195)  acc1: 79.2000 (81.3524)  acc5: 95.6000 (95.8476)  time: 0.2641  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0104 (0.9317)  acc1: 79.2000 (80.8960)  acc5: 95.2000 (95.8240)  time: 0.2640  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4793 s / it)
* Acc@1 81.412 Acc@5 95.942 loss 0.923
Accuracy of the model on the 50000 test images: 81.4%
Max accuracy: 81.45%
Epoch: [168]  [   0/1251]  eta: 1:16:27  lr: 0.001821  min_lr: 0.001821  loss: 3.0445 (3.0445)  weight_decay: 0.0500 (0.0500)  time: 3.6670  data: 2.8691  max mem: 43713
Epoch: [168]  [ 200/1251]  eta: 0:09:26  lr: 0.001818  min_lr: 0.001818  loss: 2.9631 (3.0815)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9872 (1.0342)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [168]  [ 400/1251]  eta: 0:07:33  lr: 0.001814  min_lr: 0.001814  loss: 3.2906 (3.0764)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2519 (1.1364)  time: 0.5309  data: 0.0004  max mem: 43713
Epoch: [168]  [ 600/1251]  eta: 0:05:45  lr: 0.001811  min_lr: 0.001811  loss: 3.1814 (3.0773)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8921 (1.1338)  time: 0.5239  data: 0.0004  max mem: 43713
Epoch: [168]  [ 800/1251]  eta: 0:03:58  lr: 0.001807  min_lr: 0.001807  loss: 3.0362 (3.0698)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7691 (1.0625)  time: 0.5228  data: 0.0005  max mem: 43713
Epoch: [168]  [1000/1251]  eta: 0:02:12  lr: 0.001803  min_lr: 0.001803  loss: 3.0887 (3.0725)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7342 (1.0260)  time: 0.5339  data: 0.0005  max mem: 43713
Epoch: [168]  [1200/1251]  eta: 0:00:26  lr: 0.001800  min_lr: 0.001800  loss: 3.0833 (3.0641)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8553 (1.0016)  time: 0.5236  data: 0.0004  max mem: 43713
Epoch: [168]  [1250/1251]  eta: 0:00:00  lr: 0.001799  min_lr: 0.001799  loss: 3.2788 (3.0648)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1262 (1.0180)  time: 0.4435  data: 0.0005  max mem: 43713
Epoch: [168] Total time: 0:10:58 (0.5263 s / it)
Averaged stats: lr: 0.001799  min_lr: 0.001799  loss: 3.2788 (3.0763)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1262 (1.0180)
Test:  [ 0/25]  eta: 0:02:00  loss: 0.5834 (0.5834)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 4.8071  data: 4.4791  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7645 (0.7553)  acc1: 85.6000 (84.6546)  acc5: 98.0000 (97.7455)  time: 0.7051  data: 0.4352  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9689 (0.9119)  acc1: 79.6000 (81.2191)  acc5: 96.0000 (95.9429)  time: 0.2795  data: 0.0154  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9755 (0.9166)  acc1: 78.8000 (80.7360)  acc5: 95.6000 (95.9840)  time: 0.2641  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4635 s / it)
* Acc@1 81.494 Acc@5 96.064 loss 0.902
Accuracy of the model on the 50000 test images: 81.5%
Max accuracy: 81.49%
Epoch: [169]  [   0/1251]  eta: 1:08:32  lr: 0.001799  min_lr: 0.001799  loss: 3.1546 (3.1546)  weight_decay: 0.0500 (0.0500)  time: 3.2872  data: 2.7497  max mem: 43713
Epoch: [169]  [ 200/1251]  eta: 0:09:26  lr: 0.001795  min_lr: 0.001795  loss: 2.6857 (3.0904)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9615 (1.0173)  time: 0.5306  data: 0.0005  max mem: 43713
Epoch: [169]  [ 400/1251]  eta: 0:07:33  lr: 0.001792  min_lr: 0.001792  loss: 3.0544 (3.0771)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7992 (1.0057)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [169]  [ 600/1251]  eta: 0:05:44  lr: 0.001788  min_lr: 0.001788  loss: 3.1902 (3.0566)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8610 (0.9660)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [169]  [ 800/1251]  eta: 0:03:58  lr: 0.001785  min_lr: 0.001785  loss: 3.2257 (3.0611)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8408 (0.9913)  time: 0.5269  data: 0.0004  max mem: 43713
Epoch: [169]  [1000/1251]  eta: 0:02:12  lr: 0.001781  min_lr: 0.001781  loss: 3.0846 (3.0677)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1263 (1.0068)  time: 0.5246  data: 0.0004  max mem: 43713
Epoch: [169]  [1200/1251]  eta: 0:00:26  lr: 0.001777  min_lr: 0.001777  loss: 3.2225 (3.0762)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0114 (1.0241)  time: 0.5246  data: 0.0005  max mem: 43713
Epoch: [169]  [1250/1251]  eta: 0:00:00  lr: 0.001777  min_lr: 0.001777  loss: 3.1857 (3.0783)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2012 (1.0282)  time: 0.4434  data: 0.0006  max mem: 43713
Epoch: [169] Total time: 0:10:58 (0.5265 s / it)
Averaged stats: lr: 0.001777  min_lr: 0.001777  loss: 3.1857 (3.0616)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2012 (1.0282)
Test:  [ 0/25]  eta: 0:01:46  loss: 0.6681 (0.6681)  acc1: 87.6000 (87.6000)  acc5: 99.2000 (99.2000)  time: 4.2727  data: 3.9734  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.8085 (0.8028)  acc1: 84.4000 (84.2182)  acc5: 97.6000 (97.4909)  time: 0.6957  data: 0.4285  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9484 (0.9357)  acc1: 79.2000 (81.3714)  acc5: 95.6000 (96.0000)  time: 0.3036  data: 0.0371  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9931 (0.9461)  acc1: 79.2000 (81.0240)  acc5: 95.6000 (95.8080)  time: 0.2672  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4598 s / it)
* Acc@1 81.490 Acc@5 95.970 loss 0.932
Accuracy of the model on the 50000 test images: 81.5%
Max accuracy: 81.49%
Epoch: [170]  [   0/1251]  eta: 1:15:44  lr: 0.001777  min_lr: 0.001777  loss: 2.9598 (2.9598)  weight_decay: 0.0500 (0.0500)  time: 3.6325  data: 2.9263  max mem: 43713
Epoch: [170]  [ 200/1251]  eta: 0:09:32  lr: 0.001773  min_lr: 0.001773  loss: 3.2339 (3.0318)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6925 (0.8309)  time: 0.5250  data: 0.0005  max mem: 43713
Epoch: [170]  [ 400/1251]  eta: 0:07:34  lr: 0.001769  min_lr: 0.001769  loss: 3.3929 (3.0712)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1134 (0.9374)  time: 0.5353  data: 0.0005  max mem: 43713
Epoch: [170]  [ 600/1251]  eta: 0:05:45  lr: 0.001766  min_lr: 0.001766  loss: 3.1369 (3.0701)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7399 (0.9380)  time: 0.5220  data: 0.0005  max mem: 43713
Epoch: [170]  [ 800/1251]  eta: 0:03:58  lr: 0.001762  min_lr: 0.001762  loss: 3.1945 (3.0745)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8370 (0.9338)  time: 0.5232  data: 0.0005  max mem: 43713
Epoch: [170]  [1000/1251]  eta: 0:02:12  lr: 0.001759  min_lr: 0.001759  loss: 3.0524 (3.0744)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9270 (0.9593)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [170]  [1200/1251]  eta: 0:00:26  lr: 0.001755  min_lr: 0.001755  loss: 3.2586 (3.0708)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8429 (0.9462)  time: 0.5221  data: 0.0005  max mem: 43713
Epoch: [170]  [1250/1251]  eta: 0:00:00  lr: 0.001754  min_lr: 0.001754  loss: 3.2608 (3.0721)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9617 (0.9519)  time: 0.4436  data: 0.0007  max mem: 43713
Epoch: [170] Total time: 0:10:59 (0.5270 s / it)
Averaged stats: lr: 0.001754  min_lr: 0.001754  loss: 3.2608 (3.0700)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9617 (0.9519)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6663 (0.6663)  acc1: 88.0000 (88.0000)  acc5: 98.8000 (98.8000)  time: 5.7008  data: 5.4051  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8454 (0.8241)  acc1: 85.6000 (84.5091)  acc5: 97.6000 (97.6364)  time: 0.7588  data: 0.4917  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9850 (0.9690)  acc1: 79.2000 (80.9905)  acc5: 96.0000 (96.0381)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0572 (0.9784)  acc1: 78.8000 (80.7520)  acc5: 94.8000 (95.9360)  time: 0.2644  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4869 s / it)
* Acc@1 81.452 Acc@5 96.008 loss 0.964
Accuracy of the model on the 50000 test images: 81.5%
Max accuracy: 81.49%
Epoch: [171]  [   0/1251]  eta: 1:09:08  lr: 0.001754  min_lr: 0.001754  loss: 3.4561 (3.4561)  weight_decay: 0.0500 (0.0500)  time: 3.3163  data: 2.4787  max mem: 43713
Epoch: [171]  [ 200/1251]  eta: 0:09:26  lr: 0.001751  min_lr: 0.001751  loss: 3.0093 (3.0733)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7386 (1.0246)  time: 0.5223  data: 0.0005  max mem: 43713
Epoch: [171]  [ 400/1251]  eta: 0:07:31  lr: 0.001747  min_lr: 0.001747  loss: 2.9611 (3.0297)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8547 (0.9464)  time: 0.5231  data: 0.0005  max mem: 43713
Epoch: [171]  [ 600/1251]  eta: 0:05:44  lr: 0.001744  min_lr: 0.001744  loss: 3.0166 (3.0281)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8244 (0.9363)  time: 0.5223  data: 0.0005  max mem: 43713
Epoch: [171]  [ 800/1251]  eta: 0:03:58  lr: 0.001740  min_lr: 0.001740  loss: 2.9726 (3.0305)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7710 (0.9458)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [171]  [1000/1251]  eta: 0:02:12  lr: 0.001737  min_lr: 0.001737  loss: 3.2657 (3.0413)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1066 (0.9689)  time: 0.5288  data: 0.0005  max mem: 43713
Epoch: [171]  [1200/1251]  eta: 0:00:26  lr: 0.001733  min_lr: 0.001733  loss: 3.3447 (3.0542)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0203 (0.9906)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [171]  [1250/1251]  eta: 0:00:00  lr: 0.001732  min_lr: 0.001732  loss: 3.1427 (3.0534)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0215 (0.9830)  time: 0.4504  data: 0.0006  max mem: 43713
Epoch: [171] Total time: 0:10:58 (0.5262 s / it)
Averaged stats: lr: 0.001732  min_lr: 0.001732  loss: 3.1427 (3.0615)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0215 (0.9830)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.5808 (0.5808)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 5.6705  data: 5.3579  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7771 (0.7454)  acc1: 86.4000 (85.0909)  acc5: 97.6000 (97.4545)  time: 0.7561  data: 0.4874  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8911 (0.8835)  acc1: 79.6000 (81.9048)  acc5: 95.2000 (95.9810)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9989 (0.8990)  acc1: 79.6000 (81.4720)  acc5: 95.2000 (95.8880)  time: 0.2644  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4848 s / it)
* Acc@1 81.668 Acc@5 96.128 loss 0.892
Accuracy of the model on the 50000 test images: 81.7%
Max accuracy: 81.67%
Epoch: [172]  [   0/1251]  eta: 0:53:50  lr: 0.001732  min_lr: 0.001732  loss: 3.3462 (3.3462)  weight_decay: 0.0500 (0.0500)  time: 2.5823  data: 2.0480  max mem: 43713
Epoch: [172]  [ 200/1251]  eta: 0:09:20  lr: 0.001729  min_lr: 0.001729  loss: 3.1355 (3.0411)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8951 (0.9690)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [172]  [ 400/1251]  eta: 0:07:31  lr: 0.001725  min_lr: 0.001725  loss: 3.0128 (3.0498)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8245 (0.9764)  time: 0.5397  data: 0.0004  max mem: 43713
Epoch: [172]  [ 600/1251]  eta: 0:05:44  lr: 0.001721  min_lr: 0.001721  loss: 2.9545 (3.0463)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9156 (0.9971)  time: 0.5247  data: 0.0004  max mem: 43713
Epoch: [172]  [ 800/1251]  eta: 0:03:57  lr: 0.001718  min_lr: 0.001718  loss: 3.0726 (3.0316)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1436 (1.0610)  time: 0.5288  data: 0.0004  max mem: 43713
Epoch: [172]  [1000/1251]  eta: 0:02:12  lr: 0.001714  min_lr: 0.001714  loss: 3.1901 (3.0323)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0543 (1.0450)  time: 0.5226  data: 0.0005  max mem: 43713
Epoch: [172]  [1200/1251]  eta: 0:00:26  lr: 0.001711  min_lr: 0.001711  loss: 3.0762 (3.0392)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7757 (1.0331)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [172]  [1250/1251]  eta: 0:00:00  lr: 0.001710  min_lr: 0.001710  loss: 3.3358 (3.0406)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7946 (1.0258)  time: 0.4440  data: 0.0007  max mem: 43713
Epoch: [172] Total time: 0:10:57 (0.5259 s / it)
Averaged stats: lr: 0.001710  min_lr: 0.001710  loss: 3.3358 (3.0565)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7946 (1.0258)
Test:  [ 0/25]  eta: 0:02:10  loss: 0.6574 (0.6574)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 5.2088  data: 4.9104  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.8110 (0.8205)  acc1: 86.0000 (84.8727)  acc5: 97.6000 (97.8182)  time: 0.7145  data: 0.4468  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0531 (0.9622)  acc1: 79.2000 (81.7143)  acc5: 95.6000 (96.1714)  time: 0.2650  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0613 (0.9759)  acc1: 79.6000 (81.5520)  acc5: 95.2000 (96.0640)  time: 0.2650  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4679 s / it)
* Acc@1 81.722 Acc@5 96.112 loss 0.963
Accuracy of the model on the 50000 test images: 81.7%
Max accuracy: 81.72%
Epoch: [173]  [   0/1251]  eta: 0:56:05  lr: 0.001710  min_lr: 0.001710  loss: 3.3739 (3.3739)  weight_decay: 0.0500 (0.0500)  time: 2.6905  data: 2.1632  max mem: 43713
Epoch: [173]  [ 200/1251]  eta: 0:09:25  lr: 0.001706  min_lr: 0.001706  loss: 3.2021 (3.0595)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0196 (1.0747)  time: 0.5307  data: 0.0004  max mem: 43713
Epoch: [173]  [ 400/1251]  eta: 0:07:32  lr: 0.001703  min_lr: 0.001703  loss: 3.0267 (3.0545)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8699 (1.0364)  time: 0.5231  data: 0.0004  max mem: 43713
Epoch: [173]  [ 600/1251]  eta: 0:05:44  lr: 0.001699  min_lr: 0.001699  loss: 3.3552 (3.0522)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8424 (0.9709)  time: 0.5222  data: 0.0004  max mem: 43713
Epoch: [173]  [ 800/1251]  eta: 0:03:58  lr: 0.001696  min_lr: 0.001696  loss: 3.1993 (3.0605)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8680 (0.9838)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [173]  [1000/1251]  eta: 0:02:12  lr: 0.001692  min_lr: 0.001692  loss: 3.0987 (3.0587)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0520 (0.9910)  time: 0.5222  data: 0.0004  max mem: 43713
Epoch: [173]  [1200/1251]  eta: 0:00:26  lr: 0.001689  min_lr: 0.001689  loss: 3.2337 (3.0736)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0909 (1.0254)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [173]  [1250/1251]  eta: 0:00:00  lr: 0.001688  min_lr: 0.001688  loss: 2.9874 (3.0713)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2657 (1.0345)  time: 0.4435  data: 0.0006  max mem: 43713
Epoch: [173] Total time: 0:10:57 (0.5259 s / it)
Averaged stats: lr: 0.001688  min_lr: 0.001688  loss: 2.9874 (3.0653)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2657 (1.0345)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.5443 (0.5443)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 5.3160  data: 5.0095  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7521 (0.7471)  acc1: 86.8000 (85.2727)  acc5: 97.6000 (97.3818)  time: 0.7237  data: 0.4557  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9577 (0.8835)  acc1: 79.2000 (81.7905)  acc5: 95.6000 (96.0762)  time: 0.2642  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9577 (0.8997)  acc1: 79.2000 (81.2800)  acc5: 95.6000 (95.9680)  time: 0.2641  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4720 s / it)
* Acc@1 81.718 Acc@5 96.050 loss 0.888
Accuracy of the model on the 50000 test images: 81.7%
Max accuracy: 81.72%
Epoch: [174]  [   0/1251]  eta: 1:08:30  lr: 0.001688  min_lr: 0.001688  loss: 2.5273 (2.5273)  weight_decay: 0.0500 (0.0500)  time: 3.2855  data: 1.6307  max mem: 43713
Epoch: [174]  [ 200/1251]  eta: 0:09:27  lr: 0.001684  min_lr: 0.001684  loss: 3.1740 (3.0925)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3249 (1.1075)  time: 0.5250  data: 0.0005  max mem: 43713
Epoch: [174]  [ 400/1251]  eta: 0:07:32  lr: 0.001681  min_lr: 0.001681  loss: 2.8817 (3.0658)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2385 (1.1240)  time: 0.5222  data: 0.0005  max mem: 43713
Epoch: [174]  [ 600/1251]  eta: 0:05:45  lr: 0.001677  min_lr: 0.001677  loss: 2.9790 (3.0690)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8169 (1.0649)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [174]  [ 800/1251]  eta: 0:03:58  lr: 0.001674  min_lr: 0.001674  loss: 3.2404 (3.0759)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9540 (1.0618)  time: 0.5242  data: 0.0004  max mem: 43713
Epoch: [174]  [1000/1251]  eta: 0:02:12  lr: 0.001670  min_lr: 0.001670  loss: 3.3652 (3.0879)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0443 (1.0679)  time: 0.5280  data: 0.0005  max mem: 43713
Epoch: [174]  [1200/1251]  eta: 0:00:26  lr: 0.001666  min_lr: 0.001666  loss: 3.2925 (3.0795)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1211 (1.0706)  time: 0.5228  data: 0.0005  max mem: 43713
Epoch: [174]  [1250/1251]  eta: 0:00:00  lr: 0.001666  min_lr: 0.001666  loss: 2.7838 (3.0741)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9547 (1.0675)  time: 0.4440  data: 0.0006  max mem: 43713
Epoch: [174] Total time: 0:10:59 (0.5271 s / it)
Averaged stats: lr: 0.001666  min_lr: 0.001666  loss: 2.7838 (3.0562)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9547 (1.0675)
Test:  [ 0/25]  eta: 0:02:10  loss: 0.5480 (0.5480)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 5.2091  data: 4.8926  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7403 (0.7132)  acc1: 84.8000 (85.2364)  acc5: 97.6000 (97.4909)  time: 0.7141  data: 0.4451  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9108 (0.8503)  acc1: 81.6000 (82.0381)  acc5: 95.6000 (96.1524)  time: 0.2646  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9108 (0.8698)  acc1: 80.8000 (81.4400)  acc5: 95.2000 (95.9840)  time: 0.2646  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4673 s / it)
* Acc@1 81.748 Acc@5 96.084 loss 0.857
Accuracy of the model on the 50000 test images: 81.7%
Max accuracy: 81.75%
Epoch: [175]  [   0/1251]  eta: 0:56:59  lr: 0.001666  min_lr: 0.001666  loss: 3.3299 (3.3299)  weight_decay: 0.0500 (0.0500)  time: 2.7334  data: 2.2032  max mem: 43713
Epoch: [175]  [ 200/1251]  eta: 0:09:22  lr: 0.001662  min_lr: 0.001662  loss: 3.1779 (3.0567)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7302 (0.8368)  time: 0.5235  data: 0.0004  max mem: 43713
Epoch: [175]  [ 400/1251]  eta: 0:07:31  lr: 0.001658  min_lr: 0.001658  loss: 3.3110 (3.0453)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2983 (1.0185)  time: 0.5390  data: 0.0005  max mem: 43713
Epoch: [175]  [ 600/1251]  eta: 0:05:44  lr: 0.001655  min_lr: 0.001655  loss: 3.0433 (3.0469)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8329 (1.0251)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [175]  [ 800/1251]  eta: 0:03:57  lr: 0.001651  min_lr: 0.001651  loss: 3.2186 (3.0444)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1055 (1.0163)  time: 0.5228  data: 0.0005  max mem: 43713
Epoch: [175]  [1000/1251]  eta: 0:02:12  lr: 0.001648  min_lr: 0.001648  loss: 3.3757 (3.0545)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8869 (1.0380)  time: 0.5293  data: 0.0005  max mem: 43713
Epoch: [175]  [1200/1251]  eta: 0:00:26  lr: 0.001644  min_lr: 0.001644  loss: 3.1848 (3.0520)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0920 (1.0628)  time: 0.5225  data: 0.0005  max mem: 43713
Epoch: [175]  [1250/1251]  eta: 0:00:00  lr: 0.001644  min_lr: 0.001644  loss: 3.2125 (3.0502)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8251 (1.0510)  time: 0.4434  data: 0.0006  max mem: 43713
Epoch: [175] Total time: 0:10:58 (0.5260 s / it)
Averaged stats: lr: 0.001644  min_lr: 0.001644  loss: 3.2125 (3.0445)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8251 (1.0510)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.5893 (0.5893)  acc1: 88.8000 (88.8000)  acc5: 99.2000 (99.2000)  time: 5.6005  data: 5.3184  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7382 (0.7580)  acc1: 84.4000 (84.5455)  acc5: 98.4000 (98.0000)  time: 0.7497  data: 0.4838  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9576 (0.9034)  acc1: 79.6000 (81.6191)  acc5: 96.0000 (96.4381)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0188 (0.9151)  acc1: 79.6000 (81.3280)  acc5: 95.2000 (96.2880)  time: 0.2644  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4832 s / it)
* Acc@1 81.666 Acc@5 96.090 loss 0.906
Accuracy of the model on the 50000 test images: 81.7%
Max accuracy: 81.75%
Epoch: [176]  [   0/1251]  eta: 1:07:42  lr: 0.001643  min_lr: 0.001643  loss: 3.5191 (3.5191)  weight_decay: 0.0500 (0.0500)  time: 3.2471  data: 2.5179  max mem: 43713
Epoch: [176]  [ 200/1251]  eta: 0:09:26  lr: 0.001640  min_lr: 0.001640  loss: 3.0056 (3.0254)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7220 (0.9350)  time: 0.5240  data: 0.0004  max mem: 43713
Epoch: [176]  [ 400/1251]  eta: 0:07:33  lr: 0.001636  min_lr: 0.001636  loss: 3.0903 (3.0157)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1660 (0.9486)  time: 0.5232  data: 0.0004  max mem: 43713
Epoch: [176]  [ 600/1251]  eta: 0:05:44  lr: 0.001633  min_lr: 0.001633  loss: 3.1453 (3.0222)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0263 (0.9492)  time: 0.5238  data: 0.0004  max mem: 43713
Epoch: [176]  [ 800/1251]  eta: 0:03:58  lr: 0.001629  min_lr: 0.001629  loss: 3.3375 (3.0250)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8612 (0.9694)  time: 0.5352  data: 0.0004  max mem: 43713
Epoch: [176]  [1000/1251]  eta: 0:02:12  lr: 0.001626  min_lr: 0.001626  loss: 3.2137 (3.0397)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6826 (0.9737)  time: 0.5239  data: 0.0004  max mem: 43713
Epoch: [176]  [1200/1251]  eta: 0:00:26  lr: 0.001622  min_lr: 0.001622  loss: 3.1014 (3.0345)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0014 (0.9800)  time: 0.5231  data: 0.0004  max mem: 43713
Epoch: [176]  [1250/1251]  eta: 0:00:00  lr: 0.001621  min_lr: 0.001621  loss: 3.1479 (3.0358)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9653 (0.9791)  time: 0.4436  data: 0.0006  max mem: 43713
Epoch: [176] Total time: 0:10:58 (0.5265 s / it)
Averaged stats: lr: 0.001621  min_lr: 0.001621  loss: 3.1479 (3.0438)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9653 (0.9791)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6535 (0.6535)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 5.5116  data: 5.2194  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7643 (0.7822)  acc1: 86.0000 (84.3636)  acc5: 98.0000 (97.7455)  time: 0.7415  data: 0.4748  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9921 (0.9239)  acc1: 78.0000 (81.1238)  acc5: 96.0000 (96.1714)  time: 0.2642  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0434 (0.9376)  acc1: 78.8000 (80.7520)  acc5: 95.2000 (96.0160)  time: 0.2641  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4798 s / it)
* Acc@1 81.526 Acc@5 96.062 loss 0.926
Accuracy of the model on the 50000 test images: 81.5%
Max accuracy: 81.75%
Epoch: [177]  [   0/1251]  eta: 1:11:26  lr: 0.001621  min_lr: 0.001621  loss: 3.3979 (3.3979)  weight_decay: 0.0500 (0.0500)  time: 3.4269  data: 2.4088  max mem: 43713
Epoch: [177]  [ 200/1251]  eta: 0:09:30  lr: 0.001618  min_lr: 0.001618  loss: 3.0484 (2.9644)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7155 (0.7591)  time: 0.5310  data: 0.0004  max mem: 43713
Epoch: [177]  [ 400/1251]  eta: 0:07:33  lr: 0.001614  min_lr: 0.001614  loss: 2.9754 (2.9839)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8052 (0.8716)  time: 0.5234  data: 0.0004  max mem: 43713
Epoch: [177]  [ 600/1251]  eta: 0:05:45  lr: 0.001611  min_lr: 0.001611  loss: 3.1236 (2.9859)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9251 (0.9150)  time: 0.5313  data: 0.0004  max mem: 43713
Epoch: [177]  [ 800/1251]  eta: 0:03:58  lr: 0.001607  min_lr: 0.001607  loss: 2.9502 (3.0050)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8769 (0.9771)  time: 0.5243  data: 0.0004  max mem: 43713
Epoch: [177]  [1000/1251]  eta: 0:02:12  lr: 0.001604  min_lr: 0.001604  loss: 2.9359 (2.9953)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9844 (0.9755)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [177]  [1200/1251]  eta: 0:00:26  lr: 0.001600  min_lr: 0.001600  loss: 3.1482 (3.0139)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9409 (0.9643)  time: 0.5315  data: 0.0004  max mem: 43713
Epoch: [177]  [1250/1251]  eta: 0:00:00  lr: 0.001599  min_lr: 0.001599  loss: 3.1721 (3.0173)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8759 (0.9606)  time: 0.4439  data: 0.0007  max mem: 43713
Epoch: [177] Total time: 0:10:59 (0.5275 s / it)
Averaged stats: lr: 0.001599  min_lr: 0.001599  loss: 3.1721 (3.0220)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8759 (0.9606)
Test:  [ 0/25]  eta: 0:02:09  loss: 0.6644 (0.6644)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 5.1879  data: 4.8787  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.8407 (0.8307)  acc1: 84.0000 (84.6909)  acc5: 97.6000 (97.7455)  time: 0.7114  data: 0.4438  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0123 (0.9680)  acc1: 79.6000 (81.5429)  acc5: 95.2000 (96.0191)  time: 0.2636  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0823 (0.9797)  acc1: 79.2000 (81.2960)  acc5: 94.8000 (96.0160)  time: 0.2634  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4655 s / it)
* Acc@1 81.654 Acc@5 96.134 loss 0.971
Accuracy of the model on the 50000 test images: 81.7%
Max accuracy: 81.75%
Epoch: [178]  [   0/1251]  eta: 1:13:33  lr: 0.001599  min_lr: 0.001599  loss: 2.1555 (2.1555)  weight_decay: 0.0500 (0.0500)  time: 3.5280  data: 2.9607  max mem: 43713
Epoch: [178]  [ 200/1251]  eta: 0:09:25  lr: 0.001596  min_lr: 0.001596  loss: 3.1888 (3.0172)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0053 (1.1083)  time: 0.5233  data: 0.0004  max mem: 43713
Epoch: [178]  [ 400/1251]  eta: 0:07:31  lr: 0.001592  min_lr: 0.001592  loss: 3.3651 (3.0255)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6526 (1.0838)  time: 0.5309  data: 0.0004  max mem: 43713
Epoch: [178]  [ 600/1251]  eta: 0:05:44  lr: 0.001589  min_lr: 0.001589  loss: 3.1182 (3.0211)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8878 (1.0970)  time: 0.5222  data: 0.0004  max mem: 43713
Epoch: [178]  [ 800/1251]  eta: 0:03:58  lr: 0.001585  min_lr: 0.001585  loss: 3.1908 (3.0244)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7382 (1.0342)  time: 0.5230  data: 0.0005  max mem: 43713
Epoch: [178]  [1000/1251]  eta: 0:02:12  lr: 0.001582  min_lr: 0.001582  loss: 3.1925 (3.0297)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9348 (1.0019)  time: 0.5353  data: 0.0004  max mem: 43713
Epoch: [178]  [1200/1251]  eta: 0:00:26  lr: 0.001578  min_lr: 0.001578  loss: 3.1583 (3.0198)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0759 (1.0199)  time: 0.5260  data: 0.0005  max mem: 43713
Epoch: [178]  [1250/1251]  eta: 0:00:00  lr: 0.001578  min_lr: 0.001578  loss: 3.3030 (3.0227)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0879 (1.0209)  time: 0.4438  data: 0.0005  max mem: 43713
Epoch: [178] Total time: 0:10:58 (0.5265 s / it)
Averaged stats: lr: 0.001578  min_lr: 0.001578  loss: 3.3030 (3.0293)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0879 (1.0209)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6747 (0.6747)  acc1: 90.4000 (90.4000)  acc5: 98.8000 (98.8000)  time: 5.4843  data: 5.1837  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8346 (0.8350)  acc1: 86.0000 (84.8000)  acc5: 97.6000 (97.5273)  time: 0.7391  data: 0.4715  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0508 (0.9704)  acc1: 78.8000 (81.6571)  acc5: 96.0000 (96.2857)  time: 0.2646  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0487 (0.9828)  acc1: 79.2000 (81.3920)  acc5: 95.6000 (96.1600)  time: 0.2646  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4799 s / it)
* Acc@1 81.712 Acc@5 96.208 loss 0.973
Accuracy of the model on the 50000 test images: 81.7%
Max accuracy: 81.75%
Epoch: [179]  [   0/1251]  eta: 1:12:57  lr: 0.001577  min_lr: 0.001577  loss: 3.2012 (3.2012)  weight_decay: 0.0500 (0.0500)  time: 3.4995  data: 1.7078  max mem: 43713
Epoch: [179]  [ 200/1251]  eta: 0:09:26  lr: 0.001574  min_lr: 0.001574  loss: 3.0334 (3.0565)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7633 (0.9188)  time: 0.5242  data: 0.0005  max mem: 43713
Epoch: [179]  [ 400/1251]  eta: 0:07:34  lr: 0.001570  min_lr: 0.001570  loss: 3.2769 (3.0419)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8111 (0.9398)  time: 0.5223  data: 0.0005  max mem: 43713
Epoch: [179]  [ 600/1251]  eta: 0:05:45  lr: 0.001567  min_lr: 0.001567  loss: 3.0770 (3.0353)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1127 (0.9994)  time: 0.5223  data: 0.0004  max mem: 43713
Epoch: [179]  [ 800/1251]  eta: 0:03:58  lr: 0.001563  min_lr: 0.001563  loss: 3.2759 (3.0359)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8729 (1.0045)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [179]  [1000/1251]  eta: 0:02:12  lr: 0.001560  min_lr: 0.001560  loss: 3.0549 (3.0261)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8667 (1.0177)  time: 0.5232  data: 0.0005  max mem: 43713
Epoch: [179]  [1200/1251]  eta: 0:00:26  lr: 0.001556  min_lr: 0.001556  loss: 2.9728 (3.0272)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7501 (1.0231)  time: 0.5224  data: 0.0005  max mem: 43713
Epoch: [179]  [1250/1251]  eta: 0:00:00  lr: 0.001556  min_lr: 0.001556  loss: 2.9720 (3.0249)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7501 (1.0180)  time: 0.4435  data: 0.0005  max mem: 43713
Epoch: [179] Total time: 0:10:58 (0.5267 s / it)
Averaged stats: lr: 0.001556  min_lr: 0.001556  loss: 2.9720 (3.0254)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7501 (1.0180)
Test:  [ 0/25]  eta: 0:01:46  loss: 0.6033 (0.6033)  acc1: 88.4000 (88.4000)  acc5: 98.8000 (98.8000)  time: 4.2496  data: 3.9607  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7421 (0.7255)  acc1: 86.0000 (85.2000)  acc5: 98.0000 (97.7091)  time: 0.7289  data: 0.4623  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9060 (0.8572)  acc1: 79.6000 (81.7524)  acc5: 96.0000 (96.3048)  time: 0.3208  data: 0.0563  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9319 (0.8700)  acc1: 78.8000 (81.3120)  acc5: 95.6000 (96.1920)  time: 0.2647  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4725 s / it)
* Acc@1 81.980 Acc@5 96.304 loss 0.854
Accuracy of the model on the 50000 test images: 82.0%
Max accuracy: 81.98%
Epoch: [180]  [   0/1251]  eta: 0:55:11  lr: 0.001556  min_lr: 0.001556  loss: 3.2197 (3.2197)  weight_decay: 0.0500 (0.0500)  time: 2.6471  data: 2.1003  max mem: 43713
Epoch: [180]  [ 200/1251]  eta: 0:09:26  lr: 0.001552  min_lr: 0.001552  loss: 2.7366 (2.9653)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8421 (1.2747)  time: 0.5235  data: 0.0004  max mem: 43713
Epoch: [180]  [ 400/1251]  eta: 0:07:31  lr: 0.001549  min_lr: 0.001549  loss: 3.3244 (2.9840)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0510 (1.1119)  time: 0.5234  data: 0.0004  max mem: 43713
Epoch: [180]  [ 600/1251]  eta: 0:05:44  lr: 0.001545  min_lr: 0.001545  loss: 2.8625 (2.9862)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8886 (1.0943)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [180]  [ 800/1251]  eta: 0:03:58  lr: 0.001542  min_lr: 0.001542  loss: 3.0326 (2.9895)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6993 (1.0569)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [180]  [1000/1251]  eta: 0:02:12  lr: 0.001538  min_lr: 0.001538  loss: 3.1333 (3.0013)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9255 (1.0411)  time: 0.5223  data: 0.0004  max mem: 43713
Epoch: [180]  [1200/1251]  eta: 0:00:26  lr: 0.001535  min_lr: 0.001535  loss: 3.3062 (3.0074)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9373 (1.0430)  time: 0.5287  data: 0.0004  max mem: 43713
Epoch: [180]  [1250/1251]  eta: 0:00:00  lr: 0.001534  min_lr: 0.001534  loss: 3.1898 (3.0129)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9430 (1.0380)  time: 0.4432  data: 0.0004  max mem: 43713
Epoch: [180] Total time: 0:10:58 (0.5261 s / it)
Averaged stats: lr: 0.001534  min_lr: 0.001534  loss: 3.1898 (3.0246)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9430 (1.0380)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.6455 (0.6455)  acc1: 87.6000 (87.6000)  acc5: 98.8000 (98.8000)  time: 5.3148  data: 5.0088  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7555 (0.7906)  acc1: 86.8000 (84.6546)  acc5: 98.0000 (97.6364)  time: 0.7238  data: 0.4557  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0202 (0.9223)  acc1: 78.4000 (81.6762)  acc5: 96.0000 (96.4191)  time: 0.2646  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0288 (0.9359)  acc1: 79.6000 (81.3600)  acc5: 95.6000 (96.3040)  time: 0.2645  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4708 s / it)
* Acc@1 81.872 Acc@5 96.124 loss 0.928
Accuracy of the model on the 50000 test images: 81.9%
Max accuracy: 81.98%
Epoch: [181]  [   0/1251]  eta: 1:17:51  lr: 0.001534  min_lr: 0.001534  loss: 3.1440 (3.1440)  weight_decay: 0.0500 (0.0500)  time: 3.7346  data: 2.9360  max mem: 43713
Epoch: [181]  [ 200/1251]  eta: 0:09:27  lr: 0.001530  min_lr: 0.001530  loss: 2.9921 (3.0472)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2155 (1.1826)  time: 0.5232  data: 0.0005  max mem: 43713
Epoch: [181]  [ 400/1251]  eta: 0:07:32  lr: 0.001527  min_lr: 0.001527  loss: 3.2476 (3.0222)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8039 (1.0245)  time: 0.5309  data: 0.0005  max mem: 43713
Epoch: [181]  [ 600/1251]  eta: 0:05:45  lr: 0.001523  min_lr: 0.001523  loss: 3.2024 (3.0323)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8702 (1.0034)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [181]  [ 800/1251]  eta: 0:03:58  lr: 0.001520  min_lr: 0.001520  loss: 3.1847 (3.0368)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8894 (0.9966)  time: 0.5226  data: 0.0005  max mem: 43713
Epoch: [181]  [1000/1251]  eta: 0:02:12  lr: 0.001516  min_lr: 0.001516  loss: 3.2227 (3.0386)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0133 (1.0306)  time: 0.5293  data: 0.0005  max mem: 43713
Epoch: [181]  [1200/1251]  eta: 0:00:26  lr: 0.001513  min_lr: 0.001513  loss: 3.1391 (3.0419)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2123 (1.0529)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [181]  [1250/1251]  eta: 0:00:00  lr: 0.001512  min_lr: 0.001512  loss: 3.1850 (3.0422)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8422 (1.0466)  time: 0.4436  data: 0.0007  max mem: 43713
Epoch: [181] Total time: 0:10:58 (0.5266 s / it)
Averaged stats: lr: 0.001512  min_lr: 0.001512  loss: 3.1850 (3.0276)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8422 (1.0466)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6017 (0.6017)  acc1: 89.2000 (89.2000)  acc5: 98.0000 (98.0000)  time: 5.4953  data: 5.2023  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7722 (0.7603)  acc1: 86.0000 (85.0909)  acc5: 98.0000 (97.8182)  time: 0.7401  data: 0.4732  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9300 (0.8993)  acc1: 80.8000 (81.8476)  acc5: 96.4000 (96.3619)  time: 0.2644  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9794 (0.9085)  acc1: 79.2000 (81.5040)  acc5: 95.6000 (96.3520)  time: 0.2643  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4788 s / it)
* Acc@1 81.998 Acc@5 96.270 loss 0.899
Accuracy of the model on the 50000 test images: 82.0%
Max accuracy: 82.00%
Epoch: [182]  [   0/1251]  eta: 0:55:24  lr: 0.001512  min_lr: 0.001512  loss: 2.4958 (2.4958)  weight_decay: 0.0500 (0.0500)  time: 2.6576  data: 2.1130  max mem: 43713
Epoch: [182]  [ 200/1251]  eta: 0:09:21  lr: 0.001508  min_lr: 0.001508  loss: 2.7343 (2.9590)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7250 (0.8312)  time: 0.5220  data: 0.0004  max mem: 43713
Epoch: [182]  [ 400/1251]  eta: 0:07:31  lr: 0.001505  min_lr: 0.001505  loss: 3.1652 (2.9936)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1638 (0.9523)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [182]  [ 600/1251]  eta: 0:05:44  lr: 0.001501  min_lr: 0.001501  loss: 3.3151 (2.9965)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8156 (0.9773)  time: 0.5245  data: 0.0004  max mem: 43713
Epoch: [182]  [ 800/1251]  eta: 0:03:57  lr: 0.001498  min_lr: 0.001498  loss: 3.1263 (3.0041)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0694 (1.0425)  time: 0.5232  data: 0.0004  max mem: 43713
Epoch: [182]  [1000/1251]  eta: 0:02:12  lr: 0.001495  min_lr: 0.001495  loss: 3.0490 (3.0149)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9635 (1.1122)  time: 0.5231  data: 0.0004  max mem: 43713
Epoch: [182]  [1200/1251]  eta: 0:00:26  lr: 0.001491  min_lr: 0.001491  loss: 3.0632 (3.0201)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9012 (1.0838)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [182]  [1250/1251]  eta: 0:00:00  lr: 0.001490  min_lr: 0.001490  loss: 2.9740 (3.0214)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0682 (1.0941)  time: 0.4435  data: 0.0006  max mem: 43713
Epoch: [182] Total time: 0:10:57 (0.5257 s / it)
Averaged stats: lr: 0.001490  min_lr: 0.001490  loss: 2.9740 (3.0310)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0682 (1.0941)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6004 (0.6004)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 5.5822  data: 5.2816  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7980 (0.7799)  acc1: 85.6000 (84.2182)  acc5: 98.0000 (97.5636)  time: 0.7483  data: 0.4805  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9599 (0.9080)  acc1: 78.8000 (81.2381)  acc5: 95.6000 (96.2857)  time: 0.2650  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0244 (0.9186)  acc1: 77.6000 (80.9600)  acc5: 95.6000 (96.2720)  time: 0.2649  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4832 s / it)
* Acc@1 81.874 Acc@5 96.216 loss 0.910
Accuracy of the model on the 50000 test images: 81.9%
Max accuracy: 82.00%
Epoch: [183]  [   0/1251]  eta: 1:14:25  lr: 0.001490  min_lr: 0.001490  loss: 2.8958 (2.8958)  weight_decay: 0.0500 (0.0500)  time: 3.5698  data: 2.4334  max mem: 43713
Epoch: [183]  [ 200/1251]  eta: 0:09:31  lr: 0.001487  min_lr: 0.001487  loss: 3.1667 (2.9696)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9369 (1.2372)  time: 0.5301  data: 0.0004  max mem: 43713
Epoch: [183]  [ 400/1251]  eta: 0:07:34  lr: 0.001483  min_lr: 0.001483  loss: 3.1523 (3.0193)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9800 (nan)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [183]  [ 600/1251]  eta: 0:05:45  lr: 0.001480  min_lr: 0.001480  loss: 3.0276 (3.0103)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9701 (nan)  time: 0.5353  data: 0.0004  max mem: 43713
Epoch: [183]  [ 800/1251]  eta: 0:03:58  lr: 0.001476  min_lr: 0.001476  loss: 3.3105 (3.0345)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9948 (nan)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [183]  [1000/1251]  eta: 0:02:12  lr: 0.001473  min_lr: 0.001473  loss: 2.8124 (3.0207)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7565 (nan)  time: 0.5237  data: 0.0004  max mem: 43713
Epoch: [183]  [1200/1251]  eta: 0:00:26  lr: 0.001469  min_lr: 0.001469  loss: 2.9019 (3.0193)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8977 (nan)  time: 0.5230  data: 0.0005  max mem: 43713
Epoch: [183]  [1250/1251]  eta: 0:00:00  lr: 0.001469  min_lr: 0.001469  loss: 3.1815 (3.0261)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8270 (nan)  time: 0.4485  data: 0.0006  max mem: 43713
Epoch: [183] Total time: 0:10:59 (0.5270 s / it)
Averaged stats: lr: 0.001469  min_lr: 0.001469  loss: 3.1815 (3.0224)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8270 (nan)
Test:  [ 0/25]  eta: 0:02:07  loss: 0.5880 (0.5880)  acc1: 90.4000 (90.4000)  acc5: 99.6000 (99.6000)  time: 5.0865  data: 4.7774  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7996 (0.7978)  acc1: 85.6000 (85.0545)  acc5: 97.2000 (97.7091)  time: 0.7032  data: 0.4346  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9458 (0.9297)  acc1: 81.2000 (82.0381)  acc5: 96.0000 (96.3238)  time: 0.2648  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0401 (0.9407)  acc1: 79.6000 (81.6640)  acc5: 95.6000 (96.1600)  time: 0.2648  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4620 s / it)
* Acc@1 81.912 Acc@5 96.220 loss 0.930
Accuracy of the model on the 50000 test images: 81.9%
Max accuracy: 82.00%
Epoch: [184]  [   0/1251]  eta: 1:09:04  lr: 0.001469  min_lr: 0.001469  loss: 3.2232 (3.2232)  weight_decay: 0.0500 (0.0500)  time: 3.3126  data: 2.2082  max mem: 43713
Epoch: [184]  [ 200/1251]  eta: 0:09:29  lr: 0.001465  min_lr: 0.001465  loss: 3.0755 (2.9936)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7984 (1.0738)  time: 0.5243  data: 0.0004  max mem: 43713
Epoch: [184]  [ 400/1251]  eta: 0:07:33  lr: 0.001462  min_lr: 0.001462  loss: 3.1367 (2.9975)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8517 (0.9763)  time: 0.5237  data: 0.0004  max mem: 43713
Epoch: [184]  [ 600/1251]  eta: 0:05:45  lr: 0.001458  min_lr: 0.001458  loss: 3.0916 (2.9849)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7862 (0.9540)  time: 0.5293  data: 0.0004  max mem: 43713
Epoch: [184]  [ 800/1251]  eta: 0:03:58  lr: 0.001455  min_lr: 0.001455  loss: 3.2355 (3.0025)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7988 (0.9542)  time: 0.5236  data: 0.0004  max mem: 43713
Epoch: [184]  [1000/1251]  eta: 0:02:12  lr: 0.001451  min_lr: 0.001451  loss: 3.1231 (3.0125)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8579 (0.9643)  time: 0.5225  data: 0.0005  max mem: 43713
Epoch: [184]  [1200/1251]  eta: 0:00:26  lr: 0.001448  min_lr: 0.001448  loss: 3.0325 (3.0228)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9224 (0.9732)  time: 0.5239  data: 0.0005  max mem: 43713
Epoch: [184]  [1250/1251]  eta: 0:00:00  lr: 0.001447  min_lr: 0.001447  loss: 3.1984 (3.0222)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1159 (0.9880)  time: 0.4435  data: 0.0005  max mem: 43713
Epoch: [184] Total time: 0:10:59 (0.5271 s / it)
Averaged stats: lr: 0.001447  min_lr: 0.001447  loss: 3.1984 (3.0054)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1159 (0.9880)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.5830 (0.5830)  acc1: 88.8000 (88.8000)  acc5: 99.2000 (99.2000)  time: 5.5133  data: 5.2276  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7686 (0.7590)  acc1: 84.8000 (84.5818)  acc5: 98.0000 (97.8182)  time: 0.7414  data: 0.4755  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9533 (0.8889)  acc1: 79.6000 (81.5238)  acc5: 96.0000 (96.4952)  time: 0.2641  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9844 (0.8985)  acc1: 78.4000 (81.0400)  acc5: 96.0000 (96.3680)  time: 0.2640  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4792 s / it)
* Acc@1 81.964 Acc@5 96.212 loss 0.886
Accuracy of the model on the 50000 test images: 82.0%
Max accuracy: 82.00%
Epoch: [185]  [   0/1251]  eta: 1:11:42  lr: 0.001447  min_lr: 0.001447  loss: 3.1034 (3.1034)  weight_decay: 0.0500 (0.0500)  time: 3.4391  data: 2.8189  max mem: 43713
Epoch: [185]  [ 200/1251]  eta: 0:09:26  lr: 0.001444  min_lr: 0.001444  loss: 3.1746 (2.9656)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0232 (0.9408)  time: 0.5254  data: 0.0004  max mem: 43713
Epoch: [185]  [ 400/1251]  eta: 0:07:33  lr: 0.001440  min_lr: 0.001440  loss: 3.0334 (2.9628)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6676 (0.8809)  time: 0.5404  data: 0.0004  max mem: 43713
Epoch: [185]  [ 600/1251]  eta: 0:05:45  lr: 0.001437  min_lr: 0.001437  loss: 3.0939 (2.9781)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8477 (0.9215)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [185]  [ 800/1251]  eta: 0:03:58  lr: 0.001433  min_lr: 0.001433  loss: 3.1800 (2.9850)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8065 (0.9262)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [185]  [1000/1251]  eta: 0:02:12  lr: 0.001430  min_lr: 0.001430  loss: 3.2024 (2.9883)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8827 (0.9480)  time: 0.5222  data: 0.0004  max mem: 43713
Epoch: [185]  [1200/1251]  eta: 0:00:26  lr: 0.001426  min_lr: 0.001426  loss: 3.1543 (2.9935)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4116 (0.9716)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [185]  [1250/1251]  eta: 0:00:00  lr: 0.001426  min_lr: 0.001426  loss: 3.2214 (2.9980)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9542 (0.9822)  time: 0.4434  data: 0.0005  max mem: 43713
Epoch: [185] Total time: 0:10:58 (0.5265 s / it)
Averaged stats: lr: 0.001426  min_lr: 0.001426  loss: 3.2214 (2.9977)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9542 (0.9822)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.6192 (0.6192)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 5.4611  data: 5.1591  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7503 (0.7645)  acc1: 86.0000 (85.2727)  acc5: 98.4000 (97.8182)  time: 0.7371  data: 0.4693  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9685 (0.8982)  acc1: 80.4000 (81.9810)  acc5: 95.6000 (96.4000)  time: 0.2648  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9995 (0.9112)  acc1: 80.4000 (81.4240)  acc5: 95.6000 (96.3680)  time: 0.2648  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4766 s / it)
* Acc@1 82.042 Acc@5 96.308 loss 0.904
Accuracy of the model on the 50000 test images: 82.0%
Max accuracy: 82.04%
Epoch: [186]  [   0/1251]  eta: 0:56:23  lr: 0.001425  min_lr: 0.001425  loss: 3.3826 (3.3826)  weight_decay: 0.0500 (0.0500)  time: 2.7043  data: 2.1701  max mem: 43713
Epoch: [186]  [ 200/1251]  eta: 0:09:24  lr: 0.001422  min_lr: 0.001422  loss: 2.7807 (2.9949)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8388 (1.1060)  time: 0.5325  data: 0.0005  max mem: 43713
Epoch: [186]  [ 400/1251]  eta: 0:07:32  lr: 0.001419  min_lr: 0.001419  loss: 2.8908 (2.9985)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8361 (1.0376)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [186]  [ 600/1251]  eta: 0:05:44  lr: 0.001415  min_lr: 0.001415  loss: 2.9985 (3.0040)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9532 (1.0151)  time: 0.5223  data: 0.0004  max mem: 43713
Epoch: [186]  [ 800/1251]  eta: 0:03:58  lr: 0.001412  min_lr: 0.001412  loss: 3.2335 (3.0074)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9773 (1.0190)  time: 0.5289  data: 0.0005  max mem: 43713
Epoch: [186]  [1000/1251]  eta: 0:02:12  lr: 0.001408  min_lr: 0.001408  loss: 3.0335 (3.0142)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8245 (1.0129)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [186]  [1200/1251]  eta: 0:00:26  lr: 0.001405  min_lr: 0.001405  loss: 2.8631 (3.0050)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9171 (1.0131)  time: 0.5286  data: 0.0004  max mem: 43713
Epoch: [186]  [1250/1251]  eta: 0:00:00  lr: 0.001404  min_lr: 0.001404  loss: 3.3143 (3.0086)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0305 (1.0237)  time: 0.4437  data: 0.0005  max mem: 43713
Epoch: [186] Total time: 0:10:58 (0.5261 s / it)
Averaged stats: lr: 0.001404  min_lr: 0.001404  loss: 3.3143 (2.9966)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0305 (1.0237)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.6863 (0.6863)  acc1: 88.4000 (88.4000)  acc5: 99.2000 (99.2000)  time: 5.7369  data: 5.4301  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8582 (0.8472)  acc1: 86.0000 (85.3455)  acc5: 98.0000 (97.8546)  time: 0.7621  data: 0.4939  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0097 (0.9881)  acc1: 80.8000 (81.9810)  acc5: 96.4000 (96.4000)  time: 0.2647  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.1185 (1.0001)  acc1: 80.4000 (81.5520)  acc5: 96.0000 (96.3520)  time: 0.2648  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4883 s / it)
* Acc@1 82.040 Acc@5 96.220 loss 0.997
Accuracy of the model on the 50000 test images: 82.0%
Max accuracy: 82.04%
Epoch: [187]  [   0/1251]  eta: 1:16:40  lr: 0.001404  min_lr: 0.001404  loss: 2.7453 (2.7453)  weight_decay: 0.0500 (0.0500)  time: 3.6774  data: 3.0558  max mem: 43713
Epoch: [187]  [ 200/1251]  eta: 0:09:31  lr: 0.001401  min_lr: 0.001401  loss: 3.2001 (2.9981)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7963 (0.9349)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [187]  [ 400/1251]  eta: 0:07:33  lr: 0.001397  min_lr: 0.001397  loss: 2.9647 (2.9668)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8336 (0.9785)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [187]  [ 600/1251]  eta: 0:05:45  lr: 0.001394  min_lr: 0.001394  loss: 3.1076 (2.9749)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8580 (0.9794)  time: 0.5285  data: 0.0005  max mem: 43713
Epoch: [187]  [ 800/1251]  eta: 0:03:58  lr: 0.001390  min_lr: 0.001390  loss: 3.0570 (2.9868)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9992 (1.0533)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [187]  [1000/1251]  eta: 0:02:12  lr: 0.001387  min_lr: 0.001387  loss: 3.2710 (2.9926)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9429 (1.0375)  time: 0.5311  data: 0.0005  max mem: 43713
Epoch: [187]  [1200/1251]  eta: 0:00:26  lr: 0.001383  min_lr: 0.001383  loss: 2.8331 (2.9907)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0291 (1.0264)  time: 0.5226  data: 0.0005  max mem: 43713
Epoch: [187]  [1250/1251]  eta: 0:00:00  lr: 0.001383  min_lr: 0.001383  loss: 3.0221 (2.9912)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9037 (1.0253)  time: 0.4432  data: 0.0007  max mem: 43713
Epoch: [187] Total time: 0:10:59 (0.5272 s / it)
Averaged stats: lr: 0.001383  min_lr: 0.001383  loss: 3.0221 (2.9920)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9037 (1.0253)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.6193 (0.6193)  acc1: 90.0000 (90.0000)  acc5: 98.8000 (98.8000)  time: 5.4623  data: 5.1533  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7790 (0.7654)  acc1: 85.6000 (85.9273)  acc5: 98.0000 (97.7455)  time: 0.7365  data: 0.4688  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9370 (0.9111)  acc1: 81.2000 (82.4381)  acc5: 96.0000 (96.2476)  time: 0.2638  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0022 (0.9240)  acc1: 80.4000 (81.9200)  acc5: 95.2000 (96.1440)  time: 0.2637  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4769 s / it)
* Acc@1 82.128 Acc@5 96.164 loss 0.914
Accuracy of the model on the 50000 test images: 82.1%
Max accuracy: 82.13%
Epoch: [188]  [   0/1251]  eta: 0:58:58  lr: 0.001383  min_lr: 0.001383  loss: 3.1456 (3.1456)  weight_decay: 0.0500 (0.0500)  time: 2.8287  data: 2.2829  max mem: 43713
Epoch: [188]  [ 200/1251]  eta: 0:09:23  lr: 0.001379  min_lr: 0.001379  loss: 2.4883 (2.8764)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7312 (0.9796)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [188]  [ 400/1251]  eta: 0:07:31  lr: 0.001376  min_lr: 0.001376  loss: 3.1285 (2.9264)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9139 (1.0089)  time: 0.5330  data: 0.0004  max mem: 43713
Epoch: [188]  [ 600/1251]  eta: 0:05:44  lr: 0.001372  min_lr: 0.001372  loss: 3.1620 (2.9505)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0924 (1.0406)  time: 0.5236  data: 0.0004  max mem: 43713
Epoch: [188]  [ 800/1251]  eta: 0:03:57  lr: 0.001369  min_lr: 0.001369  loss: 3.0547 (2.9612)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9658 (1.0296)  time: 0.5224  data: 0.0005  max mem: 43713
Epoch: [188]  [1000/1251]  eta: 0:02:12  lr: 0.001366  min_lr: 0.001366  loss: 3.0947 (2.9711)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0140 (1.0466)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [188]  [1200/1251]  eta: 0:00:26  lr: 0.001362  min_lr: 0.001362  loss: 3.1895 (2.9668)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7745 (1.0236)  time: 0.5235  data: 0.0004  max mem: 43713
Epoch: [188]  [1250/1251]  eta: 0:00:00  lr: 0.001361  min_lr: 0.001361  loss: 3.0918 (2.9710)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8880 (1.0387)  time: 0.4435  data: 0.0007  max mem: 43713
Epoch: [188] Total time: 0:10:58 (0.5261 s / it)
Averaged stats: lr: 0.001361  min_lr: 0.001361  loss: 3.0918 (2.9816)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8880 (1.0387)
Test:  [ 0/25]  eta: 0:02:11  loss: 0.6524 (0.6524)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 5.2627  data: 4.9663  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.8264 (0.8059)  acc1: 86.4000 (85.6364)  acc5: 98.0000 (97.7455)  time: 0.7191  data: 0.4518  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9655 (0.9437)  acc1: 80.0000 (82.0762)  acc5: 96.4000 (96.4191)  time: 0.2648  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0429 (0.9600)  acc1: 79.6000 (81.6320)  acc5: 95.6000 (96.2080)  time: 0.2649  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4718 s / it)
* Acc@1 82.024 Acc@5 96.310 loss 0.949
Accuracy of the model on the 50000 test images: 82.0%
Max accuracy: 82.13%
Epoch: [189]  [   0/1251]  eta: 1:12:59  lr: 0.001361  min_lr: 0.001361  loss: 2.7934 (2.7934)  weight_decay: 0.0500 (0.0500)  time: 3.5010  data: 2.8045  max mem: 43713
Epoch: [189]  [ 200/1251]  eta: 0:09:27  lr: 0.001358  min_lr: 0.001358  loss: 3.2319 (2.9937)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9128 (1.2386)  time: 0.5236  data: 0.0005  max mem: 43713
Epoch: [189]  [ 400/1251]  eta: 0:07:34  lr: 0.001355  min_lr: 0.001355  loss: 3.1848 (2.9595)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8293 (1.0390)  time: 0.5243  data: 0.0004  max mem: 43713
Epoch: [189]  [ 600/1251]  eta: 0:05:45  lr: 0.001351  min_lr: 0.001351  loss: 2.7937 (2.9750)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8687 (1.0614)  time: 0.5237  data: 0.0004  max mem: 43713
Epoch: [189]  [ 800/1251]  eta: 0:03:58  lr: 0.001348  min_lr: 0.001348  loss: 3.1276 (2.9821)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9643 (1.0823)  time: 0.5282  data: 0.0005  max mem: 43713
Epoch: [189]  [1000/1251]  eta: 0:02:12  lr: 0.001344  min_lr: 0.001344  loss: 2.9415 (2.9717)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0084 (1.0840)  time: 0.5225  data: 0.0005  max mem: 43713
Epoch: [189]  [1200/1251]  eta: 0:00:26  lr: 0.001341  min_lr: 0.001341  loss: 2.9136 (2.9646)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9189 (1.0708)  time: 0.5224  data: 0.0005  max mem: 43713
Epoch: [189]  [1250/1251]  eta: 0:00:00  lr: 0.001340  min_lr: 0.001340  loss: 3.0340 (2.9612)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8427 (1.0660)  time: 0.4476  data: 0.0005  max mem: 43713
Epoch: [189] Total time: 0:10:59 (0.5270 s / it)
Averaged stats: lr: 0.001340  min_lr: 0.001340  loss: 3.0340 (2.9763)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8427 (1.0660)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.5680 (0.5680)  acc1: 88.8000 (88.8000)  acc5: 99.6000 (99.6000)  time: 5.3298  data: 5.0254  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.8074 (0.7505)  acc1: 85.6000 (85.4545)  acc5: 97.6000 (97.7818)  time: 0.7251  data: 0.4571  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9243 (0.8833)  acc1: 79.2000 (81.9429)  acc5: 95.2000 (96.1524)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9611 (0.8945)  acc1: 79.2000 (81.5040)  acc5: 95.2000 (96.0800)  time: 0.2645  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4717 s / it)
* Acc@1 82.174 Acc@5 96.240 loss 0.874
Accuracy of the model on the 50000 test images: 82.2%
Max accuracy: 82.17%
Epoch: [190]  [   0/1251]  eta: 0:52:04  lr: 0.001340  min_lr: 0.001340  loss: 2.3349 (2.3349)  weight_decay: 0.0500 (0.0500)  time: 2.4973  data: 1.9583  max mem: 43713
Epoch: [190]  [ 200/1251]  eta: 0:09:25  lr: 0.001337  min_lr: 0.001337  loss: 2.8834 (2.9767)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7609 (1.0361)  time: 0.5231  data: 0.0005  max mem: 43713
Epoch: [190]  [ 400/1251]  eta: 0:07:31  lr: 0.001333  min_lr: 0.001333  loss: 2.9919 (2.9549)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0122 (1.0537)  time: 0.5230  data: 0.0005  max mem: 43713
Epoch: [190]  [ 600/1251]  eta: 0:05:44  lr: 0.001330  min_lr: 0.001330  loss: 3.1500 (2.9745)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1666 (1.0924)  time: 0.5284  data: 0.0005  max mem: 43713
Epoch: [190]  [ 800/1251]  eta: 0:03:58  lr: 0.001327  min_lr: 0.001327  loss: 2.8644 (2.9835)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8589 (1.0664)  time: 0.5228  data: 0.0005  max mem: 43713
Epoch: [190]  [1000/1251]  eta: 0:02:12  lr: 0.001323  min_lr: 0.001323  loss: 3.1102 (2.9756)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.5228  data: 0.0005  max mem: 43713
Epoch: [190]  [1200/1251]  eta: 0:00:26  lr: 0.001320  min_lr: 0.001320  loss: 3.1808 (2.9790)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9744 (nan)  time: 0.5295  data: 0.0005  max mem: 43713
Epoch: [190]  [1250/1251]  eta: 0:00:00  lr: 0.001319  min_lr: 0.001319  loss: 3.1144 (2.9803)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9879 (nan)  time: 0.4433  data: 0.0006  max mem: 43713
Epoch: [190] Total time: 0:10:58 (0.5261 s / it)
Averaged stats: lr: 0.001319  min_lr: 0.001319  loss: 3.1144 (2.9857)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9879 (nan)
Test:  [ 0/25]  eta: 0:02:08  loss: 0.6572 (0.6572)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 5.1504  data: 4.8516  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.8037 (0.7883)  acc1: 85.2000 (85.2727)  acc5: 97.6000 (97.7091)  time: 0.7088  data: 0.4413  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9374 (0.9245)  acc1: 80.0000 (82.1143)  acc5: 96.0000 (96.3048)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9732 (0.9336)  acc1: 79.6000 (81.6800)  acc5: 95.6000 (96.2720)  time: 0.2644  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4654 s / it)
* Acc@1 82.306 Acc@5 96.328 loss 0.916
Accuracy of the model on the 50000 test images: 82.3%
Max accuracy: 82.31%
Epoch: [191]  [   0/1251]  eta: 0:56:41  lr: 0.001319  min_lr: 0.001319  loss: 3.3619 (3.3619)  weight_decay: 0.0500 (0.0500)  time: 2.7189  data: 2.1928  max mem: 43713
Epoch: [191]  [ 200/1251]  eta: 0:09:22  lr: 0.001316  min_lr: 0.001316  loss: 3.0050 (2.9433)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8446 (1.0048)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [191]  [ 400/1251]  eta: 0:07:30  lr: 0.001312  min_lr: 0.001312  loss: 3.0529 (2.9649)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7575 (0.9723)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [191]  [ 600/1251]  eta: 0:05:44  lr: 0.001309  min_lr: 0.001309  loss: 2.7918 (2.9499)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1305 (1.0595)  time: 0.5223  data: 0.0004  max mem: 43713
Epoch: [191]  [ 800/1251]  eta: 0:03:57  lr: 0.001305  min_lr: 0.001305  loss: 3.0574 (2.9652)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0331 (1.0723)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [191]  [1000/1251]  eta: 0:02:12  lr: 0.001302  min_lr: 0.001302  loss: 3.0261 (2.9759)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8734 (1.0683)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [191]  [1200/1251]  eta: 0:00:26  lr: 0.001299  min_lr: 0.001299  loss: 3.1993 (2.9836)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9673 (1.0896)  time: 0.5223  data: 0.0004  max mem: 43713
Epoch: [191]  [1250/1251]  eta: 0:00:00  lr: 0.001298  min_lr: 0.001298  loss: 3.1802 (2.9824)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9033 (1.0868)  time: 0.4436  data: 0.0005  max mem: 43713
Epoch: [191] Total time: 0:10:57 (0.5256 s / it)
Averaged stats: lr: 0.001298  min_lr: 0.001298  loss: 3.1802 (2.9813)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9033 (1.0868)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.6004 (0.6004)  acc1: 89.2000 (89.2000)  acc5: 98.4000 (98.4000)  time: 5.4048  data: 5.0960  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7552 (0.7361)  acc1: 86.4000 (85.7091)  acc5: 98.0000 (97.7091)  time: 0.7321  data: 0.4637  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8968 (0.8716)  acc1: 80.0000 (82.7429)  acc5: 96.0000 (96.2286)  time: 0.2649  data: 0.0003  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9479 (0.8847)  acc1: 79.6000 (82.3840)  acc5: 95.2000 (96.0960)  time: 0.2648  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4751 s / it)
* Acc@1 82.256 Acc@5 96.346 loss 0.875
Accuracy of the model on the 50000 test images: 82.3%
Max accuracy: 82.31%
Epoch: [192]  [   0/1251]  eta: 1:11:04  lr: 0.001298  min_lr: 0.001298  loss: 3.0224 (3.0224)  weight_decay: 0.0500 (0.0500)  time: 3.4091  data: 1.8672  max mem: 43713
Epoch: [192]  [ 200/1251]  eta: 0:09:26  lr: 0.001295  min_lr: 0.001295  loss: 3.0027 (3.0008)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1428 (1.4112)  time: 0.5299  data: 0.0004  max mem: 43713
Epoch: [192]  [ 400/1251]  eta: 0:07:33  lr: 0.001291  min_lr: 0.001291  loss: 3.0855 (2.9451)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9450 (1.1925)  time: 0.5231  data: 0.0004  max mem: 43713
Epoch: [192]  [ 600/1251]  eta: 0:05:44  lr: 0.001288  min_lr: 0.001288  loss: 3.1795 (2.9586)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9100 (1.1498)  time: 0.5239  data: 0.0005  max mem: 43713
Epoch: [192]  [ 800/1251]  eta: 0:03:58  lr: 0.001284  min_lr: 0.001284  loss: 3.1415 (2.9616)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0464 (1.1248)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [192]  [1000/1251]  eta: 0:02:12  lr: 0.001281  min_lr: 0.001281  loss: 3.1609 (2.9626)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1613 (1.1405)  time: 0.5239  data: 0.0004  max mem: 43713
Epoch: [192]  [1200/1251]  eta: 0:00:26  lr: 0.001278  min_lr: 0.001278  loss: 2.9395 (2.9624)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1031 (1.1287)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [192]  [1250/1251]  eta: 0:00:00  lr: 0.001277  min_lr: 0.001277  loss: 3.1650 (2.9639)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2570 (1.1416)  time: 0.4435  data: 0.0007  max mem: 43713
Epoch: [192] Total time: 0:10:58 (0.5264 s / it)
Averaged stats: lr: 0.001277  min_lr: 0.001277  loss: 3.1650 (2.9798)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2570 (1.1416)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.6211 (0.6211)  acc1: 90.4000 (90.4000)  acc5: 98.8000 (98.8000)  time: 5.7966  data: 5.4987  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8134 (0.7809)  acc1: 85.6000 (85.0545)  acc5: 98.0000 (97.8182)  time: 0.7676  data: 0.5002  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9373 (0.9224)  acc1: 80.0000 (82.0000)  acc5: 96.4000 (96.3429)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0047 (0.9314)  acc1: 80.4000 (81.8400)  acc5: 95.6000 (96.2880)  time: 0.2644  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4901 s / it)
* Acc@1 82.370 Acc@5 96.294 loss 0.919
Accuracy of the model on the 50000 test images: 82.4%
Max accuracy: 82.37%
Epoch: [193]  [   0/1251]  eta: 0:59:13  lr: 0.001277  min_lr: 0.001277  loss: 3.0347 (3.0347)  weight_decay: 0.0500 (0.0500)  time: 2.8406  data: 2.3111  max mem: 43713
Epoch: [193]  [ 200/1251]  eta: 0:09:27  lr: 0.001274  min_lr: 0.001274  loss: 3.0723 (3.0232)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0127 (1.0423)  time: 0.5323  data: 0.0004  max mem: 43713
Epoch: [193]  [ 400/1251]  eta: 0:07:32  lr: 0.001270  min_lr: 0.001270  loss: 3.0702 (2.9922)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9000 (0.9977)  time: 0.5232  data: 0.0005  max mem: 43713
Epoch: [193]  [ 600/1251]  eta: 0:05:44  lr: 0.001267  min_lr: 0.001267  loss: 3.1391 (2.9939)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1183 (1.0256)  time: 0.5231  data: 0.0005  max mem: 43713
Epoch: [193]  [ 800/1251]  eta: 0:03:58  lr: 0.001264  min_lr: 0.001264  loss: 3.0012 (2.9970)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3061 (1.0681)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [193]  [1000/1251]  eta: 0:02:12  lr: 0.001260  min_lr: 0.001260  loss: 2.9061 (2.9810)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8272 (1.0385)  time: 0.5233  data: 0.0005  max mem: 43713
Epoch: [193]  [1200/1251]  eta: 0:00:26  lr: 0.001257  min_lr: 0.001257  loss: 2.8286 (2.9748)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0848 (1.0454)  time: 0.5232  data: 0.0005  max mem: 43713
Epoch: [193]  [1250/1251]  eta: 0:00:00  lr: 0.001256  min_lr: 0.001256  loss: 3.1835 (2.9750)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2085 (1.0533)  time: 0.4504  data: 0.0006  max mem: 43713
Epoch: [193] Total time: 0:10:58 (0.5265 s / it)
Averaged stats: lr: 0.001256  min_lr: 0.001256  loss: 3.1835 (2.9721)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2085 (1.0533)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.7573 (0.7573)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 5.3559  data: 5.0646  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.8854 (0.8784)  acc1: 85.6000 (85.6000)  acc5: 98.0000 (97.8545)  time: 0.7268  data: 0.4607  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0435 (1.0044)  acc1: 81.6000 (82.9333)  acc5: 96.0000 (96.2667)  time: 0.2638  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0584 (1.0176)  acc1: 81.2000 (82.5280)  acc5: 95.6000 (96.2560)  time: 0.2637  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4710 s / it)
* Acc@1 82.568 Acc@5 96.260 loss 1.007
Accuracy of the model on the 50000 test images: 82.6%
Max accuracy: 82.57%
Epoch: [194]  [   0/1251]  eta: 0:54:43  lr: 0.001256  min_lr: 0.001256  loss: 2.5008 (2.5008)  weight_decay: 0.0500 (0.0500)  time: 2.6244  data: 2.0880  max mem: 43713
Epoch: [194]  [ 200/1251]  eta: 0:09:22  lr: 0.001253  min_lr: 0.001253  loss: 2.8671 (2.9382)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8563 (1.0747)  time: 0.5235  data: 0.0004  max mem: 43713
Epoch: [194]  [ 400/1251]  eta: 0:07:30  lr: 0.001249  min_lr: 0.001249  loss: 3.1086 (2.9490)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9526 (1.1318)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [194]  [ 600/1251]  eta: 0:05:44  lr: 0.001246  min_lr: 0.001246  loss: 2.9884 (2.9371)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9125 (nan)  time: 0.5243  data: 0.0004  max mem: 43713
Epoch: [194]  [ 800/1251]  eta: 0:03:58  lr: 0.001243  min_lr: 0.001243  loss: 3.0543 (2.9456)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4732 (nan)  time: 0.5233  data: 0.0004  max mem: 43713
Epoch: [194]  [1000/1251]  eta: 0:02:12  lr: 0.001239  min_lr: 0.001239  loss: 2.9895 (2.9449)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0170 (nan)  time: 0.5223  data: 0.0004  max mem: 43713
Epoch: [194]  [1200/1251]  eta: 0:00:26  lr: 0.001236  min_lr: 0.001236  loss: 2.6995 (2.9480)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8094 (nan)  time: 0.5233  data: 0.0004  max mem: 43713
Epoch: [194]  [1250/1251]  eta: 0:00:00  lr: 0.001235  min_lr: 0.001235  loss: 3.0456 (2.9471)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8954 (nan)  time: 0.4484  data: 0.0004  max mem: 43713
Epoch: [194] Total time: 0:10:57 (0.5257 s / it)
Averaged stats: lr: 0.001235  min_lr: 0.001235  loss: 3.0456 (2.9607)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8954 (nan)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.6270 (0.6270)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 5.8671  data: 5.5685  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8007 (0.7755)  acc1: 85.6000 (85.0182)  acc5: 97.6000 (97.6000)  time: 0.7739  data: 0.5065  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9385 (0.9069)  acc1: 80.4000 (82.1524)  acc5: 96.4000 (96.1333)  time: 0.2644  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9703 (0.9187)  acc1: 80.4000 (81.7920)  acc5: 96.0000 (96.2080)  time: 0.2644  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4935 s / it)
* Acc@1 82.198 Acc@5 96.362 loss 0.906
Accuracy of the model on the 50000 test images: 82.2%
Max accuracy: 82.57%
Epoch: [195]  [   0/1251]  eta: 1:14:56  lr: 0.001235  min_lr: 0.001235  loss: 2.5053 (2.5053)  weight_decay: 0.0500 (0.0500)  time: 3.5945  data: 2.4272  max mem: 43713
Epoch: [195]  [ 200/1251]  eta: 0:09:26  lr: 0.001232  min_lr: 0.001232  loss: 2.4649 (2.8906)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [195]  [ 400/1251]  eta: 0:07:33  lr: 0.001229  min_lr: 0.001229  loss: 3.1693 (2.9158)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9782 (nan)  time: 0.5330  data: 0.0005  max mem: 43713
Epoch: [195]  [ 600/1251]  eta: 0:05:45  lr: 0.001225  min_lr: 0.001225  loss: 3.0709 (2.9013)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0974 (nan)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [195]  [ 800/1251]  eta: 0:03:58  lr: 0.001222  min_lr: 0.001222  loss: 3.1780 (2.9250)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8736 (nan)  time: 0.5285  data: 0.0005  max mem: 43713
Epoch: [195]  [1000/1251]  eta: 0:02:12  lr: 0.001219  min_lr: 0.001219  loss: 3.0517 (2.9308)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1999 (nan)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [195]  [1200/1251]  eta: 0:00:26  lr: 0.001215  min_lr: 0.001215  loss: 3.0156 (2.9325)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8897 (nan)  time: 0.5241  data: 0.0005  max mem: 43713
Epoch: [195]  [1250/1251]  eta: 0:00:00  lr: 0.001215  min_lr: 0.001215  loss: 3.0604 (2.9366)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0016 (nan)  time: 0.4436  data: 0.0007  max mem: 43713
Epoch: [195] Total time: 0:10:58 (0.5266 s / it)
Averaged stats: lr: 0.001215  min_lr: 0.001215  loss: 3.0604 (2.9432)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0016 (nan)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.6434 (0.6434)  acc1: 90.8000 (90.8000)  acc5: 98.4000 (98.4000)  time: 5.6746  data: 5.3619  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8259 (0.8048)  acc1: 86.4000 (85.1636)  acc5: 98.0000 (97.6727)  time: 0.7564  data: 0.4877  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0173 (0.9402)  acc1: 79.6000 (82.2095)  acc5: 96.0000 (96.2667)  time: 0.2646  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0145 (0.9526)  acc1: 79.6000 (81.8240)  acc5: 96.0000 (96.2880)  time: 0.2645  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4868 s / it)
* Acc@1 82.336 Acc@5 96.334 loss 0.938
Accuracy of the model on the 50000 test images: 82.3%
Max accuracy: 82.57%
Epoch: [196]  [   0/1251]  eta: 1:15:32  lr: 0.001215  min_lr: 0.001215  loss: 3.2108 (3.2108)  weight_decay: 0.0500 (0.0500)  time: 3.6231  data: 2.6753  max mem: 43713
Epoch: [196]  [ 200/1251]  eta: 0:09:30  lr: 0.001211  min_lr: 0.001211  loss: 2.6019 (2.9460)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9312 (1.0593)  time: 0.5315  data: 0.0005  max mem: 43713
Epoch: [196]  [ 400/1251]  eta: 0:07:34  lr: 0.001208  min_lr: 0.001208  loss: 2.9714 (2.9681)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0619 (1.0362)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [196]  [ 600/1251]  eta: 0:05:45  lr: 0.001205  min_lr: 0.001205  loss: 3.0764 (2.9489)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8112 (1.0818)  time: 0.5239  data: 0.0005  max mem: 43713
Epoch: [196]  [ 800/1251]  eta: 0:03:59  lr: 0.001201  min_lr: 0.001201  loss: 2.8769 (2.9376)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9753 (1.1121)  time: 0.5300  data: 0.0005  max mem: 43713
Epoch: [196]  [1000/1251]  eta: 0:02:12  lr: 0.001198  min_lr: 0.001198  loss: 3.0660 (2.9294)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0687 (1.1171)  time: 0.5234  data: 0.0005  max mem: 43713
Epoch: [196]  [1200/1251]  eta: 0:00:26  lr: 0.001195  min_lr: 0.001195  loss: 3.1888 (2.9418)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0358 (1.1091)  time: 0.5231  data: 0.0005  max mem: 43713
Epoch: [196]  [1250/1251]  eta: 0:00:00  lr: 0.001194  min_lr: 0.001194  loss: 3.1607 (2.9474)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2095 (1.1199)  time: 0.4497  data: 0.0006  max mem: 43713
Epoch: [196] Total time: 0:10:59 (0.5271 s / it)
Averaged stats: lr: 0.001194  min_lr: 0.001194  loss: 3.1607 (2.9446)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2095 (1.1199)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6735 (0.6735)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 5.5707  data: 5.2649  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8397 (0.8212)  acc1: 87.2000 (85.9273)  acc5: 98.0000 (97.8546)  time: 0.7464  data: 0.4789  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0393 (0.9613)  acc1: 79.6000 (82.3048)  acc5: 95.6000 (96.3810)  time: 0.2638  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0560 (0.9784)  acc1: 79.6000 (81.8080)  acc5: 95.6000 (96.3040)  time: 0.2637  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4809 s / it)
* Acc@1 82.198 Acc@5 96.386 loss 0.966
Accuracy of the model on the 50000 test images: 82.2%
Max accuracy: 82.57%
Epoch: [197]  [   0/1251]  eta: 1:12:28  lr: 0.001194  min_lr: 0.001194  loss: 3.3685 (3.3685)  weight_decay: 0.0500 (0.0500)  time: 3.4759  data: 2.1992  max mem: 43713
Epoch: [197]  [ 200/1251]  eta: 0:09:29  lr: 0.001191  min_lr: 0.001191  loss: 2.8466 (2.9430)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0706 (1.1392)  time: 0.5225  data: 0.0005  max mem: 43713
Epoch: [197]  [ 400/1251]  eta: 0:07:33  lr: 0.001187  min_lr: 0.001187  loss: 3.1050 (2.9569)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9916 (1.1493)  time: 0.5238  data: 0.0005  max mem: 43713
Epoch: [197]  [ 600/1251]  eta: 0:05:45  lr: 0.001184  min_lr: 0.001184  loss: 3.0783 (2.9558)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9424 (1.1275)  time: 0.5225  data: 0.0005  max mem: 43713
Epoch: [197]  [ 800/1251]  eta: 0:03:58  lr: 0.001181  min_lr: 0.001181  loss: 3.2007 (2.9619)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9364 (1.1507)  time: 0.5235  data: 0.0005  max mem: 43713
Epoch: [197]  [1000/1251]  eta: 0:02:12  lr: 0.001178  min_lr: 0.001178  loss: 2.9481 (2.9547)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0309 (1.1274)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [197]  [1200/1251]  eta: 0:00:26  lr: 0.001174  min_lr: 0.001174  loss: 3.1137 (2.9487)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8787 (1.1389)  time: 0.5299  data: 0.0005  max mem: 43713
Epoch: [197]  [1250/1251]  eta: 0:00:00  lr: 0.001174  min_lr: 0.001174  loss: 2.9844 (2.9513)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8026 (1.1285)  time: 0.4434  data: 0.0007  max mem: 43713
Epoch: [197] Total time: 0:10:59 (0.5273 s / it)
Averaged stats: lr: 0.001174  min_lr: 0.001174  loss: 2.9844 (2.9484)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8026 (1.1285)
Test:  [ 0/25]  eta: 0:02:08  loss: 0.6313 (0.6313)  acc1: 88.4000 (88.4000)  acc5: 99.2000 (99.2000)  time: 5.1362  data: 4.8329  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7665 (0.7739)  acc1: 87.2000 (85.2727)  acc5: 98.0000 (97.8545)  time: 0.7077  data: 0.4396  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9361 (0.9081)  acc1: 79.6000 (82.2857)  acc5: 95.6000 (96.3810)  time: 0.2648  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0197 (0.9203)  acc1: 79.6000 (81.8720)  acc5: 95.6000 (96.3200)  time: 0.2648  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4645 s / it)
* Acc@1 82.506 Acc@5 96.360 loss 0.903
Accuracy of the model on the 50000 test images: 82.5%
Max accuracy: 82.57%
Epoch: [198]  [   0/1251]  eta: 1:15:42  lr: 0.001174  min_lr: 0.001174  loss: 2.2056 (2.2056)  weight_decay: 0.0500 (0.0500)  time: 3.6313  data: 2.9928  max mem: 43713
Epoch: [198]  [ 200/1251]  eta: 0:09:26  lr: 0.001170  min_lr: 0.001170  loss: 2.8734 (2.9342)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9711 (1.0715)  time: 0.5243  data: 0.0005  max mem: 43713
Epoch: [198]  [ 400/1251]  eta: 0:07:33  lr: 0.001167  min_lr: 0.001167  loss: 3.0680 (2.9285)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9394 (1.1033)  time: 0.5314  data: 0.0005  max mem: 43713
Epoch: [198]  [ 600/1251]  eta: 0:05:45  lr: 0.001164  min_lr: 0.001164  loss: 3.0288 (2.9379)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0707 (1.1118)  time: 0.5230  data: 0.0005  max mem: 43713
Epoch: [198]  [ 800/1251]  eta: 0:03:58  lr: 0.001161  min_lr: 0.001161  loss: 3.0907 (2.9429)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9635 (1.1031)  time: 0.5225  data: 0.0005  max mem: 43713
Epoch: [198]  [1000/1251]  eta: 0:02:12  lr: 0.001157  min_lr: 0.001157  loss: 3.2074 (2.9435)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8497 (1.1140)  time: 0.5276  data: 0.0005  max mem: 43713
Epoch: [198]  [1200/1251]  eta: 0:00:26  lr: 0.001154  min_lr: 0.001154  loss: 2.7690 (2.9411)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8487 (1.1041)  time: 0.5234  data: 0.0005  max mem: 43713
Epoch: [198]  [1250/1251]  eta: 0:00:00  lr: 0.001153  min_lr: 0.001153  loss: 2.9083 (2.9393)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8719 (1.0981)  time: 0.4435  data: 0.0006  max mem: 43713
Epoch: [198] Total time: 0:10:58 (0.5267 s / it)
Averaged stats: lr: 0.001153  min_lr: 0.001153  loss: 2.9083 (2.9379)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8719 (1.0981)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.6014 (0.6014)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 5.3038  data: 4.9955  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7715 (0.7553)  acc1: 86.0000 (85.2727)  acc5: 98.0000 (98.0727)  time: 0.7228  data: 0.4545  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9654 (0.8829)  acc1: 80.4000 (82.4762)  acc5: 96.8000 (96.5905)  time: 0.2646  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9654 (0.9006)  acc1: 80.4000 (81.9680)  acc5: 95.2000 (96.4160)  time: 0.2647  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4719 s / it)
* Acc@1 82.690 Acc@5 96.444 loss 0.886
Accuracy of the model on the 50000 test images: 82.7%
Max accuracy: 82.69%
Epoch: [199]  [   0/1251]  eta: 0:56:31  lr: 0.001153  min_lr: 0.001153  loss: 3.0184 (3.0184)  weight_decay: 0.0500 (0.0500)  time: 2.7111  data: 2.1626  max mem: 43713
Epoch: [199]  [ 200/1251]  eta: 0:09:24  lr: 0.001150  min_lr: 0.001150  loss: 3.0568 (2.9159)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0866 (1.2927)  time: 0.5337  data: 0.0004  max mem: 43713
Epoch: [199]  [ 400/1251]  eta: 0:07:32  lr: 0.001147  min_lr: 0.001147  loss: 2.8284 (2.9108)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0296 (1.2086)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [199]  [ 600/1251]  eta: 0:05:44  lr: 0.001143  min_lr: 0.001143  loss: 2.9799 (2.9390)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1310 (1.2506)  time: 0.5231  data: 0.0004  max mem: 43713
Epoch: [199]  [ 800/1251]  eta: 0:03:58  lr: 0.001140  min_lr: 0.001140  loss: 2.8828 (2.9366)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0933 (1.2327)  time: 0.5362  data: 0.0004  max mem: 43713
Epoch: [199]  [1000/1251]  eta: 0:02:12  lr: 0.001137  min_lr: 0.001137  loss: 2.9745 (2.9365)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8036 (1.2048)  time: 0.5233  data: 0.0004  max mem: 43713
Epoch: [199]  [1200/1251]  eta: 0:00:26  lr: 0.001134  min_lr: 0.001134  loss: 2.9083 (2.9321)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3386 (1.1923)  time: 0.5231  data: 0.0004  max mem: 43713
Epoch: [199]  [1250/1251]  eta: 0:00:00  lr: 0.001133  min_lr: 0.001133  loss: 3.2930 (2.9339)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2397 (1.1912)  time: 0.4432  data: 0.0004  max mem: 43713
Epoch: [199] Total time: 0:10:57 (0.5257 s / it)
Averaged stats: lr: 0.001133  min_lr: 0.001133  loss: 3.2930 (2.9431)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2397 (1.1912)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6632 (0.6632)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 5.5827  data: 5.2714  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8310 (0.8044)  acc1: 85.6000 (85.4909)  acc5: 98.0000 (98.0000)  time: 0.7481  data: 0.4795  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9633 (0.9437)  acc1: 79.6000 (82.2095)  acc5: 96.4000 (96.4571)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0370 (0.9540)  acc1: 80.0000 (81.9840)  acc5: 95.2000 (96.3360)  time: 0.2644  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4823 s / it)
* Acc@1 82.566 Acc@5 96.400 loss 0.936
Accuracy of the model on the 50000 test images: 82.6%
Max accuracy: 82.69%
Epoch: [200]  [   0/1251]  eta: 1:13:37  lr: 0.001133  min_lr: 0.001133  loss: 3.1135 (3.1135)  weight_decay: 0.0500 (0.0500)  time: 3.5315  data: 2.2802  max mem: 43713
Epoch: [200]  [ 200/1251]  eta: 0:09:31  lr: 0.001130  min_lr: 0.001130  loss: 3.2031 (2.9509)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3075 (1.2340)  time: 0.5250  data: 0.0005  max mem: 43713
Epoch: [200]  [ 400/1251]  eta: 0:07:34  lr: 0.001126  min_lr: 0.001126  loss: 2.9296 (2.9285)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0313 (1.1223)  time: 0.5248  data: 0.0005  max mem: 43713
Epoch: [200]  [ 600/1251]  eta: 0:05:45  lr: 0.001123  min_lr: 0.001123  loss: 3.0174 (2.9044)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8606 (1.0565)  time: 0.5224  data: 0.0005  max mem: 43713
Epoch: [200]  [ 800/1251]  eta: 0:03:58  lr: 0.001120  min_lr: 0.001120  loss: 2.9445 (2.9066)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1735 (1.0812)  time: 0.5248  data: 0.0005  max mem: 43713
Epoch: [200]  [1000/1251]  eta: 0:02:12  lr: 0.001117  min_lr: 0.001117  loss: 2.8761 (2.9132)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3183 (1.1300)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [200]  [1200/1251]  eta: 0:00:26  lr: 0.001114  min_lr: 0.001114  loss: 2.9495 (2.9152)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8359 (1.1931)  time: 0.5228  data: 0.0005  max mem: 43713
Epoch: [200]  [1250/1251]  eta: 0:00:00  lr: 0.001113  min_lr: 0.001113  loss: 2.8136 (2.9157)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8388 (1.1840)  time: 0.4475  data: 0.0007  max mem: 43713
Epoch: [200] Total time: 0:10:59 (0.5274 s / it)
Averaged stats: lr: 0.001113  min_lr: 0.001113  loss: 2.8136 (2.9297)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8388 (1.1840)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.5838 (0.5838)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 5.6264  data: 5.3323  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7693 (0.7369)  acc1: 84.4000 (84.7636)  acc5: 97.6000 (97.6364)  time: 0.7522  data: 0.4851  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9046 (0.8633)  acc1: 80.4000 (82.0762)  acc5: 95.6000 (96.2857)  time: 0.2646  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9409 (0.8774)  acc1: 80.0000 (81.6640)  acc5: 95.2000 (96.2080)  time: 0.2645  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4839 s / it)
* Acc@1 82.458 Acc@5 96.410 loss 0.858
Accuracy of the model on the 50000 test images: 82.5%
Max accuracy: 82.69%
Epoch: [201]  [   0/1251]  eta: 1:12:36  lr: 0.001113  min_lr: 0.001113  loss: 3.4147 (3.4147)  weight_decay: 0.0500 (0.0500)  time: 3.4824  data: 2.8474  max mem: 43713
Epoch: [201]  [ 200/1251]  eta: 0:09:26  lr: 0.001110  min_lr: 0.001110  loss: 3.0961 (2.9458)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8603 (1.0131)  time: 0.5222  data: 0.0005  max mem: 43713
Epoch: [201]  [ 400/1251]  eta: 0:07:32  lr: 0.001106  min_lr: 0.001106  loss: 2.9907 (2.9212)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0378 (1.0641)  time: 0.5310  data: 0.0005  max mem: 43713
Epoch: [201]  [ 600/1251]  eta: 0:05:44  lr: 0.001103  min_lr: 0.001103  loss: 3.1005 (2.9354)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8654 (1.0357)  time: 0.5231  data: 0.0005  max mem: 43713
Epoch: [201]  [ 800/1251]  eta: 0:03:58  lr: 0.001100  min_lr: 0.001100  loss: 2.9190 (2.9499)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9209 (1.0152)  time: 0.5225  data: 0.0005  max mem: 43713
Epoch: [201]  [1000/1251]  eta: 0:02:12  lr: 0.001097  min_lr: 0.001097  loss: 2.8281 (2.9362)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7452 (1.0169)  time: 0.5359  data: 0.0005  max mem: 43713
Epoch: [201]  [1200/1251]  eta: 0:00:26  lr: 0.001094  min_lr: 0.001094  loss: 2.7850 (2.9353)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8577 (1.0003)  time: 0.5236  data: 0.0004  max mem: 43713
Epoch: [201]  [1250/1251]  eta: 0:00:00  lr: 0.001093  min_lr: 0.001093  loss: 3.0049 (2.9353)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9105 (0.9980)  time: 0.4443  data: 0.0005  max mem: 43713
Epoch: [201] Total time: 0:10:58 (0.5266 s / it)
Averaged stats: lr: 0.001093  min_lr: 0.001093  loss: 3.0049 (2.9234)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9105 (0.9980)
Test:  [ 0/25]  eta: 0:02:10  loss: 0.5975 (0.5975)  acc1: 90.4000 (90.4000)  acc5: 98.4000 (98.4000)  time: 5.2120  data: 4.9180  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7704 (0.7608)  acc1: 86.8000 (85.4909)  acc5: 98.0000 (97.8545)  time: 0.7146  data: 0.4474  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9412 (0.8945)  acc1: 80.8000 (82.5714)  acc5: 96.4000 (96.4762)  time: 0.2650  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9914 (0.9079)  acc1: 80.0000 (82.1280)  acc5: 95.6000 (96.4160)  time: 0.2650  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4681 s / it)
* Acc@1 82.590 Acc@5 96.432 loss 0.890
Accuracy of the model on the 50000 test images: 82.6%
Max accuracy: 82.69%
Epoch: [202]  [   0/1251]  eta: 1:16:22  lr: 0.001093  min_lr: 0.001093  loss: 1.8798 (1.8798)  weight_decay: 0.0500 (0.0500)  time: 3.6632  data: 3.0631  max mem: 43713
Epoch: [202]  [ 200/1251]  eta: 0:09:27  lr: 0.001090  min_lr: 0.001090  loss: 3.1101 (2.9353)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8091 (0.9801)  time: 0.5312  data: 0.0005  max mem: 43713
Epoch: [202]  [ 400/1251]  eta: 0:07:33  lr: 0.001086  min_lr: 0.001086  loss: 2.5674 (2.9203)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8417 (0.9801)  time: 0.5234  data: 0.0005  max mem: 43713
Epoch: [202]  [ 600/1251]  eta: 0:05:45  lr: 0.001083  min_lr: 0.001083  loss: 2.9580 (2.9180)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9254 (1.0224)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [202]  [ 800/1251]  eta: 0:03:58  lr: 0.001080  min_lr: 0.001080  loss: 3.1004 (2.9196)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0093 (1.0537)  time: 0.5293  data: 0.0004  max mem: 43713
Epoch: [202]  [1000/1251]  eta: 0:02:12  lr: 0.001077  min_lr: 0.001077  loss: 2.9121 (2.9158)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0445 (1.0696)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [202]  [1200/1251]  eta: 0:00:26  lr: 0.001074  min_lr: 0.001074  loss: 3.0297 (2.9273)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8547 (1.0553)  time: 0.5247  data: 0.0005  max mem: 43713
Epoch: [202]  [1250/1251]  eta: 0:00:00  lr: 0.001073  min_lr: 0.001073  loss: 3.2249 (2.9317)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9065 (1.0511)  time: 0.4437  data: 0.0007  max mem: 43713
Epoch: [202] Total time: 0:10:58 (0.5264 s / it)
Averaged stats: lr: 0.001073  min_lr: 0.001073  loss: 3.2249 (2.9101)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9065 (1.0511)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6549 (0.6549)  acc1: 88.0000 (88.0000)  acc5: 98.4000 (98.4000)  time: 5.4888  data: 5.1786  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8171 (0.8176)  acc1: 86.0000 (85.3091)  acc5: 98.0000 (97.6000)  time: 0.7392  data: 0.4711  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0062 (0.9407)  acc1: 81.6000 (82.5333)  acc5: 96.0000 (96.2667)  time: 0.2641  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0062 (0.9546)  acc1: 80.8000 (81.9840)  acc5: 95.6000 (96.2400)  time: 0.2641  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4771 s / it)
* Acc@1 82.668 Acc@5 96.330 loss 0.941
Accuracy of the model on the 50000 test images: 82.7%
Max accuracy: 82.69%
Epoch: [203]  [   0/1251]  eta: 1:15:43  lr: 0.001073  min_lr: 0.001073  loss: 2.4333 (2.4333)  weight_decay: 0.0500 (0.0500)  time: 3.6323  data: 2.6600  max mem: 43713
Epoch: [203]  [ 200/1251]  eta: 0:09:32  lr: 0.001070  min_lr: 0.001070  loss: 2.9083 (2.8601)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8446 (1.0442)  time: 0.5383  data: 0.0005  max mem: 43713
Epoch: [203]  [ 400/1251]  eta: 0:07:34  lr: 0.001066  min_lr: 0.001066  loss: 2.9504 (2.8584)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0842 (1.0502)  time: 0.5243  data: 0.0005  max mem: 43713
Epoch: [203]  [ 600/1251]  eta: 0:05:45  lr: 0.001063  min_lr: 0.001063  loss: 2.9828 (2.8911)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8802 (1.0208)  time: 0.5228  data: 0.0005  max mem: 43713
Epoch: [203]  [ 800/1251]  eta: 0:03:59  lr: 0.001060  min_lr: 0.001060  loss: 3.1480 (2.9106)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1762 (1.0679)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [203]  [1000/1251]  eta: 0:02:12  lr: 0.001057  min_lr: 0.001057  loss: 2.9062 (2.9007)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9194 (1.0764)  time: 0.5234  data: 0.0004  max mem: 43713
Epoch: [203]  [1200/1251]  eta: 0:00:26  lr: 0.001054  min_lr: 0.001054  loss: 3.0079 (2.9065)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2346 (1.1039)  time: 0.5236  data: 0.0004  max mem: 43713
Epoch: [203]  [1250/1251]  eta: 0:00:00  lr: 0.001053  min_lr: 0.001053  loss: 3.0047 (2.9068)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0588 (1.1007)  time: 0.4438  data: 0.0007  max mem: 43713
Epoch: [203] Total time: 0:10:59 (0.5274 s / it)
Averaged stats: lr: 0.001053  min_lr: 0.001053  loss: 3.0047 (2.9158)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0588 (1.1007)
Test:  [ 0/25]  eta: 0:02:11  loss: 0.5684 (0.5684)  acc1: 89.6000 (89.6000)  acc5: 99.2000 (99.2000)  time: 5.2635  data: 4.9669  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7357 (0.7533)  acc1: 84.8000 (85.1273)  acc5: 98.0000 (97.8546)  time: 0.7195  data: 0.4518  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9543 (0.8972)  acc1: 79.6000 (82.1333)  acc5: 96.0000 (96.4191)  time: 0.2651  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0044 (0.9093)  acc1: 79.6000 (81.7440)  acc5: 96.0000 (96.3520)  time: 0.2651  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4697 s / it)
* Acc@1 82.508 Acc@5 96.444 loss 0.894
Accuracy of the model on the 50000 test images: 82.5%
Max accuracy: 82.69%
Epoch: [204]  [   0/1251]  eta: 1:11:55  lr: 0.001053  min_lr: 0.001053  loss: 1.9556 (1.9556)  weight_decay: 0.0500 (0.0500)  time: 3.4498  data: 2.5219  max mem: 43713
Epoch: [204]  [ 200/1251]  eta: 0:09:27  lr: 0.001050  min_lr: 0.001050  loss: 3.1400 (2.9365)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9899 (1.1398)  time: 0.5240  data: 0.0005  max mem: 43713
Epoch: [204]  [ 400/1251]  eta: 0:07:32  lr: 0.001047  min_lr: 0.001047  loss: 2.8491 (2.9344)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8647 (1.1750)  time: 0.5302  data: 0.0005  max mem: 43713
Epoch: [204]  [ 600/1251]  eta: 0:05:45  lr: 0.001044  min_lr: 0.001044  loss: 2.8975 (2.9377)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9573 (1.1740)  time: 0.5304  data: 0.0005  max mem: 43713
Epoch: [204]  [ 800/1251]  eta: 0:03:58  lr: 0.001040  min_lr: 0.001040  loss: 2.8442 (2.9330)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1104 (1.1718)  time: 0.5228  data: 0.0005  max mem: 43713
Epoch: [204]  [1000/1251]  eta: 0:02:12  lr: 0.001037  min_lr: 0.001037  loss: 3.0661 (2.9471)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9669 (1.2052)  time: 0.5221  data: 0.0005  max mem: 43713
Epoch: [204]  [1200/1251]  eta: 0:00:26  lr: 0.001034  min_lr: 0.001034  loss: 2.9930 (2.9474)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3707 (1.2324)  time: 0.5234  data: 0.0005  max mem: 43713
Epoch: [204]  [1250/1251]  eta: 0:00:00  lr: 0.001033  min_lr: 0.001033  loss: 3.1548 (2.9468)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0811 (1.2369)  time: 0.4432  data: 0.0007  max mem: 43713
Epoch: [204] Total time: 0:10:58 (0.5262 s / it)
Averaged stats: lr: 0.001033  min_lr: 0.001033  loss: 3.1548 (2.9171)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0811 (1.2369)
Test:  [ 0/25]  eta: 0:01:51  loss: 0.6214 (0.6214)  acc1: 90.4000 (90.4000)  acc5: 98.8000 (98.8000)  time: 4.4726  data: 4.1530  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7760 (0.7748)  acc1: 85.2000 (84.9091)  acc5: 98.0000 (97.7455)  time: 0.6687  data: 0.3993  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9390 (0.9055)  acc1: 80.4000 (82.2667)  acc5: 95.6000 (96.4000)  time: 0.2762  data: 0.0120  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0187 (0.9169)  acc1: 80.0000 (81.9040)  acc5: 95.6000 (96.3360)  time: 0.2641  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4461 s / it)
* Acc@1 82.510 Acc@5 96.350 loss 0.900
Accuracy of the model on the 50000 test images: 82.5%
Max accuracy: 82.69%
Epoch: [205]  [   0/1251]  eta: 1:12:30  lr: 0.001033  min_lr: 0.001033  loss: 2.6023 (2.6023)  weight_decay: 0.0500 (0.0500)  time: 3.4776  data: 2.8393  max mem: 43713
Epoch: [205]  [ 200/1251]  eta: 0:09:25  lr: 0.001030  min_lr: 0.001030  loss: 3.0156 (2.9603)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2098 (1.1826)  time: 0.5248  data: 0.0004  max mem: 43713
Epoch: [205]  [ 400/1251]  eta: 0:07:33  lr: 0.001027  min_lr: 0.001027  loss: 3.0987 (2.9334)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0745 (1.1284)  time: 0.5308  data: 0.0005  max mem: 43713
Epoch: [205]  [ 600/1251]  eta: 0:05:45  lr: 0.001024  min_lr: 0.001024  loss: 2.7396 (2.9374)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9672 (1.1210)  time: 0.5224  data: 0.0005  max mem: 43713
Epoch: [205]  [ 800/1251]  eta: 0:03:58  lr: 0.001021  min_lr: 0.001021  loss: 3.0197 (2.9439)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1691 (1.1685)  time: 0.5233  data: 0.0005  max mem: 43713
Epoch: [205]  [1000/1251]  eta: 0:02:12  lr: 0.001018  min_lr: 0.001018  loss: 2.8956 (2.9309)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9033 (1.1652)  time: 0.5241  data: 0.0006  max mem: 43713
Epoch: [205]  [1200/1251]  eta: 0:00:26  lr: 0.001014  min_lr: 0.001014  loss: 2.8742 (2.9372)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0056 (1.1620)  time: 0.5256  data: 0.0006  max mem: 43713
Epoch: [205]  [1250/1251]  eta: 0:00:00  lr: 0.001014  min_lr: 0.001014  loss: 3.0671 (2.9383)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0032 (1.1559)  time: 0.4442  data: 0.0008  max mem: 43713
Epoch: [205] Total time: 0:10:59 (0.5271 s / it)
Averaged stats: lr: 0.001014  min_lr: 0.001014  loss: 3.0671 (2.9229)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0032 (1.1559)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.5917 (0.5917)  acc1: 89.6000 (89.6000)  acc5: 99.2000 (99.2000)  time: 5.7241  data: 5.4238  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7530 (0.7394)  acc1: 85.6000 (85.9273)  acc5: 98.4000 (97.8182)  time: 0.7617  data: 0.4934  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9095 (0.8742)  acc1: 80.8000 (82.8000)  acc5: 96.0000 (96.6095)  time: 0.2654  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9460 (0.8913)  acc1: 79.6000 (82.3360)  acc5: 95.6000 (96.4320)  time: 0.2653  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4887 s / it)
* Acc@1 82.768 Acc@5 96.534 loss 0.875
Accuracy of the model on the 50000 test images: 82.8%
Max accuracy: 82.77%
Epoch: [206]  [   0/1251]  eta: 0:58:25  lr: 0.001014  min_lr: 0.001014  loss: 2.8905 (2.8905)  weight_decay: 0.0500 (0.0500)  time: 2.8025  data: 2.2671  max mem: 43713
Epoch: [206]  [ 200/1251]  eta: 0:09:25  lr: 0.001011  min_lr: 0.001011  loss: 3.0190 (2.8749)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1846 (1.2541)  time: 0.5315  data: 0.0004  max mem: 43713
Epoch: [206]  [ 400/1251]  eta: 0:07:32  lr: 0.001007  min_lr: 0.001007  loss: 2.9088 (2.8798)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8274 (1.1784)  time: 0.5234  data: 0.0004  max mem: 43713
Epoch: [206]  [ 600/1251]  eta: 0:05:44  lr: 0.001004  min_lr: 0.001004  loss: 3.1616 (2.8962)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1985 (1.1630)  time: 0.5317  data: 0.0004  max mem: 43713
Epoch: [206]  [ 800/1251]  eta: 0:03:58  lr: 0.001001  min_lr: 0.001001  loss: 3.1198 (2.9013)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9523 (1.1800)  time: 0.5231  data: 0.0004  max mem: 43713
Epoch: [206]  [1000/1251]  eta: 0:02:12  lr: 0.000998  min_lr: 0.000998  loss: 2.8657 (2.9030)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8014 (1.1447)  time: 0.5239  data: 0.0004  max mem: 43713
Epoch: [206]  [1200/1251]  eta: 0:00:26  lr: 0.000995  min_lr: 0.000995  loss: 2.6584 (2.8949)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1662 (1.1486)  time: 0.5223  data: 0.0004  max mem: 43713
Epoch: [206]  [1250/1251]  eta: 0:00:00  lr: 0.000994  min_lr: 0.000994  loss: 2.9005 (2.8955)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9891 (1.1514)  time: 0.4482  data: 0.0006  max mem: 43713
Epoch: [206] Total time: 0:10:58 (0.5261 s / it)
Averaged stats: lr: 0.000994  min_lr: 0.000994  loss: 2.9005 (2.9035)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9891 (1.1514)
Test:  [ 0/25]  eta: 0:02:11  loss: 0.4991 (0.4991)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.2699  data: 4.9708  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7066 (0.6868)  acc1: 86.0000 (85.9636)  acc5: 98.8000 (98.1455)  time: 0.7197  data: 0.4522  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8469 (0.8146)  acc1: 82.4000 (82.7048)  acc5: 96.4000 (96.7810)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9072 (0.8341)  acc1: 80.8000 (82.2400)  acc5: 95.6000 (96.6720)  time: 0.2644  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4691 s / it)
* Acc@1 82.810 Acc@5 96.640 loss 0.817
Accuracy of the model on the 50000 test images: 82.8%
Max accuracy: 82.81%
Epoch: [207]  [   0/1251]  eta: 0:58:58  lr: 0.000994  min_lr: 0.000994  loss: 3.1223 (3.1223)  weight_decay: 0.0500 (0.0500)  time: 2.8284  data: 2.2931  max mem: 43713
Epoch: [207]  [ 200/1251]  eta: 0:09:23  lr: 0.000991  min_lr: 0.000991  loss: 3.1456 (2.8874)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9334 (1.1152)  time: 0.5222  data: 0.0004  max mem: 43713
Epoch: [207]  [ 400/1251]  eta: 0:07:30  lr: 0.000988  min_lr: 0.000988  loss: 3.0010 (2.9097)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1113 (1.1272)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [207]  [ 600/1251]  eta: 0:05:43  lr: 0.000985  min_lr: 0.000985  loss: 2.9443 (2.9281)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0063 (1.1052)  time: 0.5282  data: 0.0004  max mem: 43713
Epoch: [207]  [ 800/1251]  eta: 0:03:57  lr: 0.000982  min_lr: 0.000982  loss: 2.9823 (2.9141)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0338 (1.0837)  time: 0.5284  data: 0.0004  max mem: 43713
Epoch: [207]  [1000/1251]  eta: 0:02:12  lr: 0.000979  min_lr: 0.000979  loss: 3.1173 (2.9215)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0422 (1.1228)  time: 0.5222  data: 0.0004  max mem: 43713
Epoch: [207]  [1200/1251]  eta: 0:00:26  lr: 0.000976  min_lr: 0.000976  loss: 2.7649 (2.9221)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4818 (1.1804)  time: 0.5283  data: 0.0004  max mem: 43713
Epoch: [207]  [1250/1251]  eta: 0:00:00  lr: 0.000975  min_lr: 0.000975  loss: 3.0089 (2.9221)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0284 (1.1744)  time: 0.4432  data: 0.0005  max mem: 43713
Epoch: [207] Total time: 0:10:57 (0.5252 s / it)
Averaged stats: lr: 0.000975  min_lr: 0.000975  loss: 3.0089 (2.9060)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0284 (1.1744)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.5621 (0.5621)  acc1: 89.6000 (89.6000)  acc5: 99.2000 (99.2000)  time: 5.5697  data: 5.2650  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.6974 (0.7310)  acc1: 86.8000 (85.6000)  acc5: 98.0000 (97.8909)  time: 0.7467  data: 0.4789  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8868 (0.8642)  acc1: 82.0000 (82.8000)  acc5: 96.4000 (96.6667)  time: 0.2644  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9549 (0.8823)  acc1: 81.2000 (82.2080)  acc5: 95.6000 (96.5120)  time: 0.2644  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4823 s / it)
* Acc@1 82.850 Acc@5 96.490 loss 0.863
Accuracy of the model on the 50000 test images: 82.9%
Max accuracy: 82.85%
Epoch: [208]  [   0/1251]  eta: 0:49:08  lr: 0.000975  min_lr: 0.000975  loss: 3.4352 (3.4352)  weight_decay: 0.0500 (0.0500)  time: 2.3569  data: 1.8140  max mem: 43713
Epoch: [208]  [ 200/1251]  eta: 0:09:19  lr: 0.000972  min_lr: 0.000972  loss: 3.0459 (2.8771)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1102 (1.1448)  time: 0.5233  data: 0.0004  max mem: 43713
Epoch: [208]  [ 400/1251]  eta: 0:07:30  lr: 0.000969  min_lr: 0.000969  loss: 3.1075 (2.8919)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9534 (1.0600)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [208]  [ 600/1251]  eta: 0:05:43  lr: 0.000966  min_lr: 0.000966  loss: 2.9399 (2.8946)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2304 (1.1410)  time: 0.5232  data: 0.0005  max mem: 43713
Epoch: [208]  [ 800/1251]  eta: 0:03:57  lr: 0.000963  min_lr: 0.000963  loss: 3.0610 (2.8995)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1827 (1.1630)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [208]  [1000/1251]  eta: 0:02:12  lr: 0.000960  min_lr: 0.000960  loss: 3.0527 (2.8948)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0176 (1.1625)  time: 0.5237  data: 0.0005  max mem: 43713
Epoch: [208]  [1200/1251]  eta: 0:00:26  lr: 0.000956  min_lr: 0.000956  loss: 3.1716 (2.8985)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1984 (1.1785)  time: 0.5223  data: 0.0004  max mem: 43713
Epoch: [208]  [1250/1251]  eta: 0:00:00  lr: 0.000956  min_lr: 0.000956  loss: 2.8354 (2.9007)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9140 (1.1711)  time: 0.4435  data: 0.0007  max mem: 43713
Epoch: [208] Total time: 0:10:57 (0.5253 s / it)
Averaged stats: lr: 0.000956  min_lr: 0.000956  loss: 2.8354 (2.9077)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9140 (1.1711)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.6538 (0.6538)  acc1: 88.8000 (88.8000)  acc5: 99.6000 (99.6000)  time: 5.3479  data: 5.0515  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7885 (0.7980)  acc1: 86.0000 (85.4545)  acc5: 97.6000 (97.6727)  time: 0.7262  data: 0.4596  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9661 (0.9197)  acc1: 80.4000 (82.0381)  acc5: 96.0000 (96.2286)  time: 0.2638  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0135 (0.9367)  acc1: 80.4000 (81.6160)  acc5: 95.6000 (96.1760)  time: 0.2637  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4704 s / it)
* Acc@1 82.486 Acc@5 96.300 loss 0.926
Accuracy of the model on the 50000 test images: 82.5%
Max accuracy: 82.85%
Epoch: [209]  [   0/1251]  eta: 1:11:45  lr: 0.000956  min_lr: 0.000956  loss: 1.8652 (1.8652)  weight_decay: 0.0500 (0.0500)  time: 3.4415  data: 2.8153  max mem: 43713
Epoch: [209]  [ 200/1251]  eta: 0:09:26  lr: 0.000953  min_lr: 0.000953  loss: 2.9980 (2.8518)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9546 (1.1181)  time: 0.5234  data: 0.0004  max mem: 43713
Epoch: [209]  [ 400/1251]  eta: 0:07:34  lr: 0.000950  min_lr: 0.000950  loss: 2.7901 (2.8574)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0207 (1.1360)  time: 0.5233  data: 0.0005  max mem: 43713
Epoch: [209]  [ 600/1251]  eta: 0:05:45  lr: 0.000947  min_lr: 0.000947  loss: 3.1601 (2.8816)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9266 (1.1346)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [209]  [ 800/1251]  eta: 0:03:58  lr: 0.000944  min_lr: 0.000944  loss: 2.9995 (2.8932)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9424 (1.1702)  time: 0.5288  data: 0.0005  max mem: 43713
Epoch: [209]  [1000/1251]  eta: 0:02:12  lr: 0.000940  min_lr: 0.000940  loss: 3.1091 (2.8893)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0867 (1.1654)  time: 0.5247  data: 0.0005  max mem: 43713
Epoch: [209]  [1200/1251]  eta: 0:00:26  lr: 0.000937  min_lr: 0.000937  loss: 2.9088 (2.8850)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9161 (1.1511)  time: 0.5332  data: 0.0005  max mem: 43713
Epoch: [209]  [1250/1251]  eta: 0:00:00  lr: 0.000937  min_lr: 0.000937  loss: 3.0142 (2.8835)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9166 (1.1528)  time: 0.4436  data: 0.0007  max mem: 43713
Epoch: [209] Total time: 0:10:59 (0.5269 s / it)
Averaged stats: lr: 0.000937  min_lr: 0.000937  loss: 3.0142 (2.8905)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9166 (1.1528)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.5625 (0.5625)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 5.5876  data: 5.2884  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7577 (0.7240)  acc1: 86.8000 (86.6909)  acc5: 98.4000 (97.8545)  time: 0.7485  data: 0.4810  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8914 (0.8489)  acc1: 82.0000 (83.6571)  acc5: 96.4000 (96.6286)  time: 0.2647  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9239 (0.8649)  acc1: 81.2000 (83.1840)  acc5: 96.4000 (96.5280)  time: 0.2647  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4819 s / it)
* Acc@1 82.988 Acc@5 96.474 loss 0.861
Accuracy of the model on the 50000 test images: 83.0%
Max accuracy: 82.99%
Epoch: [210]  [   0/1251]  eta: 0:55:40  lr: 0.000937  min_lr: 0.000937  loss: 3.4042 (3.4042)  weight_decay: 0.0500 (0.0500)  time: 2.6702  data: 2.1473  max mem: 43713
Epoch: [210]  [ 200/1251]  eta: 0:09:25  lr: 0.000934  min_lr: 0.000934  loss: 3.0595 (2.9237)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1884 (1.4271)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [210]  [ 400/1251]  eta: 0:07:31  lr: 0.000931  min_lr: 0.000931  loss: 2.9777 (2.9334)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1549 (1.2723)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [210]  [ 600/1251]  eta: 0:05:44  lr: 0.000928  min_lr: 0.000928  loss: 2.8322 (2.9160)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9277 (1.1856)  time: 0.5228  data: 0.0005  max mem: 43713
Epoch: [210]  [ 800/1251]  eta: 0:03:58  lr: 0.000925  min_lr: 0.000925  loss: 2.9219 (2.8969)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2204 (1.1799)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [210]  [1000/1251]  eta: 0:02:12  lr: 0.000922  min_lr: 0.000922  loss: 3.0953 (2.9114)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3437 (1.1829)  time: 0.5245  data: 0.0005  max mem: 43713
Epoch: [210]  [1200/1251]  eta: 0:00:26  lr: 0.000918  min_lr: 0.000918  loss: 2.9875 (2.9087)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3755 (1.2449)  time: 0.5299  data: 0.0004  max mem: 43713
Epoch: [210]  [1250/1251]  eta: 0:00:00  lr: 0.000918  min_lr: 0.000918  loss: 3.0995 (2.9082)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0339 (1.2410)  time: 0.4434  data: 0.0006  max mem: 43713
Epoch: [210] Total time: 0:10:58 (0.5263 s / it)
Averaged stats: lr: 0.000918  min_lr: 0.000918  loss: 3.0995 (2.8997)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0339 (1.2410)
Test:  [ 0/25]  eta: 0:02:04  loss: 0.6283 (0.6283)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 4.9685  data: 4.6648  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7530 (0.7667)  acc1: 86.8000 (85.8545)  acc5: 98.4000 (98.0364)  time: 0.6921  data: 0.4244  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9545 (0.9011)  acc1: 81.2000 (82.7810)  acc5: 96.4000 (96.6476)  time: 0.2646  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9953 (0.9180)  acc1: 80.4000 (82.2880)  acc5: 96.0000 (96.5760)  time: 0.2646  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4583 s / it)
* Acc@1 82.762 Acc@5 96.502 loss 0.909
Accuracy of the model on the 50000 test images: 82.8%
Max accuracy: 82.99%
Epoch: [211]  [   0/1251]  eta: 1:18:03  lr: 0.000918  min_lr: 0.000918  loss: 3.3124 (3.3124)  weight_decay: 0.0500 (0.0500)  time: 3.7438  data: 3.1953  max mem: 43713
Epoch: [211]  [ 200/1251]  eta: 0:09:27  lr: 0.000915  min_lr: 0.000915  loss: 2.8061 (2.9092)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8466 (1.0207)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [211]  [ 400/1251]  eta: 0:07:32  lr: 0.000912  min_lr: 0.000912  loss: 3.1408 (2.9183)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8494 (0.9703)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [211]  [ 600/1251]  eta: 0:05:45  lr: 0.000909  min_lr: 0.000909  loss: 2.6994 (2.8941)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0434 (1.0808)  time: 0.5232  data: 0.0005  max mem: 43713
Epoch: [211]  [ 800/1251]  eta: 0:03:58  lr: 0.000906  min_lr: 0.000906  loss: 2.5778 (2.8760)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0223 (1.1086)  time: 0.5244  data: 0.0004  max mem: 43713
Epoch: [211]  [1000/1251]  eta: 0:02:12  lr: 0.000903  min_lr: 0.000903  loss: 2.8162 (2.8749)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8573 (1.0976)  time: 0.5240  data: 0.0004  max mem: 43713
Epoch: [211]  [1200/1251]  eta: 0:00:26  lr: 0.000900  min_lr: 0.000900  loss: 2.7005 (2.8780)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9564 (1.1146)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [211]  [1250/1251]  eta: 0:00:00  lr: 0.000899  min_lr: 0.000899  loss: 3.1593 (2.8801)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8118 (1.1090)  time: 0.4436  data: 0.0005  max mem: 43713
Epoch: [211] Total time: 0:10:58 (0.5267 s / it)
Averaged stats: lr: 0.000899  min_lr: 0.000899  loss: 3.1593 (2.8857)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8118 (1.1090)
Test:  [ 0/25]  eta: 0:01:49  loss: 0.6428 (0.6428)  acc1: 90.0000 (90.0000)  acc5: 99.6000 (99.6000)  time: 4.3937  data: 4.0676  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.8219 (0.8056)  acc1: 87.2000 (86.0364)  acc5: 98.0000 (97.9636)  time: 0.7108  data: 0.4410  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9851 (0.9301)  acc1: 81.2000 (83.2191)  acc5: 96.0000 (96.6667)  time: 0.3034  data: 0.0392  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0094 (0.9457)  acc1: 80.8000 (82.8960)  acc5: 95.6000 (96.5600)  time: 0.2644  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4641 s / it)
* Acc@1 83.030 Acc@5 96.494 loss 0.937
Accuracy of the model on the 50000 test images: 83.0%
Max accuracy: 83.03%
Epoch: [212]  [   0/1251]  eta: 1:00:38  lr: 0.000899  min_lr: 0.000899  loss: 2.9857 (2.9857)  weight_decay: 0.0500 (0.0500)  time: 2.9081  data: 2.3829  max mem: 43713
Epoch: [212]  [ 200/1251]  eta: 0:09:25  lr: 0.000896  min_lr: 0.000896  loss: 2.9972 (2.8748)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9369 (1.1325)  time: 0.5254  data: 0.0005  max mem: 43713
Epoch: [212]  [ 400/1251]  eta: 0:07:34  lr: 0.000893  min_lr: 0.000893  loss: 3.0179 (2.8634)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9896 (1.1842)  time: 0.5233  data: 0.0005  max mem: 43713
Epoch: [212]  [ 600/1251]  eta: 0:05:45  lr: 0.000890  min_lr: 0.000890  loss: 2.6936 (2.8442)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2044 (1.2192)  time: 0.5238  data: 0.0005  max mem: 43713
Epoch: [212]  [ 800/1251]  eta: 0:03:58  lr: 0.000887  min_lr: 0.000887  loss: 3.1149 (2.8544)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9606 (1.1718)  time: 0.5416  data: 0.0005  max mem: 43713
Epoch: [212]  [1000/1251]  eta: 0:02:12  lr: 0.000884  min_lr: 0.000884  loss: 3.1734 (2.8488)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8469 (1.1647)  time: 0.5237  data: 0.0005  max mem: 43713
Epoch: [212]  [1200/1251]  eta: 0:00:26  lr: 0.000881  min_lr: 0.000881  loss: 2.9221 (2.8446)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9277 (1.1383)  time: 0.5237  data: 0.0005  max mem: 43713
Epoch: [212]  [1250/1251]  eta: 0:00:00  lr: 0.000880  min_lr: 0.000880  loss: 2.9976 (2.8475)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9872 (1.1500)  time: 0.4441  data: 0.0006  max mem: 43713
Epoch: [212] Total time: 0:10:59 (0.5273 s / it)
Averaged stats: lr: 0.000880  min_lr: 0.000880  loss: 2.9976 (2.8669)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9872 (1.1500)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.6014 (0.6014)  acc1: 89.6000 (89.6000)  acc5: 99.6000 (99.6000)  time: 5.6547  data: 5.3543  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7578 (0.7560)  acc1: 87.2000 (86.3273)  acc5: 98.4000 (97.9636)  time: 0.7544  data: 0.4871  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9149 (0.8913)  acc1: 81.6000 (83.1048)  acc5: 96.4000 (96.8762)  time: 0.2642  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0117 (0.9074)  acc1: 81.2000 (82.6400)  acc5: 96.0000 (96.7360)  time: 0.2642  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4842 s / it)
* Acc@1 83.012 Acc@5 96.608 loss 0.901
Accuracy of the model on the 50000 test images: 83.0%
Max accuracy: 83.03%
Epoch: [213]  [   0/1251]  eta: 1:14:09  lr: 0.000880  min_lr: 0.000880  loss: 3.0055 (3.0055)  weight_decay: 0.0500 (0.0500)  time: 3.5568  data: 2.7219  max mem: 43713
Epoch: [213]  [ 200/1251]  eta: 0:09:30  lr: 0.000877  min_lr: 0.000877  loss: 2.8425 (2.8505)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3315 (1.3845)  time: 0.5306  data: 0.0005  max mem: 43713
Epoch: [213]  [ 400/1251]  eta: 0:07:34  lr: 0.000874  min_lr: 0.000874  loss: 2.8679 (2.8471)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1706 (1.2954)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [213]  [ 600/1251]  eta: 0:05:45  lr: 0.000871  min_lr: 0.000871  loss: 3.0218 (2.8519)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0064 (1.2699)  time: 0.5226  data: 0.0005  max mem: 43713
Epoch: [213]  [ 800/1251]  eta: 0:03:58  lr: 0.000868  min_lr: 0.000868  loss: 2.9407 (2.8510)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2222 (1.2463)  time: 0.5248  data: 0.0005  max mem: 43713
Epoch: [213]  [1000/1251]  eta: 0:02:12  lr: 0.000865  min_lr: 0.000865  loss: 2.8588 (2.8602)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8915 (1.2199)  time: 0.5233  data: 0.0005  max mem: 43713
Epoch: [213]  [1200/1251]  eta: 0:00:26  lr: 0.000863  min_lr: 0.000863  loss: 2.9853 (2.8543)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5157 (nan)  time: 0.5223  data: 0.0005  max mem: 43713
Epoch: [213]  [1250/1251]  eta: 0:00:00  lr: 0.000862  min_lr: 0.000862  loss: 3.0479 (2.8601)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5379 (nan)  time: 0.4474  data: 0.0007  max mem: 43713
Epoch: [213] Total time: 0:10:58 (0.5267 s / it)
Averaged stats: lr: 0.000862  min_lr: 0.000862  loss: 3.0479 (2.8714)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5379 (nan)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6589 (0.6589)  acc1: 90.8000 (90.8000)  acc5: 98.8000 (98.8000)  time: 5.5634  data: 5.2623  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8204 (0.8076)  acc1: 85.2000 (85.5636)  acc5: 98.4000 (97.9273)  time: 0.7463  data: 0.4787  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9859 (0.9397)  acc1: 81.6000 (82.6095)  acc5: 96.0000 (96.6095)  time: 0.2646  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0495 (0.9613)  acc1: 80.8000 (82.0320)  acc5: 96.0000 (96.4800)  time: 0.2647  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4811 s / it)
* Acc@1 82.654 Acc@5 96.446 loss 0.945
Accuracy of the model on the 50000 test images: 82.7%
Max accuracy: 83.03%
Epoch: [214]  [   0/1251]  eta: 1:13:03  lr: 0.000862  min_lr: 0.000862  loss: 2.8222 (2.8222)  weight_decay: 0.0500 (0.0500)  time: 3.5039  data: 2.4102  max mem: 43713
Epoch: [214]  [ 200/1251]  eta: 0:09:28  lr: 0.000859  min_lr: 0.000859  loss: 2.9303 (2.8682)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9877 (1.2439)  time: 0.5231  data: 0.0004  max mem: 43713
Epoch: [214]  [ 400/1251]  eta: 0:07:33  lr: 0.000856  min_lr: 0.000856  loss: 3.0525 (2.8589)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9029 (1.1390)  time: 0.5302  data: 0.0004  max mem: 43713
Epoch: [214]  [ 600/1251]  eta: 0:05:45  lr: 0.000853  min_lr: 0.000853  loss: 2.9832 (2.8611)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1353 (1.1224)  time: 0.5290  data: 0.0004  max mem: 43713
Epoch: [214]  [ 800/1251]  eta: 0:03:58  lr: 0.000850  min_lr: 0.000850  loss: 2.9495 (2.8687)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2820 (1.1296)  time: 0.5221  data: 0.0005  max mem: 43713
Epoch: [214]  [1000/1251]  eta: 0:02:12  lr: 0.000847  min_lr: 0.000847  loss: 2.8302 (2.8685)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2060 (1.1235)  time: 0.5284  data: 0.0004  max mem: 43713
Epoch: [214]  [1200/1251]  eta: 0:00:26  lr: 0.000844  min_lr: 0.000844  loss: 2.9805 (2.8720)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1641 (1.1675)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [214]  [1250/1251]  eta: 0:00:00  lr: 0.000844  min_lr: 0.000844  loss: 3.0611 (2.8755)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9772 (1.1614)  time: 0.4433  data: 0.0005  max mem: 43713
Epoch: [214] Total time: 0:10:59 (0.5269 s / it)
Averaged stats: lr: 0.000844  min_lr: 0.000844  loss: 3.0611 (2.8641)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9772 (1.1614)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6816 (0.6816)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 5.5555  data: 5.2512  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8643 (0.8498)  acc1: 86.8000 (86.1455)  acc5: 98.0000 (97.8182)  time: 0.7457  data: 0.4777  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0301 (0.9692)  acc1: 81.6000 (83.0857)  acc5: 96.0000 (96.5333)  time: 0.2647  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0542 (0.9864)  acc1: 80.8000 (82.4640)  acc5: 96.0000 (96.4480)  time: 0.2648  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4820 s / it)
* Acc@1 82.850 Acc@5 96.526 loss 0.980
Accuracy of the model on the 50000 test images: 82.9%
Max accuracy: 83.03%
Epoch: [215]  [   0/1251]  eta: 1:17:49  lr: 0.000843  min_lr: 0.000843  loss: 3.3536 (3.3536)  weight_decay: 0.0500 (0.0500)  time: 3.7324  data: 2.3663  max mem: 43713
Epoch: [215]  [ 200/1251]  eta: 0:09:27  lr: 0.000841  min_lr: 0.000841  loss: 2.9695 (2.8469)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9456 (1.0923)  time: 0.5245  data: 0.0004  max mem: 43713
Epoch: [215]  [ 400/1251]  eta: 0:07:34  lr: 0.000838  min_lr: 0.000838  loss: 2.9720 (2.8698)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0233 (1.1067)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [215]  [ 600/1251]  eta: 0:05:45  lr: 0.000835  min_lr: 0.000835  loss: 2.9738 (2.8675)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9661 (1.1181)  time: 0.5236  data: 0.0005  max mem: 43713
Epoch: [215]  [ 800/1251]  eta: 0:03:58  lr: 0.000832  min_lr: 0.000832  loss: 2.8399 (2.8720)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9288 (1.1236)  time: 0.5243  data: 0.0005  max mem: 43713
Epoch: [215]  [1000/1251]  eta: 0:02:12  lr: 0.000829  min_lr: 0.000829  loss: 2.9065 (2.8720)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9811 (1.1408)  time: 0.5222  data: 0.0005  max mem: 43713
Epoch: [215]  [1200/1251]  eta: 0:00:26  lr: 0.000826  min_lr: 0.000826  loss: 3.0754 (2.8774)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9643 (1.1484)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [215]  [1250/1251]  eta: 0:00:00  lr: 0.000825  min_lr: 0.000825  loss: 2.8993 (2.8768)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0553 (1.1496)  time: 0.4434  data: 0.0006  max mem: 43713
Epoch: [215] Total time: 0:10:59 (0.5270 s / it)
Averaged stats: lr: 0.000825  min_lr: 0.000825  loss: 2.8993 (2.8630)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0553 (1.1496)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.5820 (0.5820)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 5.4912  data: 5.2030  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7434 (0.7230)  acc1: 86.8000 (86.2545)  acc5: 97.6000 (97.6727)  time: 0.7400  data: 0.4732  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8937 (0.8389)  acc1: 81.6000 (83.0095)  acc5: 96.4000 (96.5333)  time: 0.2648  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9184 (0.8514)  acc1: 80.4000 (82.4640)  acc5: 96.0000 (96.4960)  time: 0.2648  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4790 s / it)
* Acc@1 83.092 Acc@5 96.592 loss 0.841
Accuracy of the model on the 50000 test images: 83.1%
Max accuracy: 83.09%
Epoch: [216]  [   0/1251]  eta: 1:03:41  lr: 0.000825  min_lr: 0.000825  loss: 2.6503 (2.6503)  weight_decay: 0.0500 (0.0500)  time: 3.0546  data: 2.5105  max mem: 43713
Epoch: [216]  [ 200/1251]  eta: 0:09:28  lr: 0.000822  min_lr: 0.000822  loss: 2.8540 (2.8465)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0112 (1.3532)  time: 0.5319  data: 0.0005  max mem: 43713
Epoch: [216]  [ 400/1251]  eta: 0:07:33  lr: 0.000819  min_lr: 0.000819  loss: 2.8859 (2.8528)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5951 (1.4338)  time: 0.5237  data: 0.0005  max mem: 43713
Epoch: [216]  [ 600/1251]  eta: 0:05:44  lr: 0.000817  min_lr: 0.000817  loss: 2.9520 (2.8536)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9975 (1.3509)  time: 0.5316  data: 0.0005  max mem: 43713
Epoch: [216]  [ 800/1251]  eta: 0:03:58  lr: 0.000814  min_lr: 0.000814  loss: 2.5780 (2.8404)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1547 (1.3520)  time: 0.5284  data: 0.0005  max mem: 43713
Epoch: [216]  [1000/1251]  eta: 0:02:12  lr: 0.000811  min_lr: 0.000811  loss: 2.9703 (2.8388)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0230 (1.2805)  time: 0.5242  data: 0.0005  max mem: 43713
Epoch: [216]  [1200/1251]  eta: 0:00:26  lr: 0.000808  min_lr: 0.000808  loss: 2.9459 (2.8381)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3935 (1.2755)  time: 0.5233  data: 0.0005  max mem: 43713
Epoch: [216]  [1250/1251]  eta: 0:00:00  lr: 0.000807  min_lr: 0.000807  loss: 2.7815 (2.8334)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1891 (1.2719)  time: 0.4438  data: 0.0006  max mem: 43713
Epoch: [216] Total time: 0:10:58 (0.5266 s / it)
Averaged stats: lr: 0.000807  min_lr: 0.000807  loss: 2.7815 (2.8545)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1891 (1.2719)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.5136 (0.5136)  acc1: 91.2000 (91.2000)  acc5: 98.8000 (98.8000)  time: 5.6202  data: 5.3145  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7246 (0.6815)  acc1: 86.4000 (86.3273)  acc5: 98.0000 (97.8909)  time: 0.7515  data: 0.4834  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8428 (0.7923)  acc1: 81.6000 (83.3524)  acc5: 96.4000 (96.8191)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.8808 (0.8094)  acc1: 81.2000 (82.8800)  acc5: 96.4000 (96.7200)  time: 0.2644  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4836 s / it)
* Acc@1 83.164 Acc@5 96.622 loss 0.807
Accuracy of the model on the 50000 test images: 83.2%
Max accuracy: 83.16%
Epoch: [217]  [   0/1251]  eta: 0:49:00  lr: 0.000807  min_lr: 0.000807  loss: 2.7310 (2.7310)  weight_decay: 0.0500 (0.0500)  time: 2.3504  data: 1.8043  max mem: 43713
Epoch: [217]  [ 200/1251]  eta: 0:09:23  lr: 0.000804  min_lr: 0.000804  loss: 2.7102 (2.8046)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1271 (1.1978)  time: 0.5236  data: 0.0004  max mem: 43713
Epoch: [217]  [ 400/1251]  eta: 0:07:30  lr: 0.000801  min_lr: 0.000801  loss: 2.7624 (2.8235)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9905 (1.1586)  time: 0.5243  data: 0.0005  max mem: 43713
Epoch: [217]  [ 600/1251]  eta: 0:05:44  lr: 0.000799  min_lr: 0.000799  loss: 2.8581 (2.8403)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1150 (1.1583)  time: 0.5319  data: 0.0004  max mem: 43713
Epoch: [217]  [ 800/1251]  eta: 0:03:57  lr: 0.000796  min_lr: 0.000796  loss: 2.9195 (2.8396)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1292 (1.1580)  time: 0.5240  data: 0.0005  max mem: 43713
Epoch: [217]  [1000/1251]  eta: 0:02:12  lr: 0.000793  min_lr: 0.000793  loss: 3.0963 (2.8418)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5370 (1.1954)  time: 0.5226  data: 0.0005  max mem: 43713
Epoch: [217]  [1200/1251]  eta: 0:00:26  lr: 0.000790  min_lr: 0.000790  loss: 2.8791 (2.8434)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1977 (1.2260)  time: 0.5325  data: 0.0005  max mem: 43713
Epoch: [217]  [1250/1251]  eta: 0:00:00  lr: 0.000789  min_lr: 0.000789  loss: 3.0359 (2.8450)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4262 (1.2330)  time: 0.4435  data: 0.0005  max mem: 43713
Epoch: [217] Total time: 0:10:58 (0.5264 s / it)
Averaged stats: lr: 0.000789  min_lr: 0.000789  loss: 3.0359 (2.8490)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4262 (1.2330)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6108 (0.6108)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 5.5378  data: 5.2381  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7453 (0.7458)  acc1: 86.4000 (86.1818)  acc5: 98.4000 (98.0727)  time: 0.7442  data: 0.4765  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9348 (0.8641)  acc1: 82.0000 (83.1810)  acc5: 96.4000 (96.8762)  time: 0.2648  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9422 (0.8799)  acc1: 81.2000 (82.8160)  acc5: 96.0000 (96.8160)  time: 0.2648  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4802 s / it)
* Acc@1 83.062 Acc@5 96.552 loss 0.874
Accuracy of the model on the 50000 test images: 83.1%
Max accuracy: 83.16%
Epoch: [218]  [   0/1251]  eta: 1:12:16  lr: 0.000789  min_lr: 0.000789  loss: 3.1809 (3.1809)  weight_decay: 0.0500 (0.0500)  time: 3.4665  data: 2.7485  max mem: 43713
Epoch: [218]  [ 200/1251]  eta: 0:09:26  lr: 0.000786  min_lr: 0.000786  loss: 2.8207 (2.8480)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1874 (1.2379)  time: 0.5244  data: 0.0005  max mem: 43713
Epoch: [218]  [ 400/1251]  eta: 0:07:33  lr: 0.000784  min_lr: 0.000784  loss: 2.8656 (2.8241)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2154 (1.2008)  time: 0.5223  data: 0.0004  max mem: 43713
Epoch: [218]  [ 600/1251]  eta: 0:05:45  lr: 0.000781  min_lr: 0.000781  loss: 2.8971 (2.8013)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1972 (1.2105)  time: 0.5250  data: 0.0005  max mem: 43713
Epoch: [218]  [ 800/1251]  eta: 0:03:58  lr: 0.000778  min_lr: 0.000778  loss: 2.8197 (2.8183)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0670 (1.1981)  time: 0.5252  data: 0.0006  max mem: 43713
Epoch: [218]  [1000/1251]  eta: 0:02:12  lr: 0.000775  min_lr: 0.000775  loss: 3.0544 (2.8273)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9509 (1.2037)  time: 0.5322  data: 0.0005  max mem: 43713
Epoch: [218]  [1200/1251]  eta: 0:00:26  lr: 0.000772  min_lr: 0.000772  loss: 3.0018 (2.8342)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4793 (1.2366)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [218]  [1250/1251]  eta: 0:00:00  lr: 0.000772  min_lr: 0.000772  loss: 3.0848 (2.8328)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2107 (1.2365)  time: 0.4436  data: 0.0006  max mem: 43713
Epoch: [218] Total time: 0:10:59 (0.5274 s / it)
Averaged stats: lr: 0.000772  min_lr: 0.000772  loss: 3.0848 (2.8441)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2107 (1.2365)
Test:  [ 0/25]  eta: 0:02:09  loss: 0.6217 (0.6217)  acc1: 91.2000 (91.2000)  acc5: 98.8000 (98.8000)  time: 5.1867  data: 4.8831  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7850 (0.7797)  acc1: 86.4000 (86.6909)  acc5: 98.4000 (97.6727)  time: 0.7125  data: 0.4442  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9615 (0.8969)  acc1: 81.2000 (83.6571)  acc5: 96.4000 (96.6286)  time: 0.2650  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9652 (0.9158)  acc1: 81.2000 (83.1040)  acc5: 96.4000 (96.5760)  time: 0.2650  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4660 s / it)
* Acc@1 83.074 Acc@5 96.514 loss 0.915
Accuracy of the model on the 50000 test images: 83.1%
Max accuracy: 83.16%
Epoch: [219]  [   0/1251]  eta: 1:15:22  lr: 0.000771  min_lr: 0.000771  loss: 2.5437 (2.5437)  weight_decay: 0.0500 (0.0500)  time: 3.6148  data: 1.6926  max mem: 43713
Epoch: [219]  [ 200/1251]  eta: 0:09:28  lr: 0.000769  min_lr: 0.000769  loss: 2.9457 (2.8317)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1851 (1.1432)  time: 0.5401  data: 0.0004  max mem: 43713
Epoch: [219]  [ 400/1251]  eta: 0:07:33  lr: 0.000766  min_lr: 0.000766  loss: 2.9191 (2.8370)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9090 (1.0968)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [219]  [ 600/1251]  eta: 0:05:45  lr: 0.000763  min_lr: 0.000763  loss: 2.8915 (2.8296)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0731 (1.0943)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [219]  [ 800/1251]  eta: 0:03:58  lr: 0.000760  min_lr: 0.000760  loss: 2.8709 (2.8325)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1417 (1.2775)  time: 0.5296  data: 0.0005  max mem: 43713
Epoch: [219]  [1000/1251]  eta: 0:02:12  lr: 0.000757  min_lr: 0.000757  loss: 2.9350 (2.8375)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1177 (1.2818)  time: 0.5235  data: 0.0005  max mem: 43713
Epoch: [219]  [1200/1251]  eta: 0:00:26  lr: 0.000755  min_lr: 0.000755  loss: 2.9623 (2.8382)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1799 (1.2850)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [219]  [1250/1251]  eta: 0:00:00  lr: 0.000754  min_lr: 0.000754  loss: 2.8358 (2.8418)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6966 (1.3245)  time: 0.4435  data: 0.0006  max mem: 43713
Epoch: [219] Total time: 0:10:58 (0.5267 s / it)
Averaged stats: lr: 0.000754  min_lr: 0.000754  loss: 2.8358 (2.8387)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6966 (1.3245)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.5425 (0.5425)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 5.8522  data: 5.5633  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7088 (0.7059)  acc1: 85.2000 (85.3091)  acc5: 98.4000 (97.7455)  time: 0.7726  data: 0.5060  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8715 (0.8187)  acc1: 81.2000 (82.9524)  acc5: 96.0000 (96.7619)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.8789 (0.8348)  acc1: 81.2000 (82.5760)  acc5: 96.0000 (96.6720)  time: 0.2644  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4925 s / it)
* Acc@1 83.146 Acc@5 96.524 loss 0.821
Accuracy of the model on the 50000 test images: 83.1%
Max accuracy: 83.16%
Epoch: [220]  [   0/1251]  eta: 1:18:58  lr: 0.000754  min_lr: 0.000754  loss: 2.7764 (2.7764)  weight_decay: 0.0500 (0.0500)  time: 3.7880  data: 2.5633  max mem: 43713
Epoch: [220]  [ 200/1251]  eta: 0:09:32  lr: 0.000751  min_lr: 0.000751  loss: 3.1170 (2.8176)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9633 (1.1539)  time: 0.5301  data: 0.0004  max mem: 43713
Epoch: [220]  [ 400/1251]  eta: 0:07:34  lr: 0.000748  min_lr: 0.000748  loss: 2.8918 (2.8324)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9998 (1.1671)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [220]  [ 600/1251]  eta: 0:05:45  lr: 0.000745  min_lr: 0.000745  loss: 2.8888 (2.8225)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9825 (1.1806)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [220]  [ 800/1251]  eta: 0:03:58  lr: 0.000743  min_lr: 0.000743  loss: 2.9939 (2.8257)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1087 (1.1894)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [220]  [1000/1251]  eta: 0:02:12  lr: 0.000740  min_lr: 0.000740  loss: 2.8198 (2.8329)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4534 (1.2611)  time: 0.5222  data: 0.0004  max mem: 43713
Epoch: [220]  [1200/1251]  eta: 0:00:26  lr: 0.000737  min_lr: 0.000737  loss: 3.0044 (2.8325)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4117 (1.2753)  time: 0.5330  data: 0.0005  max mem: 43713
Epoch: [220]  [1250/1251]  eta: 0:00:00  lr: 0.000736  min_lr: 0.000736  loss: 2.8433 (2.8324)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3422 (1.2783)  time: 0.4500  data: 0.0007  max mem: 43713
Epoch: [220] Total time: 0:10:59 (0.5274 s / it)
Averaged stats: lr: 0.000736  min_lr: 0.000736  loss: 2.8433 (2.8427)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3422 (1.2783)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.5668 (0.5668)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 5.5468  data: 5.2430  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7078 (0.7304)  acc1: 86.8000 (86.5818)  acc5: 98.0000 (97.7818)  time: 0.7448  data: 0.4769  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8796 (0.8466)  acc1: 82.0000 (83.3143)  acc5: 96.4000 (96.5714)  time: 0.2646  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9363 (0.8623)  acc1: 80.4000 (82.8160)  acc5: 96.0000 (96.4960)  time: 0.2646  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4806 s / it)
* Acc@1 83.166 Acc@5 96.484 loss 0.855
Accuracy of the model on the 50000 test images: 83.2%
Max accuracy: 83.17%
Epoch: [221]  [   0/1251]  eta: 0:55:30  lr: 0.000736  min_lr: 0.000736  loss: 3.0962 (3.0962)  weight_decay: 0.0500 (0.0500)  time: 2.6624  data: 2.1327  max mem: 43713
Epoch: [221]  [ 200/1251]  eta: 0:09:21  lr: 0.000734  min_lr: 0.000734  loss: 2.7203 (2.8545)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1212 (1.0900)  time: 0.5233  data: 0.0005  max mem: 43713
Epoch: [221]  [ 400/1251]  eta: 0:07:30  lr: 0.000731  min_lr: 0.000731  loss: 3.0538 (2.8609)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9914 (1.1995)  time: 0.5307  data: 0.0004  max mem: 43713
Epoch: [221]  [ 600/1251]  eta: 0:05:44  lr: 0.000728  min_lr: 0.000728  loss: 2.8014 (2.8356)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0947 (1.1610)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [221]  [ 800/1251]  eta: 0:03:57  lr: 0.000725  min_lr: 0.000725  loss: 2.9480 (2.8362)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1063 (1.1839)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [221]  [1000/1251]  eta: 0:02:12  lr: 0.000722  min_lr: 0.000722  loss: 2.9814 (2.8413)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3307 (1.2147)  time: 0.5285  data: 0.0004  max mem: 43713
Epoch: [221]  [1200/1251]  eta: 0:00:26  lr: 0.000720  min_lr: 0.000720  loss: 2.9645 (2.8349)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1750 (1.2370)  time: 0.5223  data: 0.0004  max mem: 43713
Epoch: [221]  [1250/1251]  eta: 0:00:00  lr: 0.000719  min_lr: 0.000719  loss: 2.9027 (2.8360)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4659 (1.2503)  time: 0.4435  data: 0.0006  max mem: 43713
Epoch: [221] Total time: 0:10:57 (0.5257 s / it)
Averaged stats: lr: 0.000719  min_lr: 0.000719  loss: 2.9027 (2.8374)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4659 (1.2503)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.5741 (0.5741)  acc1: 88.8000 (88.8000)  acc5: 99.6000 (99.6000)  time: 5.4376  data: 5.1194  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7205 (0.7241)  acc1: 86.8000 (86.0727)  acc5: 97.6000 (97.7818)  time: 0.7349  data: 0.4657  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8943 (0.8466)  acc1: 82.0000 (83.3143)  acc5: 96.4000 (96.7429)  time: 0.2647  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9263 (0.8634)  acc1: 80.8000 (82.6880)  acc5: 96.0000 (96.7200)  time: 0.2647  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4763 s / it)
* Acc@1 83.218 Acc@5 96.600 loss 0.855
Accuracy of the model on the 50000 test images: 83.2%
Max accuracy: 83.22%
Epoch: [222]  [   0/1251]  eta: 1:03:45  lr: 0.000719  min_lr: 0.000719  loss: 3.0143 (3.0143)  weight_decay: 0.0500 (0.0500)  time: 3.0579  data: 2.5212  max mem: 43713
Epoch: [222]  [ 200/1251]  eta: 0:09:24  lr: 0.000716  min_lr: 0.000716  loss: 2.9506 (2.8381)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8592 (0.9547)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [222]  [ 400/1251]  eta: 0:07:32  lr: 0.000714  min_lr: 0.000714  loss: 2.8226 (2.7968)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0601 (1.1005)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [222]  [ 600/1251]  eta: 0:05:44  lr: 0.000711  min_lr: 0.000711  loss: 2.6748 (2.7968)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0736 (1.0895)  time: 0.5222  data: 0.0004  max mem: 43713
Epoch: [222]  [ 800/1251]  eta: 0:03:58  lr: 0.000708  min_lr: 0.000708  loss: 3.0540 (2.8002)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1029 (1.0942)  time: 0.5290  data: 0.0005  max mem: 43713
Epoch: [222]  [1000/1251]  eta: 0:02:12  lr: 0.000705  min_lr: 0.000705  loss: 2.8847 (2.8006)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1202 (1.1166)  time: 0.5248  data: 0.0006  max mem: 43713
Epoch: [222]  [1200/1251]  eta: 0:00:26  lr: 0.000703  min_lr: 0.000703  loss: 2.7339 (2.7972)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3529 (1.1494)  time: 0.5242  data: 0.0005  max mem: 43713
Epoch: [222]  [1250/1251]  eta: 0:00:00  lr: 0.000702  min_lr: 0.000702  loss: 2.8579 (2.7973)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.4431  data: 0.0006  max mem: 43713
Epoch: [222] Total time: 0:10:58 (0.5266 s / it)
Averaged stats: lr: 0.000702  min_lr: 0.000702  loss: 2.8579 (2.8182)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)
Test:  [ 0/25]  eta: 0:01:53  loss: 0.5326 (0.5326)  acc1: 90.0000 (90.0000)  acc5: 99.6000 (99.6000)  time: 4.5234  data: 4.2144  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7157 (0.7026)  acc1: 86.4000 (86.1455)  acc5: 98.4000 (98.0727)  time: 0.7033  data: 0.4351  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8698 (0.8155)  acc1: 82.4000 (83.6571)  acc5: 96.8000 (96.9143)  time: 0.2928  data: 0.0286  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.8802 (0.8306)  acc1: 82.0000 (83.1200)  acc5: 96.4000 (96.8480)  time: 0.2643  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4608 s / it)
* Acc@1 83.354 Acc@5 96.686 loss 0.826
Accuracy of the model on the 50000 test images: 83.4%
Max accuracy: 83.35%
Epoch: [223]  [   0/1251]  eta: 0:57:31  lr: 0.000702  min_lr: 0.000702  loss: 2.9495 (2.9495)  weight_decay: 0.0500 (0.0500)  time: 2.7592  data: 2.2323  max mem: 43713
Epoch: [223]  [ 200/1251]  eta: 0:09:26  lr: 0.000699  min_lr: 0.000699  loss: 2.7248 (2.7523)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2713 (1.2268)  time: 0.5309  data: 0.0005  max mem: 43713
Epoch: [223]  [ 400/1251]  eta: 0:07:32  lr: 0.000696  min_lr: 0.000696  loss: 3.0908 (2.7955)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1092 (1.1907)  time: 0.5232  data: 0.0004  max mem: 43713
Epoch: [223]  [ 600/1251]  eta: 0:05:44  lr: 0.000694  min_lr: 0.000694  loss: 2.8521 (2.7870)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2422 (1.2263)  time: 0.5233  data: 0.0005  max mem: 43713
Epoch: [223]  [ 800/1251]  eta: 0:03:58  lr: 0.000691  min_lr: 0.000691  loss: 2.8698 (2.7927)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3636 (1.2852)  time: 0.5232  data: 0.0005  max mem: 43713
Epoch: [223]  [1000/1251]  eta: 0:02:12  lr: 0.000688  min_lr: 0.000688  loss: 2.9819 (2.7959)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1313 (1.2723)  time: 0.5286  data: 0.0005  max mem: 43713
Epoch: [223]  [1200/1251]  eta: 0:00:26  lr: 0.000686  min_lr: 0.000686  loss: 2.8653 (2.8128)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0531 (1.2523)  time: 0.5230  data: 0.0005  max mem: 43713
Epoch: [223]  [1250/1251]  eta: 0:00:00  lr: 0.000685  min_lr: 0.000685  loss: 3.0166 (2.8146)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3730 (1.2724)  time: 0.4506  data: 0.0006  max mem: 43713
Epoch: [223] Total time: 0:10:58 (0.5267 s / it)
Averaged stats: lr: 0.000685  min_lr: 0.000685  loss: 3.0166 (2.8194)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3730 (1.2724)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6415 (0.6415)  acc1: 89.2000 (89.2000)  acc5: 98.4000 (98.4000)  time: 5.4818  data: 5.1797  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7898 (0.7796)  acc1: 86.0000 (86.0364)  acc5: 98.0000 (97.7091)  time: 0.7391  data: 0.4713  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9572 (0.8957)  acc1: 82.0000 (83.4286)  acc5: 96.0000 (96.5524)  time: 0.2646  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9682 (0.9125)  acc1: 81.6000 (82.8480)  acc5: 95.6000 (96.5120)  time: 0.2645  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4778 s / it)
* Acc@1 83.202 Acc@5 96.556 loss 0.905
Accuracy of the model on the 50000 test images: 83.2%
Max accuracy: 83.35%
Epoch: [224]  [   0/1251]  eta: 1:15:36  lr: 0.000685  min_lr: 0.000685  loss: 3.3490 (3.3490)  weight_decay: 0.0500 (0.0500)  time: 3.6260  data: 2.9274  max mem: 43713
Epoch: [224]  [ 200/1251]  eta: 0:09:28  lr: 0.000682  min_lr: 0.000682  loss: 2.9272 (2.8431)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0379 (1.3656)  time: 0.5317  data: 0.0004  max mem: 43713
Epoch: [224]  [ 400/1251]  eta: 0:07:32  lr: 0.000680  min_lr: 0.000680  loss: 2.9743 (2.8001)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0795 (1.3130)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [224]  [ 600/1251]  eta: 0:05:45  lr: 0.000677  min_lr: 0.000677  loss: 2.8744 (2.8080)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5374 (1.3591)  time: 0.5223  data: 0.0004  max mem: 43713
Epoch: [224]  [ 800/1251]  eta: 0:03:58  lr: 0.000674  min_lr: 0.000674  loss: 2.9799 (2.8182)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0179 (1.2891)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [224]  [1000/1251]  eta: 0:02:12  lr: 0.000671  min_lr: 0.000671  loss: 3.0259 (2.8109)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1727 (1.2771)  time: 0.5232  data: 0.0005  max mem: 43713
Epoch: [224]  [1200/1251]  eta: 0:00:26  lr: 0.000669  min_lr: 0.000669  loss: 2.7613 (2.8145)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3257 (1.2855)  time: 0.5233  data: 0.0004  max mem: 43713
Epoch: [224]  [1250/1251]  eta: 0:00:00  lr: 0.000668  min_lr: 0.000668  loss: 2.8755 (2.8160)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0881 (1.2770)  time: 0.4436  data: 0.0006  max mem: 43713
Epoch: [224] Total time: 0:10:58 (0.5266 s / it)
Averaged stats: lr: 0.000668  min_lr: 0.000668  loss: 2.8755 (2.8114)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0881 (1.2770)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.5701 (0.5701)  acc1: 90.8000 (90.8000)  acc5: 98.8000 (98.8000)  time: 5.5091  data: 5.1839  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7365 (0.7183)  acc1: 87.2000 (86.0364)  acc5: 97.6000 (97.7091)  time: 0.7414  data: 0.4716  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8896 (0.8352)  acc1: 81.2000 (83.2381)  acc5: 96.4000 (96.7048)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9147 (0.8508)  acc1: 81.2000 (82.6720)  acc5: 96.4000 (96.7040)  time: 0.2644  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4800 s / it)
* Acc@1 83.210 Acc@5 96.556 loss 0.848
Accuracy of the model on the 50000 test images: 83.2%
Max accuracy: 83.35%
Epoch: [225]  [   0/1251]  eta: 1:10:53  lr: 0.000668  min_lr: 0.000668  loss: 2.1508 (2.1508)  weight_decay: 0.0500 (0.0500)  time: 3.3998  data: 1.8440  max mem: 43713
Epoch: [225]  [ 200/1251]  eta: 0:09:26  lr: 0.000665  min_lr: 0.000665  loss: 2.7907 (2.7604)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1803 (1.2510)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [225]  [ 400/1251]  eta: 0:07:33  lr: 0.000663  min_lr: 0.000663  loss: 2.9845 (2.7872)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3893 (1.2448)  time: 0.5318  data: 0.0004  max mem: 43713
Epoch: [225]  [ 600/1251]  eta: 0:05:44  lr: 0.000660  min_lr: 0.000660  loss: 3.0800 (2.7971)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0851 (1.2948)  time: 0.5240  data: 0.0004  max mem: 43713
Epoch: [225]  [ 800/1251]  eta: 0:03:58  lr: 0.000657  min_lr: 0.000657  loss: 2.9934 (2.7821)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2278 (1.3531)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [225]  [1000/1251]  eta: 0:02:12  lr: 0.000655  min_lr: 0.000655  loss: 2.8805 (2.7739)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9008 (1.3072)  time: 0.5223  data: 0.0004  max mem: 43713
Epoch: [225]  [1200/1251]  eta: 0:00:26  lr: 0.000652  min_lr: 0.000652  loss: 2.8607 (2.7734)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2300 (1.2950)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [225]  [1250/1251]  eta: 0:00:00  lr: 0.000652  min_lr: 0.000652  loss: 2.8410 (2.7743)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2313 (1.2940)  time: 0.4438  data: 0.0006  max mem: 43713
Epoch: [225] Total time: 0:10:58 (0.5261 s / it)
Averaged stats: lr: 0.000652  min_lr: 0.000652  loss: 2.8410 (2.8075)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2313 (1.2940)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.5921 (0.5921)  acc1: 90.4000 (90.4000)  acc5: 98.8000 (98.8000)  time: 5.6520  data: 5.3536  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7025 (0.7152)  acc1: 86.4000 (87.1636)  acc5: 98.0000 (97.9273)  time: 0.7546  data: 0.4870  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8749 (0.8377)  acc1: 81.6000 (83.6571)  acc5: 96.4000 (96.9143)  time: 0.2648  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9152 (0.8568)  acc1: 81.2000 (83.0240)  acc5: 96.4000 (96.8320)  time: 0.2649  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4856 s / it)
* Acc@1 83.510 Acc@5 96.658 loss 0.846
Accuracy of the model on the 50000 test images: 83.5%
Max accuracy: 83.51%
Epoch: [226]  [   0/1251]  eta: 0:55:55  lr: 0.000651  min_lr: 0.000651  loss: 3.0947 (3.0947)  weight_decay: 0.0500 (0.0500)  time: 2.6826  data: 2.1435  max mem: 43713
Epoch: [226]  [ 200/1251]  eta: 0:09:26  lr: 0.000649  min_lr: 0.000649  loss: 2.8372 (2.7634)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0546 (1.1338)  time: 0.5314  data: 0.0005  max mem: 43713
Epoch: [226]  [ 400/1251]  eta: 0:07:32  lr: 0.000646  min_lr: 0.000646  loss: 2.7663 (2.7784)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3146 (1.2625)  time: 0.5222  data: 0.0004  max mem: 43713
Epoch: [226]  [ 600/1251]  eta: 0:05:44  lr: 0.000644  min_lr: 0.000644  loss: 2.9375 (2.7945)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0738 (1.2384)  time: 0.5225  data: 0.0005  max mem: 43713
Epoch: [226]  [ 800/1251]  eta: 0:03:58  lr: 0.000641  min_lr: 0.000641  loss: 3.0256 (2.7964)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2095 (1.2574)  time: 0.5224  data: 0.0005  max mem: 43713
Epoch: [226]  [1000/1251]  eta: 0:02:12  lr: 0.000638  min_lr: 0.000638  loss: 3.0436 (2.7994)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9708 (1.2469)  time: 0.5233  data: 0.0005  max mem: 43713
Epoch: [226]  [1200/1251]  eta: 0:00:26  lr: 0.000636  min_lr: 0.000636  loss: 2.8627 (2.7953)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4947 (1.2898)  time: 0.5221  data: 0.0005  max mem: 43713
Epoch: [226]  [1250/1251]  eta: 0:00:00  lr: 0.000635  min_lr: 0.000635  loss: 3.0403 (2.7958)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4947 (1.3047)  time: 0.4432  data: 0.0006  max mem: 43713
Epoch: [226] Total time: 0:10:57 (0.5260 s / it)
Averaged stats: lr: 0.000635  min_lr: 0.000635  loss: 3.0403 (2.8085)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4947 (1.3047)
Test:  [ 0/25]  eta: 0:01:58  loss: 0.6890 (0.6890)  acc1: 90.8000 (90.8000)  acc5: 98.4000 (98.4000)  time: 4.7351  data: 4.4451  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.8130 (0.8056)  acc1: 87.2000 (86.5455)  acc5: 98.0000 (97.7818)  time: 0.6704  data: 0.4044  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9651 (0.9296)  acc1: 81.6000 (83.4857)  acc5: 96.4000 (96.7429)  time: 0.2638  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0206 (0.9464)  acc1: 81.2000 (82.8320)  acc5: 96.4000 (96.6880)  time: 0.2637  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4459 s / it)
* Acc@1 83.192 Acc@5 96.484 loss 0.939
Accuracy of the model on the 50000 test images: 83.2%
Max accuracy: 83.51%
Epoch: [227]  [   0/1251]  eta: 1:14:00  lr: 0.000635  min_lr: 0.000635  loss: 2.4315 (2.4315)  weight_decay: 0.0500 (0.0500)  time: 3.5492  data: 2.9029  max mem: 43713
Epoch: [227]  [ 200/1251]  eta: 0:09:28  lr: 0.000632  min_lr: 0.000632  loss: 3.1402 (2.8421)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5400 (1.4389)  time: 0.5241  data: 0.0005  max mem: 43713
Epoch: [227]  [ 400/1251]  eta: 0:07:33  lr: 0.000630  min_lr: 0.000630  loss: 2.9134 (2.8466)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0003 (1.3929)  time: 0.5317  data: 0.0004  max mem: 43713
Epoch: [227]  [ 600/1251]  eta: 0:05:45  lr: 0.000627  min_lr: 0.000627  loss: 2.7992 (2.8293)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9106 (1.2761)  time: 0.5292  data: 0.0005  max mem: 43713
Epoch: [227]  [ 800/1251]  eta: 0:03:58  lr: 0.000625  min_lr: 0.000625  loss: 3.0317 (2.8321)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0650 (1.2291)  time: 0.5242  data: 0.0005  max mem: 43713
Epoch: [227]  [1000/1251]  eta: 0:02:12  lr: 0.000622  min_lr: 0.000622  loss: 2.7680 (2.8259)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2820 (1.2657)  time: 0.5240  data: 0.0005  max mem: 43713
Epoch: [227]  [1200/1251]  eta: 0:00:26  lr: 0.000619  min_lr: 0.000619  loss: 2.9077 (2.8217)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1900 (1.2733)  time: 0.5224  data: 0.0005  max mem: 43713
Epoch: [227]  [1250/1251]  eta: 0:00:00  lr: 0.000619  min_lr: 0.000619  loss: 2.8581 (2.8205)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1774 (1.2731)  time: 0.4436  data: 0.0007  max mem: 43713
Epoch: [227] Total time: 0:10:59 (0.5272 s / it)
Averaged stats: lr: 0.000619  min_lr: 0.000619  loss: 2.8581 (2.8126)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1774 (1.2731)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6139 (0.6139)  acc1: 90.0000 (90.0000)  acc5: 98.8000 (98.8000)  time: 5.5119  data: 5.2080  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7359 (0.7341)  acc1: 87.6000 (86.6546)  acc5: 98.4000 (98.0364)  time: 0.7418  data: 0.4737  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9100 (0.8457)  acc1: 82.0000 (83.8667)  acc5: 96.4000 (96.7619)  time: 0.2647  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9345 (0.8660)  acc1: 82.0000 (83.3120)  acc5: 96.4000 (96.7520)  time: 0.2648  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4791 s / it)
* Acc@1 83.376 Acc@5 96.660 loss 0.859
Accuracy of the model on the 50000 test images: 83.4%
Max accuracy: 83.51%
Epoch: [228]  [   0/1251]  eta: 1:10:41  lr: 0.000619  min_lr: 0.000619  loss: 3.1345 (3.1345)  weight_decay: 0.0500 (0.0500)  time: 3.3907  data: 1.7101  max mem: 43713
Epoch: [228]  [ 200/1251]  eta: 0:09:26  lr: 0.000616  min_lr: 0.000616  loss: 2.9386 (2.8290)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2878 (1.4059)  time: 0.5233  data: 0.0005  max mem: 43713
Epoch: [228]  [ 400/1251]  eta: 0:07:33  lr: 0.000614  min_lr: 0.000614  loss: 2.6154 (2.8053)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0337 (1.2758)  time: 0.5390  data: 0.0005  max mem: 43713
Epoch: [228]  [ 600/1251]  eta: 0:05:45  lr: 0.000611  min_lr: 0.000611  loss: 2.9819 (2.8038)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2507 (1.3193)  time: 0.5237  data: 0.0005  max mem: 43713
Epoch: [228]  [ 800/1251]  eta: 0:03:58  lr: 0.000608  min_lr: 0.000608  loss: 2.9665 (2.8066)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4968 (1.3920)  time: 0.5230  data: 0.0005  max mem: 43713
Epoch: [228]  [1000/1251]  eta: 0:02:12  lr: 0.000606  min_lr: 0.000606  loss: 2.9959 (2.8078)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9574 (1.3382)  time: 0.5225  data: 0.0005  max mem: 43713
Epoch: [228]  [1200/1251]  eta: 0:00:26  lr: 0.000603  min_lr: 0.000603  loss: 3.0765 (2.8045)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4481 (1.3354)  time: 0.5232  data: 0.0005  max mem: 43713
Epoch: [228]  [1250/1251]  eta: 0:00:00  lr: 0.000603  min_lr: 0.000603  loss: 2.8746 (2.8042)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2721 (1.3374)  time: 0.4436  data: 0.0006  max mem: 43713
Epoch: [228] Total time: 0:10:58 (0.5265 s / it)
Averaged stats: lr: 0.000603  min_lr: 0.000603  loss: 2.8746 (2.8055)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2721 (1.3374)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.5793 (0.5793)  acc1: 90.8000 (90.8000)  acc5: 98.8000 (98.8000)  time: 5.3450  data: 5.0413  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7424 (0.7200)  acc1: 86.8000 (86.9091)  acc5: 98.4000 (97.9273)  time: 0.7266  data: 0.4586  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9173 (0.8454)  acc1: 83.2000 (83.6762)  acc5: 96.4000 (96.8952)  time: 0.2648  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9329 (0.8615)  acc1: 81.2000 (83.0560)  acc5: 96.0000 (96.8320)  time: 0.2649  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4734 s / it)
* Acc@1 83.486 Acc@5 96.756 loss 0.856
Accuracy of the model on the 50000 test images: 83.5%
Max accuracy: 83.51%
Epoch: [229]  [   0/1251]  eta: 1:16:28  lr: 0.000603  min_lr: 0.000603  loss: 2.3466 (2.3466)  weight_decay: 0.0500 (0.0500)  time: 3.6675  data: 2.3289  max mem: 43713
Epoch: [229]  [ 200/1251]  eta: 0:09:30  lr: 0.000600  min_lr: 0.000600  loss: 2.7022 (2.8010)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1072 (1.2535)  time: 0.5316  data: 0.0005  max mem: 43713
Epoch: [229]  [ 400/1251]  eta: 0:07:34  lr: 0.000597  min_lr: 0.000597  loss: 2.9588 (2.7919)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9716 (1.1854)  time: 0.5244  data: 0.0004  max mem: 43713
Epoch: [229]  [ 600/1251]  eta: 0:05:45  lr: 0.000595  min_lr: 0.000595  loss: 2.9048 (2.7978)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0871 (1.2126)  time: 0.5236  data: 0.0005  max mem: 43713
Epoch: [229]  [ 800/1251]  eta: 0:03:59  lr: 0.000592  min_lr: 0.000592  loss: 2.8078 (2.7851)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9410 (1.1742)  time: 0.5296  data: 0.0005  max mem: 43713
Epoch: [229]  [1000/1251]  eta: 0:02:12  lr: 0.000590  min_lr: 0.000590  loss: 2.8991 (2.7857)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9666 (1.1658)  time: 0.5235  data: 0.0004  max mem: 43713
Epoch: [229]  [1200/1251]  eta: 0:00:26  lr: 0.000587  min_lr: 0.000587  loss: 2.8971 (2.7791)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2969 (1.1784)  time: 0.5338  data: 0.0004  max mem: 43713
Epoch: [229]  [1250/1251]  eta: 0:00:00  lr: 0.000587  min_lr: 0.000587  loss: 2.9581 (2.7826)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2995 (1.1981)  time: 0.4527  data: 0.0006  max mem: 43713
Epoch: [229] Total time: 0:11:00 (0.5279 s / it)
Averaged stats: lr: 0.000587  min_lr: 0.000587  loss: 2.9581 (2.7903)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2995 (1.1981)
Test:  [ 0/25]  eta: 0:02:29  loss: 0.6607 (0.6607)  acc1: 90.4000 (90.4000)  acc5: 98.8000 (98.8000)  time: 5.9812  data: 5.6664  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8262 (0.7969)  acc1: 87.2000 (86.5818)  acc5: 98.4000 (97.8909)  time: 0.7844  data: 0.5154  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9845 (0.9092)  acc1: 82.0000 (83.6191)  acc5: 96.4000 (96.7810)  time: 0.2647  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9862 (0.9251)  acc1: 81.6000 (83.1200)  acc5: 96.4000 (96.7520)  time: 0.2647  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4981 s / it)
* Acc@1 83.458 Acc@5 96.648 loss 0.921
Accuracy of the model on the 50000 test images: 83.5%
Max accuracy: 83.51%
Epoch: [230]  [   0/1251]  eta: 1:16:33  lr: 0.000587  min_lr: 0.000587  loss: 3.3101 (3.3101)  weight_decay: 0.0500 (0.0500)  time: 3.6719  data: 2.9197  max mem: 43713
Epoch: [230]  [ 200/1251]  eta: 0:09:30  lr: 0.000584  min_lr: 0.000584  loss: 2.8344 (2.7381)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9744 (1.1443)  time: 0.5243  data: 0.0005  max mem: 43713
Epoch: [230]  [ 400/1251]  eta: 0:07:33  lr: 0.000582  min_lr: 0.000582  loss: 2.8640 (2.7290)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9871 (1.1487)  time: 0.5238  data: 0.0005  max mem: 43713
Epoch: [230]  [ 600/1251]  eta: 0:05:45  lr: 0.000579  min_lr: 0.000579  loss: 2.9914 (2.7612)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0275 (1.2534)  time: 0.5282  data: 0.0004  max mem: 43713
Epoch: [230]  [ 800/1251]  eta: 0:03:58  lr: 0.000577  min_lr: 0.000577  loss: 2.9419 (2.7598)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.5233  data: 0.0005  max mem: 43713
Epoch: [230]  [1000/1251]  eta: 0:02:12  lr: 0.000574  min_lr: 0.000574  loss: 2.7295 (2.7522)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9294 (nan)  time: 0.5314  data: 0.0004  max mem: 43713
Epoch: [230]  [1200/1251]  eta: 0:00:26  lr: 0.000571  min_lr: 0.000571  loss: 3.0229 (2.7514)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0557 (nan)  time: 0.5274  data: 0.0005  max mem: 43713
Epoch: [230]  [1250/1251]  eta: 0:00:00  lr: 0.000571  min_lr: 0.000571  loss: 3.0289 (2.7546)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0540 (nan)  time: 0.4434  data: 0.0007  max mem: 43713
Epoch: [230] Total time: 0:10:59 (0.5272 s / it)
Averaged stats: lr: 0.000571  min_lr: 0.000571  loss: 3.0289 (2.7787)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0540 (nan)
Test:  [ 0/25]  eta: 0:02:10  loss: 0.5891 (0.5891)  acc1: 92.0000 (92.0000)  acc5: 99.6000 (99.6000)  time: 5.2033  data: 4.9174  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7550 (0.7498)  acc1: 87.6000 (86.8364)  acc5: 98.4000 (98.0364)  time: 0.7139  data: 0.4473  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9514 (0.8732)  acc1: 82.4000 (83.4667)  acc5: 96.4000 (96.9143)  time: 0.2650  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9707 (0.8885)  acc1: 80.8000 (82.9760)  acc5: 96.8000 (96.9120)  time: 0.2650  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4697 s / it)
* Acc@1 83.556 Acc@5 96.748 loss 0.881
Accuracy of the model on the 50000 test images: 83.6%
Max accuracy: 83.56%
Epoch: [231]  [   0/1251]  eta: 1:06:08  lr: 0.000571  min_lr: 0.000571  loss: 3.1710 (3.1710)  weight_decay: 0.0500 (0.0500)  time: 3.1726  data: 2.6365  max mem: 43713
Epoch: [231]  [ 200/1251]  eta: 0:09:25  lr: 0.000568  min_lr: 0.000568  loss: 2.8733 (2.7684)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2409 (1.1828)  time: 0.5246  data: 0.0004  max mem: 43713
Epoch: [231]  [ 400/1251]  eta: 0:07:32  lr: 0.000566  min_lr: 0.000566  loss: 2.8546 (2.7379)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0814 (1.1621)  time: 0.5298  data: 0.0005  max mem: 43713
Epoch: [231]  [ 600/1251]  eta: 0:05:44  lr: 0.000563  min_lr: 0.000563  loss: 2.9184 (2.7542)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2349 (1.1627)  time: 0.5221  data: 0.0004  max mem: 43713
Epoch: [231]  [ 800/1251]  eta: 0:03:58  lr: 0.000561  min_lr: 0.000561  loss: 2.8263 (2.7725)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0720 (1.1795)  time: 0.5223  data: 0.0004  max mem: 43713
Epoch: [231]  [1000/1251]  eta: 0:02:12  lr: 0.000558  min_lr: 0.000558  loss: 2.7817 (2.7709)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1464 (1.2059)  time: 0.5284  data: 0.0004  max mem: 43713
Epoch: [231]  [1200/1251]  eta: 0:00:26  lr: 0.000556  min_lr: 0.000556  loss: 2.8308 (2.7720)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2736 (1.2394)  time: 0.5283  data: 0.0005  max mem: 43713
Epoch: [231]  [1250/1251]  eta: 0:00:00  lr: 0.000555  min_lr: 0.000555  loss: 2.8010 (2.7727)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2657 (1.2428)  time: 0.4433  data: 0.0005  max mem: 43713
Epoch: [231] Total time: 0:10:58 (0.5261 s / it)
Averaged stats: lr: 0.000555  min_lr: 0.000555  loss: 2.8010 (2.7717)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2657 (1.2428)
Test:  [ 0/25]  eta: 0:02:08  loss: 0.5385 (0.5385)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 5.1435  data: 4.8571  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7163 (0.7101)  acc1: 86.8000 (86.8364)  acc5: 98.0000 (98.0000)  time: 0.7085  data: 0.4419  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9118 (0.8259)  acc1: 81.6000 (83.5429)  acc5: 96.4000 (96.9333)  time: 0.2651  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9179 (0.8435)  acc1: 81.2000 (83.0880)  acc5: 96.0000 (96.8160)  time: 0.2651  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4660 s / it)
* Acc@1 83.610 Acc@5 96.714 loss 0.834
Accuracy of the model on the 50000 test images: 83.6%
Max accuracy: 83.61%
Epoch: [232]  [   0/1251]  eta: 0:59:54  lr: 0.000555  min_lr: 0.000555  loss: 3.2063 (3.2063)  weight_decay: 0.0500 (0.0500)  time: 2.8734  data: 2.3283  max mem: 43713
Epoch: [232]  [ 200/1251]  eta: 0:09:23  lr: 0.000553  min_lr: 0.000553  loss: 2.7644 (2.7576)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0704 (1.1604)  time: 0.5311  data: 0.0004  max mem: 43713
Epoch: [232]  [ 400/1251]  eta: 0:07:32  lr: 0.000550  min_lr: 0.000550  loss: 2.7898 (2.7514)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2179 (1.1711)  time: 0.5264  data: 0.0005  max mem: 43713
Epoch: [232]  [ 600/1251]  eta: 0:05:44  lr: 0.000548  min_lr: 0.000548  loss: 2.9350 (2.7448)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1909 (1.1791)  time: 0.5272  data: 0.0005  max mem: 43713
Epoch: [232]  [ 800/1251]  eta: 0:03:58  lr: 0.000545  min_lr: 0.000545  loss: 2.7740 (2.7627)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1967 (1.2418)  time: 0.5239  data: 0.0005  max mem: 43713
Epoch: [232]  [1000/1251]  eta: 0:02:12  lr: 0.000543  min_lr: 0.000543  loss: 2.4476 (2.7562)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5840 (1.3132)  time: 0.5232  data: 0.0005  max mem: 43713
Epoch: [232]  [1200/1251]  eta: 0:00:26  lr: 0.000540  min_lr: 0.000540  loss: 2.9602 (2.7580)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9265 (1.2890)  time: 0.5232  data: 0.0005  max mem: 43713
Epoch: [232]  [1250/1251]  eta: 0:00:00  lr: 0.000540  min_lr: 0.000540  loss: 2.8638 (2.7604)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1024 (1.2909)  time: 0.4436  data: 0.0005  max mem: 43713
Epoch: [232] Total time: 0:10:58 (0.5267 s / it)
Averaged stats: lr: 0.000540  min_lr: 0.000540  loss: 2.8638 (2.7650)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1024 (1.2909)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.5999 (0.5999)  acc1: 91.6000 (91.6000)  acc5: 99.6000 (99.6000)  time: 5.5207  data: 5.2126  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7669 (0.7629)  acc1: 86.8000 (86.7273)  acc5: 98.0000 (97.7818)  time: 0.7426  data: 0.4742  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9331 (0.8788)  acc1: 82.4000 (83.6762)  acc5: 96.4000 (96.6667)  time: 0.2648  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9643 (0.8984)  acc1: 82.0000 (83.0240)  acc5: 96.4000 (96.5440)  time: 0.2647  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4791 s / it)
* Acc@1 83.456 Acc@5 96.626 loss 0.886
Accuracy of the model on the 50000 test images: 83.5%
Max accuracy: 83.61%
Epoch: [233]  [   0/1251]  eta: 1:11:57  lr: 0.000540  min_lr: 0.000540  loss: 1.7059 (1.7059)  weight_decay: 0.0500 (0.0500)  time: 3.4515  data: 2.4646  max mem: 43713
Epoch: [233]  [ 200/1251]  eta: 0:09:31  lr: 0.000537  min_lr: 0.000537  loss: 2.9131 (2.7551)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9998 (1.2450)  time: 0.5245  data: 0.0004  max mem: 43713
Epoch: [233]  [ 400/1251]  eta: 0:07:33  lr: 0.000535  min_lr: 0.000535  loss: 3.0026 (2.7379)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2209 (1.3061)  time: 0.5221  data: 0.0005  max mem: 43713
Epoch: [233]  [ 600/1251]  eta: 0:05:45  lr: 0.000533  min_lr: 0.000533  loss: 2.8673 (2.7409)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0636 (nan)  time: 0.5360  data: 0.0004  max mem: 43713
Epoch: [233]  [ 800/1251]  eta: 0:03:58  lr: 0.000530  min_lr: 0.000530  loss: 2.7023 (2.7568)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4426 (nan)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [233]  [1000/1251]  eta: 0:02:12  lr: 0.000528  min_lr: 0.000528  loss: 2.7982 (2.7679)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0267 (nan)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [233]  [1200/1251]  eta: 0:00:26  lr: 0.000525  min_lr: 0.000525  loss: 2.9111 (2.7780)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0474 (nan)  time: 0.5282  data: 0.0004  max mem: 43713
Epoch: [233]  [1250/1251]  eta: 0:00:00  lr: 0.000525  min_lr: 0.000525  loss: 3.0543 (2.7815)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2107 (nan)  time: 0.4481  data: 0.0005  max mem: 43713
Epoch: [233] Total time: 0:10:59 (0.5269 s / it)
Averaged stats: lr: 0.000525  min_lr: 0.000525  loss: 3.0543 (2.7688)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2107 (nan)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.6756 (0.6756)  acc1: 91.6000 (91.6000)  acc5: 99.2000 (99.2000)  time: 5.3935  data: 5.0842  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.8371 (0.8298)  acc1: 87.2000 (86.7636)  acc5: 98.0000 (98.0000)  time: 0.7306  data: 0.4625  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0194 (0.9500)  acc1: 82.0000 (83.5810)  acc5: 96.4000 (96.8571)  time: 0.2642  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0318 (0.9652)  acc1: 80.8000 (83.0720)  acc5: 96.4000 (96.7840)  time: 0.2641  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4732 s / it)
* Acc@1 83.660 Acc@5 96.650 loss 0.955
Accuracy of the model on the 50000 test images: 83.7%
Max accuracy: 83.66%
Epoch: [234]  [   0/1251]  eta: 0:57:22  lr: 0.000525  min_lr: 0.000525  loss: 2.0530 (2.0530)  weight_decay: 0.0500 (0.0500)  time: 2.7515  data: 2.2086  max mem: 43713
Epoch: [234]  [ 200/1251]  eta: 0:09:22  lr: 0.000522  min_lr: 0.000522  loss: 2.8943 (2.7132)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9314 (1.4969)  time: 0.5235  data: 0.0005  max mem: 43713
Epoch: [234]  [ 400/1251]  eta: 0:07:31  lr: 0.000520  min_lr: 0.000520  loss: 2.9083 (2.7398)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1675 (1.4783)  time: 0.5231  data: 0.0005  max mem: 43713
Epoch: [234]  [ 600/1251]  eta: 0:05:44  lr: 0.000517  min_lr: 0.000517  loss: 2.8992 (2.7801)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1316 (1.4724)  time: 0.5239  data: 0.0005  max mem: 43713
Epoch: [234]  [ 800/1251]  eta: 0:03:58  lr: 0.000515  min_lr: 0.000515  loss: 2.9871 (2.8025)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1291 (1.4530)  time: 0.5239  data: 0.0004  max mem: 43713
Epoch: [234]  [1000/1251]  eta: 0:02:12  lr: 0.000513  min_lr: 0.000513  loss: 2.8687 (2.7899)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1085 (1.3936)  time: 0.5318  data: 0.0004  max mem: 43713
Epoch: [234]  [1200/1251]  eta: 0:00:26  lr: 0.000510  min_lr: 0.000510  loss: 2.9512 (2.7870)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0966 (1.3598)  time: 0.5240  data: 0.0005  max mem: 43713
Epoch: [234]  [1250/1251]  eta: 0:00:00  lr: 0.000510  min_lr: 0.000510  loss: 2.8819 (2.7893)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0805 (1.3620)  time: 0.4437  data: 0.0007  max mem: 43713
Epoch: [234] Total time: 0:10:58 (0.5262 s / it)
Averaged stats: lr: 0.000510  min_lr: 0.000510  loss: 2.8819 (2.7737)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0805 (1.3620)
Test:  [ 0/25]  eta: 0:02:02  loss: 0.5938 (0.5938)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 4.8892  data: 4.5769  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7600 (0.7481)  acc1: 86.0000 (86.5455)  acc5: 98.4000 (98.0364)  time: 0.7083  data: 0.4399  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9629 (0.8771)  acc1: 82.4000 (83.7333)  acc5: 96.4000 (96.8191)  time: 0.2772  data: 0.0131  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9751 (0.8943)  acc1: 82.0000 (83.1680)  acc5: 96.0000 (96.6560)  time: 0.2642  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4645 s / it)
* Acc@1 83.536 Acc@5 96.672 loss 0.882
Accuracy of the model on the 50000 test images: 83.5%
Max accuracy: 83.66%
Epoch: [235]  [   0/1251]  eta: 1:15:25  lr: 0.000510  min_lr: 0.000510  loss: 2.7615 (2.7615)  weight_decay: 0.0500 (0.0500)  time: 3.6174  data: 3.0354  max mem: 43713
Epoch: [235]  [ 200/1251]  eta: 0:09:28  lr: 0.000507  min_lr: 0.000507  loss: 2.7887 (2.7561)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1883 (1.4274)  time: 0.5225  data: 0.0005  max mem: 43713
Epoch: [235]  [ 400/1251]  eta: 0:07:34  lr: 0.000505  min_lr: 0.000505  loss: 2.9162 (2.7752)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4406 (1.4498)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [235]  [ 600/1251]  eta: 0:05:45  lr: 0.000502  min_lr: 0.000502  loss: 2.8062 (2.7703)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4662 (1.4788)  time: 0.5236  data: 0.0004  max mem: 43713
Epoch: [235]  [ 800/1251]  eta: 0:03:58  lr: 0.000500  min_lr: 0.000500  loss: 2.7706 (2.7582)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2229 (1.4153)  time: 0.5289  data: 0.0004  max mem: 43713
Epoch: [235]  [1000/1251]  eta: 0:02:12  lr: 0.000498  min_lr: 0.000498  loss: 3.0522 (2.7659)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1373 (1.3664)  time: 0.5231  data: 0.0004  max mem: 43713
Epoch: [235]  [1200/1251]  eta: 0:00:26  lr: 0.000495  min_lr: 0.000495  loss: 2.6246 (2.7588)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2587 (1.3526)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [235]  [1250/1251]  eta: 0:00:00  lr: 0.000495  min_lr: 0.000495  loss: 2.9753 (2.7611)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2268 (1.3509)  time: 0.4434  data: 0.0005  max mem: 43713
Epoch: [235] Total time: 0:10:58 (0.5267 s / it)
Averaged stats: lr: 0.000495  min_lr: 0.000495  loss: 2.9753 (2.7631)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2268 (1.3509)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.6134 (0.6134)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 5.3855  data: 5.0828  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7742 (0.7620)  acc1: 87.6000 (86.8000)  acc5: 98.4000 (98.0727)  time: 0.7303  data: 0.4624  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9329 (0.8894)  acc1: 81.6000 (83.6381)  acc5: 96.4000 (96.9333)  time: 0.2648  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9948 (0.9089)  acc1: 81.2000 (83.1840)  acc5: 96.0000 (96.7840)  time: 0.2648  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4737 s / it)
* Acc@1 83.670 Acc@5 96.744 loss 0.902
Accuracy of the model on the 50000 test images: 83.7%
Max accuracy: 83.67%
Epoch: [236]  [   0/1251]  eta: 0:55:15  lr: 0.000495  min_lr: 0.000495  loss: 2.6406 (2.6406)  weight_decay: 0.0500 (0.0500)  time: 2.6506  data: 2.1059  max mem: 43713
Epoch: [236]  [ 200/1251]  eta: 0:09:27  lr: 0.000492  min_lr: 0.000492  loss: 2.6220 (2.7514)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3915 (2.0307)  time: 0.5302  data: 0.0004  max mem: 43713
Epoch: [236]  [ 400/1251]  eta: 0:07:33  lr: 0.000490  min_lr: 0.000490  loss: 2.5872 (2.7532)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1288 (1.6170)  time: 0.5241  data: 0.0005  max mem: 43713
Epoch: [236]  [ 600/1251]  eta: 0:05:44  lr: 0.000488  min_lr: 0.000488  loss: 2.7954 (2.7633)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3460 (1.5403)  time: 0.5224  data: 0.0005  max mem: 43713
Epoch: [236]  [ 800/1251]  eta: 0:03:58  lr: 0.000485  min_lr: 0.000485  loss: 2.7410 (2.7589)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1617 (1.4477)  time: 0.5244  data: 0.0005  max mem: 43713
Epoch: [236]  [1000/1251]  eta: 0:02:12  lr: 0.000483  min_lr: 0.000483  loss: 2.9687 (2.7512)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0149 (1.4566)  time: 0.5246  data: 0.0005  max mem: 43713
Epoch: [236]  [1200/1251]  eta: 0:00:26  lr: 0.000481  min_lr: 0.000481  loss: 2.9040 (2.7552)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0323 (1.4183)  time: 0.5424  data: 0.0005  max mem: 43713
Epoch: [236]  [1250/1251]  eta: 0:00:00  lr: 0.000480  min_lr: 0.000480  loss: 3.0075 (2.7548)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1920 (1.4180)  time: 0.4477  data: 0.0006  max mem: 43713
Epoch: [236] Total time: 0:10:59 (0.5273 s / it)
Averaged stats: lr: 0.000480  min_lr: 0.000480  loss: 3.0075 (2.7560)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1920 (1.4180)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.6334 (0.6334)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 5.4374  data: 5.1384  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7828 (0.7747)  acc1: 87.6000 (86.7636)  acc5: 98.0000 (97.9273)  time: 0.7352  data: 0.4674  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9555 (0.8921)  acc1: 82.0000 (83.7524)  acc5: 96.4000 (96.8952)  time: 0.2650  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9836 (0.9079)  acc1: 81.6000 (83.1520)  acc5: 96.0000 (96.7360)  time: 0.2651  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4771 s / it)
* Acc@1 83.590 Acc@5 96.740 loss 0.902
Accuracy of the model on the 50000 test images: 83.6%
Max accuracy: 83.67%
Epoch: [237]  [   0/1251]  eta: 1:15:05  lr: 0.000480  min_lr: 0.000480  loss: 2.9359 (2.9359)  weight_decay: 0.0500 (0.0500)  time: 3.6017  data: 2.9024  max mem: 43713
Epoch: [237]  [ 200/1251]  eta: 0:09:28  lr: 0.000478  min_lr: 0.000478  loss: 2.9703 (2.7322)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2613 (1.2569)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [237]  [ 400/1251]  eta: 0:07:32  lr: 0.000475  min_lr: 0.000475  loss: 2.8781 (2.7251)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3947 (1.3430)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [237]  [ 600/1251]  eta: 0:05:45  lr: 0.000473  min_lr: 0.000473  loss: 2.7484 (2.7357)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9410 (1.2882)  time: 0.5225  data: 0.0005  max mem: 43713
Epoch: [237]  [ 800/1251]  eta: 0:03:58  lr: 0.000471  min_lr: 0.000471  loss: 2.7938 (2.7435)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2841 (1.2760)  time: 0.5235  data: 0.0004  max mem: 43713
Epoch: [237]  [1000/1251]  eta: 0:02:12  lr: 0.000468  min_lr: 0.000468  loss: 2.8502 (2.7428)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3375 (1.3235)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [237]  [1200/1251]  eta: 0:00:26  lr: 0.000466  min_lr: 0.000466  loss: 2.8665 (2.7484)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2339 (1.4159)  time: 0.5240  data: 0.0004  max mem: 43713
Epoch: [237]  [1250/1251]  eta: 0:00:00  lr: 0.000466  min_lr: 0.000466  loss: 2.8528 (2.7477)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3159 (1.4177)  time: 0.4436  data: 0.0006  max mem: 43713
Epoch: [237] Total time: 0:10:58 (0.5266 s / it)
Averaged stats: lr: 0.000466  min_lr: 0.000466  loss: 2.8528 (2.7480)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3159 (1.4177)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.5873 (0.5873)  acc1: 89.2000 (89.2000)  acc5: 99.6000 (99.6000)  time: 5.4794  data: 5.1889  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7216 (0.7332)  acc1: 87.6000 (86.7273)  acc5: 98.4000 (98.1455)  time: 0.7387  data: 0.4720  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9366 (0.8451)  acc1: 82.4000 (84.0381)  acc5: 96.8000 (97.0286)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9469 (0.8632)  acc1: 82.0000 (83.5360)  acc5: 96.4000 (96.9440)  time: 0.2645  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4775 s / it)
* Acc@1 83.676 Acc@5 96.792 loss 0.850
Accuracy of the model on the 50000 test images: 83.7%
Max accuracy: 83.68%
Epoch: [238]  [   0/1251]  eta: 0:51:40  lr: 0.000466  min_lr: 0.000466  loss: 2.7005 (2.7005)  weight_decay: 0.0500 (0.0500)  time: 2.4783  data: 1.9455  max mem: 43713
Epoch: [238]  [ 200/1251]  eta: 0:09:20  lr: 0.000463  min_lr: 0.000463  loss: 2.8201 (2.7368)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4915 (1.5135)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [238]  [ 400/1251]  eta: 0:07:31  lr: 0.000461  min_lr: 0.000461  loss: 2.5542 (2.7307)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2210 (1.3753)  time: 0.5307  data: 0.0004  max mem: 43713
Epoch: [238]  [ 600/1251]  eta: 0:05:43  lr: 0.000459  min_lr: 0.000459  loss: 2.8480 (2.7295)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0654 (1.3048)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [238]  [ 800/1251]  eta: 0:03:57  lr: 0.000456  min_lr: 0.000456  loss: 2.9187 (2.7451)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2718 (1.3388)  time: 0.5233  data: 0.0004  max mem: 43713
Epoch: [238]  [1000/1251]  eta: 0:02:12  lr: 0.000454  min_lr: 0.000454  loss: 2.7601 (2.7483)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1429 (1.3480)  time: 0.5223  data: 0.0004  max mem: 43713
Epoch: [238]  [1200/1251]  eta: 0:00:26  lr: 0.000452  min_lr: 0.000452  loss: 2.5537 (2.7482)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5269 (1.3838)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [238]  [1250/1251]  eta: 0:00:00  lr: 0.000451  min_lr: 0.000451  loss: 2.8614 (2.7471)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3238 (1.3792)  time: 0.4435  data: 0.0005  max mem: 43713
Epoch: [238] Total time: 0:10:57 (0.5253 s / it)
Averaged stats: lr: 0.000451  min_lr: 0.000451  loss: 2.8614 (2.7484)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3238 (1.3792)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.5972 (0.5972)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 5.4066  data: 5.1070  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7372 (0.7498)  acc1: 87.2000 (86.8364)  acc5: 98.0000 (97.8909)  time: 0.7326  data: 0.4646  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9196 (0.8719)  acc1: 81.2000 (83.7143)  acc5: 96.4000 (96.5714)  time: 0.2652  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9766 (0.8904)  acc1: 80.8000 (83.0560)  acc5: 96.0000 (96.4800)  time: 0.2652  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4762 s / it)
* Acc@1 83.636 Acc@5 96.762 loss 0.875
Accuracy of the model on the 50000 test images: 83.6%
Max accuracy: 83.68%
Epoch: [239]  [   0/1251]  eta: 1:10:14  lr: 0.000451  min_lr: 0.000451  loss: 2.5225 (2.5225)  weight_decay: 0.0500 (0.0500)  time: 3.3685  data: 2.4812  max mem: 43713
Epoch: [239]  [ 200/1251]  eta: 0:09:28  lr: 0.000449  min_lr: 0.000449  loss: 2.9202 (2.7320)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2005 (1.2525)  time: 0.5234  data: 0.0005  max mem: 43713
Epoch: [239]  [ 400/1251]  eta: 0:07:33  lr: 0.000447  min_lr: 0.000447  loss: 2.8354 (2.7428)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2139 (1.4522)  time: 0.5226  data: 0.0005  max mem: 43713
Epoch: [239]  [ 600/1251]  eta: 0:05:45  lr: 0.000445  min_lr: 0.000445  loss: 2.9375 (2.7530)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2562 (1.4040)  time: 0.5232  data: 0.0005  max mem: 43713
Epoch: [239]  [ 800/1251]  eta: 0:03:58  lr: 0.000442  min_lr: 0.000442  loss: 2.9086 (2.7504)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2820 (1.4103)  time: 0.5240  data: 0.0004  max mem: 43713
Epoch: [239]  [1000/1251]  eta: 0:02:12  lr: 0.000440  min_lr: 0.000440  loss: 2.8993 (2.7580)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2343 (1.4607)  time: 0.5242  data: 0.0005  max mem: 43713
Epoch: [239]  [1200/1251]  eta: 0:00:26  lr: 0.000438  min_lr: 0.000438  loss: 2.5538 (2.7572)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2926 (1.4467)  time: 0.5235  data: 0.0004  max mem: 43713
Epoch: [239]  [1250/1251]  eta: 0:00:00  lr: 0.000437  min_lr: 0.000437  loss: 2.8169 (2.7577)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2421 (1.4412)  time: 0.4504  data: 0.0005  max mem: 43713
Epoch: [239] Total time: 0:10:59 (0.5271 s / it)
Averaged stats: lr: 0.000437  min_lr: 0.000437  loss: 2.8169 (2.7482)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2421 (1.4412)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.5946 (0.5946)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 5.5507  data: 5.2475  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7246 (0.7452)  acc1: 88.0000 (87.0909)  acc5: 98.4000 (98.0364)  time: 0.7452  data: 0.4773  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9136 (0.8646)  acc1: 82.0000 (83.9048)  acc5: 96.8000 (96.7619)  time: 0.2647  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9509 (0.8798)  acc1: 81.6000 (83.4240)  acc5: 96.4000 (96.7520)  time: 0.2648  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4826 s / it)
* Acc@1 83.744 Acc@5 96.754 loss 0.874
Accuracy of the model on the 50000 test images: 83.7%
Max accuracy: 83.74%
Epoch: [240]  [   0/1251]  eta: 0:58:03  lr: 0.000437  min_lr: 0.000437  loss: 2.5079 (2.5079)  weight_decay: 0.0500 (0.0500)  time: 2.7848  data: 2.2576  max mem: 43713
Epoch: [240]  [ 200/1251]  eta: 0:09:25  lr: 0.000435  min_lr: 0.000435  loss: 2.6841 (2.7293)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1621 (1.2680)  time: 0.5240  data: 0.0005  max mem: 43713
Epoch: [240]  [ 400/1251]  eta: 0:07:31  lr: 0.000433  min_lr: 0.000433  loss: 2.8550 (2.7210)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.5231  data: 0.0005  max mem: 43713
Epoch: [240]  [ 600/1251]  eta: 0:05:44  lr: 0.000431  min_lr: 0.000431  loss: 2.9071 (2.7198)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3926 (nan)  time: 0.5381  data: 0.0005  max mem: 43713
Epoch: [240]  [ 800/1251]  eta: 0:03:58  lr: 0.000428  min_lr: 0.000428  loss: 3.0114 (2.7321)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1499 (nan)  time: 0.5221  data: 0.0005  max mem: 43713
Epoch: [240]  [1000/1251]  eta: 0:02:12  lr: 0.000426  min_lr: 0.000426  loss: 2.7711 (2.7315)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3512 (nan)  time: 0.5303  data: 0.0005  max mem: 43713
Epoch: [240]  [1200/1251]  eta: 0:00:26  lr: 0.000424  min_lr: 0.000424  loss: 2.8887 (2.7354)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3144 (nan)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [240]  [1250/1251]  eta: 0:00:00  lr: 0.000423  min_lr: 0.000423  loss: 2.6346 (2.7377)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3648 (nan)  time: 0.4435  data: 0.0007  max mem: 43713
Epoch: [240] Total time: 0:10:58 (0.5261 s / it)
Averaged stats: lr: 0.000423  min_lr: 0.000423  loss: 2.6346 (2.7367)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3648 (nan)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.5523 (0.5523)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 5.5791  data: 5.2820  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.6803 (0.6962)  acc1: 87.6000 (87.2727)  acc5: 98.0000 (98.0727)  time: 0.7472  data: 0.4805  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8855 (0.8134)  acc1: 82.0000 (84.3048)  acc5: 96.8000 (96.9524)  time: 0.2638  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9001 (0.8290)  acc1: 82.0000 (83.8400)  acc5: 96.4000 (96.8800)  time: 0.2637  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4801 s / it)
* Acc@1 83.830 Acc@5 96.890 loss 0.822
Accuracy of the model on the 50000 test images: 83.8%
Max accuracy: 83.83%
Epoch: [241]  [   0/1251]  eta: 0:54:47  lr: 0.000423  min_lr: 0.000423  loss: 3.1175 (3.1175)  weight_decay: 0.0500 (0.0500)  time: 2.6283  data: 2.0920  max mem: 43713
Epoch: [241]  [ 200/1251]  eta: 0:09:23  lr: 0.000421  min_lr: 0.000421  loss: 2.9213 (2.7373)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1860 (1.6310)  time: 0.5236  data: 0.0004  max mem: 43713
Epoch: [241]  [ 400/1251]  eta: 0:07:31  lr: 0.000419  min_lr: 0.000419  loss: 2.6518 (2.7448)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4970 (1.7228)  time: 0.5311  data: 0.0005  max mem: 43713
Epoch: [241]  [ 600/1251]  eta: 0:05:44  lr: 0.000417  min_lr: 0.000417  loss: 2.6041 (2.7380)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6389 (1.6676)  time: 0.5295  data: 0.0005  max mem: 43713
Epoch: [241]  [ 800/1251]  eta: 0:03:57  lr: 0.000415  min_lr: 0.000415  loss: 2.7156 (2.7349)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1620 (1.6189)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [241]  [1000/1251]  eta: 0:02:12  lr: 0.000412  min_lr: 0.000412  loss: 2.8952 (2.7446)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0974 (1.5653)  time: 0.5300  data: 0.0004  max mem: 43713
Epoch: [241]  [1200/1251]  eta: 0:00:26  lr: 0.000410  min_lr: 0.000410  loss: 2.8768 (2.7461)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4351 (1.5420)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [241]  [1250/1251]  eta: 0:00:00  lr: 0.000410  min_lr: 0.000410  loss: 2.7170 (2.7461)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0721 (1.5311)  time: 0.4435  data: 0.0006  max mem: 43713
Epoch: [241] Total time: 0:10:58 (0.5262 s / it)
Averaged stats: lr: 0.000410  min_lr: 0.000410  loss: 2.7170 (2.7378)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0721 (1.5311)
Test:  [ 0/25]  eta: 0:01:56  loss: 0.5787 (0.5787)  acc1: 89.6000 (89.6000)  acc5: 99.2000 (99.2000)  time: 4.6470  data: 4.3444  max mem: 43713
Test:  [10/25]  eta: 0:00:09  loss: 0.7080 (0.7133)  acc1: 87.2000 (86.6182)  acc5: 97.6000 (97.9636)  time: 0.6632  data: 0.3953  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8854 (0.8301)  acc1: 82.0000 (83.7524)  acc5: 96.8000 (96.8000)  time: 0.2648  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9252 (0.8455)  acc1: 82.0000 (83.2800)  acc5: 96.4000 (96.7200)  time: 0.2649  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4446 s / it)
* Acc@1 83.698 Acc@5 96.784 loss 0.839
Accuracy of the model on the 50000 test images: 83.7%
Max accuracy: 83.83%
Epoch: [242]  [   0/1251]  eta: 1:16:32  lr: 0.000410  min_lr: 0.000410  loss: 2.8558 (2.8558)  weight_decay: 0.0500 (0.0500)  time: 3.6714  data: 2.4558  max mem: 43713
Epoch: [242]  [ 200/1251]  eta: 0:09:27  lr: 0.000407  min_lr: 0.000407  loss: 2.6397 (2.6813)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1704 (1.2053)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [242]  [ 400/1251]  eta: 0:07:34  lr: 0.000405  min_lr: 0.000405  loss: 2.8893 (2.7256)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3619 (1.3882)  time: 0.5318  data: 0.0005  max mem: 43713
Epoch: [242]  [ 600/1251]  eta: 0:05:45  lr: 0.000403  min_lr: 0.000403  loss: 2.6773 (2.7187)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0815 (1.3329)  time: 0.5231  data: 0.0005  max mem: 43713
Epoch: [242]  [ 800/1251]  eta: 0:03:58  lr: 0.000401  min_lr: 0.000401  loss: 2.7304 (2.7220)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3453 (1.3729)  time: 0.5362  data: 0.0004  max mem: 43713
Epoch: [242]  [1000/1251]  eta: 0:02:12  lr: 0.000399  min_lr: 0.000399  loss: 2.7089 (2.7209)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4549 (1.4809)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [242]  [1200/1251]  eta: 0:00:26  lr: 0.000397  min_lr: 0.000397  loss: 2.8596 (2.7232)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.8006 (1.5887)  time: 0.5283  data: 0.0004  max mem: 43713
Epoch: [242]  [1250/1251]  eta: 0:00:00  lr: 0.000396  min_lr: 0.000396  loss: 2.9575 (2.7260)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3972 (1.5829)  time: 0.4442  data: 0.0007  max mem: 43713
Epoch: [242] Total time: 0:10:59 (0.5270 s / it)
Averaged stats: lr: 0.000396  min_lr: 0.000396  loss: 2.9575 (2.7259)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3972 (1.5829)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.6311 (0.6311)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 5.4727  data: 5.1719  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7414 (0.7650)  acc1: 87.6000 (86.5455)  acc5: 98.0000 (97.9636)  time: 0.7382  data: 0.4704  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9470 (0.8867)  acc1: 81.6000 (83.6381)  acc5: 96.4000 (96.9524)  time: 0.2647  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9944 (0.9021)  acc1: 81.2000 (83.1200)  acc5: 96.0000 (96.8480)  time: 0.2647  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4789 s / it)
* Acc@1 83.612 Acc@5 96.796 loss 0.893
Accuracy of the model on the 50000 test images: 83.6%
Max accuracy: 83.83%
Epoch: [243]  [   0/1251]  eta: 1:09:23  lr: 0.000396  min_lr: 0.000396  loss: 2.9791 (2.9791)  weight_decay: 0.0500 (0.0500)  time: 3.3280  data: 2.2019  max mem: 43713
Epoch: [243]  [ 200/1251]  eta: 0:09:30  lr: 0.000394  min_lr: 0.000394  loss: 2.5209 (2.7082)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2285 (1.3421)  time: 0.5256  data: 0.0004  max mem: 43713
Epoch: [243]  [ 400/1251]  eta: 0:07:34  lr: 0.000392  min_lr: 0.000392  loss: 2.8474 (2.7175)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3779 (1.2525)  time: 0.5241  data: 0.0004  max mem: 43713
Epoch: [243]  [ 600/1251]  eta: 0:05:45  lr: 0.000390  min_lr: 0.000390  loss: 2.5199 (2.7012)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4760 (1.3206)  time: 0.5310  data: 0.0004  max mem: 43713
Epoch: [243]  [ 800/1251]  eta: 0:03:59  lr: 0.000388  min_lr: 0.000388  loss: 2.7572 (2.7041)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7444 (1.3927)  time: 0.5244  data: 0.0004  max mem: 43713
Epoch: [243]  [1000/1251]  eta: 0:02:12  lr: 0.000385  min_lr: 0.000385  loss: 2.8051 (2.7113)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1522 (1.3590)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [243]  [1200/1251]  eta: 0:00:26  lr: 0.000383  min_lr: 0.000383  loss: 2.7309 (2.7174)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2821 (1.3685)  time: 0.5408  data: 0.0004  max mem: 43713
Epoch: [243]  [1250/1251]  eta: 0:00:00  lr: 0.000383  min_lr: 0.000383  loss: 2.9841 (2.7195)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3200 (1.3770)  time: 0.4435  data: 0.0005  max mem: 43713
Epoch: [243] Total time: 0:11:00 (0.5277 s / it)
Averaged stats: lr: 0.000383  min_lr: 0.000383  loss: 2.9841 (2.7226)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3200 (1.3770)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.6367 (0.6367)  acc1: 90.8000 (90.8000)  acc5: 98.8000 (98.8000)  time: 5.2908  data: 4.9922  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7600 (0.7738)  acc1: 86.8000 (86.9455)  acc5: 98.0000 (97.9273)  time: 0.7217  data: 0.4542  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9600 (0.8826)  acc1: 81.6000 (83.8857)  acc5: 97.2000 (96.8762)  time: 0.2649  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9844 (0.9026)  acc1: 81.6000 (83.3440)  acc5: 95.6000 (96.7040)  time: 0.2648  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4710 s / it)
* Acc@1 83.764 Acc@5 96.764 loss 0.892
Accuracy of the model on the 50000 test images: 83.8%
Max accuracy: 83.83%
Epoch: [244]  [   0/1251]  eta: 1:15:13  lr: 0.000383  min_lr: 0.000383  loss: 2.4547 (2.4547)  weight_decay: 0.0500 (0.0500)  time: 3.6082  data: 2.9630  max mem: 43713
Epoch: [244]  [ 200/1251]  eta: 0:09:26  lr: 0.000381  min_lr: 0.000381  loss: 2.6292 (2.6975)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1218 (1.2378)  time: 0.5239  data: 0.0005  max mem: 43713
Epoch: [244]  [ 400/1251]  eta: 0:07:32  lr: 0.000379  min_lr: 0.000379  loss: 2.8921 (2.7026)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1332 (1.2810)  time: 0.5230  data: 0.0005  max mem: 43713
Epoch: [244]  [ 600/1251]  eta: 0:05:45  lr: 0.000377  min_lr: 0.000377  loss: 2.6409 (2.7193)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0829 (1.3647)  time: 0.5253  data: 0.0006  max mem: 43713
Epoch: [244]  [ 800/1251]  eta: 0:03:58  lr: 0.000374  min_lr: 0.000374  loss: 2.9058 (2.7047)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1634 (1.3503)  time: 0.5223  data: 0.0004  max mem: 43713
Epoch: [244]  [1000/1251]  eta: 0:02:12  lr: 0.000372  min_lr: 0.000372  loss: 2.6129 (2.7085)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4741 (1.3702)  time: 0.5318  data: 0.0004  max mem: 43713
Epoch: [244]  [1200/1251]  eta: 0:00:26  lr: 0.000370  min_lr: 0.000370  loss: 2.7760 (2.7104)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.8146 (1.4458)  time: 0.5223  data: 0.0005  max mem: 43713
Epoch: [244]  [1250/1251]  eta: 0:00:00  lr: 0.000370  min_lr: 0.000370  loss: 2.7926 (2.7117)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6522 (1.4736)  time: 0.4435  data: 0.0007  max mem: 43713
Epoch: [244] Total time: 0:10:58 (0.5266 s / it)
Averaged stats: lr: 0.000370  min_lr: 0.000370  loss: 2.7926 (2.7078)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6522 (1.4736)
Test:  [ 0/25]  eta: 0:01:49  loss: 0.5930 (0.5930)  acc1: 90.8000 (90.8000)  acc5: 98.4000 (98.4000)  time: 4.3627  data: 4.0490  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.6910 (0.7148)  acc1: 87.6000 (87.2364)  acc5: 98.0000 (98.0364)  time: 0.6674  data: 0.3985  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9157 (0.8374)  acc1: 80.8000 (83.6952)  acc5: 96.8000 (96.8191)  time: 0.2814  data: 0.0168  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9260 (0.8536)  acc1: 80.4000 (83.2320)  acc5: 96.0000 (96.6720)  time: 0.2648  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4455 s / it)
* Acc@1 83.756 Acc@5 96.756 loss 0.846
Accuracy of the model on the 50000 test images: 83.8%
Max accuracy: 83.83%
Epoch: [245]  [   0/1251]  eta: 1:09:57  lr: 0.000370  min_lr: 0.000370  loss: 3.0611 (3.0611)  weight_decay: 0.0500 (0.0500)  time: 3.3553  data: 1.6090  max mem: 43713
Epoch: [245]  [ 200/1251]  eta: 0:09:26  lr: 0.000368  min_lr: 0.000368  loss: 2.8299 (2.6712)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5500 (1.7973)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [245]  [ 400/1251]  eta: 0:07:33  lr: 0.000366  min_lr: 0.000366  loss: 2.8698 (2.6939)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2114 (1.6398)  time: 0.5237  data: 0.0005  max mem: 43713
Epoch: [245]  [ 600/1251]  eta: 0:05:45  lr: 0.000364  min_lr: 0.000364  loss: 2.9417 (2.7041)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1042 (1.5895)  time: 0.5233  data: 0.0004  max mem: 43713
Epoch: [245]  [ 800/1251]  eta: 0:03:58  lr: 0.000362  min_lr: 0.000362  loss: 2.7104 (2.7170)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.8736 (1.6722)  time: 0.5223  data: 0.0004  max mem: 43713
Epoch: [245]  [1000/1251]  eta: 0:02:12  lr: 0.000359  min_lr: 0.000359  loss: 2.7168 (2.7118)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.8122 (1.7620)  time: 0.5224  data: 0.0005  max mem: 43713
Epoch: [245]  [1200/1251]  eta: 0:00:26  lr: 0.000357  min_lr: 0.000357  loss: 2.8035 (2.7134)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7309 (1.7310)  time: 0.5235  data: 0.0004  max mem: 43713
Epoch: [245]  [1250/1251]  eta: 0:00:00  lr: 0.000357  min_lr: 0.000357  loss: 2.8583 (2.7138)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.9070 (1.7406)  time: 0.4433  data: 0.0005  max mem: 43713
Epoch: [245] Total time: 0:10:58 (0.5262 s / it)
Averaged stats: lr: 0.000357  min_lr: 0.000357  loss: 2.8583 (2.7171)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.9070 (1.7406)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.5891 (0.5891)  acc1: 91.2000 (91.2000)  acc5: 98.8000 (98.8000)  time: 5.8362  data: 5.5342  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7250 (0.7393)  acc1: 87.2000 (87.1273)  acc5: 98.0000 (97.9636)  time: 0.7712  data: 0.5034  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9331 (0.8565)  acc1: 82.4000 (83.9619)  acc5: 96.8000 (96.7048)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9456 (0.8721)  acc1: 82.0000 (83.5360)  acc5: 96.4000 (96.6240)  time: 0.2644  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4929 s / it)
* Acc@1 83.764 Acc@5 96.830 loss 0.866
Accuracy of the model on the 50000 test images: 83.8%
Max accuracy: 83.83%
Epoch: [246]  [   0/1251]  eta: 1:09:43  lr: 0.000357  min_lr: 0.000357  loss: 2.7572 (2.7572)  weight_decay: 0.0500 (0.0500)  time: 3.3445  data: 2.7159  max mem: 43713
Epoch: [246]  [ 200/1251]  eta: 0:09:30  lr: 0.000355  min_lr: 0.000355  loss: 2.8484 (2.7400)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2111 (1.5219)  time: 0.5233  data: 0.0004  max mem: 43713
Epoch: [246]  [ 400/1251]  eta: 0:07:33  lr: 0.000353  min_lr: 0.000353  loss: 2.7369 (2.7196)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6384 (1.6540)  time: 0.5222  data: 0.0004  max mem: 43713
Epoch: [246]  [ 600/1251]  eta: 0:05:44  lr: 0.000351  min_lr: 0.000351  loss: 2.6703 (2.7161)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7216 (1.7426)  time: 0.5223  data: 0.0004  max mem: 43713
Epoch: [246]  [ 800/1251]  eta: 0:03:58  lr: 0.000349  min_lr: 0.000349  loss: 2.7045 (2.7214)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4066 (1.6998)  time: 0.5220  data: 0.0004  max mem: 43713
Epoch: [246]  [1000/1251]  eta: 0:02:12  lr: 0.000347  min_lr: 0.000347  loss: 2.8833 (2.7162)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2458 (1.6907)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [246]  [1200/1251]  eta: 0:00:26  lr: 0.000345  min_lr: 0.000345  loss: 2.8680 (2.7249)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5910 (1.6600)  time: 0.5301  data: 0.0004  max mem: 43713
Epoch: [246]  [1250/1251]  eta: 0:00:00  lr: 0.000344  min_lr: 0.000344  loss: 2.7416 (2.7283)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5149 (1.6557)  time: 0.4474  data: 0.0005  max mem: 43713
Epoch: [246] Total time: 0:10:58 (0.5265 s / it)
Averaged stats: lr: 0.000344  min_lr: 0.000344  loss: 2.7416 (2.7115)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5149 (1.6557)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.6029 (0.6029)  acc1: 90.8000 (90.8000)  acc5: 98.4000 (98.4000)  time: 5.4073  data: 5.1238  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7331 (0.7443)  acc1: 86.8000 (87.0909)  acc5: 97.6000 (97.8909)  time: 0.7315  data: 0.4661  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8961 (0.8646)  acc1: 82.4000 (84.1143)  acc5: 96.8000 (96.6286)  time: 0.2638  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9773 (0.8836)  acc1: 82.0000 (83.5840)  acc5: 96.0000 (96.5760)  time: 0.2637  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4758 s / it)
* Acc@1 83.900 Acc@5 96.800 loss 0.873
Accuracy of the model on the 50000 test images: 83.9%
Max accuracy: 83.90%
Epoch: [247]  [   0/1251]  eta: 0:58:55  lr: 0.000344  min_lr: 0.000344  loss: 2.6414 (2.6414)  weight_decay: 0.0500 (0.0500)  time: 2.8265  data: 2.2806  max mem: 43713
Epoch: [247]  [ 200/1251]  eta: 0:09:23  lr: 0.000342  min_lr: 0.000342  loss: 2.3517 (2.6084)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3716 (1.5840)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [247]  [ 400/1251]  eta: 0:07:31  lr: 0.000340  min_lr: 0.000340  loss: 2.8483 (2.6584)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1061 (1.4573)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [247]  [ 600/1251]  eta: 0:05:44  lr: 0.000338  min_lr: 0.000338  loss: 2.5383 (2.6713)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.5222  data: 0.0004  max mem: 43713
Epoch: [247]  [ 800/1251]  eta: 0:03:57  lr: 0.000336  min_lr: 0.000336  loss: 2.7200 (2.6833)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5017 (nan)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [247]  [1000/1251]  eta: 0:02:12  lr: 0.000334  min_lr: 0.000334  loss: 2.9416 (2.6892)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3969 (nan)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [247]  [1200/1251]  eta: 0:00:26  lr: 0.000332  min_lr: 0.000332  loss: 2.5529 (2.6885)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.1536 (nan)  time: 0.5240  data: 0.0004  max mem: 43713
Epoch: [247]  [1250/1251]  eta: 0:00:00  lr: 0.000332  min_lr: 0.000332  loss: 2.8468 (2.6894)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6027 (nan)  time: 0.4436  data: 0.0008  max mem: 43713
Epoch: [247] Total time: 0:10:58 (0.5261 s / it)
Averaged stats: lr: 0.000332  min_lr: 0.000332  loss: 2.8468 (2.7005)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6027 (nan)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.5565 (0.5565)  acc1: 90.8000 (90.8000)  acc5: 98.4000 (98.4000)  time: 5.6657  data: 5.3654  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.6897 (0.6933)  acc1: 86.8000 (86.6182)  acc5: 97.6000 (97.8545)  time: 0.7556  data: 0.4880  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8526 (0.8073)  acc1: 83.2000 (83.9048)  acc5: 96.8000 (96.7619)  time: 0.2647  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.8803 (0.8229)  acc1: 83.2000 (83.4080)  acc5: 96.4000 (96.6720)  time: 0.2647  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4855 s / it)
* Acc@1 83.874 Acc@5 96.792 loss 0.810
Accuracy of the model on the 50000 test images: 83.9%
Max accuracy: 83.90%
Epoch: [248]  [   0/1251]  eta: 1:10:22  lr: 0.000332  min_lr: 0.000332  loss: 3.1260 (3.1260)  weight_decay: 0.0500 (0.0500)  time: 3.3753  data: 2.2018  max mem: 43713
Epoch: [248]  [ 200/1251]  eta: 0:09:27  lr: 0.000330  min_lr: 0.000330  loss: 2.7456 (2.7147)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0943 (1.3266)  time: 0.5314  data: 0.0005  max mem: 43713
Epoch: [248]  [ 400/1251]  eta: 0:07:34  lr: 0.000328  min_lr: 0.000328  loss: 2.7589 (2.7373)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4042 (1.4757)  time: 0.5243  data: 0.0005  max mem: 43713
Epoch: [248]  [ 600/1251]  eta: 0:05:45  lr: 0.000326  min_lr: 0.000326  loss: 2.9189 (2.7330)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3584 (1.5212)  time: 0.5298  data: 0.0004  max mem: 43713
Epoch: [248]  [ 800/1251]  eta: 0:03:58  lr: 0.000324  min_lr: 0.000324  loss: 2.3548 (2.7074)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1183 (1.5805)  time: 0.5236  data: 0.0005  max mem: 43713
Epoch: [248]  [1000/1251]  eta: 0:02:12  lr: 0.000322  min_lr: 0.000322  loss: 2.6763 (2.7151)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4189 (1.5972)  time: 0.5223  data: 0.0005  max mem: 43713
Epoch: [248]  [1200/1251]  eta: 0:00:26  lr: 0.000320  min_lr: 0.000320  loss: 2.6524 (2.7142)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4319 (1.5753)  time: 0.5230  data: 0.0005  max mem: 43713
Epoch: [248]  [1250/1251]  eta: 0:00:00  lr: 0.000320  min_lr: 0.000320  loss: 2.9775 (2.7167)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0333 (1.5582)  time: 0.4432  data: 0.0007  max mem: 43713
Epoch: [248] Total time: 0:10:59 (0.5268 s / it)
Averaged stats: lr: 0.000320  min_lr: 0.000320  loss: 2.9775 (2.7013)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0333 (1.5582)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.6573 (0.6573)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 5.7334  data: 5.4449  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7992 (0.8045)  acc1: 86.8000 (87.1636)  acc5: 97.6000 (97.9636)  time: 0.7613  data: 0.4953  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9699 (0.9166)  acc1: 83.2000 (84.3619)  acc5: 96.4000 (96.7238)  time: 0.2638  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0103 (0.9330)  acc1: 82.0000 (83.8560)  acc5: 96.0000 (96.6720)  time: 0.2637  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4860 s / it)
* Acc@1 84.016 Acc@5 96.806 loss 0.924
Accuracy of the model on the 50000 test images: 84.0%
Max accuracy: 84.02%
Epoch: [249]  [   0/1251]  eta: 1:00:42  lr: 0.000320  min_lr: 0.000320  loss: 2.5498 (2.5498)  weight_decay: 0.0500 (0.0500)  time: 2.9116  data: 2.3838  max mem: 43713
Epoch: [249]  [ 200/1251]  eta: 0:09:26  lr: 0.000318  min_lr: 0.000318  loss: 2.7577 (2.7045)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2968 (1.4561)  time: 0.5395  data: 0.0004  max mem: 43713
Epoch: [249]  [ 400/1251]  eta: 0:07:32  lr: 0.000316  min_lr: 0.000316  loss: 2.6646 (2.6887)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2728 (1.4683)  time: 0.5248  data: 0.0005  max mem: 43713
Epoch: [249]  [ 600/1251]  eta: 0:05:44  lr: 0.000314  min_lr: 0.000314  loss: 3.0262 (2.7145)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3966 (1.4757)  time: 0.5233  data: 0.0004  max mem: 43713
Epoch: [249]  [ 800/1251]  eta: 0:03:58  lr: 0.000312  min_lr: 0.000312  loss: 2.5510 (2.6942)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3507 (1.4425)  time: 0.5296  data: 0.0005  max mem: 43713
Epoch: [249]  [1000/1251]  eta: 0:02:12  lr: 0.000310  min_lr: 0.000310  loss: 2.8482 (2.6866)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2028 (1.4119)  time: 0.5233  data: 0.0004  max mem: 43713
Epoch: [249]  [1200/1251]  eta: 0:00:26  lr: 0.000308  min_lr: 0.000308  loss: 2.8242 (2.6818)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3313 (1.4367)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [249]  [1250/1251]  eta: 0:00:00  lr: 0.000308  min_lr: 0.000308  loss: 2.8392 (2.6803)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2618 (1.4275)  time: 0.4507  data: 0.0005  max mem: 43713
Epoch: [249] Total time: 0:10:58 (0.5266 s / it)
Averaged stats: lr: 0.000308  min_lr: 0.000308  loss: 2.8392 (2.6937)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2618 (1.4275)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.5160 (0.5160)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 5.3578  data: 5.0610  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.6748 (0.6750)  acc1: 86.4000 (86.7636)  acc5: 98.0000 (98.1455)  time: 0.7277  data: 0.4604  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8753 (0.7914)  acc1: 82.8000 (84.1524)  acc5: 96.8000 (96.8952)  time: 0.2646  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.8924 (0.8079)  acc1: 82.4000 (83.6800)  acc5: 96.0000 (96.8320)  time: 0.2645  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4731 s / it)
* Acc@1 84.114 Acc@5 96.876 loss 0.795
Accuracy of the model on the 50000 test images: 84.1%
Max accuracy: 84.11%
Epoch: [250]  [   0/1251]  eta: 0:58:34  lr: 0.000307  min_lr: 0.000307  loss: 3.0128 (3.0128)  weight_decay: 0.0500 (0.0500)  time: 2.8092  data: 2.2633  max mem: 43713
Epoch: [250]  [ 200/1251]  eta: 0:09:26  lr: 0.000306  min_lr: 0.000306  loss: 2.8023 (2.7265)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3836 (1.7499)  time: 0.5237  data: 0.0005  max mem: 43713
Epoch: [250]  [ 400/1251]  eta: 0:07:31  lr: 0.000304  min_lr: 0.000304  loss: 2.6520 (2.7203)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4646 (1.7763)  time: 0.5225  data: 0.0005  max mem: 43713
Epoch: [250]  [ 600/1251]  eta: 0:05:45  lr: 0.000302  min_lr: 0.000302  loss: 2.7239 (2.7038)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3519 (1.6896)  time: 0.5246  data: 0.0006  max mem: 43713
Epoch: [250]  [ 800/1251]  eta: 0:03:58  lr: 0.000300  min_lr: 0.000300  loss: 2.8495 (2.6962)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0780 (1.6150)  time: 0.5236  data: 0.0005  max mem: 43713
Epoch: [250]  [1000/1251]  eta: 0:02:12  lr: 0.000298  min_lr: 0.000298  loss: 2.5564 (2.6964)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3047 (1.5471)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [250]  [1200/1251]  eta: 0:00:26  lr: 0.000296  min_lr: 0.000296  loss: 2.8162 (2.6984)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3739 (1.5159)  time: 0.5242  data: 0.0005  max mem: 43713
Epoch: [250]  [1250/1251]  eta: 0:00:00  lr: 0.000296  min_lr: 0.000296  loss: 2.6344 (2.6972)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0148 (1.4987)  time: 0.4437  data: 0.0006  max mem: 43713
Epoch: [250] Total time: 0:10:59 (0.5269 s / it)
Averaged stats: lr: 0.000296  min_lr: 0.000296  loss: 2.6344 (2.6914)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0148 (1.4987)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.5623 (0.5623)  acc1: 90.0000 (90.0000)  acc5: 98.8000 (98.8000)  time: 5.5920  data: 5.2958  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7245 (0.7229)  acc1: 86.4000 (87.0182)  acc5: 98.0000 (97.9273)  time: 0.7484  data: 0.4818  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8869 (0.8389)  acc1: 83.2000 (84.3619)  acc5: 96.8000 (96.7238)  time: 0.2638  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9196 (0.8534)  acc1: 83.2000 (83.8880)  acc5: 96.0000 (96.6560)  time: 0.2637  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4840 s / it)
* Acc@1 84.028 Acc@5 96.854 loss 0.843
Accuracy of the model on the 50000 test images: 84.0%
Max accuracy: 84.11%
Epoch: [251]  [   0/1251]  eta: 1:10:37  lr: 0.000296  min_lr: 0.000296  loss: 3.2616 (3.2616)  weight_decay: 0.0500 (0.0500)  time: 3.3873  data: 2.7495  max mem: 43713
Epoch: [251]  [ 200/1251]  eta: 0:09:25  lr: 0.000294  min_lr: 0.000294  loss: 2.7368 (2.7114)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1945 (1.2995)  time: 0.5236  data: 0.0004  max mem: 43713
Epoch: [251]  [ 400/1251]  eta: 0:07:33  lr: 0.000292  min_lr: 0.000292  loss: 2.7848 (2.7022)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1769 (1.2831)  time: 0.5335  data: 0.0004  max mem: 43713
Epoch: [251]  [ 600/1251]  eta: 0:05:45  lr: 0.000290  min_lr: 0.000290  loss: 2.8538 (2.6898)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2579 (1.3174)  time: 0.5258  data: 0.0005  max mem: 43713
Epoch: [251]  [ 800/1251]  eta: 0:03:58  lr: 0.000288  min_lr: 0.000288  loss: 2.8431 (2.6835)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1571 (1.3235)  time: 0.5347  data: 0.0005  max mem: 43713
Epoch: [251]  [1000/1251]  eta: 0:02:12  lr: 0.000286  min_lr: 0.000286  loss: 2.5947 (2.6804)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2020 (1.3722)  time: 0.5317  data: 0.0004  max mem: 43713
Epoch: [251]  [1200/1251]  eta: 0:00:26  lr: 0.000284  min_lr: 0.000284  loss: 2.7545 (2.6842)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2234 (1.3577)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [251]  [1250/1251]  eta: 0:00:00  lr: 0.000284  min_lr: 0.000284  loss: 2.5651 (2.6803)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0988 (1.3485)  time: 0.4435  data: 0.0007  max mem: 43713
Epoch: [251] Total time: 0:10:59 (0.5269 s / it)
Averaged stats: lr: 0.000284  min_lr: 0.000284  loss: 2.5651 (2.6818)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0988 (1.3485)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.5057 (0.5057)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 5.4864  data: 5.1764  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.6560 (0.6697)  acc1: 87.2000 (86.7636)  acc5: 97.6000 (97.8546)  time: 0.7394  data: 0.4708  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8507 (0.7854)  acc1: 82.8000 (83.9619)  acc5: 96.8000 (96.7810)  time: 0.2648  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.8672 (0.7974)  acc1: 82.4000 (83.5840)  acc5: 96.8000 (96.8480)  time: 0.2648  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4779 s / it)
* Acc@1 84.052 Acc@5 96.890 loss 0.788
Accuracy of the model on the 50000 test images: 84.1%
Max accuracy: 84.11%
Epoch: [252]  [   0/1251]  eta: 1:12:21  lr: 0.000284  min_lr: 0.000284  loss: 3.2758 (3.2758)  weight_decay: 0.0500 (0.0500)  time: 3.4705  data: 2.5070  max mem: 43713
Epoch: [252]  [ 200/1251]  eta: 0:09:29  lr: 0.000282  min_lr: 0.000282  loss: 2.7466 (2.6712)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1191 (1.1792)  time: 0.5330  data: 0.0005  max mem: 43713
Epoch: [252]  [ 400/1251]  eta: 0:07:35  lr: 0.000280  min_lr: 0.000280  loss: 2.8655 (2.6822)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2766 (1.2475)  time: 0.5246  data: 0.0004  max mem: 43713
Epoch: [252]  [ 600/1251]  eta: 0:05:45  lr: 0.000279  min_lr: 0.000279  loss: 2.6849 (2.6733)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4925 (1.3432)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [252]  [ 800/1251]  eta: 0:03:59  lr: 0.000277  min_lr: 0.000277  loss: 2.6311 (2.6626)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7841 (1.4279)  time: 0.5286  data: 0.0004  max mem: 43713
Epoch: [252]  [1000/1251]  eta: 0:02:12  lr: 0.000275  min_lr: 0.000275  loss: 2.7351 (2.6671)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7125 (1.4773)  time: 0.5248  data: 0.0004  max mem: 43713
Epoch: [252]  [1200/1251]  eta: 0:00:26  lr: 0.000273  min_lr: 0.000273  loss: 2.8364 (2.6710)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3525 (1.4582)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [252]  [1250/1251]  eta: 0:00:00  lr: 0.000273  min_lr: 0.000273  loss: 2.6314 (2.6688)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3188 (1.4639)  time: 0.4435  data: 0.0007  max mem: 43713
Epoch: [252] Total time: 0:10:59 (0.5274 s / it)
Averaged stats: lr: 0.000273  min_lr: 0.000273  loss: 2.6314 (2.6759)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3188 (1.4639)
Test:  [ 0/25]  eta: 0:01:44  loss: 0.5687 (0.5687)  acc1: 92.0000 (92.0000)  acc5: 99.2000 (99.2000)  time: 4.1640  data: 3.8702  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7338 (0.7264)  acc1: 86.8000 (86.9818)  acc5: 98.0000 (97.8546)  time: 0.6866  data: 0.4191  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9033 (0.8525)  acc1: 82.8000 (84.2095)  acc5: 96.4000 (96.7619)  time: 0.3015  data: 0.0371  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9518 (0.8691)  acc1: 81.6000 (83.6480)  acc5: 96.8000 (96.7520)  time: 0.2642  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4539 s / it)
* Acc@1 84.014 Acc@5 96.810 loss 0.858
Accuracy of the model on the 50000 test images: 84.0%
Max accuracy: 84.11%
Epoch: [253]  [   0/1251]  eta: 1:15:56  lr: 0.000273  min_lr: 0.000273  loss: 3.1005 (3.1005)  weight_decay: 0.0500 (0.0500)  time: 3.6420  data: 2.5991  max mem: 43713
Epoch: [253]  [ 200/1251]  eta: 0:09:30  lr: 0.000271  min_lr: 0.000271  loss: 2.6699 (2.6584)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3160 (1.4923)  time: 0.5220  data: 0.0004  max mem: 43713
Epoch: [253]  [ 400/1251]  eta: 0:07:33  lr: 0.000269  min_lr: 0.000269  loss: 2.4992 (2.6957)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2275 (1.4734)  time: 0.5225  data: 0.0005  max mem: 43713
Epoch: [253]  [ 600/1251]  eta: 0:05:45  lr: 0.000267  min_lr: 0.000267  loss: 2.7779 (2.6924)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2124 (1.4904)  time: 0.5236  data: 0.0005  max mem: 43713
Epoch: [253]  [ 800/1251]  eta: 0:03:58  lr: 0.000265  min_lr: 0.000265  loss: 2.6841 (2.6746)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0619 (1.4854)  time: 0.5236  data: 0.0004  max mem: 43713
Epoch: [253]  [1000/1251]  eta: 0:02:12  lr: 0.000264  min_lr: 0.000264  loss: 2.8033 (2.6747)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2121 (1.4664)  time: 0.5236  data: 0.0005  max mem: 43713
Epoch: [253]  [1200/1251]  eta: 0:00:26  lr: 0.000262  min_lr: 0.000262  loss: 2.7868 (2.6756)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1892 (1.4759)  time: 0.5297  data: 0.0005  max mem: 43713
Epoch: [253]  [1250/1251]  eta: 0:00:00  lr: 0.000261  min_lr: 0.000261  loss: 2.7429 (2.6776)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3673 (1.4721)  time: 0.4437  data: 0.0006  max mem: 43713
Epoch: [253] Total time: 0:10:59 (0.5271 s / it)
Averaged stats: lr: 0.000261  min_lr: 0.000261  loss: 2.7429 (2.6697)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3673 (1.4721)
Test:  [ 0/25]  eta: 0:02:27  loss: 0.6510 (0.6510)  acc1: 90.8000 (90.8000)  acc5: 100.0000 (100.0000)  time: 5.9101  data: 5.6132  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8215 (0.8083)  acc1: 86.8000 (86.9091)  acc5: 97.6000 (97.8909)  time: 0.7780  data: 0.5106  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9733 (0.9240)  acc1: 82.0000 (84.1143)  acc5: 96.8000 (96.7810)  time: 0.2650  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0292 (0.9385)  acc1: 82.0000 (83.7120)  acc5: 96.4000 (96.7520)  time: 0.2651  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4953 s / it)
* Acc@1 83.982 Acc@5 96.834 loss 0.931
Accuracy of the model on the 50000 test images: 84.0%
Max accuracy: 84.11%
Epoch: [254]  [   0/1251]  eta: 1:13:45  lr: 0.000261  min_lr: 0.000261  loss: 2.4738 (2.4738)  weight_decay: 0.0500 (0.0500)  time: 3.5379  data: 2.9311  max mem: 43713
Epoch: [254]  [ 200/1251]  eta: 0:09:27  lr: 0.000260  min_lr: 0.000260  loss: 2.9324 (2.7179)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1336 (1.2566)  time: 0.5242  data: 0.0004  max mem: 43713
Epoch: [254]  [ 400/1251]  eta: 0:07:32  lr: 0.000258  min_lr: 0.000258  loss: 2.6579 (2.6766)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0550 (1.2984)  time: 0.5228  data: 0.0005  max mem: 43713
Epoch: [254]  [ 600/1251]  eta: 0:05:44  lr: 0.000256  min_lr: 0.000256  loss: 2.5844 (2.6629)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3322 (1.3063)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [254]  [ 800/1251]  eta: 0:03:58  lr: 0.000254  min_lr: 0.000254  loss: 2.7496 (2.6521)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5134 (1.3638)  time: 0.5223  data: 0.0005  max mem: 43713
Epoch: [254]  [1000/1251]  eta: 0:02:12  lr: 0.000253  min_lr: 0.000253  loss: 2.6285 (2.6509)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3502 (1.4438)  time: 0.5284  data: 0.0004  max mem: 43713
Epoch: [254]  [1200/1251]  eta: 0:00:26  lr: 0.000251  min_lr: 0.000251  loss: 2.8483 (2.6481)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2825 (1.4820)  time: 0.5226  data: 0.0005  max mem: 43713
Epoch: [254]  [1250/1251]  eta: 0:00:00  lr: 0.000251  min_lr: 0.000251  loss: 2.8313 (2.6465)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3847 (1.4805)  time: 0.4436  data: 0.0005  max mem: 43713
Epoch: [254] Total time: 0:10:57 (0.5259 s / it)
Averaged stats: lr: 0.000251  min_lr: 0.000251  loss: 2.8313 (2.6653)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3847 (1.4805)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6070 (0.6070)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 5.5350  data: 5.2409  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7499 (0.7439)  acc1: 86.0000 (86.9818)  acc5: 98.0000 (98.0364)  time: 0.7440  data: 0.4767  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9002 (0.8646)  acc1: 83.6000 (84.2857)  acc5: 96.0000 (96.7429)  time: 0.2650  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9562 (0.8822)  acc1: 83.6000 (83.7600)  acc5: 96.0000 (96.7200)  time: 0.2650  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4815 s / it)
* Acc@1 84.064 Acc@5 96.858 loss 0.876
Accuracy of the model on the 50000 test images: 84.1%
Max accuracy: 84.11%
Epoch: [255]  [   0/1251]  eta: 1:08:52  lr: 0.000250  min_lr: 0.000250  loss: 2.9077 (2.9077)  weight_decay: 0.0500 (0.0500)  time: 3.3034  data: 2.1881  max mem: 43713
Epoch: [255]  [ 200/1251]  eta: 0:09:26  lr: 0.000249  min_lr: 0.000249  loss: 2.6655 (2.6175)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4463 (1.6092)  time: 0.5234  data: 0.0005  max mem: 43713
Epoch: [255]  [ 400/1251]  eta: 0:07:33  lr: 0.000247  min_lr: 0.000247  loss: 2.7101 (2.6234)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2219 (1.4385)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [255]  [ 600/1251]  eta: 0:05:44  lr: 0.000245  min_lr: 0.000245  loss: 2.8406 (2.6501)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2911 (1.4158)  time: 0.5240  data: 0.0005  max mem: 43713
Epoch: [255]  [ 800/1251]  eta: 0:03:58  lr: 0.000244  min_lr: 0.000244  loss: 2.7009 (2.6494)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3788 (1.4099)  time: 0.5294  data: 0.0004  max mem: 43713
Epoch: [255]  [1000/1251]  eta: 0:02:12  lr: 0.000242  min_lr: 0.000242  loss: 2.6494 (2.6554)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3073 (nan)  time: 0.5225  data: 0.0005  max mem: 43713
Epoch: [255]  [1200/1251]  eta: 0:00:26  lr: 0.000240  min_lr: 0.000240  loss: 2.2023 (2.6548)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2758 (nan)  time: 0.5233  data: 0.0005  max mem: 43713
Epoch: [255]  [1250/1251]  eta: 0:00:00  lr: 0.000240  min_lr: 0.000240  loss: 2.5767 (2.6509)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1154 (nan)  time: 0.4475  data: 0.0007  max mem: 43713
Epoch: [255] Total time: 0:10:58 (0.5265 s / it)
Averaged stats: lr: 0.000240  min_lr: 0.000240  loss: 2.5767 (2.6586)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1154 (nan)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.5212 (0.5212)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 5.6602  data: 5.3501  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.6806 (0.6789)  acc1: 87.2000 (87.2364)  acc5: 97.6000 (98.0000)  time: 0.7552  data: 0.4867  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8415 (0.7991)  acc1: 83.2000 (84.1714)  acc5: 96.8000 (96.8762)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9000 (0.8191)  acc1: 82.4000 (83.6160)  acc5: 96.4000 (96.8160)  time: 0.2644  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4852 s / it)
* Acc@1 84.000 Acc@5 96.920 loss 0.811
Accuracy of the model on the 50000 test images: 84.0%
Max accuracy: 84.11%
Epoch: [256]  [   0/1251]  eta: 1:14:20  lr: 0.000240  min_lr: 0.000240  loss: 2.4244 (2.4244)  weight_decay: 0.0500 (0.0500)  time: 3.5655  data: 2.7831  max mem: 43713
Epoch: [256]  [ 200/1251]  eta: 0:09:30  lr: 0.000238  min_lr: 0.000238  loss: 2.6920 (2.6526)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3913 (1.4994)  time: 0.5231  data: 0.0004  max mem: 43713
Epoch: [256]  [ 400/1251]  eta: 0:07:34  lr: 0.000236  min_lr: 0.000236  loss: 2.6501 (2.6453)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3650 (1.4760)  time: 0.5231  data: 0.0004  max mem: 43713
Epoch: [256]  [ 600/1251]  eta: 0:05:45  lr: 0.000235  min_lr: 0.000235  loss: 2.6667 (2.6612)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3917 (1.5324)  time: 0.5234  data: 0.0004  max mem: 43713
Epoch: [256]  [ 800/1251]  eta: 0:03:58  lr: 0.000233  min_lr: 0.000233  loss: 2.8044 (2.6759)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7987 (1.5615)  time: 0.5236  data: 0.0004  max mem: 43713
Epoch: [256]  [1000/1251]  eta: 0:02:12  lr: 0.000231  min_lr: 0.000231  loss: 2.8878 (2.6680)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4122 (1.5564)  time: 0.5233  data: 0.0004  max mem: 43713
Epoch: [256]  [1200/1251]  eta: 0:00:26  lr: 0.000230  min_lr: 0.000230  loss: 2.8021 (2.6728)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5314 (1.5706)  time: 0.5235  data: 0.0004  max mem: 43713
Epoch: [256]  [1250/1251]  eta: 0:00:00  lr: 0.000229  min_lr: 0.000229  loss: 2.9194 (2.6744)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7493 (1.5799)  time: 0.4498  data: 0.0005  max mem: 43713
Epoch: [256] Total time: 0:10:59 (0.5274 s / it)
Averaged stats: lr: 0.000229  min_lr: 0.000229  loss: 2.9194 (2.6660)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7493 (1.5799)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.5963 (0.5963)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 5.8071  data: 5.5032  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7406 (0.7544)  acc1: 86.4000 (87.1636)  acc5: 98.0000 (98.0000)  time: 0.7687  data: 0.5006  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9216 (0.8710)  acc1: 83.6000 (84.3810)  acc5: 96.8000 (96.8191)  time: 0.2648  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9645 (0.8882)  acc1: 82.4000 (83.8080)  acc5: 96.0000 (96.7200)  time: 0.2649  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4912 s / it)
* Acc@1 84.170 Acc@5 96.860 loss 0.879
Accuracy of the model on the 50000 test images: 84.2%
Max accuracy: 84.17%
Epoch: [257]  [   0/1251]  eta: 0:59:17  lr: 0.000229  min_lr: 0.000229  loss: 2.8740 (2.8740)  weight_decay: 0.0500 (0.0500)  time: 2.8440  data: 2.3097  max mem: 43713
Epoch: [257]  [ 200/1251]  eta: 0:09:23  lr: 0.000228  min_lr: 0.000228  loss: 2.7787 (2.6300)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3073 (1.6946)  time: 0.5232  data: 0.0005  max mem: 43713
Epoch: [257]  [ 400/1251]  eta: 0:07:31  lr: 0.000226  min_lr: 0.000226  loss: 2.7058 (2.6399)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1516 (1.6054)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [257]  [ 600/1251]  eta: 0:05:44  lr: 0.000224  min_lr: 0.000224  loss: 2.8433 (2.6506)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2760 (1.5344)  time: 0.5222  data: 0.0004  max mem: 43713
Epoch: [257]  [ 800/1251]  eta: 0:03:57  lr: 0.000223  min_lr: 0.000223  loss: 2.5936 (2.6584)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3151 (1.5224)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [257]  [1000/1251]  eta: 0:02:12  lr: 0.000221  min_lr: 0.000221  loss: 2.6825 (2.6604)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3159 (1.5276)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [257]  [1200/1251]  eta: 0:00:26  lr: 0.000219  min_lr: 0.000219  loss: 2.7969 (2.6493)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4480 (1.5131)  time: 0.5225  data: 0.0005  max mem: 43713
Epoch: [257]  [1250/1251]  eta: 0:00:00  lr: 0.000219  min_lr: 0.000219  loss: 2.8030 (2.6524)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4793 (1.5115)  time: 0.4431  data: 0.0005  max mem: 43713
Epoch: [257] Total time: 0:10:57 (0.5256 s / it)
Averaged stats: lr: 0.000219  min_lr: 0.000219  loss: 2.8030 (2.6596)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4793 (1.5115)
Test:  [ 0/25]  eta: 0:02:04  loss: 0.5815 (0.5815)  acc1: 91.6000 (91.6000)  acc5: 99.6000 (99.6000)  time: 4.9833  data: 4.6718  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7626 (0.7552)  acc1: 87.2000 (87.4546)  acc5: 97.6000 (97.8909)  time: 0.7078  data: 0.4391  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9135 (0.8703)  acc1: 83.2000 (84.3429)  acc5: 96.8000 (96.8191)  time: 0.2724  data: 0.0080  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9390 (0.8865)  acc1: 82.0000 (83.7920)  acc5: 96.8000 (96.8160)  time: 0.2646  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4654 s / it)
* Acc@1 84.172 Acc@5 96.770 loss 0.881
Accuracy of the model on the 50000 test images: 84.2%
Max accuracy: 84.17%
Epoch: [258]  [   0/1251]  eta: 1:03:31  lr: 0.000219  min_lr: 0.000219  loss: 2.7771 (2.7771)  weight_decay: 0.0500 (0.0500)  time: 3.0469  data: 2.5044  max mem: 43713
Epoch: [258]  [ 200/1251]  eta: 0:09:24  lr: 0.000217  min_lr: 0.000217  loss: 2.7765 (2.6704)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6360 (1.7041)  time: 0.5313  data: 0.0004  max mem: 43713
Epoch: [258]  [ 400/1251]  eta: 0:07:33  lr: 0.000216  min_lr: 0.000216  loss: 2.8706 (2.6695)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3988 (1.7508)  time: 0.5246  data: 0.0005  max mem: 43713
Epoch: [258]  [ 600/1251]  eta: 0:05:44  lr: 0.000214  min_lr: 0.000214  loss: 2.5456 (2.6609)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4102 (1.6685)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [258]  [ 800/1251]  eta: 0:03:58  lr: 0.000212  min_lr: 0.000212  loss: 2.8130 (2.6672)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3806 (1.6126)  time: 0.5226  data: 0.0005  max mem: 43713
Epoch: [258]  [1000/1251]  eta: 0:02:12  lr: 0.000211  min_lr: 0.000211  loss: 2.6374 (2.6623)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5549 (1.6207)  time: 0.5221  data: 0.0004  max mem: 43713
Epoch: [258]  [1200/1251]  eta: 0:00:26  lr: 0.000209  min_lr: 0.000209  loss: 2.7624 (2.6557)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3986 (1.6312)  time: 0.5221  data: 0.0004  max mem: 43713
Epoch: [258]  [1250/1251]  eta: 0:00:00  lr: 0.000209  min_lr: 0.000209  loss: 2.5929 (2.6554)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2926 (1.6142)  time: 0.4432  data: 0.0006  max mem: 43713
Epoch: [258] Total time: 0:10:58 (0.5262 s / it)
Averaged stats: lr: 0.000209  min_lr: 0.000209  loss: 2.5929 (2.6591)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2926 (1.6142)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.5597 (0.5597)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 5.6272  data: 5.3210  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7095 (0.7079)  acc1: 87.2000 (87.0546)  acc5: 97.6000 (98.0364)  time: 0.7522  data: 0.4841  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8510 (0.8312)  acc1: 82.0000 (83.9048)  acc5: 97.2000 (96.7429)  time: 0.2646  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9289 (0.8488)  acc1: 82.0000 (83.4560)  acc5: 96.0000 (96.6720)  time: 0.2645  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4838 s / it)
* Acc@1 84.048 Acc@5 96.842 loss 0.839
Accuracy of the model on the 50000 test images: 84.0%
Max accuracy: 84.17%
Epoch: [259]  [   0/1251]  eta: 1:16:06  lr: 0.000209  min_lr: 0.000209  loss: 2.8359 (2.8359)  weight_decay: 0.0500 (0.0500)  time: 3.6506  data: 2.5858  max mem: 43713
Epoch: [259]  [ 200/1251]  eta: 0:09:32  lr: 0.000207  min_lr: 0.000207  loss: 2.6482 (2.6694)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2943 (1.5652)  time: 0.5309  data: 0.0004  max mem: 43713
Epoch: [259]  [ 400/1251]  eta: 0:07:34  lr: 0.000206  min_lr: 0.000206  loss: 2.8139 (2.6525)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2012 (1.5168)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [259]  [ 600/1251]  eta: 0:05:45  lr: 0.000204  min_lr: 0.000204  loss: 2.6853 (2.6535)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3850 (1.4940)  time: 0.5284  data: 0.0004  max mem: 43713
Epoch: [259]  [ 800/1251]  eta: 0:03:58  lr: 0.000203  min_lr: 0.000203  loss: 2.7942 (2.6490)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1018 (1.4923)  time: 0.5286  data: 0.0004  max mem: 43713
Epoch: [259]  [1000/1251]  eta: 0:02:12  lr: 0.000201  min_lr: 0.000201  loss: 2.4567 (2.6466)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2681 (1.4843)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [259]  [1200/1251]  eta: 0:00:26  lr: 0.000199  min_lr: 0.000199  loss: 2.6992 (2.6464)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6412 (1.5123)  time: 0.5230  data: 0.0005  max mem: 43713
Epoch: [259]  [1250/1251]  eta: 0:00:00  lr: 0.000199  min_lr: 0.000199  loss: 2.7946 (2.6475)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4079 (1.5095)  time: 0.4479  data: 0.0007  max mem: 43713
Epoch: [259] Total time: 0:10:58 (0.5268 s / it)
Averaged stats: lr: 0.000199  min_lr: 0.000199  loss: 2.7946 (2.6459)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4079 (1.5095)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.5916 (0.5916)  acc1: 91.2000 (91.2000)  acc5: 98.8000 (98.8000)  time: 5.5904  data: 5.3065  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7274 (0.7343)  acc1: 87.2000 (87.4909)  acc5: 97.6000 (97.9636)  time: 0.7491  data: 0.4827  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8905 (0.8490)  acc1: 82.4000 (84.4952)  acc5: 96.8000 (96.8191)  time: 0.2649  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9406 (0.8672)  acc1: 82.4000 (83.9520)  acc5: 96.8000 (96.8480)  time: 0.2648  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4828 s / it)
* Acc@1 84.186 Acc@5 96.852 loss 0.862
Accuracy of the model on the 50000 test images: 84.2%
Max accuracy: 84.19%
Epoch: [260]  [   0/1251]  eta: 0:55:06  lr: 0.000199  min_lr: 0.000199  loss: 2.6825 (2.6825)  weight_decay: 0.0500 (0.0500)  time: 2.6428  data: 2.1096  max mem: 43713
Epoch: [260]  [ 200/1251]  eta: 0:09:23  lr: 0.000197  min_lr: 0.000197  loss: 2.7451 (2.5653)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3729 (1.6269)  time: 0.5235  data: 0.0004  max mem: 43713
Epoch: [260]  [ 400/1251]  eta: 0:07:31  lr: 0.000196  min_lr: 0.000196  loss: 2.3886 (2.5795)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5781 (1.5915)  time: 0.5243  data: 0.0004  max mem: 43713
Epoch: [260]  [ 600/1251]  eta: 0:05:44  lr: 0.000194  min_lr: 0.000194  loss: 2.5734 (2.6104)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4374 (1.6000)  time: 0.5237  data: 0.0005  max mem: 43713
Epoch: [260]  [ 800/1251]  eta: 0:03:58  lr: 0.000193  min_lr: 0.000193  loss: 2.8536 (2.6228)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5337 (1.6344)  time: 0.5232  data: 0.0004  max mem: 43713
Epoch: [260]  [1000/1251]  eta: 0:02:12  lr: 0.000191  min_lr: 0.000191  loss: 2.7367 (2.6357)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4128 (1.6279)  time: 0.5220  data: 0.0004  max mem: 43713
Epoch: [260]  [1200/1251]  eta: 0:00:26  lr: 0.000190  min_lr: 0.000190  loss: 2.7347 (2.6411)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1475 (1.6303)  time: 0.5237  data: 0.0004  max mem: 43713
Epoch: [260]  [1250/1251]  eta: 0:00:00  lr: 0.000189  min_lr: 0.000189  loss: 2.6745 (2.6399)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1571 (1.6194)  time: 0.4435  data: 0.0006  max mem: 43713
Epoch: [260] Total time: 0:10:57 (0.5258 s / it)
Averaged stats: lr: 0.000189  min_lr: 0.000189  loss: 2.6745 (2.6414)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1571 (1.6194)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.5831 (0.5831)  acc1: 91.6000 (91.6000)  acc5: 99.2000 (99.2000)  time: 5.3809  data: 5.0946  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7340 (0.7290)  acc1: 87.6000 (87.3091)  acc5: 98.0000 (98.0727)  time: 0.7298  data: 0.4634  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8782 (0.8436)  acc1: 83.6000 (84.6476)  acc5: 96.8000 (96.8952)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9467 (0.8619)  acc1: 83.2000 (84.1440)  acc5: 96.8000 (96.8640)  time: 0.2644  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4738 s / it)
* Acc@1 84.320 Acc@5 96.866 loss 0.853
Accuracy of the model on the 50000 test images: 84.3%
Max accuracy: 84.32%
Epoch: [261]  [   0/1251]  eta: 0:58:38  lr: 0.000189  min_lr: 0.000189  loss: 2.6019 (2.6019)  weight_decay: 0.0500 (0.0500)  time: 2.8124  data: 2.2684  max mem: 43713
Epoch: [261]  [ 200/1251]  eta: 0:09:21  lr: 0.000188  min_lr: 0.000188  loss: 2.8786 (2.6591)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5529 (1.5942)  time: 0.5243  data: 0.0004  max mem: 43713
Epoch: [261]  [ 400/1251]  eta: 0:07:30  lr: 0.000186  min_lr: 0.000186  loss: 2.6568 (2.6583)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2723 (1.4987)  time: 0.5308  data: 0.0004  max mem: 43713
Epoch: [261]  [ 600/1251]  eta: 0:05:44  lr: 0.000185  min_lr: 0.000185  loss: 2.7533 (2.6483)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3987 (1.5046)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [261]  [ 800/1251]  eta: 0:03:57  lr: 0.000183  min_lr: 0.000183  loss: 2.7662 (2.6413)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1801 (1.4506)  time: 0.5239  data: 0.0004  max mem: 43713
Epoch: [261]  [1000/1251]  eta: 0:02:12  lr: 0.000182  min_lr: 0.000182  loss: 2.8526 (2.6408)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4219 (1.4822)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [261]  [1200/1251]  eta: 0:00:26  lr: 0.000180  min_lr: 0.000180  loss: 2.8059 (2.6386)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4477 (1.5149)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [261]  [1250/1251]  eta: 0:00:00  lr: 0.000180  min_lr: 0.000180  loss: 2.6044 (2.6363)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1527 (1.5079)  time: 0.4437  data: 0.0006  max mem: 43713
Epoch: [261] Total time: 0:10:57 (0.5257 s / it)
Averaged stats: lr: 0.000180  min_lr: 0.000180  loss: 2.6044 (2.6427)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1527 (1.5079)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.5247 (0.5247)  acc1: 90.8000 (90.8000)  acc5: 100.0000 (100.0000)  time: 5.6369  data: 5.3398  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.6631 (0.6742)  acc1: 88.0000 (87.3091)  acc5: 98.0000 (98.0727)  time: 0.7532  data: 0.4857  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8342 (0.7903)  acc1: 82.4000 (84.4952)  acc5: 96.8000 (96.8571)  time: 0.2648  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.8898 (0.8088)  acc1: 81.6000 (84.0320)  acc5: 96.0000 (96.7520)  time: 0.2648  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4848 s / it)
* Acc@1 84.248 Acc@5 96.850 loss 0.801
Accuracy of the model on the 50000 test images: 84.2%
Max accuracy: 84.32%
Epoch: [262]  [   0/1251]  eta: 1:09:40  lr: 0.000180  min_lr: 0.000180  loss: 2.7681 (2.7681)  weight_decay: 0.0500 (0.0500)  time: 3.3415  data: 2.3584  max mem: 43713
Epoch: [262]  [ 200/1251]  eta: 0:09:27  lr: 0.000179  min_lr: 0.000179  loss: 2.3776 (2.6079)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3262 (1.4135)  time: 0.5355  data: 0.0006  max mem: 43713
Epoch: [262]  [ 400/1251]  eta: 0:07:34  lr: 0.000177  min_lr: 0.000177  loss: 2.6590 (2.6437)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5178 (1.6008)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [262]  [ 600/1251]  eta: 0:05:45  lr: 0.000176  min_lr: 0.000176  loss: 2.8857 (2.6507)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.9257 (nan)  time: 0.5235  data: 0.0006  max mem: 43713
Epoch: [262]  [ 800/1251]  eta: 0:03:58  lr: 0.000174  min_lr: 0.000174  loss: 2.7550 (2.6532)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5494 (nan)  time: 0.5301  data: 0.0005  max mem: 43713
Epoch: [262]  [1000/1251]  eta: 0:02:12  lr: 0.000173  min_lr: 0.000173  loss: 2.3816 (2.6421)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4303 (nan)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [262]  [1200/1251]  eta: 0:00:26  lr: 0.000171  min_lr: 0.000171  loss: 2.9037 (2.6386)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2006 (nan)  time: 0.5288  data: 0.0005  max mem: 43713
Epoch: [262]  [1250/1251]  eta: 0:00:00  lr: 0.000171  min_lr: 0.000171  loss: 2.8287 (2.6417)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2006 (nan)  time: 0.4433  data: 0.0007  max mem: 43713
Epoch: [262] Total time: 0:10:58 (0.5264 s / it)
Averaged stats: lr: 0.000171  min_lr: 0.000171  loss: 2.8287 (2.6448)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2006 (nan)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.6556 (0.6556)  acc1: 90.4000 (90.4000)  acc5: 99.6000 (99.6000)  time: 5.7487  data: 5.4359  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7911 (0.7942)  acc1: 86.8000 (86.9091)  acc5: 98.0000 (98.0000)  time: 0.7631  data: 0.4944  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9366 (0.9094)  acc1: 83.2000 (84.5143)  acc5: 96.8000 (96.8952)  time: 0.2646  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0010 (0.9251)  acc1: 82.8000 (84.0320)  acc5: 96.0000 (96.8000)  time: 0.2645  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4887 s / it)
* Acc@1 84.214 Acc@5 96.874 loss 0.917
Accuracy of the model on the 50000 test images: 84.2%
Max accuracy: 84.32%
Epoch: [263]  [   0/1251]  eta: 1:12:51  lr: 0.000171  min_lr: 0.000171  loss: 2.5748 (2.5748)  weight_decay: 0.0500 (0.0500)  time: 3.4942  data: 2.3932  max mem: 43713
Epoch: [263]  [ 200/1251]  eta: 0:09:29  lr: 0.000169  min_lr: 0.000169  loss: 2.5382 (2.6537)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2782 (1.7975)  time: 0.5224  data: 0.0005  max mem: 43713
Epoch: [263]  [ 400/1251]  eta: 0:07:33  lr: 0.000168  min_lr: 0.000168  loss: 2.7837 (2.6421)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3046 (1.7983)  time: 0.5235  data: 0.0005  max mem: 43713
Epoch: [263]  [ 600/1251]  eta: 0:05:44  lr: 0.000167  min_lr: 0.000167  loss: 2.6402 (2.6567)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4577 (1.6680)  time: 0.5310  data: 0.0004  max mem: 43713
Epoch: [263]  [ 800/1251]  eta: 0:03:58  lr: 0.000165  min_lr: 0.000165  loss: 2.6077 (2.6454)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1935 (1.6429)  time: 0.5232  data: 0.0005  max mem: 43713
Epoch: [263]  [1000/1251]  eta: 0:02:12  lr: 0.000164  min_lr: 0.000164  loss: 2.6064 (2.6397)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1835 (1.6498)  time: 0.5220  data: 0.0005  max mem: 43713
Epoch: [263]  [1200/1251]  eta: 0:00:26  lr: 0.000162  min_lr: 0.000162  loss: 2.5973 (2.6394)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3722 (1.6244)  time: 0.5354  data: 0.0005  max mem: 43713
Epoch: [263]  [1250/1251]  eta: 0:00:00  lr: 0.000162  min_lr: 0.000162  loss: 2.7898 (2.6399)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3982 (1.6227)  time: 0.4475  data: 0.0005  max mem: 43713
Epoch: [263] Total time: 0:10:58 (0.5265 s / it)
Averaged stats: lr: 0.000162  min_lr: 0.000162  loss: 2.7898 (2.6288)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3982 (1.6227)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.4976 (0.4976)  acc1: 91.2000 (91.2000)  acc5: 100.0000 (100.0000)  time: 5.4910  data: 5.1786  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.6617 (0.6593)  acc1: 87.2000 (87.2727)  acc5: 97.6000 (98.0364)  time: 0.7398  data: 0.4711  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8237 (0.7793)  acc1: 83.2000 (84.4952)  acc5: 96.8000 (96.7619)  time: 0.2648  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.8833 (0.7963)  acc1: 82.8000 (84.0480)  acc5: 96.4000 (96.7360)  time: 0.2648  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4780 s / it)
* Acc@1 84.378 Acc@5 96.980 loss 0.786
Accuracy of the model on the 50000 test images: 84.4%
Max accuracy: 84.38%
Epoch: [264]  [   0/1251]  eta: 1:01:22  lr: 0.000162  min_lr: 0.000162  loss: 2.8393 (2.8393)  weight_decay: 0.0500 (0.0500)  time: 2.9440  data: 2.4185  max mem: 43713
Epoch: [264]  [ 200/1251]  eta: 0:09:22  lr: 0.000160  min_lr: 0.000160  loss: 2.8133 (2.6447)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1902 (1.4337)  time: 0.5231  data: 0.0005  max mem: 43713
Epoch: [264]  [ 400/1251]  eta: 0:07:31  lr: 0.000159  min_lr: 0.000159  loss: 2.8942 (2.6381)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5585 (1.6522)  time: 0.5224  data: 0.0005  max mem: 43713
Epoch: [264]  [ 600/1251]  eta: 0:05:44  lr: 0.000158  min_lr: 0.000158  loss: 2.7041 (2.6296)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2509 (1.6342)  time: 0.5230  data: 0.0005  max mem: 43713
Epoch: [264]  [ 800/1251]  eta: 0:03:57  lr: 0.000156  min_lr: 0.000156  loss: 2.8047 (2.6343)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1301 (1.6259)  time: 0.5226  data: 0.0005  max mem: 43713
Epoch: [264]  [1000/1251]  eta: 0:02:12  lr: 0.000155  min_lr: 0.000155  loss: 2.5114 (2.6313)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7961 (1.6475)  time: 0.5352  data: 0.0005  max mem: 43713
Epoch: [264]  [1200/1251]  eta: 0:00:26  lr: 0.000154  min_lr: 0.000154  loss: 2.6636 (2.6330)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5273 (1.7612)  time: 0.5224  data: 0.0005  max mem: 43713
Epoch: [264]  [1250/1251]  eta: 0:00:00  lr: 0.000153  min_lr: 0.000153  loss: 2.8355 (2.6355)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6388 (1.8038)  time: 0.4434  data: 0.0007  max mem: 43713
Epoch: [264] Total time: 0:10:57 (0.5255 s / it)
Averaged stats: lr: 0.000153  min_lr: 0.000153  loss: 2.8355 (2.6364)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6388 (1.8038)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.5735 (0.5735)  acc1: 91.6000 (91.6000)  acc5: 99.6000 (99.6000)  time: 5.6331  data: 5.3420  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7138 (0.7275)  acc1: 87.6000 (87.8909)  acc5: 98.0000 (98.0727)  time: 0.7527  data: 0.4859  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8919 (0.8565)  acc1: 82.4000 (84.7238)  acc5: 96.4000 (96.8571)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9752 (0.8738)  acc1: 82.0000 (84.2880)  acc5: 96.0000 (96.7840)  time: 0.2644  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4851 s / it)
* Acc@1 84.376 Acc@5 96.938 loss 0.866
Accuracy of the model on the 50000 test images: 84.4%
Max accuracy: 84.38%
Epoch: [265]  [   0/1251]  eta: 1:10:38  lr: 0.000153  min_lr: 0.000153  loss: 2.4800 (2.4800)  weight_decay: 0.0500 (0.0500)  time: 3.3878  data: 2.5366  max mem: 43713
Epoch: [265]  [ 200/1251]  eta: 0:09:26  lr: 0.000152  min_lr: 0.000152  loss: 2.5925 (2.5948)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1428 (1.8977)  time: 0.5225  data: 0.0005  max mem: 43713
Epoch: [265]  [ 400/1251]  eta: 0:07:33  lr: 0.000150  min_lr: 0.000150  loss: 2.8112 (2.5980)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1435 (1.6427)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [265]  [ 600/1251]  eta: 0:05:44  lr: 0.000149  min_lr: 0.000149  loss: 2.9501 (2.6149)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1016 (1.5330)  time: 0.5234  data: 0.0004  max mem: 43713
Epoch: [265]  [ 800/1251]  eta: 0:03:58  lr: 0.000148  min_lr: 0.000148  loss: 2.9097 (2.6250)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4122 (1.5034)  time: 0.5324  data: 0.0004  max mem: 43713
Epoch: [265]  [1000/1251]  eta: 0:02:12  lr: 0.000146  min_lr: 0.000146  loss: 2.7144 (2.6253)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2035 (1.4986)  time: 0.5224  data: 0.0005  max mem: 43713
Epoch: [265]  [1200/1251]  eta: 0:00:26  lr: 0.000145  min_lr: 0.000145  loss: 2.6555 (2.6294)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4182 (1.5442)  time: 0.5276  data: 0.0004  max mem: 43713
Epoch: [265]  [1250/1251]  eta: 0:00:00  lr: 0.000145  min_lr: 0.000145  loss: 2.7273 (2.6263)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4182 (1.5381)  time: 0.4491  data: 0.0005  max mem: 43713
Epoch: [265] Total time: 0:10:58 (0.5264 s / it)
Averaged stats: lr: 0.000145  min_lr: 0.000145  loss: 2.7273 (2.6247)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4182 (1.5381)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.5976 (0.5976)  acc1: 90.8000 (90.8000)  acc5: 100.0000 (100.0000)  time: 5.5833  data: 5.2870  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7601 (0.7523)  acc1: 86.8000 (87.2364)  acc5: 97.6000 (98.1091)  time: 0.7475  data: 0.4810  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9135 (0.8719)  acc1: 82.8000 (84.3048)  acc5: 96.8000 (96.8952)  time: 0.2638  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9498 (0.8864)  acc1: 82.8000 (83.9200)  acc5: 96.4000 (96.8960)  time: 0.2636  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4801 s / it)
* Acc@1 84.166 Acc@5 96.946 loss 0.879
Accuracy of the model on the 50000 test images: 84.2%
Max accuracy: 84.38%
Epoch: [266]  [   0/1251]  eta: 1:14:29  lr: 0.000145  min_lr: 0.000145  loss: 2.6536 (2.6536)  weight_decay: 0.0500 (0.0500)  time: 3.5725  data: 2.6667  max mem: 43713
Epoch: [266]  [ 200/1251]  eta: 0:09:29  lr: 0.000143  min_lr: 0.000143  loss: 2.7975 (2.5861)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2893 (1.3682)  time: 0.5302  data: 0.0004  max mem: 43713
Epoch: [266]  [ 400/1251]  eta: 0:07:33  lr: 0.000142  min_lr: 0.000142  loss: 2.7158 (2.5778)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3333 (1.3754)  time: 0.5236  data: 0.0004  max mem: 43713
Epoch: [266]  [ 600/1251]  eta: 0:05:45  lr: 0.000141  min_lr: 0.000141  loss: 2.7912 (2.5950)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2154 (1.4355)  time: 0.5223  data: 0.0004  max mem: 43713
Epoch: [266]  [ 800/1251]  eta: 0:03:58  lr: 0.000139  min_lr: 0.000139  loss: 2.6867 (2.6019)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0843 (1.3965)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [266]  [1000/1251]  eta: 0:02:12  lr: 0.000138  min_lr: 0.000138  loss: 2.6122 (2.6078)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2499 (1.3834)  time: 0.5303  data: 0.0004  max mem: 43713
Epoch: [266]  [1200/1251]  eta: 0:00:26  lr: 0.000137  min_lr: 0.000137  loss: 2.7184 (2.6115)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0541 (1.3730)  time: 0.5397  data: 0.0004  max mem: 43713
Epoch: [266]  [1250/1251]  eta: 0:00:00  lr: 0.000137  min_lr: 0.000137  loss: 2.7379 (2.6142)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2576 (1.3925)  time: 0.4438  data: 0.0005  max mem: 43713
Epoch: [266] Total time: 0:10:59 (0.5269 s / it)
Averaged stats: lr: 0.000137  min_lr: 0.000137  loss: 2.7379 (2.6289)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2576 (1.3925)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6059 (0.6059)  acc1: 90.4000 (90.4000)  acc5: 99.6000 (99.6000)  time: 5.4882  data: 5.1942  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7419 (0.7508)  acc1: 87.2000 (87.3455)  acc5: 97.6000 (97.9636)  time: 0.7397  data: 0.4726  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9054 (0.8698)  acc1: 82.0000 (84.3429)  acc5: 96.4000 (96.8571)  time: 0.2646  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9595 (0.8856)  acc1: 81.6000 (83.9840)  acc5: 96.4000 (96.8000)  time: 0.2645  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4793 s / it)
* Acc@1 84.244 Acc@5 96.936 loss 0.879
Accuracy of the model on the 50000 test images: 84.2%
Max accuracy: 84.38%
Epoch: [267]  [   0/1251]  eta: 1:10:08  lr: 0.000136  min_lr: 0.000136  loss: 3.0117 (3.0117)  weight_decay: 0.0500 (0.0500)  time: 3.3645  data: 2.3547  max mem: 43713
Epoch: [267]  [ 200/1251]  eta: 0:09:25  lr: 0.000135  min_lr: 0.000135  loss: 2.6505 (2.5301)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1584 (1.3184)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [267]  [ 400/1251]  eta: 0:07:32  lr: 0.000134  min_lr: 0.000134  loss: 2.8835 (2.5540)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9808 (1.3871)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [267]  [ 600/1251]  eta: 0:05:44  lr: 0.000133  min_lr: 0.000133  loss: 2.7453 (2.5826)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4603 (1.5048)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [267]  [ 800/1251]  eta: 0:03:58  lr: 0.000131  min_lr: 0.000131  loss: 2.7260 (2.6026)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1697 (1.4686)  time: 0.5230  data: 0.0005  max mem: 43713
Epoch: [267]  [1000/1251]  eta: 0:02:12  lr: 0.000130  min_lr: 0.000130  loss: 2.7093 (2.6139)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0592 (1.4360)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [267]  [1200/1251]  eta: 0:00:26  lr: 0.000129  min_lr: 0.000129  loss: 2.6051 (2.6120)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5824 (1.4948)  time: 0.5228  data: 0.0005  max mem: 43713
Epoch: [267]  [1250/1251]  eta: 0:00:00  lr: 0.000129  min_lr: 0.000129  loss: 2.6029 (2.6103)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3283 (1.4899)  time: 0.4437  data: 0.0007  max mem: 43713
Epoch: [267] Total time: 0:10:57 (0.5258 s / it)
Averaged stats: lr: 0.000129  min_lr: 0.000129  loss: 2.6029 (2.6180)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3283 (1.4899)
Test:  [ 0/25]  eta: 0:02:11  loss: 0.5302 (0.5302)  acc1: 91.2000 (91.2000)  acc5: 99.6000 (99.6000)  time: 5.2614  data: 4.9516  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.6792 (0.6852)  acc1: 86.8000 (87.2727)  acc5: 98.0000 (98.1091)  time: 0.7184  data: 0.4504  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8457 (0.8031)  acc1: 82.4000 (84.3048)  acc5: 96.8000 (96.9524)  time: 0.2640  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9001 (0.8183)  acc1: 82.0000 (83.9360)  acc5: 96.4000 (96.8800)  time: 0.2640  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4718 s / it)
* Acc@1 84.266 Acc@5 96.912 loss 0.809
Accuracy of the model on the 50000 test images: 84.3%
Max accuracy: 84.38%
Epoch: [268]  [   0/1251]  eta: 1:14:14  lr: 0.000128  min_lr: 0.000128  loss: 2.6784 (2.6784)  weight_decay: 0.0500 (0.0500)  time: 3.5604  data: 1.6497  max mem: 43713
Epoch: [268]  [ 200/1251]  eta: 0:09:27  lr: 0.000127  min_lr: 0.000127  loss: 2.5999 (2.6707)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2254 (1.5155)  time: 0.5311  data: 0.0004  max mem: 43713
Epoch: [268]  [ 400/1251]  eta: 0:07:34  lr: 0.000126  min_lr: 0.000126  loss: 2.7089 (2.6517)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3996 (1.5784)  time: 0.5342  data: 0.0004  max mem: 43713
Epoch: [268]  [ 600/1251]  eta: 0:05:45  lr: 0.000125  min_lr: 0.000125  loss: 2.8631 (2.6512)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4993 (1.6344)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [268]  [ 800/1251]  eta: 0:03:58  lr: 0.000123  min_lr: 0.000123  loss: 2.5957 (2.6376)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1563 (1.5750)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [268]  [1000/1251]  eta: 0:02:12  lr: 0.000122  min_lr: 0.000122  loss: 2.6203 (2.6339)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3255 (1.5647)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [268]  [1200/1251]  eta: 0:00:26  lr: 0.000121  min_lr: 0.000121  loss: 2.7411 (2.6364)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6861 (1.5691)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [268]  [1250/1251]  eta: 0:00:00  lr: 0.000121  min_lr: 0.000121  loss: 2.5935 (2.6350)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3662 (1.5657)  time: 0.4436  data: 0.0005  max mem: 43713
Epoch: [268] Total time: 0:10:58 (0.5265 s / it)
Averaged stats: lr: 0.000121  min_lr: 0.000121  loss: 2.5935 (2.6170)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3662 (1.5657)
Test:  [ 0/25]  eta: 0:01:52  loss: 0.5277 (0.5277)  acc1: 91.2000 (91.2000)  acc5: 99.6000 (99.6000)  time: 4.5148  data: 4.2187  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.6793 (0.6793)  acc1: 87.2000 (87.3091)  acc5: 97.6000 (98.0364)  time: 0.7364  data: 0.4693  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8294 (0.7988)  acc1: 82.8000 (84.5143)  acc5: 97.2000 (97.0095)  time: 0.3117  data: 0.0472  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.8947 (0.8138)  acc1: 82.8000 (84.0640)  acc5: 96.8000 (96.9920)  time: 0.2647  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4762 s / it)
* Acc@1 84.270 Acc@5 97.008 loss 0.808
Accuracy of the model on the 50000 test images: 84.3%
Max accuracy: 84.38%
Epoch: [269]  [   0/1251]  eta: 1:15:10  lr: 0.000121  min_lr: 0.000121  loss: 2.1144 (2.1144)  weight_decay: 0.0500 (0.0500)  time: 3.6053  data: 2.4698  max mem: 43713
Epoch: [269]  [ 200/1251]  eta: 0:09:29  lr: 0.000120  min_lr: 0.000120  loss: 2.8157 (2.6331)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4811 (1.6377)  time: 0.5254  data: 0.0005  max mem: 43713
Epoch: [269]  [ 400/1251]  eta: 0:07:33  lr: 0.000118  min_lr: 0.000118  loss: 2.7327 (2.6255)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2940 (1.5305)  time: 0.5237  data: 0.0005  max mem: 43713
Epoch: [269]  [ 600/1251]  eta: 0:05:45  lr: 0.000117  min_lr: 0.000117  loss: 2.6769 (2.6106)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4756 (1.6235)  time: 0.5238  data: 0.0004  max mem: 43713
Epoch: [269]  [ 800/1251]  eta: 0:03:58  lr: 0.000116  min_lr: 0.000116  loss: 2.7541 (2.6126)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2010 (1.5816)  time: 0.5264  data: 0.0004  max mem: 43713
Epoch: [269]  [1000/1251]  eta: 0:02:12  lr: 0.000115  min_lr: 0.000115  loss: 2.7830 (2.6115)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3307 (1.6062)  time: 0.5253  data: 0.0004  max mem: 43713
Epoch: [269]  [1200/1251]  eta: 0:00:26  lr: 0.000113  min_lr: 0.000113  loss: 2.5874 (2.6074)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2735 (1.6038)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [269]  [1250/1251]  eta: 0:00:00  lr: 0.000113  min_lr: 0.000113  loss: 2.8398 (2.6062)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1410 (1.5974)  time: 0.4536  data: 0.0007  max mem: 43713
Epoch: [269] Total time: 0:10:59 (0.5272 s / it)
Averaged stats: lr: 0.000113  min_lr: 0.000113  loss: 2.8398 (2.6092)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1410 (1.5974)
Test:  [ 0/25]  eta: 0:02:05  loss: 0.5608 (0.5608)  acc1: 90.4000 (90.4000)  acc5: 99.6000 (99.6000)  time: 5.0366  data: 4.7270  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.6993 (0.7113)  acc1: 87.2000 (87.4909)  acc5: 97.6000 (98.0364)  time: 0.7081  data: 0.4397  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8680 (0.8325)  acc1: 82.8000 (84.4762)  acc5: 97.2000 (96.9524)  time: 0.2698  data: 0.0055  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9234 (0.8484)  acc1: 82.4000 (84.0160)  acc5: 96.4000 (96.8480)  time: 0.2644  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4628 s / it)
* Acc@1 84.276 Acc@5 96.938 loss 0.841
Accuracy of the model on the 50000 test images: 84.3%
Max accuracy: 84.38%
Epoch: [270]  [   0/1251]  eta: 1:15:32  lr: 0.000113  min_lr: 0.000113  loss: 2.5893 (2.5893)  weight_decay: 0.0500 (0.0500)  time: 3.6228  data: 2.7542  max mem: 43713
Epoch: [270]  [ 200/1251]  eta: 0:09:29  lr: 0.000112  min_lr: 0.000112  loss: 2.7322 (2.6301)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5173 (1.8109)  time: 0.5233  data: 0.0005  max mem: 43713
Epoch: [270]  [ 400/1251]  eta: 0:07:33  lr: 0.000111  min_lr: 0.000111  loss: 2.6750 (2.6392)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5905 (1.7115)  time: 0.5321  data: 0.0004  max mem: 43713
Epoch: [270]  [ 600/1251]  eta: 0:05:45  lr: 0.000110  min_lr: 0.000110  loss: 2.6117 (2.6227)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1568 (1.6341)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [270]  [ 800/1251]  eta: 0:03:58  lr: 0.000109  min_lr: 0.000109  loss: 2.5714 (2.6180)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5672 (1.7182)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [270]  [1000/1251]  eta: 0:02:12  lr: 0.000107  min_lr: 0.000107  loss: 2.7369 (2.6209)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3443 (1.7375)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [270]  [1200/1251]  eta: 0:00:26  lr: 0.000106  min_lr: 0.000106  loss: 2.5935 (2.6176)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5482 (1.7388)  time: 0.5223  data: 0.0005  max mem: 43713
Epoch: [270]  [1250/1251]  eta: 0:00:00  lr: 0.000106  min_lr: 0.000106  loss: 2.7018 (2.6203)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2958 (1.7509)  time: 0.4434  data: 0.0007  max mem: 43713
Epoch: [270] Total time: 0:10:59 (0.5270 s / it)
Averaged stats: lr: 0.000106  min_lr: 0.000106  loss: 2.7018 (2.6128)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2958 (1.7509)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6047 (0.6047)  acc1: 91.6000 (91.6000)  acc5: 99.6000 (99.6000)  time: 5.5185  data: 5.2302  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7522 (0.7642)  acc1: 87.2000 (87.4182)  acc5: 97.6000 (97.9273)  time: 0.7421  data: 0.4758  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9144 (0.8823)  acc1: 82.8000 (84.4000)  acc5: 96.8000 (96.9524)  time: 0.2642  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9716 (0.8983)  acc1: 82.8000 (83.9840)  acc5: 96.4000 (96.8800)  time: 0.2641  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4787 s / it)
* Acc@1 84.270 Acc@5 96.888 loss 0.889
Accuracy of the model on the 50000 test images: 84.3%
Max accuracy: 84.38%
Epoch: [271]  [   0/1251]  eta: 1:14:51  lr: 0.000106  min_lr: 0.000106  loss: 2.9401 (2.9401)  weight_decay: 0.0500 (0.0500)  time: 3.5902  data: 2.7252  max mem: 43713
Epoch: [271]  [ 200/1251]  eta: 0:09:26  lr: 0.000105  min_lr: 0.000105  loss: 2.7129 (2.5992)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3409 (1.6586)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [271]  [ 400/1251]  eta: 0:07:34  lr: 0.000104  min_lr: 0.000104  loss: 2.8250 (2.6162)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4529 (1.6491)  time: 0.5413  data: 0.0005  max mem: 43713
Epoch: [271]  [ 600/1251]  eta: 0:05:45  lr: 0.000102  min_lr: 0.000102  loss: 2.6970 (2.6071)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2720 (1.5762)  time: 0.5238  data: 0.0005  max mem: 43713
Epoch: [271]  [ 800/1251]  eta: 0:03:58  lr: 0.000101  min_lr: 0.000101  loss: 2.8595 (2.6109)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4139 (1.5954)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [271]  [1000/1251]  eta: 0:02:12  lr: 0.000100  min_lr: 0.000100  loss: 2.4784 (2.6139)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5387 (1.5883)  time: 0.5293  data: 0.0005  max mem: 43713
Epoch: [271]  [1200/1251]  eta: 0:00:26  lr: 0.000099  min_lr: 0.000099  loss: 2.6458 (2.6109)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1678 (1.5960)  time: 0.5233  data: 0.0005  max mem: 43713
Epoch: [271]  [1250/1251]  eta: 0:00:00  lr: 0.000099  min_lr: 0.000099  loss: 2.7497 (2.6119)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2019 (1.5912)  time: 0.4437  data: 0.0005  max mem: 43713
Epoch: [271] Total time: 0:10:59 (0.5269 s / it)
Averaged stats: lr: 0.000099  min_lr: 0.000099  loss: 2.7497 (2.6097)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2019 (1.5912)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.6053 (0.6053)  acc1: 91.6000 (91.6000)  acc5: 99.6000 (99.6000)  time: 5.3631  data: 5.0393  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7362 (0.7591)  acc1: 87.6000 (87.6727)  acc5: 98.4000 (98.1091)  time: 0.7282  data: 0.4584  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9090 (0.8744)  acc1: 82.8000 (84.4381)  acc5: 97.2000 (96.9714)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9619 (0.8901)  acc1: 82.8000 (84.0800)  acc5: 96.0000 (96.9280)  time: 0.2644  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4744 s / it)
* Acc@1 84.280 Acc@5 96.856 loss 0.884
Accuracy of the model on the 50000 test images: 84.3%
Max accuracy: 84.38%
Epoch: [272]  [   0/1251]  eta: 1:15:02  lr: 0.000099  min_lr: 0.000099  loss: 2.2634 (2.2634)  weight_decay: 0.0500 (0.0500)  time: 3.5988  data: 2.9757  max mem: 43713
Epoch: [272]  [ 200/1251]  eta: 0:09:29  lr: 0.000098  min_lr: 0.000098  loss: 2.6294 (2.5851)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5991 (1.8985)  time: 0.5314  data: 0.0005  max mem: 43713
Epoch: [272]  [ 400/1251]  eta: 0:07:34  lr: 0.000097  min_lr: 0.000097  loss: 2.8247 (2.6153)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3473 (1.7142)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [272]  [ 600/1251]  eta: 0:05:45  lr: 0.000096  min_lr: 0.000096  loss: 2.2634 (2.5897)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3951 (1.6565)  time: 0.5236  data: 0.0004  max mem: 43713
Epoch: [272]  [ 800/1251]  eta: 0:03:58  lr: 0.000094  min_lr: 0.000094  loss: 2.4249 (2.5923)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2195 (1.7250)  time: 0.5230  data: 0.0005  max mem: 43713
Epoch: [272]  [1000/1251]  eta: 0:02:12  lr: 0.000093  min_lr: 0.000093  loss: 2.7130 (2.5966)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2933 (nan)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [272]  [1200/1251]  eta: 0:00:26  lr: 0.000092  min_lr: 0.000092  loss: 2.6627 (2.6045)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4163 (nan)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [272]  [1250/1251]  eta: 0:00:00  lr: 0.000092  min_lr: 0.000092  loss: 2.5784 (2.6016)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2764 (nan)  time: 0.4435  data: 0.0006  max mem: 43713
Epoch: [272] Total time: 0:10:58 (0.5265 s / it)
Averaged stats: lr: 0.000092  min_lr: 0.000092  loss: 2.5784 (2.5952)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2764 (nan)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.5251 (0.5251)  acc1: 91.2000 (91.2000)  acc5: 99.6000 (99.6000)  time: 5.6381  data: 5.3340  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.6701 (0.6819)  acc1: 87.2000 (87.6727)  acc5: 97.6000 (98.0000)  time: 0.7534  data: 0.4852  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8340 (0.7992)  acc1: 83.2000 (84.3429)  acc5: 97.2000 (97.0667)  time: 0.2650  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.8992 (0.8171)  acc1: 81.6000 (83.9360)  acc5: 96.8000 (96.9920)  time: 0.2651  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4847 s / it)
* Acc@1 84.380 Acc@5 96.942 loss 0.809
Accuracy of the model on the 50000 test images: 84.4%
Max accuracy: 84.38%
Epoch: [273]  [   0/1251]  eta: 1:02:07  lr: 0.000092  min_lr: 0.000092  loss: 2.6520 (2.6520)  weight_decay: 0.0500 (0.0500)  time: 2.9799  data: 2.4440  max mem: 43713
Epoch: [273]  [ 200/1251]  eta: 0:09:26  lr: 0.000091  min_lr: 0.000091  loss: 2.7788 (2.6133)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2085 (1.6919)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [273]  [ 400/1251]  eta: 0:07:32  lr: 0.000090  min_lr: 0.000090  loss: 2.4261 (2.5835)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3571 (1.7098)  time: 0.5240  data: 0.0005  max mem: 43713
Epoch: [273]  [ 600/1251]  eta: 0:05:45  lr: 0.000089  min_lr: 0.000089  loss: 2.5965 (2.5860)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2952 (1.6499)  time: 0.5447  data: 0.0004  max mem: 43713
Epoch: [273]  [ 800/1251]  eta: 0:03:58  lr: 0.000088  min_lr: 0.000088  loss: 2.5692 (2.5799)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3781 (1.6898)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [273]  [1000/1251]  eta: 0:02:12  lr: 0.000087  min_lr: 0.000087  loss: 2.3404 (2.5768)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2457 (1.6541)  time: 0.5303  data: 0.0004  max mem: 43713
Epoch: [273]  [1200/1251]  eta: 0:00:26  lr: 0.000086  min_lr: 0.000086  loss: 2.5999 (2.5786)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2040 (1.6245)  time: 0.5231  data: 0.0004  max mem: 43713
Epoch: [273]  [1250/1251]  eta: 0:00:00  lr: 0.000085  min_lr: 0.000085  loss: 2.4029 (2.5775)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2688 (1.6258)  time: 0.4434  data: 0.0006  max mem: 43713
Epoch: [273] Total time: 0:10:59 (0.5268 s / it)
Averaged stats: lr: 0.000085  min_lr: 0.000085  loss: 2.4029 (2.6035)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2688 (1.6258)
Test:  [ 0/25]  eta: 0:02:10  loss: 0.5217 (0.5217)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 5.2219  data: 4.9278  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.6680 (0.6791)  acc1: 87.6000 (87.3091)  acc5: 97.6000 (98.0364)  time: 0.7153  data: 0.4483  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8342 (0.7947)  acc1: 83.2000 (84.2857)  acc5: 97.2000 (96.9524)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.8910 (0.8120)  acc1: 82.0000 (83.8720)  acc5: 96.8000 (96.8960)  time: 0.2645  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4688 s / it)
* Acc@1 84.390 Acc@5 96.922 loss 0.804
Accuracy of the model on the 50000 test images: 84.4%
Max accuracy: 84.39%
Epoch: [274]  [   0/1251]  eta: 0:54:20  lr: 0.000085  min_lr: 0.000085  loss: 2.9678 (2.9678)  weight_decay: 0.0500 (0.0500)  time: 2.6061  data: 2.0681  max mem: 43713
Epoch: [274]  [ 200/1251]  eta: 0:09:22  lr: 0.000084  min_lr: 0.000084  loss: 2.5803 (2.6046)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7431 (1.5528)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [274]  [ 400/1251]  eta: 0:07:30  lr: 0.000083  min_lr: 0.000083  loss: 2.5828 (2.5875)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3637 (1.6553)  time: 0.5239  data: 0.0005  max mem: 43713
Epoch: [274]  [ 600/1251]  eta: 0:05:43  lr: 0.000082  min_lr: 0.000082  loss: 2.4609 (2.6015)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1659 (1.6620)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [274]  [ 800/1251]  eta: 0:03:57  lr: 0.000081  min_lr: 0.000081  loss: 2.6565 (2.6005)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3345 (1.6692)  time: 0.5236  data: 0.0005  max mem: 43713
Epoch: [274]  [1000/1251]  eta: 0:02:12  lr: 0.000080  min_lr: 0.000080  loss: 2.3989 (2.5903)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3909 (1.5979)  time: 0.5228  data: 0.0005  max mem: 43713
Epoch: [274]  [1200/1251]  eta: 0:00:26  lr: 0.000079  min_lr: 0.000079  loss: 2.6701 (2.5938)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6033 (1.6358)  time: 0.5238  data: 0.0005  max mem: 43713
Epoch: [274]  [1250/1251]  eta: 0:00:00  lr: 0.000079  min_lr: 0.000079  loss: 2.7678 (2.5954)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4791 (1.6362)  time: 0.4439  data: 0.0005  max mem: 43713
Epoch: [274] Total time: 0:10:57 (0.5254 s / it)
Averaged stats: lr: 0.000079  min_lr: 0.000079  loss: 2.7678 (2.5989)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4791 (1.6362)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.5500 (0.5500)  acc1: 91.6000 (91.6000)  acc5: 99.6000 (99.6000)  time: 5.4147  data: 5.1102  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.6983 (0.7109)  acc1: 87.2000 (87.5273)  acc5: 97.6000 (98.0364)  time: 0.7323  data: 0.4648  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8711 (0.8232)  acc1: 82.8000 (84.4571)  acc5: 96.8000 (97.0095)  time: 0.2639  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9127 (0.8394)  acc1: 82.8000 (83.9520)  acc5: 96.4000 (96.9440)  time: 0.2638  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4752 s / it)
* Acc@1 84.414 Acc@5 96.920 loss 0.831
Accuracy of the model on the 50000 test images: 84.4%
Max accuracy: 84.41%
Epoch: [275]  [   0/1251]  eta: 0:59:11  lr: 0.000079  min_lr: 0.000079  loss: 2.3653 (2.3653)  weight_decay: 0.0500 (0.0500)  time: 2.8386  data: 2.3019  max mem: 43713
Epoch: [275]  [ 200/1251]  eta: 0:09:22  lr: 0.000078  min_lr: 0.000078  loss: 2.6753 (2.6103)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0566 (1.3003)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [275]  [ 400/1251]  eta: 0:07:32  lr: 0.000077  min_lr: 0.000077  loss: 2.6575 (2.6253)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6591 (1.4820)  time: 0.5232  data: 0.0004  max mem: 43713
Epoch: [275]  [ 600/1251]  eta: 0:05:44  lr: 0.000076  min_lr: 0.000076  loss: 2.6969 (2.6218)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3718 (1.5065)  time: 0.5230  data: 0.0005  max mem: 43713
Epoch: [275]  [ 800/1251]  eta: 0:03:57  lr: 0.000075  min_lr: 0.000075  loss: 2.8162 (2.6105)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2627 (1.4726)  time: 0.5300  data: 0.0005  max mem: 43713
Epoch: [275]  [1000/1251]  eta: 0:02:12  lr: 0.000074  min_lr: 0.000074  loss: 2.7416 (2.6085)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5381 (1.4869)  time: 0.5242  data: 0.0005  max mem: 43713
Epoch: [275]  [1200/1251]  eta: 0:00:26  lr: 0.000073  min_lr: 0.000073  loss: 2.8044 (2.6077)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1716 (1.4893)  time: 0.5302  data: 0.0005  max mem: 43713
Epoch: [275]  [1250/1251]  eta: 0:00:00  lr: 0.000073  min_lr: 0.000073  loss: 2.7615 (2.6095)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0731 (1.4782)  time: 0.4437  data: 0.0006  max mem: 43713
Epoch: [275] Total time: 0:10:57 (0.5260 s / it)
Averaged stats: lr: 0.000073  min_lr: 0.000073  loss: 2.7615 (2.6019)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0731 (1.4782)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.5799 (0.5799)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 5.4120  data: 5.0864  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7128 (0.7286)  acc1: 88.0000 (87.4909)  acc5: 98.0000 (98.1091)  time: 0.7339  data: 0.4627  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8816 (0.8427)  acc1: 82.8000 (84.4762)  acc5: 97.2000 (97.0476)  time: 0.2658  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9364 (0.8595)  acc1: 82.8000 (84.0800)  acc5: 96.4000 (96.9600)  time: 0.2654  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4756 s / it)
* Acc@1 84.346 Acc@5 96.878 loss 0.852
Accuracy of the model on the 50000 test images: 84.3%
Max accuracy: 84.41%
Epoch: [276]  [   0/1251]  eta: 1:09:58  lr: 0.000073  min_lr: 0.000073  loss: 3.1801 (3.1801)  weight_decay: 0.0500 (0.0500)  time: 3.3561  data: 2.6969  max mem: 43713
Epoch: [276]  [ 200/1251]  eta: 0:09:30  lr: 0.000072  min_lr: 0.000072  loss: 2.6016 (2.5638)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3824 (1.7189)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [276]  [ 400/1251]  eta: 0:07:33  lr: 0.000071  min_lr: 0.000071  loss: 2.5631 (2.5799)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.5237  data: 0.0005  max mem: 43713
Epoch: [276]  [ 600/1251]  eta: 0:05:44  lr: 0.000070  min_lr: 0.000070  loss: 2.7480 (2.5839)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4769 (nan)  time: 0.5231  data: 0.0005  max mem: 43713
Epoch: [276]  [ 800/1251]  eta: 0:03:58  lr: 0.000069  min_lr: 0.000069  loss: 2.4722 (2.5867)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3122 (nan)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [276]  [1000/1251]  eta: 0:02:12  lr: 0.000068  min_lr: 0.000068  loss: 2.6250 (2.5879)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2205 (nan)  time: 0.5237  data: 0.0004  max mem: 43713
Epoch: [276]  [1200/1251]  eta: 0:00:26  lr: 0.000067  min_lr: 0.000067  loss: 2.7882 (2.5954)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3317 (nan)  time: 0.5360  data: 0.0004  max mem: 43713
Epoch: [276]  [1250/1251]  eta: 0:00:00  lr: 0.000067  min_lr: 0.000067  loss: 2.6841 (2.5961)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3570 (nan)  time: 0.4439  data: 0.0007  max mem: 43713
Epoch: [276] Total time: 0:10:58 (0.5266 s / it)
Averaged stats: lr: 0.000067  min_lr: 0.000067  loss: 2.6841 (2.5979)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3570 (nan)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.5477 (0.5477)  acc1: 91.6000 (91.6000)  acc5: 99.6000 (99.6000)  time: 5.6314  data: 5.3003  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.6846 (0.7028)  acc1: 87.2000 (87.3818)  acc5: 98.0000 (98.1455)  time: 0.7528  data: 0.4821  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8694 (0.8171)  acc1: 82.8000 (84.5905)  acc5: 96.8000 (97.0286)  time: 0.2650  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9106 (0.8335)  acc1: 82.4000 (84.0800)  acc5: 96.4000 (96.9600)  time: 0.2650  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4851 s / it)
* Acc@1 84.394 Acc@5 96.914 loss 0.825
Accuracy of the model on the 50000 test images: 84.4%
Max accuracy: 84.41%
Epoch: [277]  [   0/1251]  eta: 1:16:41  lr: 0.000067  min_lr: 0.000067  loss: 1.9361 (1.9361)  weight_decay: 0.0500 (0.0500)  time: 3.6781  data: 3.0790  max mem: 43713
Epoch: [277]  [ 200/1251]  eta: 0:09:27  lr: 0.000066  min_lr: 0.000066  loss: 2.5559 (2.6062)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2617 (1.5447)  time: 0.5244  data: 0.0005  max mem: 43713
Epoch: [277]  [ 400/1251]  eta: 0:07:33  lr: 0.000065  min_lr: 0.000065  loss: 2.4314 (2.6150)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7441 (1.7456)  time: 0.5235  data: 0.0004  max mem: 43713
Epoch: [277]  [ 600/1251]  eta: 0:05:45  lr: 0.000064  min_lr: 0.000064  loss: 2.6611 (2.6121)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2727 (1.7385)  time: 0.5234  data: 0.0004  max mem: 43713
Epoch: [277]  [ 800/1251]  eta: 0:03:58  lr: 0.000064  min_lr: 0.000064  loss: 2.7837 (2.6086)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4813 (1.7015)  time: 0.5223  data: 0.0004  max mem: 43713
Epoch: [277]  [1000/1251]  eta: 0:02:12  lr: 0.000063  min_lr: 0.000063  loss: 2.5555 (2.5962)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5960 (1.6920)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [277]  [1200/1251]  eta: 0:00:26  lr: 0.000062  min_lr: 0.000062  loss: 2.8365 (2.5989)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3659 (1.6706)  time: 0.5226  data: 0.0005  max mem: 43713
Epoch: [277]  [1250/1251]  eta: 0:00:00  lr: 0.000062  min_lr: 0.000062  loss: 2.4901 (2.5987)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4006 (1.6609)  time: 0.4434  data: 0.0007  max mem: 43713
Epoch: [277] Total time: 0:10:58 (0.5267 s / it)
Averaged stats: lr: 0.000062  min_lr: 0.000062  loss: 2.4901 (2.5997)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4006 (1.6609)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.5696 (0.5696)  acc1: 91.2000 (91.2000)  acc5: 99.6000 (99.6000)  time: 5.6602  data: 5.3575  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7095 (0.7219)  acc1: 87.2000 (87.3818)  acc5: 97.6000 (98.0364)  time: 0.7551  data: 0.4873  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8847 (0.8366)  acc1: 82.8000 (84.6476)  acc5: 97.2000 (96.9714)  time: 0.2646  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9345 (0.8531)  acc1: 82.4000 (84.1440)  acc5: 96.8000 (96.9440)  time: 0.2646  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4846 s / it)
* Acc@1 84.368 Acc@5 96.984 loss 0.848
Accuracy of the model on the 50000 test images: 84.4%
Max accuracy: 84.41%
Epoch: [278]  [   0/1251]  eta: 1:10:38  lr: 0.000062  min_lr: 0.000062  loss: 2.9868 (2.9868)  weight_decay: 0.0500 (0.0500)  time: 3.3883  data: 2.5959  max mem: 43713
Epoch: [278]  [ 200/1251]  eta: 0:09:28  lr: 0.000061  min_lr: 0.000061  loss: 2.7284 (2.6230)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3304 (2.1637)  time: 0.5315  data: 0.0005  max mem: 43713
Epoch: [278]  [ 400/1251]  eta: 0:07:34  lr: 0.000060  min_lr: 0.000060  loss: 2.5928 (2.6174)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2092 (1.8922)  time: 0.5332  data: 0.0006  max mem: 43713
Epoch: [278]  [ 600/1251]  eta: 0:05:45  lr: 0.000059  min_lr: 0.000059  loss: 2.5878 (2.5992)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4362 (1.8982)  time: 0.5231  data: 0.0006  max mem: 43713
Epoch: [278]  [ 800/1251]  eta: 0:03:58  lr: 0.000058  min_lr: 0.000058  loss: 2.8146 (2.5969)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2847 (1.7906)  time: 0.5230  data: 0.0006  max mem: 43713
Epoch: [278]  [1000/1251]  eta: 0:02:12  lr: 0.000057  min_lr: 0.000057  loss: 2.8408 (2.6003)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3672 (1.7421)  time: 0.5230  data: 0.0006  max mem: 43713
Epoch: [278]  [1200/1251]  eta: 0:00:26  lr: 0.000056  min_lr: 0.000056  loss: 2.8457 (2.6038)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3684 (1.7052)  time: 0.5231  data: 0.0005  max mem: 43713
Epoch: [278]  [1250/1251]  eta: 0:00:00  lr: 0.000056  min_lr: 0.000056  loss: 2.6441 (2.5995)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3683 (1.7031)  time: 0.4435  data: 0.0007  max mem: 43713
Epoch: [278] Total time: 0:10:58 (0.5267 s / it)
Averaged stats: lr: 0.000056  min_lr: 0.000056  loss: 2.6441 (2.5920)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3683 (1.7031)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.5254 (0.5254)  acc1: 91.6000 (91.6000)  acc5: 99.6000 (99.6000)  time: 5.5259  data: 5.2210  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.6736 (0.6774)  acc1: 87.2000 (87.3818)  acc5: 97.6000 (98.0727)  time: 0.7429  data: 0.4749  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8456 (0.7894)  acc1: 83.2000 (84.5524)  acc5: 97.2000 (97.0286)  time: 0.2647  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.8856 (0.8059)  acc1: 82.8000 (84.1920)  acc5: 97.2000 (97.0240)  time: 0.2648  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4799 s / it)
* Acc@1 84.438 Acc@5 96.932 loss 0.800
Accuracy of the model on the 50000 test images: 84.4%
Max accuracy: 84.44%
Epoch: [279]  [   0/1251]  eta: 0:51:56  lr: 0.000056  min_lr: 0.000056  loss: 2.9016 (2.9016)  weight_decay: 0.0500 (0.0500)  time: 2.4910  data: 1.9559  max mem: 43713
Epoch: [279]  [ 200/1251]  eta: 0:09:24  lr: 0.000055  min_lr: 0.000055  loss: 2.7294 (2.6294)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1461 (1.7015)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [279]  [ 400/1251]  eta: 0:07:31  lr: 0.000055  min_lr: 0.000055  loss: 2.8724 (2.6062)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1146 (1.6628)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [279]  [ 600/1251]  eta: 0:05:43  lr: 0.000054  min_lr: 0.000054  loss: 2.7459 (2.6105)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3433 (1.6118)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [279]  [ 800/1251]  eta: 0:03:58  lr: 0.000053  min_lr: 0.000053  loss: 2.7324 (2.6035)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2098 (1.6294)  time: 0.5234  data: 0.0004  max mem: 43713
Epoch: [279]  [1000/1251]  eta: 0:02:12  lr: 0.000052  min_lr: 0.000052  loss: 2.5275 (2.5965)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4597 (1.6162)  time: 0.5230  data: 0.0005  max mem: 43713
Epoch: [279]  [1200/1251]  eta: 0:00:26  lr: 0.000051  min_lr: 0.000051  loss: 2.5763 (2.5954)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3838 (1.6108)  time: 0.5226  data: 0.0005  max mem: 43713
Epoch: [279]  [1250/1251]  eta: 0:00:00  lr: 0.000051  min_lr: 0.000051  loss: 2.5432 (2.5915)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3550 (1.6021)  time: 0.4494  data: 0.0005  max mem: 43713
Epoch: [279] Total time: 0:10:57 (0.5257 s / it)
Averaged stats: lr: 0.000051  min_lr: 0.000051  loss: 2.5432 (2.5903)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3550 (1.6021)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.5033 (0.5033)  acc1: 90.8000 (90.8000)  acc5: 100.0000 (100.0000)  time: 5.5829  data: 5.2642  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.6392 (0.6523)  acc1: 87.6000 (87.4182)  acc5: 98.0000 (98.1455)  time: 0.7482  data: 0.4789  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8104 (0.7681)  acc1: 83.2000 (84.6095)  acc5: 97.2000 (97.0476)  time: 0.2649  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.8732 (0.7852)  acc1: 82.8000 (84.2400)  acc5: 96.8000 (96.9920)  time: 0.2649  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4823 s / it)
* Acc@1 84.474 Acc@5 96.952 loss 0.777
Accuracy of the model on the 50000 test images: 84.5%
Max accuracy: 84.47%
Epoch: [280]  [   0/1251]  eta: 1:00:28  lr: 0.000051  min_lr: 0.000051  loss: 3.0448 (3.0448)  weight_decay: 0.0500 (0.0500)  time: 2.9004  data: 2.3609  max mem: 43713
Epoch: [280]  [ 200/1251]  eta: 0:09:24  lr: 0.000050  min_lr: 0.000050  loss: 2.7422 (2.6006)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5005 (1.5113)  time: 0.5225  data: 0.0005  max mem: 43713
Epoch: [280]  [ 400/1251]  eta: 0:07:31  lr: 0.000050  min_lr: 0.000050  loss: 2.7844 (2.6058)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0321 (1.4613)  time: 0.5235  data: 0.0004  max mem: 43713
Epoch: [280]  [ 600/1251]  eta: 0:05:44  lr: 0.000049  min_lr: 0.000049  loss: 2.5677 (2.5986)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3849 (1.6052)  time: 0.5226  data: 0.0004  max mem: 43713
Epoch: [280]  [ 800/1251]  eta: 0:03:58  lr: 0.000048  min_lr: 0.000048  loss: 2.7506 (2.5957)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2174 (1.5616)  time: 0.5242  data: 0.0004  max mem: 43713
Epoch: [280]  [1000/1251]  eta: 0:02:12  lr: 0.000047  min_lr: 0.000047  loss: 2.7513 (2.5963)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5416 (1.5564)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [280]  [1200/1251]  eta: 0:00:26  lr: 0.000046  min_lr: 0.000046  loss: 2.7488 (2.5977)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3729 (1.5601)  time: 0.5242  data: 0.0004  max mem: 43713
Epoch: [280]  [1250/1251]  eta: 0:00:00  lr: 0.000046  min_lr: 0.000046  loss: 2.6205 (2.5958)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4335 (1.5603)  time: 0.4492  data: 0.0005  max mem: 43713
Epoch: [280] Total time: 0:10:58 (0.5261 s / it)
Averaged stats: lr: 0.000046  min_lr: 0.000046  loss: 2.6205 (2.5897)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4335 (1.5603)
Test:  [ 0/25]  eta: 0:01:48  loss: 0.5599 (0.5599)  acc1: 90.8000 (90.8000)  acc5: 100.0000 (100.0000)  time: 4.3366  data: 4.0349  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.6971 (0.7069)  acc1: 87.2000 (87.2000)  acc5: 97.6000 (98.0727)  time: 0.6949  data: 0.4274  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8696 (0.8207)  acc1: 82.8000 (84.4191)  acc5: 97.2000 (96.9524)  time: 0.2973  data: 0.0334  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9211 (0.8372)  acc1: 82.4000 (84.0480)  acc5: 96.4000 (96.9440)  time: 0.2641  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4566 s / it)
* Acc@1 84.420 Acc@5 96.966 loss 0.829
Accuracy of the model on the 50000 test images: 84.4%
Max accuracy: 84.47%
Epoch: [281]  [   0/1251]  eta: 1:11:58  lr: 0.000046  min_lr: 0.000046  loss: 2.8046 (2.8046)  weight_decay: 0.0500 (0.0500)  time: 3.4523  data: 2.6346  max mem: 43713
Epoch: [281]  [ 200/1251]  eta: 0:09:25  lr: 0.000046  min_lr: 0.000046  loss: 2.7011 (2.5731)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2830 (1.6689)  time: 0.5231  data: 0.0004  max mem: 43713
Epoch: [281]  [ 400/1251]  eta: 0:07:33  lr: 0.000045  min_lr: 0.000045  loss: 2.5597 (2.5644)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1959 (1.6122)  time: 0.5412  data: 0.0004  max mem: 43713
Epoch: [281]  [ 600/1251]  eta: 0:05:45  lr: 0.000044  min_lr: 0.000044  loss: 2.6620 (2.5773)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7564 (1.6207)  time: 0.5237  data: 0.0004  max mem: 43713
Epoch: [281]  [ 800/1251]  eta: 0:03:58  lr: 0.000043  min_lr: 0.000043  loss: 2.5868 (2.5817)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5756 (1.7077)  time: 0.5228  data: 0.0005  max mem: 43713
Epoch: [281]  [1000/1251]  eta: 0:02:12  lr: 0.000043  min_lr: 0.000043  loss: 2.7223 (2.5861)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3601 (1.6698)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [281]  [1200/1251]  eta: 0:00:26  lr: 0.000042  min_lr: 0.000042  loss: 2.6873 (2.5870)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3411 (1.6555)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [281]  [1250/1251]  eta: 0:00:00  lr: 0.000042  min_lr: 0.000042  loss: 2.5374 (2.5850)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4822 (1.6613)  time: 0.4435  data: 0.0005  max mem: 43713
Epoch: [281] Total time: 0:10:58 (0.5267 s / it)
Averaged stats: lr: 0.000042  min_lr: 0.000042  loss: 2.5374 (2.5796)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4822 (1.6613)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.5261 (0.5261)  acc1: 91.2000 (91.2000)  acc5: 100.0000 (100.0000)  time: 5.3210  data: 5.0116  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.6770 (0.6797)  acc1: 87.2000 (87.4182)  acc5: 98.0000 (98.1091)  time: 0.7243  data: 0.4559  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8454 (0.7934)  acc1: 82.4000 (84.5714)  acc5: 97.2000 (97.1048)  time: 0.2645  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.8888 (0.8110)  acc1: 82.4000 (84.1440)  acc5: 96.8000 (97.0240)  time: 0.2644  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4711 s / it)
* Acc@1 84.412 Acc@5 96.978 loss 0.803
Accuracy of the model on the 50000 test images: 84.4%
Max accuracy: 84.47%
Epoch: [282]  [   0/1251]  eta: 1:13:43  lr: 0.000042  min_lr: 0.000042  loss: 2.5807 (2.5807)  weight_decay: 0.0500 (0.0500)  time: 3.5363  data: 1.6400  max mem: 43713
Epoch: [282]  [ 200/1251]  eta: 0:09:27  lr: 0.000041  min_lr: 0.000041  loss: 2.6728 (2.5200)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3720 (1.4764)  time: 0.5313  data: 0.0004  max mem: 43713
Epoch: [282]  [ 400/1251]  eta: 0:07:33  lr: 0.000040  min_lr: 0.000040  loss: 2.6089 (2.5691)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3494 (1.6124)  time: 0.5230  data: 0.0005  max mem: 43713
Epoch: [282]  [ 600/1251]  eta: 0:05:44  lr: 0.000040  min_lr: 0.000040  loss: 2.6599 (2.5772)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2764 (1.5547)  time: 0.5232  data: 0.0005  max mem: 43713
Epoch: [282]  [ 800/1251]  eta: 0:03:58  lr: 0.000039  min_lr: 0.000039  loss: 2.4541 (2.5761)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4653 (1.5186)  time: 0.5278  data: 0.0005  max mem: 43713
Epoch: [282]  [1000/1251]  eta: 0:02:12  lr: 0.000038  min_lr: 0.000038  loss: 2.4361 (2.5757)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5849 (1.5410)  time: 0.5226  data: 0.0005  max mem: 43713
Epoch: [282]  [1200/1251]  eta: 0:00:26  lr: 0.000037  min_lr: 0.000037  loss: 2.6115 (2.5749)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2232 (1.5593)  time: 0.5232  data: 0.0005  max mem: 43713
Epoch: [282]  [1250/1251]  eta: 0:00:00  lr: 0.000037  min_lr: 0.000037  loss: 2.6297 (2.5733)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3457 (1.5586)  time: 0.4436  data: 0.0007  max mem: 43713
Epoch: [282] Total time: 0:10:58 (0.5264 s / it)
Averaged stats: lr: 0.000037  min_lr: 0.000037  loss: 2.6297 (2.5862)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3457 (1.5586)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.5934 (0.5934)  acc1: 90.8000 (90.8000)  acc5: 99.6000 (99.6000)  time: 5.3786  data: 5.0800  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7264 (0.7404)  acc1: 86.8000 (87.4182)  acc5: 98.0000 (98.0727)  time: 0.7290  data: 0.4622  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9036 (0.8552)  acc1: 82.8000 (84.4952)  acc5: 97.2000 (97.0857)  time: 0.2638  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9590 (0.8715)  acc1: 82.8000 (84.1120)  acc5: 96.8000 (97.0560)  time: 0.2637  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4725 s / it)
* Acc@1 84.440 Acc@5 96.972 loss 0.864
Accuracy of the model on the 50000 test images: 84.4%
Max accuracy: 84.47%
Epoch: [283]  [   0/1251]  eta: 1:14:40  lr: 0.000037  min_lr: 0.000037  loss: 2.4362 (2.4362)  weight_decay: 0.0500 (0.0500)  time: 3.5817  data: 2.2890  max mem: 43713
Epoch: [283]  [ 200/1251]  eta: 0:09:31  lr: 0.000037  min_lr: 0.000037  loss: 2.7682 (2.5871)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3188 (1.4416)  time: 0.5233  data: 0.0005  max mem: 43713
Epoch: [283]  [ 400/1251]  eta: 0:07:34  lr: 0.000036  min_lr: 0.000036  loss: 2.7002 (2.5831)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3336 (1.5928)  time: 0.5235  data: 0.0004  max mem: 43713
Epoch: [283]  [ 600/1251]  eta: 0:05:45  lr: 0.000035  min_lr: 0.000035  loss: 2.7826 (2.5871)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3038 (1.5692)  time: 0.5282  data: 0.0004  max mem: 43713
Epoch: [283]  [ 800/1251]  eta: 0:03:58  lr: 0.000035  min_lr: 0.000035  loss: 2.7661 (2.5903)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3303 (1.5643)  time: 0.5234  data: 0.0005  max mem: 43713
Epoch: [283]  [1000/1251]  eta: 0:02:12  lr: 0.000034  min_lr: 0.000034  loss: 2.7193 (2.5774)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2052 (1.5331)  time: 0.5234  data: 0.0005  max mem: 43713
Epoch: [283]  [1200/1251]  eta: 0:00:26  lr: 0.000033  min_lr: 0.000033  loss: 2.6325 (2.5855)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1406 (1.5108)  time: 0.5319  data: 0.0005  max mem: 43713
Epoch: [283]  [1250/1251]  eta: 0:00:00  lr: 0.000033  min_lr: 0.000033  loss: 2.5944 (2.5850)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2134 (1.5023)  time: 0.4435  data: 0.0007  max mem: 43713
Epoch: [283] Total time: 0:10:59 (0.5273 s / it)
Averaged stats: lr: 0.000033  min_lr: 0.000033  loss: 2.5944 (2.5861)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2134 (1.5023)
Test:  [ 0/25]  eta: 0:02:07  loss: 0.5963 (0.5963)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 5.1187  data: 4.8208  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7277 (0.7458)  acc1: 88.0000 (87.6364)  acc5: 98.0000 (98.0000)  time: 0.7061  data: 0.4385  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9064 (0.8613)  acc1: 82.8000 (84.4762)  acc5: 96.8000 (96.9905)  time: 0.2649  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9594 (0.8775)  acc1: 82.4000 (84.0800)  acc5: 96.8000 (96.9760)  time: 0.2650  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4633 s / it)
* Acc@1 84.398 Acc@5 96.938 loss 0.869
Accuracy of the model on the 50000 test images: 84.4%
Max accuracy: 84.47%
Epoch: [284]  [   0/1251]  eta: 1:13:22  lr: 0.000033  min_lr: 0.000033  loss: 2.8498 (2.8498)  weight_decay: 0.0500 (0.0500)  time: 3.5193  data: 1.5501  max mem: 43713
Epoch: [284]  [ 200/1251]  eta: 0:09:26  lr: 0.000032  min_lr: 0.000032  loss: 2.6280 (2.5855)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1364 (1.2785)  time: 0.5236  data: 0.0004  max mem: 43713
Epoch: [284]  [ 400/1251]  eta: 0:07:32  lr: 0.000032  min_lr: 0.000032  loss: 2.7080 (2.5863)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1696 (nan)  time: 0.5309  data: 0.0004  max mem: 43713
Epoch: [284]  [ 600/1251]  eta: 0:05:44  lr: 0.000031  min_lr: 0.000031  loss: 2.6905 (2.5745)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0350 (nan)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [284]  [ 800/1251]  eta: 0:03:58  lr: 0.000031  min_lr: 0.000031  loss: 2.7026 (2.5741)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3398 (nan)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [284]  [1000/1251]  eta: 0:02:12  lr: 0.000030  min_lr: 0.000030  loss: 2.7380 (2.5774)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2045 (nan)  time: 0.5336  data: 0.0004  max mem: 43713
Epoch: [284]  [1200/1251]  eta: 0:00:26  lr: 0.000029  min_lr: 0.000029  loss: 2.6450 (2.5748)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4753 (nan)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [284]  [1250/1251]  eta: 0:00:00  lr: 0.000029  min_lr: 0.000029  loss: 2.6250 (2.5752)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2704 (nan)  time: 0.4434  data: 0.0005  max mem: 43713
Epoch: [284] Total time: 0:10:57 (0.5258 s / it)
Averaged stats: lr: 0.000029  min_lr: 0.000029  loss: 2.6250 (2.5778)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2704 (nan)
Test:  [ 0/25]  eta: 0:01:51  loss: 0.5407 (0.5407)  acc1: 90.8000 (90.8000)  acc5: 99.6000 (99.6000)  time: 4.4693  data: 4.1738  max mem: 43713
Test:  [10/25]  eta: 0:00:09  loss: 0.6751 (0.6893)  acc1: 87.2000 (87.5273)  acc5: 97.6000 (97.9636)  time: 0.6480  data: 0.3805  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8480 (0.8036)  acc1: 82.8000 (84.5905)  acc5: 96.8000 (96.9714)  time: 0.2655  data: 0.0007  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.8978 (0.8201)  acc1: 82.4000 (84.0960)  acc5: 96.8000 (96.9440)  time: 0.2652  data: 0.0004  max mem: 43713
Test: Total time: 0:00:10 (0.4387 s / it)
* Acc@1 84.440 Acc@5 96.926 loss 0.811
Accuracy of the model on the 50000 test images: 84.4%
Max accuracy: 84.47%
Epoch: [285]  [   0/1251]  eta: 1:18:09  lr: 0.000029  min_lr: 0.000029  loss: 2.4667 (2.4667)  weight_decay: 0.0500 (0.0500)  time: 3.7490  data: 2.3500  max mem: 43713
Epoch: [285]  [ 200/1251]  eta: 0:09:28  lr: 0.000029  min_lr: 0.000029  loss: 2.7527 (2.6210)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5351 (1.7181)  time: 0.5220  data: 0.0005  max mem: 43713
Epoch: [285]  [ 400/1251]  eta: 0:07:34  lr: 0.000028  min_lr: 0.000028  loss: 2.4035 (2.5978)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4394 (1.6776)  time: 0.5237  data: 0.0004  max mem: 43713
Epoch: [285]  [ 600/1251]  eta: 0:05:45  lr: 0.000027  min_lr: 0.000027  loss: 2.6256 (2.6021)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2665 (1.6677)  time: 0.5225  data: 0.0005  max mem: 43713
Epoch: [285]  [ 800/1251]  eta: 0:03:58  lr: 0.000027  min_lr: 0.000027  loss: 2.7488 (2.5873)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5807 (1.6555)  time: 0.5226  data: 0.0005  max mem: 43713
Epoch: [285]  [1000/1251]  eta: 0:02:12  lr: 0.000026  min_lr: 0.000026  loss: 2.5790 (2.5818)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4142 (1.6506)  time: 0.5229  data: 0.0004  max mem: 43713
Epoch: [285]  [1200/1251]  eta: 0:00:26  lr: 0.000026  min_lr: 0.000026  loss: 2.7664 (2.5832)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5736 (1.6139)  time: 0.5276  data: 0.0004  max mem: 43713
Epoch: [285]  [1250/1251]  eta: 0:00:00  lr: 0.000026  min_lr: 0.000026  loss: 2.7069 (2.5851)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4583 (1.6129)  time: 0.4439  data: 0.0007  max mem: 43713
Epoch: [285] Total time: 0:10:58 (0.5268 s / it)
Averaged stats: lr: 0.000026  min_lr: 0.000026  loss: 2.7069 (2.5759)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4583 (1.6129)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.5500 (0.5500)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 5.3641  data: 5.0682  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.6839 (0.6981)  acc1: 87.6000 (87.5636)  acc5: 98.0000 (98.0727)  time: 0.7283  data: 0.4610  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8551 (0.8138)  acc1: 82.4000 (84.5714)  acc5: 97.2000 (97.0667)  time: 0.2648  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9051 (0.8304)  acc1: 82.4000 (84.1920)  acc5: 96.8000 (97.0560)  time: 0.2648  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4739 s / it)
* Acc@1 84.506 Acc@5 96.932 loss 0.821
Accuracy of the model on the 50000 test images: 84.5%
Max accuracy: 84.51%
Epoch: [286]  [   0/1251]  eta: 1:03:01  lr: 0.000026  min_lr: 0.000026  loss: 3.0162 (3.0162)  weight_decay: 0.0500 (0.0500)  time: 3.0224  data: 2.4808  max mem: 43713
Epoch: [286]  [ 200/1251]  eta: 0:09:27  lr: 0.000025  min_lr: 0.000025  loss: 2.3503 (2.5369)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3426 (1.4668)  time: 0.5232  data: 0.0004  max mem: 43713
Epoch: [286]  [ 400/1251]  eta: 0:07:32  lr: 0.000025  min_lr: 0.000025  loss: 2.8624 (2.5727)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1612 (1.4236)  time: 0.5231  data: 0.0005  max mem: 43713
Epoch: [286]  [ 600/1251]  eta: 0:05:44  lr: 0.000024  min_lr: 0.000024  loss: 2.7985 (2.5778)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4062 (1.5090)  time: 0.5308  data: 0.0005  max mem: 43713
Epoch: [286]  [ 800/1251]  eta: 0:03:58  lr: 0.000023  min_lr: 0.000023  loss: 2.6179 (2.5818)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3855 (1.5168)  time: 0.5223  data: 0.0004  max mem: 43713
Epoch: [286]  [1000/1251]  eta: 0:02:12  lr: 0.000023  min_lr: 0.000023  loss: 2.6670 (2.5845)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1709 (1.5280)  time: 0.5225  data: 0.0005  max mem: 43713
Epoch: [286]  [1200/1251]  eta: 0:00:26  lr: 0.000022  min_lr: 0.000022  loss: 2.6843 (2.5833)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7734 (1.6035)  time: 0.5282  data: 0.0005  max mem: 43713
Epoch: [286]  [1250/1251]  eta: 0:00:00  lr: 0.000022  min_lr: 0.000022  loss: 2.5009 (2.5817)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3255 (1.5896)  time: 0.4481  data: 0.0006  max mem: 43713
Epoch: [286] Total time: 0:10:58 (0.5262 s / it)
Averaged stats: lr: 0.000022  min_lr: 0.000022  loss: 2.5009 (2.5767)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3255 (1.5896)
Test:  [ 0/25]  eta: 0:02:10  loss: 0.5590 (0.5590)  acc1: 91.6000 (91.6000)  acc5: 99.6000 (99.6000)  time: 5.2108  data: 4.9098  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.6844 (0.7052)  acc1: 87.6000 (87.7455)  acc5: 98.0000 (98.0727)  time: 0.7146  data: 0.4467  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8644 (0.8193)  acc1: 82.8000 (84.6476)  acc5: 97.2000 (97.0857)  time: 0.2649  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9164 (0.8355)  acc1: 82.8000 (84.1440)  acc5: 96.4000 (96.9760)  time: 0.2650  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4674 s / it)
* Acc@1 84.472 Acc@5 96.934 loss 0.827
Accuracy of the model on the 50000 test images: 84.5%
Max accuracy: 84.51%
Epoch: [287]  [   0/1251]  eta: 1:13:14  lr: 0.000022  min_lr: 0.000022  loss: 2.9418 (2.9418)  weight_decay: 0.0500 (0.0500)  time: 3.5127  data: 2.3558  max mem: 43713
Epoch: [287]  [ 200/1251]  eta: 0:09:26  lr: 0.000022  min_lr: 0.000022  loss: 2.6347 (2.5758)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2771 (1.5065)  time: 0.5235  data: 0.0004  max mem: 43713
Epoch: [287]  [ 400/1251]  eta: 0:07:32  lr: 0.000021  min_lr: 0.000021  loss: 2.7805 (2.5965)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1202 (1.5810)  time: 0.5233  data: 0.0004  max mem: 43713
Epoch: [287]  [ 600/1251]  eta: 0:05:45  lr: 0.000021  min_lr: 0.000021  loss: 2.6263 (2.6049)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3258 (1.5957)  time: 0.5240  data: 0.0004  max mem: 43713
Epoch: [287]  [ 800/1251]  eta: 0:03:58  lr: 0.000020  min_lr: 0.000020  loss: 2.7200 (2.5949)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1603 (1.5268)  time: 0.5227  data: 0.0004  max mem: 43713
Epoch: [287]  [1000/1251]  eta: 0:02:12  lr: 0.000020  min_lr: 0.000020  loss: 2.6029 (2.5917)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3659 (1.5277)  time: 0.5236  data: 0.0004  max mem: 43713
Epoch: [287]  [1200/1251]  eta: 0:00:26  lr: 0.000019  min_lr: 0.000019  loss: 2.7824 (2.5903)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3696 (1.5407)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [287]  [1250/1251]  eta: 0:00:00  lr: 0.000019  min_lr: 0.000019  loss: 2.8748 (2.5943)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3621 (1.5382)  time: 0.4439  data: 0.0005  max mem: 43713
Epoch: [287] Total time: 0:10:59 (0.5268 s / it)
Averaged stats: lr: 0.000019  min_lr: 0.000019  loss: 2.8748 (2.5714)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3621 (1.5382)
Test:  [ 0/25]  eta: 0:02:27  loss: 0.6965 (0.6965)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 5.9115  data: 5.5874  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.8187 (0.8396)  acc1: 87.2000 (87.6727)  acc5: 98.0000 (97.8909)  time: 0.7772  data: 0.5082  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 1.0002 (0.9563)  acc1: 82.8000 (84.4762)  acc5: 96.8000 (96.8762)  time: 0.2636  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0558 (0.9726)  acc1: 81.6000 (83.9680)  acc5: 96.4000 (96.8640)  time: 0.2635  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4926 s / it)
* Acc@1 84.392 Acc@5 96.866 loss 0.963
Accuracy of the model on the 50000 test images: 84.4%
Max accuracy: 84.51%
Epoch: [288]  [   0/1251]  eta: 1:11:06  lr: 0.000019  min_lr: 0.000019  loss: 2.6707 (2.6707)  weight_decay: 0.0500 (0.0500)  time: 3.4103  data: 2.3926  max mem: 43713
Epoch: [288]  [ 200/1251]  eta: 0:09:27  lr: 0.000019  min_lr: 0.000019  loss: 2.6634 (2.5868)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2955 (1.4798)  time: 0.5285  data: 0.0004  max mem: 43713
Epoch: [288]  [ 400/1251]  eta: 0:07:35  lr: 0.000018  min_lr: 0.000018  loss: 2.5996 (2.6150)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1653 (1.3905)  time: 0.5317  data: 0.0004  max mem: 43713
Epoch: [288]  [ 600/1251]  eta: 0:05:46  lr: 0.000018  min_lr: 0.000018  loss: 2.7215 (2.5946)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2365 (1.4160)  time: 0.5256  data: 0.0005  max mem: 43713
Epoch: [288]  [ 800/1251]  eta: 0:03:58  lr: 0.000017  min_lr: 0.000017  loss: 2.6805 (2.5856)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3527 (1.4202)  time: 0.5230  data: 0.0005  max mem: 43713
Epoch: [288]  [1000/1251]  eta: 0:02:12  lr: 0.000017  min_lr: 0.000017  loss: 2.6798 (2.5877)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3218 (1.4249)  time: 0.5235  data: 0.0004  max mem: 43713
Epoch: [288]  [1200/1251]  eta: 0:00:26  lr: 0.000016  min_lr: 0.000016  loss: 2.6284 (2.5867)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2259 (1.4373)  time: 0.5231  data: 0.0004  max mem: 43713
Epoch: [288]  [1250/1251]  eta: 0:00:00  lr: 0.000016  min_lr: 0.000016  loss: 2.6673 (2.5891)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3015 (1.4396)  time: 0.4438  data: 0.0006  max mem: 43713
Epoch: [288] Total time: 0:11:00 (0.5276 s / it)
Averaged stats: lr: 0.000016  min_lr: 0.000016  loss: 2.6673 (2.5757)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3015 (1.4396)
Test:  [ 0/25]  eta: 0:02:11  loss: 0.5811 (0.5811)  acc1: 91.2000 (91.2000)  acc5: 99.6000 (99.6000)  time: 5.2720  data: 4.9578  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7150 (0.7315)  acc1: 87.6000 (87.7818)  acc5: 98.0000 (98.1091)  time: 0.7199  data: 0.4510  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8916 (0.8477)  acc1: 82.8000 (84.6476)  acc5: 96.8000 (97.0857)  time: 0.2647  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9493 (0.8644)  acc1: 82.0000 (84.1280)  acc5: 96.8000 (97.0080)  time: 0.2646  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4695 s / it)
* Acc@1 84.464 Acc@5 96.960 loss 0.856
Accuracy of the model on the 50000 test images: 84.5%
Max accuracy: 84.51%
Epoch: [289]  [   0/1251]  eta: 1:08:47  lr: 0.000016  min_lr: 0.000016  loss: 2.8550 (2.8550)  weight_decay: 0.0500 (0.0500)  time: 3.2992  data: 2.3350  max mem: 43713
Epoch: [289]  [ 200/1251]  eta: 0:09:31  lr: 0.000016  min_lr: 0.000016  loss: 2.7877 (2.6065)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0573 (1.3864)  time: 0.5312  data: 0.0004  max mem: 43713
Epoch: [289]  [ 400/1251]  eta: 0:07:33  lr: 0.000015  min_lr: 0.000015  loss: 2.8086 (2.6002)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2961 (1.4761)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [289]  [ 600/1251]  eta: 0:05:45  lr: 0.000015  min_lr: 0.000015  loss: 2.6539 (2.5931)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3040 (1.4790)  time: 0.5236  data: 0.0005  max mem: 43713
Epoch: [289]  [ 800/1251]  eta: 0:03:58  lr: 0.000014  min_lr: 0.000014  loss: 2.6803 (2.5827)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4678 (1.5248)  time: 0.5221  data: 0.0004  max mem: 43713
Epoch: [289]  [1000/1251]  eta: 0:02:12  lr: 0.000014  min_lr: 0.000014  loss: 2.6938 (2.5759)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2207 (1.5168)  time: 0.5232  data: 0.0004  max mem: 43713
Epoch: [289]  [1200/1251]  eta: 0:00:26  lr: 0.000014  min_lr: 0.000014  loss: 2.6184 (2.5757)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1510 (1.5417)  time: 0.5290  data: 0.0005  max mem: 43713
Epoch: [289]  [1250/1251]  eta: 0:00:00  lr: 0.000014  min_lr: 0.000014  loss: 2.7157 (2.5745)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1683 (1.5413)  time: 0.4515  data: 0.0006  max mem: 43713
Epoch: [289] Total time: 0:10:58 (0.5264 s / it)
Averaged stats: lr: 0.000014  min_lr: 0.000014  loss: 2.7157 (2.5733)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1683 (1.5413)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.5165 (0.5165)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 5.3699  data: 5.0815  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.6565 (0.6652)  acc1: 87.6000 (87.6000)  acc5: 97.6000 (98.0364)  time: 0.7292  data: 0.4623  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8264 (0.7811)  acc1: 82.8000 (84.5333)  acc5: 97.2000 (96.9905)  time: 0.2650  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.8765 (0.7976)  acc1: 82.4000 (84.1120)  acc5: 97.2000 (97.0080)  time: 0.2649  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4744 s / it)
* Acc@1 84.522 Acc@5 96.988 loss 0.789
Accuracy of the model on the 50000 test images: 84.5%
Max accuracy: 84.52%
Epoch: [290]  [   0/1251]  eta: 0:57:05  lr: 0.000014  min_lr: 0.000014  loss: 3.0441 (3.0441)  weight_decay: 0.0500 (0.0500)  time: 2.7384  data: 2.1951  max mem: 43713
Epoch: [290]  [ 200/1251]  eta: 0:09:23  lr: 0.000013  min_lr: 0.000013  loss: 2.7613 (2.5807)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2649 (1.9294)  time: 0.5252  data: 0.0005  max mem: 43713
Epoch: [290]  [ 400/1251]  eta: 0:07:31  lr: 0.000013  min_lr: 0.000013  loss: 2.6075 (2.5743)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2226 (1.8224)  time: 0.5226  data: 0.0005  max mem: 43713
Epoch: [290]  [ 600/1251]  eta: 0:05:44  lr: 0.000012  min_lr: 0.000012  loss: 2.6241 (2.5919)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0855 (1.6559)  time: 0.5245  data: 0.0004  max mem: 43713
Epoch: [290]  [ 800/1251]  eta: 0:03:58  lr: 0.000012  min_lr: 0.000012  loss: 2.6469 (2.5891)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2068 (1.5911)  time: 0.5240  data: 0.0005  max mem: 43713
Epoch: [290]  [1000/1251]  eta: 0:02:12  lr: 0.000012  min_lr: 0.000012  loss: 2.7254 (2.5867)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1152 (1.5430)  time: 0.5233  data: 0.0004  max mem: 43713
Epoch: [290]  [1200/1251]  eta: 0:00:26  lr: 0.000011  min_lr: 0.000011  loss: 2.7305 (2.5854)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1874 (1.5732)  time: 0.5228  data: 0.0005  max mem: 43713
Epoch: [290]  [1250/1251]  eta: 0:00:00  lr: 0.000011  min_lr: 0.000011  loss: 2.6758 (2.5849)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4109 (1.5699)  time: 0.4435  data: 0.0005  max mem: 43713
Epoch: [290] Total time: 0:10:58 (0.5266 s / it)
Averaged stats: lr: 0.000011  min_lr: 0.000011  loss: 2.6758 (2.5766)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4109 (1.5699)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.5895 (0.5895)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 5.4397  data: 5.1465  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7206 (0.7373)  acc1: 87.6000 (87.6364)  acc5: 97.6000 (98.0000)  time: 0.7353  data: 0.4681  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9004 (0.8531)  acc1: 82.4000 (84.4571)  acc5: 97.2000 (96.9905)  time: 0.2650  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9510 (0.8693)  acc1: 82.0000 (84.0000)  acc5: 96.8000 (96.9440)  time: 0.2651  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4771 s / it)
* Acc@1 84.458 Acc@5 96.966 loss 0.861
Accuracy of the model on the 50000 test images: 84.5%
Max accuracy: 84.52%
Epoch: [291]  [   0/1251]  eta: 1:09:04  lr: 0.000011  min_lr: 0.000011  loss: 2.8708 (2.8708)  weight_decay: 0.0500 (0.0500)  time: 3.3126  data: 2.3839  max mem: 43713
Epoch: [291]  [ 200/1251]  eta: 0:09:26  lr: 0.000011  min_lr: 0.000011  loss: 2.6741 (2.5805)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3045 (1.8047)  time: 0.5231  data: 0.0005  max mem: 43713
Epoch: [291]  [ 400/1251]  eta: 0:07:33  lr: 0.000010  min_lr: 0.000010  loss: 2.4521 (2.5665)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4757 (1.5900)  time: 0.5231  data: 0.0005  max mem: 43713
Epoch: [291]  [ 600/1251]  eta: 0:05:45  lr: 0.000010  min_lr: 0.000010  loss: 2.8255 (2.5773)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2752 (1.6020)  time: 0.5232  data: 0.0004  max mem: 43713
Epoch: [291]  [ 800/1251]  eta: 0:03:58  lr: 0.000010  min_lr: 0.000010  loss: 2.6814 (2.5665)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2429 (1.5620)  time: 0.5246  data: 0.0005  max mem: 43713
Epoch: [291]  [1000/1251]  eta: 0:02:12  lr: 0.000009  min_lr: 0.000009  loss: 2.7075 (2.5689)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2729 (1.5634)  time: 0.5224  data: 0.0005  max mem: 43713
Epoch: [291]  [1200/1251]  eta: 0:00:26  lr: 0.000009  min_lr: 0.000009  loss: 2.6309 (2.5788)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0529 (1.5500)  time: 0.5226  data: 0.0005  max mem: 43713
Epoch: [291]  [1250/1251]  eta: 0:00:00  lr: 0.000009  min_lr: 0.000009  loss: 2.7081 (2.5763)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0855 (1.5573)  time: 0.4435  data: 0.0007  max mem: 43713
Epoch: [291] Total time: 0:10:58 (0.5267 s / it)
Averaged stats: lr: 0.000009  min_lr: 0.000009  loss: 2.7081 (2.5733)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0855 (1.5573)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.5939 (0.5939)  acc1: 91.2000 (91.2000)  acc5: 99.6000 (99.6000)  time: 5.4836  data: 5.1839  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7253 (0.7407)  acc1: 87.2000 (87.6727)  acc5: 98.0000 (98.1091)  time: 0.7384  data: 0.4716  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9016 (0.8559)  acc1: 82.8000 (84.5714)  acc5: 97.2000 (97.0286)  time: 0.2637  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9572 (0.8729)  acc1: 82.0000 (84.0960)  acc5: 97.2000 (96.9920)  time: 0.2636  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4774 s / it)
* Acc@1 84.478 Acc@5 96.950 loss 0.865
Accuracy of the model on the 50000 test images: 84.5%
Max accuracy: 84.52%
Epoch: [292]  [   0/1251]  eta: 1:13:19  lr: 0.000009  min_lr: 0.000009  loss: 2.8471 (2.8471)  weight_decay: 0.0500 (0.0500)  time: 3.5166  data: 2.4130  max mem: 43713
Epoch: [292]  [ 200/1251]  eta: 0:09:28  lr: 0.000009  min_lr: 0.000009  loss: 2.7440 (2.5282)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3280 (1.4287)  time: 0.5309  data: 0.0004  max mem: 43713
Epoch: [292]  [ 400/1251]  eta: 0:07:34  lr: 0.000008  min_lr: 0.000008  loss: 2.7268 (2.5537)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3608 (1.5092)  time: 0.5243  data: 0.0004  max mem: 43713
Epoch: [292]  [ 600/1251]  eta: 0:05:45  lr: 0.000008  min_lr: 0.000008  loss: 2.4412 (2.5750)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1337 (1.4887)  time: 0.5250  data: 0.0004  max mem: 43713
Epoch: [292]  [ 800/1251]  eta: 0:03:58  lr: 0.000008  min_lr: 0.000008  loss: 2.7734 (2.5882)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3823 (1.4667)  time: 0.5293  data: 0.0004  max mem: 43713
Epoch: [292]  [1000/1251]  eta: 0:02:12  lr: 0.000008  min_lr: 0.000008  loss: 2.7940 (2.5933)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1591 (1.5169)  time: 0.5233  data: 0.0004  max mem: 43713
Epoch: [292]  [1200/1251]  eta: 0:00:26  lr: 0.000007  min_lr: 0.000007  loss: 2.4687 (2.5999)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1711 (1.5149)  time: 0.5231  data: 0.0004  max mem: 43713
Epoch: [292]  [1250/1251]  eta: 0:00:00  lr: 0.000007  min_lr: 0.000007  loss: 2.7872 (2.6003)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2636 (1.5113)  time: 0.4438  data: 0.0005  max mem: 43713
Epoch: [292] Total time: 0:10:59 (0.5269 s / it)
Averaged stats: lr: 0.000007  min_lr: 0.000007  loss: 2.7872 (2.5702)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2636 (1.5113)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6574 (0.6574)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 5.6860  data: 5.4005  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7791 (0.8016)  acc1: 87.6000 (87.6000)  acc5: 98.0000 (97.9273)  time: 0.7576  data: 0.4912  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9606 (0.9172)  acc1: 82.4000 (84.5905)  acc5: 96.8000 (96.8381)  time: 0.2647  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 1.0179 (0.9340)  acc1: 82.4000 (83.9520)  acc5: 96.4000 (96.7840)  time: 0.2647  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4865 s / it)
* Acc@1 84.450 Acc@5 96.894 loss 0.925
Accuracy of the model on the 50000 test images: 84.5%
Max accuracy: 84.52%
Epoch: [293]  [   0/1251]  eta: 1:14:11  lr: 0.000007  min_lr: 0.000007  loss: 2.5779 (2.5779)  weight_decay: 0.0500 (0.0500)  time: 3.5583  data: 2.3090  max mem: 43713
Epoch: [293]  [ 200/1251]  eta: 0:09:29  lr: 0.000007  min_lr: 0.000007  loss: 2.6996 (2.6403)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2322 (2.0926)  time: 0.5232  data: 0.0004  max mem: 43713
Epoch: [293]  [ 400/1251]  eta: 0:07:33  lr: 0.000007  min_lr: 0.000007  loss: 2.3659 (2.5770)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0648 (1.7792)  time: 0.5302  data: 0.0004  max mem: 43713
Epoch: [293]  [ 600/1251]  eta: 0:05:45  lr: 0.000006  min_lr: 0.000006  loss: 2.6069 (2.5714)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5003 (1.7244)  time: 0.5276  data: 0.0005  max mem: 43713
Epoch: [293]  [ 800/1251]  eta: 0:03:58  lr: 0.000006  min_lr: 0.000006  loss: 2.4835 (2.5668)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2412 (1.6616)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [293]  [1000/1251]  eta: 0:02:12  lr: 0.000006  min_lr: 0.000006  loss: 2.4188 (2.5644)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4403 (1.6480)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [293]  [1200/1251]  eta: 0:00:26  lr: 0.000006  min_lr: 0.000006  loss: 2.6603 (2.5671)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2819 (1.6299)  time: 0.5271  data: 0.0004  max mem: 43713
Epoch: [293]  [1250/1251]  eta: 0:00:00  lr: 0.000006  min_lr: 0.000006  loss: 2.7263 (2.5694)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3639 (1.6491)  time: 0.4433  data: 0.0005  max mem: 43713
Epoch: [293] Total time: 0:10:58 (0.5265 s / it)
Averaged stats: lr: 0.000006  min_lr: 0.000006  loss: 2.7263 (2.5664)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3639 (1.6491)
Test:  [ 0/25]  eta: 0:02:06  loss: 0.5730 (0.5730)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 5.0665  data: 4.7701  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7055 (0.7204)  acc1: 87.6000 (87.7818)  acc5: 97.6000 (98.0000)  time: 0.7014  data: 0.4339  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8798 (0.8369)  acc1: 82.4000 (84.6286)  acc5: 96.8000 (96.9524)  time: 0.2650  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9322 (0.8540)  acc1: 82.4000 (84.1120)  acc5: 96.4000 (96.9120)  time: 0.2650  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4617 s / it)
* Acc@1 84.458 Acc@5 96.950 loss 0.844
Accuracy of the model on the 50000 test images: 84.5%
Max accuracy: 84.52%
Epoch: [294]  [   0/1251]  eta: 1:14:14  lr: 0.000006  min_lr: 0.000006  loss: 2.8392 (2.8392)  weight_decay: 0.0500 (0.0500)  time: 3.5611  data: 2.7803  max mem: 43713
Epoch: [294]  [ 200/1251]  eta: 0:09:26  lr: 0.000005  min_lr: 0.000005  loss: 2.7042 (2.5858)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2073 (1.6366)  time: 0.5233  data: 0.0005  max mem: 43713
Epoch: [294]  [ 400/1251]  eta: 0:07:33  lr: 0.000005  min_lr: 0.000005  loss: 2.6566 (2.5601)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0944 (1.4970)  time: 0.5227  data: 0.0005  max mem: 43713
Epoch: [294]  [ 600/1251]  eta: 0:05:45  lr: 0.000005  min_lr: 0.000005  loss: 2.5606 (2.5692)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1059 (1.5567)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [294]  [ 800/1251]  eta: 0:03:58  lr: 0.000005  min_lr: 0.000005  loss: 2.6597 (2.5572)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3375 (1.5502)  time: 0.5254  data: 0.0005  max mem: 43713
Epoch: [294]  [1000/1251]  eta: 0:02:12  lr: 0.000004  min_lr: 0.000004  loss: 2.7779 (2.5568)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3337 (1.5640)  time: 0.5315  data: 0.0005  max mem: 43713
Epoch: [294]  [1200/1251]  eta: 0:00:26  lr: 0.000004  min_lr: 0.000004  loss: 2.7454 (2.5597)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1052 (1.5386)  time: 0.5264  data: 0.0006  max mem: 43713
Epoch: [294]  [1250/1251]  eta: 0:00:00  lr: 0.000004  min_lr: 0.000004  loss: 2.6236 (2.5598)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1511 (1.5294)  time: 0.4436  data: 0.0007  max mem: 43713
Epoch: [294] Total time: 0:10:59 (0.5272 s / it)
Averaged stats: lr: 0.000004  min_lr: 0.000004  loss: 2.6236 (2.5744)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1511 (1.5294)
Test:  [ 0/25]  eta: 0:02:08  loss: 0.5911 (0.5911)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 5.1504  data: 4.8471  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7202 (0.7345)  acc1: 87.6000 (87.7818)  acc5: 98.0000 (98.0727)  time: 0.7085  data: 0.4410  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8974 (0.8493)  acc1: 82.8000 (84.5333)  acc5: 97.2000 (96.9333)  time: 0.2642  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9495 (0.8662)  acc1: 82.4000 (84.0960)  acc5: 96.8000 (96.8800)  time: 0.2641  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4652 s / it)
* Acc@1 84.486 Acc@5 96.936 loss 0.858
Accuracy of the model on the 50000 test images: 84.5%
Max accuracy: 84.52%
Epoch: [295]  [   0/1251]  eta: 1:13:14  lr: 0.000004  min_lr: 0.000004  loss: 1.7407 (1.7407)  weight_decay: 0.0500 (0.0500)  time: 3.5125  data: 1.8026  max mem: 43713
Epoch: [295]  [ 200/1251]  eta: 0:09:27  lr: 0.000004  min_lr: 0.000004  loss: 2.7122 (2.5506)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2701 (1.5719)  time: 0.5230  data: 0.0005  max mem: 43713
Epoch: [295]  [ 400/1251]  eta: 0:07:34  lr: 0.000004  min_lr: 0.000004  loss: 2.6442 (2.5530)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5600 (1.5564)  time: 0.5313  data: 0.0005  max mem: 43713
Epoch: [295]  [ 600/1251]  eta: 0:05:45  lr: 0.000004  min_lr: 0.000004  loss: 2.6203 (2.5642)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2785 (1.5838)  time: 0.5315  data: 0.0005  max mem: 43713
Epoch: [295]  [ 800/1251]  eta: 0:03:58  lr: 0.000003  min_lr: 0.000003  loss: 2.6905 (2.5630)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4273 (1.5939)  time: 0.5328  data: 0.0005  max mem: 43713
Epoch: [295]  [1000/1251]  eta: 0:02:12  lr: 0.000003  min_lr: 0.000003  loss: 2.6594 (2.5694)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2590 (1.6132)  time: 0.5229  data: 0.0005  max mem: 43713
Epoch: [295]  [1200/1251]  eta: 0:00:26  lr: 0.000003  min_lr: 0.000003  loss: 2.7412 (2.5720)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2571 (1.6510)  time: 0.5228  data: 0.0005  max mem: 43713
Epoch: [295]  [1250/1251]  eta: 0:00:00  lr: 0.000003  min_lr: 0.000003  loss: 2.3415 (2.5723)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1548 (1.6439)  time: 0.4442  data: 0.0005  max mem: 43713
Epoch: [295] Total time: 0:10:59 (0.5273 s / it)
Averaged stats: lr: 0.000003  min_lr: 0.000003  loss: 2.3415 (2.5665)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1548 (1.6439)
Test:  [ 0/25]  eta: 0:02:09  loss: 0.5320 (0.5320)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 5.1851  data: 4.8780  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.6680 (0.6770)  acc1: 87.2000 (87.6727)  acc5: 97.6000 (98.0000)  time: 0.7121  data: 0.4438  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8394 (0.7925)  acc1: 83.2000 (84.5905)  acc5: 97.2000 (96.9714)  time: 0.2646  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.8940 (0.8088)  acc1: 82.0000 (84.1760)  acc5: 96.8000 (96.9760)  time: 0.2644  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4662 s / it)
* Acc@1 84.492 Acc@5 96.972 loss 0.799
Accuracy of the model on the 50000 test images: 84.5%
Max accuracy: 84.52%
Epoch: [296]  [   0/1251]  eta: 1:13:16  lr: 0.000003  min_lr: 0.000003  loss: 2.5655 (2.5655)  weight_decay: 0.0500 (0.0500)  time: 3.5144  data: 1.6918  max mem: 43713
Epoch: [296]  [ 200/1251]  eta: 0:09:31  lr: 0.000003  min_lr: 0.000003  loss: 2.6262 (2.5685)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0545 (1.5287)  time: 0.5232  data: 0.0004  max mem: 43713
Epoch: [296]  [ 400/1251]  eta: 0:07:33  lr: 0.000003  min_lr: 0.000003  loss: 2.4313 (2.5584)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3145 (1.5438)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [296]  [ 600/1251]  eta: 0:05:45  lr: 0.000003  min_lr: 0.000003  loss: 2.7666 (2.5600)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4104 (1.5935)  time: 0.5314  data: 0.0004  max mem: 43713
Epoch: [296]  [ 800/1251]  eta: 0:03:58  lr: 0.000002  min_lr: 0.000002  loss: 2.5144 (2.5682)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4888 (1.6181)  time: 0.5251  data: 0.0005  max mem: 43713
Epoch: [296]  [1000/1251]  eta: 0:02:12  lr: 0.000002  min_lr: 0.000002  loss: 2.4447 (2.5665)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1438 (nan)  time: 0.5228  data: 0.0004  max mem: 43713
Epoch: [296]  [1200/1251]  eta: 0:00:26  lr: 0.000002  min_lr: 0.000002  loss: 2.5738 (2.5704)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1123 (nan)  time: 0.5375  data: 0.0004  max mem: 43713
Epoch: [296]  [1250/1251]  eta: 0:00:00  lr: 0.000002  min_lr: 0.000002  loss: 2.7409 (2.5685)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2695 (nan)  time: 0.4472  data: 0.0006  max mem: 43713
Epoch: [296] Total time: 0:10:59 (0.5272 s / it)
Averaged stats: lr: 0.000002  min_lr: 0.000002  loss: 2.7409 (2.5728)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2695 (nan)
Test:  [ 0/25]  eta: 0:02:10  loss: 0.5719 (0.5719)  acc1: 91.2000 (91.2000)  acc5: 99.6000 (99.6000)  time: 5.2101  data: 4.9230  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7045 (0.7180)  acc1: 87.6000 (87.7091)  acc5: 98.0000 (98.1091)  time: 0.7146  data: 0.4478  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8795 (0.8326)  acc1: 82.4000 (84.6095)  acc5: 96.8000 (96.9714)  time: 0.2651  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9308 (0.8499)  acc1: 82.4000 (84.0960)  acc5: 96.4000 (96.9600)  time: 0.2650  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4677 s / it)
* Acc@1 84.472 Acc@5 96.962 loss 0.841
Accuracy of the model on the 50000 test images: 84.5%
Max accuracy: 84.52%
Epoch: [297]  [   0/1251]  eta: 1:07:03  lr: 0.000002  min_lr: 0.000002  loss: 2.7901 (2.7901)  weight_decay: 0.0500 (0.0500)  time: 3.2161  data: 2.4761  max mem: 43713
Epoch: [297]  [ 200/1251]  eta: 0:09:24  lr: 0.000002  min_lr: 0.000002  loss: 2.5373 (2.5624)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2369 (1.5013)  time: 0.5232  data: 0.0005  max mem: 43713
Epoch: [297]  [ 400/1251]  eta: 0:07:31  lr: 0.000002  min_lr: 0.000002  loss: 2.6010 (2.5643)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3099 (1.4665)  time: 0.5235  data: 0.0004  max mem: 43713
Epoch: [297]  [ 600/1251]  eta: 0:05:45  lr: 0.000002  min_lr: 0.000002  loss: 2.6626 (2.5605)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2943 (1.5692)  time: 0.5226  data: 0.0005  max mem: 43713
Epoch: [297]  [ 800/1251]  eta: 0:03:58  lr: 0.000002  min_lr: 0.000002  loss: 2.5681 (2.5601)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2777 (1.5804)  time: 0.5225  data: 0.0004  max mem: 43713
Epoch: [297]  [1000/1251]  eta: 0:02:12  lr: 0.000002  min_lr: 0.000002  loss: 2.7520 (2.5760)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4009 (1.6101)  time: 0.5235  data: 0.0004  max mem: 43713
Epoch: [297]  [1200/1251]  eta: 0:00:26  lr: 0.000002  min_lr: 0.000002  loss: 2.6394 (2.5717)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3812 (1.5850)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [297]  [1250/1251]  eta: 0:00:00  lr: 0.000002  min_lr: 0.000002  loss: 2.7925 (2.5732)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3632 (1.5829)  time: 0.4436  data: 0.0005  max mem: 43713
Epoch: [297] Total time: 0:10:58 (0.5263 s / it)
Averaged stats: lr: 0.000002  min_lr: 0.000002  loss: 2.7925 (2.5714)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3632 (1.5829)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.5853 (0.5853)  acc1: 91.2000 (91.2000)  acc5: 99.6000 (99.6000)  time: 5.4160  data: 5.1220  max mem: 43713
Test:  [10/25]  eta: 0:00:10  loss: 0.7185 (0.7337)  acc1: 87.6000 (87.6364)  acc5: 98.0000 (98.0364)  time: 0.7333  data: 0.4659  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8965 (0.8503)  acc1: 82.8000 (84.5333)  acc5: 96.8000 (96.9714)  time: 0.2650  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9526 (0.8671)  acc1: 82.4000 (84.0480)  acc5: 96.8000 (96.8960)  time: 0.2651  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4773 s / it)
* Acc@1 84.460 Acc@5 96.974 loss 0.858
Accuracy of the model on the 50000 test images: 84.5%
Max accuracy: 84.52%
Epoch: [298]  [   0/1251]  eta: 1:11:36  lr: 0.000002  min_lr: 0.000002  loss: 2.8286 (2.8286)  weight_decay: 0.0500 (0.0500)  time: 3.4347  data: 2.6857  max mem: 43713
Epoch: [298]  [ 200/1251]  eta: 0:09:26  lr: 0.000001  min_lr: 0.000001  loss: 2.8013 (2.5856)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1910 (1.5333)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [298]  [ 400/1251]  eta: 0:07:33  lr: 0.000001  min_lr: 0.000001  loss: 2.7258 (2.5776)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0457 (1.5098)  time: 0.5224  data: 0.0004  max mem: 43713
Epoch: [298]  [ 600/1251]  eta: 0:05:44  lr: 0.000001  min_lr: 0.000001  loss: 2.6593 (2.5557)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3295 (1.5832)  time: 0.5230  data: 0.0004  max mem: 43713
Epoch: [298]  [ 800/1251]  eta: 0:03:58  lr: 0.000001  min_lr: 0.000001  loss: 2.6296 (2.5621)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4522 (1.5833)  time: 0.5233  data: 0.0004  max mem: 43713
Epoch: [298]  [1000/1251]  eta: 0:02:12  lr: 0.000001  min_lr: 0.000001  loss: 2.6175 (2.5601)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5585 (1.5981)  time: 0.5223  data: 0.0004  max mem: 43713
Epoch: [298]  [1200/1251]  eta: 0:00:26  lr: 0.000001  min_lr: 0.000001  loss: 2.6382 (2.5664)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1490 (1.5650)  time: 0.5296  data: 0.0004  max mem: 43713
Epoch: [298]  [1250/1251]  eta: 0:00:00  lr: 0.000001  min_lr: 0.000001  loss: 2.5985 (2.5679)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2320 (1.5664)  time: 0.4434  data: 0.0006  max mem: 43713
Epoch: [298] Total time: 0:10:58 (0.5263 s / it)
Averaged stats: lr: 0.000001  min_lr: 0.000001  loss: 2.5985 (2.5722)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2320 (1.5664)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.5979 (0.5979)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 5.4925  data: 5.1829  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7254 (0.7423)  acc1: 87.2000 (87.5273)  acc5: 98.0000 (97.9636)  time: 0.7400  data: 0.4715  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.9028 (0.8559)  acc1: 82.8000 (84.6095)  acc5: 96.8000 (96.9143)  time: 0.2650  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9441 (0.8726)  acc1: 82.0000 (84.0160)  acc5: 96.4000 (96.8480)  time: 0.2649  data: 0.0001  max mem: 43713
Test: Total time: 0:00:11 (0.4786 s / it)
* Acc@1 84.426 Acc@5 96.904 loss 0.864
Accuracy of the model on the 50000 test images: 84.4%
Max accuracy: 84.52%
Epoch: [299]  [   0/1251]  eta: 1:11:25  lr: 0.000001  min_lr: 0.000001  loss: 2.7342 (2.7342)  weight_decay: 0.0500 (0.0500)  time: 3.4257  data: 1.7809  max mem: 43713
Epoch: [299]  [ 200/1251]  eta: 0:09:30  lr: 0.000001  min_lr: 0.000001  loss: 2.6028 (2.5573)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4342 (1.6614)  time: 0.5236  data: 0.0004  max mem: 43713
Epoch: [299]  [ 400/1251]  eta: 0:07:34  lr: 0.000001  min_lr: 0.000001  loss: 2.7759 (2.5442)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3622 (1.6166)  time: 0.5231  data: 0.0004  max mem: 43713
Epoch: [299]  [ 600/1251]  eta: 0:05:45  lr: 0.000001  min_lr: 0.000001  loss: 2.5711 (2.5402)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2532 (1.8256)  time: 0.5280  data: 0.0004  max mem: 43713
Epoch: [299]  [ 800/1251]  eta: 0:03:58  lr: 0.000001  min_lr: 0.000001  loss: 2.5277 (2.5499)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4263 (1.8387)  time: 0.5231  data: 0.0005  max mem: 43713
Epoch: [299]  [1000/1251]  eta: 0:02:12  lr: 0.000001  min_lr: 0.000001  loss: 2.7141 (2.5621)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3041 (1.7743)  time: 0.5232  data: 0.0004  max mem: 43713
Epoch: [299]  [1200/1251]  eta: 0:00:26  lr: 0.000001  min_lr: 0.000001  loss: 2.4998 (2.5601)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3095 (1.7616)  time: 0.5293  data: 0.0005  max mem: 43713
Epoch: [299]  [1250/1251]  eta: 0:00:00  lr: 0.000001  min_lr: 0.000001  loss: 2.6242 (2.5603)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1700 (1.7495)  time: 0.4474  data: 0.0005  max mem: 43713
Epoch: [299] Total time: 0:10:59 (0.5270 s / it)
Averaged stats: lr: 0.000001  min_lr: 0.000001  loss: 2.6242 (2.5687)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1700 (1.7495)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.5853 (0.5853)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 5.6066  data: 5.3099  max mem: 43713
Test:  [10/25]  eta: 0:00:11  loss: 0.7158 (0.7326)  acc1: 88.0000 (87.6727)  acc5: 98.0000 (98.0000)  time: 0.7504  data: 0.4830  max mem: 43713
Test:  [20/25]  eta: 0:00:02  loss: 0.8951 (0.8489)  acc1: 82.8000 (84.6286)  acc5: 96.8000 (96.9333)  time: 0.2648  data: 0.0002  max mem: 43713
Test:  [24/25]  eta: 0:00:00  loss: 0.9477 (0.8660)  acc1: 82.0000 (84.1120)  acc5: 96.8000 (96.8960)  time: 0.2649  data: 0.0001  max mem: 43713
Test: Total time: 0:00:12 (0.4833 s / it)
* Acc@1 84.440 Acc@5 96.964 loss 0.857
Accuracy of the model on the 50000 test images: 84.4%
Max accuracy: 84.52%
Training time 2 days, 8:05:25
