| distributed init (rank 0): env://, gpu 0
| distributed init (rank 1): env://, gpu 1
| distributed init (rank 6): env://, gpu 6
| distributed init (rank 7): env://, gpu 7
| distributed init (rank 2): env://, gpu 2
| distributed init (rank 5): env://, gpu 5
| distributed init (rank 4): env://, gpu 4
| distributed init (rank 3): env://, gpu 3
Namespace(batch_size=128, epochs=300, update_freq=4, model='small', drop_path=0, input_size=224, layer_scale_init_value=1e-06, model_ema=False, model_ema_decay=0.9999, model_ema_force_cpu=False, model_ema_eval=False, opt='adamw', opt_eps=1e-08, opt_betas=None, clip_grad=5.0, momentum=0.9, weight_decay=0.05, weight_decay_end=None, lr=0.004, layer_decay=1.0, min_lr=1e-06, warmup_epochs=20, warmup_steps=-1, color_jitter=0.4, aa='rand-m9-mstd0.5-inc1', smoothing=0.1, train_interpolation='bicubic', crop_pct=None, reprob=0.25, remode='pixel', recount=1, resplit=False, mixup=0.8, cutmix=1.0, cutmix_minmax=None, mixup_prob=1.0, mixup_switch_prob=0.5, mixup_mode='batch', finetune='', head_init_scale=1.0, model_key='model|module', model_prefix='', data_path='/dev/shm/imagenet', eval_data_path=None, nb_classes=1000, imagenet_default_mean_and_std=True, data_set='IMNET', output_dir='./checkpoint_small_4.2G', log_dir=None, device='cuda', seed=0, resume='', auto_resume=True, save_ckpt=True, save_ckpt_freq=1, save_ckpt_num=3, start_epoch=0, eval=False, dist_eval=True, disable_eval=False, num_workers=10, pin_mem=True, world_size=8, local_rank=-1, dist_on_itp=False, dist_url='env://', use_amp=True, enable_wandb=False, project='convnext', wandb_ckpt=False, rank=0, gpu=0, distributed=True, dist_backend='nccl')
Transform = 
RandomResizedCropAndInterpolation(size=(224, 224), scale=(0.08, 1.0), ratio=(0.75, 1.3333), interpolation=bicubic)
RandomHorizontalFlip(p=0.5)
RandAugment(n=2, ops=
	AugmentOp(name=AutoContrast, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=Equalize, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=Invert, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=Rotate, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=PosterizeIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=SolarizeIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=SolarizeAdd, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ColorIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ContrastIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=BrightnessIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=SharpnessIncreasing, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ShearX, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=ShearY, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=TranslateXRel, p=0.5, m=9, mstd=0.5)
	AugmentOp(name=TranslateYRel, p=0.5, m=9, mstd=0.5))
ToTensor()
Normalize(mean=tensor([0.4850, 0.4560, 0.4060]), std=tensor([0.2290, 0.2240, 0.2250]))
RandomErasing(p=0.25, mode=pixel, count=(1, 1))
---------------------------
reading from datapath /dev/shm/imagenet
Number of the class = 1000
Transform = 
Resize(size=256, interpolation=bicubic, max_size=None, antialias=True)
CenterCrop(size=(224, 224))
ToTensor()
Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
---------------------------
reading from datapath /dev/shm/imagenet
Number of the class = 1000
Sampler_train = <torch.utils.data.distributed.DistributedSampler object at 0x7ff841bd59d0>
Mixup is activated!
Model = RaCNN(
  (first_conv): ConvX(
    (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (norm): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): Act(
      (act): GELU(approximate='none')
    )
  )
  (layer1): Sequential(
    (0): DownBlock(
      (mlp): Sequential(
        (0): ConvX(
          (conv): Conv2d(32, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (1): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (2): ConvX(
          (conv): Conv2d(192, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=32, bias=False)
          (norm): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (1): ConvX(
          (conv): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (drop_path): Identity()
    )
    (1): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(64, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(192, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(24, 192, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(192, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=64, bias=False)
          (norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.008)
    )
    (2): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(64, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (norm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(192, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(24, 192, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(192, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=64, bias=False)
          (norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.016)
    )
  )
  (layer2): Sequential(
    (0): DownBlock(
      (mlp): Sequential(
        (0): ConvX(
          (conv): Conv2d(64, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (1): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (2): ConvX(
          (conv): Conv2d(384, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=64, bias=False)
          (norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (1): ConvX(
          (conv): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (drop_path): DropPath(drop_prob=0.024)
    )
    (1): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(128, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=128, bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.032)
    )
    (2): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(128, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=128, bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.040)
    )
    (3): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(128, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=128, bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.048)
    )
    (4): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(128, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=128, bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.056)
    )
    (5): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(128, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (norm): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(384, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(48, 384, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(384, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=128, bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.064)
    )
  )
  (layer3): Sequential(
    (0): DownBlock(
      (mlp): Sequential(
        (0): ConvX(
          (conv): Conv2d(128, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (1): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (2): ConvX(
          (conv): Conv2d(768, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=128, bias=False)
          (norm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (1): ConvX(
          (conv): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (drop_path): DropPath(drop_prob=0.072)
    )
    (1): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(256, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=256, bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.080)
    )
    (2): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(256, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=256, bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.088)
    )
    (3): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(256, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=256, bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.096)
    )
    (4): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(256, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=256, bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.104)
    )
    (5): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(256, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=256, bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.112)
    )
    (6): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(256, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=256, bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.120)
    )
    (7): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(256, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=256, bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.128)
    )
    (8): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(256, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=256, bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.136)
    )
    (9): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(256, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=256, bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.144)
    )
    (10): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(256, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=256, bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.152)
    )
    (11): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(256, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=256, bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.160)
    )
    (12): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(256, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=256, bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.168)
    )
    (13): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(256, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=768, bias=False)
          (norm): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(768, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(96, 768, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(768, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=256, bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.176)
    )
  )
  (layer4): Sequential(
    (0): DownBlock(
      (mlp): Sequential(
        (0): ConvX(
          (conv): Conv2d(256, 1536, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (1): ConvX(
          (conv): Conv2d(1536, 1536, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=1536, bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): Act(
            (act): GELU(approximate='none')
          )
        )
        (2): ConvX(
          (conv): Conv2d(1536, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (skip): Sequential(
        (0): ConvX(
          (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=256, bias=False)
          (norm): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (1): ConvX(
          (conv): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (drop_path): DropPath(drop_prob=0.184)
    )
    (1): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(512, 3072, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(3072, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(1536, 1536, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1536, bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(1536, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(192, 1536, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(1536, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(512, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=512, bias=False)
          (norm): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.192)
    )
    (2): Block(
      (mlp): GateMLP(
        (conv_in): ConvX(
          (conv): Conv2d(512, 3072, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(3072, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dw): ConvX(
          (conv): Conv2d(1536, 1536, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1536, bias=False)
          (norm): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (re): RE(
          (region): Sequential(
            (0): Conv2d(1536, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv2d(192, 1536, kernel_size=(1, 1), stride=(1, 1))
            (4): Sigmoid()
          )
        )
        (proj): ConvX(
          (conv): Conv2d(1536, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (act): Act(
          (act): GELU(approximate='none')
        )
      )
      (dcnn): DilatedCNN(
        (conv_in): ConvX(
          (conv): Conv2d(512, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (spe): ConvX(
          (conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=512, bias=False)
          (norm): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (att): Attention()
        (act): Act(
          (act): GELU(approximate='none')
        )
        (proj): ConvX(
          (conv): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (ln): LayerNorm()
      (drop_path): DropPath(drop_prob=0.200)
    )
  )
  (head): ConvX(
    (conv): Conv2d(512, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (norm): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): Act(
      (act): GELU(approximate='none')
    )
  )
  (gap): AdaptiveAvgPool2d(output_size=1)
  (classifier): MlpHead(
    (fc1): Linear(in_features=1024, out_features=2048, bias=False)
    (norm): BatchNorm1d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): Act(
      (act): GELU(approximate='none')
    )
    (drop): Dropout(p=0.2, inplace=False)
    (fc2): Linear(in_features=2048, out_features=1000, bias=False)
  )
)
number of params: 27706656
LR = 0.00400000
Batch size = 4096
Update frequent = 4
Number of training examples = 1281167
Number of training training per epoch = 312
Param groups = {
  "decay": {
    "weight_decay": 0.05,
    "params": [
      "first_conv.conv.weight",
      "layer1.0.mlp.0.conv.weight",
      "layer1.0.mlp.1.conv.weight",
      "layer1.0.mlp.2.conv.weight",
      "layer1.0.skip.0.conv.weight",
      "layer1.0.skip.1.conv.weight",
      "layer1.1.mlp.conv_in.conv.weight",
      "layer1.1.mlp.dw.conv.weight",
      "layer1.1.mlp.re.region.0.weight",
      "layer1.1.mlp.re.region.3.weight",
      "layer1.1.mlp.proj.conv.weight",
      "layer1.1.dcnn.conv_in.conv.weight",
      "layer1.1.dcnn.spe.conv.weight",
      "layer1.1.dcnn.att.logit_scale",
      "layer1.1.dcnn.proj.conv.weight",
      "layer1.2.mlp.conv_in.conv.weight",
      "layer1.2.mlp.dw.conv.weight",
      "layer1.2.mlp.re.region.0.weight",
      "layer1.2.mlp.re.region.3.weight",
      "layer1.2.mlp.proj.conv.weight",
      "layer1.2.dcnn.conv_in.conv.weight",
      "layer1.2.dcnn.spe.conv.weight",
      "layer1.2.dcnn.att.logit_scale",
      "layer1.2.dcnn.proj.conv.weight",
      "layer2.0.mlp.0.conv.weight",
      "layer2.0.mlp.1.conv.weight",
      "layer2.0.mlp.2.conv.weight",
      "layer2.0.skip.0.conv.weight",
      "layer2.0.skip.1.conv.weight",
      "layer2.1.mlp.conv_in.conv.weight",
      "layer2.1.mlp.dw.conv.weight",
      "layer2.1.mlp.re.region.0.weight",
      "layer2.1.mlp.re.region.3.weight",
      "layer2.1.mlp.proj.conv.weight",
      "layer2.1.dcnn.conv_in.conv.weight",
      "layer2.1.dcnn.spe.conv.weight",
      "layer2.1.dcnn.att.logit_scale",
      "layer2.1.dcnn.proj.conv.weight",
      "layer2.2.mlp.conv_in.conv.weight",
      "layer2.2.mlp.dw.conv.weight",
      "layer2.2.mlp.re.region.0.weight",
      "layer2.2.mlp.re.region.3.weight",
      "layer2.2.mlp.proj.conv.weight",
      "layer2.2.dcnn.conv_in.conv.weight",
      "layer2.2.dcnn.spe.conv.weight",
      "layer2.2.dcnn.att.logit_scale",
      "layer2.2.dcnn.proj.conv.weight",
      "layer2.3.mlp.conv_in.conv.weight",
      "layer2.3.mlp.dw.conv.weight",
      "layer2.3.mlp.re.region.0.weight",
      "layer2.3.mlp.re.region.3.weight",
      "layer2.3.mlp.proj.conv.weight",
      "layer2.3.dcnn.conv_in.conv.weight",
      "layer2.3.dcnn.spe.conv.weight",
      "layer2.3.dcnn.att.logit_scale",
      "layer2.3.dcnn.proj.conv.weight",
      "layer2.4.mlp.conv_in.conv.weight",
      "layer2.4.mlp.dw.conv.weight",
      "layer2.4.mlp.re.region.0.weight",
      "layer2.4.mlp.re.region.3.weight",
      "layer2.4.mlp.proj.conv.weight",
      "layer2.4.dcnn.conv_in.conv.weight",
      "layer2.4.dcnn.spe.conv.weight",
      "layer2.4.dcnn.att.logit_scale",
      "layer2.4.dcnn.proj.conv.weight",
      "layer2.5.mlp.conv_in.conv.weight",
      "layer2.5.mlp.dw.conv.weight",
      "layer2.5.mlp.re.region.0.weight",
      "layer2.5.mlp.re.region.3.weight",
      "layer2.5.mlp.proj.conv.weight",
      "layer2.5.dcnn.conv_in.conv.weight",
      "layer2.5.dcnn.spe.conv.weight",
      "layer2.5.dcnn.att.logit_scale",
      "layer2.5.dcnn.proj.conv.weight",
      "layer3.0.mlp.0.conv.weight",
      "layer3.0.mlp.1.conv.weight",
      "layer3.0.mlp.2.conv.weight",
      "layer3.0.skip.0.conv.weight",
      "layer3.0.skip.1.conv.weight",
      "layer3.1.mlp.conv_in.conv.weight",
      "layer3.1.mlp.dw.conv.weight",
      "layer3.1.mlp.re.region.0.weight",
      "layer3.1.mlp.re.region.3.weight",
      "layer3.1.mlp.proj.conv.weight",
      "layer3.1.dcnn.conv_in.conv.weight",
      "layer3.1.dcnn.spe.conv.weight",
      "layer3.1.dcnn.att.logit_scale",
      "layer3.1.dcnn.proj.conv.weight",
      "layer3.2.mlp.conv_in.conv.weight",
      "layer3.2.mlp.dw.conv.weight",
      "layer3.2.mlp.re.region.0.weight",
      "layer3.2.mlp.re.region.3.weight",
      "layer3.2.mlp.proj.conv.weight",
      "layer3.2.dcnn.conv_in.conv.weight",
      "layer3.2.dcnn.spe.conv.weight",
      "layer3.2.dcnn.att.logit_scale",
      "layer3.2.dcnn.proj.conv.weight",
      "layer3.3.mlp.conv_in.conv.weight",
      "layer3.3.mlp.dw.conv.weight",
      "layer3.3.mlp.re.region.0.weight",
      "layer3.3.mlp.re.region.3.weight",
      "layer3.3.mlp.proj.conv.weight",
      "layer3.3.dcnn.conv_in.conv.weight",
      "layer3.3.dcnn.spe.conv.weight",
      "layer3.3.dcnn.att.logit_scale",
      "layer3.3.dcnn.proj.conv.weight",
      "layer3.4.mlp.conv_in.conv.weight",
      "layer3.4.mlp.dw.conv.weight",
      "layer3.4.mlp.re.region.0.weight",
      "layer3.4.mlp.re.region.3.weight",
      "layer3.4.mlp.proj.conv.weight",
      "layer3.4.dcnn.conv_in.conv.weight",
      "layer3.4.dcnn.spe.conv.weight",
      "layer3.4.dcnn.att.logit_scale",
      "layer3.4.dcnn.proj.conv.weight",
      "layer3.5.mlp.conv_in.conv.weight",
      "layer3.5.mlp.dw.conv.weight",
      "layer3.5.mlp.re.region.0.weight",
      "layer3.5.mlp.re.region.3.weight",
      "layer3.5.mlp.proj.conv.weight",
      "layer3.5.dcnn.conv_in.conv.weight",
      "layer3.5.dcnn.spe.conv.weight",
      "layer3.5.dcnn.att.logit_scale",
      "layer3.5.dcnn.proj.conv.weight",
      "layer3.6.mlp.conv_in.conv.weight",
      "layer3.6.mlp.dw.conv.weight",
      "layer3.6.mlp.re.region.0.weight",
      "layer3.6.mlp.re.region.3.weight",
      "layer3.6.mlp.proj.conv.weight",
      "layer3.6.dcnn.conv_in.conv.weight",
      "layer3.6.dcnn.spe.conv.weight",
      "layer3.6.dcnn.att.logit_scale",
      "layer3.6.dcnn.proj.conv.weight",
      "layer3.7.mlp.conv_in.conv.weight",
      "layer3.7.mlp.dw.conv.weight",
      "layer3.7.mlp.re.region.0.weight",
      "layer3.7.mlp.re.region.3.weight",
      "layer3.7.mlp.proj.conv.weight",
      "layer3.7.dcnn.conv_in.conv.weight",
      "layer3.7.dcnn.spe.conv.weight",
      "layer3.7.dcnn.att.logit_scale",
      "layer3.7.dcnn.proj.conv.weight",
      "layer3.8.mlp.conv_in.conv.weight",
      "layer3.8.mlp.dw.conv.weight",
      "layer3.8.mlp.re.region.0.weight",
      "layer3.8.mlp.re.region.3.weight",
      "layer3.8.mlp.proj.conv.weight",
      "layer3.8.dcnn.conv_in.conv.weight",
      "layer3.8.dcnn.spe.conv.weight",
      "layer3.8.dcnn.att.logit_scale",
      "layer3.8.dcnn.proj.conv.weight",
      "layer3.9.mlp.conv_in.conv.weight",
      "layer3.9.mlp.dw.conv.weight",
      "layer3.9.mlp.re.region.0.weight",
      "layer3.9.mlp.re.region.3.weight",
      "layer3.9.mlp.proj.conv.weight",
      "layer3.9.dcnn.conv_in.conv.weight",
      "layer3.9.dcnn.spe.conv.weight",
      "layer3.9.dcnn.att.logit_scale",
      "layer3.9.dcnn.proj.conv.weight",
      "layer3.10.mlp.conv_in.conv.weight",
      "layer3.10.mlp.dw.conv.weight",
      "layer3.10.mlp.re.region.0.weight",
      "layer3.10.mlp.re.region.3.weight",
      "layer3.10.mlp.proj.conv.weight",
      "layer3.10.dcnn.conv_in.conv.weight",
      "layer3.10.dcnn.spe.conv.weight",
      "layer3.10.dcnn.att.logit_scale",
      "layer3.10.dcnn.proj.conv.weight",
      "layer3.11.mlp.conv_in.conv.weight",
      "layer3.11.mlp.dw.conv.weight",
      "layer3.11.mlp.re.region.0.weight",
      "layer3.11.mlp.re.region.3.weight",
      "layer3.11.mlp.proj.conv.weight",
      "layer3.11.dcnn.conv_in.conv.weight",
      "layer3.11.dcnn.spe.conv.weight",
      "layer3.11.dcnn.att.logit_scale",
      "layer3.11.dcnn.proj.conv.weight",
      "layer3.12.mlp.conv_in.conv.weight",
      "layer3.12.mlp.dw.conv.weight",
      "layer3.12.mlp.re.region.0.weight",
      "layer3.12.mlp.re.region.3.weight",
      "layer3.12.mlp.proj.conv.weight",
      "layer3.12.dcnn.conv_in.conv.weight",
      "layer3.12.dcnn.spe.conv.weight",
      "layer3.12.dcnn.att.logit_scale",
      "layer3.12.dcnn.proj.conv.weight",
      "layer3.13.mlp.conv_in.conv.weight",
      "layer3.13.mlp.dw.conv.weight",
      "layer3.13.mlp.re.region.0.weight",
      "layer3.13.mlp.re.region.3.weight",
      "layer3.13.mlp.proj.conv.weight",
      "layer3.13.dcnn.conv_in.conv.weight",
      "layer3.13.dcnn.spe.conv.weight",
      "layer3.13.dcnn.att.logit_scale",
      "layer3.13.dcnn.proj.conv.weight",
      "layer4.0.mlp.0.conv.weight",
      "layer4.0.mlp.1.conv.weight",
      "layer4.0.mlp.2.conv.weight",
      "layer4.0.skip.0.conv.weight",
      "layer4.0.skip.1.conv.weight",
      "layer4.1.mlp.conv_in.conv.weight",
      "layer4.1.mlp.dw.conv.weight",
      "layer4.1.mlp.re.region.0.weight",
      "layer4.1.mlp.re.region.3.weight",
      "layer4.1.mlp.proj.conv.weight",
      "layer4.1.dcnn.conv_in.conv.weight",
      "layer4.1.dcnn.spe.conv.weight",
      "layer4.1.dcnn.att.logit_scale",
      "layer4.1.dcnn.proj.conv.weight",
      "layer4.2.mlp.conv_in.conv.weight",
      "layer4.2.mlp.dw.conv.weight",
      "layer4.2.mlp.re.region.0.weight",
      "layer4.2.mlp.re.region.3.weight",
      "layer4.2.mlp.proj.conv.weight",
      "layer4.2.dcnn.conv_in.conv.weight",
      "layer4.2.dcnn.spe.conv.weight",
      "layer4.2.dcnn.att.logit_scale",
      "layer4.2.dcnn.proj.conv.weight",
      "head.conv.weight",
      "classifier.fc1.weight",
      "classifier.fc2.weight"
    ],
    "lr_scale": 1.0
  },
  "no_decay": {
    "weight_decay": 0.0,
    "params": [
      "first_conv.norm.weight",
      "first_conv.norm.bias",
      "layer1.0.mlp.0.norm.weight",
      "layer1.0.mlp.0.norm.bias",
      "layer1.0.mlp.1.norm.weight",
      "layer1.0.mlp.1.norm.bias",
      "layer1.0.mlp.2.norm.weight",
      "layer1.0.mlp.2.norm.bias",
      "layer1.0.skip.0.norm.weight",
      "layer1.0.skip.0.norm.bias",
      "layer1.0.skip.1.norm.weight",
      "layer1.0.skip.1.norm.bias",
      "layer1.1.mlp.conv_in.norm.weight",
      "layer1.1.mlp.conv_in.norm.bias",
      "layer1.1.mlp.dw.norm.weight",
      "layer1.1.mlp.dw.norm.bias",
      "layer1.1.mlp.re.region.1.weight",
      "layer1.1.mlp.re.region.1.bias",
      "layer1.1.mlp.re.region.3.bias",
      "layer1.1.mlp.proj.norm.weight",
      "layer1.1.mlp.proj.norm.bias",
      "layer1.1.dcnn.conv_in.norm.weight",
      "layer1.1.dcnn.conv_in.norm.bias",
      "layer1.1.dcnn.spe.norm.weight",
      "layer1.1.dcnn.spe.norm.bias",
      "layer1.1.dcnn.proj.norm.weight",
      "layer1.1.dcnn.proj.norm.bias",
      "layer1.2.mlp.conv_in.norm.weight",
      "layer1.2.mlp.conv_in.norm.bias",
      "layer1.2.mlp.dw.norm.weight",
      "layer1.2.mlp.dw.norm.bias",
      "layer1.2.mlp.re.region.1.weight",
      "layer1.2.mlp.re.region.1.bias",
      "layer1.2.mlp.re.region.3.bias",
      "layer1.2.mlp.proj.norm.weight",
      "layer1.2.mlp.proj.norm.bias",
      "layer1.2.dcnn.conv_in.norm.weight",
      "layer1.2.dcnn.conv_in.norm.bias",
      "layer1.2.dcnn.spe.norm.weight",
      "layer1.2.dcnn.spe.norm.bias",
      "layer1.2.dcnn.proj.norm.weight",
      "layer1.2.dcnn.proj.norm.bias",
      "layer2.0.mlp.0.norm.weight",
      "layer2.0.mlp.0.norm.bias",
      "layer2.0.mlp.1.norm.weight",
      "layer2.0.mlp.1.norm.bias",
      "layer2.0.mlp.2.norm.weight",
      "layer2.0.mlp.2.norm.bias",
      "layer2.0.skip.0.norm.weight",
      "layer2.0.skip.0.norm.bias",
      "layer2.0.skip.1.norm.weight",
      "layer2.0.skip.1.norm.bias",
      "layer2.1.mlp.conv_in.norm.weight",
      "layer2.1.mlp.conv_in.norm.bias",
      "layer2.1.mlp.dw.norm.weight",
      "layer2.1.mlp.dw.norm.bias",
      "layer2.1.mlp.re.region.1.weight",
      "layer2.1.mlp.re.region.1.bias",
      "layer2.1.mlp.re.region.3.bias",
      "layer2.1.mlp.proj.norm.weight",
      "layer2.1.mlp.proj.norm.bias",
      "layer2.1.dcnn.conv_in.norm.weight",
      "layer2.1.dcnn.conv_in.norm.bias",
      "layer2.1.dcnn.spe.norm.weight",
      "layer2.1.dcnn.spe.norm.bias",
      "layer2.1.dcnn.proj.norm.weight",
      "layer2.1.dcnn.proj.norm.bias",
      "layer2.2.mlp.conv_in.norm.weight",
      "layer2.2.mlp.conv_in.norm.bias",
      "layer2.2.mlp.dw.norm.weight",
      "layer2.2.mlp.dw.norm.bias",
      "layer2.2.mlp.re.region.1.weight",
      "layer2.2.mlp.re.region.1.bias",
      "layer2.2.mlp.re.region.3.bias",
      "layer2.2.mlp.proj.norm.weight",
      "layer2.2.mlp.proj.norm.bias",
      "layer2.2.dcnn.conv_in.norm.weight",
      "layer2.2.dcnn.conv_in.norm.bias",
      "layer2.2.dcnn.spe.norm.weight",
      "layer2.2.dcnn.spe.norm.bias",
      "layer2.2.dcnn.proj.norm.weight",
      "layer2.2.dcnn.proj.norm.bias",
      "layer2.3.mlp.conv_in.norm.weight",
      "layer2.3.mlp.conv_in.norm.bias",
      "layer2.3.mlp.dw.norm.weight",
      "layer2.3.mlp.dw.norm.bias",
      "layer2.3.mlp.re.region.1.weight",
      "layer2.3.mlp.re.region.1.bias",
      "layer2.3.mlp.re.region.3.bias",
      "layer2.3.mlp.proj.norm.weight",
      "layer2.3.mlp.proj.norm.bias",
      "layer2.3.dcnn.conv_in.norm.weight",
      "layer2.3.dcnn.conv_in.norm.bias",
      "layer2.3.dcnn.spe.norm.weight",
      "layer2.3.dcnn.spe.norm.bias",
      "layer2.3.dcnn.proj.norm.weight",
      "layer2.3.dcnn.proj.norm.bias",
      "layer2.4.mlp.conv_in.norm.weight",
      "layer2.4.mlp.conv_in.norm.bias",
      "layer2.4.mlp.dw.norm.weight",
      "layer2.4.mlp.dw.norm.bias",
      "layer2.4.mlp.re.region.1.weight",
      "layer2.4.mlp.re.region.1.bias",
      "layer2.4.mlp.re.region.3.bias",
      "layer2.4.mlp.proj.norm.weight",
      "layer2.4.mlp.proj.norm.bias",
      "layer2.4.dcnn.conv_in.norm.weight",
      "layer2.4.dcnn.conv_in.norm.bias",
      "layer2.4.dcnn.spe.norm.weight",
      "layer2.4.dcnn.spe.norm.bias",
      "layer2.4.dcnn.proj.norm.weight",
      "layer2.4.dcnn.proj.norm.bias",
      "layer2.5.mlp.conv_in.norm.weight",
      "layer2.5.mlp.conv_in.norm.bias",
      "layer2.5.mlp.dw.norm.weight",
      "layer2.5.mlp.dw.norm.bias",
      "layer2.5.mlp.re.region.1.weight",
      "layer2.5.mlp.re.region.1.bias",
      "layer2.5.mlp.re.region.3.bias",
      "layer2.5.mlp.proj.norm.weight",
      "layer2.5.mlp.proj.norm.bias",
      "layer2.5.dcnn.conv_in.norm.weight",
      "layer2.5.dcnn.conv_in.norm.bias",
      "layer2.5.dcnn.spe.norm.weight",
      "layer2.5.dcnn.spe.norm.bias",
      "layer2.5.dcnn.proj.norm.weight",
      "layer2.5.dcnn.proj.norm.bias",
      "layer3.0.mlp.0.norm.weight",
      "layer3.0.mlp.0.norm.bias",
      "layer3.0.mlp.1.norm.weight",
      "layer3.0.mlp.1.norm.bias",
      "layer3.0.mlp.2.norm.weight",
      "layer3.0.mlp.2.norm.bias",
      "layer3.0.skip.0.norm.weight",
      "layer3.0.skip.0.norm.bias",
      "layer3.0.skip.1.norm.weight",
      "layer3.0.skip.1.norm.bias",
      "layer3.1.mlp.conv_in.norm.weight",
      "layer3.1.mlp.conv_in.norm.bias",
      "layer3.1.mlp.dw.norm.weight",
      "layer3.1.mlp.dw.norm.bias",
      "layer3.1.mlp.re.region.1.weight",
      "layer3.1.mlp.re.region.1.bias",
      "layer3.1.mlp.re.region.3.bias",
      "layer3.1.mlp.proj.norm.weight",
      "layer3.1.mlp.proj.norm.bias",
      "layer3.1.dcnn.conv_in.norm.weight",
      "layer3.1.dcnn.conv_in.norm.bias",
      "layer3.1.dcnn.spe.norm.weight",
      "layer3.1.dcnn.spe.norm.bias",
      "layer3.1.dcnn.proj.norm.weight",
      "layer3.1.dcnn.proj.norm.bias",
      "layer3.2.mlp.conv_in.norm.weight",
      "layer3.2.mlp.conv_in.norm.bias",
      "layer3.2.mlp.dw.norm.weight",
      "layer3.2.mlp.dw.norm.bias",
      "layer3.2.mlp.re.region.1.weight",
      "layer3.2.mlp.re.region.1.bias",
      "layer3.2.mlp.re.region.3.bias",
      "layer3.2.mlp.proj.norm.weight",
      "layer3.2.mlp.proj.norm.bias",
      "layer3.2.dcnn.conv_in.norm.weight",
      "layer3.2.dcnn.conv_in.norm.bias",
      "layer3.2.dcnn.spe.norm.weight",
      "layer3.2.dcnn.spe.norm.bias",
      "layer3.2.dcnn.proj.norm.weight",
      "layer3.2.dcnn.proj.norm.bias",
      "layer3.3.mlp.conv_in.norm.weight",
      "layer3.3.mlp.conv_in.norm.bias",
      "layer3.3.mlp.dw.norm.weight",
      "layer3.3.mlp.dw.norm.bias",
      "layer3.3.mlp.re.region.1.weight",
      "layer3.3.mlp.re.region.1.bias",
      "layer3.3.mlp.re.region.3.bias",
      "layer3.3.mlp.proj.norm.weight",
      "layer3.3.mlp.proj.norm.bias",
      "layer3.3.dcnn.conv_in.norm.weight",
      "layer3.3.dcnn.conv_in.norm.bias",
      "layer3.3.dcnn.spe.norm.weight",
      "layer3.3.dcnn.spe.norm.bias",
      "layer3.3.dcnn.proj.norm.weight",
      "layer3.3.dcnn.proj.norm.bias",
      "layer3.4.mlp.conv_in.norm.weight",
      "layer3.4.mlp.conv_in.norm.bias",
      "layer3.4.mlp.dw.norm.weight",
      "layer3.4.mlp.dw.norm.bias",
      "layer3.4.mlp.re.region.1.weight",
      "layer3.4.mlp.re.region.1.bias",
      "layer3.4.mlp.re.region.3.bias",
      "layer3.4.mlp.proj.norm.weight",
      "layer3.4.mlp.proj.norm.bias",
      "layer3.4.dcnn.conv_in.norm.weight",
      "layer3.4.dcnn.conv_in.norm.bias",
      "layer3.4.dcnn.spe.norm.weight",
      "layer3.4.dcnn.spe.norm.bias",
      "layer3.4.dcnn.proj.norm.weight",
      "layer3.4.dcnn.proj.norm.bias",
      "layer3.5.mlp.conv_in.norm.weight",
      "layer3.5.mlp.conv_in.norm.bias",
      "layer3.5.mlp.dw.norm.weight",
      "layer3.5.mlp.dw.norm.bias",
      "layer3.5.mlp.re.region.1.weight",
      "layer3.5.mlp.re.region.1.bias",
      "layer3.5.mlp.re.region.3.bias",
      "layer3.5.mlp.proj.norm.weight",
      "layer3.5.mlp.proj.norm.bias",
      "layer3.5.dcnn.conv_in.norm.weight",
      "layer3.5.dcnn.conv_in.norm.bias",
      "layer3.5.dcnn.spe.norm.weight",
      "layer3.5.dcnn.spe.norm.bias",
      "layer3.5.dcnn.proj.norm.weight",
      "layer3.5.dcnn.proj.norm.bias",
      "layer3.6.mlp.conv_in.norm.weight",
      "layer3.6.mlp.conv_in.norm.bias",
      "layer3.6.mlp.dw.norm.weight",
      "layer3.6.mlp.dw.norm.bias",
      "layer3.6.mlp.re.region.1.weight",
      "layer3.6.mlp.re.region.1.bias",
      "layer3.6.mlp.re.region.3.bias",
      "layer3.6.mlp.proj.norm.weight",
      "layer3.6.mlp.proj.norm.bias",
      "layer3.6.dcnn.conv_in.norm.weight",
      "layer3.6.dcnn.conv_in.norm.bias",
      "layer3.6.dcnn.spe.norm.weight",
      "layer3.6.dcnn.spe.norm.bias",
      "layer3.6.dcnn.proj.norm.weight",
      "layer3.6.dcnn.proj.norm.bias",
      "layer3.7.mlp.conv_in.norm.weight",
      "layer3.7.mlp.conv_in.norm.bias",
      "layer3.7.mlp.dw.norm.weight",
      "layer3.7.mlp.dw.norm.bias",
      "layer3.7.mlp.re.region.1.weight",
      "layer3.7.mlp.re.region.1.bias",
      "layer3.7.mlp.re.region.3.bias",
      "layer3.7.mlp.proj.norm.weight",
      "layer3.7.mlp.proj.norm.bias",
      "layer3.7.dcnn.conv_in.norm.weight",
      "layer3.7.dcnn.conv_in.norm.bias",
      "layer3.7.dcnn.spe.norm.weight",
      "layer3.7.dcnn.spe.norm.bias",
      "layer3.7.dcnn.proj.norm.weight",
      "layer3.7.dcnn.proj.norm.bias",
      "layer3.8.mlp.conv_in.norm.weight",
      "layer3.8.mlp.conv_in.norm.bias",
      "layer3.8.mlp.dw.norm.weight",
      "layer3.8.mlp.dw.norm.bias",
      "layer3.8.mlp.re.region.1.weight",
      "layer3.8.mlp.re.region.1.bias",
      "layer3.8.mlp.re.region.3.bias",
      "layer3.8.mlp.proj.norm.weight",
      "layer3.8.mlp.proj.norm.bias",
      "layer3.8.dcnn.conv_in.norm.weight",
      "layer3.8.dcnn.conv_in.norm.bias",
      "layer3.8.dcnn.spe.norm.weight",
      "layer3.8.dcnn.spe.norm.bias",
      "layer3.8.dcnn.proj.norm.weight",
      "layer3.8.dcnn.proj.norm.bias",
      "layer3.9.mlp.conv_in.norm.weight",
      "layer3.9.mlp.conv_in.norm.bias",
      "layer3.9.mlp.dw.norm.weight",
      "layer3.9.mlp.dw.norm.bias",
      "layer3.9.mlp.re.region.1.weight",
      "layer3.9.mlp.re.region.1.bias",
      "layer3.9.mlp.re.region.3.bias",
      "layer3.9.mlp.proj.norm.weight",
      "layer3.9.mlp.proj.norm.bias",
      "layer3.9.dcnn.conv_in.norm.weight",
      "layer3.9.dcnn.conv_in.norm.bias",
      "layer3.9.dcnn.spe.norm.weight",
      "layer3.9.dcnn.spe.norm.bias",
      "layer3.9.dcnn.proj.norm.weight",
      "layer3.9.dcnn.proj.norm.bias",
      "layer3.10.mlp.conv_in.norm.weight",
      "layer3.10.mlp.conv_in.norm.bias",
      "layer3.10.mlp.dw.norm.weight",
      "layer3.10.mlp.dw.norm.bias",
      "layer3.10.mlp.re.region.1.weight",
      "layer3.10.mlp.re.region.1.bias",
      "layer3.10.mlp.re.region.3.bias",
      "layer3.10.mlp.proj.norm.weight",
      "layer3.10.mlp.proj.norm.bias",
      "layer3.10.dcnn.conv_in.norm.weight",
      "layer3.10.dcnn.conv_in.norm.bias",
      "layer3.10.dcnn.spe.norm.weight",
      "layer3.10.dcnn.spe.norm.bias",
      "layer3.10.dcnn.proj.norm.weight",
      "layer3.10.dcnn.proj.norm.bias",
      "layer3.11.mlp.conv_in.norm.weight",
      "layer3.11.mlp.conv_in.norm.bias",
      "layer3.11.mlp.dw.norm.weight",
      "layer3.11.mlp.dw.norm.bias",
      "layer3.11.mlp.re.region.1.weight",
      "layer3.11.mlp.re.region.1.bias",
      "layer3.11.mlp.re.region.3.bias",
      "layer3.11.mlp.proj.norm.weight",
      "layer3.11.mlp.proj.norm.bias",
      "layer3.11.dcnn.conv_in.norm.weight",
      "layer3.11.dcnn.conv_in.norm.bias",
      "layer3.11.dcnn.spe.norm.weight",
      "layer3.11.dcnn.spe.norm.bias",
      "layer3.11.dcnn.proj.norm.weight",
      "layer3.11.dcnn.proj.norm.bias",
      "layer3.12.mlp.conv_in.norm.weight",
      "layer3.12.mlp.conv_in.norm.bias",
      "layer3.12.mlp.dw.norm.weight",
      "layer3.12.mlp.dw.norm.bias",
      "layer3.12.mlp.re.region.1.weight",
      "layer3.12.mlp.re.region.1.bias",
      "layer3.12.mlp.re.region.3.bias",
      "layer3.12.mlp.proj.norm.weight",
      "layer3.12.mlp.proj.norm.bias",
      "layer3.12.dcnn.conv_in.norm.weight",
      "layer3.12.dcnn.conv_in.norm.bias",
      "layer3.12.dcnn.spe.norm.weight",
      "layer3.12.dcnn.spe.norm.bias",
      "layer3.12.dcnn.proj.norm.weight",
      "layer3.12.dcnn.proj.norm.bias",
      "layer3.13.mlp.conv_in.norm.weight",
      "layer3.13.mlp.conv_in.norm.bias",
      "layer3.13.mlp.dw.norm.weight",
      "layer3.13.mlp.dw.norm.bias",
      "layer3.13.mlp.re.region.1.weight",
      "layer3.13.mlp.re.region.1.bias",
      "layer3.13.mlp.re.region.3.bias",
      "layer3.13.mlp.proj.norm.weight",
      "layer3.13.mlp.proj.norm.bias",
      "layer3.13.dcnn.conv_in.norm.weight",
      "layer3.13.dcnn.conv_in.norm.bias",
      "layer3.13.dcnn.spe.norm.weight",
      "layer3.13.dcnn.spe.norm.bias",
      "layer3.13.dcnn.proj.norm.weight",
      "layer3.13.dcnn.proj.norm.bias",
      "layer4.0.mlp.0.norm.weight",
      "layer4.0.mlp.0.norm.bias",
      "layer4.0.mlp.1.norm.weight",
      "layer4.0.mlp.1.norm.bias",
      "layer4.0.mlp.2.norm.weight",
      "layer4.0.mlp.2.norm.bias",
      "layer4.0.skip.0.norm.weight",
      "layer4.0.skip.0.norm.bias",
      "layer4.0.skip.1.norm.weight",
      "layer4.0.skip.1.norm.bias",
      "layer4.1.mlp.conv_in.norm.weight",
      "layer4.1.mlp.conv_in.norm.bias",
      "layer4.1.mlp.dw.norm.weight",
      "layer4.1.mlp.dw.norm.bias",
      "layer4.1.mlp.re.region.1.weight",
      "layer4.1.mlp.re.region.1.bias",
      "layer4.1.mlp.re.region.3.bias",
      "layer4.1.mlp.proj.norm.weight",
      "layer4.1.mlp.proj.norm.bias",
      "layer4.1.dcnn.conv_in.norm.weight",
      "layer4.1.dcnn.conv_in.norm.bias",
      "layer4.1.dcnn.spe.norm.weight",
      "layer4.1.dcnn.spe.norm.bias",
      "layer4.1.dcnn.proj.norm.weight",
      "layer4.1.dcnn.proj.norm.bias",
      "layer4.2.mlp.conv_in.norm.weight",
      "layer4.2.mlp.conv_in.norm.bias",
      "layer4.2.mlp.dw.norm.weight",
      "layer4.2.mlp.dw.norm.bias",
      "layer4.2.mlp.re.region.1.weight",
      "layer4.2.mlp.re.region.1.bias",
      "layer4.2.mlp.re.region.3.bias",
      "layer4.2.mlp.proj.norm.weight",
      "layer4.2.mlp.proj.norm.bias",
      "layer4.2.dcnn.conv_in.norm.weight",
      "layer4.2.dcnn.conv_in.norm.bias",
      "layer4.2.dcnn.spe.norm.weight",
      "layer4.2.dcnn.spe.norm.bias",
      "layer4.2.dcnn.proj.norm.weight",
      "layer4.2.dcnn.proj.norm.bias",
      "head.norm.weight",
      "head.norm.bias",
      "classifier.norm.weight",
      "classifier.norm.bias"
    ],
    "lr_scale": 1.0
  }
}
Use Cosine LR scheduler
Set warmup steps = 6240
Set warmup steps = 0
Max WD = 0.0500000, Min WD = 0.0500000
criterion = SoftTargetCrossEntropy()
Auto resume checkpoint: 
Start training for 300 epochs
Epoch: [0]  [   0/1251]  eta: 4:19:00  lr: 0.000000  min_lr: 0.000000  loss: 7.0064 (7.0064)  weight_decay: 0.0500 (0.0500)  time: 12.4227  data: 3.3429  max mem: 28503
Epoch: [0]  [ 200/1251]  eta: 0:07:11  lr: 0.000032  min_lr: 0.000032  loss: 6.9479 (6.9574)  weight_decay: 0.0500 (0.0500)  grad_norm: 33.8149 (nan)  time: 0.3484  data: 0.0005  max mem: 28503
Epoch: [0]  [ 400/1251]  eta: 0:05:23  lr: 0.000064  min_lr: 0.000064  loss: 6.8338 (6.9282)  weight_decay: 0.0500 (0.0500)  grad_norm: 6.3190 (nan)  time: 0.3478  data: 0.0005  max mem: 28503
Epoch: [0]  [ 600/1251]  eta: 0:04:00  lr: 0.000096  min_lr: 0.000096  loss: 6.7151 (6.8823)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.4595 (nan)  time: 0.3475  data: 0.0004  max mem: 28503
Epoch: [0]  [ 800/1251]  eta: 0:02:44  lr: 0.000128  min_lr: 0.000128  loss: 6.6473 (6.8331)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.7998 (nan)  time: 0.3488  data: 0.0004  max mem: 28503
Epoch: [0]  [1000/1251]  eta: 0:01:30  lr: 0.000160  min_lr: 0.000160  loss: 6.5908 (6.7827)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.2175 (nan)  time: 0.3485  data: 0.0005  max mem: 28503
Epoch: [0]  [1200/1251]  eta: 0:00:18  lr: 0.000192  min_lr: 0.000192  loss: 6.5626 (6.7397)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.6642 (nan)  time: 0.3504  data: 0.0004  max mem: 28503
Epoch: [0]  [1250/1251]  eta: 0:00:00  lr: 0.000199  min_lr: 0.000199  loss: 6.4636 (6.7291)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.8015 (nan)  time: 0.2949  data: 0.0004  max mem: 28503
Epoch: [0] Total time: 0:07:29 (0.3596 s / it)
Averaged stats: lr: 0.000199  min_lr: 0.000199  loss: 6.4636 (6.7288)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.8015 (nan)
Test:  [ 0/25]  eta: 0:04:42  loss: 5.5311 (5.5311)  acc1: 6.4000 (6.4000)  acc5: 13.2000 (13.2000)  time: 11.3008  data: 7.3168  max mem: 28503
Test:  [10/25]  eta: 0:00:17  loss: 5.6088 (5.7144)  acc1: 2.4000 (3.6364)  acc5: 11.2000 (11.5273)  time: 1.1806  data: 0.6654  max mem: 28503
Test:  [20/25]  eta: 0:00:03  loss: 5.6257 (5.6883)  acc1: 3.2000 (3.8667)  acc5: 11.6000 (12.7238)  time: 0.1685  data: 0.0002  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 5.6440 (5.6335)  acc1: 3.6000 (4.5120)  acc5: 12.4000 (13.7600)  time: 0.1684  data: 0.0001  max mem: 28503
Test: Total time: 0:00:15 (0.6164 s / it)
* Acc@1 4.434 Acc@5 13.950 loss 5.637
Accuracy of the model on the 50000 test images: 4.4%
Max accuracy: 4.43%
Epoch: [1]  [   0/1251]  eta: 1:03:00  lr: 0.000200  min_lr: 0.000200  loss: 6.0461 (6.0461)  weight_decay: 0.0500 (0.0500)  time: 3.0221  data: 2.4914  max mem: 28503
Epoch: [1]  [ 200/1251]  eta: 0:06:18  lr: 0.000232  min_lr: 0.000232  loss: 6.4612 (6.4387)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.9364 (4.0338)  time: 0.3462  data: 0.0004  max mem: 28503
Epoch: [1]  [ 400/1251]  eta: 0:05:00  lr: 0.000264  min_lr: 0.000264  loss: 6.2415 (6.3889)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.0350 (4.0479)  time: 0.3474  data: 0.0004  max mem: 28503
Epoch: [1]  [ 600/1251]  eta: 0:03:49  lr: 0.000296  min_lr: 0.000296  loss: 6.2589 (6.3519)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.7001 (4.0650)  time: 0.3575  data: 0.0004  max mem: 28503
Epoch: [1]  [ 800/1251]  eta: 0:02:38  lr: 0.000328  min_lr: 0.000328  loss: 6.3089 (6.3112)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.8849 (4.0325)  time: 0.3483  data: 0.0005  max mem: 28503
Epoch: [1]  [1000/1251]  eta: 0:01:27  lr: 0.000360  min_lr: 0.000360  loss: 6.0263 (6.2783)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.8324 (4.0237)  time: 0.3536  data: 0.0004  max mem: 28503
Epoch: [1]  [1200/1251]  eta: 0:00:17  lr: 0.000392  min_lr: 0.000392  loss: 6.0798 (6.2461)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.3990 (4.0903)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [1]  [1250/1251]  eta: 0:00:00  lr: 0.000399  min_lr: 0.000399  loss: 5.9306 (6.2381)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.9459 (4.0744)  time: 0.2916  data: 0.0006  max mem: 28503
Epoch: [1] Total time: 0:07:17 (0.3497 s / it)
Averaged stats: lr: 0.000399  min_lr: 0.000399  loss: 5.9306 (6.2392)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.9459 (4.0744)
Test:  [ 0/25]  eta: 0:02:19  loss: 4.2959 (4.2959)  acc1: 17.6000 (17.6000)  acc5: 41.2000 (41.2000)  time: 5.5848  data: 5.3832  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 4.2959 (4.3431)  acc1: 14.8000 (15.6000)  acc5: 40.8000 (37.7091)  time: 0.7122  data: 0.5401  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 4.5905 (4.5066)  acc1: 14.4000 (15.0286)  acc5: 34.0000 (35.5429)  time: 0.2030  data: 0.0342  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 4.5960 (4.4761)  acc1: 14.4000 (15.4240)  acc5: 34.0000 (35.8400)  time: 0.2026  data: 0.0341  max mem: 28503
Test: Total time: 0:00:10 (0.4159 s / it)
* Acc@1 15.280 Acc@5 35.540 loss 4.479
Accuracy of the model on the 50000 test images: 15.3%
Max accuracy: 15.28%
Epoch: [2]  [   0/1251]  eta: 0:55:05  lr: 0.000400  min_lr: 0.000400  loss: 5.7345 (5.7345)  weight_decay: 0.0500 (0.0500)  time: 2.6422  data: 2.2636  max mem: 28503
Epoch: [2]  [ 200/1251]  eta: 0:06:18  lr: 0.000432  min_lr: 0.000432  loss: 6.1631 (6.0527)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.2863 (4.3406)  time: 0.3575  data: 0.0004  max mem: 28503
Epoch: [2]  [ 400/1251]  eta: 0:05:02  lr: 0.000464  min_lr: 0.000464  loss: 5.8461 (5.9769)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.8095 (4.1881)  time: 0.3469  data: 0.0004  max mem: 28503
Epoch: [2]  [ 600/1251]  eta: 0:03:49  lr: 0.000496  min_lr: 0.000496  loss: 6.0007 (5.9412)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.9505 (4.1546)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [2]  [ 800/1251]  eta: 0:02:38  lr: 0.000528  min_lr: 0.000528  loss: 5.8402 (5.9172)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.7662 (4.1562)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [2]  [1000/1251]  eta: 0:01:28  lr: 0.000560  min_lr: 0.000560  loss: 5.6081 (5.8849)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.5204 (4.1072)  time: 0.3766  data: 0.0004  max mem: 28503
Epoch: [2]  [1200/1251]  eta: 0:00:17  lr: 0.000592  min_lr: 0.000592  loss: 5.8063 (5.8638)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.7499 (4.0416)  time: 0.3472  data: 0.0004  max mem: 28503
Epoch: [2]  [1250/1251]  eta: 0:00:00  lr: 0.000599  min_lr: 0.000599  loss: 5.8256 (5.8601)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.7747 (4.0410)  time: 0.2917  data: 0.0005  max mem: 28503
Epoch: [2] Total time: 0:07:17 (0.3500 s / it)
Averaged stats: lr: 0.000599  min_lr: 0.000599  loss: 5.8256 (5.8476)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.7747 (4.0410)
Test:  [ 0/25]  eta: 0:02:19  loss: 3.4842 (3.4842)  acc1: 28.0000 (28.0000)  acc5: 56.4000 (56.4000)  time: 5.5864  data: 5.3862  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 3.4689 (3.5165)  acc1: 28.0000 (28.0000)  acc5: 56.4000 (55.1636)  time: 0.7388  data: 0.5662  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 3.9122 (3.7886)  acc1: 22.0000 (24.6476)  acc5: 46.0000 (49.9429)  time: 0.2113  data: 0.0421  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 4.0392 (3.7495)  acc1: 22.0000 (25.5520)  acc5: 46.0000 (50.5760)  time: 0.2111  data: 0.0421  max mem: 28503
Test: Total time: 0:00:10 (0.4233 s / it)
* Acc@1 25.562 Acc@5 50.832 loss 3.752
Accuracy of the model on the 50000 test images: 25.6%
Max accuracy: 25.56%
Epoch: [3]  [   0/1251]  eta: 1:10:55  lr: 0.000600  min_lr: 0.000600  loss: 5.7756 (5.7756)  weight_decay: 0.0500 (0.0500)  time: 3.4019  data: 3.0477  max mem: 28503
Epoch: [3]  [ 200/1251]  eta: 0:06:20  lr: 0.000632  min_lr: 0.000632  loss: 5.6104 (5.6180)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.5294 (4.0506)  time: 0.3461  data: 0.0004  max mem: 28503
Epoch: [3]  [ 400/1251]  eta: 0:05:02  lr: 0.000664  min_lr: 0.000664  loss: 5.4385 (5.5933)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.7881 (3.9016)  time: 0.3444  data: 0.0004  max mem: 28503
Epoch: [3]  [ 600/1251]  eta: 0:03:50  lr: 0.000696  min_lr: 0.000696  loss: 5.1291 (5.5738)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.7868 (3.7767)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [3]  [ 800/1251]  eta: 0:02:38  lr: 0.000728  min_lr: 0.000728  loss: 5.5376 (5.5629)  weight_decay: 0.0500 (0.0500)  grad_norm: 4.1626 (3.8223)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [3]  [1000/1251]  eta: 0:01:28  lr: 0.000760  min_lr: 0.000760  loss: 5.4859 (5.5580)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.6795 (3.7436)  time: 0.3452  data: 0.0005  max mem: 28503
Epoch: [3]  [1200/1251]  eta: 0:00:17  lr: 0.000792  min_lr: 0.000792  loss: 5.7259 (5.5285)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.9723 (3.6437)  time: 0.3561  data: 0.0005  max mem: 28503
Epoch: [3]  [1250/1251]  eta: 0:00:00  lr: 0.000799  min_lr: 0.000799  loss: 5.6704 (5.5255)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.2174 (3.6226)  time: 0.2919  data: 0.0006  max mem: 28503
Epoch: [3] Total time: 0:07:18 (0.3505 s / it)
Averaged stats: lr: 0.000799  min_lr: 0.000799  loss: 5.6704 (5.5325)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.2174 (3.6226)
Test:  [ 0/25]  eta: 0:02:26  loss: 2.7664 (2.7664)  acc1: 44.4000 (44.4000)  acc5: 73.2000 (73.2000)  time: 5.8647  data: 5.6567  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 2.7664 (2.8405)  acc1: 41.6000 (39.6727)  acc5: 69.6000 (69.2000)  time: 0.6961  data: 0.5237  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 3.2099 (3.1450)  acc1: 32.8000 (35.7143)  acc5: 58.0000 (62.8190)  time: 0.1822  data: 0.0135  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 3.3852 (3.1290)  acc1: 31.2000 (36.0160)  acc5: 56.4000 (62.8480)  time: 0.1820  data: 0.0134  max mem: 28503
Test: Total time: 0:00:10 (0.4163 s / it)
* Acc@1 35.148 Acc@5 62.102 loss 3.143
Accuracy of the model on the 50000 test images: 35.1%
Max accuracy: 35.15%
Epoch: [4]  [   0/1251]  eta: 1:03:15  lr: 0.000800  min_lr: 0.000800  loss: 5.3368 (5.3368)  weight_decay: 0.0500 (0.0500)  time: 3.0343  data: 2.6688  max mem: 28503
Epoch: [4]  [ 200/1251]  eta: 0:06:22  lr: 0.000832  min_lr: 0.000832  loss: 5.4284 (5.3601)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.8082 (3.2958)  time: 0.3445  data: 0.0004  max mem: 28503
Epoch: [4]  [ 400/1251]  eta: 0:05:02  lr: 0.000864  min_lr: 0.000864  loss: 5.2998 (5.3655)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.7433 (3.2084)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [4]  [ 600/1251]  eta: 0:03:50  lr: 0.000896  min_lr: 0.000896  loss: 5.5365 (5.3407)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.3264 (3.1765)  time: 0.3457  data: 0.0004  max mem: 28503
Epoch: [4]  [ 800/1251]  eta: 0:02:38  lr: 0.000928  min_lr: 0.000928  loss: 5.3676 (5.3154)  weight_decay: 0.0500 (0.0500)  grad_norm: 3.0306 (3.1765)  time: 0.3466  data: 0.0004  max mem: 28503
Epoch: [4]  [1000/1251]  eta: 0:01:28  lr: 0.000960  min_lr: 0.000960  loss: 5.4554 (5.3099)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.7668 (3.0952)  time: 0.3652  data: 0.0004  max mem: 28503
Epoch: [4]  [1200/1251]  eta: 0:00:17  lr: 0.000992  min_lr: 0.000992  loss: 5.4768 (5.2904)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.7045 (3.0895)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [4]  [1250/1251]  eta: 0:00:00  lr: 0.001000  min_lr: 0.001000  loss: 5.2721 (5.2843)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.6666 (3.0771)  time: 0.2921  data: 0.0007  max mem: 28503
Epoch: [4] Total time: 0:07:18 (0.3505 s / it)
Averaged stats: lr: 0.001000  min_lr: 0.001000  loss: 5.2721 (5.2750)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.6666 (3.0771)
Test:  [ 0/25]  eta: 0:02:19  loss: 2.2233 (2.2233)  acc1: 56.4000 (56.4000)  acc5: 78.8000 (78.8000)  time: 5.5939  data: 5.3924  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 2.2950 (2.4337)  acc1: 48.4000 (46.9455)  acc5: 78.8000 (76.4727)  time: 0.6630  data: 0.4906  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 2.8899 (2.7604)  acc1: 37.6000 (42.4381)  acc5: 66.0000 (69.5810)  time: 0.1801  data: 0.0112  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 3.0194 (2.7508)  acc1: 38.8000 (42.7200)  acc5: 64.8000 (69.8560)  time: 0.1931  data: 0.0244  max mem: 28503
Test: Total time: 0:00:10 (0.4085 s / it)
* Acc@1 43.226 Acc@5 69.980 loss 2.733
Accuracy of the model on the 50000 test images: 43.2%
Max accuracy: 43.23%
Epoch: [5]  [   0/1251]  eta: 1:03:37  lr: 0.001000  min_lr: 0.001000  loss: 5.4384 (5.4384)  weight_decay: 0.0500 (0.0500)  time: 3.0513  data: 2.6934  max mem: 28503
Epoch: [5]  [ 200/1251]  eta: 0:06:20  lr: 0.001032  min_lr: 0.001032  loss: 5.0870 (5.2325)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.2814 (2.7036)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [5]  [ 400/1251]  eta: 0:05:01  lr: 0.001064  min_lr: 0.001064  loss: 5.1716 (5.1655)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.4906 (2.6854)  time: 0.3556  data: 0.0004  max mem: 28503
Epoch: [5]  [ 600/1251]  eta: 0:03:49  lr: 0.001096  min_lr: 0.001096  loss: 5.0053 (5.1314)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.9092 (2.6114)  time: 0.3547  data: 0.0004  max mem: 28503
Epoch: [5]  [ 800/1251]  eta: 0:02:38  lr: 0.001128  min_lr: 0.001128  loss: 5.0375 (5.0849)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.4311 (2.5835)  time: 0.3547  data: 0.0004  max mem: 28503
Epoch: [5]  [1000/1251]  eta: 0:01:27  lr: 0.001160  min_lr: 0.001160  loss: 5.1458 (5.0791)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.0260 (2.5301)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [5]  [1200/1251]  eta: 0:00:17  lr: 0.001192  min_lr: 0.001192  loss: 4.9367 (5.0636)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.6999 (2.5576)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [5]  [1250/1251]  eta: 0:00:00  lr: 0.001200  min_lr: 0.001200  loss: 4.8919 (5.0596)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.5559 (2.5582)  time: 0.2914  data: 0.0006  max mem: 28503
Epoch: [5] Total time: 0:07:16 (0.3493 s / it)
Averaged stats: lr: 0.001200  min_lr: 0.001200  loss: 4.8919 (5.0652)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.5559 (2.5582)
Test:  [ 0/25]  eta: 0:02:21  loss: 1.9056 (1.9056)  acc1: 64.8000 (64.8000)  acc5: 83.2000 (83.2000)  time: 5.6741  data: 5.4786  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 2.0008 (2.1128)  acc1: 56.8000 (55.1273)  acc5: 83.2000 (80.4000)  time: 0.7364  data: 0.5642  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 2.5695 (2.4556)  acc1: 43.6000 (48.6286)  acc5: 69.6000 (73.7714)  time: 0.2055  data: 0.0364  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 2.6744 (2.4538)  acc1: 42.8000 (48.6240)  acc5: 67.6000 (73.8400)  time: 0.2048  data: 0.0364  max mem: 28503
Test: Total time: 0:00:10 (0.4215 s / it)
* Acc@1 47.902 Acc@5 74.196 loss 2.448
Accuracy of the model on the 50000 test images: 47.9%
Max accuracy: 47.90%
Epoch: [6]  [   0/1251]  eta: 1:09:35  lr: 0.001200  min_lr: 0.001200  loss: 5.4648 (5.4648)  weight_decay: 0.0500 (0.0500)  time: 3.3378  data: 2.9732  max mem: 28503
Epoch: [6]  [ 200/1251]  eta: 0:06:24  lr: 0.001232  min_lr: 0.001232  loss: 4.7076 (4.9267)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.1725 (2.4861)  time: 0.3474  data: 0.0004  max mem: 28503
Epoch: [6]  [ 400/1251]  eta: 0:05:04  lr: 0.001264  min_lr: 0.001264  loss: 4.9871 (4.9221)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.0317 (2.4156)  time: 0.3497  data: 0.0004  max mem: 28503
Epoch: [6]  [ 600/1251]  eta: 0:03:50  lr: 0.001296  min_lr: 0.001296  loss: 5.2071 (4.9102)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.1322 (2.3483)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [6]  [ 800/1251]  eta: 0:02:39  lr: 0.001328  min_lr: 0.001328  loss: 4.9356 (4.9024)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.9485 (2.3120)  time: 0.3556  data: 0.0004  max mem: 28503
Epoch: [6]  [1000/1251]  eta: 0:01:28  lr: 0.001360  min_lr: 0.001360  loss: 4.6324 (4.8982)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.9231 (2.2433)  time: 0.3632  data: 0.0004  max mem: 28503
Epoch: [6]  [1200/1251]  eta: 0:00:17  lr: 0.001393  min_lr: 0.001393  loss: 4.7335 (4.8945)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.2090 (2.2243)  time: 0.3538  data: 0.0004  max mem: 28503
Epoch: [6]  [1250/1251]  eta: 0:00:00  lr: 0.001400  min_lr: 0.001400  loss: 5.0459 (4.8944)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.1540 (2.2163)  time: 0.2921  data: 0.0006  max mem: 28503
Epoch: [6] Total time: 0:07:19 (0.3515 s / it)
Averaged stats: lr: 0.001400  min_lr: 0.001400  loss: 5.0459 (4.9083)  weight_decay: 0.0500 (0.0500)  grad_norm: 2.1540 (2.2163)
Test:  [ 0/25]  eta: 0:02:17  loss: 1.7974 (1.7974)  acc1: 66.4000 (66.4000)  acc5: 84.0000 (84.0000)  time: 5.5162  data: 5.3124  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 1.8863 (1.9728)  acc1: 59.2000 (58.0727)  acc5: 84.8000 (83.6727)  time: 0.7480  data: 0.5751  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 2.4219 (2.2846)  acc1: 48.0000 (52.9333)  acc5: 75.6000 (77.9238)  time: 0.2198  data: 0.0507  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 2.5462 (2.3014)  acc1: 46.8000 (52.5600)  acc5: 72.8000 (77.6800)  time: 0.2192  data: 0.0506  max mem: 28503
Test: Total time: 0:00:10 (0.4270 s / it)
* Acc@1 51.916 Acc@5 77.708 loss 2.299
Accuracy of the model on the 50000 test images: 51.9%
Max accuracy: 51.92%
Epoch: [7]  [   0/1251]  eta: 1:04:05  lr: 0.001400  min_lr: 0.001400  loss: 5.6834 (5.6834)  weight_decay: 0.0500 (0.0500)  time: 3.0739  data: 2.6858  max mem: 28503
Epoch: [7]  [ 200/1251]  eta: 0:06:18  lr: 0.001432  min_lr: 0.001432  loss: 5.0327 (4.8558)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.9208 (1.9399)  time: 0.3462  data: 0.0004  max mem: 28503
Epoch: [7]  [ 400/1251]  eta: 0:05:02  lr: 0.001464  min_lr: 0.001464  loss: 4.7296 (4.8100)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.8040 (1.9006)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [7]  [ 600/1251]  eta: 0:03:49  lr: 0.001496  min_lr: 0.001496  loss: 4.6193 (4.7864)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6107 (1.8867)  time: 0.3446  data: 0.0004  max mem: 28503
Epoch: [7]  [ 800/1251]  eta: 0:02:38  lr: 0.001528  min_lr: 0.001528  loss: 5.0717 (4.7901)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.8733 (1.8616)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [7]  [1000/1251]  eta: 0:01:27  lr: 0.001561  min_lr: 0.001561  loss: 4.7205 (4.7835)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6106 (1.8316)  time: 0.3448  data: 0.0004  max mem: 28503
Epoch: [7]  [1200/1251]  eta: 0:00:17  lr: 0.001593  min_lr: 0.001593  loss: 4.8069 (4.7685)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7294 (1.8234)  time: 0.3539  data: 0.0004  max mem: 28503
Epoch: [7]  [1250/1251]  eta: 0:00:00  lr: 0.001600  min_lr: 0.001600  loss: 4.9306 (4.7656)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5994 (1.8162)  time: 0.2920  data: 0.0005  max mem: 28503
Epoch: [7] Total time: 0:07:17 (0.3497 s / it)
Averaged stats: lr: 0.001600  min_lr: 0.001600  loss: 4.9306 (4.7591)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5994 (1.8162)
Test:  [ 0/25]  eta: 0:02:16  loss: 1.6265 (1.6265)  acc1: 68.0000 (68.0000)  acc5: 86.0000 (86.0000)  time: 5.4684  data: 5.2663  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.6710 (1.7819)  acc1: 60.8000 (62.0364)  acc5: 88.4000 (86.2909)  time: 0.6710  data: 0.4986  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 2.2008 (2.1096)  acc1: 52.4000 (55.9810)  acc5: 77.2000 (80.8000)  time: 0.1799  data: 0.0110  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 2.3494 (2.1217)  acc1: 52.0000 (55.7600)  acc5: 76.4000 (80.4000)  time: 0.1796  data: 0.0109  max mem: 28503
Test: Total time: 0:00:09 (0.3967 s / it)
* Acc@1 55.168 Acc@5 80.412 loss 2.124
Accuracy of the model on the 50000 test images: 55.2%
Max accuracy: 55.17%
Epoch: [8]  [   0/1251]  eta: 1:03:29  lr: 0.001600  min_lr: 0.001600  loss: 4.4929 (4.4929)  weight_decay: 0.0500 (0.0500)  time: 3.0449  data: 2.6836  max mem: 28503
Epoch: [8]  [ 200/1251]  eta: 0:06:19  lr: 0.001632  min_lr: 0.001632  loss: 4.8626 (4.7644)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7766 (1.8603)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [8]  [ 400/1251]  eta: 0:05:02  lr: 0.001664  min_lr: 0.001664  loss: 4.5340 (4.7342)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7131 (1.7481)  time: 0.3568  data: 0.0004  max mem: 28503
Epoch: [8]  [ 600/1251]  eta: 0:03:49  lr: 0.001696  min_lr: 0.001696  loss: 4.9630 (4.6962)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4636 (1.6843)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [8]  [ 800/1251]  eta: 0:02:38  lr: 0.001728  min_lr: 0.001728  loss: 4.6260 (4.6808)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.8659 (1.6736)  time: 0.3468  data: 0.0004  max mem: 28503
Epoch: [8]  [1000/1251]  eta: 0:01:27  lr: 0.001761  min_lr: 0.001761  loss: 4.6845 (4.6726)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5630 (1.6756)  time: 0.3460  data: 0.0005  max mem: 28503
Epoch: [8]  [1200/1251]  eta: 0:00:17  lr: 0.001793  min_lr: 0.001793  loss: 4.6617 (4.6696)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4521 (1.6382)  time: 0.3556  data: 0.0004  max mem: 28503
Epoch: [8]  [1250/1251]  eta: 0:00:00  lr: 0.001800  min_lr: 0.001800  loss: 4.9306 (4.6672)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5882 (1.6467)  time: 0.2921  data: 0.0007  max mem: 28503
Epoch: [8] Total time: 0:07:17 (0.3493 s / it)
Averaged stats: lr: 0.001800  min_lr: 0.001800  loss: 4.9306 (4.6574)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5882 (1.6467)
Test:  [ 0/25]  eta: 0:02:19  loss: 1.6024 (1.6024)  acc1: 68.0000 (68.0000)  acc5: 87.2000 (87.2000)  time: 5.5745  data: 5.3778  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.6265 (1.7648)  acc1: 64.8000 (62.4727)  acc5: 89.2000 (87.4182)  time: 0.7088  data: 0.5371  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 2.2121 (2.0911)  acc1: 53.2000 (56.9524)  acc5: 78.4000 (81.4286)  time: 0.1953  data: 0.0266  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 2.2366 (2.0953)  acc1: 52.0000 (56.9440)  acc5: 76.8000 (81.4400)  time: 0.1948  data: 0.0265  max mem: 28503
Test: Total time: 0:00:10 (0.4094 s / it)
* Acc@1 56.806 Acc@5 81.710 loss 2.087
Accuracy of the model on the 50000 test images: 56.8%
Max accuracy: 56.81%
Epoch: [9]  [   0/1251]  eta: 1:06:58  lr: 0.001800  min_lr: 0.001800  loss: 5.0752 (5.0752)  weight_decay: 0.0500 (0.0500)  time: 3.2126  data: 2.8578  max mem: 28503
Epoch: [9]  [ 200/1251]  eta: 0:06:19  lr: 0.001832  min_lr: 0.001832  loss: 4.8802 (4.6130)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4170 (1.4730)  time: 0.3466  data: 0.0005  max mem: 28503
Epoch: [9]  [ 400/1251]  eta: 0:05:01  lr: 0.001864  min_lr: 0.001864  loss: 4.4273 (4.6167)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5784 (1.5231)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [9]  [ 600/1251]  eta: 0:03:49  lr: 0.001896  min_lr: 0.001896  loss: 4.4818 (4.5952)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6037 (1.5683)  time: 0.3501  data: 0.0004  max mem: 28503
Epoch: [9]  [ 800/1251]  eta: 0:02:38  lr: 0.001929  min_lr: 0.001929  loss: 4.7000 (4.5870)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4535 (1.5636)  time: 0.3587  data: 0.0004  max mem: 28503
Epoch: [9]  [1000/1251]  eta: 0:01:28  lr: 0.001961  min_lr: 0.001961  loss: 4.4667 (4.5623)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3439 (1.5308)  time: 0.3460  data: 0.0004  max mem: 28503
Epoch: [9]  [1200/1251]  eta: 0:00:17  lr: 0.001993  min_lr: 0.001993  loss: 4.2417 (4.5503)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2132 (1.5048)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [9]  [1250/1251]  eta: 0:00:00  lr: 0.002000  min_lr: 0.002000  loss: 4.5540 (4.5487)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1736 (1.4982)  time: 0.2921  data: 0.0005  max mem: 28503
Epoch: [9] Total time: 0:07:17 (0.3500 s / it)
Averaged stats: lr: 0.002000  min_lr: 0.002000  loss: 4.5540 (4.5822)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1736 (1.4982)
Test:  [ 0/25]  eta: 0:02:21  loss: 1.5123 (1.5123)  acc1: 70.4000 (70.4000)  acc5: 90.8000 (90.8000)  time: 5.6464  data: 5.4536  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.5246 (1.6617)  acc1: 66.8000 (66.5818)  acc5: 90.8000 (88.9455)  time: 0.7016  data: 0.5304  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 2.0415 (1.9423)  acc1: 58.4000 (60.7238)  acc5: 82.0000 (84.0000)  time: 0.1878  data: 0.0191  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 2.0519 (1.9566)  acc1: 55.6000 (60.1280)  acc5: 80.4000 (83.5520)  time: 0.1875  data: 0.0190  max mem: 28503
Test: Total time: 0:00:10 (0.4064 s / it)
* Acc@1 59.572 Acc@5 83.436 loss 1.952
Accuracy of the model on the 50000 test images: 59.6%
Max accuracy: 59.57%
Epoch: [10]  [   0/1251]  eta: 1:04:58  lr: 0.002000  min_lr: 0.002000  loss: 4.1859 (4.1859)  weight_decay: 0.0500 (0.0500)  time: 3.1165  data: 2.7428  max mem: 28503
Epoch: [10]  [ 200/1251]  eta: 0:06:23  lr: 0.002032  min_lr: 0.002032  loss: 4.8052 (4.5779)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4408 (1.4904)  time: 0.3548  data: 0.0004  max mem: 28503
Epoch: [10]  [ 400/1251]  eta: 0:05:03  lr: 0.002064  min_lr: 0.002064  loss: 4.7068 (4.5521)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2281 (1.3665)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [10]  [ 600/1251]  eta: 0:03:50  lr: 0.002096  min_lr: 0.002096  loss: 4.3107 (4.5349)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2244 (1.3501)  time: 0.3523  data: 0.0005  max mem: 28503
Epoch: [10]  [ 800/1251]  eta: 0:02:38  lr: 0.002129  min_lr: 0.002129  loss: 4.3927 (4.5248)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2168 (1.3259)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [10]  [1000/1251]  eta: 0:01:28  lr: 0.002161  min_lr: 0.002161  loss: 4.6663 (4.5137)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1667 (1.3204)  time: 0.3532  data: 0.0004  max mem: 28503
Epoch: [10]  [1200/1251]  eta: 0:00:17  lr: 0.002193  min_lr: 0.002193  loss: 4.8312 (4.4882)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2845 (1.3209)  time: 0.3540  data: 0.0004  max mem: 28503
Epoch: [10]  [1250/1251]  eta: 0:00:00  lr: 0.002200  min_lr: 0.002200  loss: 4.5119 (4.4846)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1403 (1.3141)  time: 0.2923  data: 0.0005  max mem: 28503
Epoch: [10] Total time: 0:07:18 (0.3502 s / it)
Averaged stats: lr: 0.002200  min_lr: 0.002200  loss: 4.5119 (4.4960)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1403 (1.3141)
Test:  [ 0/25]  eta: 0:02:20  loss: 1.3723 (1.3723)  acc1: 72.0000 (72.0000)  acc5: 90.8000 (90.8000)  time: 5.6391  data: 5.4438  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.4303 (1.5405)  acc1: 68.4000 (67.4909)  acc5: 90.8000 (90.0727)  time: 0.7270  data: 0.5555  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.9042 (1.8405)  acc1: 58.0000 (61.0095)  acc5: 82.4000 (85.1619)  time: 0.2020  data: 0.0334  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 2.0598 (1.8502)  acc1: 56.0000 (60.7680)  acc5: 80.4000 (84.8000)  time: 0.2015  data: 0.0333  max mem: 28503
Test: Total time: 0:00:10 (0.4172 s / it)
* Acc@1 61.432 Acc@5 84.606 loss 1.846
Accuracy of the model on the 50000 test images: 61.4%
Max accuracy: 61.43%
Epoch: [11]  [   0/1251]  eta: 1:01:57  lr: 0.002200  min_lr: 0.002200  loss: 4.5356 (4.5356)  weight_decay: 0.0500 (0.0500)  time: 2.9717  data: 2.5500  max mem: 28503
Epoch: [11]  [ 200/1251]  eta: 0:06:19  lr: 0.002232  min_lr: 0.002232  loss: 4.4856 (4.4378)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2111 (1.3458)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [11]  [ 400/1251]  eta: 0:05:00  lr: 0.002264  min_lr: 0.002264  loss: 4.3496 (4.4537)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1759 (1.3236)  time: 0.3466  data: 0.0004  max mem: 28503
Epoch: [11]  [ 600/1251]  eta: 0:03:48  lr: 0.002297  min_lr: 0.002297  loss: 4.5241 (4.4411)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1686 (1.2899)  time: 0.3445  data: 0.0004  max mem: 28503
Epoch: [11]  [ 800/1251]  eta: 0:02:38  lr: 0.002329  min_lr: 0.002329  loss: 4.8809 (4.4625)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1371 (1.2656)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [11]  [1000/1251]  eta: 0:01:27  lr: 0.002361  min_lr: 0.002361  loss: 4.4587 (4.4468)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0995 (1.2548)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [11]  [1200/1251]  eta: 0:00:17  lr: 0.002393  min_lr: 0.002393  loss: 4.5991 (4.4370)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0028 (1.2298)  time: 0.3463  data: 0.0004  max mem: 28503
Epoch: [11]  [1250/1251]  eta: 0:00:00  lr: 0.002400  min_lr: 0.002400  loss: 4.1602 (4.4342)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0771 (1.2251)  time: 0.2917  data: 0.0006  max mem: 28503
Epoch: [11] Total time: 0:07:16 (0.3487 s / it)
Averaged stats: lr: 0.002400  min_lr: 0.002400  loss: 4.1602 (4.4189)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0771 (1.2251)
Test:  [ 0/25]  eta: 0:02:21  loss: 1.4028 (1.4028)  acc1: 73.2000 (73.2000)  acc5: 92.0000 (92.0000)  time: 5.6769  data: 5.4894  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.4028 (1.5019)  acc1: 72.8000 (69.4182)  acc5: 92.0000 (90.5818)  time: 0.7097  data: 0.5384  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.9331 (1.7891)  acc1: 60.4000 (63.6381)  acc5: 82.8000 (85.4857)  time: 0.1907  data: 0.0217  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 2.0185 (1.7960)  acc1: 56.8000 (63.3440)  acc5: 82.4000 (85.4560)  time: 0.1901  data: 0.0216  max mem: 28503
Test: Total time: 0:00:10 (0.4097 s / it)
* Acc@1 62.930 Acc@5 85.604 loss 1.788
Accuracy of the model on the 50000 test images: 62.9%
Max accuracy: 62.93%
Epoch: [12]  [   0/1251]  eta: 1:06:00  lr: 0.002400  min_lr: 0.002400  loss: 4.1948 (4.1948)  weight_decay: 0.0500 (0.0500)  time: 3.1657  data: 2.8055  max mem: 28503
Epoch: [12]  [ 200/1251]  eta: 0:06:20  lr: 0.002432  min_lr: 0.002432  loss: 4.4912 (4.3867)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9224 (1.1189)  time: 0.3458  data: 0.0005  max mem: 28503
Epoch: [12]  [ 400/1251]  eta: 0:05:02  lr: 0.002464  min_lr: 0.002464  loss: 4.5471 (4.3908)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0586 (1.1062)  time: 0.3448  data: 0.0004  max mem: 28503
Epoch: [12]  [ 600/1251]  eta: 0:03:49  lr: 0.002497  min_lr: 0.002497  loss: 4.6117 (4.3806)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0781 (1.0767)  time: 0.3444  data: 0.0004  max mem: 28503
Epoch: [12]  [ 800/1251]  eta: 0:02:38  lr: 0.002529  min_lr: 0.002529  loss: 4.4860 (4.3727)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9598 (1.1047)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [12]  [1000/1251]  eta: 0:01:27  lr: 0.002561  min_lr: 0.002561  loss: 4.6868 (4.3732)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3309 (1.1288)  time: 0.3469  data: 0.0004  max mem: 28503
Epoch: [12]  [1200/1251]  eta: 0:00:17  lr: 0.002593  min_lr: 0.002593  loss: 4.4118 (4.3652)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0771 (1.1164)  time: 0.3541  data: 0.0004  max mem: 28503
Epoch: [12]  [1250/1251]  eta: 0:00:00  lr: 0.002600  min_lr: 0.002600  loss: 4.5267 (4.3663)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1373 (1.1163)  time: 0.2922  data: 0.0006  max mem: 28503
Epoch: [12] Total time: 0:07:17 (0.3496 s / it)
Averaged stats: lr: 0.002600  min_lr: 0.002600  loss: 4.5267 (4.3738)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1373 (1.1163)
Test:  [ 0/25]  eta: 0:02:21  loss: 1.2036 (1.2036)  acc1: 76.0000 (76.0000)  acc5: 92.4000 (92.4000)  time: 5.6554  data: 5.4506  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 1.2935 (1.3929)  acc1: 72.4000 (70.4364)  acc5: 92.4000 (91.2000)  time: 0.7374  data: 0.5648  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.7660 (1.6954)  acc1: 59.6000 (64.5333)  acc5: 86.4000 (87.0476)  time: 0.2070  data: 0.0381  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.9567 (1.7064)  acc1: 59.6000 (64.2720)  acc5: 84.0000 (86.8000)  time: 0.2064  data: 0.0381  max mem: 28503
Test: Total time: 0:00:10 (0.4221 s / it)
* Acc@1 64.080 Acc@5 86.502 loss 1.710
Accuracy of the model on the 50000 test images: 64.1%
Max accuracy: 64.08%
Epoch: [13]  [   0/1251]  eta: 1:04:22  lr: 0.002600  min_lr: 0.002600  loss: 4.3904 (4.3904)  weight_decay: 0.0500 (0.0500)  time: 3.0876  data: 2.7009  max mem: 28503
Epoch: [13]  [ 200/1251]  eta: 0:06:21  lr: 0.002632  min_lr: 0.002632  loss: 4.3834 (4.2627)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9230 (0.9589)  time: 0.3457  data: 0.0004  max mem: 28503
Epoch: [13]  [ 400/1251]  eta: 0:05:01  lr: 0.002665  min_lr: 0.002665  loss: 4.3327 (4.2797)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9930 (1.0142)  time: 0.3464  data: 0.0004  max mem: 28503
Epoch: [13]  [ 600/1251]  eta: 0:03:49  lr: 0.002697  min_lr: 0.002697  loss: 4.1664 (4.2943)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0054 (1.0284)  time: 0.3557  data: 0.0004  max mem: 28503
Epoch: [13]  [ 800/1251]  eta: 0:02:38  lr: 0.002729  min_lr: 0.002729  loss: 3.8540 (4.2954)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1400 (1.0219)  time: 0.3464  data: 0.0004  max mem: 28503
Epoch: [13]  [1000/1251]  eta: 0:01:28  lr: 0.002761  min_lr: 0.002761  loss: 4.2372 (4.2906)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8573 (1.0273)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [13]  [1200/1251]  eta: 0:00:17  lr: 0.002793  min_lr: 0.002793  loss: 4.4798 (4.2937)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1551 (1.0303)  time: 0.3446  data: 0.0004  max mem: 28503
Epoch: [13]  [1250/1251]  eta: 0:00:00  lr: 0.002800  min_lr: 0.002800  loss: 4.3918 (4.2890)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8241 (1.0217)  time: 0.2931  data: 0.0007  max mem: 28503
Epoch: [13] Total time: 0:07:17 (0.3498 s / it)
Averaged stats: lr: 0.002800  min_lr: 0.002800  loss: 4.3918 (4.3020)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8241 (1.0217)
Test:  [ 0/25]  eta: 0:02:19  loss: 1.2472 (1.2472)  acc1: 74.4000 (74.4000)  acc5: 92.4000 (92.4000)  time: 5.5771  data: 5.3776  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.2559 (1.4186)  acc1: 72.0000 (70.6909)  acc5: 92.4000 (91.2727)  time: 0.6991  data: 0.5273  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.8690 (1.7166)  acc1: 60.4000 (64.7810)  acc5: 84.4000 (86.9905)  time: 0.1898  data: 0.0212  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.9346 (1.7301)  acc1: 60.0000 (64.4320)  acc5: 83.6000 (86.8640)  time: 0.1896  data: 0.0211  max mem: 28503
Test: Total time: 0:00:10 (0.4054 s / it)
* Acc@1 64.566 Acc@5 86.790 loss 1.729
Accuracy of the model on the 50000 test images: 64.6%
Max accuracy: 64.57%
Epoch: [14]  [   0/1251]  eta: 1:12:07  lr: 0.002800  min_lr: 0.002800  loss: 4.3056 (4.3056)  weight_decay: 0.0500 (0.0500)  time: 3.4591  data: 3.1095  max mem: 28503
Epoch: [14]  [ 200/1251]  eta: 0:06:21  lr: 0.002833  min_lr: 0.002833  loss: 4.5131 (4.2810)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8216 (0.8767)  time: 0.3518  data: 0.0004  max mem: 28503
Epoch: [14]  [ 400/1251]  eta: 0:05:02  lr: 0.002865  min_lr: 0.002865  loss: 4.2003 (4.2450)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8868 (0.9235)  time: 0.3596  data: 0.0004  max mem: 28503
Epoch: [14]  [ 600/1251]  eta: 0:03:49  lr: 0.002897  min_lr: 0.002897  loss: 4.4436 (4.2471)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9620 (0.9268)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [14]  [ 800/1251]  eta: 0:02:38  lr: 0.002929  min_lr: 0.002929  loss: 4.5469 (4.2784)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0216 (0.9467)  time: 0.3550  data: 0.0004  max mem: 28503
Epoch: [14]  [1000/1251]  eta: 0:01:27  lr: 0.002961  min_lr: 0.002961  loss: 4.3509 (4.2713)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7656 (0.9255)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [14]  [1200/1251]  eta: 0:00:17  lr: 0.002993  min_lr: 0.002993  loss: 4.3364 (4.2658)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8387 (0.9289)  time: 0.3546  data: 0.0004  max mem: 28503
Epoch: [14]  [1250/1251]  eta: 0:00:00  lr: 0.003000  min_lr: 0.003000  loss: 3.8860 (4.2618)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8387 (0.9250)  time: 0.2974  data: 0.0005  max mem: 28503
Epoch: [14] Total time: 0:07:17 (0.3500 s / it)
Averaged stats: lr: 0.003000  min_lr: 0.003000  loss: 3.8860 (4.2619)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8387 (0.9250)
Test:  [ 0/25]  eta: 0:01:37  loss: 1.2341 (1.2341)  acc1: 74.4000 (74.4000)  acc5: 93.2000 (93.2000)  time: 3.9191  data: 3.7207  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 1.2716 (1.3494)  acc1: 71.6000 (70.5455)  acc5: 93.2000 (91.4182)  time: 0.6063  data: 0.4336  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.7932 (1.6114)  acc1: 61.2000 (65.6191)  acc5: 84.8000 (87.5429)  time: 0.2619  data: 0.0916  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.8153 (1.6224)  acc1: 60.8000 (65.2480)  acc5: 84.4000 (87.4400)  time: 0.2092  data: 0.0398  max mem: 28503
Test: Total time: 0:00:09 (0.3965 s / it)
* Acc@1 65.496 Acc@5 87.266 loss 1.619
Accuracy of the model on the 50000 test images: 65.5%
Max accuracy: 65.50%
Epoch: [15]  [   0/1251]  eta: 1:10:54  lr: 0.003000  min_lr: 0.003000  loss: 4.0823 (4.0823)  weight_decay: 0.0500 (0.0500)  time: 3.4008  data: 3.0338  max mem: 28503
Epoch: [15]  [ 200/1251]  eta: 0:06:20  lr: 0.003033  min_lr: 0.003033  loss: 4.3033 (4.2187)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9615 (0.9428)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [15]  [ 400/1251]  eta: 0:05:01  lr: 0.003065  min_lr: 0.003065  loss: 4.5848 (4.2258)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8169 (0.9107)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [15]  [ 600/1251]  eta: 0:03:49  lr: 0.003097  min_lr: 0.003097  loss: 4.0981 (4.2148)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8160 (0.8889)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [15]  [ 800/1251]  eta: 0:02:38  lr: 0.003129  min_lr: 0.003129  loss: 4.3751 (4.2203)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8846 (0.8902)  time: 0.3537  data: 0.0004  max mem: 28503
Epoch: [15]  [1000/1251]  eta: 0:01:28  lr: 0.003161  min_lr: 0.003161  loss: 4.2548 (4.2214)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7681 (0.8785)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [15]  [1200/1251]  eta: 0:00:17  lr: 0.003193  min_lr: 0.003193  loss: 4.0201 (4.2170)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8019 (0.8796)  time: 0.3457  data: 0.0004  max mem: 28503
Epoch: [15]  [1250/1251]  eta: 0:00:00  lr: 0.003200  min_lr: 0.003200  loss: 4.3883 (4.2190)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7575 (0.8749)  time: 0.2916  data: 0.0006  max mem: 28503
Epoch: [15] Total time: 0:07:17 (0.3499 s / it)
Averaged stats: lr: 0.003200  min_lr: 0.003200  loss: 4.3883 (4.2238)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7575 (0.8749)
Test:  [ 0/25]  eta: 0:01:56  loss: 1.1080 (1.1080)  acc1: 77.2000 (77.2000)  acc5: 94.4000 (94.4000)  time: 4.6653  data: 4.4201  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.2216 (1.3159)  acc1: 74.4000 (72.4000)  acc5: 94.0000 (92.1455)  time: 0.7319  data: 0.5546  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.7257 (1.5936)  acc1: 64.4000 (67.0095)  acc5: 85.6000 (87.9238)  time: 0.2535  data: 0.0841  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.7917 (1.6045)  acc1: 64.4000 (66.6240)  acc5: 85.2000 (87.9840)  time: 0.2154  data: 0.0470  max mem: 28503
Test: Total time: 0:00:10 (0.4198 s / it)
* Acc@1 66.742 Acc@5 88.154 loss 1.602
Accuracy of the model on the 50000 test images: 66.7%
Max accuracy: 66.74%
Epoch: [16]  [   0/1251]  eta: 1:11:20  lr: 0.003201  min_lr: 0.003201  loss: 4.0391 (4.0391)  weight_decay: 0.0500 (0.0500)  time: 3.4217  data: 3.0694  max mem: 28503
Epoch: [16]  [ 200/1251]  eta: 0:06:22  lr: 0.003233  min_lr: 0.003233  loss: 4.2098 (4.1952)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8872 (0.8661)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [16]  [ 400/1251]  eta: 0:05:03  lr: 0.003265  min_lr: 0.003265  loss: 3.8641 (4.1808)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6860 (0.8008)  time: 0.3444  data: 0.0004  max mem: 28503
Epoch: [16]  [ 600/1251]  eta: 0:03:50  lr: 0.003297  min_lr: 0.003297  loss: 4.2579 (4.1941)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8352 (0.8156)  time: 0.3578  data: 0.0005  max mem: 28503
Epoch: [16]  [ 800/1251]  eta: 0:02:38  lr: 0.003329  min_lr: 0.003329  loss: 4.2429 (4.2029)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6675 (0.8107)  time: 0.3467  data: 0.0005  max mem: 28503
Epoch: [16]  [1000/1251]  eta: 0:01:28  lr: 0.003361  min_lr: 0.003361  loss: 4.4627 (4.1952)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9246 (0.8151)  time: 0.3533  data: 0.0004  max mem: 28503
Epoch: [16]  [1200/1251]  eta: 0:00:17  lr: 0.003393  min_lr: 0.003393  loss: 4.3249 (4.1780)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8484 (0.8127)  time: 0.3566  data: 0.0004  max mem: 28503
Epoch: [16]  [1250/1251]  eta: 0:00:00  lr: 0.003400  min_lr: 0.003400  loss: 4.3715 (4.1811)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7348 (0.8071)  time: 0.3002  data: 0.0006  max mem: 28503
Epoch: [16] Total time: 0:07:18 (0.3507 s / it)
Averaged stats: lr: 0.003400  min_lr: 0.003400  loss: 4.3715 (4.1855)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7348 (0.8071)
Test:  [ 0/25]  eta: 0:02:22  loss: 1.1264 (1.1264)  acc1: 78.4000 (78.4000)  acc5: 93.6000 (93.6000)  time: 5.7074  data: 5.5057  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.3002 (1.3278)  acc1: 71.6000 (72.2545)  acc5: 93.2000 (92.0364)  time: 0.7238  data: 0.5504  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.7237 (1.6057)  acc1: 63.6000 (66.6667)  acc5: 86.8000 (88.0000)  time: 0.1969  data: 0.0275  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.7317 (1.6116)  acc1: 63.2000 (66.6720)  acc5: 86.0000 (88.0160)  time: 0.1957  data: 0.0274  max mem: 28503
Test: Total time: 0:00:10 (0.4165 s / it)
* Acc@1 66.408 Acc@5 88.054 loss 1.610
Accuracy of the model on the 50000 test images: 66.4%
Max accuracy: 66.74%
Epoch: [17]  [   0/1251]  eta: 1:13:57  lr: 0.003401  min_lr: 0.003401  loss: 4.5102 (4.5102)  weight_decay: 0.0500 (0.0500)  time: 3.5472  data: 2.5807  max mem: 28503
Epoch: [17]  [ 200/1251]  eta: 0:06:22  lr: 0.003433  min_lr: 0.003433  loss: 3.9636 (4.1462)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6467 (0.7172)  time: 0.3444  data: 0.0004  max mem: 28503
Epoch: [17]  [ 400/1251]  eta: 0:05:02  lr: 0.003465  min_lr: 0.003465  loss: 3.6742 (4.0864)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7696 (0.7378)  time: 0.3460  data: 0.0005  max mem: 28503
Epoch: [17]  [ 600/1251]  eta: 0:03:50  lr: 0.003497  min_lr: 0.003497  loss: 4.4606 (4.1291)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7516 (0.7589)  time: 0.3456  data: 0.0005  max mem: 28503
Epoch: [17]  [ 800/1251]  eta: 0:02:38  lr: 0.003529  min_lr: 0.003529  loss: 4.5124 (4.1434)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7787 (0.7748)  time: 0.3663  data: 0.0004  max mem: 28503
Epoch: [17]  [1000/1251]  eta: 0:01:28  lr: 0.003561  min_lr: 0.003561  loss: 4.3325 (4.1374)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7582 (0.7722)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [17]  [1200/1251]  eta: 0:00:17  lr: 0.003593  min_lr: 0.003593  loss: 4.3791 (4.1551)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6689 (0.7636)  time: 0.3553  data: 0.0005  max mem: 28503
Epoch: [17]  [1250/1251]  eta: 0:00:00  lr: 0.003600  min_lr: 0.003600  loss: 4.3718 (4.1608)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6689 (0.7610)  time: 0.2979  data: 0.0007  max mem: 28503
Epoch: [17] Total time: 0:07:17 (0.3499 s / it)
Averaged stats: lr: 0.003600  min_lr: 0.003600  loss: 4.3718 (4.1603)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6689 (0.7610)
Test:  [ 0/25]  eta: 0:02:22  loss: 1.2419 (1.2419)  acc1: 76.8000 (76.8000)  acc5: 94.0000 (94.0000)  time: 5.7190  data: 5.4946  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.2995 (1.4365)  acc1: 74.0000 (73.7091)  acc5: 92.8000 (92.0364)  time: 0.7240  data: 0.5499  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.7742 (1.6861)  acc1: 64.8000 (67.9238)  acc5: 86.0000 (88.1714)  time: 0.1964  data: 0.0278  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.8365 (1.6843)  acc1: 64.0000 (67.6960)  acc5: 85.6000 (88.2240)  time: 0.1956  data: 0.0273  max mem: 28503
Test: Total time: 0:00:10 (0.4161 s / it)
* Acc@1 67.204 Acc@5 88.468 loss 1.685
Accuracy of the model on the 50000 test images: 67.2%
Max accuracy: 67.20%
Epoch: [18]  [   0/1251]  eta: 0:55:52  lr: 0.003601  min_lr: 0.003601  loss: 3.4889 (3.4889)  weight_decay: 0.0500 (0.0500)  time: 2.6801  data: 2.3023  max mem: 28503
Epoch: [18]  [ 200/1251]  eta: 0:06:19  lr: 0.003633  min_lr: 0.003633  loss: 4.5268 (4.1481)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7319 (0.7214)  time: 0.3457  data: 0.0004  max mem: 28503
Epoch: [18]  [ 400/1251]  eta: 0:05:01  lr: 0.003665  min_lr: 0.003665  loss: 4.0834 (4.1069)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6546 (0.7138)  time: 0.3447  data: 0.0004  max mem: 28503
Epoch: [18]  [ 600/1251]  eta: 0:03:48  lr: 0.003697  min_lr: 0.003697  loss: 3.7583 (4.1027)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6570 (0.7435)  time: 0.3477  data: 0.0004  max mem: 28503
Epoch: [18]  [ 800/1251]  eta: 0:02:38  lr: 0.003729  min_lr: 0.003729  loss: 4.2774 (4.1096)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6841 (0.7374)  time: 0.3450  data: 0.0003  max mem: 28503
Epoch: [18]  [1000/1251]  eta: 0:01:27  lr: 0.003761  min_lr: 0.003761  loss: 4.1873 (4.1069)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6399 (0.7391)  time: 0.3444  data: 0.0004  max mem: 28503
Epoch: [18]  [1200/1251]  eta: 0:00:17  lr: 0.003793  min_lr: 0.003793  loss: 4.2429 (4.1144)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6488 (0.7346)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [18]  [1250/1251]  eta: 0:00:00  lr: 0.003800  min_lr: 0.003800  loss: 4.2306 (4.1151)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7224 (0.7347)  time: 0.2916  data: 0.0006  max mem: 28503
Epoch: [18] Total time: 0:07:16 (0.3487 s / it)
Averaged stats: lr: 0.003800  min_lr: 0.003800  loss: 4.2306 (4.1251)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7224 (0.7347)
Test:  [ 0/25]  eta: 0:02:17  loss: 1.0490 (1.0490)  acc1: 81.2000 (81.2000)  acc5: 95.2000 (95.2000)  time: 5.5004  data: 5.2979  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.1859 (1.2450)  acc1: 74.0000 (73.2364)  acc5: 94.8000 (93.4546)  time: 0.7096  data: 0.5347  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.6168 (1.5218)  acc1: 64.4000 (67.9429)  acc5: 88.0000 (89.5619)  time: 0.1995  data: 0.0292  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.6253 (1.5314)  acc1: 64.4000 (67.7120)  acc5: 87.2000 (89.4720)  time: 0.1987  data: 0.0292  max mem: 28503
Test: Total time: 0:00:10 (0.4097 s / it)
* Acc@1 67.756 Acc@5 89.008 loss 1.533
Accuracy of the model on the 50000 test images: 67.8%
Max accuracy: 67.76%
Epoch: [19]  [   0/1251]  eta: 1:01:51  lr: 0.003801  min_lr: 0.003801  loss: 4.2638 (4.2638)  weight_decay: 0.0500 (0.0500)  time: 2.9668  data: 2.5353  max mem: 28503
Epoch: [19]  [ 200/1251]  eta: 0:06:18  lr: 0.003833  min_lr: 0.003833  loss: 4.1329 (4.0594)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5786 (0.7271)  time: 0.3464  data: 0.0004  max mem: 28503
Epoch: [19]  [ 400/1251]  eta: 0:05:01  lr: 0.003865  min_lr: 0.003865  loss: 4.1598 (4.0896)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6208 (0.6918)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [19]  [ 600/1251]  eta: 0:03:49  lr: 0.003897  min_lr: 0.003897  loss: 4.2920 (4.1034)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6241 (0.6906)  time: 0.3463  data: 0.0004  max mem: 28503
Epoch: [19]  [ 800/1251]  eta: 0:02:38  lr: 0.003929  min_lr: 0.003929  loss: 4.2010 (4.0864)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7519 (0.7124)  time: 0.3546  data: 0.0004  max mem: 28503
Epoch: [19]  [1000/1251]  eta: 0:01:28  lr: 0.003961  min_lr: 0.003961  loss: 4.3144 (4.1012)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6416 (0.7042)  time: 0.3464  data: 0.0004  max mem: 28503
Epoch: [19]  [1200/1251]  eta: 0:00:17  lr: 0.003993  min_lr: 0.003993  loss: 3.7514 (4.0985)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6634 (0.6979)  time: 0.3572  data: 0.0004  max mem: 28503
Epoch: [19]  [1250/1251]  eta: 0:00:00  lr: 0.004000  min_lr: 0.004000  loss: 4.3806 (4.1031)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6637 (0.6988)  time: 0.2917  data: 0.0005  max mem: 28503
Epoch: [19] Total time: 0:07:17 (0.3500 s / it)
Averaged stats: lr: 0.004000  min_lr: 0.004000  loss: 4.3806 (4.0987)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6637 (0.6988)
Test:  [ 0/25]  eta: 0:02:13  loss: 1.1248 (1.1248)  acc1: 82.8000 (82.8000)  acc5: 94.0000 (94.0000)  time: 5.3310  data: 5.1289  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.2588 (1.3560)  acc1: 75.6000 (73.5273)  acc5: 93.6000 (93.0182)  time: 0.7226  data: 0.5487  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.7125 (1.6313)  acc1: 64.8000 (68.4381)  acc5: 88.0000 (88.9333)  time: 0.2194  data: 0.0497  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.7595 (1.6372)  acc1: 64.4000 (68.1440)  acc5: 86.8000 (88.9760)  time: 0.2180  data: 0.0496  max mem: 28503
Test: Total time: 0:00:10 (0.4190 s / it)
* Acc@1 67.848 Acc@5 88.854 loss 1.636
Accuracy of the model on the 50000 test images: 67.8%
Max accuracy: 67.85%
Epoch: [20]  [   0/1251]  eta: 0:58:32  lr: 0.004000  min_lr: 0.004000  loss: 4.3832 (4.3832)  weight_decay: 0.0500 (0.0500)  time: 2.8081  data: 2.4412  max mem: 28503
Epoch: [20]  [ 200/1251]  eta: 0:06:20  lr: 0.004000  min_lr: 0.004000  loss: 3.9151 (4.0308)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7272 (0.7094)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [20]  [ 400/1251]  eta: 0:05:02  lr: 0.004000  min_lr: 0.004000  loss: 4.3877 (4.0958)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5825 (0.6898)  time: 0.3448  data: 0.0003  max mem: 28503
Epoch: [20]  [ 600/1251]  eta: 0:03:49  lr: 0.004000  min_lr: 0.004000  loss: 4.0977 (4.0476)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6020 (0.6699)  time: 0.3510  data: 0.0004  max mem: 28503
Epoch: [20]  [ 800/1251]  eta: 0:02:38  lr: 0.004000  min_lr: 0.004000  loss: 4.2236 (4.0606)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7023 (0.6786)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [20]  [1000/1251]  eta: 0:01:28  lr: 0.004000  min_lr: 0.004000  loss: 3.7600 (4.0487)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5468 (0.6708)  time: 0.3534  data: 0.0004  max mem: 28503
Epoch: [20]  [1200/1251]  eta: 0:00:17  lr: 0.004000  min_lr: 0.004000  loss: 3.5867 (4.0378)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6601 (0.6691)  time: 0.3447  data: 0.0004  max mem: 28503
Epoch: [20]  [1250/1251]  eta: 0:00:00  lr: 0.004000  min_lr: 0.004000  loss: 4.1130 (4.0402)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6354 (0.6686)  time: 0.2916  data: 0.0005  max mem: 28503
Epoch: [20] Total time: 0:07:17 (0.3495 s / it)
Averaged stats: lr: 0.004000  min_lr: 0.004000  loss: 4.1130 (4.0686)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6354 (0.6686)
Test:  [ 0/25]  eta: 0:02:22  loss: 1.0778 (1.0778)  acc1: 81.6000 (81.6000)  acc5: 93.2000 (93.2000)  time: 5.6978  data: 5.5002  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.1894 (1.2301)  acc1: 75.6000 (74.6182)  acc5: 93.6000 (93.2000)  time: 0.7211  data: 0.5499  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.5904 (1.4924)  acc1: 66.4000 (69.2571)  acc5: 89.2000 (89.3524)  time: 0.1959  data: 0.0275  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.6151 (1.4978)  acc1: 66.0000 (69.1040)  acc5: 87.6000 (89.4080)  time: 0.1957  data: 0.0274  max mem: 28503
Test: Total time: 0:00:10 (0.4148 s / it)
* Acc@1 68.958 Acc@5 89.610 loss 1.496
Accuracy of the model on the 50000 test images: 69.0%
Max accuracy: 68.96%
Epoch: [21]  [   0/1251]  eta: 1:01:49  lr: 0.004000  min_lr: 0.004000  loss: 3.8508 (3.8508)  weight_decay: 0.0500 (0.0500)  time: 2.9651  data: 2.5797  max mem: 28503
Epoch: [21]  [ 200/1251]  eta: 0:06:17  lr: 0.004000  min_lr: 0.004000  loss: 3.8253 (4.0978)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6575 (0.6900)  time: 0.3468  data: 0.0004  max mem: 28503
Epoch: [21]  [ 400/1251]  eta: 0:05:01  lr: 0.004000  min_lr: 0.004000  loss: 4.1203 (4.0722)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5923 (0.6960)  time: 0.3476  data: 0.0004  max mem: 28503
Epoch: [21]  [ 600/1251]  eta: 0:03:49  lr: 0.004000  min_lr: 0.004000  loss: 4.1790 (4.0681)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6779 (0.6911)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [21]  [ 800/1251]  eta: 0:02:38  lr: 0.004000  min_lr: 0.004000  loss: 3.9775 (4.0678)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6483 (0.6753)  time: 0.3529  data: 0.0004  max mem: 28503
Epoch: [21]  [1000/1251]  eta: 0:01:27  lr: 0.004000  min_lr: 0.004000  loss: 4.1921 (4.0716)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5971 (0.6686)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [21]  [1200/1251]  eta: 0:00:17  lr: 0.004000  min_lr: 0.004000  loss: 4.1389 (4.0556)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6641 (0.6618)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [21]  [1250/1251]  eta: 0:00:00  lr: 0.003999  min_lr: 0.003999  loss: 4.2652 (4.0550)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5169 (0.6579)  time: 0.2918  data: 0.0006  max mem: 28503
Epoch: [21] Total time: 0:07:17 (0.3496 s / it)
Averaged stats: lr: 0.003999  min_lr: 0.003999  loss: 4.2652 (4.0289)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5169 (0.6579)
Test:  [ 0/25]  eta: 0:01:53  loss: 1.0542 (1.0542)  acc1: 80.0000 (80.0000)  acc5: 96.0000 (96.0000)  time: 4.5260  data: 4.3167  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 1.1727 (1.2169)  acc1: 74.4000 (75.2364)  acc5: 93.2000 (93.6364)  time: 0.6557  data: 0.4823  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.5132 (1.4697)  acc1: 67.6000 (69.7333)  acc5: 89.6000 (90.0191)  time: 0.2269  data: 0.0578  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.5980 (1.4795)  acc1: 65.2000 (69.3760)  acc5: 88.0000 (90.0000)  time: 0.1877  data: 0.0186  max mem: 28503
Test: Total time: 0:00:09 (0.3941 s / it)
* Acc@1 69.272 Acc@5 89.886 loss 1.477
Accuracy of the model on the 50000 test images: 69.3%
Max accuracy: 69.27%
Epoch: [22]  [   0/1251]  eta: 1:07:29  lr: 0.003999  min_lr: 0.003999  loss: 3.0554 (3.0554)  weight_decay: 0.0500 (0.0500)  time: 3.2370  data: 2.8707  max mem: 28503
Epoch: [22]  [ 200/1251]  eta: 0:06:22  lr: 0.003999  min_lr: 0.003999  loss: 3.9868 (3.9782)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6223 (0.6627)  time: 0.3462  data: 0.0004  max mem: 28503
Epoch: [22]  [ 400/1251]  eta: 0:05:02  lr: 0.003999  min_lr: 0.003999  loss: 4.2465 (3.9947)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5799 (0.6524)  time: 0.3533  data: 0.0004  max mem: 28503
Epoch: [22]  [ 600/1251]  eta: 0:03:49  lr: 0.003999  min_lr: 0.003999  loss: 4.0211 (3.9903)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6344 (0.6442)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [22]  [ 800/1251]  eta: 0:02:38  lr: 0.003999  min_lr: 0.003999  loss: 3.9973 (3.9867)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5894 (0.6416)  time: 0.3552  data: 0.0004  max mem: 28503
Epoch: [22]  [1000/1251]  eta: 0:01:28  lr: 0.003999  min_lr: 0.003999  loss: 4.0541 (3.9801)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6381 (0.6398)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [22]  [1200/1251]  eta: 0:00:17  lr: 0.003999  min_lr: 0.003999  loss: 4.2298 (3.9902)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6554 (0.6432)  time: 0.3480  data: 0.0004  max mem: 28503
Epoch: [22]  [1250/1251]  eta: 0:00:00  lr: 0.003999  min_lr: 0.003999  loss: 4.1534 (3.9925)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6065 (0.6429)  time: 0.2981  data: 0.0006  max mem: 28503
Epoch: [22] Total time: 0:07:17 (0.3501 s / it)
Averaged stats: lr: 0.003999  min_lr: 0.003999  loss: 4.1534 (3.9956)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6065 (0.6429)
Test:  [ 0/25]  eta: 0:02:15  loss: 1.0667 (1.0667)  acc1: 79.2000 (79.2000)  acc5: 96.4000 (96.4000)  time: 5.4058  data: 5.2120  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 1.3145 (1.2837)  acc1: 76.4000 (75.0545)  acc5: 94.0000 (93.3818)  time: 0.7403  data: 0.5684  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.5989 (1.5434)  acc1: 67.2000 (69.9619)  acc5: 88.4000 (89.8857)  time: 0.2304  data: 0.0614  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.7465 (1.5542)  acc1: 64.8000 (69.6800)  acc5: 87.2000 (89.8400)  time: 0.2296  data: 0.0613  max mem: 28503
Test: Total time: 0:00:10 (0.4305 s / it)
* Acc@1 69.778 Acc@5 90.072 loss 1.541
Accuracy of the model on the 50000 test images: 69.8%
Max accuracy: 69.78%
Epoch: [23]  [   0/1251]  eta: 1:09:04  lr: 0.003999  min_lr: 0.003999  loss: 4.6548 (4.6548)  weight_decay: 0.0500 (0.0500)  time: 3.3130  data: 2.9252  max mem: 28503
Epoch: [23]  [ 200/1251]  eta: 0:06:20  lr: 0.003999  min_lr: 0.003999  loss: 3.8306 (3.9479)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6356 (0.6419)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [23]  [ 400/1251]  eta: 0:05:02  lr: 0.003999  min_lr: 0.003999  loss: 4.2083 (3.9555)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5906 (0.6347)  time: 0.3442  data: 0.0004  max mem: 28503
Epoch: [23]  [ 600/1251]  eta: 0:03:49  lr: 0.003998  min_lr: 0.003998  loss: 4.2028 (3.9638)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5476 (0.6165)  time: 0.3461  data: 0.0004  max mem: 28503
Epoch: [23]  [ 800/1251]  eta: 0:02:38  lr: 0.003998  min_lr: 0.003998  loss: 4.2732 (3.9709)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5400 (0.6079)  time: 0.3571  data: 0.0004  max mem: 28503
Epoch: [23]  [1000/1251]  eta: 0:01:27  lr: 0.003998  min_lr: 0.003998  loss: 4.2860 (3.9658)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6644 (0.6195)  time: 0.3540  data: 0.0004  max mem: 28503
Epoch: [23]  [1200/1251]  eta: 0:00:17  lr: 0.003998  min_lr: 0.003998  loss: 4.2027 (3.9601)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6801 (0.6190)  time: 0.3540  data: 0.0004  max mem: 28503
Epoch: [23]  [1250/1251]  eta: 0:00:00  lr: 0.003998  min_lr: 0.003998  loss: 4.3166 (3.9601)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6310 (0.6194)  time: 0.2917  data: 0.0006  max mem: 28503
Epoch: [23] Total time: 0:07:17 (0.3499 s / it)
Averaged stats: lr: 0.003998  min_lr: 0.003998  loss: 4.3166 (3.9608)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6310 (0.6194)
Test:  [ 0/25]  eta: 0:02:25  loss: 1.0326 (1.0326)  acc1: 80.8000 (80.8000)  acc5: 96.4000 (96.4000)  time: 5.8175  data: 5.5822  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.2031 (1.2543)  acc1: 74.0000 (74.2182)  acc5: 94.0000 (93.8909)  time: 0.7296  data: 0.5545  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.5471 (1.4845)  acc1: 65.2000 (69.2952)  acc5: 89.6000 (90.1714)  time: 0.1945  data: 0.0259  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.5885 (1.4830)  acc1: 65.2000 (69.3760)  acc5: 88.4000 (90.1600)  time: 0.1942  data: 0.0258  max mem: 28503
Test: Total time: 0:00:10 (0.4183 s / it)
* Acc@1 69.692 Acc@5 90.246 loss 1.479
Accuracy of the model on the 50000 test images: 69.7%
Max accuracy: 69.78%
Epoch: [24]  [   0/1251]  eta: 1:07:50  lr: 0.003998  min_lr: 0.003998  loss: 4.6510 (4.6510)  weight_decay: 0.0500 (0.0500)  time: 3.2534  data: 2.6535  max mem: 28503
Epoch: [24]  [ 200/1251]  eta: 0:06:21  lr: 0.003998  min_lr: 0.003998  loss: 4.0144 (3.9820)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5501 (0.5488)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [24]  [ 400/1251]  eta: 0:05:02  lr: 0.003998  min_lr: 0.003998  loss: 3.9891 (3.9699)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5837 (0.5949)  time: 0.3467  data: 0.0004  max mem: 28503
Epoch: [24]  [ 600/1251]  eta: 0:03:49  lr: 0.003997  min_lr: 0.003997  loss: 4.0802 (3.9455)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6180 (0.6169)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [24]  [ 800/1251]  eta: 0:02:38  lr: 0.003997  min_lr: 0.003997  loss: 4.1054 (3.9376)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6359 (0.6198)  time: 0.3586  data: 0.0005  max mem: 28503
Epoch: [24]  [1000/1251]  eta: 0:01:28  lr: 0.003997  min_lr: 0.003997  loss: 3.9116 (3.9536)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5542 (0.6141)  time: 0.3471  data: 0.0004  max mem: 28503
Epoch: [24]  [1200/1251]  eta: 0:00:17  lr: 0.003997  min_lr: 0.003997  loss: 4.1805 (3.9657)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5774 (0.6213)  time: 0.3554  data: 0.0004  max mem: 28503
Epoch: [24]  [1250/1251]  eta: 0:00:00  lr: 0.003997  min_lr: 0.003997  loss: 3.7057 (3.9614)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5370 (0.6187)  time: 0.2978  data: 0.0008  max mem: 28503
Epoch: [24] Total time: 0:07:18 (0.3507 s / it)
Averaged stats: lr: 0.003997  min_lr: 0.003997  loss: 3.7057 (3.9508)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5370 (0.6187)
Test:  [ 0/25]  eta: 0:02:27  loss: 0.9985 (0.9985)  acc1: 79.2000 (79.2000)  acc5: 95.6000 (95.6000)  time: 5.8896  data: 5.6876  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 1.1593 (1.1845)  acc1: 73.6000 (75.0182)  acc5: 94.4000 (93.9273)  time: 0.7442  data: 0.5721  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.3909 (1.4004)  acc1: 68.0000 (70.5333)  acc5: 89.2000 (90.3429)  time: 0.1991  data: 0.0303  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.5589 (1.4079)  acc1: 67.6000 (70.4320)  acc5: 89.2000 (90.4000)  time: 0.1986  data: 0.0302  max mem: 28503
Test: Total time: 0:00:10 (0.4256 s / it)
* Acc@1 70.610 Acc@5 90.712 loss 1.399
Accuracy of the model on the 50000 test images: 70.6%
Max accuracy: 70.61%
Epoch: [25]  [   0/1251]  eta: 1:08:38  lr: 0.003997  min_lr: 0.003997  loss: 3.0435 (3.0435)  weight_decay: 0.0500 (0.0500)  time: 3.2922  data: 2.9308  max mem: 28503
Epoch: [25]  [ 200/1251]  eta: 0:06:20  lr: 0.003997  min_lr: 0.003997  loss: 4.2025 (3.9206)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5876 (0.6588)  time: 0.3486  data: 0.0004  max mem: 28503
Epoch: [25]  [ 400/1251]  eta: 0:05:02  lr: 0.003996  min_lr: 0.003996  loss: 4.0369 (3.9389)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6202 (0.6567)  time: 0.3463  data: 0.0004  max mem: 28503
Epoch: [25]  [ 600/1251]  eta: 0:03:49  lr: 0.003996  min_lr: 0.003996  loss: 4.1898 (3.9437)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5942 (0.6285)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [25]  [ 800/1251]  eta: 0:02:38  lr: 0.003996  min_lr: 0.003996  loss: 3.7907 (3.9272)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5592 (0.6283)  time: 0.3545  data: 0.0004  max mem: 28503
Epoch: [25]  [1000/1251]  eta: 0:01:28  lr: 0.003996  min_lr: 0.003996  loss: 4.1165 (3.9387)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5911 (0.6193)  time: 0.3525  data: 0.0004  max mem: 28503
Epoch: [25]  [1200/1251]  eta: 0:00:17  lr: 0.003996  min_lr: 0.003996  loss: 4.0571 (3.9409)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6320 (0.6188)  time: 0.3534  data: 0.0004  max mem: 28503
Epoch: [25]  [1250/1251]  eta: 0:00:00  lr: 0.003995  min_lr: 0.003995  loss: 4.1001 (3.9414)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5137 (0.6145)  time: 0.2918  data: 0.0006  max mem: 28503
Epoch: [25] Total time: 0:07:17 (0.3499 s / it)
Averaged stats: lr: 0.003995  min_lr: 0.003995  loss: 4.1001 (3.9180)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5137 (0.6145)
Test:  [ 0/25]  eta: 0:02:24  loss: 1.0110 (1.0110)  acc1: 81.2000 (81.2000)  acc5: 96.0000 (96.0000)  time: 5.7962  data: 5.5659  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.1596 (1.1999)  acc1: 78.0000 (76.3273)  acc5: 94.8000 (94.3273)  time: 0.7118  data: 0.5375  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.5036 (1.4379)  acc1: 67.2000 (71.1048)  acc5: 90.8000 (90.6667)  time: 0.1860  data: 0.0174  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.5857 (1.4418)  acc1: 67.2000 (70.8320)  acc5: 88.8000 (90.6240)  time: 0.1858  data: 0.0173  max mem: 28503
Test: Total time: 0:00:10 (0.4129 s / it)
* Acc@1 70.822 Acc@5 90.844 loss 1.432
Accuracy of the model on the 50000 test images: 70.8%
Max accuracy: 70.82%
Epoch: [26]  [   0/1251]  eta: 1:02:03  lr: 0.003995  min_lr: 0.003995  loss: 3.0411 (3.0411)  weight_decay: 0.0500 (0.0500)  time: 2.9765  data: 2.6281  max mem: 28503
Epoch: [26]  [ 200/1251]  eta: 0:06:18  lr: 0.003995  min_lr: 0.003995  loss: 4.1422 (3.8573)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5698 (0.5658)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [26]  [ 400/1251]  eta: 0:05:00  lr: 0.003995  min_lr: 0.003995  loss: 4.1638 (3.8984)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5536 (0.5891)  time: 0.3620  data: 0.0004  max mem: 28503
Epoch: [26]  [ 600/1251]  eta: 0:03:48  lr: 0.003995  min_lr: 0.003995  loss: 4.2156 (3.8946)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5193 (0.5904)  time: 0.3464  data: 0.0004  max mem: 28503
Epoch: [26]  [ 800/1251]  eta: 0:02:38  lr: 0.003994  min_lr: 0.003994  loss: 4.0197 (3.8973)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6296 (0.6008)  time: 0.3541  data: 0.0004  max mem: 28503
Epoch: [26]  [1000/1251]  eta: 0:01:27  lr: 0.003994  min_lr: 0.003994  loss: 3.7299 (3.8975)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5466 (0.6002)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [26]  [1200/1251]  eta: 0:00:17  lr: 0.003994  min_lr: 0.003994  loss: 3.6357 (3.9087)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5818 (0.6058)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [26]  [1250/1251]  eta: 0:00:00  lr: 0.003994  min_lr: 0.003994  loss: 3.8168 (3.9051)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7132 (0.6097)  time: 0.2958  data: 0.0007  max mem: 28503
Epoch: [26] Total time: 0:07:17 (0.3495 s / it)
Averaged stats: lr: 0.003994  min_lr: 0.003994  loss: 3.8168 (3.8974)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7132 (0.6097)
Test:  [ 0/25]  eta: 0:02:27  loss: 0.9391 (0.9391)  acc1: 82.0000 (82.0000)  acc5: 96.8000 (96.8000)  time: 5.8817  data: 5.6803  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.1318 (1.1349)  acc1: 74.0000 (76.1091)  acc5: 95.2000 (94.8000)  time: 0.7284  data: 0.5568  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.4358 (1.3728)  acc1: 67.6000 (70.9524)  acc5: 91.2000 (91.1429)  time: 0.1906  data: 0.0223  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.4970 (1.3812)  acc1: 67.2000 (70.8480)  acc5: 89.2000 (91.0400)  time: 0.1904  data: 0.0222  max mem: 28503
Test: Total time: 0:00:10 (0.4182 s / it)
* Acc@1 71.178 Acc@5 91.140 loss 1.366
Accuracy of the model on the 50000 test images: 71.2%
Max accuracy: 71.18%
Epoch: [27]  [   0/1251]  eta: 1:01:00  lr: 0.003994  min_lr: 0.003994  loss: 4.3219 (4.3219)  weight_decay: 0.0500 (0.0500)  time: 2.9259  data: 2.5334  max mem: 28503
Epoch: [27]  [ 200/1251]  eta: 0:06:17  lr: 0.003994  min_lr: 0.003994  loss: 3.9503 (3.8781)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6071 (0.6185)  time: 0.3457  data: 0.0004  max mem: 28503
Epoch: [27]  [ 400/1251]  eta: 0:05:01  lr: 0.003993  min_lr: 0.003993  loss: 3.7107 (3.8362)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5199 (0.6130)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [27]  [ 600/1251]  eta: 0:03:48  lr: 0.003993  min_lr: 0.003993  loss: 3.9233 (3.8263)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6556 (0.6327)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [27]  [ 800/1251]  eta: 0:02:38  lr: 0.003993  min_lr: 0.003993  loss: 4.0617 (3.8523)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5538 (0.6304)  time: 0.3580  data: 0.0004  max mem: 28503
Epoch: [27]  [1000/1251]  eta: 0:01:27  lr: 0.003992  min_lr: 0.003992  loss: 4.1081 (3.8441)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5671 (0.6216)  time: 0.3559  data: 0.0004  max mem: 28503
Epoch: [27]  [1200/1251]  eta: 0:00:17  lr: 0.003992  min_lr: 0.003992  loss: 3.4914 (3.8586)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4566 (0.6086)  time: 0.3468  data: 0.0004  max mem: 28503
Epoch: [27]  [1250/1251]  eta: 0:00:00  lr: 0.003992  min_lr: 0.003992  loss: 3.9659 (3.8606)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4834 (0.6063)  time: 0.2919  data: 0.0006  max mem: 28503
Epoch: [27] Total time: 0:07:17 (0.3497 s / it)
Averaged stats: lr: 0.003992  min_lr: 0.003992  loss: 3.9659 (3.8677)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4834 (0.6063)
Test:  [ 0/25]  eta: 0:01:56  loss: 0.8741 (0.8741)  acc1: 81.2000 (81.2000)  acc5: 96.4000 (96.4000)  time: 4.6746  data: 4.4303  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 1.0570 (1.1173)  acc1: 76.8000 (76.0000)  acc5: 95.2000 (94.6182)  time: 0.6392  data: 0.4626  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.4545 (1.3452)  acc1: 68.0000 (71.4857)  acc5: 90.0000 (91.2191)  time: 0.2337  data: 0.0646  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.5183 (1.3585)  acc1: 68.0000 (71.3280)  acc5: 88.8000 (91.1360)  time: 0.2106  data: 0.0422  max mem: 28503
Test: Total time: 0:00:10 (0.4043 s / it)
* Acc@1 71.670 Acc@5 91.150 loss 1.352
Accuracy of the model on the 50000 test images: 71.7%
Max accuracy: 71.67%
Epoch: [28]  [   0/1251]  eta: 1:02:16  lr: 0.003992  min_lr: 0.003992  loss: 4.2310 (4.2310)  weight_decay: 0.0500 (0.0500)  time: 2.9872  data: 2.5918  max mem: 28503
Epoch: [28]  [ 200/1251]  eta: 0:06:18  lr: 0.003992  min_lr: 0.003992  loss: 4.1273 (3.8669)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6197 (0.6492)  time: 0.3545  data: 0.0005  max mem: 28503
Epoch: [28]  [ 400/1251]  eta: 0:05:01  lr: 0.003991  min_lr: 0.003991  loss: 3.9101 (3.8826)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5949 (0.6611)  time: 0.3462  data: 0.0004  max mem: 28503
Epoch: [28]  [ 600/1251]  eta: 0:03:49  lr: 0.003991  min_lr: 0.003991  loss: 3.9775 (3.8499)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5308 (0.6580)  time: 0.3457  data: 0.0004  max mem: 28503
Epoch: [28]  [ 800/1251]  eta: 0:02:38  lr: 0.003991  min_lr: 0.003991  loss: 4.0278 (3.8466)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5615 (0.6770)  time: 0.3548  data: 0.0004  max mem: 28503
Epoch: [28]  [1000/1251]  eta: 0:01:27  lr: 0.003990  min_lr: 0.003990  loss: 3.8472 (3.8391)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5032 (0.6693)  time: 0.3464  data: 0.0004  max mem: 28503
Epoch: [28]  [1200/1251]  eta: 0:00:17  lr: 0.003990  min_lr: 0.003990  loss: 3.7820 (3.8281)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6754 (0.6596)  time: 0.3483  data: 0.0004  max mem: 28503
Epoch: [28]  [1250/1251]  eta: 0:00:00  lr: 0.003990  min_lr: 0.003990  loss: 3.8541 (3.8309)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6091 (0.6593)  time: 0.2917  data: 0.0006  max mem: 28503
Epoch: [28] Total time: 0:07:17 (0.3496 s / it)
Averaged stats: lr: 0.003990  min_lr: 0.003990  loss: 3.8541 (3.8521)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6091 (0.6593)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.9234 (0.9234)  acc1: 80.4000 (80.4000)  acc5: 95.6000 (95.6000)  time: 5.3621  data: 5.1625  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.1059 (1.1210)  acc1: 75.6000 (75.7818)  acc5: 95.2000 (94.6546)  time: 0.6717  data: 0.4983  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.3627 (1.3460)  acc1: 68.0000 (71.3333)  acc5: 90.8000 (91.5238)  time: 0.2139  data: 0.0443  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.4582 (1.3620)  acc1: 68.4000 (71.1360)  acc5: 88.4000 (91.1840)  time: 0.2134  data: 0.0442  max mem: 28503
Test: Total time: 0:00:10 (0.4159 s / it)
* Acc@1 71.490 Acc@5 91.184 loss 1.355
Accuracy of the model on the 50000 test images: 71.5%
Max accuracy: 71.67%
Epoch: [29]  [   0/1251]  eta: 1:10:59  lr: 0.003990  min_lr: 0.003990  loss: 3.9139 (3.9139)  weight_decay: 0.0500 (0.0500)  time: 3.4052  data: 2.8290  max mem: 28503
Epoch: [29]  [ 200/1251]  eta: 0:06:22  lr: 0.003989  min_lr: 0.003989  loss: 3.7386 (3.7963)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6463 (0.6120)  time: 0.3564  data: 0.0004  max mem: 28503
Epoch: [29]  [ 400/1251]  eta: 0:05:03  lr: 0.003989  min_lr: 0.003989  loss: 3.9343 (3.8133)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6443 (0.6357)  time: 0.3465  data: 0.0004  max mem: 28503
Epoch: [29]  [ 600/1251]  eta: 0:03:50  lr: 0.003989  min_lr: 0.003989  loss: 3.6879 (3.8214)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5288 (0.6289)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [29]  [ 800/1251]  eta: 0:02:38  lr: 0.003988  min_lr: 0.003988  loss: 4.1290 (3.8263)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6017 (0.6299)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [29]  [1000/1251]  eta: 0:01:28  lr: 0.003988  min_lr: 0.003988  loss: 3.6453 (3.8215)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6025 (0.6322)  time: 0.3460  data: 0.0004  max mem: 28503
Epoch: [29]  [1200/1251]  eta: 0:00:17  lr: 0.003988  min_lr: 0.003988  loss: 3.9932 (3.8262)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5973 (0.6254)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [29]  [1250/1251]  eta: 0:00:00  lr: 0.003987  min_lr: 0.003987  loss: 3.6238 (3.8251)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6740 (0.6263)  time: 0.2920  data: 0.0005  max mem: 28503
Epoch: [29] Total time: 0:07:18 (0.3502 s / it)
Averaged stats: lr: 0.003987  min_lr: 0.003987  loss: 3.6238 (3.8292)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6740 (0.6263)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.9383 (0.9383)  acc1: 82.8000 (82.8000)  acc5: 96.0000 (96.0000)  time: 5.2900  data: 5.0597  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 1.0779 (1.1199)  acc1: 77.2000 (76.3636)  acc5: 95.6000 (94.8000)  time: 0.6372  data: 0.4621  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.4734 (1.3428)  acc1: 67.2000 (71.3333)  acc5: 89.6000 (91.5619)  time: 0.2030  data: 0.0340  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.5109 (1.3562)  acc1: 65.2000 (70.9920)  acc5: 89.2000 (91.4080)  time: 0.2016  data: 0.0331  max mem: 28503
Test: Total time: 0:00:10 (0.4049 s / it)
* Acc@1 71.856 Acc@5 91.376 loss 1.349
Accuracy of the model on the 50000 test images: 71.9%
Max accuracy: 71.86%
Epoch: [30]  [   0/1251]  eta: 1:05:18  lr: 0.003987  min_lr: 0.003987  loss: 4.3608 (4.3608)  weight_decay: 0.0500 (0.0500)  time: 3.1320  data: 2.7309  max mem: 28503
Epoch: [30]  [ 200/1251]  eta: 0:06:21  lr: 0.003987  min_lr: 0.003987  loss: 4.1345 (3.7631)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6038 (0.6431)  time: 0.3465  data: 0.0005  max mem: 28503
Epoch: [30]  [ 400/1251]  eta: 0:05:02  lr: 0.003987  min_lr: 0.003987  loss: 3.7941 (3.7764)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5605 (0.6216)  time: 0.3468  data: 0.0004  max mem: 28503
Epoch: [30]  [ 600/1251]  eta: 0:03:49  lr: 0.003986  min_lr: 0.003986  loss: 3.6869 (3.7791)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5276 (0.6275)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [30]  [ 800/1251]  eta: 0:02:38  lr: 0.003986  min_lr: 0.003986  loss: 3.8743 (3.8012)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6013 (0.6247)  time: 0.3487  data: 0.0004  max mem: 28503
Epoch: [30]  [1000/1251]  eta: 0:01:28  lr: 0.003985  min_lr: 0.003985  loss: 4.1037 (3.7973)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5922 (0.6180)  time: 0.3559  data: 0.0004  max mem: 28503
Epoch: [30]  [1200/1251]  eta: 0:00:17  lr: 0.003985  min_lr: 0.003985  loss: 3.6098 (3.7937)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5250 (0.6084)  time: 0.3552  data: 0.0004  max mem: 28503
Epoch: [30]  [1250/1251]  eta: 0:00:00  lr: 0.003985  min_lr: 0.003985  loss: 3.8266 (3.7920)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5344 (0.6064)  time: 0.2916  data: 0.0006  max mem: 28503
Epoch: [30] Total time: 0:07:17 (0.3500 s / it)
Averaged stats: lr: 0.003985  min_lr: 0.003985  loss: 3.8266 (3.8061)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5344 (0.6064)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.9882 (0.9882)  acc1: 82.0000 (82.0000)  acc5: 96.4000 (96.4000)  time: 5.6750  data: 5.4720  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.1078 (1.1382)  acc1: 76.4000 (76.5818)  acc5: 95.6000 (94.9818)  time: 0.6706  data: 0.4978  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.4171 (1.3389)  acc1: 68.0000 (71.9048)  acc5: 90.4000 (91.6762)  time: 0.1722  data: 0.0030  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.5551 (1.3512)  acc1: 66.8000 (71.6000)  acc5: 89.2000 (91.6160)  time: 0.1715  data: 0.0029  max mem: 28503
Test: Total time: 0:00:09 (0.3947 s / it)
* Acc@1 72.642 Acc@5 91.682 loss 1.346
Accuracy of the model on the 50000 test images: 72.6%
Max accuracy: 72.64%
Epoch: [31]  [   0/1251]  eta: 0:59:41  lr: 0.003985  min_lr: 0.003985  loss: 2.9978 (2.9978)  weight_decay: 0.0500 (0.0500)  time: 2.8630  data: 2.4478  max mem: 28503
Epoch: [31]  [ 200/1251]  eta: 0:06:18  lr: 0.003984  min_lr: 0.003984  loss: 4.1252 (3.7519)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5760 (0.6885)  time: 0.3571  data: 0.0005  max mem: 28503
Epoch: [31]  [ 400/1251]  eta: 0:05:01  lr: 0.003984  min_lr: 0.003984  loss: 3.9941 (3.7923)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5465 (0.6568)  time: 0.3542  data: 0.0004  max mem: 28503
Epoch: [31]  [ 600/1251]  eta: 0:03:49  lr: 0.003983  min_lr: 0.003983  loss: 3.5819 (3.7906)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5569 (0.6555)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [31]  [ 800/1251]  eta: 0:02:38  lr: 0.003983  min_lr: 0.003983  loss: 3.9134 (3.7988)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6872 (0.6583)  time: 0.3443  data: 0.0004  max mem: 28503
Epoch: [31]  [1000/1251]  eta: 0:01:27  lr: 0.003982  min_lr: 0.003982  loss: 3.9793 (3.8094)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6597 (0.6573)  time: 0.3532  data: 0.0004  max mem: 28503
Epoch: [31]  [1200/1251]  eta: 0:00:17  lr: 0.003982  min_lr: 0.003982  loss: 3.6483 (3.8051)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5431 (0.6528)  time: 0.3448  data: 0.0005  max mem: 28503
Epoch: [31]  [1250/1251]  eta: 0:00:00  lr: 0.003982  min_lr: 0.003982  loss: 3.7614 (3.8015)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5967 (0.6538)  time: 0.2915  data: 0.0006  max mem: 28503
Epoch: [31] Total time: 0:07:16 (0.3488 s / it)
Averaged stats: lr: 0.003982  min_lr: 0.003982  loss: 3.7614 (3.8018)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5967 (0.6538)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.9438 (0.9438)  acc1: 83.2000 (83.2000)  acc5: 95.6000 (95.6000)  time: 5.7790  data: 5.5751  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.0257 (1.0922)  acc1: 78.0000 (77.1636)  acc5: 95.6000 (94.8727)  time: 0.7127  data: 0.5404  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.3120 (1.2971)  acc1: 70.0000 (72.8191)  acc5: 91.2000 (91.9429)  time: 0.1956  data: 0.0268  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.4227 (1.3065)  acc1: 70.0000 (72.6240)  acc5: 90.0000 (91.6800)  time: 0.1954  data: 0.0267  max mem: 28503
Test: Total time: 0:00:10 (0.4176 s / it)
* Acc@1 72.470 Acc@5 91.794 loss 1.303
Accuracy of the model on the 50000 test images: 72.5%
Max accuracy: 72.64%
Epoch: [32]  [   0/1251]  eta: 1:09:43  lr: 0.003982  min_lr: 0.003982  loss: 4.0309 (4.0309)  weight_decay: 0.0500 (0.0500)  time: 3.3443  data: 2.5453  max mem: 28503
Epoch: [32]  [ 200/1251]  eta: 0:06:21  lr: 0.003981  min_lr: 0.003981  loss: 3.6338 (3.7645)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5664 (0.5947)  time: 0.3495  data: 0.0004  max mem: 28503
Epoch: [32]  [ 400/1251]  eta: 0:05:02  lr: 0.003981  min_lr: 0.003981  loss: 3.5897 (3.7891)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7094 (nan)  time: 0.3569  data: 0.0004  max mem: 28503
Epoch: [32]  [ 600/1251]  eta: 0:03:49  lr: 0.003980  min_lr: 0.003980  loss: 3.7827 (3.7999)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6269 (nan)  time: 0.3572  data: 0.0004  max mem: 28503
Epoch: [32]  [ 800/1251]  eta: 0:02:38  lr: 0.003980  min_lr: 0.003980  loss: 3.7317 (3.7889)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7055 (nan)  time: 0.3465  data: 0.0004  max mem: 28503
Epoch: [32]  [1000/1251]  eta: 0:01:28  lr: 0.003979  min_lr: 0.003979  loss: 3.3575 (3.7735)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6286 (nan)  time: 0.3522  data: 0.0004  max mem: 28503
Epoch: [32]  [1200/1251]  eta: 0:00:17  lr: 0.003979  min_lr: 0.003979  loss: 3.9585 (3.7847)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5942 (nan)  time: 0.3479  data: 0.0004  max mem: 28503
Epoch: [32]  [1250/1251]  eta: 0:00:00  lr: 0.003979  min_lr: 0.003979  loss: 3.6925 (3.7858)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5584 (nan)  time: 0.2920  data: 0.0007  max mem: 28503
Epoch: [32] Total time: 0:07:17 (0.3499 s / it)
Averaged stats: lr: 0.003979  min_lr: 0.003979  loss: 3.6925 (3.7763)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5584 (nan)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.8964 (0.8964)  acc1: 80.8000 (80.8000)  acc5: 96.0000 (96.0000)  time: 5.4062  data: 5.2041  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.0941 (1.0890)  acc1: 77.2000 (77.4182)  acc5: 94.8000 (95.0546)  time: 0.7036  data: 0.5302  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.4052 (1.3178)  acc1: 68.8000 (71.9619)  acc5: 92.0000 (91.6571)  time: 0.2047  data: 0.0314  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.4682 (1.3306)  acc1: 68.8000 (71.9200)  acc5: 89.2000 (91.6320)  time: 0.2038  data: 0.0314  max mem: 28503
Test: Total time: 0:00:10 (0.4103 s / it)
* Acc@1 72.736 Acc@5 91.902 loss 1.315
Accuracy of the model on the 50000 test images: 72.7%
Max accuracy: 72.74%
Epoch: [33]  [   0/1251]  eta: 0:57:23  lr: 0.003979  min_lr: 0.003979  loss: 2.8471 (2.8471)  weight_decay: 0.0500 (0.0500)  time: 2.7526  data: 2.3651  max mem: 28503
Epoch: [33]  [ 200/1251]  eta: 0:06:22  lr: 0.003978  min_lr: 0.003978  loss: 3.8334 (3.7300)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.3546  data: 0.0005  max mem: 28503
Epoch: [33]  [ 400/1251]  eta: 0:05:02  lr: 0.003978  min_lr: 0.003978  loss: 3.9706 (3.7283)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6538 (nan)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [33]  [ 600/1251]  eta: 0:03:49  lr: 0.003977  min_lr: 0.003977  loss: 3.8354 (3.7096)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6117 (nan)  time: 0.3463  data: 0.0004  max mem: 28503
Epoch: [33]  [ 800/1251]  eta: 0:02:38  lr: 0.003977  min_lr: 0.003977  loss: 3.7866 (3.7210)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5531 (nan)  time: 0.3466  data: 0.0004  max mem: 28503
Epoch: [33]  [1000/1251]  eta: 0:01:28  lr: 0.003976  min_lr: 0.003976  loss: 4.0881 (3.7314)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5549 (nan)  time: 0.3558  data: 0.0004  max mem: 28503
Epoch: [33]  [1200/1251]  eta: 0:00:17  lr: 0.003976  min_lr: 0.003976  loss: 3.6347 (3.7378)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5294 (nan)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [33]  [1250/1251]  eta: 0:00:00  lr: 0.003975  min_lr: 0.003975  loss: 3.9554 (3.7413)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5294 (nan)  time: 0.2921  data: 0.0008  max mem: 28503
Epoch: [33] Total time: 0:07:18 (0.3502 s / it)
Averaged stats: lr: 0.003975  min_lr: 0.003975  loss: 3.9554 (3.7607)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5294 (nan)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.8953 (0.8953)  acc1: 82.8000 (82.8000)  acc5: 96.4000 (96.4000)  time: 5.5151  data: 5.3309  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.0376 (1.0620)  acc1: 79.6000 (77.4545)  acc5: 95.6000 (95.3091)  time: 0.6827  data: 0.5122  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.3246 (1.2706)  acc1: 69.6000 (72.8191)  acc5: 92.0000 (92.4381)  time: 0.1890  data: 0.0203  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.4160 (1.2808)  acc1: 69.6000 (72.6240)  acc5: 91.2000 (92.4640)  time: 0.1897  data: 0.0212  max mem: 28503
Test: Total time: 0:00:10 (0.4040 s / it)
* Acc@1 73.220 Acc@5 92.194 loss 1.274
Accuracy of the model on the 50000 test images: 73.2%
Max accuracy: 73.22%
Epoch: [34]  [   0/1251]  eta: 1:01:31  lr: 0.003975  min_lr: 0.003975  loss: 3.9152 (3.9152)  weight_decay: 0.0500 (0.0500)  time: 2.9508  data: 2.5859  max mem: 28503
Epoch: [34]  [ 200/1251]  eta: 0:06:18  lr: 0.003975  min_lr: 0.003975  loss: 3.7991 (3.7078)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5644 (0.6952)  time: 0.3575  data: 0.0004  max mem: 28503
Epoch: [34]  [ 400/1251]  eta: 0:05:01  lr: 0.003974  min_lr: 0.003974  loss: 3.9231 (3.7311)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6236 (0.6642)  time: 0.3457  data: 0.0004  max mem: 28503
Epoch: [34]  [ 600/1251]  eta: 0:03:49  lr: 0.003974  min_lr: 0.003974  loss: 3.8551 (3.7150)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5939 (0.6772)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [34]  [ 800/1251]  eta: 0:02:38  lr: 0.003973  min_lr: 0.003973  loss: 3.7641 (3.7318)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5956 (0.6709)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [34]  [1000/1251]  eta: 0:01:28  lr: 0.003972  min_lr: 0.003972  loss: 3.8225 (3.7432)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5389 (0.6553)  time: 0.3514  data: 0.0004  max mem: 28503
Epoch: [34]  [1200/1251]  eta: 0:00:17  lr: 0.003972  min_lr: 0.003972  loss: 3.5596 (3.7360)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7081 (0.6483)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [34]  [1250/1251]  eta: 0:00:00  lr: 0.003972  min_lr: 0.003972  loss: 3.6339 (3.7367)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7290 (0.6522)  time: 0.2917  data: 0.0006  max mem: 28503
Epoch: [34] Total time: 0:07:18 (0.3504 s / it)
Averaged stats: lr: 0.003972  min_lr: 0.003972  loss: 3.6339 (3.7479)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7290 (0.6522)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.8941 (0.8941)  acc1: 81.2000 (81.2000)  acc5: 97.2000 (97.2000)  time: 5.5971  data: 5.4098  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.0182 (1.0741)  acc1: 80.4000 (77.9273)  acc5: 95.6000 (94.9818)  time: 0.6978  data: 0.5265  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.3259 (1.2825)  acc1: 70.0000 (73.4286)  acc5: 92.0000 (92.0571)  time: 0.1882  data: 0.0191  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.4785 (1.2890)  acc1: 69.6000 (73.2160)  acc5: 90.0000 (92.0800)  time: 0.1875  data: 0.0191  max mem: 28503
Test: Total time: 0:00:10 (0.4074 s / it)
* Acc@1 73.434 Acc@5 92.104 loss 1.283
Accuracy of the model on the 50000 test images: 73.4%
Max accuracy: 73.43%
Epoch: [35]  [   0/1251]  eta: 1:03:08  lr: 0.003972  min_lr: 0.003972  loss: 3.1017 (3.1017)  weight_decay: 0.0500 (0.0500)  time: 3.0286  data: 2.6574  max mem: 28503
Epoch: [35]  [ 200/1251]  eta: 0:06:22  lr: 0.003971  min_lr: 0.003971  loss: 3.7174 (3.7013)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5619 (0.6374)  time: 0.3549  data: 0.0004  max mem: 28503
Epoch: [35]  [ 400/1251]  eta: 0:05:02  lr: 0.003971  min_lr: 0.003971  loss: 3.7569 (3.7319)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6102 (0.6263)  time: 0.3465  data: 0.0004  max mem: 28503
Epoch: [35]  [ 600/1251]  eta: 0:03:49  lr: 0.003970  min_lr: 0.003970  loss: 3.7651 (3.7576)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6115 (0.6419)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [35]  [ 800/1251]  eta: 0:02:38  lr: 0.003969  min_lr: 0.003969  loss: 3.7881 (3.7451)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6794 (0.6667)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [35]  [1000/1251]  eta: 0:01:27  lr: 0.003969  min_lr: 0.003969  loss: 3.5581 (3.7400)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5337 (0.6626)  time: 0.3527  data: 0.0004  max mem: 28503
Epoch: [35]  [1200/1251]  eta: 0:00:17  lr: 0.003968  min_lr: 0.003968  loss: 3.5993 (3.7390)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6869 (0.6703)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [35]  [1250/1251]  eta: 0:00:00  lr: 0.003968  min_lr: 0.003968  loss: 3.6310 (3.7374)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5511 (0.6687)  time: 0.2914  data: 0.0006  max mem: 28503
Epoch: [35] Total time: 0:07:17 (0.3497 s / it)
Averaged stats: lr: 0.003968  min_lr: 0.003968  loss: 3.6310 (3.7403)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5511 (0.6687)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.9387 (0.9387)  acc1: 81.2000 (81.2000)  acc5: 94.8000 (94.8000)  time: 5.6733  data: 5.4784  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.0667 (1.0544)  acc1: 80.4000 (78.1091)  acc5: 94.8000 (94.8727)  time: 0.7164  data: 0.5451  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.2831 (1.2634)  acc1: 70.0000 (73.2000)  acc5: 90.4000 (91.8095)  time: 0.2071  data: 0.0384  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.3738 (1.2681)  acc1: 68.8000 (72.9920)  acc5: 90.4000 (91.9360)  time: 0.2068  data: 0.0383  max mem: 28503
Test: Total time: 0:00:10 (0.4229 s / it)
* Acc@1 73.632 Acc@5 92.278 loss 1.258
Accuracy of the model on the 50000 test images: 73.6%
Max accuracy: 73.63%
Epoch: [36]  [   0/1251]  eta: 1:02:57  lr: 0.003968  min_lr: 0.003968  loss: 3.7607 (3.7607)  weight_decay: 0.0500 (0.0500)  time: 3.0192  data: 2.6450  max mem: 28503
Epoch: [36]  [ 200/1251]  eta: 0:06:21  lr: 0.003967  min_lr: 0.003967  loss: 3.8686 (3.7698)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5526 (0.6471)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [36]  [ 400/1251]  eta: 0:05:01  lr: 0.003967  min_lr: 0.003967  loss: 3.7142 (3.7526)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5659 (0.6608)  time: 0.3448  data: 0.0004  max mem: 28503
Epoch: [36]  [ 600/1251]  eta: 0:03:49  lr: 0.003966  min_lr: 0.003966  loss: 3.3972 (3.7361)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6923 (0.6751)  time: 0.3471  data: 0.0005  max mem: 28503
Epoch: [36]  [ 800/1251]  eta: 0:02:38  lr: 0.003965  min_lr: 0.003965  loss: 3.8711 (3.7349)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5679 (0.6778)  time: 0.3440  data: 0.0005  max mem: 28503
Epoch: [36]  [1000/1251]  eta: 0:01:27  lr: 0.003965  min_lr: 0.003965  loss: 3.6954 (3.7125)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6360 (0.6739)  time: 0.3520  data: 0.0005  max mem: 28503
Epoch: [36]  [1200/1251]  eta: 0:00:17  lr: 0.003964  min_lr: 0.003964  loss: 3.9998 (3.7216)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [36]  [1250/1251]  eta: 0:00:00  lr: 0.003964  min_lr: 0.003964  loss: 3.9217 (3.7225)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6274 (nan)  time: 0.2913  data: 0.0006  max mem: 28503
Epoch: [36] Total time: 0:07:17 (0.3496 s / it)
Averaged stats: lr: 0.003964  min_lr: 0.003964  loss: 3.9217 (3.7203)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6274 (nan)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.9028 (0.9028)  acc1: 83.2000 (83.2000)  acc5: 96.4000 (96.4000)  time: 5.4523  data: 5.2312  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.0603 (1.0895)  acc1: 78.8000 (77.4909)  acc5: 95.6000 (95.0182)  time: 0.6791  data: 0.5054  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.3328 (1.3090)  acc1: 68.8000 (72.4571)  acc5: 91.6000 (92.0762)  time: 0.1852  data: 0.0165  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.4962 (1.3208)  acc1: 68.4000 (72.3040)  acc5: 90.0000 (92.0640)  time: 0.1849  data: 0.0164  max mem: 28503
Test: Total time: 0:00:10 (0.4028 s / it)
* Acc@1 73.172 Acc@5 92.156 loss 1.311
Accuracy of the model on the 50000 test images: 73.2%
Max accuracy: 73.63%
Epoch: [37]  [   0/1251]  eta: 1:09:23  lr: 0.003964  min_lr: 0.003964  loss: 3.7260 (3.7260)  weight_decay: 0.0500 (0.0500)  time: 3.3278  data: 1.7833  max mem: 28503
Epoch: [37]  [ 200/1251]  eta: 0:06:22  lr: 0.003963  min_lr: 0.003963  loss: 3.8725 (3.7495)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5967 (0.6678)  time: 0.3447  data: 0.0005  max mem: 28503
Epoch: [37]  [ 400/1251]  eta: 0:05:02  lr: 0.003962  min_lr: 0.003962  loss: 3.9780 (3.7259)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7021 (0.6797)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [37]  [ 600/1251]  eta: 0:03:50  lr: 0.003962  min_lr: 0.003962  loss: 3.9081 (3.7035)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7894 (0.6814)  time: 0.3448  data: 0.0004  max mem: 28503
Epoch: [37]  [ 800/1251]  eta: 0:02:38  lr: 0.003961  min_lr: 0.003961  loss: 3.8736 (3.7224)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7279 (0.6935)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [37]  [1000/1251]  eta: 0:01:28  lr: 0.003960  min_lr: 0.003960  loss: 3.7709 (3.7169)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6725 (0.6813)  time: 0.3576  data: 0.0004  max mem: 28503
Epoch: [37]  [1200/1251]  eta: 0:00:17  lr: 0.003960  min_lr: 0.003960  loss: 3.8543 (3.7210)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6545 (0.6918)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [37]  [1250/1251]  eta: 0:00:00  lr: 0.003959  min_lr: 0.003959  loss: 3.7826 (3.7236)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5852 (0.6886)  time: 0.2961  data: 0.0007  max mem: 28503
Epoch: [37] Total time: 0:07:18 (0.3501 s / it)
Averaged stats: lr: 0.003959  min_lr: 0.003959  loss: 3.7826 (3.7125)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5852 (0.6886)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.8474 (0.8474)  acc1: 83.2000 (83.2000)  acc5: 97.6000 (97.6000)  time: 5.8071  data: 5.6061  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.0419 (1.0445)  acc1: 79.6000 (78.5818)  acc5: 95.6000 (95.1636)  time: 0.6995  data: 0.5277  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.2826 (1.2517)  acc1: 69.6000 (73.6571)  acc5: 91.6000 (92.3048)  time: 0.1787  data: 0.0100  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.3596 (1.2580)  acc1: 68.4000 (73.4240)  acc5: 90.0000 (92.2720)  time: 0.1784  data: 0.0099  max mem: 28503
Test: Total time: 0:00:10 (0.4095 s / it)
* Acc@1 73.824 Acc@5 92.508 loss 1.250
Accuracy of the model on the 50000 test images: 73.8%
Max accuracy: 73.82%
Epoch: [38]  [   0/1251]  eta: 0:53:48  lr: 0.003959  min_lr: 0.003959  loss: 4.0920 (4.0920)  weight_decay: 0.0500 (0.0500)  time: 2.5810  data: 2.2276  max mem: 28503
Epoch: [38]  [ 200/1251]  eta: 0:06:15  lr: 0.003959  min_lr: 0.003959  loss: 3.8548 (3.7050)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6369 (0.7231)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [38]  [ 400/1251]  eta: 0:05:00  lr: 0.003958  min_lr: 0.003958  loss: 3.7916 (3.7377)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6278 (0.6958)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [38]  [ 600/1251]  eta: 0:03:48  lr: 0.003957  min_lr: 0.003957  loss: 4.0056 (3.7364)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6240 (0.7127)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [38]  [ 800/1251]  eta: 0:02:38  lr: 0.003956  min_lr: 0.003956  loss: 3.9538 (3.7261)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6160 (0.7012)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [38]  [1000/1251]  eta: 0:01:27  lr: 0.003956  min_lr: 0.003956  loss: 3.9484 (3.7303)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6929 (0.6939)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [38]  [1200/1251]  eta: 0:00:17  lr: 0.003955  min_lr: 0.003955  loss: 3.9629 (3.7256)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6634 (0.6926)  time: 0.3463  data: 0.0005  max mem: 28503
Epoch: [38]  [1250/1251]  eta: 0:00:00  lr: 0.003955  min_lr: 0.003955  loss: 3.4896 (3.7202)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6180 (0.6917)  time: 0.2923  data: 0.0006  max mem: 28503
Epoch: [38] Total time: 0:07:17 (0.3495 s / it)
Averaged stats: lr: 0.003955  min_lr: 0.003955  loss: 3.4896 (3.7000)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6180 (0.6917)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.7800 (0.7800)  acc1: 84.4000 (84.4000)  acc5: 97.6000 (97.6000)  time: 5.7527  data: 5.5566  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 0.9822 (0.9864)  acc1: 78.8000 (79.2364)  acc5: 96.4000 (95.4545)  time: 0.7429  data: 0.5708  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.2590 (1.1940)  acc1: 71.6000 (74.4571)  acc5: 92.4000 (92.6667)  time: 0.2052  data: 0.0362  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.3573 (1.2067)  acc1: 70.8000 (73.9840)  acc5: 90.4000 (92.5120)  time: 0.2045  data: 0.0361  max mem: 28503
Test: Total time: 0:00:10 (0.4251 s / it)
* Acc@1 73.872 Acc@5 92.512 loss 1.203
Accuracy of the model on the 50000 test images: 73.9%
Max accuracy: 73.87%
Epoch: [39]  [   0/1251]  eta: 0:57:27  lr: 0.003955  min_lr: 0.003955  loss: 4.0799 (4.0799)  weight_decay: 0.0500 (0.0500)  time: 2.7562  data: 2.3921  max mem: 28503
Epoch: [39]  [ 200/1251]  eta: 0:06:18  lr: 0.003954  min_lr: 0.003954  loss: 3.6420 (3.6524)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5555 (0.6674)  time: 0.3469  data: 0.0004  max mem: 28503
Epoch: [39]  [ 400/1251]  eta: 0:05:01  lr: 0.003953  min_lr: 0.003953  loss: 3.7047 (3.6995)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7217 (0.7162)  time: 0.3577  data: 0.0004  max mem: 28503
Epoch: [39]  [ 600/1251]  eta: 0:03:48  lr: 0.003952  min_lr: 0.003952  loss: 3.8532 (3.7081)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7049 (0.7114)  time: 0.3473  data: 0.0005  max mem: 28503
Epoch: [39]  [ 800/1251]  eta: 0:02:38  lr: 0.003952  min_lr: 0.003952  loss: 3.5472 (3.7134)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5285 (0.7127)  time: 0.3470  data: 0.0004  max mem: 28503
Epoch: [39]  [1000/1251]  eta: 0:01:27  lr: 0.003951  min_lr: 0.003951  loss: 3.8287 (3.6976)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6499 (0.7167)  time: 0.3562  data: 0.0003  max mem: 28503
Epoch: [39]  [1200/1251]  eta: 0:00:17  lr: 0.003950  min_lr: 0.003950  loss: 3.8680 (3.6948)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6908 (0.7137)  time: 0.3446  data: 0.0004  max mem: 28503
Epoch: [39]  [1250/1251]  eta: 0:00:00  lr: 0.003950  min_lr: 0.003950  loss: 3.8861 (3.6983)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6576 (0.7092)  time: 0.2975  data: 0.0006  max mem: 28503
Epoch: [39] Total time: 0:07:17 (0.3495 s / it)
Averaged stats: lr: 0.003950  min_lr: 0.003950  loss: 3.8861 (3.6815)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6576 (0.7092)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.8176 (0.8176)  acc1: 85.2000 (85.2000)  acc5: 97.2000 (97.2000)  time: 5.8030  data: 5.5969  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.0015 (1.0493)  acc1: 78.8000 (78.0000)  acc5: 95.6000 (95.3091)  time: 0.7165  data: 0.5442  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.2531 (1.2588)  acc1: 72.4000 (73.6952)  acc5: 92.0000 (92.5333)  time: 0.1884  data: 0.0195  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.3946 (1.2687)  acc1: 70.0000 (73.3760)  acc5: 90.8000 (92.3680)  time: 0.1881  data: 0.0195  max mem: 28503
Test: Total time: 0:00:10 (0.4128 s / it)
* Acc@1 73.876 Acc@5 92.516 loss 1.254
Accuracy of the model on the 50000 test images: 73.9%
Max accuracy: 73.88%
Epoch: [40]  [   0/1251]  eta: 1:06:32  lr: 0.003950  min_lr: 0.003950  loss: 3.4375 (3.4375)  weight_decay: 0.0500 (0.0500)  time: 3.1911  data: 2.8326  max mem: 28503
Epoch: [40]  [ 200/1251]  eta: 0:06:18  lr: 0.003949  min_lr: 0.003949  loss: 3.5702 (3.6674)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6034 (0.6947)  time: 0.3447  data: 0.0004  max mem: 28503
Epoch: [40]  [ 400/1251]  eta: 0:05:02  lr: 0.003948  min_lr: 0.003948  loss: 3.1447 (3.6514)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5518 (0.6692)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [40]  [ 600/1251]  eta: 0:03:49  lr: 0.003947  min_lr: 0.003947  loss: 3.7550 (3.6940)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6748 (0.6954)  time: 0.3474  data: 0.0004  max mem: 28503
Epoch: [40]  [ 800/1251]  eta: 0:02:38  lr: 0.003947  min_lr: 0.003947  loss: 4.0721 (3.6954)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5774 (0.7010)  time: 0.3523  data: 0.0004  max mem: 28503
Epoch: [40]  [1000/1251]  eta: 0:01:27  lr: 0.003946  min_lr: 0.003946  loss: 3.8128 (3.7024)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7840 (0.7153)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [40]  [1200/1251]  eta: 0:00:17  lr: 0.003945  min_lr: 0.003945  loss: 3.9674 (3.7034)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6665 (0.7146)  time: 0.3462  data: 0.0004  max mem: 28503
Epoch: [40]  [1250/1251]  eta: 0:00:00  lr: 0.003945  min_lr: 0.003945  loss: 3.9076 (3.7007)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5534 (0.7104)  time: 0.2915  data: 0.0006  max mem: 28503
Epoch: [40] Total time: 0:07:17 (0.3493 s / it)
Averaged stats: lr: 0.003945  min_lr: 0.003945  loss: 3.9076 (3.6716)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5534 (0.7104)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.8592 (0.8592)  acc1: 83.2000 (83.2000)  acc5: 98.0000 (98.0000)  time: 5.6074  data: 5.4020  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 1.0428 (1.0224)  acc1: 78.8000 (79.3091)  acc5: 96.0000 (95.4546)  time: 0.6635  data: 0.4914  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.2001 (1.2255)  acc1: 70.8000 (74.3810)  acc5: 92.4000 (92.8952)  time: 0.1688  data: 0.0003  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.4010 (1.2375)  acc1: 70.8000 (73.8080)  acc5: 90.4000 (92.7680)  time: 0.1687  data: 0.0002  max mem: 28503
Test: Total time: 0:00:09 (0.3893 s / it)
* Acc@1 74.320 Acc@5 92.564 loss 1.230
Accuracy of the model on the 50000 test images: 74.3%
Max accuracy: 74.32%
Epoch: [41]  [   0/1251]  eta: 1:02:54  lr: 0.003945  min_lr: 0.003945  loss: 3.7795 (3.7795)  weight_decay: 0.0500 (0.0500)  time: 3.0173  data: 2.6427  max mem: 28503
Epoch: [41]  [ 200/1251]  eta: 0:06:17  lr: 0.003944  min_lr: 0.003944  loss: 3.7746 (3.5862)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6451 (0.8022)  time: 0.3466  data: 0.0004  max mem: 28503
Epoch: [41]  [ 400/1251]  eta: 0:05:00  lr: 0.003943  min_lr: 0.003943  loss: 3.6109 (3.6300)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7280 (0.7411)  time: 0.3531  data: 0.0004  max mem: 28503
Epoch: [41]  [ 600/1251]  eta: 0:03:48  lr: 0.003942  min_lr: 0.003942  loss: 3.5671 (3.6381)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9094 (0.7700)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [41]  [ 800/1251]  eta: 0:02:38  lr: 0.003941  min_lr: 0.003941  loss: 3.7254 (3.6495)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5808 (0.7500)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [41]  [1000/1251]  eta: 0:01:27  lr: 0.003940  min_lr: 0.003940  loss: 3.7715 (3.6629)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5528 (0.7500)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [41]  [1200/1251]  eta: 0:00:17  lr: 0.003940  min_lr: 0.003940  loss: 3.8092 (3.6655)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5510 (0.7327)  time: 0.3446  data: 0.0004  max mem: 28503
Epoch: [41]  [1250/1251]  eta: 0:00:00  lr: 0.003939  min_lr: 0.003939  loss: 3.8963 (3.6715)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5813 (0.7308)  time: 0.2916  data: 0.0007  max mem: 28503
Epoch: [41] Total time: 0:07:16 (0.3493 s / it)
Averaged stats: lr: 0.003939  min_lr: 0.003939  loss: 3.8963 (3.6739)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5813 (0.7308)
Test:  [ 0/25]  eta: 0:01:51  loss: 0.8771 (0.8771)  acc1: 84.0000 (84.0000)  acc5: 98.0000 (98.0000)  time: 4.4459  data: 4.2025  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 1.0896 (1.1088)  acc1: 80.4000 (79.0545)  acc5: 95.2000 (95.3818)  time: 0.6445  data: 0.4650  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.3243 (1.3145)  acc1: 72.0000 (74.5143)  acc5: 92.4000 (92.4952)  time: 0.2407  data: 0.0697  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.4642 (1.3228)  acc1: 72.4000 (74.4320)  acc5: 90.4000 (92.5600)  time: 0.2095  data: 0.0408  max mem: 28503
Test: Total time: 0:00:10 (0.4004 s / it)
* Acc@1 74.492 Acc@5 92.716 loss 1.319
Accuracy of the model on the 50000 test images: 74.5%
Max accuracy: 74.49%
Epoch: [42]  [   0/1251]  eta: 1:03:54  lr: 0.003939  min_lr: 0.003939  loss: 3.7891 (3.7891)  weight_decay: 0.0500 (0.0500)  time: 3.0651  data: 2.6976  max mem: 28503
Epoch: [42]  [ 200/1251]  eta: 0:06:18  lr: 0.003939  min_lr: 0.003939  loss: 3.9111 (3.6982)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6363 (0.7265)  time: 0.3542  data: 0.0004  max mem: 28503
Epoch: [42]  [ 400/1251]  eta: 0:05:02  lr: 0.003938  min_lr: 0.003938  loss: 3.6057 (3.6530)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6985 (0.7512)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [42]  [ 600/1251]  eta: 0:03:49  lr: 0.003937  min_lr: 0.003937  loss: 3.6104 (3.6442)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7037 (0.7589)  time: 0.3487  data: 0.0004  max mem: 28503
Epoch: [42]  [ 800/1251]  eta: 0:02:38  lr: 0.003936  min_lr: 0.003936  loss: 3.3898 (3.6434)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5630 (0.7480)  time: 0.3557  data: 0.0004  max mem: 28503
Epoch: [42]  [1000/1251]  eta: 0:01:28  lr: 0.003935  min_lr: 0.003935  loss: 3.8575 (3.6389)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6384 (0.7517)  time: 0.3533  data: 0.0004  max mem: 28503
Epoch: [42]  [1200/1251]  eta: 0:00:17  lr: 0.003934  min_lr: 0.003934  loss: 3.8102 (3.6460)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8977 (0.7535)  time: 0.3471  data: 0.0004  max mem: 28503
Epoch: [42]  [1250/1251]  eta: 0:00:00  lr: 0.003934  min_lr: 0.003934  loss: 3.6107 (3.6405)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6902 (0.7478)  time: 0.2918  data: 0.0007  max mem: 28503
Epoch: [42] Total time: 0:07:18 (0.3502 s / it)
Averaged stats: lr: 0.003934  min_lr: 0.003934  loss: 3.6107 (3.6487)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6902 (0.7478)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.7993 (0.7993)  acc1: 82.8000 (82.8000)  acc5: 96.8000 (96.8000)  time: 5.2983  data: 5.0868  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9624 (0.9618)  acc1: 78.8000 (78.8727)  acc5: 96.4000 (95.4909)  time: 0.6891  data: 0.5158  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.2061 (1.1663)  acc1: 72.0000 (74.2286)  acc5: 92.0000 (92.7238)  time: 0.2101  data: 0.0412  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.3065 (1.1734)  acc1: 70.8000 (73.7600)  acc5: 91.6000 (92.8160)  time: 0.2101  data: 0.0416  max mem: 28503
Test: Total time: 0:00:10 (0.4111 s / it)
* Acc@1 74.528 Acc@5 92.956 loss 1.165
Accuracy of the model on the 50000 test images: 74.5%
Max accuracy: 74.53%
Epoch: [43]  [   0/1251]  eta: 1:06:49  lr: 0.003934  min_lr: 0.003934  loss: 3.8030 (3.8030)  weight_decay: 0.0500 (0.0500)  time: 3.2053  data: 2.8146  max mem: 28503
Epoch: [43]  [ 200/1251]  eta: 0:06:19  lr: 0.003933  min_lr: 0.003933  loss: 3.7922 (3.6564)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7087 (0.7854)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [43]  [ 400/1251]  eta: 0:05:02  lr: 0.003932  min_lr: 0.003932  loss: 3.9096 (3.6625)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7417 (0.7389)  time: 0.3535  data: 0.0004  max mem: 28503
Epoch: [43]  [ 600/1251]  eta: 0:03:49  lr: 0.003931  min_lr: 0.003931  loss: 3.8537 (3.6605)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6341 (0.7565)  time: 0.3465  data: 0.0003  max mem: 28503
Epoch: [43]  [ 800/1251]  eta: 0:02:38  lr: 0.003930  min_lr: 0.003930  loss: 3.4172 (3.6428)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7669 (0.7664)  time: 0.3467  data: 0.0004  max mem: 28503
Epoch: [43]  [1000/1251]  eta: 0:01:28  lr: 0.003929  min_lr: 0.003929  loss: 3.5180 (3.6458)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6549 (0.7594)  time: 0.3469  data: 0.0005  max mem: 28503
Epoch: [43]  [1200/1251]  eta: 0:00:17  lr: 0.003928  min_lr: 0.003928  loss: 3.8987 (3.6330)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6104 (0.7589)  time: 0.3536  data: 0.0005  max mem: 28503
Epoch: [43]  [1250/1251]  eta: 0:00:00  lr: 0.003928  min_lr: 0.003928  loss: 3.8727 (3.6392)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6218 (0.7582)  time: 0.2921  data: 0.0006  max mem: 28503
Epoch: [43] Total time: 0:07:18 (0.3502 s / it)
Averaged stats: lr: 0.003928  min_lr: 0.003928  loss: 3.8727 (3.6416)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6218 (0.7582)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.8674 (0.8674)  acc1: 84.4000 (84.4000)  acc5: 96.8000 (96.8000)  time: 5.8317  data: 5.6145  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.0344 (1.0340)  acc1: 80.0000 (78.6182)  acc5: 96.4000 (95.8909)  time: 0.7122  data: 0.5385  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.2913 (1.2485)  acc1: 70.8000 (73.9238)  acc5: 92.8000 (92.8571)  time: 0.1844  data: 0.0155  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.3618 (1.2539)  acc1: 70.8000 (73.7600)  acc5: 90.8000 (92.8160)  time: 0.1839  data: 0.0155  max mem: 28503
Test: Total time: 0:00:10 (0.4151 s / it)
* Acc@1 74.528 Acc@5 92.906 loss 1.245
Accuracy of the model on the 50000 test images: 74.5%
Max accuracy: 74.53%
Epoch: [44]  [   0/1251]  eta: 1:10:47  lr: 0.003928  min_lr: 0.003928  loss: 4.3474 (4.3474)  weight_decay: 0.0500 (0.0500)  time: 3.3954  data: 2.5822  max mem: 28503
Epoch: [44]  [ 200/1251]  eta: 0:06:22  lr: 0.003927  min_lr: 0.003927  loss: 3.7035 (3.5867)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7020 (0.7451)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [44]  [ 400/1251]  eta: 0:05:02  lr: 0.003926  min_lr: 0.003926  loss: 3.7208 (3.5974)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8275 (0.7778)  time: 0.3466  data: 0.0004  max mem: 28503
Epoch: [44]  [ 600/1251]  eta: 0:03:49  lr: 0.003925  min_lr: 0.003925  loss: 3.6636 (3.6238)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6053 (0.7574)  time: 0.3537  data: 0.0004  max mem: 28503
Epoch: [44]  [ 800/1251]  eta: 0:02:38  lr: 0.003924  min_lr: 0.003924  loss: 3.7783 (3.6286)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7650 (0.7726)  time: 0.3448  data: 0.0004  max mem: 28503
Epoch: [44]  [1000/1251]  eta: 0:01:28  lr: 0.003923  min_lr: 0.003923  loss: 3.6931 (3.6192)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6001 (nan)  time: 0.3529  data: 0.0004  max mem: 28503
Epoch: [44]  [1200/1251]  eta: 0:00:17  lr: 0.003922  min_lr: 0.003922  loss: 3.6570 (3.6285)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7085 (nan)  time: 0.3447  data: 0.0004  max mem: 28503
Epoch: [44]  [1250/1251]  eta: 0:00:00  lr: 0.003922  min_lr: 0.003922  loss: 3.3361 (3.6236)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8416 (nan)  time: 0.2918  data: 0.0005  max mem: 28503
Epoch: [44] Total time: 0:07:17 (0.3499 s / it)
Averaged stats: lr: 0.003922  min_lr: 0.003922  loss: 3.3361 (3.6282)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8416 (nan)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.7675 (0.7675)  acc1: 83.6000 (83.6000)  acc5: 97.2000 (97.2000)  time: 5.5708  data: 5.3778  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.0020 (0.9625)  acc1: 78.4000 (79.0909)  acc5: 96.4000 (95.8182)  time: 0.7223  data: 0.5512  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1857 (1.1658)  acc1: 72.0000 (74.6286)  acc5: 92.8000 (92.8571)  time: 0.2030  data: 0.0343  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.3114 (1.1823)  acc1: 70.8000 (74.0000)  acc5: 91.6000 (92.7840)  time: 0.2027  data: 0.0342  max mem: 28503
Test: Total time: 0:00:10 (0.4163 s / it)
* Acc@1 74.736 Acc@5 93.008 loss 1.165
Accuracy of the model on the 50000 test images: 74.7%
Max accuracy: 74.74%
Epoch: [45]  [   0/1251]  eta: 1:01:46  lr: 0.003922  min_lr: 0.003922  loss: 4.0595 (4.0595)  weight_decay: 0.0500 (0.0500)  time: 2.9629  data: 2.5657  max mem: 28503
Epoch: [45]  [ 200/1251]  eta: 0:06:19  lr: 0.003921  min_lr: 0.003921  loss: 3.6371 (3.6722)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6964 (0.6774)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [45]  [ 400/1251]  eta: 0:05:01  lr: 0.003920  min_lr: 0.003920  loss: 3.7271 (3.6595)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8723 (0.7735)  time: 0.3447  data: 0.0004  max mem: 28503
Epoch: [45]  [ 600/1251]  eta: 0:03:49  lr: 0.003919  min_lr: 0.003919  loss: 3.7661 (3.6432)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6346 (0.7728)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [45]  [ 800/1251]  eta: 0:02:38  lr: 0.003918  min_lr: 0.003918  loss: 3.8622 (3.6385)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.3447  data: 0.0004  max mem: 28503
Epoch: [45]  [1000/1251]  eta: 0:01:27  lr: 0.003917  min_lr: 0.003917  loss: 3.7123 (3.6367)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5289 (nan)  time: 0.3526  data: 0.0004  max mem: 28503
Epoch: [45]  [1200/1251]  eta: 0:00:17  lr: 0.003916  min_lr: 0.003916  loss: 3.5834 (3.6271)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6984 (nan)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [45]  [1250/1251]  eta: 0:00:00  lr: 0.003916  min_lr: 0.003916  loss: 3.4356 (3.6268)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6680 (nan)  time: 0.2916  data: 0.0006  max mem: 28503
Epoch: [45] Total time: 0:07:16 (0.3488 s / it)
Averaged stats: lr: 0.003916  min_lr: 0.003916  loss: 3.4356 (3.6220)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6680 (nan)
Test:  [ 0/25]  eta: 0:02:04  loss: 0.8150 (0.8150)  acc1: 84.4000 (84.4000)  acc5: 97.6000 (97.6000)  time: 4.9601  data: 4.7482  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.0355 (1.0159)  acc1: 81.2000 (78.5818)  acc5: 96.0000 (95.7455)  time: 0.7136  data: 0.5396  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1798 (1.2036)  acc1: 72.4000 (75.0286)  acc5: 92.4000 (92.9143)  time: 0.2286  data: 0.0594  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.3401 (1.2107)  acc1: 72.4000 (74.5120)  acc5: 91.6000 (92.8800)  time: 0.2107  data: 0.0425  max mem: 28503
Test: Total time: 0:00:10 (0.4116 s / it)
* Acc@1 74.888 Acc@5 93.042 loss 1.203
Accuracy of the model on the 50000 test images: 74.9%
Max accuracy: 74.89%
Epoch: [46]  [   0/1251]  eta: 1:05:38  lr: 0.003916  min_lr: 0.003916  loss: 4.2687 (4.2687)  weight_decay: 0.0500 (0.0500)  time: 3.1485  data: 2.7821  max mem: 28503
Epoch: [46]  [ 200/1251]  eta: 0:06:20  lr: 0.003914  min_lr: 0.003914  loss: 3.8479 (3.5953)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.4801 (0.6883)  time: 0.3460  data: 0.0004  max mem: 28503
Epoch: [46]  [ 400/1251]  eta: 0:05:02  lr: 0.003913  min_lr: 0.003913  loss: 3.9202 (3.6186)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5629 (0.7136)  time: 0.3560  data: 0.0005  max mem: 28503
Epoch: [46]  [ 600/1251]  eta: 0:03:49  lr: 0.003912  min_lr: 0.003912  loss: 3.6469 (3.5970)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7297 (0.7400)  time: 0.3456  data: 0.0005  max mem: 28503
Epoch: [46]  [ 800/1251]  eta: 0:02:38  lr: 0.003911  min_lr: 0.003911  loss: 3.7973 (3.6043)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5963 (0.7230)  time: 0.3468  data: 0.0005  max mem: 28503
Epoch: [46]  [1000/1251]  eta: 0:01:28  lr: 0.003910  min_lr: 0.003910  loss: 3.2947 (3.5982)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6277 (0.7454)  time: 0.3550  data: 0.0005  max mem: 28503
Epoch: [46]  [1200/1251]  eta: 0:00:17  lr: 0.003909  min_lr: 0.003909  loss: 3.5652 (3.6002)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5980 (0.7479)  time: 0.3463  data: 0.0004  max mem: 28503
Epoch: [46]  [1250/1251]  eta: 0:00:00  lr: 0.003909  min_lr: 0.003909  loss: 3.4857 (3.5950)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6046 (0.7451)  time: 0.2918  data: 0.0006  max mem: 28503
Epoch: [46] Total time: 0:07:19 (0.3511 s / it)
Averaged stats: lr: 0.003909  min_lr: 0.003909  loss: 3.4857 (3.6104)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6046 (0.7451)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.8130 (0.8130)  acc1: 84.0000 (84.0000)  acc5: 98.0000 (98.0000)  time: 5.3543  data: 5.1555  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.0248 (1.0161)  acc1: 79.6000 (79.1636)  acc5: 95.6000 (95.7091)  time: 0.6969  data: 0.5245  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.2618 (1.2279)  acc1: 72.4000 (74.2857)  acc5: 93.2000 (93.1238)  time: 0.1999  data: 0.0308  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.3827 (1.2402)  acc1: 71.2000 (74.0000)  acc5: 91.2000 (93.0240)  time: 0.1992  data: 0.0307  max mem: 28503
Test: Total time: 0:00:10 (0.4040 s / it)
* Acc@1 74.996 Acc@5 93.050 loss 1.228
Accuracy of the model on the 50000 test images: 75.0%
Max accuracy: 75.00%
Epoch: [47]  [   0/1251]  eta: 0:59:02  lr: 0.003909  min_lr: 0.003909  loss: 3.6182 (3.6182)  weight_decay: 0.0500 (0.0500)  time: 2.8320  data: 2.3894  max mem: 28503
Epoch: [47]  [ 200/1251]  eta: 0:06:20  lr: 0.003908  min_lr: 0.003908  loss: 3.5308 (3.6104)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6906 (0.7846)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [47]  [ 400/1251]  eta: 0:05:01  lr: 0.003907  min_lr: 0.003907  loss: 3.6790 (3.6074)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6555 (0.8170)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [47]  [ 600/1251]  eta: 0:03:49  lr: 0.003906  min_lr: 0.003906  loss: 3.7049 (3.6153)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7109 (0.8356)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [47]  [ 800/1251]  eta: 0:02:38  lr: 0.003905  min_lr: 0.003905  loss: 3.5724 (3.6240)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7445 (0.8061)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [47]  [1000/1251]  eta: 0:01:28  lr: 0.003904  min_lr: 0.003904  loss: 3.3524 (3.6230)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7495 (0.8140)  time: 0.3513  data: 0.0004  max mem: 28503
Epoch: [47]  [1200/1251]  eta: 0:00:17  lr: 0.003902  min_lr: 0.003902  loss: 3.4690 (3.6098)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6652 (0.8010)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [47]  [1250/1251]  eta: 0:00:00  lr: 0.003902  min_lr: 0.003902  loss: 3.7025 (3.6112)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7983 (0.8028)  time: 0.2925  data: 0.0007  max mem: 28503
Epoch: [47] Total time: 0:07:18 (0.3503 s / it)
Averaged stats: lr: 0.003902  min_lr: 0.003902  loss: 3.7025 (3.6107)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7983 (0.8028)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.8346 (0.8346)  acc1: 86.0000 (86.0000)  acc5: 98.0000 (98.0000)  time: 5.5175  data: 5.3196  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.0538 (1.0519)  acc1: 78.8000 (79.3091)  acc5: 96.0000 (95.5636)  time: 0.7309  data: 0.5589  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.3137 (1.2519)  acc1: 72.0000 (75.1619)  acc5: 92.4000 (92.8952)  time: 0.2103  data: 0.0415  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.4365 (1.2595)  acc1: 71.6000 (74.6400)  acc5: 91.2000 (92.7520)  time: 0.2099  data: 0.0414  max mem: 28503
Test: Total time: 0:00:10 (0.4190 s / it)
* Acc@1 75.002 Acc@5 92.990 loss 1.242
Accuracy of the model on the 50000 test images: 75.0%
Max accuracy: 75.00%
Epoch: [48]  [   0/1251]  eta: 1:04:19  lr: 0.003902  min_lr: 0.003902  loss: 2.5499 (2.5499)  weight_decay: 0.0500 (0.0500)  time: 3.0850  data: 2.6867  max mem: 28503
Epoch: [48]  [ 200/1251]  eta: 0:06:20  lr: 0.003901  min_lr: 0.003901  loss: 3.7522 (3.6092)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7295 (0.8296)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [48]  [ 400/1251]  eta: 0:05:02  lr: 0.003900  min_lr: 0.003900  loss: 3.8011 (3.6049)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5281 (0.7682)  time: 0.3454  data: 0.0005  max mem: 28503
Epoch: [48]  [ 600/1251]  eta: 0:03:49  lr: 0.003899  min_lr: 0.003899  loss: 3.4462 (3.5918)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7516 (0.7952)  time: 0.3454  data: 0.0005  max mem: 28503
Epoch: [48]  [ 800/1251]  eta: 0:02:38  lr: 0.003898  min_lr: 0.003898  loss: 3.7848 (3.6098)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8579 (0.8128)  time: 0.3460  data: 0.0004  max mem: 28503
Epoch: [48]  [1000/1251]  eta: 0:01:28  lr: 0.003897  min_lr: 0.003897  loss: 3.6976 (3.6136)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5217 (0.7827)  time: 0.3446  data: 0.0004  max mem: 28503
Epoch: [48]  [1200/1251]  eta: 0:00:17  lr: 0.003895  min_lr: 0.003895  loss: 3.3266 (3.6098)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7145 (0.7764)  time: 0.3460  data: 0.0005  max mem: 28503
Epoch: [48]  [1250/1251]  eta: 0:00:00  lr: 0.003895  min_lr: 0.003895  loss: 3.9053 (3.6120)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8960 (0.7864)  time: 0.2918  data: 0.0006  max mem: 28503
Epoch: [48] Total time: 0:07:18 (0.3506 s / it)
Averaged stats: lr: 0.003895  min_lr: 0.003895  loss: 3.9053 (3.6026)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8960 (0.7864)
Test:  [ 0/25]  eta: 0:02:11  loss: 0.7333 (0.7333)  acc1: 84.4000 (84.4000)  acc5: 98.0000 (98.0000)  time: 5.2638  data: 5.0360  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9439 (0.9907)  acc1: 81.6000 (79.3455)  acc5: 96.0000 (95.7818)  time: 0.6774  data: 0.5035  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.2393 (1.1776)  acc1: 72.4000 (75.1238)  acc5: 92.8000 (93.0667)  time: 0.1937  data: 0.0252  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.3456 (1.1923)  acc1: 71.6000 (74.6560)  acc5: 91.2000 (92.9120)  time: 0.1935  data: 0.0251  max mem: 28503
Test: Total time: 0:00:09 (0.3961 s / it)
* Acc@1 75.250 Acc@5 93.126 loss 1.176
Accuracy of the model on the 50000 test images: 75.3%
Max accuracy: 75.25%
Epoch: [49]  [   0/1251]  eta: 1:00:40  lr: 0.003895  min_lr: 0.003895  loss: 2.6548 (2.6548)  weight_decay: 0.0500 (0.0500)  time: 2.9103  data: 2.5336  max mem: 28503
Epoch: [49]  [ 200/1251]  eta: 0:06:19  lr: 0.003894  min_lr: 0.003894  loss: 3.6936 (3.5680)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7471 (0.8650)  time: 0.3461  data: 0.0003  max mem: 28503
Epoch: [49]  [ 400/1251]  eta: 0:05:01  lr: 0.003893  min_lr: 0.003893  loss: 3.5231 (3.5467)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6284 (0.8444)  time: 0.3466  data: 0.0004  max mem: 28503
Epoch: [49]  [ 600/1251]  eta: 0:03:49  lr: 0.003892  min_lr: 0.003892  loss: 3.8302 (3.5710)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6776 (0.8115)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [49]  [ 800/1251]  eta: 0:02:38  lr: 0.003890  min_lr: 0.003890  loss: 3.8315 (3.5793)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6855 (0.8029)  time: 0.3487  data: 0.0004  max mem: 28503
Epoch: [49]  [1000/1251]  eta: 0:01:27  lr: 0.003889  min_lr: 0.003889  loss: 3.6033 (3.5758)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6774 (0.7928)  time: 0.3529  data: 0.0004  max mem: 28503
Epoch: [49]  [1200/1251]  eta: 0:00:17  lr: 0.003888  min_lr: 0.003888  loss: 3.5979 (3.5722)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7828 (0.7924)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [49]  [1250/1251]  eta: 0:00:00  lr: 0.003888  min_lr: 0.003888  loss: 3.6519 (3.5740)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7828 (0.7889)  time: 0.2913  data: 0.0005  max mem: 28503
Epoch: [49] Total time: 0:07:17 (0.3493 s / it)
Averaged stats: lr: 0.003888  min_lr: 0.003888  loss: 3.6519 (3.5900)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7828 (0.7889)
Test:  [ 0/25]  eta: 0:01:36  loss: 0.7944 (0.7944)  acc1: 83.6000 (83.6000)  acc5: 96.8000 (96.8000)  time: 3.8506  data: 3.6463  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 1.0322 (1.0149)  acc1: 77.6000 (78.9091)  acc5: 96.0000 (95.3818)  time: 0.6031  data: 0.4273  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.2458 (1.2054)  acc1: 72.0000 (75.0476)  acc5: 92.8000 (92.8571)  time: 0.2500  data: 0.0765  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.3130 (1.2167)  acc1: 72.0000 (74.8480)  acc5: 92.0000 (92.8800)  time: 0.2239  data: 0.0512  max mem: 28503
Test: Total time: 0:00:09 (0.3863 s / it)
* Acc@1 75.150 Acc@5 93.250 loss 1.207
Accuracy of the model on the 50000 test images: 75.2%
Max accuracy: 75.25%
Epoch: [50]  [   0/1251]  eta: 1:10:36  lr: 0.003888  min_lr: 0.003888  loss: 4.2069 (4.2069)  weight_decay: 0.0500 (0.0500)  time: 3.3862  data: 2.3359  max mem: 28503
Epoch: [50]  [ 200/1251]  eta: 0:06:25  lr: 0.003887  min_lr: 0.003887  loss: 3.4673 (3.5280)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8773 (0.8377)  time: 0.3460  data: 0.0004  max mem: 28503
Epoch: [50]  [ 400/1251]  eta: 0:05:03  lr: 0.003885  min_lr: 0.003885  loss: 3.7328 (3.5727)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [50]  [ 600/1251]  eta: 0:03:50  lr: 0.003884  min_lr: 0.003884  loss: 3.6105 (3.5925)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6350 (nan)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [50]  [ 800/1251]  eta: 0:02:39  lr: 0.003883  min_lr: 0.003883  loss: 3.6811 (3.5912)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7503 (nan)  time: 0.3474  data: 0.0004  max mem: 28503
Epoch: [50]  [1000/1251]  eta: 0:01:28  lr: 0.003882  min_lr: 0.003882  loss: 3.8248 (3.5996)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6788 (nan)  time: 0.3474  data: 0.0004  max mem: 28503
Epoch: [50]  [1200/1251]  eta: 0:00:17  lr: 0.003881  min_lr: 0.003881  loss: 3.6030 (3.5990)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8219 (nan)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [50]  [1250/1251]  eta: 0:00:00  lr: 0.003880  min_lr: 0.003880  loss: 3.8997 (3.5972)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6177 (nan)  time: 0.2924  data: 0.0005  max mem: 28503
Epoch: [50] Total time: 0:07:18 (0.3506 s / it)
Averaged stats: lr: 0.003880  min_lr: 0.003880  loss: 3.8997 (3.5853)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6177 (nan)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.7189 (0.7189)  acc1: 84.4000 (84.4000)  acc5: 98.0000 (98.0000)  time: 5.7780  data: 5.5339  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 0.9573 (0.9518)  acc1: 79.6000 (78.4000)  acc5: 96.4000 (96.1818)  time: 0.7354  data: 0.5599  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1511 (1.1435)  acc1: 71.2000 (74.5714)  acc5: 92.0000 (93.3524)  time: 0.1998  data: 0.0313  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.2978 (1.1514)  acc1: 72.8000 (74.4960)  acc5: 91.2000 (93.3280)  time: 0.1996  data: 0.0312  max mem: 28503
Test: Total time: 0:00:10 (0.4211 s / it)
* Acc@1 75.400 Acc@5 93.396 loss 1.138
Accuracy of the model on the 50000 test images: 75.4%
Max accuracy: 75.40%
Epoch: [51]  [   0/1251]  eta: 1:08:14  lr: 0.003880  min_lr: 0.003880  loss: 3.1380 (3.1380)  weight_decay: 0.0500 (0.0500)  time: 3.2731  data: 2.9119  max mem: 28503
Epoch: [51]  [ 200/1251]  eta: 0:06:23  lr: 0.003879  min_lr: 0.003879  loss: 3.4661 (3.6234)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6394 (0.7163)  time: 0.3564  data: 0.0005  max mem: 28503
Epoch: [51]  [ 400/1251]  eta: 0:05:03  lr: 0.003878  min_lr: 0.003878  loss: 3.6323 (3.6140)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5952 (0.7386)  time: 0.3461  data: 0.0005  max mem: 28503
Epoch: [51]  [ 600/1251]  eta: 0:03:50  lr: 0.003877  min_lr: 0.003877  loss: 3.6437 (3.5968)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9174 (0.7490)  time: 0.3460  data: 0.0005  max mem: 28503
Epoch: [51]  [ 800/1251]  eta: 0:02:38  lr: 0.003875  min_lr: 0.003875  loss: 3.5765 (3.5859)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9367 (0.7663)  time: 0.3470  data: 0.0004  max mem: 28503
Epoch: [51]  [1000/1251]  eta: 0:01:28  lr: 0.003874  min_lr: 0.003874  loss: 3.3597 (3.5852)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7803 (0.7932)  time: 0.3559  data: 0.0004  max mem: 28503
Epoch: [51]  [1200/1251]  eta: 0:00:17  lr: 0.003873  min_lr: 0.003873  loss: 3.5852 (3.5710)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6246 (0.7927)  time: 0.3587  data: 0.0005  max mem: 28503
Epoch: [51]  [1250/1251]  eta: 0:00:00  lr: 0.003873  min_lr: 0.003873  loss: 3.8665 (3.5763)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8250 (0.7987)  time: 0.2919  data: 0.0006  max mem: 28503
Epoch: [51] Total time: 0:07:19 (0.3511 s / it)
Averaged stats: lr: 0.003873  min_lr: 0.003873  loss: 3.8665 (3.5743)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8250 (0.7987)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.8549 (0.8549)  acc1: 86.0000 (86.0000)  acc5: 97.6000 (97.6000)  time: 5.7338  data: 5.5199  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.0646 (1.0372)  acc1: 79.6000 (79.5273)  acc5: 96.0000 (95.4909)  time: 0.7283  data: 0.5555  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.2626 (1.2376)  acc1: 72.8000 (75.3905)  acc5: 93.2000 (92.9714)  time: 0.1981  data: 0.0296  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.4020 (1.2491)  acc1: 70.8000 (75.0080)  acc5: 90.8000 (92.8160)  time: 0.1980  data: 0.0295  max mem: 28503
Test: Total time: 0:00:10 (0.4181 s / it)
* Acc@1 75.440 Acc@5 93.258 loss 1.238
Accuracy of the model on the 50000 test images: 75.4%
Max accuracy: 75.44%
Epoch: [52]  [   0/1251]  eta: 1:05:12  lr: 0.003873  min_lr: 0.003873  loss: 3.4740 (3.4740)  weight_decay: 0.0500 (0.0500)  time: 3.1273  data: 2.7706  max mem: 28503
Epoch: [52]  [ 200/1251]  eta: 0:06:22  lr: 0.003871  min_lr: 0.003871  loss: 3.7359 (3.5648)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7686 (0.7501)  time: 0.3555  data: 0.0004  max mem: 28503
Epoch: [52]  [ 400/1251]  eta: 0:05:02  lr: 0.003870  min_lr: 0.003870  loss: 3.7446 (3.6042)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7325 (0.8072)  time: 0.3464  data: 0.0004  max mem: 28503
Epoch: [52]  [ 600/1251]  eta: 0:03:50  lr: 0.003869  min_lr: 0.003869  loss: 3.7687 (3.6085)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7072 (0.8155)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [52]  [ 800/1251]  eta: 0:02:38  lr: 0.003867  min_lr: 0.003867  loss: 3.5746 (3.6003)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6736 (0.8105)  time: 0.3469  data: 0.0005  max mem: 28503
Epoch: [52]  [1000/1251]  eta: 0:01:28  lr: 0.003866  min_lr: 0.003866  loss: 3.5532 (3.5866)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7563 (0.8168)  time: 0.3507  data: 0.0004  max mem: 28503
Epoch: [52]  [1200/1251]  eta: 0:00:17  lr: 0.003865  min_lr: 0.003865  loss: 3.7232 (3.5891)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9467 (0.8166)  time: 0.3474  data: 0.0004  max mem: 28503
Epoch: [52]  [1250/1251]  eta: 0:00:00  lr: 0.003865  min_lr: 0.003865  loss: 3.6443 (3.5905)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9467 (0.8175)  time: 0.2920  data: 0.0007  max mem: 28503
Epoch: [52] Total time: 0:07:18 (0.3508 s / it)
Averaged stats: lr: 0.003865  min_lr: 0.003865  loss: 3.6443 (3.5667)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9467 (0.8175)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.7845 (0.7845)  acc1: 87.6000 (87.6000)  acc5: 97.2000 (97.2000)  time: 5.6902  data: 5.4810  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9789 (1.0075)  acc1: 78.8000 (80.2182)  acc5: 96.4000 (95.7455)  time: 0.6717  data: 0.4985  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1852 (1.2135)  acc1: 73.6000 (75.4857)  acc5: 92.0000 (92.9714)  time: 0.1821  data: 0.0131  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.3549 (1.2178)  acc1: 72.4000 (75.4080)  acc5: 90.8000 (92.9920)  time: 0.1814  data: 0.0130  max mem: 28503
Test: Total time: 0:00:10 (0.4104 s / it)
* Acc@1 75.446 Acc@5 93.234 loss 1.203
Accuracy of the model on the 50000 test images: 75.4%
Max accuracy: 75.45%
Epoch: [53]  [   0/1251]  eta: 1:00:53  lr: 0.003865  min_lr: 0.003865  loss: 4.0721 (4.0721)  weight_decay: 0.0500 (0.0500)  time: 2.9208  data: 2.4883  max mem: 28503
Epoch: [53]  [ 200/1251]  eta: 0:06:21  lr: 0.003863  min_lr: 0.003863  loss: 3.7506 (3.5365)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6825 (0.7798)  time: 0.3450  data: 0.0005  max mem: 28503
Epoch: [53]  [ 400/1251]  eta: 0:05:02  lr: 0.003862  min_lr: 0.003862  loss: 3.5559 (3.5472)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6304 (0.7458)  time: 0.3546  data: 0.0005  max mem: 28503
Epoch: [53]  [ 600/1251]  eta: 0:03:49  lr: 0.003861  min_lr: 0.003861  loss: 3.4970 (3.5450)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6874 (0.7684)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [53]  [ 800/1251]  eta: 0:02:38  lr: 0.003859  min_lr: 0.003859  loss: 3.4821 (3.5631)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6192 (0.7818)  time: 0.3457  data: 0.0004  max mem: 28503
Epoch: [53]  [1000/1251]  eta: 0:01:28  lr: 0.003858  min_lr: 0.003858  loss: 3.3466 (3.5495)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5121 (0.7681)  time: 0.3447  data: 0.0004  max mem: 28503
Epoch: [53]  [1200/1251]  eta: 0:00:17  lr: 0.003857  min_lr: 0.003857  loss: 3.6901 (3.5468)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8332 (0.7685)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [53]  [1250/1251]  eta: 0:00:00  lr: 0.003856  min_lr: 0.003856  loss: 3.4230 (3.5472)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5670 (0.7680)  time: 0.3052  data: 0.0005  max mem: 28503
Epoch: [53] Total time: 0:07:17 (0.3501 s / it)
Averaged stats: lr: 0.003856  min_lr: 0.003856  loss: 3.4230 (3.5462)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5670 (0.7680)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.7938 (0.7938)  acc1: 83.2000 (83.2000)  acc5: 96.8000 (96.8000)  time: 5.7023  data: 5.4800  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9952 (0.9957)  acc1: 80.0000 (79.7091)  acc5: 96.4000 (95.8545)  time: 0.6873  data: 0.5138  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1928 (1.1981)  acc1: 74.0000 (75.5810)  acc5: 93.6000 (93.1238)  time: 0.1772  data: 0.0086  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.3214 (1.2041)  acc1: 72.8000 (75.4400)  acc5: 92.0000 (93.1360)  time: 0.1770  data: 0.0085  max mem: 28503
Test: Total time: 0:00:10 (0.4080 s / it)
* Acc@1 75.508 Acc@5 93.264 loss 1.197
Accuracy of the model on the 50000 test images: 75.5%
Max accuracy: 75.51%
Epoch: [54]  [   0/1251]  eta: 1:08:29  lr: 0.003856  min_lr: 0.003856  loss: 4.2148 (4.2148)  weight_decay: 0.0500 (0.0500)  time: 3.2851  data: 2.9329  max mem: 28503
Epoch: [54]  [ 200/1251]  eta: 0:06:20  lr: 0.003855  min_lr: 0.003855  loss: 3.6799 (3.4980)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6406 (0.7868)  time: 0.3662  data: 0.0004  max mem: 28503
Epoch: [54]  [ 400/1251]  eta: 0:05:00  lr: 0.003854  min_lr: 0.003854  loss: 3.3615 (3.5297)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7062 (0.8597)  time: 0.3481  data: 0.0005  max mem: 28503
Epoch: [54]  [ 600/1251]  eta: 0:03:49  lr: 0.003852  min_lr: 0.003852  loss: 3.5725 (3.5335)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8091 (0.8436)  time: 0.3460  data: 0.0005  max mem: 28503
Epoch: [54]  [ 800/1251]  eta: 0:02:38  lr: 0.003851  min_lr: 0.003851  loss: 3.8614 (3.5409)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9379 (nan)  time: 0.3451  data: 0.0005  max mem: 28503
Epoch: [54]  [1000/1251]  eta: 0:01:27  lr: 0.003849  min_lr: 0.003849  loss: 3.5331 (3.5608)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7395 (nan)  time: 0.3465  data: 0.0004  max mem: 28503
Epoch: [54]  [1200/1251]  eta: 0:00:17  lr: 0.003848  min_lr: 0.003848  loss: 3.5622 (3.5692)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7257 (nan)  time: 0.3465  data: 0.0005  max mem: 28503
Epoch: [54]  [1250/1251]  eta: 0:00:00  lr: 0.003848  min_lr: 0.003848  loss: 3.4194 (3.5679)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7665 (nan)  time: 0.2968  data: 0.0007  max mem: 28503
Epoch: [54] Total time: 0:07:17 (0.3495 s / it)
Averaged stats: lr: 0.003848  min_lr: 0.003848  loss: 3.4194 (3.5512)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7665 (nan)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.7827 (0.7827)  acc1: 87.2000 (87.2000)  acc5: 97.6000 (97.6000)  time: 5.3539  data: 5.1482  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9869 (0.9694)  acc1: 79.2000 (79.0182)  acc5: 95.2000 (95.9273)  time: 0.7312  data: 0.5588  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1949 (1.1597)  acc1: 72.8000 (75.0095)  acc5: 92.8000 (93.5810)  time: 0.2187  data: 0.0500  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.2998 (1.1707)  acc1: 72.0000 (74.6400)  acc5: 92.0000 (93.4880)  time: 0.2185  data: 0.0499  max mem: 28503
Test: Total time: 0:00:10 (0.4194 s / it)
* Acc@1 75.644 Acc@5 93.554 loss 1.155
Accuracy of the model on the 50000 test images: 75.6%
Max accuracy: 75.64%
Epoch: [55]  [   0/1251]  eta: 1:06:06  lr: 0.003848  min_lr: 0.003848  loss: 3.6360 (3.6360)  weight_decay: 0.0500 (0.0500)  time: 3.1703  data: 2.8097  max mem: 28503
Epoch: [55]  [ 200/1251]  eta: 0:06:19  lr: 0.003846  min_lr: 0.003846  loss: 3.7442 (3.5511)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7254 (0.8147)  time: 0.3457  data: 0.0004  max mem: 28503
Epoch: [55]  [ 400/1251]  eta: 0:05:01  lr: 0.003845  min_lr: 0.003845  loss: 3.4980 (3.5376)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7397 (0.7668)  time: 0.3439  data: 0.0004  max mem: 28503
Epoch: [55]  [ 600/1251]  eta: 0:03:49  lr: 0.003844  min_lr: 0.003844  loss: 3.4828 (3.5304)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8416 (0.7895)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [55]  [ 800/1251]  eta: 0:02:38  lr: 0.003842  min_lr: 0.003842  loss: 3.7671 (3.5401)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7068 (0.7858)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [55]  [1000/1251]  eta: 0:01:27  lr: 0.003841  min_lr: 0.003841  loss: 3.5118 (3.5313)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7378 (0.7969)  time: 0.3460  data: 0.0004  max mem: 28503
Epoch: [55]  [1200/1251]  eta: 0:00:17  lr: 0.003839  min_lr: 0.003839  loss: 3.6813 (3.5369)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6556 (0.7818)  time: 0.3459  data: 0.0005  max mem: 28503
Epoch: [55]  [1250/1251]  eta: 0:00:00  lr: 0.003839  min_lr: 0.003839  loss: 3.0100 (3.5312)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6967 (0.7818)  time: 0.2964  data: 0.0007  max mem: 28503
Epoch: [55] Total time: 0:07:17 (0.3496 s / it)
Averaged stats: lr: 0.003839  min_lr: 0.003839  loss: 3.0100 (3.5525)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6967 (0.7818)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.7520 (0.7520)  acc1: 84.8000 (84.8000)  acc5: 98.4000 (98.4000)  time: 5.4895  data: 5.2862  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9532 (0.9369)  acc1: 80.4000 (79.6364)  acc5: 96.0000 (95.9636)  time: 0.7080  data: 0.5352  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1197 (1.1277)  acc1: 73.6000 (75.5619)  acc5: 93.2000 (93.5048)  time: 0.1992  data: 0.0301  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.2815 (1.1361)  acc1: 72.8000 (75.2000)  acc5: 92.4000 (93.4560)  time: 0.1994  data: 0.0309  max mem: 28503
Test: Total time: 0:00:10 (0.4105 s / it)
* Acc@1 75.702 Acc@5 93.434 loss 1.129
Accuracy of the model on the 50000 test images: 75.7%
Max accuracy: 75.70%
Epoch: [56]  [   0/1251]  eta: 1:03:23  lr: 0.003839  min_lr: 0.003839  loss: 3.8050 (3.8050)  weight_decay: 0.0500 (0.0500)  time: 3.0405  data: 2.6547  max mem: 28503
Epoch: [56]  [ 200/1251]  eta: 0:06:20  lr: 0.003838  min_lr: 0.003838  loss: 3.8906 (3.5223)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8626 (0.8764)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [56]  [ 400/1251]  eta: 0:05:02  lr: 0.003836  min_lr: 0.003836  loss: 3.6003 (3.5162)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6617 (0.8321)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [56]  [ 600/1251]  eta: 0:03:49  lr: 0.003835  min_lr: 0.003835  loss: 3.6345 (3.5343)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6732 (0.7845)  time: 0.3466  data: 0.0004  max mem: 28503
Epoch: [56]  [ 800/1251]  eta: 0:02:38  lr: 0.003833  min_lr: 0.003833  loss: 3.7747 (3.5527)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6976 (0.7779)  time: 0.3441  data: 0.0004  max mem: 28503
Epoch: [56]  [1000/1251]  eta: 0:01:28  lr: 0.003832  min_lr: 0.003832  loss: 3.3213 (3.5444)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6479 (0.7941)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [56]  [1200/1251]  eta: 0:00:17  lr: 0.003831  min_lr: 0.003831  loss: 3.2551 (3.5528)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7042 (0.8061)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [56]  [1250/1251]  eta: 0:00:00  lr: 0.003830  min_lr: 0.003830  loss: 3.8454 (3.5526)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7920 (0.8150)  time: 0.2984  data: 0.0007  max mem: 28503
Epoch: [56] Total time: 0:07:17 (0.3497 s / it)
Averaged stats: lr: 0.003830  min_lr: 0.003830  loss: 3.8454 (3.5478)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7920 (0.8150)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.7724 (0.7724)  acc1: 84.4000 (84.4000)  acc5: 97.6000 (97.6000)  time: 5.5574  data: 5.3616  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9924 (0.9698)  acc1: 80.4000 (80.1818)  acc5: 96.4000 (95.9636)  time: 0.7280  data: 0.5569  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1566 (1.1656)  acc1: 74.4000 (76.3429)  acc5: 93.2000 (93.3333)  time: 0.2067  data: 0.0383  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.2767 (1.1733)  acc1: 72.4000 (76.0000)  acc5: 93.2000 (93.3280)  time: 0.2064  data: 0.0381  max mem: 28503
Test: Total time: 0:00:10 (0.4185 s / it)
* Acc@1 76.032 Acc@5 93.482 loss 1.162
Accuracy of the model on the 50000 test images: 76.0%
Max accuracy: 76.03%
Epoch: [57]  [   0/1251]  eta: 1:02:03  lr: 0.003830  min_lr: 0.003830  loss: 3.9500 (3.9500)  weight_decay: 0.0500 (0.0500)  time: 2.9764  data: 2.6080  max mem: 28503
Epoch: [57]  [ 200/1251]  eta: 0:06:18  lr: 0.003829  min_lr: 0.003829  loss: 3.1334 (3.4997)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6888 (0.7451)  time: 0.3443  data: 0.0004  max mem: 28503
Epoch: [57]  [ 400/1251]  eta: 0:05:01  lr: 0.003827  min_lr: 0.003827  loss: 3.3727 (3.5109)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6959 (0.7737)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [57]  [ 600/1251]  eta: 0:03:49  lr: 0.003826  min_lr: 0.003826  loss: 3.7176 (3.5260)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5727 (0.7381)  time: 0.3462  data: 0.0004  max mem: 28503
Epoch: [57]  [ 800/1251]  eta: 0:02:38  lr: 0.003824  min_lr: 0.003824  loss: 3.6355 (3.5151)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7836 (0.7627)  time: 0.3442  data: 0.0004  max mem: 28503
Epoch: [57]  [1000/1251]  eta: 0:01:27  lr: 0.003823  min_lr: 0.003823  loss: 3.5975 (3.5280)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6619 (0.7769)  time: 0.3448  data: 0.0004  max mem: 28503
Epoch: [57]  [1200/1251]  eta: 0:00:17  lr: 0.003821  min_lr: 0.003821  loss: 3.4025 (3.5306)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6480 (0.7879)  time: 0.3506  data: 0.0004  max mem: 28503
Epoch: [57]  [1250/1251]  eta: 0:00:00  lr: 0.003821  min_lr: 0.003821  loss: 3.5719 (3.5263)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7913 (0.7971)  time: 0.2914  data: 0.0006  max mem: 28503
Epoch: [57] Total time: 0:07:16 (0.3489 s / it)
Averaged stats: lr: 0.003821  min_lr: 0.003821  loss: 3.5719 (3.5307)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7913 (0.7971)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.7126 (0.7126)  acc1: 86.4000 (86.4000)  acc5: 97.6000 (97.6000)  time: 5.5822  data: 5.3779  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9729 (0.9553)  acc1: 80.4000 (79.4545)  acc5: 95.6000 (95.7818)  time: 0.6799  data: 0.5078  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.2089 (1.1558)  acc1: 71.6000 (75.2762)  acc5: 93.6000 (93.1619)  time: 0.1791  data: 0.0105  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.3299 (1.1667)  acc1: 71.6000 (74.9760)  acc5: 92.4000 (93.1840)  time: 0.1789  data: 0.0104  max mem: 28503
Test: Total time: 0:00:10 (0.4049 s / it)
* Acc@1 75.928 Acc@5 93.566 loss 1.149
Accuracy of the model on the 50000 test images: 75.9%
Max accuracy: 76.03%
Epoch: [58]  [   0/1251]  eta: 1:06:13  lr: 0.003821  min_lr: 0.003821  loss: 3.4816 (3.4816)  weight_decay: 0.0500 (0.0500)  time: 3.1760  data: 2.4694  max mem: 28503
Epoch: [58]  [ 200/1251]  eta: 0:06:22  lr: 0.003820  min_lr: 0.003820  loss: 3.6041 (3.4307)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9287 (0.8420)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [58]  [ 400/1251]  eta: 0:05:02  lr: 0.003818  min_lr: 0.003818  loss: 3.6634 (3.4767)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7396 (0.8027)  time: 0.3547  data: 0.0004  max mem: 28503
Epoch: [58]  [ 600/1251]  eta: 0:03:49  lr: 0.003817  min_lr: 0.003817  loss: 3.8150 (3.5183)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6992 (0.8033)  time: 0.3460  data: 0.0004  max mem: 28503
Epoch: [58]  [ 800/1251]  eta: 0:02:38  lr: 0.003815  min_lr: 0.003815  loss: 3.5409 (3.5370)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6524 (0.8077)  time: 0.3448  data: 0.0004  max mem: 28503
Epoch: [58]  [1000/1251]  eta: 0:01:28  lr: 0.003813  min_lr: 0.003813  loss: 3.5689 (3.5417)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7791 (0.8135)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [58]  [1200/1251]  eta: 0:00:17  lr: 0.003812  min_lr: 0.003812  loss: 3.7677 (3.5402)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6222 (0.8221)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [58]  [1250/1251]  eta: 0:00:00  lr: 0.003812  min_lr: 0.003812  loss: 3.7716 (3.5420)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6894 (0.8246)  time: 0.2983  data: 0.0006  max mem: 28503
Epoch: [58] Total time: 0:07:18 (0.3502 s / it)
Averaged stats: lr: 0.003812  min_lr: 0.003812  loss: 3.7716 (3.5375)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6894 (0.8246)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.8070 (0.8070)  acc1: 87.6000 (87.6000)  acc5: 97.2000 (97.2000)  time: 5.4607  data: 5.2626  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.0653 (1.0334)  acc1: 78.4000 (80.1455)  acc5: 96.0000 (95.8545)  time: 0.6959  data: 0.5228  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.2694 (1.2204)  acc1: 72.8000 (75.7714)  acc5: 93.6000 (93.1048)  time: 0.1940  data: 0.0245  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.3489 (1.2266)  acc1: 72.4000 (75.3440)  acc5: 91.6000 (93.1360)  time: 0.1929  data: 0.0244  max mem: 28503
Test: Total time: 0:00:10 (0.4042 s / it)
* Acc@1 76.114 Acc@5 93.588 loss 1.209
Accuracy of the model on the 50000 test images: 76.1%
Max accuracy: 76.11%
Epoch: [59]  [   0/1251]  eta: 1:06:53  lr: 0.003812  min_lr: 0.003812  loss: 3.7848 (3.7848)  weight_decay: 0.0500 (0.0500)  time: 3.2085  data: 2.8551  max mem: 28503
Epoch: [59]  [ 200/1251]  eta: 0:06:21  lr: 0.003810  min_lr: 0.003810  loss: 3.5561 (3.4512)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6992 (0.7376)  time: 0.3462  data: 0.0004  max mem: 28503
Epoch: [59]  [ 400/1251]  eta: 0:05:02  lr: 0.003809  min_lr: 0.003809  loss: 3.6793 (3.4853)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6457 (0.7573)  time: 0.3560  data: 0.0004  max mem: 28503
Epoch: [59]  [ 600/1251]  eta: 0:03:49  lr: 0.003807  min_lr: 0.003807  loss: 2.9975 (3.4896)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5983 (0.7557)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [59]  [ 800/1251]  eta: 0:02:38  lr: 0.003805  min_lr: 0.003805  loss: 3.5768 (3.5053)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7669 (0.7852)  time: 0.3478  data: 0.0004  max mem: 28503
Epoch: [59]  [1000/1251]  eta: 0:01:28  lr: 0.003804  min_lr: 0.003804  loss: 3.2297 (3.5103)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6124 (0.7762)  time: 0.3448  data: 0.0004  max mem: 28503
Epoch: [59]  [1200/1251]  eta: 0:00:17  lr: 0.003802  min_lr: 0.003802  loss: 3.4393 (3.5272)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6681 (0.7752)  time: 0.3550  data: 0.0005  max mem: 28503
Epoch: [59]  [1250/1251]  eta: 0:00:00  lr: 0.003802  min_lr: 0.003802  loss: 3.5978 (3.5282)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7156 (0.7794)  time: 0.2918  data: 0.0007  max mem: 28503
Epoch: [59] Total time: 0:07:18 (0.3502 s / it)
Averaged stats: lr: 0.003802  min_lr: 0.003802  loss: 3.5978 (3.5256)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7156 (0.7794)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.7722 (0.7722)  acc1: 84.0000 (84.0000)  acc5: 97.2000 (97.2000)  time: 5.5484  data: 5.3404  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9788 (0.9220)  acc1: 78.4000 (80.0000)  acc5: 96.4000 (96.1818)  time: 0.6816  data: 0.5087  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1093 (1.1195)  acc1: 74.8000 (76.0571)  acc5: 94.0000 (93.5048)  time: 0.1870  data: 0.0151  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.2984 (1.1331)  acc1: 73.2000 (75.7280)  acc5: 91.6000 (93.2800)  time: 0.1864  data: 0.0150  max mem: 28503
Test: Total time: 0:00:10 (0.4019 s / it)
* Acc@1 76.170 Acc@5 93.616 loss 1.115
Accuracy of the model on the 50000 test images: 76.2%
Max accuracy: 76.17%
Epoch: [60]  [   0/1251]  eta: 1:01:34  lr: 0.003802  min_lr: 0.003802  loss: 3.1261 (3.1261)  weight_decay: 0.0500 (0.0500)  time: 2.9536  data: 2.5740  max mem: 28503
Epoch: [60]  [ 200/1251]  eta: 0:06:19  lr: 0.003800  min_lr: 0.003800  loss: 3.3873 (3.4888)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9549 (0.7565)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [60]  [ 400/1251]  eta: 0:05:01  lr: 0.003799  min_lr: 0.003799  loss: 3.5801 (3.5109)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7809 (0.8082)  time: 0.3533  data: 0.0004  max mem: 28503
Epoch: [60]  [ 600/1251]  eta: 0:03:49  lr: 0.003797  min_lr: 0.003797  loss: 3.5124 (3.5254)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7162 (0.8280)  time: 0.3448  data: 0.0004  max mem: 28503
Epoch: [60]  [ 800/1251]  eta: 0:02:38  lr: 0.003796  min_lr: 0.003796  loss: 3.7730 (3.5413)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6295 (0.7988)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [60]  [1000/1251]  eta: 0:01:27  lr: 0.003794  min_lr: 0.003794  loss: 3.6378 (3.5429)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8858 (0.8242)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [60]  [1200/1251]  eta: 0:00:17  lr: 0.003793  min_lr: 0.003793  loss: 3.6408 (3.5321)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6876 (0.8254)  time: 0.3464  data: 0.0004  max mem: 28503
Epoch: [60]  [1250/1251]  eta: 0:00:00  lr: 0.003792  min_lr: 0.003792  loss: 3.6290 (3.5363)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7878 (0.8258)  time: 0.2983  data: 0.0006  max mem: 28503
Epoch: [60] Total time: 0:07:17 (0.3495 s / it)
Averaged stats: lr: 0.003792  min_lr: 0.003792  loss: 3.6290 (3.5153)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7878 (0.8258)
Test:  [ 0/25]  eta: 0:01:37  loss: 0.8413 (0.8413)  acc1: 85.6000 (85.6000)  acc5: 98.0000 (98.0000)  time: 3.8921  data: 3.6530  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 1.0397 (1.0343)  acc1: 80.4000 (78.9818)  acc5: 96.4000 (95.8909)  time: 0.6653  data: 0.4895  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.2390 (1.2208)  acc1: 71.6000 (75.2571)  acc5: 93.2000 (93.1810)  time: 0.2676  data: 0.0986  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.3695 (1.2309)  acc1: 71.6000 (74.9920)  acc5: 91.2000 (93.0240)  time: 0.1945  data: 0.0260  max mem: 28503
Test: Total time: 0:00:10 (0.4003 s / it)
* Acc@1 75.832 Acc@5 93.354 loss 1.210
Accuracy of the model on the 50000 test images: 75.8%
Max accuracy: 76.17%
Epoch: [61]  [   0/1251]  eta: 1:14:51  lr: 0.003792  min_lr: 0.003792  loss: 4.3428 (4.3428)  weight_decay: 0.0500 (0.0500)  time: 3.5903  data: 2.9426  max mem: 28503
Epoch: [61]  [ 200/1251]  eta: 0:06:25  lr: 0.003791  min_lr: 0.003791  loss: 3.5659 (3.4779)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7704 (0.8216)  time: 0.3710  data: 0.0004  max mem: 28503
Epoch: [61]  [ 400/1251]  eta: 0:05:03  lr: 0.003789  min_lr: 0.003789  loss: 3.4779 (3.4832)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5776 (0.7600)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [61]  [ 600/1251]  eta: 0:03:50  lr: 0.003787  min_lr: 0.003787  loss: 3.5374 (3.5071)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6762 (0.7689)  time: 0.3461  data: 0.0005  max mem: 28503
Epoch: [61]  [ 800/1251]  eta: 0:02:38  lr: 0.003786  min_lr: 0.003786  loss: 3.5922 (3.5167)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7060 (0.7730)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [61]  [1000/1251]  eta: 0:01:28  lr: 0.003784  min_lr: 0.003784  loss: 3.6609 (3.5181)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5985 (0.7632)  time: 0.3470  data: 0.0004  max mem: 28503
Epoch: [61]  [1200/1251]  eta: 0:00:17  lr: 0.003782  min_lr: 0.003782  loss: 3.7337 (3.5280)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9593 (0.7843)  time: 0.3468  data: 0.0004  max mem: 28503
Epoch: [61]  [1250/1251]  eta: 0:00:00  lr: 0.003782  min_lr: 0.003782  loss: 3.4169 (3.5276)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8123 (0.7829)  time: 0.2918  data: 0.0007  max mem: 28503
Epoch: [61] Total time: 0:07:18 (0.3505 s / it)
Averaged stats: lr: 0.003782  min_lr: 0.003782  loss: 3.4169 (3.5142)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8123 (0.7829)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.7620 (0.7620)  acc1: 87.2000 (87.2000)  acc5: 98.0000 (98.0000)  time: 5.6616  data: 5.4729  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.0128 (0.9730)  acc1: 80.0000 (80.2182)  acc5: 96.4000 (96.0727)  time: 0.6914  data: 0.5208  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.2097 (1.1628)  acc1: 72.8000 (75.7524)  acc5: 93.6000 (93.2952)  time: 0.1815  data: 0.0129  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.2546 (1.1731)  acc1: 71.6000 (75.3920)  acc5: 90.8000 (93.1840)  time: 0.1813  data: 0.0128  max mem: 28503
Test: Total time: 0:00:10 (0.4056 s / it)
* Acc@1 76.162 Acc@5 93.712 loss 1.144
Accuracy of the model on the 50000 test images: 76.2%
Max accuracy: 76.17%
Epoch: [62]  [   0/1251]  eta: 1:13:56  lr: 0.003782  min_lr: 0.003782  loss: 3.0447 (3.0447)  weight_decay: 0.0500 (0.0500)  time: 3.5467  data: 1.8247  max mem: 28503
Epoch: [62]  [ 200/1251]  eta: 0:06:21  lr: 0.003780  min_lr: 0.003780  loss: 3.3878 (3.4985)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6874 (0.8819)  time: 0.3441  data: 0.0004  max mem: 28503
Epoch: [62]  [ 400/1251]  eta: 0:05:03  lr: 0.003779  min_lr: 0.003779  loss: 3.8423 (3.4837)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7232 (0.7899)  time: 0.3493  data: 0.0004  max mem: 28503
Epoch: [62]  [ 600/1251]  eta: 0:03:50  lr: 0.003777  min_lr: 0.003777  loss: 3.4667 (3.4890)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8931 (0.8482)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [62]  [ 800/1251]  eta: 0:02:38  lr: 0.003775  min_lr: 0.003775  loss: 3.7388 (3.5037)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7553 (0.8577)  time: 0.3466  data: 0.0004  max mem: 28503
Epoch: [62]  [1000/1251]  eta: 0:01:28  lr: 0.003774  min_lr: 0.003774  loss: 3.6512 (3.4955)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7351 (0.8562)  time: 0.3564  data: 0.0004  max mem: 28503
Epoch: [62]  [1200/1251]  eta: 0:00:17  lr: 0.003772  min_lr: 0.003772  loss: 3.6126 (3.4930)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7196 (0.8564)  time: 0.3476  data: 0.0004  max mem: 28503
Epoch: [62]  [1250/1251]  eta: 0:00:00  lr: 0.003772  min_lr: 0.003772  loss: 3.4853 (3.4964)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2916  data: 0.0007  max mem: 28503
Epoch: [62] Total time: 0:07:18 (0.3504 s / it)
Averaged stats: lr: 0.003772  min_lr: 0.003772  loss: 3.4853 (3.5063)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6868 (0.6868)  acc1: 86.8000 (86.8000)  acc5: 97.6000 (97.6000)  time: 5.5142  data: 5.3247  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.0000 (0.9504)  acc1: 80.8000 (80.6545)  acc5: 96.8000 (96.1091)  time: 0.7329  data: 0.5624  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1622 (1.1419)  acc1: 74.8000 (76.3619)  acc5: 93.2000 (93.4286)  time: 0.2116  data: 0.0432  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.2691 (1.1525)  acc1: 73.6000 (75.8720)  acc5: 91.6000 (93.3760)  time: 0.2114  data: 0.0431  max mem: 28503
Test: Total time: 0:00:10 (0.4202 s / it)
* Acc@1 76.572 Acc@5 93.724 loss 1.129
Accuracy of the model on the 50000 test images: 76.6%
Max accuracy: 76.57%
Epoch: [63]  [   0/1251]  eta: 0:58:59  lr: 0.003772  min_lr: 0.003772  loss: 3.0349 (3.0349)  weight_decay: 0.0500 (0.0500)  time: 2.8292  data: 2.3936  max mem: 28503
Epoch: [63]  [ 200/1251]  eta: 0:06:18  lr: 0.003770  min_lr: 0.003770  loss: 3.7263 (3.5051)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7456 (0.7491)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [63]  [ 400/1251]  eta: 0:05:01  lr: 0.003768  min_lr: 0.003768  loss: 3.5872 (3.4925)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5192 (0.7078)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [63]  [ 600/1251]  eta: 0:03:49  lr: 0.003767  min_lr: 0.003767  loss: 3.6799 (3.4996)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9214 (0.7867)  time: 0.3464  data: 0.0004  max mem: 28503
Epoch: [63]  [ 800/1251]  eta: 0:02:38  lr: 0.003765  min_lr: 0.003765  loss: 3.6425 (3.5086)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7223 (0.8042)  time: 0.3447  data: 0.0004  max mem: 28503
Epoch: [63]  [1000/1251]  eta: 0:01:27  lr: 0.003763  min_lr: 0.003763  loss: 3.7186 (3.5075)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6349 (0.7893)  time: 0.3549  data: 0.0004  max mem: 28503
Epoch: [63]  [1200/1251]  eta: 0:00:17  lr: 0.003762  min_lr: 0.003762  loss: 3.6584 (3.4957)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9431 (0.8096)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [63]  [1250/1251]  eta: 0:00:00  lr: 0.003761  min_lr: 0.003761  loss: 3.6277 (3.4951)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6534 (0.8069)  time: 0.2921  data: 0.0006  max mem: 28503
Epoch: [63] Total time: 0:07:17 (0.3494 s / it)
Averaged stats: lr: 0.003761  min_lr: 0.003761  loss: 3.6277 (3.5029)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6534 (0.8069)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.6935 (0.6935)  acc1: 87.6000 (87.6000)  acc5: 98.4000 (98.4000)  time: 5.6771  data: 5.4756  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9949 (0.9518)  acc1: 79.6000 (80.5818)  acc5: 96.4000 (96.1091)  time: 0.6970  data: 0.5239  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1814 (1.1463)  acc1: 73.2000 (76.4571)  acc5: 92.8000 (93.4667)  time: 0.1889  data: 0.0144  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.2777 (1.1558)  acc1: 73.6000 (76.0960)  acc5: 91.6000 (93.4080)  time: 0.1896  data: 0.0143  max mem: 28503
Test: Total time: 0:00:10 (0.4094 s / it)
* Acc@1 76.464 Acc@5 93.644 loss 1.143
Accuracy of the model on the 50000 test images: 76.5%
Max accuracy: 76.57%
Epoch: [64]  [   0/1251]  eta: 1:07:43  lr: 0.003761  min_lr: 0.003761  loss: 3.4846 (3.4846)  weight_decay: 0.0500 (0.0500)  time: 3.2481  data: 2.5433  max mem: 28503
Epoch: [64]  [ 200/1251]  eta: 0:06:21  lr: 0.003760  min_lr: 0.003760  loss: 3.5079 (3.4928)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6738 (0.7440)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [64]  [ 400/1251]  eta: 0:05:02  lr: 0.003758  min_lr: 0.003758  loss: 3.5184 (3.4820)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7640 (0.7822)  time: 0.3448  data: 0.0004  max mem: 28503
Epoch: [64]  [ 600/1251]  eta: 0:03:49  lr: 0.003756  min_lr: 0.003756  loss: 3.7223 (3.4891)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6387 (0.7888)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [64]  [ 800/1251]  eta: 0:02:38  lr: 0.003754  min_lr: 0.003754  loss: 3.4311 (3.4933)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8457 (0.7913)  time: 0.3468  data: 0.0004  max mem: 28503
Epoch: [64]  [1000/1251]  eta: 0:01:28  lr: 0.003753  min_lr: 0.003753  loss: 3.7519 (3.4906)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6358 (0.7811)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [64]  [1200/1251]  eta: 0:00:17  lr: 0.003751  min_lr: 0.003751  loss: 3.5277 (3.5046)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6883 (0.7902)  time: 0.3475  data: 0.0004  max mem: 28503
Epoch: [64]  [1250/1251]  eta: 0:00:00  lr: 0.003751  min_lr: 0.003751  loss: 3.3994 (3.5013)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5829 (0.7825)  time: 0.2923  data: 0.0006  max mem: 28503
Epoch: [64] Total time: 0:07:18 (0.3507 s / it)
Averaged stats: lr: 0.003751  min_lr: 0.003751  loss: 3.3994 (3.5038)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5829 (0.7825)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.7833 (0.7833)  acc1: 85.2000 (85.2000)  acc5: 96.8000 (96.8000)  time: 5.6975  data: 5.5014  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9647 (0.9471)  acc1: 78.0000 (79.9273)  acc5: 96.8000 (95.9636)  time: 0.7284  data: 0.5567  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1548 (1.1134)  acc1: 74.0000 (76.1524)  acc5: 93.6000 (93.7714)  time: 0.2000  data: 0.0312  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.2332 (1.1244)  acc1: 74.0000 (75.9200)  acc5: 92.0000 (93.6640)  time: 0.1995  data: 0.0311  max mem: 28503
Test: Total time: 0:00:10 (0.4179 s / it)
* Acc@1 76.576 Acc@5 93.804 loss 1.110
Accuracy of the model on the 50000 test images: 76.6%
Max accuracy: 76.58%
Epoch: [65]  [   0/1251]  eta: 1:01:00  lr: 0.003751  min_lr: 0.003751  loss: 3.6032 (3.6032)  weight_decay: 0.0500 (0.0500)  time: 2.9261  data: 2.4789  max mem: 28503
Epoch: [65]  [ 200/1251]  eta: 0:06:20  lr: 0.003749  min_lr: 0.003749  loss: 3.3456 (3.4534)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7108 (0.7863)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [65]  [ 400/1251]  eta: 0:05:01  lr: 0.003747  min_lr: 0.003747  loss: 3.6240 (3.4574)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7403 (0.7986)  time: 0.3447  data: 0.0004  max mem: 28503
Epoch: [65]  [ 600/1251]  eta: 0:03:48  lr: 0.003745  min_lr: 0.003745  loss: 3.4078 (3.4727)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9859 (0.8434)  time: 0.3466  data: 0.0004  max mem: 28503
Epoch: [65]  [ 800/1251]  eta: 0:02:38  lr: 0.003744  min_lr: 0.003744  loss: 3.7173 (3.4824)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8694 (0.8425)  time: 0.3469  data: 0.0004  max mem: 28503
Epoch: [65]  [1000/1251]  eta: 0:01:27  lr: 0.003742  min_lr: 0.003742  loss: 3.5703 (3.5072)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6869 (0.8342)  time: 0.3530  data: 0.0004  max mem: 28503
Epoch: [65]  [1200/1251]  eta: 0:00:17  lr: 0.003740  min_lr: 0.003740  loss: 3.5750 (3.5020)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7077 (0.8458)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [65]  [1250/1251]  eta: 0:00:00  lr: 0.003740  min_lr: 0.003740  loss: 3.7094 (3.5005)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8706 (0.8476)  time: 0.2918  data: 0.0007  max mem: 28503
Epoch: [65] Total time: 0:07:17 (0.3497 s / it)
Averaged stats: lr: 0.003740  min_lr: 0.003740  loss: 3.7094 (3.4952)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8706 (0.8476)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.7188 (0.7188)  acc1: 84.8000 (84.8000)  acc5: 98.0000 (98.0000)  time: 5.5780  data: 5.3819  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9884 (0.9559)  acc1: 80.8000 (80.4000)  acc5: 96.0000 (95.9273)  time: 0.7244  data: 0.5520  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1491 (1.1382)  acc1: 74.0000 (76.4571)  acc5: 93.2000 (93.8476)  time: 0.2070  data: 0.0346  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.2741 (1.1424)  acc1: 74.4000 (76.3840)  acc5: 92.8000 (93.8240)  time: 0.2066  data: 0.0345  max mem: 28503
Test: Total time: 0:00:10 (0.4189 s / it)
* Acc@1 76.606 Acc@5 93.848 loss 1.142
Accuracy of the model on the 50000 test images: 76.6%
Max accuracy: 76.61%
Epoch: [66]  [   0/1251]  eta: 1:03:00  lr: 0.003740  min_lr: 0.003740  loss: 2.9381 (2.9381)  weight_decay: 0.0500 (0.0500)  time: 3.0218  data: 2.6066  max mem: 28503
Epoch: [66]  [ 200/1251]  eta: 0:06:18  lr: 0.003738  min_lr: 0.003738  loss: 3.5074 (3.5341)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6040 (0.7081)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [66]  [ 400/1251]  eta: 0:05:01  lr: 0.003736  min_lr: 0.003736  loss: 3.8546 (3.5213)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7999 (0.7841)  time: 0.3470  data: 0.0004  max mem: 28503
Epoch: [66]  [ 600/1251]  eta: 0:03:49  lr: 0.003734  min_lr: 0.003734  loss: 3.6744 (3.5193)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7798 (0.8205)  time: 0.3553  data: 0.0004  max mem: 28503
Epoch: [66]  [ 800/1251]  eta: 0:02:38  lr: 0.003732  min_lr: 0.003732  loss: 3.5247 (3.5076)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5959 (0.7861)  time: 0.3449  data: 0.0005  max mem: 28503
Epoch: [66]  [1000/1251]  eta: 0:01:28  lr: 0.003731  min_lr: 0.003731  loss: 3.4194 (3.5065)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5624 (0.7893)  time: 0.3476  data: 0.0004  max mem: 28503
Epoch: [66]  [1200/1251]  eta: 0:00:17  lr: 0.003729  min_lr: 0.003729  loss: 3.7060 (3.5009)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5933 (0.7780)  time: 0.3468  data: 0.0004  max mem: 28503
Epoch: [66]  [1250/1251]  eta: 0:00:00  lr: 0.003728  min_lr: 0.003728  loss: 3.7054 (3.5069)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6619 (0.7753)  time: 0.2922  data: 0.0005  max mem: 28503
Epoch: [66] Total time: 0:07:17 (0.3501 s / it)
Averaged stats: lr: 0.003728  min_lr: 0.003728  loss: 3.7054 (3.4904)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6619 (0.7753)
Test:  [ 0/25]  eta: 0:02:06  loss: 0.8330 (0.8330)  acc1: 86.4000 (86.4000)  acc5: 98.0000 (98.0000)  time: 5.0787  data: 4.8836  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.0675 (1.0534)  acc1: 81.2000 (80.9455)  acc5: 96.0000 (96.2182)  time: 0.6790  data: 0.5075  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.2401 (1.2246)  acc1: 74.8000 (76.7810)  acc5: 93.2000 (93.5619)  time: 0.2109  data: 0.0421  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.3098 (1.2318)  acc1: 74.8000 (76.7360)  acc5: 91.6000 (93.4240)  time: 0.2104  data: 0.0420  max mem: 28503
Test: Total time: 0:00:10 (0.4024 s / it)
* Acc@1 76.638 Acc@5 93.804 loss 1.222
Accuracy of the model on the 50000 test images: 76.6%
Max accuracy: 76.64%
Epoch: [67]  [   0/1251]  eta: 1:08:05  lr: 0.003728  min_lr: 0.003728  loss: 3.3824 (3.3824)  weight_decay: 0.0500 (0.0500)  time: 3.2657  data: 2.8959  max mem: 28503
Epoch: [67]  [ 200/1251]  eta: 0:06:21  lr: 0.003727  min_lr: 0.003727  loss: 3.7222 (3.5179)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7121 (0.7918)  time: 0.3454  data: 0.0005  max mem: 28503
Epoch: [67]  [ 400/1251]  eta: 0:05:02  lr: 0.003725  min_lr: 0.003725  loss: 3.6535 (3.4869)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6303 (0.7541)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [67]  [ 600/1251]  eta: 0:03:49  lr: 0.003723  min_lr: 0.003723  loss: 3.6328 (3.4918)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7475 (0.7854)  time: 0.3476  data: 0.0004  max mem: 28503
Epoch: [67]  [ 800/1251]  eta: 0:02:38  lr: 0.003721  min_lr: 0.003721  loss: 3.6810 (3.5053)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8824 (0.8013)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [67]  [1000/1251]  eta: 0:01:27  lr: 0.003719  min_lr: 0.003719  loss: 3.6270 (3.5049)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7330 (0.7890)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [67]  [1200/1251]  eta: 0:00:17  lr: 0.003717  min_lr: 0.003717  loss: 3.7598 (3.5118)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6519 (0.7893)  time: 0.3461  data: 0.0004  max mem: 28503
Epoch: [67]  [1250/1251]  eta: 0:00:00  lr: 0.003717  min_lr: 0.003717  loss: 3.6633 (3.5159)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7772 (0.7972)  time: 0.2924  data: 0.0007  max mem: 28503
Epoch: [67] Total time: 0:07:17 (0.3497 s / it)
Averaged stats: lr: 0.003717  min_lr: 0.003717  loss: 3.6633 (3.4883)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7772 (0.7972)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.7814 (0.7814)  acc1: 84.0000 (84.0000)  acc5: 98.4000 (98.4000)  time: 5.6196  data: 5.4023  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9886 (0.9551)  acc1: 80.0000 (80.1455)  acc5: 96.8000 (96.2909)  time: 0.7052  data: 0.5317  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1664 (1.1425)  acc1: 74.0000 (76.4381)  acc5: 93.2000 (93.7143)  time: 0.1976  data: 0.0227  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.2624 (1.1460)  acc1: 73.2000 (76.1440)  acc5: 93.2000 (93.8720)  time: 0.1979  data: 0.0227  max mem: 28503
Test: Total time: 0:00:10 (0.4134 s / it)
* Acc@1 76.542 Acc@5 93.794 loss 1.146
Accuracy of the model on the 50000 test images: 76.5%
Max accuracy: 76.64%
Epoch: [68]  [   0/1251]  eta: 1:15:45  lr: 0.003717  min_lr: 0.003717  loss: 3.2018 (3.2018)  weight_decay: 0.0500 (0.0500)  time: 3.6336  data: 2.8478  max mem: 28503
Epoch: [68]  [ 200/1251]  eta: 0:06:23  lr: 0.003715  min_lr: 0.003715  loss: 3.3892 (3.4613)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8145 (0.7768)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [68]  [ 400/1251]  eta: 0:05:03  lr: 0.003713  min_lr: 0.003713  loss: 3.3535 (3.4765)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6624 (0.8470)  time: 0.3457  data: 0.0004  max mem: 28503
Epoch: [68]  [ 600/1251]  eta: 0:03:50  lr: 0.003711  min_lr: 0.003711  loss: 3.7263 (3.4798)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7209 (0.8389)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [68]  [ 800/1251]  eta: 0:02:39  lr: 0.003710  min_lr: 0.003710  loss: 3.7470 (3.4880)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6459 (0.8213)  time: 0.3444  data: 0.0005  max mem: 28503
Epoch: [68]  [1000/1251]  eta: 0:01:28  lr: 0.003708  min_lr: 0.003708  loss: 3.7063 (3.4931)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6876 (0.8236)  time: 0.3469  data: 0.0005  max mem: 28503
Epoch: [68]  [1200/1251]  eta: 0:00:17  lr: 0.003706  min_lr: 0.003706  loss: 3.2402 (3.4852)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8392 (0.8269)  time: 0.3476  data: 0.0005  max mem: 28503
Epoch: [68]  [1250/1251]  eta: 0:00:00  lr: 0.003705  min_lr: 0.003705  loss: 3.5043 (3.4828)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8100 (0.8253)  time: 0.2923  data: 0.0007  max mem: 28503
Epoch: [68] Total time: 0:07:19 (0.3511 s / it)
Averaged stats: lr: 0.003705  min_lr: 0.003705  loss: 3.5043 (3.4779)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8100 (0.8253)
Test:  [ 0/25]  eta: 0:01:46  loss: 0.7613 (0.7613)  acc1: 86.4000 (86.4000)  acc5: 98.0000 (98.0000)  time: 4.2604  data: 4.0422  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 0.9642 (0.9326)  acc1: 81.6000 (81.0182)  acc5: 96.8000 (96.4727)  time: 0.6429  data: 0.4682  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1439 (1.1084)  acc1: 74.4000 (76.7619)  acc5: 93.2000 (93.8667)  time: 0.2577  data: 0.0883  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.2674 (1.1172)  acc1: 73.2000 (76.3360)  acc5: 92.0000 (93.7280)  time: 0.2015  data: 0.0331  max mem: 28503
Test: Total time: 0:00:10 (0.4068 s / it)
* Acc@1 76.566 Acc@5 93.852 loss 1.104
Accuracy of the model on the 50000 test images: 76.6%
Max accuracy: 76.64%
Epoch: [69]  [   0/1251]  eta: 1:13:15  lr: 0.003705  min_lr: 0.003705  loss: 3.7743 (3.7743)  weight_decay: 0.0500 (0.0500)  time: 3.5133  data: 2.1131  max mem: 28503
Epoch: [69]  [ 200/1251]  eta: 0:06:24  lr: 0.003703  min_lr: 0.003703  loss: 3.6812 (3.4165)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5715 (0.7816)  time: 0.3444  data: 0.0004  max mem: 28503
Epoch: [69]  [ 400/1251]  eta: 0:05:03  lr: 0.003702  min_lr: 0.003702  loss: 3.4981 (3.4387)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7870 (0.8641)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [69]  [ 600/1251]  eta: 0:03:50  lr: 0.003700  min_lr: 0.003700  loss: 3.2962 (3.4382)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6964 (0.8449)  time: 0.3516  data: 0.0004  max mem: 28503
Epoch: [69]  [ 800/1251]  eta: 0:02:38  lr: 0.003698  min_lr: 0.003698  loss: 3.7246 (3.4517)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8284 (0.8288)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [69]  [1000/1251]  eta: 0:01:28  lr: 0.003696  min_lr: 0.003696  loss: 3.6514 (3.4533)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.3506  data: 0.0004  max mem: 28503
Epoch: [69]  [1200/1251]  eta: 0:00:17  lr: 0.003694  min_lr: 0.003694  loss: 3.5720 (3.4629)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6668 (nan)  time: 0.3447  data: 0.0004  max mem: 28503
Epoch: [69]  [1250/1251]  eta: 0:00:00  lr: 0.003694  min_lr: 0.003694  loss: 3.5336 (3.4665)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9422 (nan)  time: 0.2922  data: 0.0005  max mem: 28503
Epoch: [69] Total time: 0:07:17 (0.3500 s / it)
Averaged stats: lr: 0.003694  min_lr: 0.003694  loss: 3.5336 (3.4854)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9422 (nan)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.8022 (0.8022)  acc1: 84.8000 (84.8000)  acc5: 98.4000 (98.4000)  time: 5.6212  data: 5.4189  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9309 (0.9689)  acc1: 80.8000 (80.1818)  acc5: 96.8000 (96.4000)  time: 0.6886  data: 0.5159  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1716 (1.1504)  acc1: 74.0000 (76.6476)  acc5: 93.6000 (93.7143)  time: 0.1819  data: 0.0129  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.2362 (1.1542)  acc1: 74.4000 (76.4480)  acc5: 93.2000 (93.8240)  time: 0.1813  data: 0.0128  max mem: 28503
Test: Total time: 0:00:10 (0.4044 s / it)
* Acc@1 76.620 Acc@5 93.880 loss 1.148
Accuracy of the model on the 50000 test images: 76.6%
Max accuracy: 76.64%
Epoch: [70]  [   0/1251]  eta: 1:05:35  lr: 0.003694  min_lr: 0.003694  loss: 4.0903 (4.0903)  weight_decay: 0.0500 (0.0500)  time: 3.1458  data: 2.4017  max mem: 28503
Epoch: [70]  [ 200/1251]  eta: 0:06:23  lr: 0.003692  min_lr: 0.003692  loss: 3.4874 (3.4459)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6881 (0.7753)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [70]  [ 400/1251]  eta: 0:05:03  lr: 0.003690  min_lr: 0.003690  loss: 3.7367 (3.4535)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7938 (0.8380)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [70]  [ 600/1251]  eta: 0:03:50  lr: 0.003688  min_lr: 0.003688  loss: 3.6247 (3.4663)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6878 (0.8393)  time: 0.3447  data: 0.0004  max mem: 28503
Epoch: [70]  [ 800/1251]  eta: 0:02:38  lr: 0.003686  min_lr: 0.003686  loss: 3.5489 (3.4547)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7033 (0.8336)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [70]  [1000/1251]  eta: 0:01:28  lr: 0.003684  min_lr: 0.003684  loss: 3.6454 (3.4563)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6982 (0.8389)  time: 0.3462  data: 0.0005  max mem: 28503
Epoch: [70]  [1200/1251]  eta: 0:00:17  lr: 0.003682  min_lr: 0.003682  loss: 3.5685 (3.4676)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8542 (0.8337)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [70]  [1250/1251]  eta: 0:00:00  lr: 0.003682  min_lr: 0.003682  loss: 3.6956 (3.4702)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8422 (0.8355)  time: 0.2923  data: 0.0007  max mem: 28503
Epoch: [70] Total time: 0:07:18 (0.3503 s / it)
Averaged stats: lr: 0.003682  min_lr: 0.003682  loss: 3.6956 (3.4699)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8422 (0.8355)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.7662 (0.7662)  acc1: 86.4000 (86.4000)  acc5: 99.2000 (99.2000)  time: 5.7269  data: 5.5310  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9474 (0.9462)  acc1: 82.8000 (80.4000)  acc5: 96.4000 (96.6909)  time: 0.6882  data: 0.5151  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1436 (1.1184)  acc1: 74.0000 (76.3619)  acc5: 92.0000 (94.0191)  time: 0.1821  data: 0.0068  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.2383 (1.1246)  acc1: 74.0000 (76.1120)  acc5: 92.0000 (94.0000)  time: 0.1828  data: 0.0068  max mem: 28503
Test: Total time: 0:00:10 (0.4117 s / it)
* Acc@1 76.978 Acc@5 94.024 loss 1.115
Accuracy of the model on the 50000 test images: 77.0%
Max accuracy: 76.98%
Epoch: [71]  [   0/1251]  eta: 0:54:18  lr: 0.003681  min_lr: 0.003681  loss: 3.4226 (3.4226)  weight_decay: 0.0500 (0.0500)  time: 2.6046  data: 2.2198  max mem: 28503
Epoch: [71]  [ 200/1251]  eta: 0:06:18  lr: 0.003680  min_lr: 0.003680  loss: 3.5630 (3.4462)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6933 (0.7145)  time: 0.3483  data: 0.0004  max mem: 28503
Epoch: [71]  [ 400/1251]  eta: 0:05:00  lr: 0.003678  min_lr: 0.003678  loss: 3.6674 (3.4548)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0075 (0.8580)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [71]  [ 600/1251]  eta: 0:03:49  lr: 0.003676  min_lr: 0.003676  loss: 3.3190 (3.4293)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6867 (0.7978)  time: 0.3466  data: 0.0004  max mem: 28503
Epoch: [71]  [ 800/1251]  eta: 0:02:38  lr: 0.003674  min_lr: 0.003674  loss: 3.3711 (3.4305)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7730 (0.7955)  time: 0.3473  data: 0.0004  max mem: 28503
Epoch: [71]  [1000/1251]  eta: 0:01:27  lr: 0.003672  min_lr: 0.003672  loss: 3.6069 (3.4502)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9120 (0.8099)  time: 0.3533  data: 0.0004  max mem: 28503
Epoch: [71]  [1200/1251]  eta: 0:00:17  lr: 0.003670  min_lr: 0.003670  loss: 3.7991 (3.4589)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6102 (0.8184)  time: 0.3520  data: 0.0004  max mem: 28503
Epoch: [71]  [1250/1251]  eta: 0:00:00  lr: 0.003669  min_lr: 0.003669  loss: 3.5122 (3.4606)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6818 (0.8193)  time: 0.2916  data: 0.0007  max mem: 28503
Epoch: [71] Total time: 0:07:17 (0.3495 s / it)
Averaged stats: lr: 0.003669  min_lr: 0.003669  loss: 3.5122 (3.4652)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6818 (0.8193)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.7395 (0.7395)  acc1: 86.4000 (86.4000)  acc5: 98.4000 (98.4000)  time: 5.5680  data: 5.3845  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9265 (0.9317)  acc1: 80.4000 (81.4909)  acc5: 96.8000 (96.2182)  time: 0.7284  data: 0.5558  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1379 (1.1127)  acc1: 76.0000 (77.1429)  acc5: 93.6000 (93.8476)  time: 0.2066  data: 0.0365  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.2708 (1.1203)  acc1: 72.8000 (76.5440)  acc5: 93.2000 (93.8400)  time: 0.2061  data: 0.0365  max mem: 28503
Test: Total time: 0:00:10 (0.4190 s / it)
* Acc@1 76.842 Acc@5 94.018 loss 1.109
Accuracy of the model on the 50000 test images: 76.8%
Max accuracy: 76.98%
Epoch: [72]  [   0/1251]  eta: 1:12:37  lr: 0.003669  min_lr: 0.003669  loss: 2.4972 (2.4972)  weight_decay: 0.0500 (0.0500)  time: 3.4832  data: 3.0694  max mem: 28503
Epoch: [72]  [ 200/1251]  eta: 0:06:22  lr: 0.003667  min_lr: 0.003667  loss: 3.5821 (3.3851)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7243 (0.8801)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [72]  [ 400/1251]  eta: 0:05:02  lr: 0.003665  min_lr: 0.003665  loss: 3.6232 (3.4091)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7641 (0.8362)  time: 0.3462  data: 0.0004  max mem: 28503
Epoch: [72]  [ 600/1251]  eta: 0:03:49  lr: 0.003663  min_lr: 0.003663  loss: 3.5710 (3.4236)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8606 (0.8407)  time: 0.3468  data: 0.0004  max mem: 28503
Epoch: [72]  [ 800/1251]  eta: 0:02:38  lr: 0.003661  min_lr: 0.003661  loss: 3.4763 (3.4283)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7146 (0.8418)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [72]  [1000/1251]  eta: 0:01:28  lr: 0.003659  min_lr: 0.003659  loss: 3.7318 (3.4381)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6592 (0.8255)  time: 0.3460  data: 0.0004  max mem: 28503
Epoch: [72]  [1200/1251]  eta: 0:00:17  lr: 0.003657  min_lr: 0.003657  loss: 3.6725 (3.4444)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8396 (0.8302)  time: 0.3473  data: 0.0004  max mem: 28503
Epoch: [72]  [1250/1251]  eta: 0:00:00  lr: 0.003657  min_lr: 0.003657  loss: 3.6206 (3.4452)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7589 (0.8275)  time: 0.2923  data: 0.0006  max mem: 28503
Epoch: [72] Total time: 0:07:18 (0.3505 s / it)
Averaged stats: lr: 0.003657  min_lr: 0.003657  loss: 3.6206 (3.4634)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7589 (0.8275)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.7088 (0.7088)  acc1: 87.6000 (87.6000)  acc5: 98.8000 (98.8000)  time: 5.5598  data: 5.3697  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9107 (0.9171)  acc1: 78.8000 (80.3273)  acc5: 96.4000 (96.2182)  time: 0.6895  data: 0.5187  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0665 (1.0966)  acc1: 73.6000 (76.6476)  acc5: 93.2000 (93.6381)  time: 0.1855  data: 0.0169  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.2437 (1.1067)  acc1: 73.2000 (76.3040)  acc5: 91.6000 (93.5040)  time: 0.1852  data: 0.0167  max mem: 28503
Test: Total time: 0:00:10 (0.4055 s / it)
* Acc@1 76.764 Acc@5 93.888 loss 1.086
Accuracy of the model on the 50000 test images: 76.8%
Max accuracy: 76.98%
Epoch: [73]  [   0/1251]  eta: 1:12:27  lr: 0.003657  min_lr: 0.003657  loss: 3.8302 (3.8302)  weight_decay: 0.0500 (0.0500)  time: 3.4756  data: 3.0158  max mem: 28503
Epoch: [73]  [ 200/1251]  eta: 0:06:22  lr: 0.003655  min_lr: 0.003655  loss: 3.7095 (3.4477)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6899 (0.7604)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [73]  [ 400/1251]  eta: 0:05:03  lr: 0.003653  min_lr: 0.003653  loss: 3.3354 (3.4202)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6779 (0.7755)  time: 0.3556  data: 0.0004  max mem: 28503
Epoch: [73]  [ 600/1251]  eta: 0:03:50  lr: 0.003651  min_lr: 0.003651  loss: 3.5638 (3.4315)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6934 (0.7572)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [73]  [ 800/1251]  eta: 0:02:38  lr: 0.003649  min_lr: 0.003649  loss: 3.7790 (3.4160)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7012 (0.7837)  time: 0.3465  data: 0.0004  max mem: 28503
Epoch: [73]  [1000/1251]  eta: 0:01:28  lr: 0.003647  min_lr: 0.003647  loss: 3.4928 (3.4204)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5796 (0.7797)  time: 0.3546  data: 0.0004  max mem: 28503
Epoch: [73]  [1200/1251]  eta: 0:00:17  lr: 0.003645  min_lr: 0.003645  loss: 3.3331 (3.4250)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6929 (0.7821)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [73]  [1250/1251]  eta: 0:00:00  lr: 0.003644  min_lr: 0.003644  loss: 3.5452 (3.4286)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6929 (0.7770)  time: 0.2917  data: 0.0007  max mem: 28503
Epoch: [73] Total time: 0:07:18 (0.3505 s / it)
Averaged stats: lr: 0.003644  min_lr: 0.003644  loss: 3.5452 (3.4531)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6929 (0.7770)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.8066 (0.8066)  acc1: 85.2000 (85.2000)  acc5: 97.2000 (97.2000)  time: 5.6902  data: 5.4875  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9968 (0.9705)  acc1: 81.2000 (80.8364)  acc5: 96.4000 (96.3636)  time: 0.6712  data: 0.4992  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1457 (1.1499)  acc1: 74.0000 (77.0286)  acc5: 94.0000 (93.7714)  time: 0.1716  data: 0.0029  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.2683 (1.1603)  acc1: 74.0000 (76.6080)  acc5: 92.0000 (93.7440)  time: 0.1801  data: 0.0118  max mem: 28503
Test: Total time: 0:00:10 (0.4086 s / it)
* Acc@1 77.074 Acc@5 94.000 loss 1.151
Accuracy of the model on the 50000 test images: 77.1%
Max accuracy: 77.07%
Epoch: [74]  [   0/1251]  eta: 1:04:29  lr: 0.003644  min_lr: 0.003644  loss: 2.6591 (2.6591)  weight_decay: 0.0500 (0.0500)  time: 3.0930  data: 2.7092  max mem: 28503
Epoch: [74]  [ 200/1251]  eta: 0:06:19  lr: 0.003642  min_lr: 0.003642  loss: 3.5494 (3.3985)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6846 (0.8397)  time: 0.3461  data: 0.0004  max mem: 28503
Epoch: [74]  [ 400/1251]  eta: 0:05:02  lr: 0.003640  min_lr: 0.003640  loss: 3.4430 (3.4112)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6962 (0.8116)  time: 0.3507  data: 0.0004  max mem: 28503
Epoch: [74]  [ 600/1251]  eta: 0:03:49  lr: 0.003638  min_lr: 0.003638  loss: 3.7531 (3.4207)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7146 (0.7996)  time: 0.3462  data: 0.0005  max mem: 28503
Epoch: [74]  [ 800/1251]  eta: 0:02:38  lr: 0.003636  min_lr: 0.003636  loss: 3.6274 (3.4423)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8595 (0.8334)  time: 0.3463  data: 0.0004  max mem: 28503
Epoch: [74]  [1000/1251]  eta: 0:01:28  lr: 0.003634  min_lr: 0.003634  loss: 3.6539 (3.4357)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9603 (0.8345)  time: 0.3457  data: 0.0003  max mem: 28503
Epoch: [74]  [1200/1251]  eta: 0:00:17  lr: 0.003632  min_lr: 0.003632  loss: 3.3301 (3.4395)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7533 (0.8279)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [74]  [1250/1251]  eta: 0:00:00  lr: 0.003631  min_lr: 0.003631  loss: 3.2558 (3.4361)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9835 (0.8378)  time: 0.2994  data: 0.0005  max mem: 28503
Epoch: [74] Total time: 0:07:17 (0.3501 s / it)
Averaged stats: lr: 0.003631  min_lr: 0.003631  loss: 3.2558 (3.4563)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9835 (0.8378)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.7146 (0.7146)  acc1: 86.4000 (86.4000)  acc5: 98.4000 (98.4000)  time: 5.2963  data: 5.1008  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8699 (0.8928)  acc1: 80.8000 (79.9273)  acc5: 96.4000 (96.4364)  time: 0.7047  data: 0.5321  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0938 (1.0764)  acc1: 72.8000 (76.1524)  acc5: 93.6000 (93.9238)  time: 0.2078  data: 0.0377  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1426 (1.0793)  acc1: 73.2000 (76.0960)  acc5: 92.4000 (93.9040)  time: 0.2068  data: 0.0376  max mem: 28503
Test: Total time: 0:00:10 (0.4084 s / it)
* Acc@1 76.998 Acc@5 94.050 loss 1.070
Accuracy of the model on the 50000 test images: 77.0%
Max accuracy: 77.07%
Epoch: [75]  [   0/1251]  eta: 1:11:37  lr: 0.003631  min_lr: 0.003631  loss: 3.1150 (3.1150)  weight_decay: 0.0500 (0.0500)  time: 3.4350  data: 2.3103  max mem: 28503
Epoch: [75]  [ 200/1251]  eta: 0:06:22  lr: 0.003629  min_lr: 0.003629  loss: 3.5511 (3.4550)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6770 (0.7711)  time: 0.3460  data: 0.0004  max mem: 28503
Epoch: [75]  [ 400/1251]  eta: 0:05:03  lr: 0.003627  min_lr: 0.003627  loss: 3.6876 (3.4556)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8550 (0.8080)  time: 0.3486  data: 0.0004  max mem: 28503
Epoch: [75]  [ 600/1251]  eta: 0:03:50  lr: 0.003625  min_lr: 0.003625  loss: 3.3450 (3.4583)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7132 (0.7979)  time: 0.3481  data: 0.0004  max mem: 28503
Epoch: [75]  [ 800/1251]  eta: 0:02:38  lr: 0.003623  min_lr: 0.003623  loss: 3.8057 (3.4650)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8911 (0.8403)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [75]  [1000/1251]  eta: 0:01:28  lr: 0.003621  min_lr: 0.003621  loss: 3.6651 (3.4683)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7356 (0.8201)  time: 0.3522  data: 0.0004  max mem: 28503
Epoch: [75]  [1200/1251]  eta: 0:00:17  lr: 0.003619  min_lr: 0.003619  loss: 3.3864 (3.4750)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8565 (0.8235)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [75]  [1250/1251]  eta: 0:00:00  lr: 0.003618  min_lr: 0.003618  loss: 3.7148 (3.4754)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6052 (0.8171)  time: 0.2913  data: 0.0005  max mem: 28503
Epoch: [75] Total time: 0:07:17 (0.3500 s / it)
Averaged stats: lr: 0.003618  min_lr: 0.003618  loss: 3.7148 (3.4513)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6052 (0.8171)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.7905 (0.7905)  acc1: 87.2000 (87.2000)  acc5: 98.0000 (98.0000)  time: 5.6205  data: 5.4239  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.0241 (1.0132)  acc1: 81.6000 (81.5273)  acc5: 96.4000 (96.2182)  time: 0.7077  data: 0.5363  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1535 (1.1827)  acc1: 75.6000 (77.2571)  acc5: 93.6000 (94.0762)  time: 0.1942  data: 0.0238  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.2891 (1.2004)  acc1: 73.6000 (77.0080)  acc5: 93.2000 (93.9040)  time: 0.1992  data: 0.0289  max mem: 28503
Test: Total time: 0:00:10 (0.4149 s / it)
* Acc@1 76.874 Acc@5 94.036 loss 1.195
Accuracy of the model on the 50000 test images: 76.9%
Max accuracy: 77.07%
Epoch: [76]  [   0/1251]  eta: 1:12:11  lr: 0.003618  min_lr: 0.003618  loss: 3.7666 (3.7666)  weight_decay: 0.0500 (0.0500)  time: 3.4625  data: 2.1578  max mem: 28503
Epoch: [76]  [ 200/1251]  eta: 0:06:22  lr: 0.003616  min_lr: 0.003616  loss: 3.5661 (3.4196)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7274 (0.7235)  time: 0.3467  data: 0.0004  max mem: 28503
Epoch: [76]  [ 400/1251]  eta: 0:05:03  lr: 0.003614  min_lr: 0.003614  loss: 3.4462 (3.4497)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7817 (0.7693)  time: 0.3453  data: 0.0005  max mem: 28503
Epoch: [76]  [ 600/1251]  eta: 0:03:50  lr: 0.003612  min_lr: 0.003612  loss: 3.4629 (3.4298)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6497 (0.7794)  time: 0.3443  data: 0.0005  max mem: 28503
Epoch: [76]  [ 800/1251]  eta: 0:02:38  lr: 0.003610  min_lr: 0.003610  loss: 3.2764 (3.4281)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.3518  data: 0.0004  max mem: 28503
Epoch: [76]  [1000/1251]  eta: 0:01:28  lr: 0.003607  min_lr: 0.003607  loss: 3.4877 (3.4460)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7063 (nan)  time: 0.3468  data: 0.0005  max mem: 28503
Epoch: [76]  [1200/1251]  eta: 0:00:17  lr: 0.003605  min_lr: 0.003605  loss: 3.6314 (3.4402)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6240 (nan)  time: 0.3555  data: 0.0004  max mem: 28503
Epoch: [76]  [1250/1251]  eta: 0:00:00  lr: 0.003605  min_lr: 0.003605  loss: 3.5735 (3.4396)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6208 (nan)  time: 0.2918  data: 0.0007  max mem: 28503
Epoch: [76] Total time: 0:07:18 (0.3505 s / it)
Averaged stats: lr: 0.003605  min_lr: 0.003605  loss: 3.5735 (3.4463)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6208 (nan)
Test:  [ 0/25]  eta: 0:02:10  loss: 0.7302 (0.7302)  acc1: 87.2000 (87.2000)  acc5: 98.4000 (98.4000)  time: 5.2360  data: 5.0298  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.0219 (0.9623)  acc1: 80.4000 (81.3455)  acc5: 95.2000 (96.0727)  time: 0.7314  data: 0.5589  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1598 (1.1323)  acc1: 74.4000 (77.5429)  acc5: 94.0000 (93.8095)  time: 0.2248  data: 0.0560  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1598 (1.1466)  acc1: 74.0000 (76.9920)  acc5: 93.2000 (93.7120)  time: 0.2243  data: 0.0559  max mem: 28503
Test: Total time: 0:00:10 (0.4195 s / it)
* Acc@1 76.894 Acc@5 93.998 loss 1.135
Accuracy of the model on the 50000 test images: 76.9%
Max accuracy: 77.07%
Epoch: [77]  [   0/1251]  eta: 1:10:26  lr: 0.003605  min_lr: 0.003605  loss: 2.9068 (2.9068)  weight_decay: 0.0500 (0.0500)  time: 3.3784  data: 2.9301  max mem: 28503
Epoch: [77]  [ 200/1251]  eta: 0:06:23  lr: 0.003603  min_lr: 0.003603  loss: 3.2191 (3.3819)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7115 (0.8057)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [77]  [ 400/1251]  eta: 0:05:04  lr: 0.003601  min_lr: 0.003601  loss: 3.6126 (3.4213)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7385 (0.8291)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [77]  [ 600/1251]  eta: 0:03:50  lr: 0.003598  min_lr: 0.003598  loss: 3.6130 (3.4404)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6810 (0.7941)  time: 0.3473  data: 0.0004  max mem: 28503
Epoch: [77]  [ 800/1251]  eta: 0:02:38  lr: 0.003596  min_lr: 0.003596  loss: 3.4918 (3.4479)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8443 (0.8356)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [77]  [1000/1251]  eta: 0:01:28  lr: 0.003594  min_lr: 0.003594  loss: 3.5685 (3.4469)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5891 (0.8235)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [77]  [1200/1251]  eta: 0:00:17  lr: 0.003592  min_lr: 0.003592  loss: 3.5695 (3.4488)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6851 (0.8171)  time: 0.3519  data: 0.0004  max mem: 28503
Epoch: [77]  [1250/1251]  eta: 0:00:00  lr: 0.003591  min_lr: 0.003591  loss: 3.7279 (3.4509)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7314 (0.8153)  time: 0.2917  data: 0.0007  max mem: 28503
Epoch: [77] Total time: 0:07:18 (0.3501 s / it)
Averaged stats: lr: 0.003591  min_lr: 0.003591  loss: 3.7279 (3.4462)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7314 (0.8153)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.8428 (0.8428)  acc1: 87.2000 (87.2000)  acc5: 98.4000 (98.4000)  time: 5.5525  data: 5.3568  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.0033 (0.9965)  acc1: 81.2000 (80.9455)  acc5: 97.2000 (96.6182)  time: 0.6989  data: 0.5268  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1743 (1.1777)  acc1: 75.2000 (77.3143)  acc5: 92.8000 (94.4000)  time: 0.1911  data: 0.0220  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.3583 (1.1869)  acc1: 74.8000 (77.0400)  acc5: 92.8000 (94.3680)  time: 0.1903  data: 0.0219  max mem: 28503
Test: Total time: 0:00:10 (0.4052 s / it)
* Acc@1 77.292 Acc@5 94.256 loss 1.186
Accuracy of the model on the 50000 test images: 77.3%
Max accuracy: 77.29%
Epoch: [78]  [   0/1251]  eta: 0:59:26  lr: 0.003591  min_lr: 0.003591  loss: 3.6797 (3.6797)  weight_decay: 0.0500 (0.0500)  time: 2.8510  data: 2.4574  max mem: 28503
Epoch: [78]  [ 200/1251]  eta: 0:06:17  lr: 0.003589  min_lr: 0.003589  loss: 3.7039 (3.3884)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6315 (0.8505)  time: 0.3462  data: 0.0004  max mem: 28503
Epoch: [78]  [ 400/1251]  eta: 0:05:00  lr: 0.003587  min_lr: 0.003587  loss: 3.5867 (3.4315)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6266 (0.8707)  time: 0.3445  data: 0.0004  max mem: 28503
Epoch: [78]  [ 600/1251]  eta: 0:03:48  lr: 0.003585  min_lr: 0.003585  loss: 3.6237 (3.4082)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6701 (nan)  time: 0.3461  data: 0.0004  max mem: 28503
Epoch: [78]  [ 800/1251]  eta: 0:02:38  lr: 0.003583  min_lr: 0.003583  loss: 3.3048 (3.4098)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6764 (nan)  time: 0.3454  data: 0.0005  max mem: 28503
Epoch: [78]  [1000/1251]  eta: 0:01:27  lr: 0.003580  min_lr: 0.003580  loss: 3.4688 (3.4160)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7641 (nan)  time: 0.3511  data: 0.0004  max mem: 28503
Epoch: [78]  [1200/1251]  eta: 0:00:17  lr: 0.003578  min_lr: 0.003578  loss: 3.6219 (3.4297)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7251 (nan)  time: 0.3546  data: 0.0004  max mem: 28503
Epoch: [78]  [1250/1251]  eta: 0:00:00  lr: 0.003578  min_lr: 0.003578  loss: 3.2004 (3.4300)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6620 (nan)  time: 0.2919  data: 0.0006  max mem: 28503
Epoch: [78] Total time: 0:07:17 (0.3495 s / it)
Averaged stats: lr: 0.003578  min_lr: 0.003578  loss: 3.2004 (3.4313)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6620 (nan)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.7286 (0.7286)  acc1: 84.4000 (84.4000)  acc5: 98.4000 (98.4000)  time: 5.7660  data: 5.5531  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9523 (0.9417)  acc1: 82.0000 (80.6909)  acc5: 96.8000 (96.3636)  time: 0.6958  data: 0.5227  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1495 (1.1176)  acc1: 73.6000 (76.7619)  acc5: 94.0000 (94.2667)  time: 0.1787  data: 0.0099  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.2590 (1.1266)  acc1: 73.2000 (76.3840)  acc5: 93.2000 (94.2560)  time: 0.1782  data: 0.0098  max mem: 28503
Test: Total time: 0:00:10 (0.4102 s / it)
* Acc@1 77.168 Acc@5 94.224 loss 1.111
Accuracy of the model on the 50000 test images: 77.2%
Max accuracy: 77.29%
Epoch: [79]  [   0/1251]  eta: 1:09:07  lr: 0.003578  min_lr: 0.003578  loss: 3.3297 (3.3297)  weight_decay: 0.0500 (0.0500)  time: 3.3152  data: 2.3685  max mem: 28503
Epoch: [79]  [ 200/1251]  eta: 0:06:21  lr: 0.003575  min_lr: 0.003575  loss: 3.3769 (3.4577)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7563 (0.8600)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [79]  [ 400/1251]  eta: 0:05:03  lr: 0.003573  min_lr: 0.003573  loss: 3.6546 (3.4292)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7589 (0.8409)  time: 0.3461  data: 0.0005  max mem: 28503
Epoch: [79]  [ 600/1251]  eta: 0:03:50  lr: 0.003571  min_lr: 0.003571  loss: 3.6245 (3.4286)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8471 (0.8188)  time: 0.3469  data: 0.0004  max mem: 28503
Epoch: [79]  [ 800/1251]  eta: 0:02:38  lr: 0.003569  min_lr: 0.003569  loss: 3.5639 (3.4356)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8005 (0.8250)  time: 0.3562  data: 0.0005  max mem: 28503
Epoch: [79]  [1000/1251]  eta: 0:01:28  lr: 0.003567  min_lr: 0.003567  loss: 3.7763 (3.4482)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7779 (0.8312)  time: 0.3446  data: 0.0004  max mem: 28503
Epoch: [79]  [1200/1251]  eta: 0:00:17  lr: 0.003564  min_lr: 0.003564  loss: 3.7139 (3.4392)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7152 (0.8351)  time: 0.3462  data: 0.0004  max mem: 28503
Epoch: [79]  [1250/1251]  eta: 0:00:00  lr: 0.003564  min_lr: 0.003564  loss: 3.7560 (3.4419)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7647 (0.8365)  time: 0.2914  data: 0.0006  max mem: 28503
Epoch: [79] Total time: 0:07:18 (0.3504 s / it)
Averaged stats: lr: 0.003564  min_lr: 0.003564  loss: 3.7560 (3.4372)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7647 (0.8365)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.7367 (0.7367)  acc1: 88.0000 (88.0000)  acc5: 98.4000 (98.4000)  time: 5.4845  data: 5.2575  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 0.9720 (0.9809)  acc1: 81.2000 (81.6000)  acc5: 96.8000 (96.2182)  time: 0.6548  data: 0.4805  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.2394 (1.1589)  acc1: 75.2000 (77.5048)  acc5: 93.6000 (94.0381)  time: 0.1702  data: 0.0015  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.3003 (1.1666)  acc1: 75.2000 (77.2160)  acc5: 93.2000 (94.0640)  time: 0.1698  data: 0.0014  max mem: 28503
Test: Total time: 0:00:09 (0.3863 s / it)
* Acc@1 76.968 Acc@5 94.066 loss 1.166
Accuracy of the model on the 50000 test images: 77.0%
Max accuracy: 77.29%
Epoch: [80]  [   0/1251]  eta: 1:07:37  lr: 0.003564  min_lr: 0.003564  loss: 3.6243 (3.6243)  weight_decay: 0.0500 (0.0500)  time: 3.2430  data: 1.6800  max mem: 28503
Epoch: [80]  [ 200/1251]  eta: 0:06:19  lr: 0.003562  min_lr: 0.003562  loss: 3.5523 (3.4350)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7669 (0.8643)  time: 0.3463  data: 0.0004  max mem: 28503
Epoch: [80]  [ 400/1251]  eta: 0:05:02  lr: 0.003559  min_lr: 0.003559  loss: 3.4124 (3.4126)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7105 (0.8005)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [80]  [ 600/1251]  eta: 0:03:49  lr: 0.003557  min_lr: 0.003557  loss: 3.5465 (3.4152)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7228 (0.8031)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [80]  [ 800/1251]  eta: 0:02:38  lr: 0.003555  min_lr: 0.003555  loss: 3.5780 (3.4045)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7252 (0.7930)  time: 0.3549  data: 0.0004  max mem: 28503
Epoch: [80]  [1000/1251]  eta: 0:01:28  lr: 0.003553  min_lr: 0.003553  loss: 3.6569 (3.4112)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8416 (0.8006)  time: 0.3526  data: 0.0004  max mem: 28503
Epoch: [80]  [1200/1251]  eta: 0:00:17  lr: 0.003550  min_lr: 0.003550  loss: 3.4912 (3.4080)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6505 (nan)  time: 0.3519  data: 0.0004  max mem: 28503
Epoch: [80]  [1250/1251]  eta: 0:00:00  lr: 0.003550  min_lr: 0.003550  loss: 3.0163 (3.4054)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9682 (nan)  time: 0.2913  data: 0.0007  max mem: 28503
Epoch: [80] Total time: 0:07:18 (0.3502 s / it)
Averaged stats: lr: 0.003550  min_lr: 0.003550  loss: 3.0163 (3.4157)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9682 (nan)
Test:  [ 0/25]  eta: 0:02:28  loss: 0.5993 (0.5993)  acc1: 87.2000 (87.2000)  acc5: 99.2000 (99.2000)  time: 5.9314  data: 5.7296  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8296 (0.8487)  acc1: 81.6000 (81.3818)  acc5: 96.0000 (96.4000)  time: 0.6939  data: 0.5212  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0801 (1.0379)  acc1: 75.2000 (77.6571)  acc5: 93.6000 (94.1714)  time: 0.1694  data: 0.0002  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1788 (1.0450)  acc1: 74.4000 (77.4880)  acc5: 93.2000 (94.1920)  time: 0.1686  data: 0.0002  max mem: 28503
Test: Total time: 0:00:10 (0.4056 s / it)
* Acc@1 77.156 Acc@5 94.208 loss 1.044
Accuracy of the model on the 50000 test images: 77.2%
Max accuracy: 77.29%
Epoch: [81]  [   0/1251]  eta: 1:09:32  lr: 0.003550  min_lr: 0.003550  loss: 3.8156 (3.8156)  weight_decay: 0.0500 (0.0500)  time: 3.3353  data: 2.0416  max mem: 28503
Epoch: [81]  [ 200/1251]  eta: 0:06:21  lr: 0.003547  min_lr: 0.003547  loss: 3.6462 (3.4166)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6675 (0.7876)  time: 0.3470  data: 0.0004  max mem: 28503
Epoch: [81]  [ 400/1251]  eta: 0:05:02  lr: 0.003545  min_lr: 0.003545  loss: 3.5606 (3.4273)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7056 (0.7866)  time: 0.3565  data: 0.0004  max mem: 28503
Epoch: [81]  [ 600/1251]  eta: 0:03:49  lr: 0.003543  min_lr: 0.003543  loss: 3.6070 (3.4322)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8650 (0.8076)  time: 0.3448  data: 0.0004  max mem: 28503
Epoch: [81]  [ 800/1251]  eta: 0:02:38  lr: 0.003541  min_lr: 0.003541  loss: 3.6184 (3.4364)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7997 (0.8347)  time: 0.3534  data: 0.0004  max mem: 28503
Epoch: [81]  [1000/1251]  eta: 0:01:28  lr: 0.003538  min_lr: 0.003538  loss: 3.4007 (3.4421)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7097 (0.8287)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [81]  [1200/1251]  eta: 0:00:17  lr: 0.003536  min_lr: 0.003536  loss: 3.5390 (3.4383)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7117 (0.8129)  time: 0.3534  data: 0.0004  max mem: 28503
Epoch: [81]  [1250/1251]  eta: 0:00:00  lr: 0.003535  min_lr: 0.003535  loss: 3.5166 (3.4397)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7249 (0.8171)  time: 0.2922  data: 0.0007  max mem: 28503
Epoch: [81] Total time: 0:07:18 (0.3503 s / it)
Averaged stats: lr: 0.003535  min_lr: 0.003535  loss: 3.5166 (3.4205)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7249 (0.8171)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.7050 (0.7050)  acc1: 85.6000 (85.6000)  acc5: 97.6000 (97.6000)  time: 5.8534  data: 5.6546  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9008 (0.9287)  acc1: 81.2000 (80.8727)  acc5: 96.8000 (96.3636)  time: 0.7206  data: 0.5480  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1332 (1.1072)  acc1: 74.0000 (76.8952)  acc5: 94.0000 (94.0952)  time: 0.1879  data: 0.0187  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.2463 (1.1178)  acc1: 74.0000 (76.7040)  acc5: 92.4000 (94.0640)  time: 0.1871  data: 0.0186  max mem: 28503
Test: Total time: 0:00:10 (0.4151 s / it)
* Acc@1 77.362 Acc@5 94.158 loss 1.101
Accuracy of the model on the 50000 test images: 77.4%
Max accuracy: 77.36%
Epoch: [82]  [   0/1251]  eta: 1:02:42  lr: 0.003535  min_lr: 0.003535  loss: 2.9744 (2.9744)  weight_decay: 0.0500 (0.0500)  time: 3.0079  data: 2.6455  max mem: 28503
Epoch: [82]  [ 200/1251]  eta: 0:06:18  lr: 0.003533  min_lr: 0.003533  loss: 3.4285 (3.3578)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6966 (0.8139)  time: 0.3557  data: 0.0004  max mem: 28503
Epoch: [82]  [ 400/1251]  eta: 0:05:00  lr: 0.003531  min_lr: 0.003531  loss: 3.6847 (3.4051)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6527 (0.8042)  time: 0.3461  data: 0.0004  max mem: 28503
Epoch: [82]  [ 600/1251]  eta: 0:03:48  lr: 0.003528  min_lr: 0.003528  loss: 3.6031 (3.4122)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7584 (0.7883)  time: 0.3476  data: 0.0004  max mem: 28503
Epoch: [82]  [ 800/1251]  eta: 0:02:38  lr: 0.003526  min_lr: 0.003526  loss: 3.4787 (3.4087)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7454 (0.7908)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [82]  [1000/1251]  eta: 0:01:27  lr: 0.003524  min_lr: 0.003524  loss: 3.5993 (3.4134)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7048 (0.8014)  time: 0.3444  data: 0.0005  max mem: 28503
Epoch: [82]  [1200/1251]  eta: 0:00:17  lr: 0.003521  min_lr: 0.003521  loss: 3.6038 (3.4275)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7746 (0.8214)  time: 0.3468  data: 0.0004  max mem: 28503
Epoch: [82]  [1250/1251]  eta: 0:00:00  lr: 0.003521  min_lr: 0.003521  loss: 3.7197 (3.4275)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7111 (0.8186)  time: 0.2915  data: 0.0005  max mem: 28503
Epoch: [82] Total time: 0:07:17 (0.3494 s / it)
Averaged stats: lr: 0.003521  min_lr: 0.003521  loss: 3.7197 (3.4193)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7111 (0.8186)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6515 (0.6515)  acc1: 87.6000 (87.6000)  acc5: 98.8000 (98.8000)  time: 5.6977  data: 5.4753  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9608 (0.9168)  acc1: 81.2000 (81.3818)  acc5: 97.6000 (96.5091)  time: 0.6779  data: 0.5045  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1511 (1.1040)  acc1: 73.2000 (77.4095)  acc5: 93.2000 (94.0571)  time: 0.1741  data: 0.0056  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.2411 (1.1084)  acc1: 73.6000 (77.2160)  acc5: 92.4000 (93.8720)  time: 0.1739  data: 0.0055  max mem: 28503
Test: Total time: 0:00:09 (0.3975 s / it)
* Acc@1 77.444 Acc@5 94.164 loss 1.096
Accuracy of the model on the 50000 test images: 77.4%
Max accuracy: 77.44%
Epoch: [83]  [   0/1251]  eta: 1:04:26  lr: 0.003521  min_lr: 0.003521  loss: 3.1075 (3.1075)  weight_decay: 0.0500 (0.0500)  time: 3.0909  data: 2.7395  max mem: 28503
Epoch: [83]  [ 200/1251]  eta: 0:06:18  lr: 0.003519  min_lr: 0.003519  loss: 3.4908 (3.3457)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9117 (0.9062)  time: 0.3446  data: 0.0004  max mem: 28503
Epoch: [83]  [ 400/1251]  eta: 0:05:02  lr: 0.003516  min_lr: 0.003516  loss: 3.5925 (3.4028)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6286 (0.8408)  time: 0.3582  data: 0.0004  max mem: 28503
Epoch: [83]  [ 600/1251]  eta: 0:03:49  lr: 0.003514  min_lr: 0.003514  loss: 3.7345 (3.3978)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5806 (0.8133)  time: 0.3464  data: 0.0004  max mem: 28503
Epoch: [83]  [ 800/1251]  eta: 0:02:38  lr: 0.003512  min_lr: 0.003512  loss: 3.6429 (3.3941)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6560 (0.8070)  time: 0.3538  data: 0.0004  max mem: 28503
Epoch: [83]  [1000/1251]  eta: 0:01:27  lr: 0.003509  min_lr: 0.003509  loss: 3.3417 (3.3961)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8341 (0.8142)  time: 0.3503  data: 0.0004  max mem: 28503
Epoch: [83]  [1200/1251]  eta: 0:00:17  lr: 0.003507  min_lr: 0.003507  loss: 3.5298 (3.4000)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9266 (0.8322)  time: 0.3541  data: 0.0005  max mem: 28503
Epoch: [83]  [1250/1251]  eta: 0:00:00  lr: 0.003506  min_lr: 0.003506  loss: 3.3282 (3.4010)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8322 (0.8341)  time: 0.2917  data: 0.0007  max mem: 28503
Epoch: [83] Total time: 0:07:17 (0.3499 s / it)
Averaged stats: lr: 0.003506  min_lr: 0.003506  loss: 3.3282 (3.4106)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8322 (0.8341)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.6705 (0.6705)  acc1: 86.4000 (86.4000)  acc5: 98.0000 (98.0000)  time: 5.4721  data: 5.2613  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8659 (0.8801)  acc1: 81.6000 (81.4545)  acc5: 96.4000 (96.4727)  time: 0.6846  data: 0.5112  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1000 (1.0600)  acc1: 74.8000 (77.6000)  acc5: 93.6000 (94.3429)  time: 0.1872  data: 0.0182  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1950 (1.0737)  acc1: 74.4000 (77.1040)  acc5: 93.6000 (94.2560)  time: 0.1911  data: 0.0226  max mem: 28503
Test: Total time: 0:00:10 (0.4024 s / it)
* Acc@1 77.550 Acc@5 94.256 loss 1.066
Accuracy of the model on the 50000 test images: 77.6%
Max accuracy: 77.55%
Epoch: [84]  [   0/1251]  eta: 1:07:10  lr: 0.003506  min_lr: 0.003506  loss: 3.7283 (3.7283)  weight_decay: 0.0500 (0.0500)  time: 3.2216  data: 2.8583  max mem: 28503
Epoch: [84]  [ 200/1251]  eta: 0:06:19  lr: 0.003504  min_lr: 0.003504  loss: 3.4551 (3.3564)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7934 (0.8144)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [84]  [ 400/1251]  eta: 0:05:02  lr: 0.003502  min_lr: 0.003502  loss: 3.6123 (3.3795)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7808 (0.7608)  time: 0.3544  data: 0.0004  max mem: 28503
Epoch: [84]  [ 600/1251]  eta: 0:03:49  lr: 0.003499  min_lr: 0.003499  loss: 3.5328 (3.3797)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8672 (0.7913)  time: 0.3447  data: 0.0004  max mem: 28503
Epoch: [84]  [ 800/1251]  eta: 0:02:38  lr: 0.003497  min_lr: 0.003497  loss: 3.4333 (3.3903)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7339 (0.7780)  time: 0.3654  data: 0.0004  max mem: 28503
Epoch: [84]  [1000/1251]  eta: 0:01:28  lr: 0.003494  min_lr: 0.003494  loss: 3.7054 (3.4032)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8351 (0.7921)  time: 0.3473  data: 0.0005  max mem: 28503
Epoch: [84]  [1200/1251]  eta: 0:00:17  lr: 0.003492  min_lr: 0.003492  loss: 3.5061 (3.4102)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8226 (0.8072)  time: 0.3459  data: 0.0005  max mem: 28503
Epoch: [84]  [1250/1251]  eta: 0:00:00  lr: 0.003491  min_lr: 0.003491  loss: 3.4604 (3.4108)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6889 (0.8077)  time: 0.2915  data: 0.0006  max mem: 28503
Epoch: [84] Total time: 0:07:17 (0.3499 s / it)
Averaged stats: lr: 0.003491  min_lr: 0.003491  loss: 3.4604 (3.4175)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6889 (0.8077)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6811 (0.6811)  acc1: 88.0000 (88.0000)  acc5: 99.6000 (99.6000)  time: 5.6910  data: 5.4874  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 0.9490 (0.9183)  acc1: 83.2000 (81.4182)  acc5: 97.6000 (96.7273)  time: 0.7359  data: 0.5629  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1198 (1.0950)  acc1: 75.2000 (77.5048)  acc5: 94.8000 (94.6476)  time: 0.2188  data: 0.0496  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.2330 (1.1027)  acc1: 74.0000 (76.9600)  acc5: 93.2000 (94.6080)  time: 0.2180  data: 0.0495  max mem: 28503
Test: Total time: 0:00:10 (0.4331 s / it)
* Acc@1 77.518 Acc@5 94.380 loss 1.098
Accuracy of the model on the 50000 test images: 77.5%
Max accuracy: 77.55%
Epoch: [85]  [   0/1251]  eta: 1:12:34  lr: 0.003491  min_lr: 0.003491  loss: 2.7861 (2.7861)  weight_decay: 0.0500 (0.0500)  time: 3.4811  data: 1.6492  max mem: 28503
Epoch: [85]  [ 200/1251]  eta: 0:06:21  lr: 0.003489  min_lr: 0.003489  loss: 3.1779 (3.3878)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6719 (0.7379)  time: 0.3562  data: 0.0004  max mem: 28503
Epoch: [85]  [ 400/1251]  eta: 0:05:02  lr: 0.003487  min_lr: 0.003487  loss: 3.2165 (3.3798)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8063 (0.8011)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [85]  [ 600/1251]  eta: 0:03:49  lr: 0.003484  min_lr: 0.003484  loss: 3.7189 (3.3984)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6850 (0.7621)  time: 0.3468  data: 0.0003  max mem: 28503
Epoch: [85]  [ 800/1251]  eta: 0:02:38  lr: 0.003482  min_lr: 0.003482  loss: 3.2393 (3.3878)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7835 (0.8115)  time: 0.3579  data: 0.0004  max mem: 28503
Epoch: [85]  [1000/1251]  eta: 0:01:27  lr: 0.003479  min_lr: 0.003479  loss: 3.4876 (3.3965)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6975 (0.8057)  time: 0.3518  data: 0.0004  max mem: 28503
Epoch: [85]  [1200/1251]  eta: 0:00:17  lr: 0.003477  min_lr: 0.003477  loss: 3.4156 (3.3943)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8709 (0.8220)  time: 0.3471  data: 0.0004  max mem: 28503
Epoch: [85]  [1250/1251]  eta: 0:00:00  lr: 0.003476  min_lr: 0.003476  loss: 3.5765 (3.3962)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6931 (0.8162)  time: 0.2919  data: 0.0007  max mem: 28503
Epoch: [85] Total time: 0:07:17 (0.3499 s / it)
Averaged stats: lr: 0.003476  min_lr: 0.003476  loss: 3.5765 (3.4030)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6931 (0.8162)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.7150 (0.7150)  acc1: 86.4000 (86.4000)  acc5: 98.8000 (98.8000)  time: 5.3142  data: 5.1070  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8593 (0.9048)  acc1: 81.6000 (81.2000)  acc5: 97.6000 (96.7636)  time: 0.6908  data: 0.5179  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0871 (1.0855)  acc1: 74.8000 (77.6000)  acc5: 93.2000 (94.4381)  time: 0.2063  data: 0.0374  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.2076 (1.0970)  acc1: 74.8000 (77.3120)  acc5: 92.8000 (94.3840)  time: 0.2057  data: 0.0372  max mem: 28503
Test: Total time: 0:00:10 (0.4079 s / it)
* Acc@1 77.730 Acc@5 94.356 loss 1.091
Accuracy of the model on the 50000 test images: 77.7%
Max accuracy: 77.73%
Epoch: [86]  [   0/1251]  eta: 1:07:42  lr: 0.003476  min_lr: 0.003476  loss: 3.7093 (3.7093)  weight_decay: 0.0500 (0.0500)  time: 3.2470  data: 2.8785  max mem: 28503
Epoch: [86]  [ 200/1251]  eta: 0:06:20  lr: 0.003474  min_lr: 0.003474  loss: 3.5550 (3.3924)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6937 (0.7228)  time: 0.3464  data: 0.0004  max mem: 28503
Epoch: [86]  [ 400/1251]  eta: 0:05:01  lr: 0.003472  min_lr: 0.003472  loss: 3.4704 (3.4188)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6532 (0.7610)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [86]  [ 600/1251]  eta: 0:03:48  lr: 0.003469  min_lr: 0.003469  loss: 3.5152 (3.4445)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8491 (0.8195)  time: 0.3460  data: 0.0004  max mem: 28503
Epoch: [86]  [ 800/1251]  eta: 0:02:38  lr: 0.003467  min_lr: 0.003467  loss: 3.3690 (3.4345)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0448 (0.8344)  time: 0.3636  data: 0.0004  max mem: 28503
Epoch: [86]  [1000/1251]  eta: 0:01:27  lr: 0.003464  min_lr: 0.003464  loss: 3.5260 (3.4347)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6670 (0.8101)  time: 0.3468  data: 0.0004  max mem: 28503
Epoch: [86]  [1200/1251]  eta: 0:00:17  lr: 0.003462  min_lr: 0.003462  loss: 3.4616 (3.4323)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8080 (0.8011)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [86]  [1250/1251]  eta: 0:00:00  lr: 0.003461  min_lr: 0.003461  loss: 2.9556 (3.4271)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9089 (0.8048)  time: 0.2921  data: 0.0005  max mem: 28503
Epoch: [86] Total time: 0:07:17 (0.3494 s / it)
Averaged stats: lr: 0.003461  min_lr: 0.003461  loss: 2.9556 (3.4005)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9089 (0.8048)
Test:  [ 0/25]  eta: 0:02:11  loss: 0.6288 (0.6288)  acc1: 86.4000 (86.4000)  acc5: 98.0000 (98.0000)  time: 5.2746  data: 5.0536  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 0.8355 (0.8339)  acc1: 82.8000 (81.4909)  acc5: 97.2000 (96.2909)  time: 0.6597  data: 0.4858  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0143 (1.0094)  acc1: 75.6000 (77.8667)  acc5: 93.2000 (94.2286)  time: 0.1956  data: 0.0268  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1327 (1.0212)  acc1: 75.6000 (77.2960)  acc5: 92.8000 (94.1600)  time: 0.1990  data: 0.0305  max mem: 28503
Test: Total time: 0:00:10 (0.4007 s / it)
* Acc@1 77.434 Acc@5 94.310 loss 1.019
Accuracy of the model on the 50000 test images: 77.4%
Max accuracy: 77.73%
Epoch: [87]  [   0/1251]  eta: 1:08:06  lr: 0.003461  min_lr: 0.003461  loss: 2.5818 (2.5818)  weight_decay: 0.0500 (0.0500)  time: 3.2664  data: 2.6737  max mem: 28503
Epoch: [87]  [ 200/1251]  eta: 0:06:21  lr: 0.003459  min_lr: 0.003459  loss: 3.2610 (3.3328)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6715 (0.8792)  time: 0.3530  data: 0.0004  max mem: 28503
Epoch: [87]  [ 400/1251]  eta: 0:05:03  lr: 0.003456  min_lr: 0.003456  loss: 3.2821 (3.3561)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7492 (0.8083)  time: 0.3446  data: 0.0004  max mem: 28503
Epoch: [87]  [ 600/1251]  eta: 0:03:50  lr: 0.003454  min_lr: 0.003454  loss: 3.4291 (3.3493)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8254 (0.8304)  time: 0.3444  data: 0.0004  max mem: 28503
Epoch: [87]  [ 800/1251]  eta: 0:02:38  lr: 0.003451  min_lr: 0.003451  loss: 3.4431 (3.3627)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7700 (0.8323)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [87]  [1000/1251]  eta: 0:01:28  lr: 0.003449  min_lr: 0.003449  loss: 3.3868 (3.3716)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5426 (0.8119)  time: 0.3547  data: 0.0004  max mem: 28503
Epoch: [87]  [1200/1251]  eta: 0:00:17  lr: 0.003446  min_lr: 0.003446  loss: 3.4016 (3.3683)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7215 (0.8281)  time: 0.3459  data: 0.0005  max mem: 28503
Epoch: [87]  [1250/1251]  eta: 0:00:00  lr: 0.003446  min_lr: 0.003446  loss: 3.4298 (3.3705)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6516 (0.8241)  time: 0.2920  data: 0.0007  max mem: 28503
Epoch: [87] Total time: 0:07:18 (0.3506 s / it)
Averaged stats: lr: 0.003446  min_lr: 0.003446  loss: 3.4298 (3.3933)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6516 (0.8241)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.7106 (0.7106)  acc1: 88.0000 (88.0000)  acc5: 98.8000 (98.8000)  time: 5.4730  data: 5.2651  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9390 (0.9213)  acc1: 81.6000 (81.4182)  acc5: 96.8000 (96.6545)  time: 0.6907  data: 0.5177  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1076 (1.1039)  acc1: 75.6000 (77.7143)  acc5: 94.4000 (94.2857)  time: 0.1905  data: 0.0215  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.2348 (1.1083)  acc1: 75.6000 (77.6160)  acc5: 92.8000 (94.2880)  time: 0.1900  data: 0.0214  max mem: 28503
Test: Total time: 0:00:10 (0.4015 s / it)
* Acc@1 77.514 Acc@5 94.314 loss 1.104
Accuracy of the model on the 50000 test images: 77.5%
Max accuracy: 77.73%
Epoch: [88]  [   0/1251]  eta: 1:13:57  lr: 0.003446  min_lr: 0.003446  loss: 3.9753 (3.9753)  weight_decay: 0.0500 (0.0500)  time: 3.5473  data: 2.6782  max mem: 28503
Epoch: [88]  [ 200/1251]  eta: 0:06:25  lr: 0.003443  min_lr: 0.003443  loss: 3.4070 (3.4272)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7510 (0.8250)  time: 0.3460  data: 0.0004  max mem: 28503
Epoch: [88]  [ 400/1251]  eta: 0:05:03  lr: 0.003441  min_lr: 0.003441  loss: 3.5058 (3.4086)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8123 (0.8125)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [88]  [ 600/1251]  eta: 0:03:50  lr: 0.003438  min_lr: 0.003438  loss: 3.4548 (3.3893)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5320 (0.7972)  time: 0.3551  data: 0.0004  max mem: 28503
Epoch: [88]  [ 800/1251]  eta: 0:02:38  lr: 0.003436  min_lr: 0.003436  loss: 3.4265 (3.3928)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7750 (0.8115)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [88]  [1000/1251]  eta: 0:01:28  lr: 0.003433  min_lr: 0.003433  loss: 3.2740 (3.3862)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7219 (0.8055)  time: 0.3619  data: 0.0004  max mem: 28503
Epoch: [88]  [1200/1251]  eta: 0:00:17  lr: 0.003431  min_lr: 0.003431  loss: 3.4058 (3.3982)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8816 (0.8006)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [88]  [1250/1251]  eta: 0:00:00  lr: 0.003430  min_lr: 0.003430  loss: 3.6231 (3.4022)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7111 (0.7989)  time: 0.2920  data: 0.0007  max mem: 28503
Epoch: [88] Total time: 0:07:18 (0.3502 s / it)
Averaged stats: lr: 0.003430  min_lr: 0.003430  loss: 3.6231 (3.4004)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7111 (0.7989)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.7698 (0.7698)  acc1: 86.4000 (86.4000)  acc5: 98.8000 (98.8000)  time: 5.5461  data: 5.3398  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.0250 (0.9873)  acc1: 80.4000 (81.6364)  acc5: 96.8000 (96.9455)  time: 0.6918  data: 0.5190  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.2181 (1.1869)  acc1: 74.4000 (77.2191)  acc5: 93.2000 (94.4381)  time: 0.1875  data: 0.0185  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.3486 (1.1929)  acc1: 73.6000 (76.9600)  acc5: 92.8000 (94.4320)  time: 0.1869  data: 0.0184  max mem: 28503
Test: Total time: 0:00:10 (0.4023 s / it)
* Acc@1 77.210 Acc@5 94.374 loss 1.188
Accuracy of the model on the 50000 test images: 77.2%
Max accuracy: 77.73%
Epoch: [89]  [   0/1251]  eta: 1:08:56  lr: 0.003430  min_lr: 0.003430  loss: 3.6011 (3.6011)  weight_decay: 0.0500 (0.0500)  time: 3.3068  data: 2.1846  max mem: 28503
Epoch: [89]  [ 200/1251]  eta: 0:06:20  lr: 0.003428  min_lr: 0.003428  loss: 3.3885 (3.4334)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8645 (0.9409)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [89]  [ 400/1251]  eta: 0:05:02  lr: 0.003425  min_lr: 0.003425  loss: 3.4037 (3.3821)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6965 (0.8528)  time: 0.3554  data: 0.0004  max mem: 28503
Epoch: [89]  [ 600/1251]  eta: 0:03:49  lr: 0.003423  min_lr: 0.003423  loss: 3.0781 (3.3919)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5639 (0.7867)  time: 0.3489  data: 0.0004  max mem: 28503
Epoch: [89]  [ 800/1251]  eta: 0:02:38  lr: 0.003420  min_lr: 0.003420  loss: 3.6207 (3.3842)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0158 (0.8425)  time: 0.3550  data: 0.0004  max mem: 28503
Epoch: [89]  [1000/1251]  eta: 0:01:27  lr: 0.003418  min_lr: 0.003418  loss: 3.6441 (3.3984)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7152 (0.8214)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [89]  [1200/1251]  eta: 0:00:17  lr: 0.003415  min_lr: 0.003415  loss: 3.7042 (3.4039)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6604 (0.8178)  time: 0.3443  data: 0.0004  max mem: 28503
Epoch: [89]  [1250/1251]  eta: 0:00:00  lr: 0.003414  min_lr: 0.003414  loss: 3.5086 (3.4032)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5984 (0.8118)  time: 0.2958  data: 0.0006  max mem: 28503
Epoch: [89] Total time: 0:07:17 (0.3494 s / it)
Averaged stats: lr: 0.003414  min_lr: 0.003414  loss: 3.5086 (3.3916)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5984 (0.8118)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.7221 (0.7221)  acc1: 86.4000 (86.4000)  acc5: 98.0000 (98.0000)  time: 5.4505  data: 5.2458  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9579 (0.9222)  acc1: 81.6000 (81.9273)  acc5: 96.8000 (96.4000)  time: 0.6713  data: 0.4989  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1525 (1.1080)  acc1: 74.4000 (77.5619)  acc5: 93.6000 (94.1714)  time: 0.1891  data: 0.0203  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1797 (1.1181)  acc1: 74.4000 (77.3920)  acc5: 93.2000 (94.0960)  time: 0.1940  data: 0.0257  max mem: 28503
Test: Total time: 0:00:10 (0.4043 s / it)
* Acc@1 77.616 Acc@5 94.156 loss 1.111
Accuracy of the model on the 50000 test images: 77.6%
Max accuracy: 77.73%
Epoch: [90]  [   0/1251]  eta: 1:05:34  lr: 0.003414  min_lr: 0.003414  loss: 4.1742 (4.1742)  weight_decay: 0.0500 (0.0500)  time: 3.1451  data: 2.5891  max mem: 28503
Epoch: [90]  [ 200/1251]  eta: 0:06:23  lr: 0.003412  min_lr: 0.003412  loss: 3.5010 (3.3403)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7555 (0.7690)  time: 0.3467  data: 0.0005  max mem: 28503
Epoch: [90]  [ 400/1251]  eta: 0:05:03  lr: 0.003409  min_lr: 0.003409  loss: 3.5009 (3.3572)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9430 (0.8253)  time: 0.3570  data: 0.0004  max mem: 28503
Epoch: [90]  [ 600/1251]  eta: 0:03:50  lr: 0.003407  min_lr: 0.003407  loss: 3.5051 (3.3624)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7190 (0.8124)  time: 0.3477  data: 0.0004  max mem: 28503
Epoch: [90]  [ 800/1251]  eta: 0:02:38  lr: 0.003404  min_lr: 0.003404  loss: 3.5060 (3.3731)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6977 (0.8387)  time: 0.3493  data: 0.0005  max mem: 28503
Epoch: [90]  [1000/1251]  eta: 0:01:28  lr: 0.003402  min_lr: 0.003402  loss: 3.1485 (3.3720)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6583 (0.8208)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [90]  [1200/1251]  eta: 0:00:17  lr: 0.003399  min_lr: 0.003399  loss: 3.5315 (3.3752)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9005 (0.8259)  time: 0.3466  data: 0.0004  max mem: 28503
Epoch: [90]  [1250/1251]  eta: 0:00:00  lr: 0.003398  min_lr: 0.003398  loss: 3.5106 (3.3772)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7786 (0.8257)  time: 0.2927  data: 0.0006  max mem: 28503
Epoch: [90] Total time: 0:07:19 (0.3513 s / it)
Averaged stats: lr: 0.003398  min_lr: 0.003398  loss: 3.5106 (3.3920)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7786 (0.8257)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.7190 (0.7190)  acc1: 87.2000 (87.2000)  acc5: 98.0000 (98.0000)  time: 5.5167  data: 5.3059  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 0.8395 (0.9021)  acc1: 84.0000 (81.3455)  acc5: 96.8000 (96.6182)  time: 0.6555  data: 0.4826  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1207 (1.0815)  acc1: 74.8000 (76.9143)  acc5: 94.4000 (94.6286)  time: 0.1725  data: 0.0038  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.2226 (1.0915)  acc1: 74.0000 (76.6880)  acc5: 93.6000 (94.5440)  time: 0.1759  data: 0.0072  max mem: 28503
Test: Total time: 0:00:09 (0.3923 s / it)
* Acc@1 77.544 Acc@5 94.538 loss 1.080
Accuracy of the model on the 50000 test images: 77.5%
Max accuracy: 77.73%
Epoch: [91]  [   0/1251]  eta: 1:10:25  lr: 0.003398  min_lr: 0.003398  loss: 3.1604 (3.1604)  weight_decay: 0.0500 (0.0500)  time: 3.3781  data: 1.6032  max mem: 28503
Epoch: [91]  [ 200/1251]  eta: 0:06:23  lr: 0.003396  min_lr: 0.003396  loss: 3.3650 (3.3364)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7350 (0.7301)  time: 0.3551  data: 0.0004  max mem: 28503
Epoch: [91]  [ 400/1251]  eta: 0:05:02  lr: 0.003393  min_lr: 0.003393  loss: 3.4411 (3.3716)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8214 (0.7748)  time: 0.3466  data: 0.0004  max mem: 28503
Epoch: [91]  [ 600/1251]  eta: 0:03:50  lr: 0.003391  min_lr: 0.003391  loss: 3.5164 (3.3852)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7203 (0.7660)  time: 0.3584  data: 0.0004  max mem: 28503
Epoch: [91]  [ 800/1251]  eta: 0:02:38  lr: 0.003388  min_lr: 0.003388  loss: 3.4901 (3.3916)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6391 (0.7930)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [91]  [1000/1251]  eta: 0:01:28  lr: 0.003385  min_lr: 0.003385  loss: 3.6225 (3.4008)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8482 (0.8036)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [91]  [1200/1251]  eta: 0:00:17  lr: 0.003383  min_lr: 0.003383  loss: 3.6648 (3.4068)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8409 (0.8039)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [91]  [1250/1251]  eta: 0:00:00  lr: 0.003382  min_lr: 0.003382  loss: 3.4025 (3.4086)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6311 (0.7996)  time: 0.2913  data: 0.0005  max mem: 28503
Epoch: [91] Total time: 0:07:18 (0.3502 s / it)
Averaged stats: lr: 0.003382  min_lr: 0.003382  loss: 3.4025 (3.3962)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6311 (0.7996)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.6779 (0.6779)  acc1: 88.4000 (88.4000)  acc5: 98.4000 (98.4000)  time: 5.4636  data: 5.2628  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9155 (0.8945)  acc1: 81.6000 (81.6727)  acc5: 96.8000 (96.5455)  time: 0.6703  data: 0.4970  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1335 (1.0603)  acc1: 76.0000 (77.8667)  acc5: 94.0000 (94.4762)  time: 0.1800  data: 0.0103  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1898 (1.0744)  acc1: 74.4000 (77.3280)  acc5: 93.6000 (94.4320)  time: 0.1794  data: 0.0102  max mem: 28503
Test: Total time: 0:00:09 (0.3988 s / it)
* Acc@1 77.538 Acc@5 94.318 loss 1.070
Accuracy of the model on the 50000 test images: 77.5%
Max accuracy: 77.73%
Epoch: [92]  [   0/1251]  eta: 1:11:38  lr: 0.003382  min_lr: 0.003382  loss: 3.0013 (3.0013)  weight_decay: 0.0500 (0.0500)  time: 3.4360  data: 2.8400  max mem: 28503
Epoch: [92]  [ 200/1251]  eta: 0:06:23  lr: 0.003380  min_lr: 0.003380  loss: 3.5248 (3.3919)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7653 (0.8774)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [92]  [ 400/1251]  eta: 0:05:03  lr: 0.003377  min_lr: 0.003377  loss: 3.6510 (3.3677)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7703 (nan)  time: 0.3549  data: 0.0004  max mem: 28503
Epoch: [92]  [ 600/1251]  eta: 0:03:50  lr: 0.003374  min_lr: 0.003374  loss: 3.4830 (3.3841)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6723 (nan)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [92]  [ 800/1251]  eta: 0:02:39  lr: 0.003372  min_lr: 0.003372  loss: 3.6079 (3.3983)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6757 (nan)  time: 0.3478  data: 0.0004  max mem: 28503
Epoch: [92]  [1000/1251]  eta: 0:01:28  lr: 0.003369  min_lr: 0.003369  loss: 3.6319 (3.4027)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6333 (nan)  time: 0.3478  data: 0.0004  max mem: 28503
Epoch: [92]  [1200/1251]  eta: 0:00:17  lr: 0.003367  min_lr: 0.003367  loss: 3.3963 (3.3943)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6412 (nan)  time: 0.3462  data: 0.0004  max mem: 28503
Epoch: [92]  [1250/1251]  eta: 0:00:00  lr: 0.003366  min_lr: 0.003366  loss: 3.1581 (3.3893)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6605 (nan)  time: 0.2921  data: 0.0007  max mem: 28503
Epoch: [92] Total time: 0:07:19 (0.3512 s / it)
Averaged stats: lr: 0.003366  min_lr: 0.003366  loss: 3.1581 (3.3851)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6605 (nan)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6404 (0.6404)  acc1: 87.6000 (87.6000)  acc5: 98.8000 (98.8000)  time: 5.6805  data: 5.4965  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 0.8297 (0.8452)  acc1: 83.2000 (81.5273)  acc5: 96.8000 (96.4000)  time: 0.7529  data: 0.5689  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0730 (1.0259)  acc1: 75.2000 (77.8476)  acc5: 94.0000 (94.2476)  time: 0.2142  data: 0.0381  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1216 (1.0320)  acc1: 74.4000 (77.6160)  acc5: 92.8000 (94.2720)  time: 0.2093  data: 0.0380  max mem: 28503
Test: Total time: 0:00:10 (0.4294 s / it)
* Acc@1 77.750 Acc@5 94.292 loss 1.031
Accuracy of the model on the 50000 test images: 77.8%
Max accuracy: 77.75%
Epoch: [93]  [   0/1251]  eta: 1:08:48  lr: 0.003366  min_lr: 0.003366  loss: 3.4982 (3.4982)  weight_decay: 0.0500 (0.0500)  time: 3.3002  data: 2.9314  max mem: 28503
Epoch: [93]  [ 200/1251]  eta: 0:06:20  lr: 0.003363  min_lr: 0.003363  loss: 3.5191 (3.3838)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7728 (0.9038)  time: 0.3554  data: 0.0004  max mem: 28503
Epoch: [93]  [ 400/1251]  eta: 0:05:01  lr: 0.003361  min_lr: 0.003361  loss: 3.5254 (3.3968)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6605 (0.8341)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [93]  [ 600/1251]  eta: 0:03:49  lr: 0.003358  min_lr: 0.003358  loss: 3.5306 (3.4010)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8091 (0.8318)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [93]  [ 800/1251]  eta: 0:02:38  lr: 0.003355  min_lr: 0.003355  loss: 3.4199 (3.3982)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7125 (0.8309)  time: 0.3533  data: 0.0004  max mem: 28503
Epoch: [93]  [1000/1251]  eta: 0:01:27  lr: 0.003353  min_lr: 0.003353  loss: 3.3820 (3.3980)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7488 (0.8212)  time: 0.3488  data: 0.0005  max mem: 28503
Epoch: [93]  [1200/1251]  eta: 0:00:17  lr: 0.003350  min_lr: 0.003350  loss: 3.4637 (3.3982)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7482 (0.8178)  time: 0.3527  data: 0.0004  max mem: 28503
Epoch: [93]  [1250/1251]  eta: 0:00:00  lr: 0.003350  min_lr: 0.003350  loss: 3.2276 (3.3984)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7075 (0.8132)  time: 0.2916  data: 0.0005  max mem: 28503
Epoch: [93] Total time: 0:07:17 (0.3499 s / it)
Averaged stats: lr: 0.003350  min_lr: 0.003350  loss: 3.2276 (3.3845)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7075 (0.8132)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.7074 (0.7074)  acc1: 86.8000 (86.8000)  acc5: 97.6000 (97.6000)  time: 5.3811  data: 5.1854  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 0.9090 (0.9290)  acc1: 82.0000 (80.9455)  acc5: 97.2000 (96.7273)  time: 0.7399  data: 0.5651  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1260 (1.0995)  acc1: 74.8000 (77.4857)  acc5: 94.0000 (94.3810)  time: 0.2221  data: 0.0516  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.2343 (1.1121)  acc1: 74.4000 (77.1200)  acc5: 92.8000 (94.2240)  time: 0.2208  data: 0.0515  max mem: 28503
Test: Total time: 0:00:10 (0.4236 s / it)
* Acc@1 77.786 Acc@5 94.336 loss 1.103
Accuracy of the model on the 50000 test images: 77.8%
Max accuracy: 77.79%
Epoch: [94]  [   0/1251]  eta: 0:57:07  lr: 0.003350  min_lr: 0.003350  loss: 2.2574 (2.2574)  weight_decay: 0.0500 (0.0500)  time: 2.7400  data: 2.3178  max mem: 28503
Epoch: [94]  [ 200/1251]  eta: 0:06:18  lr: 0.003347  min_lr: 0.003347  loss: 3.6849 (3.3574)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7927 (0.7881)  time: 0.3553  data: 0.0004  max mem: 28503
Epoch: [94]  [ 400/1251]  eta: 0:05:01  lr: 0.003344  min_lr: 0.003344  loss: 3.2266 (3.3618)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7120 (0.8105)  time: 0.3467  data: 0.0004  max mem: 28503
Epoch: [94]  [ 600/1251]  eta: 0:03:48  lr: 0.003342  min_lr: 0.003342  loss: 3.5794 (3.3746)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8165 (0.8464)  time: 0.3461  data: 0.0004  max mem: 28503
Epoch: [94]  [ 800/1251]  eta: 0:02:38  lr: 0.003339  min_lr: 0.003339  loss: 3.6851 (3.3814)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6722 (0.8320)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [94]  [1000/1251]  eta: 0:01:27  lr: 0.003336  min_lr: 0.003336  loss: 3.6474 (3.3864)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6373 (0.8110)  time: 0.3467  data: 0.0004  max mem: 28503
Epoch: [94]  [1200/1251]  eta: 0:00:17  lr: 0.003334  min_lr: 0.003334  loss: 3.1753 (3.3833)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7992 (0.8099)  time: 0.3481  data: 0.0004  max mem: 28503
Epoch: [94]  [1250/1251]  eta: 0:00:00  lr: 0.003333  min_lr: 0.003333  loss: 3.3499 (3.3818)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0679 (0.8240)  time: 0.2919  data: 0.0007  max mem: 28503
Epoch: [94] Total time: 0:07:17 (0.3500 s / it)
Averaged stats: lr: 0.003333  min_lr: 0.003333  loss: 3.3499 (3.3671)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0679 (0.8240)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.7750 (0.7750)  acc1: 87.2000 (87.2000)  acc5: 98.4000 (98.4000)  time: 5.7395  data: 5.5426  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 0.9537 (0.9578)  acc1: 81.6000 (81.0182)  acc5: 96.4000 (96.5091)  time: 0.7477  data: 0.5762  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1543 (1.1223)  acc1: 74.8000 (77.8286)  acc5: 93.6000 (94.3429)  time: 0.2111  data: 0.0424  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.2110 (1.1297)  acc1: 74.8000 (77.4880)  acc5: 93.2000 (94.2560)  time: 0.2109  data: 0.0423  max mem: 28503
Test: Total time: 0:00:10 (0.4291 s / it)
* Acc@1 77.658 Acc@5 94.322 loss 1.128
Accuracy of the model on the 50000 test images: 77.7%
Max accuracy: 77.79%
Epoch: [95]  [   0/1251]  eta: 1:10:57  lr: 0.003333  min_lr: 0.003333  loss: 2.3513 (2.3513)  weight_decay: 0.0500 (0.0500)  time: 3.4032  data: 1.7428  max mem: 28503
Epoch: [95]  [ 200/1251]  eta: 0:06:20  lr: 0.003330  min_lr: 0.003330  loss: 3.3173 (3.3337)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6353 (0.8507)  time: 0.3465  data: 0.0004  max mem: 28503
Epoch: [95]  [ 400/1251]  eta: 0:05:02  lr: 0.003327  min_lr: 0.003327  loss: 3.5525 (3.3413)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7356 (0.8337)  time: 0.3458  data: 0.0006  max mem: 28503
Epoch: [95]  [ 600/1251]  eta: 0:03:49  lr: 0.003325  min_lr: 0.003325  loss: 3.6339 (3.3618)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7322 (0.8184)  time: 0.3460  data: 0.0004  max mem: 28503
Epoch: [95]  [ 800/1251]  eta: 0:02:38  lr: 0.003322  min_lr: 0.003322  loss: 3.4493 (3.3634)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6483 (0.8096)  time: 0.3444  data: 0.0004  max mem: 28503
Epoch: [95]  [1000/1251]  eta: 0:01:28  lr: 0.003319  min_lr: 0.003319  loss: 3.3431 (3.3665)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7784 (0.8131)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [95]  [1200/1251]  eta: 0:00:17  lr: 0.003317  min_lr: 0.003317  loss: 3.4902 (3.3710)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6476 (0.8114)  time: 0.3507  data: 0.0004  max mem: 28503
Epoch: [95]  [1250/1251]  eta: 0:00:00  lr: 0.003316  min_lr: 0.003316  loss: 3.3116 (3.3737)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6029 (0.8052)  time: 0.2921  data: 0.0009  max mem: 28503
Epoch: [95] Total time: 0:07:17 (0.3500 s / it)
Averaged stats: lr: 0.003316  min_lr: 0.003316  loss: 3.3116 (3.3759)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6029 (0.8052)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6594 (0.6594)  acc1: 88.0000 (88.0000)  acc5: 97.6000 (97.6000)  time: 5.6282  data: 5.4325  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9486 (0.8957)  acc1: 80.8000 (80.9818)  acc5: 96.4000 (96.6182)  time: 0.6971  data: 0.5245  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1142 (1.0510)  acc1: 75.6000 (77.6571)  acc5: 94.8000 (94.3619)  time: 0.1951  data: 0.0257  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1646 (1.0606)  acc1: 75.6000 (77.2640)  acc5: 93.6000 (94.3200)  time: 0.1942  data: 0.0256  max mem: 28503
Test: Total time: 0:00:10 (0.4119 s / it)
* Acc@1 77.852 Acc@5 94.426 loss 1.049
Accuracy of the model on the 50000 test images: 77.9%
Max accuracy: 77.85%
Epoch: [96]  [   0/1251]  eta: 0:59:29  lr: 0.003316  min_lr: 0.003316  loss: 3.3091 (3.3091)  weight_decay: 0.0500 (0.0500)  time: 2.8535  data: 2.4406  max mem: 28503
Epoch: [96]  [ 200/1251]  eta: 0:06:20  lr: 0.003313  min_lr: 0.003313  loss: 3.2107 (3.3234)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7996 (0.9165)  time: 0.3543  data: 0.0004  max mem: 28503
Epoch: [96]  [ 400/1251]  eta: 0:05:01  lr: 0.003311  min_lr: 0.003311  loss: 3.3376 (3.3388)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7777 (0.8331)  time: 0.3462  data: 0.0004  max mem: 28503
Epoch: [96]  [ 600/1251]  eta: 0:03:49  lr: 0.003308  min_lr: 0.003308  loss: 3.4499 (3.3568)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6933 (0.8588)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [96]  [ 800/1251]  eta: 0:02:38  lr: 0.003305  min_lr: 0.003305  loss: 3.3280 (3.3580)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6588 (0.8312)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [96]  [1000/1251]  eta: 0:01:27  lr: 0.003302  min_lr: 0.003302  loss: 3.5263 (3.3712)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6306 (0.8223)  time: 0.3540  data: 0.0004  max mem: 28503
Epoch: [96]  [1200/1251]  eta: 0:00:17  lr: 0.003300  min_lr: 0.003300  loss: 3.4683 (3.3798)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8095 (0.8221)  time: 0.3529  data: 0.0004  max mem: 28503
Epoch: [96]  [1250/1251]  eta: 0:00:00  lr: 0.003299  min_lr: 0.003299  loss: 3.3883 (3.3783)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8095 (0.8233)  time: 0.2924  data: 0.0005  max mem: 28503
Epoch: [96] Total time: 0:07:17 (0.3500 s / it)
Averaged stats: lr: 0.003299  min_lr: 0.003299  loss: 3.3883 (3.3678)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8095 (0.8233)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6449 (0.6449)  acc1: 88.4000 (88.4000)  acc5: 99.2000 (99.2000)  time: 5.6955  data: 5.4927  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9584 (0.9374)  acc1: 82.4000 (81.3455)  acc5: 96.4000 (96.4727)  time: 0.6999  data: 0.5278  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1207 (1.1092)  acc1: 75.2000 (77.4857)  acc5: 94.4000 (94.3048)  time: 0.1850  data: 0.0157  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.2070 (1.1220)  acc1: 75.2000 (77.2960)  acc5: 94.0000 (94.2080)  time: 0.1848  data: 0.0156  max mem: 28503
Test: Total time: 0:00:10 (0.4097 s / it)
* Acc@1 77.664 Acc@5 94.492 loss 1.108
Accuracy of the model on the 50000 test images: 77.7%
Max accuracy: 77.85%
Epoch: [97]  [   0/1251]  eta: 1:11:06  lr: 0.003299  min_lr: 0.003299  loss: 2.3694 (2.3694)  weight_decay: 0.0500 (0.0500)  time: 3.4101  data: 2.7548  max mem: 28503
Epoch: [97]  [ 200/1251]  eta: 0:06:20  lr: 0.003296  min_lr: 0.003296  loss: 3.2946 (3.3643)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8440 (0.7994)  time: 0.3469  data: 0.0005  max mem: 28503
Epoch: [97]  [ 400/1251]  eta: 0:05:03  lr: 0.003294  min_lr: 0.003294  loss: 3.5507 (3.3739)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8124 (0.8554)  time: 0.3560  data: 0.0004  max mem: 28503
Epoch: [97]  [ 600/1251]  eta: 0:03:50  lr: 0.003291  min_lr: 0.003291  loss: 3.5850 (3.3921)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7959 (0.8410)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [97]  [ 800/1251]  eta: 0:02:38  lr: 0.003288  min_lr: 0.003288  loss: 3.2677 (3.3826)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8709 (0.8397)  time: 0.3459  data: 0.0005  max mem: 28503
Epoch: [97]  [1000/1251]  eta: 0:01:28  lr: 0.003285  min_lr: 0.003285  loss: 2.9937 (3.3827)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7269 (0.8305)  time: 0.3477  data: 0.0005  max mem: 28503
Epoch: [97]  [1200/1251]  eta: 0:00:17  lr: 0.003283  min_lr: 0.003283  loss: 3.5191 (3.3895)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6019 (0.8352)  time: 0.3593  data: 0.0004  max mem: 28503
Epoch: [97]  [1250/1251]  eta: 0:00:00  lr: 0.003282  min_lr: 0.003282  loss: 3.1969 (3.3808)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8154 (0.8394)  time: 0.2922  data: 0.0007  max mem: 28503
Epoch: [97] Total time: 0:07:19 (0.3510 s / it)
Averaged stats: lr: 0.003282  min_lr: 0.003282  loss: 3.1969 (3.3631)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8154 (0.8394)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.6554 (0.6554)  acc1: 86.4000 (86.4000)  acc5: 99.2000 (99.2000)  time: 5.6761  data: 5.4671  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8665 (0.8982)  acc1: 82.4000 (81.4545)  acc5: 97.2000 (96.5455)  time: 0.7244  data: 0.5514  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1616 (1.0682)  acc1: 74.4000 (77.8667)  acc5: 94.0000 (94.3238)  time: 0.2075  data: 0.0388  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1974 (1.0808)  acc1: 74.8000 (77.6480)  acc5: 93.6000 (94.2720)  time: 0.2069  data: 0.0387  max mem: 28503
Test: Total time: 0:00:10 (0.4240 s / it)
* Acc@1 77.930 Acc@5 94.512 loss 1.063
Accuracy of the model on the 50000 test images: 77.9%
Max accuracy: 77.93%
Epoch: [98]  [   0/1251]  eta: 1:00:56  lr: 0.003282  min_lr: 0.003282  loss: 3.6981 (3.6981)  weight_decay: 0.0500 (0.0500)  time: 2.9230  data: 2.5075  max mem: 28503
Epoch: [98]  [ 200/1251]  eta: 0:06:19  lr: 0.003279  min_lr: 0.003279  loss: 3.2811 (3.2914)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8209 (0.8420)  time: 0.3445  data: 0.0004  max mem: 28503
Epoch: [98]  [ 400/1251]  eta: 0:05:01  lr: 0.003276  min_lr: 0.003276  loss: 3.3784 (3.3209)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6502 (0.8129)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [98]  [ 600/1251]  eta: 0:03:48  lr: 0.003274  min_lr: 0.003274  loss: 3.4900 (3.3400)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7116 (0.8021)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [98]  [ 800/1251]  eta: 0:02:38  lr: 0.003271  min_lr: 0.003271  loss: 3.2344 (3.3430)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8703 (0.8286)  time: 0.3445  data: 0.0004  max mem: 28503
Epoch: [98]  [1000/1251]  eta: 0:01:27  lr: 0.003268  min_lr: 0.003268  loss: 3.5529 (3.3420)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7232 (0.8513)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [98]  [1200/1251]  eta: 0:00:17  lr: 0.003265  min_lr: 0.003265  loss: 3.6713 (3.3446)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7554 (0.8440)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [98]  [1250/1251]  eta: 0:00:00  lr: 0.003265  min_lr: 0.003265  loss: 3.6324 (3.3514)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7305 (0.8422)  time: 0.2918  data: 0.0005  max mem: 28503
Epoch: [98] Total time: 0:07:17 (0.3495 s / it)
Averaged stats: lr: 0.003265  min_lr: 0.003265  loss: 3.6324 (3.3643)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7305 (0.8422)
Test:  [ 0/25]  eta: 0:02:28  loss: 0.7586 (0.7586)  acc1: 85.6000 (85.6000)  acc5: 98.0000 (98.0000)  time: 5.9257  data: 5.7324  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9577 (0.9824)  acc1: 82.4000 (81.7818)  acc5: 96.4000 (96.2182)  time: 0.6938  data: 0.5215  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1612 (1.1523)  acc1: 75.6000 (78.0191)  acc5: 93.2000 (94.1524)  time: 0.1697  data: 0.0002  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.2145 (1.1599)  acc1: 76.4000 (77.7760)  acc5: 92.8000 (94.1600)  time: 0.1695  data: 0.0002  max mem: 28503
Test: Total time: 0:00:10 (0.4044 s / it)
* Acc@1 77.894 Acc@5 94.404 loss 1.148
Accuracy of the model on the 50000 test images: 77.9%
Max accuracy: 77.93%
Epoch: [99]  [   0/1251]  eta: 1:05:01  lr: 0.003265  min_lr: 0.003265  loss: 3.6895 (3.6895)  weight_decay: 0.0500 (0.0500)  time: 3.1187  data: 1.7002  max mem: 28503
Epoch: [99]  [ 200/1251]  eta: 0:06:20  lr: 0.003262  min_lr: 0.003262  loss: 3.4104 (3.3605)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6730 (0.8240)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [99]  [ 400/1251]  eta: 0:05:02  lr: 0.003259  min_lr: 0.003259  loss: 3.0307 (3.3376)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5359 (0.7602)  time: 0.3462  data: 0.0004  max mem: 28503
Epoch: [99]  [ 600/1251]  eta: 0:03:49  lr: 0.003256  min_lr: 0.003256  loss: 3.1015 (3.3277)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6687 (0.7528)  time: 0.3552  data: 0.0004  max mem: 28503
Epoch: [99]  [ 800/1251]  eta: 0:02:38  lr: 0.003253  min_lr: 0.003253  loss: 3.5557 (3.3482)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8175 (0.7939)  time: 0.3604  data: 0.0004  max mem: 28503
Epoch: [99]  [1000/1251]  eta: 0:01:28  lr: 0.003251  min_lr: 0.003251  loss: 3.2914 (3.3403)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8860 (0.8058)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [99]  [1200/1251]  eta: 0:00:17  lr: 0.003248  min_lr: 0.003248  loss: 3.5739 (3.3507)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8823 (0.8128)  time: 0.3550  data: 0.0004  max mem: 28503
Epoch: [99]  [1250/1251]  eta: 0:00:00  lr: 0.003247  min_lr: 0.003247  loss: 3.7037 (3.3543)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9263 (0.8180)  time: 0.2915  data: 0.0007  max mem: 28503
Epoch: [99] Total time: 0:07:17 (0.3499 s / it)
Averaged stats: lr: 0.003247  min_lr: 0.003247  loss: 3.7037 (3.3579)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9263 (0.8180)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.9024 (0.9024)  acc1: 86.4000 (86.4000)  acc5: 98.4000 (98.4000)  time: 5.6142  data: 5.3891  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.0299 (1.0779)  acc1: 82.0000 (80.9091)  acc5: 96.8000 (96.5818)  time: 0.7211  data: 0.5470  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.3016 (1.2320)  acc1: 73.6000 (77.1810)  acc5: 94.0000 (94.3619)  time: 0.2014  data: 0.0314  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.3518 (1.2404)  acc1: 73.6000 (76.8000)  acc5: 92.8000 (94.2720)  time: 0.2012  data: 0.0313  max mem: 28503
Test: Total time: 0:00:10 (0.4159 s / it)
* Acc@1 77.524 Acc@5 94.168 loss 1.233
Accuracy of the model on the 50000 test images: 77.5%
Max accuracy: 77.93%
Epoch: [100]  [   0/1251]  eta: 1:10:05  lr: 0.003247  min_lr: 0.003247  loss: 3.2915 (3.2915)  weight_decay: 0.0500 (0.0500)  time: 3.3619  data: 2.3436  max mem: 28503
Epoch: [100]  [ 200/1251]  eta: 0:06:22  lr: 0.003244  min_lr: 0.003244  loss: 3.4645 (3.3504)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6998 (0.7958)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [100]  [ 400/1251]  eta: 0:05:02  lr: 0.003242  min_lr: 0.003242  loss: 3.5996 (3.3808)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7689 (0.8186)  time: 0.3444  data: 0.0004  max mem: 28503
Epoch: [100]  [ 600/1251]  eta: 0:03:49  lr: 0.003239  min_lr: 0.003239  loss: 3.4639 (3.3496)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7985 (0.8280)  time: 0.3447  data: 0.0004  max mem: 28503
Epoch: [100]  [ 800/1251]  eta: 0:02:38  lr: 0.003236  min_lr: 0.003236  loss: 3.5635 (3.3620)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7683 (0.8107)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [100]  [1000/1251]  eta: 0:01:27  lr: 0.003233  min_lr: 0.003233  loss: 3.5705 (3.3762)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7792 (0.8110)  time: 0.3448  data: 0.0004  max mem: 28503
Epoch: [100]  [1200/1251]  eta: 0:00:17  lr: 0.003230  min_lr: 0.003230  loss: 3.5425 (3.3659)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7070 (0.8063)  time: 0.3446  data: 0.0004  max mem: 28503
Epoch: [100]  [1250/1251]  eta: 0:00:00  lr: 0.003230  min_lr: 0.003230  loss: 3.6523 (3.3681)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6391 (0.7995)  time: 0.2914  data: 0.0007  max mem: 28503
Epoch: [100] Total time: 0:07:17 (0.3494 s / it)
Averaged stats: lr: 0.003230  min_lr: 0.003230  loss: 3.6523 (3.3558)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6391 (0.7995)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.7582 (0.7582)  acc1: 89.6000 (89.6000)  acc5: 99.2000 (99.2000)  time: 5.6400  data: 5.4338  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9372 (0.9169)  acc1: 80.8000 (82.6909)  acc5: 96.8000 (96.7273)  time: 0.7102  data: 0.5378  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0586 (1.0796)  acc1: 76.8000 (78.5714)  acc5: 94.8000 (94.5714)  time: 0.1933  data: 0.0246  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1905 (1.0874)  acc1: 76.0000 (78.0960)  acc5: 93.6000 (94.5120)  time: 0.1929  data: 0.0246  max mem: 28503
Test: Total time: 0:00:10 (0.4108 s / it)
* Acc@1 78.050 Acc@5 94.518 loss 1.081
Accuracy of the model on the 50000 test images: 78.1%
Max accuracy: 78.05%
Epoch: [101]  [   0/1251]  eta: 1:05:16  lr: 0.003230  min_lr: 0.003230  loss: 2.7949 (2.7949)  weight_decay: 0.0500 (0.0500)  time: 3.1309  data: 2.7554  max mem: 28503
Epoch: [101]  [ 200/1251]  eta: 0:06:21  lr: 0.003227  min_lr: 0.003227  loss: 3.4881 (3.3464)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7324 (0.8063)  time: 0.3460  data: 0.0004  max mem: 28503
Epoch: [101]  [ 400/1251]  eta: 0:05:03  lr: 0.003224  min_lr: 0.003224  loss: 3.2886 (3.3347)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6842 (0.8392)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [101]  [ 600/1251]  eta: 0:03:49  lr: 0.003221  min_lr: 0.003221  loss: 3.1750 (3.3347)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6643 (0.8341)  time: 0.3459  data: 0.0005  max mem: 28503
Epoch: [101]  [ 800/1251]  eta: 0:02:38  lr: 0.003218  min_lr: 0.003218  loss: 3.3084 (3.3423)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5844 (0.8276)  time: 0.3542  data: 0.0005  max mem: 28503
Epoch: [101]  [1000/1251]  eta: 0:01:28  lr: 0.003215  min_lr: 0.003215  loss: 3.5357 (3.3495)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8265 (0.8117)  time: 0.3559  data: 0.0004  max mem: 28503
Epoch: [101]  [1200/1251]  eta: 0:00:17  lr: 0.003212  min_lr: 0.003212  loss: 3.3246 (3.3549)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8431 (0.8204)  time: 0.3549  data: 0.0004  max mem: 28503
Epoch: [101]  [1250/1251]  eta: 0:00:00  lr: 0.003212  min_lr: 0.003212  loss: 3.4396 (3.3540)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7037 (0.8173)  time: 0.2920  data: 0.0007  max mem: 28503
Epoch: [101] Total time: 0:07:18 (0.3503 s / it)
Averaged stats: lr: 0.003212  min_lr: 0.003212  loss: 3.4396 (3.3537)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7037 (0.8173)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.7310 (0.7310)  acc1: 86.8000 (86.8000)  acc5: 98.8000 (98.8000)  time: 5.5158  data: 5.3172  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 0.9407 (0.9142)  acc1: 81.2000 (81.7818)  acc5: 96.8000 (96.6182)  time: 0.7506  data: 0.5779  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0841 (1.0799)  acc1: 75.6000 (78.2095)  acc5: 94.0000 (94.5524)  time: 0.2212  data: 0.0520  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1519 (1.0838)  acc1: 74.8000 (77.8720)  acc5: 93.6000 (94.5280)  time: 0.2204  data: 0.0520  max mem: 28503
Test: Total time: 0:00:10 (0.4284 s / it)
* Acc@1 77.998 Acc@5 94.624 loss 1.078
Accuracy of the model on the 50000 test images: 78.0%
Max accuracy: 78.05%
Epoch: [102]  [   0/1251]  eta: 1:13:06  lr: 0.003212  min_lr: 0.003212  loss: 3.0177 (3.0177)  weight_decay: 0.0500 (0.0500)  time: 3.5066  data: 1.6955  max mem: 28503
Epoch: [102]  [ 200/1251]  eta: 0:06:22  lr: 0.003209  min_lr: 0.003209  loss: 3.5169 (3.3112)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0348 (0.9065)  time: 0.3541  data: 0.0004  max mem: 28503
Epoch: [102]  [ 400/1251]  eta: 0:05:03  lr: 0.003206  min_lr: 0.003206  loss: 3.3727 (3.3241)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7364 (0.8156)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [102]  [ 600/1251]  eta: 0:03:50  lr: 0.003203  min_lr: 0.003203  loss: 3.4976 (3.3350)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7604 (0.8122)  time: 0.3475  data: 0.0004  max mem: 28503
Epoch: [102]  [ 800/1251]  eta: 0:02:38  lr: 0.003200  min_lr: 0.003200  loss: 3.1580 (3.3455)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7872 (0.8286)  time: 0.3461  data: 0.0004  max mem: 28503
Epoch: [102]  [1000/1251]  eta: 0:01:28  lr: 0.003197  min_lr: 0.003197  loss: 3.4233 (3.3446)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8043 (0.8264)  time: 0.3457  data: 0.0004  max mem: 28503
Epoch: [102]  [1200/1251]  eta: 0:00:17  lr: 0.003195  min_lr: 0.003195  loss: 3.4532 (3.3476)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7806 (0.8242)  time: 0.3549  data: 0.0004  max mem: 28503
Epoch: [102]  [1250/1251]  eta: 0:00:00  lr: 0.003194  min_lr: 0.003194  loss: 3.5615 (3.3544)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2922  data: 0.0005  max mem: 28503
Epoch: [102] Total time: 0:07:18 (0.3509 s / it)
Averaged stats: lr: 0.003194  min_lr: 0.003194  loss: 3.5615 (3.3457)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.8273 (0.8273)  acc1: 88.4000 (88.4000)  acc5: 98.8000 (98.8000)  time: 5.5199  data: 5.3180  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.0388 (1.0295)  acc1: 80.8000 (80.9818)  acc5: 96.4000 (96.5818)  time: 0.6692  data: 0.4973  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.2580 (1.1931)  acc1: 74.4000 (77.2571)  acc5: 93.2000 (94.2857)  time: 0.1789  data: 0.0102  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.3147 (1.2087)  acc1: 74.4000 (76.8800)  acc5: 92.4000 (94.1440)  time: 0.1787  data: 0.0101  max mem: 28503
Test: Total time: 0:00:09 (0.3962 s / it)
* Acc@1 77.738 Acc@5 94.364 loss 1.194
Accuracy of the model on the 50000 test images: 77.7%
Max accuracy: 78.05%
Epoch: [103]  [   0/1251]  eta: 0:58:29  lr: 0.003194  min_lr: 0.003194  loss: 3.1153 (3.1153)  weight_decay: 0.0500 (0.0500)  time: 2.8056  data: 1.7204  max mem: 28503
Epoch: [103]  [ 200/1251]  eta: 0:06:19  lr: 0.003191  min_lr: 0.003191  loss: 3.2771 (3.2998)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7111 (0.8272)  time: 0.3448  data: 0.0005  max mem: 28503
Epoch: [103]  [ 400/1251]  eta: 0:05:01  lr: 0.003188  min_lr: 0.003188  loss: 3.5202 (3.3186)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7687 (0.8434)  time: 0.3466  data: 0.0004  max mem: 28503
Epoch: [103]  [ 600/1251]  eta: 0:03:49  lr: 0.003185  min_lr: 0.003185  loss: 3.4948 (3.3221)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6422 (0.8210)  time: 0.3543  data: 0.0004  max mem: 28503
Epoch: [103]  [ 800/1251]  eta: 0:02:38  lr: 0.003182  min_lr: 0.003182  loss: 3.2606 (3.3179)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5961 (0.8420)  time: 0.3465  data: 0.0004  max mem: 28503
Epoch: [103]  [1000/1251]  eta: 0:01:28  lr: 0.003179  min_lr: 0.003179  loss: 3.5351 (3.3267)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7287 (0.8429)  time: 0.3459  data: 0.0005  max mem: 28503
Epoch: [103]  [1200/1251]  eta: 0:00:17  lr: 0.003176  min_lr: 0.003176  loss: 3.1815 (3.3223)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0559 (0.8407)  time: 0.3547  data: 0.0004  max mem: 28503
Epoch: [103]  [1250/1251]  eta: 0:00:00  lr: 0.003176  min_lr: 0.003176  loss: 3.5742 (3.3238)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8294 (0.8350)  time: 0.2923  data: 0.0006  max mem: 28503
Epoch: [103] Total time: 0:07:18 (0.3502 s / it)
Averaged stats: lr: 0.003176  min_lr: 0.003176  loss: 3.5742 (3.3430)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8294 (0.8350)
Test:  [ 0/25]  eta: 0:01:56  loss: 0.7074 (0.7074)  acc1: 86.8000 (86.8000)  acc5: 97.6000 (97.6000)  time: 4.6761  data: 4.4690  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8963 (0.8847)  acc1: 80.8000 (81.3455)  acc5: 96.8000 (96.5818)  time: 0.6689  data: 0.4955  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0953 (1.0519)  acc1: 74.4000 (77.6952)  acc5: 94.0000 (94.2857)  time: 0.2441  data: 0.0749  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1475 (1.0649)  acc1: 74.4000 (77.3920)  acc5: 93.2000 (94.1920)  time: 0.1985  data: 0.0295  max mem: 28503
Test: Total time: 0:00:10 (0.4125 s / it)
* Acc@1 78.054 Acc@5 94.542 loss 1.050
Accuracy of the model on the 50000 test images: 78.1%
Max accuracy: 78.05%
Epoch: [104]  [   0/1251]  eta: 1:00:10  lr: 0.003176  min_lr: 0.003176  loss: 3.6167 (3.6167)  weight_decay: 0.0500 (0.0500)  time: 2.8859  data: 2.5321  max mem: 28503
Epoch: [104]  [ 200/1251]  eta: 0:06:20  lr: 0.003173  min_lr: 0.003173  loss: 3.3292 (3.3894)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6933 (0.6999)  time: 0.3465  data: 0.0004  max mem: 28503
Epoch: [104]  [ 400/1251]  eta: 0:05:02  lr: 0.003170  min_lr: 0.003170  loss: 3.5463 (3.3535)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7643 (0.7682)  time: 0.3467  data: 0.0004  max mem: 28503
Epoch: [104]  [ 600/1251]  eta: 0:03:49  lr: 0.003167  min_lr: 0.003167  loss: 3.5555 (3.3611)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6901 (0.8112)  time: 0.3531  data: 0.0004  max mem: 28503
Epoch: [104]  [ 800/1251]  eta: 0:02:38  lr: 0.003164  min_lr: 0.003164  loss: 3.6069 (3.3491)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6719 (0.7974)  time: 0.3447  data: 0.0004  max mem: 28503
Epoch: [104]  [1000/1251]  eta: 0:01:28  lr: 0.003161  min_lr: 0.003161  loss: 3.4228 (3.3530)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7337 (0.8025)  time: 0.3550  data: 0.0004  max mem: 28503
Epoch: [104]  [1200/1251]  eta: 0:00:17  lr: 0.003158  min_lr: 0.003158  loss: 3.5753 (3.3513)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7373 (0.8312)  time: 0.3545  data: 0.0004  max mem: 28503
Epoch: [104]  [1250/1251]  eta: 0:00:00  lr: 0.003158  min_lr: 0.003158  loss: 3.2399 (3.3481)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7373 (0.8328)  time: 0.2915  data: 0.0005  max mem: 28503
Epoch: [104] Total time: 0:07:17 (0.3498 s / it)
Averaged stats: lr: 0.003158  min_lr: 0.003158  loss: 3.2399 (3.3498)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7373 (0.8328)
Test:  [ 0/25]  eta: 0:01:54  loss: 0.6170 (0.6170)  acc1: 88.0000 (88.0000)  acc5: 98.8000 (98.8000)  time: 4.5723  data: 4.3865  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 0.8296 (0.8248)  acc1: 83.2000 (82.4000)  acc5: 97.2000 (96.4000)  time: 0.6565  data: 0.4848  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0325 (0.9901)  acc1: 76.8000 (78.9143)  acc5: 94.0000 (94.5714)  time: 0.2284  data: 0.0548  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1368 (1.0026)  acc1: 76.8000 (78.3520)  acc5: 93.6000 (94.4640)  time: 0.2088  data: 0.0346  max mem: 28503
Test: Total time: 0:00:09 (0.3970 s / it)
* Acc@1 78.316 Acc@5 94.528 loss 0.993
Accuracy of the model on the 50000 test images: 78.3%
Max accuracy: 78.32%
Epoch: [105]  [   0/1251]  eta: 1:00:59  lr: 0.003158  min_lr: 0.003158  loss: 3.2435 (3.2435)  weight_decay: 0.0500 (0.0500)  time: 2.9255  data: 2.5191  max mem: 28503
Epoch: [105]  [ 200/1251]  eta: 0:06:18  lr: 0.003155  min_lr: 0.003155  loss: 3.3152 (3.2889)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7845 (0.7312)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [105]  [ 400/1251]  eta: 0:05:01  lr: 0.003152  min_lr: 0.003152  loss: 3.1238 (3.3085)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8578 (0.7241)  time: 0.3569  data: 0.0004  max mem: 28503
Epoch: [105]  [ 600/1251]  eta: 0:03:49  lr: 0.003149  min_lr: 0.003149  loss: 3.5610 (3.3111)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8043 (0.7495)  time: 0.3565  data: 0.0004  max mem: 28503
Epoch: [105]  [ 800/1251]  eta: 0:02:38  lr: 0.003146  min_lr: 0.003146  loss: 3.5137 (3.3090)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7972 (0.7696)  time: 0.3478  data: 0.0004  max mem: 28503
Epoch: [105]  [1000/1251]  eta: 0:01:28  lr: 0.003143  min_lr: 0.003143  loss: 3.4943 (3.3126)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8571 (0.7824)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [105]  [1200/1251]  eta: 0:00:17  lr: 0.003140  min_lr: 0.003140  loss: 3.5058 (3.3217)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9821 (0.8027)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [105]  [1250/1251]  eta: 0:00:00  lr: 0.003139  min_lr: 0.003139  loss: 3.3770 (3.3225)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6824 (0.8027)  time: 0.2920  data: 0.0008  max mem: 28503
Epoch: [105] Total time: 0:07:17 (0.3499 s / it)
Averaged stats: lr: 0.003139  min_lr: 0.003139  loss: 3.3770 (3.3338)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6824 (0.8027)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.7049 (0.7049)  acc1: 88.0000 (88.0000)  acc5: 98.4000 (98.4000)  time: 5.5302  data: 5.3261  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 0.8740 (0.8958)  acc1: 84.0000 (82.2909)  acc5: 96.8000 (96.7273)  time: 0.6608  data: 0.4890  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0637 (1.0602)  acc1: 76.8000 (78.0191)  acc5: 94.0000 (94.5905)  time: 0.1712  data: 0.0027  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1787 (1.0763)  acc1: 74.4000 (77.6000)  acc5: 93.6000 (94.6080)  time: 0.1711  data: 0.0026  max mem: 28503
Test: Total time: 0:00:09 (0.3886 s / it)
* Acc@1 78.284 Acc@5 94.578 loss 1.063
Accuracy of the model on the 50000 test images: 78.3%
Max accuracy: 78.32%
Epoch: [106]  [   0/1251]  eta: 1:07:13  lr: 0.003139  min_lr: 0.003139  loss: 3.9509 (3.9509)  weight_decay: 0.0500 (0.0500)  time: 3.2240  data: 2.3077  max mem: 28503
Epoch: [106]  [ 200/1251]  eta: 0:06:22  lr: 0.003136  min_lr: 0.003136  loss: 3.4750 (3.3284)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6887 (0.8206)  time: 0.3462  data: 0.0004  max mem: 28503
Epoch: [106]  [ 400/1251]  eta: 0:05:02  lr: 0.003133  min_lr: 0.003133  loss: 3.0614 (3.3230)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7714 (0.7948)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [106]  [ 600/1251]  eta: 0:03:49  lr: 0.003130  min_lr: 0.003130  loss: 3.5949 (3.3311)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7089 (0.7851)  time: 0.3542  data: 0.0004  max mem: 28503
Epoch: [106]  [ 800/1251]  eta: 0:02:38  lr: 0.003127  min_lr: 0.003127  loss: 3.2200 (3.3360)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7888 (0.8259)  time: 0.3463  data: 0.0004  max mem: 28503
Epoch: [106]  [1000/1251]  eta: 0:01:28  lr: 0.003124  min_lr: 0.003124  loss: 3.6393 (3.3482)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6340 (0.8103)  time: 0.3554  data: 0.0004  max mem: 28503
Epoch: [106]  [1200/1251]  eta: 0:00:17  lr: 0.003121  min_lr: 0.003121  loss: 3.6516 (3.3492)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7404 (0.8111)  time: 0.3524  data: 0.0004  max mem: 28503
Epoch: [106]  [1250/1251]  eta: 0:00:00  lr: 0.003121  min_lr: 0.003121  loss: 3.3445 (3.3487)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8081 (0.8231)  time: 0.2921  data: 0.0006  max mem: 28503
Epoch: [106] Total time: 0:07:17 (0.3499 s / it)
Averaged stats: lr: 0.003121  min_lr: 0.003121  loss: 3.3445 (3.3425)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8081 (0.8231)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.7442 (0.7442)  acc1: 88.0000 (88.0000)  acc5: 98.4000 (98.4000)  time: 5.4903  data: 5.2896  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 0.9299 (0.9236)  acc1: 81.2000 (81.9636)  acc5: 96.8000 (96.3636)  time: 0.6587  data: 0.4857  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1011 (1.0726)  acc1: 76.4000 (78.8952)  acc5: 93.2000 (94.4762)  time: 0.2017  data: 0.0322  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1535 (1.0798)  acc1: 75.6000 (78.4160)  acc5: 93.2000 (94.4160)  time: 0.2008  data: 0.0322  max mem: 28503
Test: Total time: 0:00:10 (0.4111 s / it)
* Acc@1 78.392 Acc@5 94.626 loss 1.071
Accuracy of the model on the 50000 test images: 78.4%
Max accuracy: 78.39%
Epoch: [107]  [   0/1251]  eta: 1:01:21  lr: 0.003121  min_lr: 0.003121  loss: 2.3174 (2.3174)  weight_decay: 0.0500 (0.0500)  time: 2.9431  data: 2.5559  max mem: 28503
Epoch: [107]  [ 200/1251]  eta: 0:06:20  lr: 0.003118  min_lr: 0.003118  loss: 3.3709 (3.3223)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6635 (0.7436)  time: 0.3572  data: 0.0004  max mem: 28503
Epoch: [107]  [ 400/1251]  eta: 0:05:01  lr: 0.003115  min_lr: 0.003115  loss: 3.4358 (3.3014)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8563 (0.8028)  time: 0.3464  data: 0.0004  max mem: 28503
Epoch: [107]  [ 600/1251]  eta: 0:03:49  lr: 0.003112  min_lr: 0.003112  loss: 3.3666 (3.3010)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8560 (0.8457)  time: 0.3467  data: 0.0005  max mem: 28503
Epoch: [107]  [ 800/1251]  eta: 0:02:38  lr: 0.003109  min_lr: 0.003109  loss: 3.2617 (3.3103)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7529 (0.8308)  time: 0.3474  data: 0.0005  max mem: 28503
Epoch: [107]  [1000/1251]  eta: 0:01:28  lr: 0.003106  min_lr: 0.003106  loss: 3.4740 (3.3196)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7324 (0.8305)  time: 0.3445  data: 0.0005  max mem: 28503
Epoch: [107]  [1200/1251]  eta: 0:00:17  lr: 0.003103  min_lr: 0.003103  loss: 3.4659 (3.3179)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8554 (0.8417)  time: 0.3452  data: 0.0005  max mem: 28503
Epoch: [107]  [1250/1251]  eta: 0:00:00  lr: 0.003102  min_lr: 0.003102  loss: 3.4682 (3.3222)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8052 (0.8412)  time: 0.2920  data: 0.0006  max mem: 28503
Epoch: [107] Total time: 0:07:17 (0.3501 s / it)
Averaged stats: lr: 0.003102  min_lr: 0.003102  loss: 3.4682 (3.3360)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8052 (0.8412)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.7379 (0.7379)  acc1: 87.6000 (87.6000)  acc5: 98.8000 (98.8000)  time: 5.5134  data: 5.3018  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.0200 (0.9736)  acc1: 82.8000 (81.8909)  acc5: 96.4000 (96.7273)  time: 0.7243  data: 0.5508  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.2305 (1.1323)  acc1: 75.6000 (78.0952)  acc5: 93.6000 (94.5333)  time: 0.2216  data: 0.0526  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.2439 (1.1470)  acc1: 74.8000 (77.6640)  acc5: 93.6000 (94.4640)  time: 0.2209  data: 0.0525  max mem: 28503
Test: Total time: 0:00:10 (0.4282 s / it)
* Acc@1 78.032 Acc@5 94.598 loss 1.136
Accuracy of the model on the 50000 test images: 78.0%
Max accuracy: 78.39%
Epoch: [108]  [   0/1251]  eta: 1:12:15  lr: 0.003102  min_lr: 0.003102  loss: 3.8019 (3.8019)  weight_decay: 0.0500 (0.0500)  time: 3.4659  data: 1.6303  max mem: 28503
Epoch: [108]  [ 200/1251]  eta: 0:06:23  lr: 0.003099  min_lr: 0.003099  loss: 3.2211 (3.3264)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6671 (0.7483)  time: 0.3555  data: 0.0004  max mem: 28503
Epoch: [108]  [ 400/1251]  eta: 0:05:02  lr: 0.003096  min_lr: 0.003096  loss: 3.4300 (3.3311)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6297 (0.7347)  time: 0.3462  data: 0.0004  max mem: 28503
Epoch: [108]  [ 600/1251]  eta: 0:03:50  lr: 0.003093  min_lr: 0.003093  loss: 3.2878 (3.3405)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7229 (0.7613)  time: 0.3660  data: 0.0004  max mem: 28503
Epoch: [108]  [ 800/1251]  eta: 0:02:38  lr: 0.003090  min_lr: 0.003090  loss: 3.1363 (3.3357)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7263 (0.7580)  time: 0.3527  data: 0.0004  max mem: 28503
Epoch: [108]  [1000/1251]  eta: 0:01:28  lr: 0.003087  min_lr: 0.003087  loss: 3.5637 (3.3489)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9063 (0.7866)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [108]  [1200/1251]  eta: 0:00:17  lr: 0.003084  min_lr: 0.003084  loss: 3.5181 (3.3514)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8275 (0.7861)  time: 0.3444  data: 0.0004  max mem: 28503
Epoch: [108]  [1250/1251]  eta: 0:00:00  lr: 0.003083  min_lr: 0.003083  loss: 3.3719 (3.3490)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8275 (0.7864)  time: 0.3014  data: 0.0007  max mem: 28503
Epoch: [108] Total time: 0:07:17 (0.3501 s / it)
Averaged stats: lr: 0.003083  min_lr: 0.003083  loss: 3.3719 (3.3318)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8275 (0.7864)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.6883 (0.6883)  acc1: 87.6000 (87.6000)  acc5: 99.2000 (99.2000)  time: 5.2841  data: 5.0855  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8779 (0.8731)  acc1: 81.2000 (81.3091)  acc5: 97.2000 (96.9091)  time: 0.7322  data: 0.5585  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1178 (1.0352)  acc1: 75.6000 (77.7905)  acc5: 94.4000 (94.9714)  time: 0.2232  data: 0.0530  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1450 (1.0460)  acc1: 75.2000 (77.4880)  acc5: 94.4000 (94.8480)  time: 0.2219  data: 0.0529  max mem: 28503
Test: Total time: 0:00:10 (0.4201 s / it)
* Acc@1 78.406 Acc@5 94.654 loss 1.040
Accuracy of the model on the 50000 test images: 78.4%
Max accuracy: 78.41%
Epoch: [109]  [   0/1251]  eta: 1:09:37  lr: 0.003083  min_lr: 0.003083  loss: 3.1342 (3.1342)  weight_decay: 0.0500 (0.0500)  time: 3.3395  data: 2.9895  max mem: 28503
Epoch: [109]  [ 200/1251]  eta: 0:06:22  lr: 0.003080  min_lr: 0.003080  loss: 3.3869 (3.3541)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6005 (0.8442)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [109]  [ 400/1251]  eta: 0:05:03  lr: 0.003077  min_lr: 0.003077  loss: 3.4382 (3.3573)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6499 (0.7842)  time: 0.3478  data: 0.0004  max mem: 28503
Epoch: [109]  [ 600/1251]  eta: 0:03:50  lr: 0.003074  min_lr: 0.003074  loss: 3.1051 (3.3387)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6693 (0.7813)  time: 0.3465  data: 0.0004  max mem: 28503
Epoch: [109]  [ 800/1251]  eta: 0:02:38  lr: 0.003071  min_lr: 0.003071  loss: 3.3982 (3.3312)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8237 (0.8180)  time: 0.3528  data: 0.0004  max mem: 28503
Epoch: [109]  [1000/1251]  eta: 0:01:28  lr: 0.003068  min_lr: 0.003068  loss: 3.5343 (3.3388)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7737 (0.8124)  time: 0.3537  data: 0.0004  max mem: 28503
Epoch: [109]  [1200/1251]  eta: 0:00:17  lr: 0.003065  min_lr: 0.003065  loss: 3.5634 (3.3349)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6490 (0.8100)  time: 0.3463  data: 0.0004  max mem: 28503
Epoch: [109]  [1250/1251]  eta: 0:00:00  lr: 0.003064  min_lr: 0.003064  loss: 3.5635 (3.3359)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8153 (0.8152)  time: 0.2924  data: 0.0006  max mem: 28503
Epoch: [109] Total time: 0:07:18 (0.3506 s / it)
Averaged stats: lr: 0.003064  min_lr: 0.003064  loss: 3.5635 (3.3283)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8153 (0.8152)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.7163 (0.7163)  acc1: 87.2000 (87.2000)  acc5: 99.2000 (99.2000)  time: 5.3645  data: 5.1684  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 0.9685 (0.9397)  acc1: 82.0000 (80.9091)  acc5: 96.8000 (96.6909)  time: 0.6437  data: 0.4713  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1324 (1.1141)  acc1: 74.4000 (77.5619)  acc5: 94.4000 (94.4381)  time: 0.1728  data: 0.0034  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1871 (1.1167)  acc1: 75.2000 (77.4240)  acc5: 92.8000 (94.4160)  time: 0.1721  data: 0.0033  max mem: 28503
Test: Total time: 0:00:09 (0.3829 s / it)
* Acc@1 78.304 Acc@5 94.530 loss 1.100
Accuracy of the model on the 50000 test images: 78.3%
Max accuracy: 78.41%
Epoch: [110]  [   0/1251]  eta: 1:10:45  lr: 0.003064  min_lr: 0.003064  loss: 3.7526 (3.7526)  weight_decay: 0.0500 (0.0500)  time: 3.3935  data: 2.9111  max mem: 28503
Epoch: [110]  [ 200/1251]  eta: 0:06:22  lr: 0.003061  min_lr: 0.003061  loss: 3.4931 (3.2930)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7187 (0.8664)  time: 0.3469  data: 0.0004  max mem: 28503
Epoch: [110]  [ 400/1251]  eta: 0:05:02  lr: 0.003058  min_lr: 0.003058  loss: 3.4045 (3.3156)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6994 (0.8477)  time: 0.3471  data: 0.0004  max mem: 28503
Epoch: [110]  [ 600/1251]  eta: 0:03:49  lr: 0.003055  min_lr: 0.003055  loss: 3.4737 (3.3224)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6650 (0.8103)  time: 0.3543  data: 0.0004  max mem: 28503
Epoch: [110]  [ 800/1251]  eta: 0:02:38  lr: 0.003052  min_lr: 0.003052  loss: 3.3861 (3.3260)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7561 (0.8138)  time: 0.3544  data: 0.0004  max mem: 28503
Epoch: [110]  [1000/1251]  eta: 0:01:28  lr: 0.003049  min_lr: 0.003049  loss: 3.2939 (3.3306)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7776 (nan)  time: 0.3446  data: 0.0004  max mem: 28503
Epoch: [110]  [1200/1251]  eta: 0:00:17  lr: 0.003046  min_lr: 0.003046  loss: 3.4634 (3.3234)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7001 (nan)  time: 0.3462  data: 0.0004  max mem: 28503
Epoch: [110]  [1250/1251]  eta: 0:00:00  lr: 0.003045  min_lr: 0.003045  loss: 3.5701 (3.3208)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7776 (nan)  time: 0.2919  data: 0.0010  max mem: 28503
Epoch: [110] Total time: 0:07:17 (0.3499 s / it)
Averaged stats: lr: 0.003045  min_lr: 0.003045  loss: 3.5701 (3.3203)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7776 (nan)
Test:  [ 0/25]  eta: 0:02:26  loss: 0.6640 (0.6640)  acc1: 88.0000 (88.0000)  acc5: 99.2000 (99.2000)  time: 5.8424  data: 5.6487  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 0.8926 (0.8604)  acc1: 82.8000 (81.8545)  acc5: 96.8000 (96.7636)  time: 0.7583  data: 0.5861  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0462 (1.0319)  acc1: 75.6000 (78.3429)  acc5: 94.4000 (94.7619)  time: 0.2098  data: 0.0399  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1545 (1.0421)  acc1: 74.8000 (77.8880)  acc5: 94.0000 (94.5920)  time: 0.2088  data: 0.0398  max mem: 28503
Test: Total time: 0:00:10 (0.4319 s / it)
* Acc@1 78.494 Acc@5 94.686 loss 1.025
Accuracy of the model on the 50000 test images: 78.5%
Max accuracy: 78.49%
Epoch: [111]  [   0/1251]  eta: 1:09:15  lr: 0.003045  min_lr: 0.003045  loss: 3.5311 (3.5311)  weight_decay: 0.0500 (0.0500)  time: 3.3214  data: 2.9573  max mem: 28503
Epoch: [111]  [ 200/1251]  eta: 0:06:23  lr: 0.003042  min_lr: 0.003042  loss: 3.4642 (3.2275)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9279 (0.8443)  time: 0.3654  data: 0.0004  max mem: 28503
Epoch: [111]  [ 400/1251]  eta: 0:05:03  lr: 0.003039  min_lr: 0.003039  loss: 3.4411 (3.2686)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6554 (0.8382)  time: 0.3558  data: 0.0004  max mem: 28503
Epoch: [111]  [ 600/1251]  eta: 0:03:50  lr: 0.003036  min_lr: 0.003036  loss: 3.5841 (3.2879)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6958 (0.8203)  time: 0.3461  data: 0.0004  max mem: 28503
Epoch: [111]  [ 800/1251]  eta: 0:02:38  lr: 0.003033  min_lr: 0.003033  loss: 3.4003 (3.2906)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8311 (0.8084)  time: 0.3466  data: 0.0004  max mem: 28503
Epoch: [111]  [1000/1251]  eta: 0:01:28  lr: 0.003030  min_lr: 0.003030  loss: 3.4782 (3.3004)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6575 (0.8105)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [111]  [1200/1251]  eta: 0:00:17  lr: 0.003027  min_lr: 0.003027  loss: 3.3977 (3.3006)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7187 (0.8129)  time: 0.3582  data: 0.0004  max mem: 28503
Epoch: [111]  [1250/1251]  eta: 0:00:00  lr: 0.003026  min_lr: 0.003026  loss: 3.6333 (3.3006)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7187 (0.8075)  time: 0.2967  data: 0.0005  max mem: 28503
Epoch: [111] Total time: 0:07:18 (0.3508 s / it)
Averaged stats: lr: 0.003026  min_lr: 0.003026  loss: 3.6333 (3.3194)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7187 (0.8075)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.8236 (0.8236)  acc1: 86.8000 (86.8000)  acc5: 98.0000 (98.0000)  time: 5.7320  data: 5.5429  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9593 (0.9849)  acc1: 82.0000 (82.1455)  acc5: 97.6000 (96.8727)  time: 0.7180  data: 0.5464  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1657 (1.1413)  acc1: 76.8000 (78.2857)  acc5: 94.4000 (94.7619)  time: 0.1924  data: 0.0234  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.2431 (1.1550)  acc1: 76.4000 (77.8240)  acc5: 93.6000 (94.4960)  time: 0.1916  data: 0.0233  max mem: 28503
Test: Total time: 0:00:10 (0.4135 s / it)
* Acc@1 78.304 Acc@5 94.626 loss 1.137
Accuracy of the model on the 50000 test images: 78.3%
Max accuracy: 78.49%
Epoch: [112]  [   0/1251]  eta: 1:10:30  lr: 0.003026  min_lr: 0.003026  loss: 3.6593 (3.6593)  weight_decay: 0.0500 (0.0500)  time: 3.3815  data: 2.2319  max mem: 28503
Epoch: [112]  [ 200/1251]  eta: 0:06:20  lr: 0.003023  min_lr: 0.003023  loss: 3.1180 (3.3443)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7474 (0.8515)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [112]  [ 400/1251]  eta: 0:05:01  lr: 0.003020  min_lr: 0.003020  loss: 3.3103 (3.3311)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7329 (0.8305)  time: 0.3444  data: 0.0004  max mem: 28503
Epoch: [112]  [ 600/1251]  eta: 0:03:49  lr: 0.003017  min_lr: 0.003017  loss: 3.1309 (3.3115)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8970 (0.8254)  time: 0.3528  data: 0.0004  max mem: 28503
Epoch: [112]  [ 800/1251]  eta: 0:02:38  lr: 0.003014  min_lr: 0.003014  loss: 3.2254 (3.3210)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7006 (0.8238)  time: 0.3527  data: 0.0004  max mem: 28503
Epoch: [112]  [1000/1251]  eta: 0:01:28  lr: 0.003011  min_lr: 0.003011  loss: 3.2855 (3.3184)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7871 (0.8230)  time: 0.3446  data: 0.0004  max mem: 28503
Epoch: [112]  [1200/1251]  eta: 0:00:17  lr: 0.003007  min_lr: 0.003007  loss: 3.4697 (3.3211)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6716 (0.8356)  time: 0.3445  data: 0.0004  max mem: 28503
Epoch: [112]  [1250/1251]  eta: 0:00:00  lr: 0.003007  min_lr: 0.003007  loss: 3.2622 (3.3171)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8780 (0.8417)  time: 0.2915  data: 0.0006  max mem: 28503
Epoch: [112] Total time: 0:07:17 (0.3495 s / it)
Averaged stats: lr: 0.003007  min_lr: 0.003007  loss: 3.2622 (3.3193)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8780 (0.8417)
Test:  [ 0/25]  eta: 0:01:43  loss: 0.6653 (0.6653)  acc1: 87.2000 (87.2000)  acc5: 97.6000 (97.6000)  time: 4.1581  data: 3.9609  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 0.8770 (0.8564)  acc1: 83.2000 (82.5818)  acc5: 97.2000 (96.6909)  time: 0.6427  data: 0.4676  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1234 (1.0329)  acc1: 75.2000 (78.4381)  acc5: 94.4000 (94.6857)  time: 0.2613  data: 0.0907  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1677 (1.0454)  acc1: 75.2000 (78.0640)  acc5: 94.0000 (94.6560)  time: 0.2082  data: 0.0397  max mem: 28503
Test: Total time: 0:00:10 (0.4059 s / it)
* Acc@1 78.546 Acc@5 94.786 loss 1.034
Accuracy of the model on the 50000 test images: 78.5%
Max accuracy: 78.55%
Epoch: [113]  [   0/1251]  eta: 0:58:29  lr: 0.003007  min_lr: 0.003007  loss: 3.4015 (3.4015)  weight_decay: 0.0500 (0.0500)  time: 2.8055  data: 2.4150  max mem: 28503
Epoch: [113]  [ 200/1251]  eta: 0:06:20  lr: 0.003004  min_lr: 0.003004  loss: 2.9074 (3.2994)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6954 (0.7458)  time: 0.3465  data: 0.0004  max mem: 28503
Epoch: [113]  [ 400/1251]  eta: 0:05:01  lr: 0.003000  min_lr: 0.003000  loss: 3.2812 (3.2658)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6277 (0.7030)  time: 0.3444  data: 0.0004  max mem: 28503
Epoch: [113]  [ 600/1251]  eta: 0:03:49  lr: 0.002997  min_lr: 0.002997  loss: 3.4012 (3.2811)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6601 (0.7550)  time: 0.3467  data: 0.0004  max mem: 28503
Epoch: [113]  [ 800/1251]  eta: 0:02:38  lr: 0.002994  min_lr: 0.002994  loss: 3.3709 (3.3020)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7668 (0.7801)  time: 0.3473  data: 0.0005  max mem: 28503
Epoch: [113]  [1000/1251]  eta: 0:01:28  lr: 0.002991  min_lr: 0.002991  loss: 3.2872 (3.3001)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5970 (0.7926)  time: 0.3455  data: 0.0005  max mem: 28503
Epoch: [113]  [1200/1251]  eta: 0:00:17  lr: 0.002988  min_lr: 0.002988  loss: 3.3269 (3.3029)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8058 (0.8107)  time: 0.3457  data: 0.0004  max mem: 28503
Epoch: [113]  [1250/1251]  eta: 0:00:00  lr: 0.002987  min_lr: 0.002987  loss: 3.3747 (3.3015)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7675 (0.8099)  time: 0.2977  data: 0.0007  max mem: 28503
Epoch: [113] Total time: 0:07:17 (0.3498 s / it)
Averaged stats: lr: 0.002987  min_lr: 0.002987  loss: 3.3747 (3.3140)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7675 (0.8099)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.7125 (0.7125)  acc1: 88.0000 (88.0000)  acc5: 99.2000 (99.2000)  time: 5.5775  data: 5.3898  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9525 (0.9064)  acc1: 81.2000 (81.9636)  acc5: 96.8000 (96.5455)  time: 0.6974  data: 0.5261  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0843 (1.0618)  acc1: 76.0000 (78.6286)  acc5: 94.8000 (94.6667)  time: 0.1890  data: 0.0199  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1554 (1.0740)  acc1: 76.0000 (78.2080)  acc5: 94.0000 (94.5920)  time: 0.1883  data: 0.0198  max mem: 28503
Test: Total time: 0:00:10 (0.4047 s / it)
* Acc@1 78.478 Acc@5 94.836 loss 1.067
Accuracy of the model on the 50000 test images: 78.5%
Max accuracy: 78.55%
Epoch: [114]  [   0/1251]  eta: 1:06:16  lr: 0.002987  min_lr: 0.002987  loss: 2.9454 (2.9454)  weight_decay: 0.0500 (0.0500)  time: 3.1785  data: 1.7713  max mem: 28503
Epoch: [114]  [ 200/1251]  eta: 0:06:21  lr: 0.002984  min_lr: 0.002984  loss: 2.9504 (3.2641)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7274 (0.7288)  time: 0.3473  data: 0.0005  max mem: 28503
Epoch: [114]  [ 400/1251]  eta: 0:05:02  lr: 0.002981  min_lr: 0.002981  loss: 3.3100 (3.2736)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6768 (0.8037)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [114]  [ 600/1251]  eta: 0:03:49  lr: 0.002978  min_lr: 0.002978  loss: 3.4615 (3.2871)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7389 (0.7865)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [114]  [ 800/1251]  eta: 0:02:38  lr: 0.002975  min_lr: 0.002975  loss: 3.2947 (3.2922)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7981 (0.8067)  time: 0.3569  data: 0.0004  max mem: 28503
Epoch: [114]  [1000/1251]  eta: 0:01:28  lr: 0.002972  min_lr: 0.002972  loss: 3.2583 (3.2908)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7510 (0.8053)  time: 0.3463  data: 0.0004  max mem: 28503
Epoch: [114]  [1200/1251]  eta: 0:00:17  lr: 0.002968  min_lr: 0.002968  loss: 3.6250 (3.2968)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6440 (0.7914)  time: 0.3447  data: 0.0004  max mem: 28503
Epoch: [114]  [1250/1251]  eta: 0:00:00  lr: 0.002968  min_lr: 0.002968  loss: 3.2445 (3.2962)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6508 (0.7920)  time: 0.2918  data: 0.0005  max mem: 28503
Epoch: [114] Total time: 0:07:17 (0.3499 s / it)
Averaged stats: lr: 0.002968  min_lr: 0.002968  loss: 3.2445 (3.3119)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6508 (0.7920)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6144 (0.6144)  acc1: 85.6000 (85.6000)  acc5: 98.8000 (98.8000)  time: 5.5024  data: 5.3109  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 0.8512 (0.8323)  acc1: 80.4000 (81.7455)  acc5: 96.8000 (96.9091)  time: 0.6538  data: 0.4831  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0277 (0.9983)  acc1: 76.8000 (78.0191)  acc5: 93.6000 (94.8762)  time: 0.1794  data: 0.0109  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0277 (1.0074)  acc1: 76.0000 (77.7920)  acc5: 93.6000 (94.7840)  time: 0.1807  data: 0.0123  max mem: 28503
Test: Total time: 0:00:09 (0.3991 s / it)
* Acc@1 78.612 Acc@5 94.852 loss 1.000
Accuracy of the model on the 50000 test images: 78.6%
Max accuracy: 78.61%
Epoch: [115]  [   0/1251]  eta: 1:05:15  lr: 0.002968  min_lr: 0.002968  loss: 3.0405 (3.0405)  weight_decay: 0.0500 (0.0500)  time: 3.1302  data: 2.7827  max mem: 28503
Epoch: [115]  [ 200/1251]  eta: 0:06:20  lr: 0.002965  min_lr: 0.002965  loss: 3.4347 (3.2544)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7126 (0.8167)  time: 0.3572  data: 0.0004  max mem: 28503
Epoch: [115]  [ 400/1251]  eta: 0:05:02  lr: 0.002961  min_lr: 0.002961  loss: 3.3532 (3.2590)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6926 (0.7965)  time: 0.3470  data: 0.0004  max mem: 28503
Epoch: [115]  [ 600/1251]  eta: 0:03:49  lr: 0.002958  min_lr: 0.002958  loss: 3.2268 (3.2857)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7736 (0.8001)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [115]  [ 800/1251]  eta: 0:02:38  lr: 0.002955  min_lr: 0.002955  loss: 3.5239 (3.2849)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6240 (0.8044)  time: 0.3443  data: 0.0004  max mem: 28503
Epoch: [115]  [1000/1251]  eta: 0:01:27  lr: 0.002952  min_lr: 0.002952  loss: 3.1832 (3.2807)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6132 (0.7928)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [115]  [1200/1251]  eta: 0:00:17  lr: 0.002949  min_lr: 0.002949  loss: 3.0795 (3.2874)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8048 (0.8054)  time: 0.3473  data: 0.0004  max mem: 28503
Epoch: [115]  [1250/1251]  eta: 0:00:00  lr: 0.002948  min_lr: 0.002948  loss: 3.3548 (3.2847)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8108 (0.8087)  time: 0.2923  data: 0.0006  max mem: 28503
Epoch: [115] Total time: 0:07:17 (0.3497 s / it)
Averaged stats: lr: 0.002948  min_lr: 0.002948  loss: 3.3548 (3.3006)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8108 (0.8087)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.7198 (0.7198)  acc1: 86.0000 (86.0000)  acc5: 97.2000 (97.2000)  time: 5.3636  data: 5.1654  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8519 (0.8722)  acc1: 82.8000 (82.6909)  acc5: 96.8000 (96.6182)  time: 0.6795  data: 0.5078  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0624 (1.0454)  acc1: 75.2000 (78.3238)  acc5: 93.6000 (94.7810)  time: 0.1902  data: 0.0211  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1867 (1.0557)  acc1: 74.8000 (77.9360)  acc5: 93.6000 (94.6560)  time: 0.1900  data: 0.0210  max mem: 28503
Test: Total time: 0:00:09 (0.3971 s / it)
* Acc@1 78.468 Acc@5 94.764 loss 1.045
Accuracy of the model on the 50000 test images: 78.5%
Max accuracy: 78.61%
Epoch: [116]  [   0/1251]  eta: 1:07:18  lr: 0.002948  min_lr: 0.002948  loss: 3.8042 (3.8042)  weight_decay: 0.0500 (0.0500)  time: 3.2281  data: 2.7188  max mem: 28503
Epoch: [116]  [ 200/1251]  eta: 0:06:21  lr: 0.002945  min_lr: 0.002945  loss: 3.2329 (3.2645)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7870 (0.8178)  time: 0.3448  data: 0.0005  max mem: 28503
Epoch: [116]  [ 400/1251]  eta: 0:05:02  lr: 0.002942  min_lr: 0.002942  loss: 3.3614 (3.2666)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7814 (0.8622)  time: 0.3456  data: 0.0005  max mem: 28503
Epoch: [116]  [ 600/1251]  eta: 0:03:49  lr: 0.002938  min_lr: 0.002938  loss: 2.9821 (3.2708)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7213 (0.8565)  time: 0.3463  data: 0.0005  max mem: 28503
Epoch: [116]  [ 800/1251]  eta: 0:02:38  lr: 0.002935  min_lr: 0.002935  loss: 3.4367 (3.2833)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9058 (0.8670)  time: 0.3473  data: 0.0005  max mem: 28503
Epoch: [116]  [1000/1251]  eta: 0:01:28  lr: 0.002932  min_lr: 0.002932  loss: 3.2871 (3.2884)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6960 (0.8434)  time: 0.3446  data: 0.0005  max mem: 28503
Epoch: [116]  [1200/1251]  eta: 0:00:17  lr: 0.002929  min_lr: 0.002929  loss: 3.4323 (3.2914)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9305 (0.8472)  time: 0.3537  data: 0.0004  max mem: 28503
Epoch: [116]  [1250/1251]  eta: 0:00:00  lr: 0.002928  min_lr: 0.002928  loss: 3.5750 (3.2945)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9214 (0.8500)  time: 0.2919  data: 0.0007  max mem: 28503
Epoch: [116] Total time: 0:07:18 (0.3502 s / it)
Averaged stats: lr: 0.002928  min_lr: 0.002928  loss: 3.5750 (3.3062)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9214 (0.8500)
Test:  [ 0/25]  eta: 0:01:53  loss: 0.7621 (0.7621)  acc1: 86.4000 (86.4000)  acc5: 98.4000 (98.4000)  time: 4.5596  data: 4.3511  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9076 (0.9189)  acc1: 83.6000 (82.4727)  acc5: 97.6000 (97.0545)  time: 0.6692  data: 0.4955  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0878 (1.0858)  acc1: 76.4000 (78.6857)  acc5: 94.4000 (95.0857)  time: 0.2443  data: 0.0746  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.2309 (1.0955)  acc1: 75.6000 (78.2720)  acc5: 93.6000 (94.9120)  time: 0.1987  data: 0.0299  max mem: 28503
Test: Total time: 0:00:10 (0.4083 s / it)
* Acc@1 78.786 Acc@5 94.720 loss 1.088
Accuracy of the model on the 50000 test images: 78.8%
Max accuracy: 78.79%
Epoch: [117]  [   0/1251]  eta: 1:05:34  lr: 0.002928  min_lr: 0.002928  loss: 3.5557 (3.5557)  weight_decay: 0.0500 (0.0500)  time: 3.1447  data: 2.7942  max mem: 28503
Epoch: [117]  [ 200/1251]  eta: 0:06:20  lr: 0.002925  min_lr: 0.002925  loss: 3.4858 (3.3272)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7200 (0.7890)  time: 0.3546  data: 0.0004  max mem: 28503
Epoch: [117]  [ 400/1251]  eta: 0:05:02  lr: 0.002922  min_lr: 0.002922  loss: 3.4278 (3.3062)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8476 (0.8443)  time: 0.3657  data: 0.0004  max mem: 28503
Epoch: [117]  [ 600/1251]  eta: 0:03:49  lr: 0.002919  min_lr: 0.002919  loss: 3.2304 (3.3042)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7961 (0.8391)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [117]  [ 800/1251]  eta: 0:02:38  lr: 0.002915  min_lr: 0.002915  loss: 3.4070 (3.3025)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8316 (0.8464)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [117]  [1000/1251]  eta: 0:01:28  lr: 0.002912  min_lr: 0.002912  loss: 3.2991 (3.3071)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8577 (0.8375)  time: 0.3490  data: 0.0004  max mem: 28503
Epoch: [117]  [1200/1251]  eta: 0:00:17  lr: 0.002909  min_lr: 0.002909  loss: 3.6456 (3.3056)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7947 (0.8491)  time: 0.3465  data: 0.0004  max mem: 28503
Epoch: [117]  [1250/1251]  eta: 0:00:00  lr: 0.002908  min_lr: 0.002908  loss: 3.0966 (3.3038)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7466 (0.8493)  time: 0.2964  data: 0.0007  max mem: 28503
Epoch: [117] Total time: 0:07:18 (0.3505 s / it)
Averaged stats: lr: 0.002908  min_lr: 0.002908  loss: 3.0966 (3.2970)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7466 (0.8493)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.6520 (0.6520)  acc1: 88.4000 (88.4000)  acc5: 98.8000 (98.8000)  time: 5.4459  data: 5.2148  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8649 (0.8676)  acc1: 83.6000 (82.1818)  acc5: 97.2000 (96.7636)  time: 0.6786  data: 0.5039  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0336 (1.0295)  acc1: 77.2000 (78.7048)  acc5: 94.4000 (94.8191)  time: 0.1852  data: 0.0165  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1432 (1.0370)  acc1: 76.4000 (78.3520)  acc5: 93.6000 (94.7040)  time: 0.1849  data: 0.0164  max mem: 28503
Test: Total time: 0:00:10 (0.4008 s / it)
* Acc@1 78.620 Acc@5 94.850 loss 1.023
Accuracy of the model on the 50000 test images: 78.6%
Max accuracy: 78.79%
Epoch: [118]  [   0/1251]  eta: 1:11:25  lr: 0.002908  min_lr: 0.002908  loss: 2.4265 (2.4265)  weight_decay: 0.0500 (0.0500)  time: 3.4253  data: 2.9459  max mem: 28503
Epoch: [118]  [ 200/1251]  eta: 0:06:24  lr: 0.002905  min_lr: 0.002905  loss: 3.5514 (3.2893)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6662 (0.8482)  time: 0.3473  data: 0.0004  max mem: 28503
Epoch: [118]  [ 400/1251]  eta: 0:05:02  lr: 0.002902  min_lr: 0.002902  loss: 3.4430 (3.3157)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8157 (0.8022)  time: 0.3462  data: 0.0004  max mem: 28503
Epoch: [118]  [ 600/1251]  eta: 0:03:49  lr: 0.002899  min_lr: 0.002899  loss: 3.3443 (3.3097)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7171 (0.8066)  time: 0.3462  data: 0.0004  max mem: 28503
Epoch: [118]  [ 800/1251]  eta: 0:02:38  lr: 0.002895  min_lr: 0.002895  loss: 3.3951 (3.2993)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6008 (0.8012)  time: 0.3465  data: 0.0004  max mem: 28503
Epoch: [118]  [1000/1251]  eta: 0:01:28  lr: 0.002892  min_lr: 0.002892  loss: 3.4414 (3.3052)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8878 (0.8232)  time: 0.3488  data: 0.0005  max mem: 28503
Epoch: [118]  [1200/1251]  eta: 0:00:17  lr: 0.002889  min_lr: 0.002889  loss: 3.3689 (3.3075)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8472 (0.8298)  time: 0.3447  data: 0.0004  max mem: 28503
Epoch: [118]  [1250/1251]  eta: 0:00:00  lr: 0.002888  min_lr: 0.002888  loss: 3.0566 (3.3035)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7699 (0.8303)  time: 0.2921  data: 0.0006  max mem: 28503
Epoch: [118] Total time: 0:07:18 (0.3509 s / it)
Averaged stats: lr: 0.002888  min_lr: 0.002888  loss: 3.0566 (3.3026)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7699 (0.8303)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.6028 (0.6028)  acc1: 87.2000 (87.2000)  acc5: 98.8000 (98.8000)  time: 5.7655  data: 5.5698  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8846 (0.8147)  acc1: 83.2000 (82.0727)  acc5: 96.4000 (96.6909)  time: 0.6786  data: 0.5067  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9644 (0.9627)  acc1: 76.8000 (78.9143)  acc5: 94.0000 (94.8762)  time: 0.1693  data: 0.0002  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0969 (0.9734)  acc1: 76.8000 (78.6080)  acc5: 94.0000 (94.7680)  time: 0.1688  data: 0.0001  max mem: 28503
Test: Total time: 0:00:09 (0.3963 s / it)
* Acc@1 79.016 Acc@5 94.920 loss 0.966
Accuracy of the model on the 50000 test images: 79.0%
Max accuracy: 79.02%
Epoch: [119]  [   0/1251]  eta: 1:03:01  lr: 0.002888  min_lr: 0.002888  loss: 3.4670 (3.4670)  weight_decay: 0.0500 (0.0500)  time: 3.0231  data: 2.6748  max mem: 28503
Epoch: [119]  [ 200/1251]  eta: 0:06:19  lr: 0.002885  min_lr: 0.002885  loss: 3.1634 (3.2619)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6679 (nan)  time: 0.3541  data: 0.0004  max mem: 28503
Epoch: [119]  [ 400/1251]  eta: 0:05:02  lr: 0.002882  min_lr: 0.002882  loss: 3.5200 (3.2782)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0380 (nan)  time: 0.3573  data: 0.0004  max mem: 28503
Epoch: [119]  [ 600/1251]  eta: 0:03:49  lr: 0.002879  min_lr: 0.002879  loss: 3.4859 (3.2698)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7747 (nan)  time: 0.3473  data: 0.0004  max mem: 28503
Epoch: [119]  [ 800/1251]  eta: 0:02:38  lr: 0.002875  min_lr: 0.002875  loss: 3.3819 (3.2611)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9669 (nan)  time: 0.3469  data: 0.0004  max mem: 28503
Epoch: [119]  [1000/1251]  eta: 0:01:27  lr: 0.002872  min_lr: 0.002872  loss: 3.3821 (3.2759)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8610 (nan)  time: 0.3515  data: 0.0004  max mem: 28503
Epoch: [119]  [1200/1251]  eta: 0:00:17  lr: 0.002869  min_lr: 0.002869  loss: 3.3842 (3.2735)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7152 (nan)  time: 0.3475  data: 0.0004  max mem: 28503
Epoch: [119]  [1250/1251]  eta: 0:00:00  lr: 0.002868  min_lr: 0.002868  loss: 3.3661 (3.2786)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8059 (nan)  time: 0.3018  data: 0.0007  max mem: 28503
Epoch: [119] Total time: 0:07:17 (0.3501 s / it)
Averaged stats: lr: 0.002868  min_lr: 0.002868  loss: 3.3661 (3.2919)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8059 (nan)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.7266 (0.7266)  acc1: 87.2000 (87.2000)  acc5: 99.2000 (99.2000)  time: 5.5348  data: 5.3004  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 0.9463 (0.9281)  acc1: 84.4000 (82.0364)  acc5: 96.8000 (97.0545)  time: 0.7344  data: 0.5578  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1156 (1.1149)  acc1: 76.0000 (78.4762)  acc5: 94.0000 (94.8191)  time: 0.2124  data: 0.0418  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.2077 (1.1196)  acc1: 76.4000 (78.3840)  acc5: 93.6000 (94.7200)  time: 0.2112  data: 0.0418  max mem: 28503
Test: Total time: 0:00:10 (0.4214 s / it)
* Acc@1 78.642 Acc@5 94.670 loss 1.115
Accuracy of the model on the 50000 test images: 78.6%
Max accuracy: 79.02%
Epoch: [120]  [   0/1251]  eta: 1:08:40  lr: 0.002868  min_lr: 0.002868  loss: 3.7101 (3.7101)  weight_decay: 0.0500 (0.0500)  time: 3.2938  data: 1.7396  max mem: 28503
Epoch: [120]  [ 200/1251]  eta: 0:06:21  lr: 0.002865  min_lr: 0.002865  loss: 3.5190 (3.3042)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7150 (0.8076)  time: 0.3447  data: 0.0004  max mem: 28503
Epoch: [120]  [ 400/1251]  eta: 0:05:02  lr: 0.002862  min_lr: 0.002862  loss: 3.4218 (3.3146)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7826 (0.8265)  time: 0.3490  data: 0.0004  max mem: 28503
Epoch: [120]  [ 600/1251]  eta: 0:03:49  lr: 0.002858  min_lr: 0.002858  loss: 3.4199 (3.3027)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8092 (0.8240)  time: 0.3448  data: 0.0004  max mem: 28503
Epoch: [120]  [ 800/1251]  eta: 0:02:38  lr: 0.002855  min_lr: 0.002855  loss: 3.5261 (3.3155)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8288 (0.8246)  time: 0.3443  data: 0.0003  max mem: 28503
Epoch: [120]  [1000/1251]  eta: 0:01:28  lr: 0.002852  min_lr: 0.002852  loss: 3.5370 (3.3108)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7382 (0.8159)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [120]  [1200/1251]  eta: 0:00:17  lr: 0.002849  min_lr: 0.002849  loss: 3.5300 (3.3093)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7705 (0.8265)  time: 0.3465  data: 0.0004  max mem: 28503
Epoch: [120]  [1250/1251]  eta: 0:00:00  lr: 0.002848  min_lr: 0.002848  loss: 3.2429 (3.3089)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6839 (0.8282)  time: 0.2916  data: 0.0005  max mem: 28503
Epoch: [120] Total time: 0:07:17 (0.3501 s / it)
Averaged stats: lr: 0.002848  min_lr: 0.002848  loss: 3.2429 (3.2895)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6839 (0.8282)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.7553 (0.7553)  acc1: 85.2000 (85.2000)  acc5: 98.4000 (98.4000)  time: 5.3614  data: 5.1586  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9026 (0.8965)  acc1: 81.6000 (81.4909)  acc5: 96.4000 (96.4000)  time: 0.7327  data: 0.5584  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0577 (1.0539)  acc1: 77.2000 (78.4381)  acc5: 94.0000 (94.5143)  time: 0.2215  data: 0.0492  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1369 (1.0657)  acc1: 77.2000 (78.1120)  acc5: 93.6000 (94.4160)  time: 0.2203  data: 0.0492  max mem: 28503
Test: Total time: 0:00:10 (0.4226 s / it)
* Acc@1 78.698 Acc@5 94.790 loss 1.049
Accuracy of the model on the 50000 test images: 78.7%
Max accuracy: 79.02%
Epoch: [121]  [   0/1251]  eta: 1:09:34  lr: 0.002848  min_lr: 0.002848  loss: 3.2086 (3.2086)  weight_decay: 0.0500 (0.0500)  time: 3.3369  data: 1.8587  max mem: 28503
Epoch: [121]  [ 200/1251]  eta: 0:06:23  lr: 0.002845  min_lr: 0.002845  loss: 3.1270 (3.2769)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7348 (0.8485)  time: 0.3546  data: 0.0004  max mem: 28503
Epoch: [121]  [ 400/1251]  eta: 0:05:03  lr: 0.002841  min_lr: 0.002841  loss: 3.3708 (3.2977)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8062 (0.8566)  time: 0.3466  data: 0.0004  max mem: 28503
Epoch: [121]  [ 600/1251]  eta: 0:03:50  lr: 0.002838  min_lr: 0.002838  loss: 3.4504 (3.2657)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7697 (0.8481)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [121]  [ 800/1251]  eta: 0:02:39  lr: 0.002835  min_lr: 0.002835  loss: 3.4240 (3.2763)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7035 (0.8653)  time: 0.3476  data: 0.0004  max mem: 28503
Epoch: [121]  [1000/1251]  eta: 0:01:28  lr: 0.002831  min_lr: 0.002831  loss: 3.2218 (3.2793)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6621 (0.8406)  time: 0.3471  data: 0.0004  max mem: 28503
Epoch: [121]  [1200/1251]  eta: 0:00:17  lr: 0.002828  min_lr: 0.002828  loss: 3.2038 (3.2835)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8361 (0.8323)  time: 0.3586  data: 0.0005  max mem: 28503
Epoch: [121]  [1250/1251]  eta: 0:00:00  lr: 0.002827  min_lr: 0.002827  loss: 3.4262 (3.2873)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7201 (0.8295)  time: 0.2917  data: 0.0007  max mem: 28503
Epoch: [121] Total time: 0:07:19 (0.3511 s / it)
Averaged stats: lr: 0.002827  min_lr: 0.002827  loss: 3.4262 (3.2788)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7201 (0.8295)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.6509 (0.6509)  acc1: 88.4000 (88.4000)  acc5: 100.0000 (100.0000)  time: 5.7774  data: 5.5743  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9150 (0.8941)  acc1: 82.4000 (82.5455)  acc5: 97.2000 (97.1273)  time: 0.7262  data: 0.5534  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0941 (1.0591)  acc1: 78.0000 (78.7619)  acc5: 94.0000 (95.0286)  time: 0.1948  data: 0.0257  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1917 (1.0695)  acc1: 77.2000 (78.5760)  acc5: 93.6000 (94.9600)  time: 0.1941  data: 0.0257  max mem: 28503
Test: Total time: 0:00:10 (0.4175 s / it)
* Acc@1 79.006 Acc@5 94.996 loss 1.060
Accuracy of the model on the 50000 test images: 79.0%
Max accuracy: 79.02%
Epoch: [122]  [   0/1251]  eta: 1:09:23  lr: 0.002827  min_lr: 0.002827  loss: 2.3698 (2.3698)  weight_decay: 0.0500 (0.0500)  time: 3.3283  data: 2.7124  max mem: 28503
Epoch: [122]  [ 200/1251]  eta: 0:06:20  lr: 0.002824  min_lr: 0.002824  loss: 3.2030 (3.2610)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8477 (0.7770)  time: 0.3463  data: 0.0004  max mem: 28503
Epoch: [122]  [ 400/1251]  eta: 0:05:02  lr: 0.002821  min_lr: 0.002821  loss: 3.4223 (3.2569)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7512 (0.8218)  time: 0.3463  data: 0.0004  max mem: 28503
Epoch: [122]  [ 600/1251]  eta: 0:03:49  lr: 0.002818  min_lr: 0.002818  loss: 3.3723 (3.2347)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7405 (0.8379)  time: 0.3462  data: 0.0004  max mem: 28503
Epoch: [122]  [ 800/1251]  eta: 0:02:39  lr: 0.002814  min_lr: 0.002814  loss: 3.4499 (3.2359)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7349 (0.8303)  time: 0.3467  data: 0.0004  max mem: 28503
Epoch: [122]  [1000/1251]  eta: 0:01:28  lr: 0.002811  min_lr: 0.002811  loss: 3.4116 (3.2325)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6960 (0.8164)  time: 0.3465  data: 0.0004  max mem: 28503
Epoch: [122]  [1200/1251]  eta: 0:00:17  lr: 0.002808  min_lr: 0.002808  loss: 3.3343 (3.2593)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8126 (0.8095)  time: 0.3564  data: 0.0004  max mem: 28503
Epoch: [122]  [1250/1251]  eta: 0:00:00  lr: 0.002807  min_lr: 0.002807  loss: 3.6299 (3.2614)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8651 (0.8131)  time: 0.2917  data: 0.0007  max mem: 28503
Epoch: [122] Total time: 0:07:18 (0.3506 s / it)
Averaged stats: lr: 0.002807  min_lr: 0.002807  loss: 3.6299 (3.2870)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8651 (0.8131)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.7247 (0.7247)  acc1: 87.2000 (87.2000)  acc5: 98.8000 (98.8000)  time: 5.7242  data: 5.5231  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 0.9285 (0.9144)  acc1: 82.8000 (81.9273)  acc5: 97.6000 (96.8364)  time: 0.7690  data: 0.5972  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1324 (1.0639)  acc1: 77.2000 (78.9524)  acc5: 94.0000 (94.9333)  time: 0.2210  data: 0.0523  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1501 (1.0786)  acc1: 77.2000 (78.4640)  acc5: 94.0000 (94.8640)  time: 0.2207  data: 0.0523  max mem: 28503
Test: Total time: 0:00:10 (0.4369 s / it)
* Acc@1 79.008 Acc@5 94.866 loss 1.060
Accuracy of the model on the 50000 test images: 79.0%
Max accuracy: 79.02%
Epoch: [123]  [   0/1251]  eta: 1:09:38  lr: 0.002807  min_lr: 0.002807  loss: 3.5166 (3.5166)  weight_decay: 0.0500 (0.0500)  time: 3.3403  data: 2.1976  max mem: 28503
Epoch: [123]  [ 200/1251]  eta: 0:06:22  lr: 0.002804  min_lr: 0.002804  loss: 3.1044 (3.2277)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6503 (0.7657)  time: 0.3530  data: 0.0004  max mem: 28503
Epoch: [123]  [ 400/1251]  eta: 0:05:03  lr: 0.002800  min_lr: 0.002800  loss: 3.3182 (3.2565)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9855 (0.8211)  time: 0.3557  data: 0.0004  max mem: 28503
Epoch: [123]  [ 600/1251]  eta: 0:03:50  lr: 0.002797  min_lr: 0.002797  loss: 3.4352 (3.2745)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7406 (0.8214)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [123]  [ 800/1251]  eta: 0:02:38  lr: 0.002794  min_lr: 0.002794  loss: 3.2777 (3.2747)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8486 (0.8389)  time: 0.3548  data: 0.0004  max mem: 28503
Epoch: [123]  [1000/1251]  eta: 0:01:28  lr: 0.002790  min_lr: 0.002790  loss: 3.5090 (3.2805)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6645 (0.8208)  time: 0.3462  data: 0.0004  max mem: 28503
Epoch: [123]  [1200/1251]  eta: 0:00:17  lr: 0.002787  min_lr: 0.002787  loss: 3.2819 (3.2851)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7114 (0.8107)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [123]  [1250/1251]  eta: 0:00:00  lr: 0.002786  min_lr: 0.002786  loss: 3.3101 (3.2852)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7244 (0.8105)  time: 0.2916  data: 0.0005  max mem: 28503
Epoch: [123] Total time: 0:07:18 (0.3504 s / it)
Averaged stats: lr: 0.002786  min_lr: 0.002786  loss: 3.3101 (3.2849)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7244 (0.8105)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6101 (0.6101)  acc1: 88.0000 (88.0000)  acc5: 99.2000 (99.2000)  time: 5.5139  data: 5.3179  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8620 (0.8289)  acc1: 83.2000 (82.6182)  acc5: 97.2000 (97.0546)  time: 0.6845  data: 0.5128  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0024 (0.9831)  acc1: 76.0000 (78.9714)  acc5: 94.4000 (94.9524)  time: 0.1849  data: 0.0162  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0395 (0.9918)  acc1: 76.4000 (78.6560)  acc5: 94.4000 (94.9760)  time: 0.1843  data: 0.0161  max mem: 28503
Test: Total time: 0:00:10 (0.4008 s / it)
* Acc@1 79.116 Acc@5 95.026 loss 0.983
Accuracy of the model on the 50000 test images: 79.1%
Max accuracy: 79.12%
Epoch: [124]  [   0/1251]  eta: 1:03:23  lr: 0.002786  min_lr: 0.002786  loss: 3.6935 (3.6935)  weight_decay: 0.0500 (0.0500)  time: 3.0403  data: 2.6483  max mem: 28503
Epoch: [124]  [ 200/1251]  eta: 0:06:20  lr: 0.002783  min_lr: 0.002783  loss: 3.3840 (3.2725)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7671 (0.8424)  time: 0.3593  data: 0.0004  max mem: 28503
Epoch: [124]  [ 400/1251]  eta: 0:05:03  lr: 0.002780  min_lr: 0.002780  loss: 3.2797 (3.2512)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7407 (0.8516)  time: 0.3465  data: 0.0004  max mem: 28503
Epoch: [124]  [ 600/1251]  eta: 0:03:50  lr: 0.002776  min_lr: 0.002776  loss: 3.4592 (3.2556)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6603 (0.8142)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [124]  [ 800/1251]  eta: 0:02:39  lr: 0.002773  min_lr: 0.002773  loss: 3.4821 (3.2698)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6901 (0.8183)  time: 0.3480  data: 0.0004  max mem: 28503
Epoch: [124]  [1000/1251]  eta: 0:01:28  lr: 0.002770  min_lr: 0.002770  loss: 3.4014 (3.2773)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7419 (0.8236)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [124]  [1200/1251]  eta: 0:00:17  lr: 0.002766  min_lr: 0.002766  loss: 3.4574 (3.2812)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8766 (0.8223)  time: 0.3457  data: 0.0004  max mem: 28503
Epoch: [124]  [1250/1251]  eta: 0:00:00  lr: 0.002766  min_lr: 0.002766  loss: 3.3940 (3.2752)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7840 (0.8244)  time: 0.2915  data: 0.0007  max mem: 28503
Epoch: [124] Total time: 0:07:18 (0.3509 s / it)
Averaged stats: lr: 0.002766  min_lr: 0.002766  loss: 3.3940 (3.2773)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7840 (0.8244)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6158 (0.6158)  acc1: 88.0000 (88.0000)  acc5: 98.8000 (98.8000)  time: 5.5049  data: 5.3062  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 0.8528 (0.8184)  acc1: 82.4000 (82.5455)  acc5: 96.4000 (96.6546)  time: 0.6553  data: 0.4827  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0238 (0.9776)  acc1: 76.0000 (79.0667)  acc5: 94.4000 (94.8762)  time: 0.1876  data: 0.0185  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0826 (0.9887)  acc1: 76.0000 (78.7360)  acc5: 93.6000 (94.8640)  time: 0.1867  data: 0.0184  max mem: 28503
Test: Total time: 0:00:10 (0.4012 s / it)
* Acc@1 79.236 Acc@5 95.072 loss 0.973
Accuracy of the model on the 50000 test images: 79.2%
Max accuracy: 79.24%
Epoch: [125]  [   0/1251]  eta: 1:01:00  lr: 0.002766  min_lr: 0.002766  loss: 3.7442 (3.7442)  weight_decay: 0.0500 (0.0500)  time: 2.9263  data: 2.4973  max mem: 28503
Epoch: [125]  [ 200/1251]  eta: 0:06:21  lr: 0.002762  min_lr: 0.002762  loss: 3.5004 (3.3035)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7694 (0.8099)  time: 0.3540  data: 0.0004  max mem: 28503
Epoch: [125]  [ 400/1251]  eta: 0:05:03  lr: 0.002759  min_lr: 0.002759  loss: 3.4083 (3.2926)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9084 (0.8824)  time: 0.3473  data: 0.0004  max mem: 28503
Epoch: [125]  [ 600/1251]  eta: 0:03:49  lr: 0.002756  min_lr: 0.002756  loss: 3.4198 (3.2862)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.5927 (0.8490)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [125]  [ 800/1251]  eta: 0:02:38  lr: 0.002752  min_lr: 0.002752  loss: 3.0950 (3.2888)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9302 (0.8635)  time: 0.3551  data: 0.0004  max mem: 28503
Epoch: [125]  [1000/1251]  eta: 0:01:28  lr: 0.002749  min_lr: 0.002749  loss: 3.4622 (3.2827)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7237 (0.8604)  time: 0.3462  data: 0.0004  max mem: 28503
Epoch: [125]  [1200/1251]  eta: 0:00:17  lr: 0.002746  min_lr: 0.002746  loss: 3.1140 (3.2730)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6789 (0.8677)  time: 0.3460  data: 0.0004  max mem: 28503
Epoch: [125]  [1250/1251]  eta: 0:00:00  lr: 0.002745  min_lr: 0.002745  loss: 3.1803 (3.2729)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8512 (0.8719)  time: 0.2923  data: 0.0005  max mem: 28503
Epoch: [125] Total time: 0:07:18 (0.3506 s / it)
Averaged stats: lr: 0.002745  min_lr: 0.002745  loss: 3.1803 (3.2671)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8512 (0.8719)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.6571 (0.6571)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 5.7476  data: 5.5188  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8834 (0.8658)  acc1: 83.2000 (82.7636)  acc5: 96.8000 (96.7273)  time: 0.7174  data: 0.5433  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0783 (1.0162)  acc1: 77.2000 (79.2000)  acc5: 94.8000 (95.0857)  time: 0.1922  data: 0.0229  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1232 (1.0285)  acc1: 76.4000 (78.7040)  acc5: 94.4000 (95.1200)  time: 0.1920  data: 0.0228  max mem: 28503
Test: Total time: 0:00:10 (0.4139 s / it)
* Acc@1 79.176 Acc@5 95.088 loss 1.015
Accuracy of the model on the 50000 test images: 79.2%
Max accuracy: 79.24%
Epoch: [126]  [   0/1251]  eta: 1:07:31  lr: 0.002745  min_lr: 0.002745  loss: 3.4052 (3.4052)  weight_decay: 0.0500 (0.0500)  time: 3.2385  data: 2.5537  max mem: 28503
Epoch: [126]  [ 200/1251]  eta: 0:06:20  lr: 0.002742  min_lr: 0.002742  loss: 3.2677 (3.2550)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7861 (0.9216)  time: 0.3444  data: 0.0004  max mem: 28503
Epoch: [126]  [ 400/1251]  eta: 0:05:02  lr: 0.002738  min_lr: 0.002738  loss: 3.2264 (3.2790)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6100 (0.8329)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [126]  [ 600/1251]  eta: 0:03:49  lr: 0.002735  min_lr: 0.002735  loss: 3.4381 (3.2853)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6284 (0.8039)  time: 0.3469  data: 0.0004  max mem: 28503
Epoch: [126]  [ 800/1251]  eta: 0:02:38  lr: 0.002732  min_lr: 0.002732  loss: 3.4067 (3.2885)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7438 (0.8003)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [126]  [1000/1251]  eta: 0:01:28  lr: 0.002728  min_lr: 0.002728  loss: 3.3015 (3.2840)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7004 (0.8306)  time: 0.3465  data: 0.0004  max mem: 28503
Epoch: [126]  [1200/1251]  eta: 0:00:17  lr: 0.002725  min_lr: 0.002725  loss: 3.3050 (3.2813)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9207 (0.8372)  time: 0.3540  data: 0.0004  max mem: 28503
Epoch: [126]  [1250/1251]  eta: 0:00:00  lr: 0.002724  min_lr: 0.002724  loss: 3.3842 (3.2851)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9207 (0.8452)  time: 0.2917  data: 0.0007  max mem: 28503
Epoch: [126] Total time: 0:07:17 (0.3501 s / it)
Averaged stats: lr: 0.002724  min_lr: 0.002724  loss: 3.3842 (3.2698)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9207 (0.8452)
Test:  [ 0/25]  eta: 0:02:31  loss: 0.7866 (0.7866)  acc1: 85.6000 (85.6000)  acc5: 97.6000 (97.6000)  time: 6.0754  data: 5.8793  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9802 (0.9500)  acc1: 83.6000 (82.2182)  acc5: 97.2000 (96.7636)  time: 0.7061  data: 0.5348  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1292 (1.0930)  acc1: 76.0000 (78.7429)  acc5: 94.4000 (94.4762)  time: 0.1689  data: 0.0002  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1796 (1.1039)  acc1: 75.2000 (78.3840)  acc5: 93.2000 (94.4640)  time: 0.1686  data: 0.0001  max mem: 28503
Test: Total time: 0:00:10 (0.4093 s / it)
* Acc@1 79.266 Acc@5 95.040 loss 1.083
Accuracy of the model on the 50000 test images: 79.3%
Max accuracy: 79.27%
Epoch: [127]  [   0/1251]  eta: 1:02:54  lr: 0.002724  min_lr: 0.002724  loss: 2.9367 (2.9367)  weight_decay: 0.0500 (0.0500)  time: 3.0175  data: 2.5918  max mem: 28503
Epoch: [127]  [ 200/1251]  eta: 0:06:19  lr: 0.002721  min_lr: 0.002721  loss: 3.3577 (3.2210)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8067 (0.9150)  time: 0.3469  data: 0.0004  max mem: 28503
Epoch: [127]  [ 400/1251]  eta: 0:05:02  lr: 0.002717  min_lr: 0.002717  loss: 3.3790 (3.2515)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6612 (0.8584)  time: 0.3452  data: 0.0003  max mem: 28503
Epoch: [127]  [ 600/1251]  eta: 0:03:49  lr: 0.002714  min_lr: 0.002714  loss: 3.2975 (3.2518)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9018 (0.8931)  time: 0.3446  data: 0.0004  max mem: 28503
Epoch: [127]  [ 800/1251]  eta: 0:02:38  lr: 0.002711  min_lr: 0.002711  loss: 3.2627 (3.2573)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8044 (0.8647)  time: 0.3461  data: 0.0004  max mem: 28503
Epoch: [127]  [1000/1251]  eta: 0:01:27  lr: 0.002707  min_lr: 0.002707  loss: 3.3459 (3.2524)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8451 (0.8701)  time: 0.3440  data: 0.0005  max mem: 28503
Epoch: [127]  [1200/1251]  eta: 0:00:17  lr: 0.002704  min_lr: 0.002704  loss: 3.3216 (3.2521)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7289 (0.8783)  time: 0.3612  data: 0.0005  max mem: 28503
Epoch: [127]  [1250/1251]  eta: 0:00:00  lr: 0.002703  min_lr: 0.002703  loss: 3.2184 (3.2508)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7050 (0.8742)  time: 0.2996  data: 0.0006  max mem: 28503
Epoch: [127] Total time: 0:07:17 (0.3497 s / it)
Averaged stats: lr: 0.002703  min_lr: 0.002703  loss: 3.2184 (3.2637)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7050 (0.8742)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6387 (0.6387)  acc1: 90.0000 (90.0000)  acc5: 98.4000 (98.4000)  time: 5.6354  data: 5.4396  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8693 (0.8560)  acc1: 82.0000 (82.2545)  acc5: 97.2000 (96.6546)  time: 0.7065  data: 0.5353  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0315 (1.0077)  acc1: 75.6000 (79.0286)  acc5: 94.0000 (94.7429)  time: 0.1915  data: 0.0225  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1164 (1.0140)  acc1: 75.6000 (78.6880)  acc5: 93.6000 (94.6720)  time: 0.1917  data: 0.0224  max mem: 28503
Test: Total time: 0:00:10 (0.4093 s / it)
* Acc@1 79.186 Acc@5 94.922 loss 1.001
Accuracy of the model on the 50000 test images: 79.2%
Max accuracy: 79.27%
Epoch: [128]  [   0/1251]  eta: 1:13:41  lr: 0.002703  min_lr: 0.002703  loss: 3.1529 (3.1529)  weight_decay: 0.0500 (0.0500)  time: 3.5348  data: 2.6327  max mem: 28503
Epoch: [128]  [ 200/1251]  eta: 0:06:22  lr: 0.002700  min_lr: 0.002700  loss: 3.3045 (3.2428)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6952 (0.7086)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [128]  [ 400/1251]  eta: 0:05:02  lr: 0.002696  min_lr: 0.002696  loss: 3.4184 (3.2348)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7542 (0.7846)  time: 0.3446  data: 0.0004  max mem: 28503
Epoch: [128]  [ 600/1251]  eta: 0:03:49  lr: 0.002693  min_lr: 0.002693  loss: 3.4995 (3.2577)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7372 (0.7882)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [128]  [ 800/1251]  eta: 0:02:38  lr: 0.002690  min_lr: 0.002690  loss: 3.3013 (3.2517)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7724 (0.8054)  time: 0.3457  data: 0.0004  max mem: 28503
Epoch: [128]  [1000/1251]  eta: 0:01:28  lr: 0.002686  min_lr: 0.002686  loss: 3.5051 (3.2519)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6733 (0.7879)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [128]  [1200/1251]  eta: 0:00:17  lr: 0.002683  min_lr: 0.002683  loss: 3.1399 (3.2475)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7139 (0.7862)  time: 0.3575  data: 0.0004  max mem: 28503
Epoch: [128]  [1250/1251]  eta: 0:00:00  lr: 0.002682  min_lr: 0.002682  loss: 3.5188 (3.2515)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7139 (0.7852)  time: 0.2916  data: 0.0007  max mem: 28503
Epoch: [128] Total time: 0:07:18 (0.3503 s / it)
Averaged stats: lr: 0.002682  min_lr: 0.002682  loss: 3.5188 (3.2648)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7139 (0.7852)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.6849 (0.6849)  acc1: 86.8000 (86.8000)  acc5: 98.8000 (98.8000)  time: 5.6739  data: 5.4908  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 0.9463 (0.9052)  acc1: 81.6000 (82.1818)  acc5: 97.2000 (96.8727)  time: 0.7506  data: 0.5764  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0630 (1.0563)  acc1: 77.2000 (78.9333)  acc5: 94.4000 (94.8571)  time: 0.2134  data: 0.0425  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1633 (1.0667)  acc1: 76.8000 (78.6080)  acc5: 94.4000 (94.8800)  time: 0.2120  data: 0.0424  max mem: 28503
Test: Total time: 0:00:10 (0.4278 s / it)
* Acc@1 79.040 Acc@5 95.056 loss 1.054
Accuracy of the model on the 50000 test images: 79.0%
Max accuracy: 79.27%
Epoch: [129]  [   0/1251]  eta: 1:13:49  lr: 0.002682  min_lr: 0.002682  loss: 3.0004 (3.0004)  weight_decay: 0.0500 (0.0500)  time: 3.5406  data: 2.5766  max mem: 28503
Epoch: [129]  [ 200/1251]  eta: 0:06:24  lr: 0.002679  min_lr: 0.002679  loss: 3.2957 (3.2484)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8418 (0.8986)  time: 0.3554  data: 0.0005  max mem: 28503
Epoch: [129]  [ 400/1251]  eta: 0:05:03  lr: 0.002675  min_lr: 0.002675  loss: 3.4253 (3.2398)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7924 (0.8699)  time: 0.3543  data: 0.0004  max mem: 28503
Epoch: [129]  [ 600/1251]  eta: 0:03:49  lr: 0.002672  min_lr: 0.002672  loss: 3.4855 (3.2437)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8173 (0.8521)  time: 0.3446  data: 0.0004  max mem: 28503
Epoch: [129]  [ 800/1251]  eta: 0:02:38  lr: 0.002668  min_lr: 0.002668  loss: 3.2197 (3.2636)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6539 (0.8419)  time: 0.3458  data: 0.0005  max mem: 28503
Epoch: [129]  [1000/1251]  eta: 0:01:28  lr: 0.002665  min_lr: 0.002665  loss: 3.2652 (3.2665)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7778 (0.8348)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [129]  [1200/1251]  eta: 0:00:17  lr: 0.002662  min_lr: 0.002662  loss: 3.0552 (3.2662)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7506 (nan)  time: 0.3462  data: 0.0004  max mem: 28503
Epoch: [129]  [1250/1251]  eta: 0:00:00  lr: 0.002661  min_lr: 0.002661  loss: 3.5130 (3.2705)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7053 (nan)  time: 0.2952  data: 0.0005  max mem: 28503
Epoch: [129] Total time: 0:07:17 (0.3497 s / it)
Averaged stats: lr: 0.002661  min_lr: 0.002661  loss: 3.5130 (3.2601)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7053 (nan)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.7273 (0.7273)  acc1: 87.6000 (87.6000)  acc5: 100.0000 (100.0000)  time: 5.7072  data: 5.5054  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 1.0116 (0.9798)  acc1: 82.8000 (82.0000)  acc5: 96.0000 (96.6909)  time: 0.7351  data: 0.5633  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1987 (1.1266)  acc1: 76.8000 (78.5714)  acc5: 94.4000 (94.9143)  time: 0.2032  data: 0.0346  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1987 (1.1330)  acc1: 75.6000 (78.2400)  acc5: 93.6000 (94.8160)  time: 0.2030  data: 0.0345  max mem: 28503
Test: Total time: 0:00:10 (0.4216 s / it)
* Acc@1 79.062 Acc@5 94.978 loss 1.120
Accuracy of the model on the 50000 test images: 79.1%
Max accuracy: 79.27%
Epoch: [130]  [   0/1251]  eta: 1:10:07  lr: 0.002661  min_lr: 0.002661  loss: 3.0616 (3.0616)  weight_decay: 0.0500 (0.0500)  time: 3.3636  data: 2.2155  max mem: 28503
Epoch: [130]  [ 200/1251]  eta: 0:06:21  lr: 0.002657  min_lr: 0.002657  loss: 3.4003 (3.2856)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9137 (0.9543)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [130]  [ 400/1251]  eta: 0:05:02  lr: 0.002654  min_lr: 0.002654  loss: 3.4670 (3.2503)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6703 (0.8689)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [130]  [ 600/1251]  eta: 0:03:49  lr: 0.002651  min_lr: 0.002651  loss: 3.3382 (3.2771)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7060 (0.8714)  time: 0.3463  data: 0.0004  max mem: 28503
Epoch: [130]  [ 800/1251]  eta: 0:02:38  lr: 0.002647  min_lr: 0.002647  loss: 3.1333 (3.2775)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7750 (0.8796)  time: 0.3471  data: 0.0004  max mem: 28503
Epoch: [130]  [1000/1251]  eta: 0:01:28  lr: 0.002644  min_lr: 0.002644  loss: 3.2706 (3.2834)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8575 (0.8789)  time: 0.3443  data: 0.0004  max mem: 28503
Epoch: [130]  [1200/1251]  eta: 0:00:17  lr: 0.002640  min_lr: 0.002640  loss: 3.2639 (3.2795)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6542 (0.8640)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [130]  [1250/1251]  eta: 0:00:00  lr: 0.002640  min_lr: 0.002640  loss: 3.3090 (3.2762)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7269 (0.8587)  time: 0.2916  data: 0.0007  max mem: 28503
Epoch: [130] Total time: 0:07:17 (0.3500 s / it)
Averaged stats: lr: 0.002640  min_lr: 0.002640  loss: 3.3090 (3.2594)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7269 (0.8587)
Test:  [ 0/25]  eta: 0:02:09  loss: 0.6998 (0.6998)  acc1: 88.4000 (88.4000)  acc5: 98.8000 (98.8000)  time: 5.1661  data: 4.9609  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9092 (0.8771)  acc1: 81.6000 (82.7273)  acc5: 97.2000 (97.0909)  time: 0.6769  data: 0.5039  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0689 (1.0358)  acc1: 77.2000 (79.4476)  acc5: 94.4000 (95.1429)  time: 0.1983  data: 0.0292  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1548 (1.0502)  acc1: 77.2000 (79.0880)  acc5: 94.0000 (95.0080)  time: 0.1976  data: 0.0291  max mem: 28503
Test: Total time: 0:00:09 (0.3966 s / it)
* Acc@1 79.232 Acc@5 94.982 loss 1.051
Accuracy of the model on the 50000 test images: 79.2%
Max accuracy: 79.27%
Epoch: [131]  [   0/1251]  eta: 1:04:00  lr: 0.002640  min_lr: 0.002640  loss: 3.7126 (3.7126)  weight_decay: 0.0500 (0.0500)  time: 3.0701  data: 2.0728  max mem: 28503
Epoch: [131]  [ 200/1251]  eta: 0:06:20  lr: 0.002636  min_lr: 0.002636  loss: 3.2679 (3.2970)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7631 (0.7907)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [131]  [ 400/1251]  eta: 0:05:02  lr: 0.002633  min_lr: 0.002633  loss: 3.3968 (3.3029)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8359 (0.8126)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [131]  [ 600/1251]  eta: 0:03:49  lr: 0.002629  min_lr: 0.002629  loss: 3.1209 (3.2804)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8391 (0.8492)  time: 0.3470  data: 0.0004  max mem: 28503
Epoch: [131]  [ 800/1251]  eta: 0:02:38  lr: 0.002626  min_lr: 0.002626  loss: 3.3787 (3.2748)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6764 (0.8163)  time: 0.3554  data: 0.0004  max mem: 28503
Epoch: [131]  [1000/1251]  eta: 0:01:28  lr: 0.002623  min_lr: 0.002623  loss: 3.4544 (3.2604)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7978 (0.8168)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [131]  [1200/1251]  eta: 0:00:17  lr: 0.002619  min_lr: 0.002619  loss: 3.3240 (3.2602)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6676 (0.8042)  time: 0.3460  data: 0.0004  max mem: 28503
Epoch: [131]  [1250/1251]  eta: 0:00:00  lr: 0.002618  min_lr: 0.002618  loss: 3.3922 (3.2635)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8186 (0.8147)  time: 0.2971  data: 0.0005  max mem: 28503
Epoch: [131] Total time: 0:07:18 (0.3504 s / it)
Averaged stats: lr: 0.002618  min_lr: 0.002618  loss: 3.3922 (3.2519)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8186 (0.8147)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.6703 (0.6703)  acc1: 88.8000 (88.8000)  acc5: 99.2000 (99.2000)  time: 5.4545  data: 5.2497  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8859 (0.8559)  acc1: 83.6000 (82.7273)  acc5: 96.8000 (96.9091)  time: 0.7087  data: 0.5364  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0407 (1.0005)  acc1: 76.4000 (79.0286)  acc5: 94.8000 (94.9524)  time: 0.2015  data: 0.0326  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1039 (1.0076)  acc1: 76.4000 (78.7840)  acc5: 94.0000 (94.9600)  time: 0.2014  data: 0.0325  max mem: 28503
Test: Total time: 0:00:10 (0.4099 s / it)
* Acc@1 79.318 Acc@5 95.050 loss 0.996
Accuracy of the model on the 50000 test images: 79.3%
Max accuracy: 79.32%
Epoch: [132]  [   0/1251]  eta: 1:12:37  lr: 0.002618  min_lr: 0.002618  loss: 2.9964 (2.9964)  weight_decay: 0.0500 (0.0500)  time: 3.4832  data: 3.1373  max mem: 28503
Epoch: [132]  [ 200/1251]  eta: 0:06:22  lr: 0.002615  min_lr: 0.002615  loss: 3.4017 (3.2409)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7302 (0.8621)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [132]  [ 400/1251]  eta: 0:05:03  lr: 0.002612  min_lr: 0.002612  loss: 3.1314 (3.2475)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6208 (0.8383)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [132]  [ 600/1251]  eta: 0:03:49  lr: 0.002608  min_lr: 0.002608  loss: 3.2348 (3.2376)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8887 (0.8722)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [132]  [ 800/1251]  eta: 0:02:38  lr: 0.002605  min_lr: 0.002605  loss: 3.4309 (3.2541)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9198 (0.8443)  time: 0.3471  data: 0.0004  max mem: 28503
Epoch: [132]  [1000/1251]  eta: 0:01:28  lr: 0.002601  min_lr: 0.002601  loss: 3.3664 (3.2626)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7563 (0.8311)  time: 0.3575  data: 0.0004  max mem: 28503
Epoch: [132]  [1200/1251]  eta: 0:00:17  lr: 0.002598  min_lr: 0.002598  loss: 3.5776 (3.2546)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6708 (0.8602)  time: 0.3467  data: 0.0004  max mem: 28503
Epoch: [132]  [1250/1251]  eta: 0:00:00  lr: 0.002597  min_lr: 0.002597  loss: 3.0506 (3.2559)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6947 (0.8563)  time: 0.2915  data: 0.0007  max mem: 28503
Epoch: [132] Total time: 0:07:18 (0.3503 s / it)
Averaged stats: lr: 0.002597  min_lr: 0.002597  loss: 3.0506 (3.2499)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6947 (0.8563)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.6603 (0.6603)  acc1: 88.4000 (88.4000)  acc5: 98.4000 (98.4000)  time: 5.6746  data: 5.4626  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8485 (0.8614)  acc1: 84.8000 (82.8727)  acc5: 96.8000 (96.6545)  time: 0.7252  data: 0.5519  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0496 (1.0024)  acc1: 76.8000 (79.5810)  acc5: 94.4000 (95.0667)  time: 0.1994  data: 0.0305  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1007 (1.0113)  acc1: 76.8000 (79.0400)  acc5: 94.0000 (94.9760)  time: 0.1989  data: 0.0304  max mem: 28503
Test: Total time: 0:00:10 (0.4169 s / it)
* Acc@1 79.446 Acc@5 95.140 loss 1.004
Accuracy of the model on the 50000 test images: 79.4%
Max accuracy: 79.45%
Epoch: [133]  [   0/1251]  eta: 1:07:49  lr: 0.002597  min_lr: 0.002597  loss: 2.6109 (2.6109)  weight_decay: 0.0500 (0.0500)  time: 3.2529  data: 2.8882  max mem: 28503
Epoch: [133]  [ 200/1251]  eta: 0:06:20  lr: 0.002594  min_lr: 0.002594  loss: 3.2193 (3.2178)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8383 (0.8055)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [133]  [ 400/1251]  eta: 0:05:02  lr: 0.002590  min_lr: 0.002590  loss: 3.2580 (3.2393)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9518 (0.8489)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [133]  [ 600/1251]  eta: 0:03:49  lr: 0.002587  min_lr: 0.002587  loss: 3.4245 (3.2452)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8519 (0.8512)  time: 0.3462  data: 0.0004  max mem: 28503
Epoch: [133]  [ 800/1251]  eta: 0:02:38  lr: 0.002583  min_lr: 0.002583  loss: 3.3608 (3.2433)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7963 (0.8556)  time: 0.3547  data: 0.0004  max mem: 28503
Epoch: [133]  [1000/1251]  eta: 0:01:27  lr: 0.002580  min_lr: 0.002580  loss: 3.2098 (3.2521)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7667 (0.8571)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [133]  [1200/1251]  eta: 0:00:17  lr: 0.002576  min_lr: 0.002576  loss: 3.3770 (3.2439)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6765 (0.8394)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [133]  [1250/1251]  eta: 0:00:00  lr: 0.002576  min_lr: 0.002576  loss: 3.3088 (3.2443)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6121 (0.8336)  time: 0.2919  data: 0.0006  max mem: 28503
Epoch: [133] Total time: 0:07:17 (0.3498 s / it)
Averaged stats: lr: 0.002576  min_lr: 0.002576  loss: 3.3088 (3.2491)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6121 (0.8336)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6680 (0.6680)  acc1: 86.8000 (86.8000)  acc5: 98.8000 (98.8000)  time: 5.6289  data: 5.4185  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8695 (0.8585)  acc1: 83.6000 (82.1455)  acc5: 97.6000 (97.0546)  time: 0.7318  data: 0.5589  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0980 (1.0047)  acc1: 77.2000 (79.3905)  acc5: 94.0000 (95.1429)  time: 0.2074  data: 0.0365  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0983 (1.0127)  acc1: 77.2000 (79.2800)  acc5: 94.0000 (95.1520)  time: 0.2068  data: 0.0364  max mem: 28503
Test: Total time: 0:00:10 (0.4210 s / it)
* Acc@1 79.480 Acc@5 95.142 loss 1.000
Accuracy of the model on the 50000 test images: 79.5%
Max accuracy: 79.48%
Epoch: [134]  [   0/1251]  eta: 1:01:13  lr: 0.002576  min_lr: 0.002576  loss: 3.7664 (3.7664)  weight_decay: 0.0500 (0.0500)  time: 2.9362  data: 2.5413  max mem: 28503
Epoch: [134]  [ 200/1251]  eta: 0:06:18  lr: 0.002572  min_lr: 0.002572  loss: 3.4074 (3.1952)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8392 (0.9175)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [134]  [ 400/1251]  eta: 0:05:02  lr: 0.002569  min_lr: 0.002569  loss: 3.3518 (3.2263)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6854 (0.8886)  time: 0.3470  data: 0.0004  max mem: 28503
Epoch: [134]  [ 600/1251]  eta: 0:03:49  lr: 0.002565  min_lr: 0.002565  loss: 3.2441 (3.2193)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7769 (0.8714)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [134]  [ 800/1251]  eta: 0:02:38  lr: 0.002562  min_lr: 0.002562  loss: 3.5108 (3.2376)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7259 (0.8668)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [134]  [1000/1251]  eta: 0:01:28  lr: 0.002558  min_lr: 0.002558  loss: 3.1791 (3.2336)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7210 (0.8945)  time: 0.3552  data: 0.0004  max mem: 28503
Epoch: [134]  [1200/1251]  eta: 0:00:17  lr: 0.002555  min_lr: 0.002555  loss: 3.3801 (3.2428)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8565 (0.8944)  time: 0.3465  data: 0.0005  max mem: 28503
Epoch: [134]  [1250/1251]  eta: 0:00:00  lr: 0.002554  min_lr: 0.002554  loss: 3.4244 (3.2450)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8559 (0.8930)  time: 0.2919  data: 0.0005  max mem: 28503
Epoch: [134] Total time: 0:07:18 (0.3504 s / it)
Averaged stats: lr: 0.002554  min_lr: 0.002554  loss: 3.4244 (3.2465)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8559 (0.8930)
Test:  [ 0/25]  eta: 0:02:09  loss: 0.6693 (0.6693)  acc1: 89.6000 (89.6000)  acc5: 98.4000 (98.4000)  time: 5.1882  data: 4.9878  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9097 (0.8684)  acc1: 82.8000 (82.6545)  acc5: 97.6000 (97.0909)  time: 0.7277  data: 0.5531  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0795 (1.0069)  acc1: 77.2000 (79.4286)  acc5: 95.6000 (95.3333)  time: 0.2251  data: 0.0549  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1385 (1.0168)  acc1: 77.2000 (79.1360)  acc5: 94.4000 (95.2960)  time: 0.2164  data: 0.0481  max mem: 28503
Test: Total time: 0:00:10 (0.4182 s / it)
* Acc@1 79.310 Acc@5 95.174 loss 1.008
Accuracy of the model on the 50000 test images: 79.3%
Max accuracy: 79.48%
Epoch: [135]  [   0/1251]  eta: 1:11:33  lr: 0.002554  min_lr: 0.002554  loss: 2.7931 (2.7931)  weight_decay: 0.0500 (0.0500)  time: 3.4321  data: 2.2587  max mem: 28503
Epoch: [135]  [ 200/1251]  eta: 0:06:21  lr: 0.002551  min_lr: 0.002551  loss: 3.3274 (3.2142)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6494 (0.7490)  time: 0.3497  data: 0.0004  max mem: 28503
Epoch: [135]  [ 400/1251]  eta: 0:05:02  lr: 0.002547  min_lr: 0.002547  loss: 3.1283 (3.2350)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7094 (0.7605)  time: 0.3463  data: 0.0004  max mem: 28503
Epoch: [135]  [ 600/1251]  eta: 0:03:50  lr: 0.002544  min_lr: 0.002544  loss: 3.5246 (3.2651)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7634 (0.8067)  time: 0.3466  data: 0.0004  max mem: 28503
Epoch: [135]  [ 800/1251]  eta: 0:02:38  lr: 0.002540  min_lr: 0.002540  loss: 3.2581 (3.2592)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7912 (0.8053)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [135]  [1000/1251]  eta: 0:01:28  lr: 0.002537  min_lr: 0.002537  loss: 3.4837 (3.2699)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6204 (0.8144)  time: 0.3463  data: 0.0004  max mem: 28503
Epoch: [135]  [1200/1251]  eta: 0:00:17  lr: 0.002533  min_lr: 0.002533  loss: 3.3354 (3.2701)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7443 (0.8210)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [135]  [1250/1251]  eta: 0:00:00  lr: 0.002533  min_lr: 0.002533  loss: 3.2164 (3.2697)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7139 (0.8226)  time: 0.2920  data: 0.0006  max mem: 28503
Epoch: [135] Total time: 0:07:18 (0.3502 s / it)
Averaged stats: lr: 0.002533  min_lr: 0.002533  loss: 3.2164 (3.2372)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7139 (0.8226)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6309 (0.6309)  acc1: 88.4000 (88.4000)  acc5: 98.8000 (98.8000)  time: 5.5586  data: 5.3541  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 0.8561 (0.8168)  acc1: 82.8000 (82.7273)  acc5: 97.2000 (96.8727)  time: 0.6591  data: 0.4870  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9766 (0.9649)  acc1: 77.2000 (79.2191)  acc5: 94.4000 (94.9905)  time: 0.1690  data: 0.0002  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0812 (0.9753)  acc1: 76.4000 (78.7360)  acc5: 93.6000 (94.9120)  time: 0.1688  data: 0.0001  max mem: 28503
Test: Total time: 0:00:09 (0.3880 s / it)
* Acc@1 79.318 Acc@5 95.152 loss 0.965
Accuracy of the model on the 50000 test images: 79.3%
Max accuracy: 79.48%
Epoch: [136]  [   0/1251]  eta: 1:06:37  lr: 0.002532  min_lr: 0.002532  loss: 2.8098 (2.8098)  weight_decay: 0.0500 (0.0500)  time: 3.1953  data: 2.4007  max mem: 28503
Epoch: [136]  [ 200/1251]  eta: 0:06:21  lr: 0.002529  min_lr: 0.002529  loss: 3.2887 (3.2105)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7971 (0.8181)  time: 0.3448  data: 0.0004  max mem: 28503
Epoch: [136]  [ 400/1251]  eta: 0:05:02  lr: 0.002526  min_lr: 0.002526  loss: 3.2832 (3.2167)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8459 (0.8396)  time: 0.3448  data: 0.0004  max mem: 28503
Epoch: [136]  [ 600/1251]  eta: 0:03:50  lr: 0.002522  min_lr: 0.002522  loss: 3.4102 (3.2426)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8794 (nan)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [136]  [ 800/1251]  eta: 0:02:38  lr: 0.002519  min_lr: 0.002519  loss: 3.3071 (3.2473)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6217 (nan)  time: 0.3448  data: 0.0004  max mem: 28503
Epoch: [136]  [1000/1251]  eta: 0:01:28  lr: 0.002515  min_lr: 0.002515  loss: 3.3889 (3.2472)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8021 (nan)  time: 0.3631  data: 0.0004  max mem: 28503
Epoch: [136]  [1200/1251]  eta: 0:00:17  lr: 0.002512  min_lr: 0.002512  loss: 3.4090 (3.2501)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7705 (nan)  time: 0.3461  data: 0.0004  max mem: 28503
Epoch: [136]  [1250/1251]  eta: 0:00:00  lr: 0.002511  min_lr: 0.002511  loss: 3.3534 (3.2514)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7050 (nan)  time: 0.2914  data: 0.0005  max mem: 28503
Epoch: [136] Total time: 0:07:17 (0.3498 s / it)
Averaged stats: lr: 0.002511  min_lr: 0.002511  loss: 3.3534 (3.2410)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7050 (nan)
Test:  [ 0/25]  eta: 0:02:10  loss: 0.7748 (0.7748)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 5.2099  data: 4.9935  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9408 (0.9215)  acc1: 82.0000 (82.2909)  acc5: 97.2000 (96.9818)  time: 0.6891  data: 0.5139  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0770 (1.0684)  acc1: 76.4000 (78.7048)  acc5: 94.8000 (95.1238)  time: 0.2028  data: 0.0330  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1799 (1.0825)  acc1: 76.4000 (78.0960)  acc5: 94.4000 (95.0400)  time: 0.2014  data: 0.0330  max mem: 28503
Test: Total time: 0:00:10 (0.4008 s / it)
* Acc@1 79.184 Acc@5 95.102 loss 1.067
Accuracy of the model on the 50000 test images: 79.2%
Max accuracy: 79.48%
Epoch: [137]  [   0/1251]  eta: 1:16:17  lr: 0.002511  min_lr: 0.002511  loss: 3.4166 (3.4166)  weight_decay: 0.0500 (0.0500)  time: 3.6587  data: 2.4446  max mem: 28503
Epoch: [137]  [ 200/1251]  eta: 0:06:23  lr: 0.002507  min_lr: 0.002507  loss: 3.4557 (3.1834)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0572 (0.9925)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [137]  [ 400/1251]  eta: 0:05:03  lr: 0.002504  min_lr: 0.002504  loss: 3.4947 (3.2118)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7830 (0.8791)  time: 0.3465  data: 0.0004  max mem: 28503
Epoch: [137]  [ 600/1251]  eta: 0:03:49  lr: 0.002500  min_lr: 0.002500  loss: 3.3278 (3.2250)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [137]  [ 800/1251]  eta: 0:02:38  lr: 0.002497  min_lr: 0.002497  loss: 3.3603 (3.2186)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8376 (nan)  time: 0.3441  data: 0.0004  max mem: 28503
Epoch: [137]  [1000/1251]  eta: 0:01:27  lr: 0.002493  min_lr: 0.002493  loss: 3.2739 (3.2175)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8895 (nan)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [137]  [1200/1251]  eta: 0:00:17  lr: 0.002490  min_lr: 0.002490  loss: 3.3154 (3.2147)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7956 (nan)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [137]  [1250/1251]  eta: 0:00:00  lr: 0.002489  min_lr: 0.002489  loss: 3.0281 (3.2110)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8439 (nan)  time: 0.2917  data: 0.0007  max mem: 28503
Epoch: [137] Total time: 0:07:17 (0.3496 s / it)
Averaged stats: lr: 0.002489  min_lr: 0.002489  loss: 3.0281 (3.2321)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8439 (nan)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6219 (0.6219)  acc1: 87.2000 (87.2000)  acc5: 98.8000 (98.8000)  time: 5.7076  data: 5.5061  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8613 (0.8137)  acc1: 81.6000 (82.2182)  acc5: 97.6000 (97.1636)  time: 0.7327  data: 0.5593  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9783 (0.9668)  acc1: 76.4000 (78.8952)  acc5: 94.8000 (95.3143)  time: 0.2076  data: 0.0324  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0352 (0.9728)  acc1: 76.4000 (78.6080)  acc5: 94.0000 (95.2320)  time: 0.2071  data: 0.0323  max mem: 28503
Test: Total time: 0:00:10 (0.4244 s / it)
* Acc@1 79.312 Acc@5 95.090 loss 0.961
Accuracy of the model on the 50000 test images: 79.3%
Max accuracy: 79.48%
Epoch: [138]  [   0/1251]  eta: 1:06:55  lr: 0.002489  min_lr: 0.002489  loss: 3.5997 (3.5997)  weight_decay: 0.0500 (0.0500)  time: 3.2096  data: 2.4388  max mem: 28503
Epoch: [138]  [ 200/1251]  eta: 0:06:22  lr: 0.002486  min_lr: 0.002486  loss: 3.4061 (3.2831)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7540 (0.8333)  time: 0.3460  data: 0.0004  max mem: 28503
Epoch: [138]  [ 400/1251]  eta: 0:05:02  lr: 0.002482  min_lr: 0.002482  loss: 3.3399 (3.2582)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8001 (0.8383)  time: 0.3462  data: 0.0004  max mem: 28503
Epoch: [138]  [ 600/1251]  eta: 0:03:49  lr: 0.002479  min_lr: 0.002479  loss: 3.0877 (3.2596)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8208 (0.8779)  time: 0.3550  data: 0.0004  max mem: 28503
Epoch: [138]  [ 800/1251]  eta: 0:02:38  lr: 0.002475  min_lr: 0.002475  loss: 3.0600 (3.2475)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8554 (0.8742)  time: 0.3460  data: 0.0004  max mem: 28503
Epoch: [138]  [1000/1251]  eta: 0:01:28  lr: 0.002472  min_lr: 0.002472  loss: 2.9270 (3.2308)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7227 (0.8528)  time: 0.3602  data: 0.0004  max mem: 28503
Epoch: [138]  [1200/1251]  eta: 0:00:17  lr: 0.002468  min_lr: 0.002468  loss: 3.3339 (3.2297)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6292 (0.8344)  time: 0.3555  data: 0.0004  max mem: 28503
Epoch: [138]  [1250/1251]  eta: 0:00:00  lr: 0.002467  min_lr: 0.002467  loss: 3.3354 (3.2299)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7476 (0.8337)  time: 0.2918  data: 0.0007  max mem: 28503
Epoch: [138] Total time: 0:07:17 (0.3499 s / it)
Averaged stats: lr: 0.002467  min_lr: 0.002467  loss: 3.3354 (3.2330)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7476 (0.8337)
Test:  [ 0/25]  eta: 0:01:29  loss: 0.6660 (0.6660)  acc1: 86.8000 (86.8000)  acc5: 98.0000 (98.0000)  time: 3.5639  data: 3.3239  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 0.8234 (0.8285)  acc1: 82.4000 (83.0545)  acc5: 97.2000 (96.7636)  time: 0.6504  data: 0.4730  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9891 (0.9929)  acc1: 76.8000 (79.1429)  acc5: 95.2000 (95.0095)  time: 0.2767  data: 0.1069  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1539 (1.0046)  acc1: 76.8000 (78.7680)  acc5: 94.0000 (95.0080)  time: 0.2286  data: 0.0593  max mem: 28503
Test: Total time: 0:00:09 (0.3946 s / it)
* Acc@1 79.504 Acc@5 95.266 loss 0.984
Accuracy of the model on the 50000 test images: 79.5%
Max accuracy: 79.50%
Epoch: [139]  [   0/1251]  eta: 1:04:22  lr: 0.002467  min_lr: 0.002467  loss: 3.8768 (3.8768)  weight_decay: 0.0500 (0.0500)  time: 3.0875  data: 2.7372  max mem: 28503
Epoch: [139]  [ 200/1251]  eta: 0:06:22  lr: 0.002464  min_lr: 0.002464  loss: 3.1912 (3.3026)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7619 (0.9420)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [139]  [ 400/1251]  eta: 0:05:02  lr: 0.002460  min_lr: 0.002460  loss: 3.3638 (3.2665)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7717 (0.9100)  time: 0.3599  data: 0.0004  max mem: 28503
Epoch: [139]  [ 600/1251]  eta: 0:03:50  lr: 0.002457  min_lr: 0.002457  loss: 3.1796 (3.2574)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8635 (0.8639)  time: 0.3478  data: 0.0004  max mem: 28503
Epoch: [139]  [ 800/1251]  eta: 0:02:38  lr: 0.002453  min_lr: 0.002453  loss: 3.6558 (3.2549)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9162 (0.8980)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [139]  [1000/1251]  eta: 0:01:28  lr: 0.002450  min_lr: 0.002450  loss: 3.2221 (3.2539)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7053 (0.8674)  time: 0.3554  data: 0.0004  max mem: 28503
Epoch: [139]  [1200/1251]  eta: 0:00:17  lr: 0.002446  min_lr: 0.002446  loss: 3.3390 (3.2481)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7862 (0.8776)  time: 0.3461  data: 0.0004  max mem: 28503
Epoch: [139]  [1250/1251]  eta: 0:00:00  lr: 0.002446  min_lr: 0.002446  loss: 3.2387 (3.2459)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7210 (0.8709)  time: 0.2928  data: 0.0006  max mem: 28503
Epoch: [139] Total time: 0:07:18 (0.3507 s / it)
Averaged stats: lr: 0.002446  min_lr: 0.002446  loss: 3.2387 (3.2286)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7210 (0.8709)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6139 (0.6139)  acc1: 86.8000 (86.8000)  acc5: 98.8000 (98.8000)  time: 5.6397  data: 5.4301  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8365 (0.8097)  acc1: 83.2000 (82.6545)  acc5: 97.2000 (96.9818)  time: 0.6945  data: 0.5204  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9880 (0.9571)  acc1: 76.8000 (79.5619)  acc5: 94.8000 (95.3524)  time: 0.1889  data: 0.0194  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0417 (0.9611)  acc1: 76.8000 (79.2960)  acc5: 94.8000 (95.3760)  time: 0.1882  data: 0.0193  max mem: 28503
Test: Total time: 0:00:10 (0.4068 s / it)
* Acc@1 79.696 Acc@5 95.324 loss 0.949
Accuracy of the model on the 50000 test images: 79.7%
Max accuracy: 79.70%
Epoch: [140]  [   0/1251]  eta: 1:01:52  lr: 0.002445  min_lr: 0.002445  loss: 2.5495 (2.5495)  weight_decay: 0.0500 (0.0500)  time: 2.9679  data: 2.5982  max mem: 28503
Epoch: [140]  [ 200/1251]  eta: 0:06:22  lr: 0.002442  min_lr: 0.002442  loss: 3.4046 (3.1697)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8810 (0.8010)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [140]  [ 400/1251]  eta: 0:05:01  lr: 0.002438  min_lr: 0.002438  loss: 3.4197 (3.2039)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7057 (0.8484)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [140]  [ 600/1251]  eta: 0:03:49  lr: 0.002435  min_lr: 0.002435  loss: 3.4487 (3.2134)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7460 (0.8515)  time: 0.3673  data: 0.0004  max mem: 28503
Epoch: [140]  [ 800/1251]  eta: 0:02:38  lr: 0.002431  min_lr: 0.002431  loss: 3.4009 (3.2183)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8669 (0.8505)  time: 0.3469  data: 0.0005  max mem: 28503
Epoch: [140]  [1000/1251]  eta: 0:01:28  lr: 0.002428  min_lr: 0.002428  loss: 3.4451 (3.2251)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8505 (0.8506)  time: 0.3536  data: 0.0005  max mem: 28503
Epoch: [140]  [1200/1251]  eta: 0:00:17  lr: 0.002424  min_lr: 0.002424  loss: 3.0696 (3.2278)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6302 (0.8504)  time: 0.3447  data: 0.0004  max mem: 28503
Epoch: [140]  [1250/1251]  eta: 0:00:00  lr: 0.002424  min_lr: 0.002424  loss: 3.4968 (3.2318)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7760 (0.8505)  time: 0.2916  data: 0.0005  max mem: 28503
Epoch: [140] Total time: 0:07:17 (0.3500 s / it)
Averaged stats: lr: 0.002424  min_lr: 0.002424  loss: 3.4968 (3.2228)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7760 (0.8505)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.7771 (0.7771)  acc1: 88.0000 (88.0000)  acc5: 98.4000 (98.4000)  time: 5.3821  data: 5.1808  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 1.0074 (0.9663)  acc1: 83.2000 (82.5818)  acc5: 96.4000 (96.9455)  time: 0.6872  data: 0.5151  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1238 (1.1045)  acc1: 77.2000 (79.6571)  acc5: 95.6000 (95.3524)  time: 0.1982  data: 0.0294  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1374 (1.1167)  acc1: 77.2000 (79.1840)  acc5: 95.6000 (95.3600)  time: 0.1978  data: 0.0294  max mem: 28503
Test: Total time: 0:00:10 (0.4040 s / it)
* Acc@1 79.356 Acc@5 95.256 loss 1.113
Accuracy of the model on the 50000 test images: 79.4%
Max accuracy: 79.70%
Epoch: [141]  [   0/1251]  eta: 1:07:05  lr: 0.002424  min_lr: 0.002424  loss: 3.2568 (3.2568)  weight_decay: 0.0500 (0.0500)  time: 3.2180  data: 2.1076  max mem: 28503
Epoch: [141]  [ 200/1251]  eta: 0:06:20  lr: 0.002420  min_lr: 0.002420  loss: 3.3624 (3.1975)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0286 (0.9196)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [141]  [ 400/1251]  eta: 0:05:01  lr: 0.002417  min_lr: 0.002417  loss: 3.4600 (3.2391)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7275 (0.9290)  time: 0.3444  data: 0.0004  max mem: 28503
Epoch: [141]  [ 600/1251]  eta: 0:03:49  lr: 0.002413  min_lr: 0.002413  loss: 3.1231 (3.2143)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8354 (0.9153)  time: 0.3549  data: 0.0004  max mem: 28503
Epoch: [141]  [ 800/1251]  eta: 0:02:38  lr: 0.002409  min_lr: 0.002409  loss: 3.2084 (3.2007)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8375 (0.8922)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [141]  [1000/1251]  eta: 0:01:27  lr: 0.002406  min_lr: 0.002406  loss: 3.2646 (3.2067)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8756 (0.8755)  time: 0.3463  data: 0.0004  max mem: 28503
Epoch: [141]  [1200/1251]  eta: 0:00:17  lr: 0.002402  min_lr: 0.002402  loss: 3.4246 (3.2154)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8472 (0.8706)  time: 0.3489  data: 0.0004  max mem: 28503
Epoch: [141]  [1250/1251]  eta: 0:00:00  lr: 0.002402  min_lr: 0.002402  loss: 3.2394 (3.2112)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7700 (0.8656)  time: 0.2915  data: 0.0005  max mem: 28503
Epoch: [141] Total time: 0:07:17 (0.3494 s / it)
Averaged stats: lr: 0.002402  min_lr: 0.002402  loss: 3.2394 (3.2234)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7700 (0.8656)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6089 (0.6089)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 5.5611  data: 5.3568  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8603 (0.8432)  acc1: 83.6000 (82.8364)  acc5: 97.2000 (96.9455)  time: 0.7230  data: 0.5497  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0597 (0.9825)  acc1: 76.8000 (79.2571)  acc5: 94.4000 (95.0667)  time: 0.2038  data: 0.0346  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0597 (0.9941)  acc1: 76.8000 (78.8000)  acc5: 94.4000 (95.0400)  time: 0.2029  data: 0.0345  max mem: 28503
Test: Total time: 0:00:10 (0.4157 s / it)
* Acc@1 79.512 Acc@5 95.220 loss 0.977
Accuracy of the model on the 50000 test images: 79.5%
Max accuracy: 79.70%
Epoch: [142]  [   0/1251]  eta: 1:09:32  lr: 0.002402  min_lr: 0.002402  loss: 3.2829 (3.2829)  weight_decay: 0.0500 (0.0500)  time: 3.3350  data: 2.6201  max mem: 28503
Epoch: [142]  [ 200/1251]  eta: 0:06:23  lr: 0.002398  min_lr: 0.002398  loss: 3.4358 (3.2912)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8519 (0.9078)  time: 0.3457  data: 0.0004  max mem: 28503
Epoch: [142]  [ 400/1251]  eta: 0:05:02  lr: 0.002395  min_lr: 0.002395  loss: 3.4336 (3.2823)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7870 (0.9182)  time: 0.3474  data: 0.0004  max mem: 28503
Epoch: [142]  [ 600/1251]  eta: 0:03:50  lr: 0.002391  min_lr: 0.002391  loss: 3.3721 (3.2632)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7488 (0.8781)  time: 0.3661  data: 0.0004  max mem: 28503
Epoch: [142]  [ 800/1251]  eta: 0:02:38  lr: 0.002387  min_lr: 0.002387  loss: 3.0843 (3.2513)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6458 (0.8629)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [142]  [1000/1251]  eta: 0:01:28  lr: 0.002384  min_lr: 0.002384  loss: 3.3332 (3.2364)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9186 (0.8662)  time: 0.3462  data: 0.0004  max mem: 28503
Epoch: [142]  [1200/1251]  eta: 0:00:17  lr: 0.002380  min_lr: 0.002380  loss: 3.3788 (3.2327)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8500 (0.8816)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [142]  [1250/1251]  eta: 0:00:00  lr: 0.002380  min_lr: 0.002380  loss: 3.1249 (3.2303)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9057 (0.8820)  time: 0.2920  data: 0.0007  max mem: 28503
Epoch: [142] Total time: 0:07:18 (0.3502 s / it)
Averaged stats: lr: 0.002380  min_lr: 0.002380  loss: 3.1249 (3.2188)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9057 (0.8820)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6073 (0.6073)  acc1: 86.8000 (86.8000)  acc5: 98.4000 (98.4000)  time: 5.5804  data: 5.3763  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 0.8673 (0.8353)  acc1: 83.2000 (82.4727)  acc5: 97.2000 (97.1636)  time: 0.7550  data: 0.5814  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9907 (0.9861)  acc1: 77.6000 (79.2381)  acc5: 94.4000 (95.3333)  time: 0.2205  data: 0.0510  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0993 (0.9895)  acc1: 77.6000 (79.0240)  acc5: 94.4000 (95.2640)  time: 0.2200  data: 0.0509  max mem: 28503
Test: Total time: 0:00:10 (0.4303 s / it)
* Acc@1 79.610 Acc@5 95.368 loss 0.972
Accuracy of the model on the 50000 test images: 79.6%
Max accuracy: 79.70%
Epoch: [143]  [   0/1251]  eta: 1:06:24  lr: 0.002380  min_lr: 0.002380  loss: 2.9440 (2.9440)  weight_decay: 0.0500 (0.0500)  time: 3.1848  data: 2.6269  max mem: 28503
Epoch: [143]  [ 200/1251]  eta: 0:06:23  lr: 0.002376  min_lr: 0.002376  loss: 3.4377 (3.2826)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8670 (0.8861)  time: 0.3466  data: 0.0004  max mem: 28503
Epoch: [143]  [ 400/1251]  eta: 0:05:02  lr: 0.002373  min_lr: 0.002373  loss: 3.3839 (3.2308)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8673 (0.8827)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [143]  [ 600/1251]  eta: 0:03:50  lr: 0.002369  min_lr: 0.002369  loss: 3.2457 (3.2276)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7588 (0.8476)  time: 0.3468  data: 0.0004  max mem: 28503
Epoch: [143]  [ 800/1251]  eta: 0:02:38  lr: 0.002365  min_lr: 0.002365  loss: 3.1533 (3.2189)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6934 (0.8302)  time: 0.3463  data: 0.0004  max mem: 28503
Epoch: [143]  [1000/1251]  eta: 0:01:28  lr: 0.002362  min_lr: 0.002362  loss: 3.5781 (3.2218)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7817 (0.8447)  time: 0.3473  data: 0.0004  max mem: 28503
Epoch: [143]  [1200/1251]  eta: 0:00:17  lr: 0.002358  min_lr: 0.002358  loss: 3.3434 (3.2085)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8382 (0.8525)  time: 0.3488  data: 0.0004  max mem: 28503
Epoch: [143]  [1250/1251]  eta: 0:00:00  lr: 0.002358  min_lr: 0.002358  loss: 3.3193 (3.2059)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8277 (0.8542)  time: 0.2998  data: 0.0006  max mem: 28503
Epoch: [143] Total time: 0:07:18 (0.3506 s / it)
Averaged stats: lr: 0.002358  min_lr: 0.002358  loss: 3.3193 (3.2123)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8277 (0.8542)
Test:  [ 0/25]  eta: 0:02:28  loss: 0.6153 (0.6153)  acc1: 88.4000 (88.4000)  acc5: 98.4000 (98.4000)  time: 5.9479  data: 5.7474  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8444 (0.8273)  acc1: 83.6000 (82.7273)  acc5: 97.2000 (96.8364)  time: 0.7329  data: 0.5612  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9751 (0.9663)  acc1: 77.6000 (79.5238)  acc5: 94.0000 (95.1238)  time: 0.1899  data: 0.0213  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0562 (0.9773)  acc1: 77.6000 (79.2480)  acc5: 94.0000 (95.0720)  time: 0.1897  data: 0.0213  max mem: 28503
Test: Total time: 0:00:10 (0.4204 s / it)
* Acc@1 79.776 Acc@5 95.264 loss 0.958
Accuracy of the model on the 50000 test images: 79.8%
Max accuracy: 79.78%
Epoch: [144]  [   0/1251]  eta: 1:01:19  lr: 0.002358  min_lr: 0.002358  loss: 3.6165 (3.6165)  weight_decay: 0.0500 (0.0500)  time: 2.9414  data: 2.5546  max mem: 28503
Epoch: [144]  [ 200/1251]  eta: 0:06:18  lr: 0.002354  min_lr: 0.002354  loss: 3.2650 (3.1629)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7573 (0.8181)  time: 0.3447  data: 0.0005  max mem: 28503
Epoch: [144]  [ 400/1251]  eta: 0:05:00  lr: 0.002350  min_lr: 0.002350  loss: 3.3134 (3.1926)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7787 (nan)  time: 0.3473  data: 0.0005  max mem: 28503
Epoch: [144]  [ 600/1251]  eta: 0:03:48  lr: 0.002347  min_lr: 0.002347  loss: 3.1444 (3.1798)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7725 (nan)  time: 0.3552  data: 0.0004  max mem: 28503
Epoch: [144]  [ 800/1251]  eta: 0:02:37  lr: 0.002343  min_lr: 0.002343  loss: 3.1787 (3.1885)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8128 (nan)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [144]  [1000/1251]  eta: 0:01:27  lr: 0.002340  min_lr: 0.002340  loss: 3.3940 (3.1991)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7683 (nan)  time: 0.3445  data: 0.0004  max mem: 28503
Epoch: [144]  [1200/1251]  eta: 0:00:17  lr: 0.002336  min_lr: 0.002336  loss: 3.2516 (3.1935)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9118 (nan)  time: 0.3454  data: 0.0005  max mem: 28503
Epoch: [144]  [1250/1251]  eta: 0:00:00  lr: 0.002335  min_lr: 0.002335  loss: 3.2632 (3.1943)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8042 (nan)  time: 0.2916  data: 0.0007  max mem: 28503
Epoch: [144] Total time: 0:07:16 (0.3485 s / it)
Averaged stats: lr: 0.002335  min_lr: 0.002335  loss: 3.2632 (3.2052)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8042 (nan)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.5916 (0.5916)  acc1: 87.2000 (87.2000)  acc5: 99.2000 (99.2000)  time: 5.7198  data: 5.5155  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8101 (0.8106)  acc1: 83.2000 (82.8364)  acc5: 97.2000 (97.0182)  time: 0.6981  data: 0.5249  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9833 (0.9568)  acc1: 76.0000 (79.3333)  acc5: 94.8000 (95.2000)  time: 0.1823  data: 0.0130  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0440 (0.9645)  acc1: 76.0000 (78.9440)  acc5: 94.8000 (95.1360)  time: 0.1814  data: 0.0129  max mem: 28503
Test: Total time: 0:00:10 (0.4130 s / it)
* Acc@1 79.806 Acc@5 95.214 loss 0.953
Accuracy of the model on the 50000 test images: 79.8%
Max accuracy: 79.81%
Epoch: [145]  [   0/1251]  eta: 1:04:11  lr: 0.002335  min_lr: 0.002335  loss: 3.1310 (3.1310)  weight_decay: 0.0500 (0.0500)  time: 3.0791  data: 2.7014  max mem: 28503
Epoch: [145]  [ 200/1251]  eta: 0:06:21  lr: 0.002332  min_lr: 0.002332  loss: 3.2156 (3.1615)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8642 (0.8826)  time: 0.3553  data: 0.0004  max mem: 28503
Epoch: [145]  [ 400/1251]  eta: 0:05:02  lr: 0.002328  min_lr: 0.002328  loss: 3.1253 (3.1812)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6969 (0.8266)  time: 0.3587  data: 0.0004  max mem: 28503
Epoch: [145]  [ 600/1251]  eta: 0:03:49  lr: 0.002325  min_lr: 0.002325  loss: 3.4747 (3.1992)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8173 (0.8594)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [145]  [ 800/1251]  eta: 0:02:38  lr: 0.002321  min_lr: 0.002321  loss: 3.2149 (3.1982)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8686 (0.8663)  time: 0.3454  data: 0.0005  max mem: 28503
Epoch: [145]  [1000/1251]  eta: 0:01:27  lr: 0.002318  min_lr: 0.002318  loss: 3.2038 (3.2010)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7701 (0.8625)  time: 0.3471  data: 0.0003  max mem: 28503
Epoch: [145]  [1200/1251]  eta: 0:00:17  lr: 0.002314  min_lr: 0.002314  loss: 3.3210 (3.2009)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8415 (0.8752)  time: 0.3448  data: 0.0003  max mem: 28503
Epoch: [145]  [1250/1251]  eta: 0:00:00  lr: 0.002313  min_lr: 0.002313  loss: 3.3804 (3.2042)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8847 (0.8824)  time: 0.2966  data: 0.0006  max mem: 28503
Epoch: [145] Total time: 0:07:17 (0.3498 s / it)
Averaged stats: lr: 0.002313  min_lr: 0.002313  loss: 3.3804 (3.1955)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8847 (0.8824)
Test:  [ 0/25]  eta: 0:01:40  loss: 0.7168 (0.7168)  acc1: 87.2000 (87.2000)  acc5: 98.8000 (98.8000)  time: 4.0075  data: 3.8153  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 0.9118 (0.9158)  acc1: 84.4000 (82.9091)  acc5: 97.2000 (96.6909)  time: 0.6585  data: 0.4868  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1022 (1.0647)  acc1: 77.6000 (79.2191)  acc5: 94.0000 (94.8191)  time: 0.2571  data: 0.0883  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1548 (1.0728)  acc1: 77.2000 (78.8640)  acc5: 94.0000 (94.8320)  time: 0.1957  data: 0.0276  max mem: 28503
Test: Total time: 0:00:10 (0.4004 s / it)
* Acc@1 79.596 Acc@5 95.180 loss 1.053
Accuracy of the model on the 50000 test images: 79.6%
Max accuracy: 79.81%
Epoch: [146]  [   0/1251]  eta: 1:10:10  lr: 0.002313  min_lr: 0.002313  loss: 2.4684 (2.4684)  weight_decay: 0.0500 (0.0500)  time: 3.3657  data: 1.6345  max mem: 28503
Epoch: [146]  [ 200/1251]  eta: 0:06:21  lr: 0.002310  min_lr: 0.002310  loss: 3.0759 (3.1516)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7235 (0.8074)  time: 0.3469  data: 0.0003  max mem: 28503
Epoch: [146]  [ 400/1251]  eta: 0:05:02  lr: 0.002306  min_lr: 0.002306  loss: 3.4275 (3.1754)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6717 (0.8158)  time: 0.3466  data: 0.0003  max mem: 28503
Epoch: [146]  [ 600/1251]  eta: 0:03:49  lr: 0.002303  min_lr: 0.002303  loss: 3.1586 (3.1651)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9102 (0.8519)  time: 0.3490  data: 0.0004  max mem: 28503
Epoch: [146]  [ 800/1251]  eta: 0:02:38  lr: 0.002299  min_lr: 0.002299  loss: 3.3129 (3.1700)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7091 (0.8626)  time: 0.3523  data: 0.0004  max mem: 28503
Epoch: [146]  [1000/1251]  eta: 0:01:28  lr: 0.002296  min_lr: 0.002296  loss: 2.9246 (3.1759)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0152 (0.8884)  time: 0.3460  data: 0.0004  max mem: 28503
Epoch: [146]  [1200/1251]  eta: 0:00:17  lr: 0.002292  min_lr: 0.002292  loss: 3.3731 (3.1793)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8005 (0.8803)  time: 0.3464  data: 0.0004  max mem: 28503
Epoch: [146]  [1250/1251]  eta: 0:00:00  lr: 0.002291  min_lr: 0.002291  loss: 3.4107 (3.1794)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8012 (0.8824)  time: 0.2917  data: 0.0007  max mem: 28503
Epoch: [146] Total time: 0:07:18 (0.3502 s / it)
Averaged stats: lr: 0.002291  min_lr: 0.002291  loss: 3.4107 (3.2001)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8012 (0.8824)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6041 (0.6041)  acc1: 87.6000 (87.6000)  acc5: 99.2000 (99.2000)  time: 5.5304  data: 5.2985  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8481 (0.8079)  acc1: 83.6000 (82.0364)  acc5: 97.6000 (97.3091)  time: 0.7129  data: 0.5361  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9953 (0.9674)  acc1: 76.0000 (78.7429)  acc5: 94.0000 (95.3143)  time: 0.2045  data: 0.0300  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0810 (0.9712)  acc1: 77.2000 (78.8960)  acc5: 93.6000 (95.2000)  time: 0.2013  data: 0.0276  max mem: 28503
Test: Total time: 0:00:10 (0.4156 s / it)
* Acc@1 79.642 Acc@5 95.298 loss 0.957
Accuracy of the model on the 50000 test images: 79.6%
Max accuracy: 79.81%
Epoch: [147]  [   0/1251]  eta: 1:08:29  lr: 0.002291  min_lr: 0.002291  loss: 3.4724 (3.4724)  weight_decay: 0.0500 (0.0500)  time: 3.2850  data: 1.8459  max mem: 28503
Epoch: [147]  [ 200/1251]  eta: 0:06:21  lr: 0.002288  min_lr: 0.002288  loss: 3.3177 (3.1862)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7951 (0.8511)  time: 0.3536  data: 0.0004  max mem: 28503
Epoch: [147]  [ 400/1251]  eta: 0:05:04  lr: 0.002284  min_lr: 0.002284  loss: 3.1106 (3.1854)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9073 (0.8684)  time: 0.3485  data: 0.0004  max mem: 28503
Epoch: [147]  [ 600/1251]  eta: 0:03:50  lr: 0.002280  min_lr: 0.002280  loss: 3.3243 (3.1793)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8381 (0.8827)  time: 0.3448  data: 0.0004  max mem: 28503
Epoch: [147]  [ 800/1251]  eta: 0:02:38  lr: 0.002277  min_lr: 0.002277  loss: 3.1976 (3.1760)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6967 (0.8631)  time: 0.3460  data: 0.0004  max mem: 28503
Epoch: [147]  [1000/1251]  eta: 0:01:28  lr: 0.002273  min_lr: 0.002273  loss: 3.4259 (3.1831)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8215 (0.8868)  time: 0.3516  data: 0.0004  max mem: 28503
Epoch: [147]  [1200/1251]  eta: 0:00:17  lr: 0.002270  min_lr: 0.002270  loss: 3.4830 (3.1886)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7125 (0.8756)  time: 0.3559  data: 0.0004  max mem: 28503
Epoch: [147]  [1250/1251]  eta: 0:00:00  lr: 0.002269  min_lr: 0.002269  loss: 3.2293 (3.1910)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7288 (0.8741)  time: 0.2914  data: 0.0007  max mem: 28503
Epoch: [147] Total time: 0:07:18 (0.3508 s / it)
Averaged stats: lr: 0.002269  min_lr: 0.002269  loss: 3.2293 (3.2041)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7288 (0.8741)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6856 (0.6856)  acc1: 86.8000 (86.8000)  acc5: 98.8000 (98.8000)  time: 5.5886  data: 5.3927  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8808 (0.8868)  acc1: 84.0000 (82.1818)  acc5: 98.0000 (97.2364)  time: 0.7302  data: 0.5584  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0993 (1.0337)  acc1: 76.4000 (79.0286)  acc5: 95.2000 (95.4857)  time: 0.2064  data: 0.0375  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1079 (1.0428)  acc1: 77.2000 (78.9600)  acc5: 94.8000 (95.4400)  time: 0.2059  data: 0.0375  max mem: 28503
Test: Total time: 0:00:10 (0.4191 s / it)
* Acc@1 79.698 Acc@5 95.294 loss 1.026
Accuracy of the model on the 50000 test images: 79.7%
Max accuracy: 79.81%
Epoch: [148]  [   0/1251]  eta: 1:13:23  lr: 0.002269  min_lr: 0.002269  loss: 3.7636 (3.7636)  weight_decay: 0.0500 (0.0500)  time: 3.5202  data: 1.6967  max mem: 28503
Epoch: [148]  [ 200/1251]  eta: 0:06:21  lr: 0.002265  min_lr: 0.002265  loss: 3.2447 (3.1776)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7061 (0.8098)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [148]  [ 400/1251]  eta: 0:05:01  lr: 0.002262  min_lr: 0.002262  loss: 3.3186 (3.1709)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8882 (0.8223)  time: 0.3444  data: 0.0004  max mem: 28503
Epoch: [148]  [ 600/1251]  eta: 0:03:49  lr: 0.002258  min_lr: 0.002258  loss: 3.0312 (3.1833)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8296 (0.8802)  time: 0.3468  data: 0.0004  max mem: 28503
Epoch: [148]  [ 800/1251]  eta: 0:02:38  lr: 0.002255  min_lr: 0.002255  loss: 3.2665 (3.1810)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7097 (0.8487)  time: 0.3464  data: 0.0004  max mem: 28503
Epoch: [148]  [1000/1251]  eta: 0:01:28  lr: 0.002251  min_lr: 0.002251  loss: 3.3473 (3.1859)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7415 (0.8380)  time: 0.3461  data: 0.0004  max mem: 28503
Epoch: [148]  [1200/1251]  eta: 0:00:17  lr: 0.002248  min_lr: 0.002248  loss: 3.3374 (3.1889)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8563 (0.8539)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [148]  [1250/1251]  eta: 0:00:00  lr: 0.002247  min_lr: 0.002247  loss: 3.3022 (3.1880)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6786 (0.8471)  time: 0.2917  data: 0.0006  max mem: 28503
Epoch: [148] Total time: 0:07:17 (0.3495 s / it)
Averaged stats: lr: 0.002247  min_lr: 0.002247  loss: 3.3022 (3.1982)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6786 (0.8471)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.6811 (0.6811)  acc1: 87.2000 (87.2000)  acc5: 98.8000 (98.8000)  time: 5.8368  data: 5.6425  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8868 (0.8494)  acc1: 82.8000 (82.9091)  acc5: 97.2000 (97.0909)  time: 0.6844  data: 0.5133  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0507 (0.9866)  acc1: 77.6000 (79.4095)  acc5: 95.2000 (95.3905)  time: 0.1689  data: 0.0002  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0523 (0.9958)  acc1: 76.8000 (79.0400)  acc5: 94.8000 (95.3120)  time: 0.1688  data: 0.0001  max mem: 28503
Test: Total time: 0:00:09 (0.3990 s / it)
* Acc@1 79.882 Acc@5 95.466 loss 0.981
Accuracy of the model on the 50000 test images: 79.9%
Max accuracy: 79.88%
Epoch: [149]  [   0/1251]  eta: 1:02:54  lr: 0.002247  min_lr: 0.002247  loss: 3.3626 (3.3626)  weight_decay: 0.0500 (0.0500)  time: 3.0173  data: 2.6669  max mem: 28503
Epoch: [149]  [ 200/1251]  eta: 0:06:18  lr: 0.002243  min_lr: 0.002243  loss: 3.3455 (3.2185)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9962 (0.9515)  time: 0.3525  data: 0.0004  max mem: 28503
Epoch: [149]  [ 400/1251]  eta: 0:05:02  lr: 0.002240  min_lr: 0.002240  loss: 3.5001 (3.2109)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8897 (0.9192)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [149]  [ 600/1251]  eta: 0:03:49  lr: 0.002236  min_lr: 0.002236  loss: 3.4478 (3.2107)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7870 (0.8927)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [149]  [ 800/1251]  eta: 0:02:38  lr: 0.002232  min_lr: 0.002232  loss: 3.4014 (3.2114)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8180 (0.8785)  time: 0.3457  data: 0.0004  max mem: 28503
Epoch: [149]  [1000/1251]  eta: 0:01:28  lr: 0.002229  min_lr: 0.002229  loss: 3.3983 (3.2136)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7864 (0.8739)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [149]  [1200/1251]  eta: 0:00:17  lr: 0.002225  min_lr: 0.002225  loss: 3.2416 (3.2029)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.3448  data: 0.0004  max mem: 28503
Epoch: [149]  [1250/1251]  eta: 0:00:00  lr: 0.002224  min_lr: 0.002224  loss: 3.1820 (3.1975)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9547 (nan)  time: 0.2917  data: 0.0006  max mem: 28503
Epoch: [149] Total time: 0:07:17 (0.3499 s / it)
Averaged stats: lr: 0.002224  min_lr: 0.002224  loss: 3.1820 (3.1977)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9547 (nan)
Test:  [ 0/25]  eta: 0:02:03  loss: 0.6382 (0.6382)  acc1: 87.6000 (87.6000)  acc5: 99.2000 (99.2000)  time: 4.9529  data: 4.7542  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8352 (0.8146)  acc1: 84.0000 (83.1636)  acc5: 96.8000 (96.9818)  time: 0.6813  data: 0.5081  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9906 (0.9586)  acc1: 79.2000 (79.6191)  acc5: 94.4000 (95.3143)  time: 0.2116  data: 0.0418  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0425 (0.9644)  acc1: 76.8000 (79.2640)  acc5: 94.4000 (95.2480)  time: 0.1931  data: 0.0243  max mem: 28503
Test: Total time: 0:00:09 (0.3975 s / it)
* Acc@1 79.860 Acc@5 95.380 loss 0.950
Accuracy of the model on the 50000 test images: 79.9%
Max accuracy: 79.88%
Epoch: [150]  [   0/1251]  eta: 1:04:56  lr: 0.002224  min_lr: 0.002224  loss: 3.7353 (3.7353)  weight_decay: 0.0500 (0.0500)  time: 3.1146  data: 2.4010  max mem: 28503
Epoch: [150]  [ 200/1251]  eta: 0:06:21  lr: 0.002221  min_lr: 0.002221  loss: 3.2440 (3.1853)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7294 (0.9702)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [150]  [ 400/1251]  eta: 0:05:03  lr: 0.002217  min_lr: 0.002217  loss: 3.1297 (3.1715)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7232 (0.9283)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [150]  [ 600/1251]  eta: 0:03:49  lr: 0.002214  min_lr: 0.002214  loss: 3.2423 (3.1688)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8426 (0.9051)  time: 0.3543  data: 0.0004  max mem: 28503
Epoch: [150]  [ 800/1251]  eta: 0:02:38  lr: 0.002210  min_lr: 0.002210  loss: 3.3491 (3.1753)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7223 (0.8938)  time: 0.3568  data: 0.0004  max mem: 28503
Epoch: [150]  [1000/1251]  eta: 0:01:28  lr: 0.002207  min_lr: 0.002207  loss: 3.3793 (3.1545)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8649 (0.8843)  time: 0.3448  data: 0.0004  max mem: 28503
Epoch: [150]  [1200/1251]  eta: 0:00:17  lr: 0.002203  min_lr: 0.002203  loss: 3.3866 (3.1629)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7266 (0.8770)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [150]  [1250/1251]  eta: 0:00:00  lr: 0.002202  min_lr: 0.002202  loss: 3.3301 (3.1626)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8031 (0.8792)  time: 0.3008  data: 0.0005  max mem: 28503
Epoch: [150] Total time: 0:07:18 (0.3505 s / it)
Averaged stats: lr: 0.002202  min_lr: 0.002202  loss: 3.3301 (3.1903)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8031 (0.8792)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6676 (0.6676)  acc1: 88.0000 (88.0000)  acc5: 98.8000 (98.8000)  time: 5.4827  data: 5.2751  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 0.8061 (0.8356)  acc1: 82.4000 (83.9636)  acc5: 97.6000 (97.3818)  time: 0.6601  data: 0.4867  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9937 (0.9832)  acc1: 78.4000 (79.9619)  acc5: 94.8000 (95.5238)  time: 0.1860  data: 0.0147  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0565 (0.9853)  acc1: 76.4000 (79.6000)  acc5: 94.8000 (95.5520)  time: 0.1855  data: 0.0146  max mem: 28503
Test: Total time: 0:00:10 (0.4011 s / it)
* Acc@1 79.930 Acc@5 95.384 loss 0.989
Accuracy of the model on the 50000 test images: 79.9%
Max accuracy: 79.93%
Epoch: [151]  [   0/1251]  eta: 1:06:44  lr: 0.002202  min_lr: 0.002202  loss: 3.3274 (3.3274)  weight_decay: 0.0500 (0.0500)  time: 3.2011  data: 2.8343  max mem: 28503
Epoch: [151]  [ 200/1251]  eta: 0:06:21  lr: 0.002198  min_lr: 0.002198  loss: 3.1989 (3.1615)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9409 (0.9779)  time: 0.3452  data: 0.0005  max mem: 28503
Epoch: [151]  [ 400/1251]  eta: 0:05:02  lr: 0.002195  min_lr: 0.002195  loss: 3.1929 (3.1713)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9512 (0.9709)  time: 0.3462  data: 0.0004  max mem: 28503
Epoch: [151]  [ 600/1251]  eta: 0:03:50  lr: 0.002191  min_lr: 0.002191  loss: 3.4050 (3.1836)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8228 (0.9569)  time: 0.3451  data: 0.0005  max mem: 28503
Epoch: [151]  [ 800/1251]  eta: 0:02:38  lr: 0.002188  min_lr: 0.002188  loss: 3.0821 (3.1818)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7459 (0.9181)  time: 0.3444  data: 0.0004  max mem: 28503
Epoch: [151]  [1000/1251]  eta: 0:01:28  lr: 0.002184  min_lr: 0.002184  loss: 3.3846 (3.1915)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7548 (0.9061)  time: 0.3521  data: 0.0005  max mem: 28503
Epoch: [151]  [1200/1251]  eta: 0:00:17  lr: 0.002181  min_lr: 0.002181  loss: 3.3174 (3.1902)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9203 (0.9045)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [151]  [1250/1251]  eta: 0:00:00  lr: 0.002180  min_lr: 0.002180  loss: 3.3160 (3.1921)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0730 (0.9141)  time: 0.2917  data: 0.0007  max mem: 28503
Epoch: [151] Total time: 0:07:17 (0.3495 s / it)
Averaged stats: lr: 0.002180  min_lr: 0.002180  loss: 3.3160 (3.1856)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0730 (0.9141)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6029 (0.6029)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 5.5587  data: 5.3590  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 0.8145 (0.8246)  acc1: 83.2000 (83.3818)  acc5: 97.2000 (96.9818)  time: 0.6590  data: 0.4875  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9866 (0.9799)  acc1: 76.4000 (79.4857)  acc5: 95.2000 (95.1429)  time: 0.1730  data: 0.0044  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0429 (0.9790)  acc1: 76.4000 (79.2960)  acc5: 94.4000 (95.1680)  time: 0.1802  data: 0.0118  max mem: 28503
Test: Total time: 0:00:10 (0.4029 s / it)
* Acc@1 79.962 Acc@5 95.350 loss 0.965
Accuracy of the model on the 50000 test images: 80.0%
Max accuracy: 79.96%
Epoch: [152]  [   0/1251]  eta: 1:04:23  lr: 0.002180  min_lr: 0.002180  loss: 3.6335 (3.6335)  weight_decay: 0.0500 (0.0500)  time: 3.0883  data: 2.7230  max mem: 28503
Epoch: [152]  [ 200/1251]  eta: 0:06:21  lr: 0.002176  min_lr: 0.002176  loss: 3.3202 (3.2010)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7703 (0.9159)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [152]  [ 400/1251]  eta: 0:05:01  lr: 0.002173  min_lr: 0.002173  loss: 3.2998 (3.2233)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9825 (0.9280)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [152]  [ 600/1251]  eta: 0:03:49  lr: 0.002169  min_lr: 0.002169  loss: 3.4612 (3.2201)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7670 (0.9110)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [152]  [ 800/1251]  eta: 0:02:38  lr: 0.002165  min_lr: 0.002165  loss: 3.2329 (3.2212)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7493 (0.8953)  time: 0.3557  data: 0.0004  max mem: 28503
Epoch: [152]  [1000/1251]  eta: 0:01:28  lr: 0.002162  min_lr: 0.002162  loss: 3.3730 (3.2082)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7457 (0.8824)  time: 0.3448  data: 0.0004  max mem: 28503
Epoch: [152]  [1200/1251]  eta: 0:00:17  lr: 0.002158  min_lr: 0.002158  loss: 3.0460 (3.2001)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8841 (0.8976)  time: 0.3457  data: 0.0004  max mem: 28503
Epoch: [152]  [1250/1251]  eta: 0:00:00  lr: 0.002157  min_lr: 0.002157  loss: 3.0477 (3.1997)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0319 (0.9107)  time: 0.2917  data: 0.0006  max mem: 28503
Epoch: [152] Total time: 0:07:17 (0.3499 s / it)
Averaged stats: lr: 0.002157  min_lr: 0.002157  loss: 3.0477 (3.1825)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0319 (0.9107)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6171 (0.6171)  acc1: 87.6000 (87.6000)  acc5: 98.0000 (98.0000)  time: 5.6930  data: 5.4848  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8094 (0.8219)  acc1: 82.8000 (82.6909)  acc5: 96.8000 (97.0546)  time: 0.6724  data: 0.4990  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0212 (0.9583)  acc1: 76.8000 (79.5238)  acc5: 95.6000 (95.4476)  time: 0.1881  data: 0.0146  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0408 (0.9637)  acc1: 76.8000 (79.3440)  acc5: 94.0000 (95.3760)  time: 0.1974  data: 0.0245  max mem: 28503
Test: Total time: 0:00:10 (0.4167 s / it)
* Acc@1 80.042 Acc@5 95.464 loss 0.945
Accuracy of the model on the 50000 test images: 80.0%
Max accuracy: 80.04%
Epoch: [153]  [   0/1251]  eta: 1:11:08  lr: 0.002157  min_lr: 0.002157  loss: 2.9485 (2.9485)  weight_decay: 0.0500 (0.0500)  time: 3.4123  data: 3.0476  max mem: 28503
Epoch: [153]  [ 200/1251]  eta: 0:06:23  lr: 0.002154  min_lr: 0.002154  loss: 3.2525 (3.2302)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7912 (0.8393)  time: 0.3534  data: 0.0004  max mem: 28503
Epoch: [153]  [ 400/1251]  eta: 0:05:02  lr: 0.002150  min_lr: 0.002150  loss: 3.3402 (3.1986)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7735 (0.8582)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [153]  [ 600/1251]  eta: 0:03:50  lr: 0.002147  min_lr: 0.002147  loss: 2.9557 (3.1747)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8846 (0.8974)  time: 0.3457  data: 0.0004  max mem: 28503
Epoch: [153]  [ 800/1251]  eta: 0:02:38  lr: 0.002143  min_lr: 0.002143  loss: 3.3703 (3.1784)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7702 (0.8856)  time: 0.3464  data: 0.0004  max mem: 28503
Epoch: [153]  [1000/1251]  eta: 0:01:28  lr: 0.002139  min_lr: 0.002139  loss: 3.1148 (3.1769)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8071 (0.9029)  time: 0.3566  data: 0.0004  max mem: 28503
Epoch: [153]  [1200/1251]  eta: 0:00:17  lr: 0.002136  min_lr: 0.002136  loss: 3.4384 (3.1812)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7981 (0.9011)  time: 0.3494  data: 0.0005  max mem: 28503
Epoch: [153]  [1250/1251]  eta: 0:00:00  lr: 0.002135  min_lr: 0.002135  loss: 3.1304 (3.1822)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8600 (0.8975)  time: 0.2918  data: 0.0007  max mem: 28503
Epoch: [153] Total time: 0:07:18 (0.3506 s / it)
Averaged stats: lr: 0.002135  min_lr: 0.002135  loss: 3.1304 (3.1808)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8600 (0.8975)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.5471 (0.5471)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 5.5624  data: 5.3620  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.7785 (0.7799)  acc1: 84.0000 (83.5273)  acc5: 97.6000 (97.2364)  time: 0.6924  data: 0.5207  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9770 (0.9196)  acc1: 78.0000 (80.0571)  acc5: 95.2000 (95.7333)  time: 0.1882  data: 0.0183  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0157 (0.9276)  acc1: 78.0000 (79.9200)  acc5: 94.4000 (95.6320)  time: 0.1889  data: 0.0182  max mem: 28503
Test: Total time: 0:00:10 (0.4065 s / it)
* Acc@1 80.226 Acc@5 95.466 loss 0.920
Accuracy of the model on the 50000 test images: 80.2%
Max accuracy: 80.23%
Epoch: [154]  [   0/1251]  eta: 1:04:49  lr: 0.002135  min_lr: 0.002135  loss: 2.8997 (2.8997)  weight_decay: 0.0500 (0.0500)  time: 3.1089  data: 2.6986  max mem: 28503
Epoch: [154]  [ 200/1251]  eta: 0:06:19  lr: 0.002131  min_lr: 0.002131  loss: 3.4421 (3.1531)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7171 (0.8155)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [154]  [ 400/1251]  eta: 0:05:01  lr: 0.002128  min_lr: 0.002128  loss: 3.0349 (3.1269)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9782 (0.8793)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [154]  [ 600/1251]  eta: 0:03:49  lr: 0.002124  min_lr: 0.002124  loss: 3.4282 (3.1415)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7800 (0.8693)  time: 0.3550  data: 0.0004  max mem: 28503
Epoch: [154]  [ 800/1251]  eta: 0:02:38  lr: 0.002121  min_lr: 0.002121  loss: 3.3360 (3.1439)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8787 (0.8756)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [154]  [1000/1251]  eta: 0:01:28  lr: 0.002117  min_lr: 0.002117  loss: 3.3604 (3.1481)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7649 (0.8686)  time: 0.3475  data: 0.0004  max mem: 28503
Epoch: [154]  [1200/1251]  eta: 0:00:17  lr: 0.002113  min_lr: 0.002113  loss: 3.2821 (3.1590)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1353 (0.9078)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [154]  [1250/1251]  eta: 0:00:00  lr: 0.002113  min_lr: 0.002113  loss: 3.0158 (3.1575)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7506 (0.9011)  time: 0.2919  data: 0.0006  max mem: 28503
Epoch: [154] Total time: 0:07:17 (0.3498 s / it)
Averaged stats: lr: 0.002113  min_lr: 0.002113  loss: 3.0158 (3.1671)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7506 (0.9011)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.5748 (0.5748)  acc1: 89.6000 (89.6000)  acc5: 99.2000 (99.2000)  time: 5.4685  data: 5.2767  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8225 (0.7951)  acc1: 83.6000 (83.3455)  acc5: 97.6000 (97.4546)  time: 0.6711  data: 0.4996  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9773 (0.9432)  acc1: 77.6000 (80.1905)  acc5: 95.6000 (95.5619)  time: 0.1882  data: 0.0155  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0575 (0.9532)  acc1: 77.6000 (79.9040)  acc5: 93.6000 (95.4880)  time: 0.1883  data: 0.0154  max mem: 28503
Test: Total time: 0:00:09 (0.4000 s / it)
* Acc@1 80.088 Acc@5 95.536 loss 0.946
Accuracy of the model on the 50000 test images: 80.1%
Max accuracy: 80.23%
Epoch: [155]  [   0/1251]  eta: 1:10:29  lr: 0.002113  min_lr: 0.002113  loss: 2.1903 (2.1903)  weight_decay: 0.0500 (0.0500)  time: 3.3810  data: 2.3088  max mem: 28503
Epoch: [155]  [ 200/1251]  eta: 0:06:24  lr: 0.002109  min_lr: 0.002109  loss: 3.1013 (3.0830)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7573 (0.8904)  time: 0.3473  data: 0.0004  max mem: 28503
Epoch: [155]  [ 400/1251]  eta: 0:05:03  lr: 0.002105  min_lr: 0.002105  loss: 3.3009 (3.1290)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9924 (0.9108)  time: 0.3446  data: 0.0004  max mem: 28503
Epoch: [155]  [ 600/1251]  eta: 0:03:50  lr: 0.002102  min_lr: 0.002102  loss: 3.2807 (3.1414)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8857 (0.9102)  time: 0.3481  data: 0.0004  max mem: 28503
Epoch: [155]  [ 800/1251]  eta: 0:02:38  lr: 0.002098  min_lr: 0.002098  loss: 3.3808 (3.1510)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8135 (0.9287)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [155]  [1000/1251]  eta: 0:01:28  lr: 0.002095  min_lr: 0.002095  loss: 3.2152 (3.1482)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6856 (0.8987)  time: 0.3581  data: 0.0004  max mem: 28503
Epoch: [155]  [1200/1251]  eta: 0:00:17  lr: 0.002091  min_lr: 0.002091  loss: 3.5224 (3.1600)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7693 (0.8938)  time: 0.3477  data: 0.0004  max mem: 28503
Epoch: [155]  [1250/1251]  eta: 0:00:00  lr: 0.002090  min_lr: 0.002090  loss: 3.1206 (3.1570)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7311 (0.8890)  time: 0.2921  data: 0.0006  max mem: 28503
Epoch: [155] Total time: 0:07:17 (0.3499 s / it)
Averaged stats: lr: 0.002090  min_lr: 0.002090  loss: 3.1206 (3.1734)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7311 (0.8890)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6386 (0.6386)  acc1: 89.6000 (89.6000)  acc5: 98.0000 (98.0000)  time: 5.5514  data: 5.3556  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8862 (0.8383)  acc1: 82.8000 (83.0182)  acc5: 97.2000 (97.1273)  time: 0.7093  data: 0.5379  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0016 (0.9847)  acc1: 77.6000 (79.9429)  acc5: 95.2000 (95.3524)  time: 0.1968  data: 0.0281  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1053 (0.9961)  acc1: 77.6000 (79.6160)  acc5: 94.8000 (95.2800)  time: 0.1965  data: 0.0280  max mem: 28503
Test: Total time: 0:00:10 (0.4096 s / it)
* Acc@1 80.194 Acc@5 95.412 loss 0.979
Accuracy of the model on the 50000 test images: 80.2%
Max accuracy: 80.23%
Epoch: [156]  [   0/1251]  eta: 1:11:47  lr: 0.002090  min_lr: 0.002090  loss: 3.1449 (3.1449)  weight_decay: 0.0500 (0.0500)  time: 3.4433  data: 2.1200  max mem: 28503
Epoch: [156]  [ 200/1251]  eta: 0:06:22  lr: 0.002087  min_lr: 0.002087  loss: 3.2330 (3.1928)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8407 (0.9012)  time: 0.3463  data: 0.0004  max mem: 28503
Epoch: [156]  [ 400/1251]  eta: 0:05:03  lr: 0.002083  min_lr: 0.002083  loss: 3.3059 (3.1795)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8645 (0.9456)  time: 0.3561  data: 0.0004  max mem: 28503
Epoch: [156]  [ 600/1251]  eta: 0:03:49  lr: 0.002079  min_lr: 0.002079  loss: 3.3112 (3.1740)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8421 (0.9230)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [156]  [ 800/1251]  eta: 0:02:38  lr: 0.002076  min_lr: 0.002076  loss: 3.5007 (3.1638)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8204 (0.9180)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [156]  [1000/1251]  eta: 0:01:27  lr: 0.002072  min_lr: 0.002072  loss: 3.2269 (3.1620)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7540 (0.8943)  time: 0.3512  data: 0.0005  max mem: 28503
Epoch: [156]  [1200/1251]  eta: 0:00:17  lr: 0.002069  min_lr: 0.002069  loss: 3.2644 (3.1607)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9266 (0.8896)  time: 0.3447  data: 0.0004  max mem: 28503
Epoch: [156]  [1250/1251]  eta: 0:00:00  lr: 0.002068  min_lr: 0.002068  loss: 2.9326 (3.1598)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8472 (0.8928)  time: 0.2924  data: 0.0007  max mem: 28503
Epoch: [156] Total time: 0:07:17 (0.3495 s / it)
Averaged stats: lr: 0.002068  min_lr: 0.002068  loss: 2.9326 (3.1596)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8472 (0.8928)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.6417 (0.6417)  acc1: 87.6000 (87.6000)  acc5: 98.8000 (98.8000)  time: 5.6699  data: 5.4683  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8397 (0.8317)  acc1: 83.6000 (83.1636)  acc5: 97.6000 (97.3455)  time: 0.6704  data: 0.4975  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9748 (0.9922)  acc1: 77.2000 (79.8476)  acc5: 96.0000 (95.5238)  time: 0.1849  data: 0.0157  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0613 (1.0020)  acc1: 77.6000 (79.7600)  acc5: 94.0000 (95.3440)  time: 0.1841  data: 0.0156  max mem: 28503
Test: Total time: 0:00:10 (0.4052 s / it)
* Acc@1 80.096 Acc@5 95.534 loss 0.982
Accuracy of the model on the 50000 test images: 80.1%
Max accuracy: 80.23%
Epoch: [157]  [   0/1251]  eta: 1:09:24  lr: 0.002068  min_lr: 0.002068  loss: 3.6532 (3.6532)  weight_decay: 0.0500 (0.0500)  time: 3.3288  data: 2.4055  max mem: 28503
Epoch: [157]  [ 200/1251]  eta: 0:06:21  lr: 0.002064  min_lr: 0.002064  loss: 3.2538 (3.1926)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9607 (1.0848)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [157]  [ 400/1251]  eta: 0:05:02  lr: 0.002061  min_lr: 0.002061  loss: 3.2611 (3.1711)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8349 (0.9726)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [157]  [ 600/1251]  eta: 0:03:49  lr: 0.002057  min_lr: 0.002057  loss: 2.7947 (3.1778)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8407 (0.9370)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [157]  [ 800/1251]  eta: 0:02:38  lr: 0.002053  min_lr: 0.002053  loss: 3.2446 (3.1767)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9039 (0.9589)  time: 0.3561  data: 0.0004  max mem: 28503
Epoch: [157]  [1000/1251]  eta: 0:01:28  lr: 0.002050  min_lr: 0.002050  loss: 3.1592 (3.1690)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8925 (0.9561)  time: 0.3561  data: 0.0004  max mem: 28503
Epoch: [157]  [1200/1251]  eta: 0:00:17  lr: 0.002046  min_lr: 0.002046  loss: 3.4399 (3.1756)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7800 (0.9431)  time: 0.3529  data: 0.0004  max mem: 28503
Epoch: [157]  [1250/1251]  eta: 0:00:00  lr: 0.002045  min_lr: 0.002045  loss: 3.2541 (3.1763)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7640 (0.9386)  time: 0.2923  data: 0.0006  max mem: 28503
Epoch: [157] Total time: 0:07:17 (0.3501 s / it)
Averaged stats: lr: 0.002045  min_lr: 0.002045  loss: 3.2541 (3.1613)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7640 (0.9386)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.7612 (0.7612)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 5.7674  data: 5.5705  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9194 (0.8948)  acc1: 84.4000 (83.7818)  acc5: 97.2000 (97.2727)  time: 0.7122  data: 0.5407  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0803 (1.0447)  acc1: 78.8000 (80.1333)  acc5: 95.6000 (95.6000)  time: 0.1876  data: 0.0189  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0977 (1.0542)  acc1: 78.8000 (79.6480)  acc5: 95.2000 (95.4720)  time: 0.1873  data: 0.0188  max mem: 28503
Test: Total time: 0:00:10 (0.4109 s / it)
* Acc@1 79.900 Acc@5 95.460 loss 1.040
Accuracy of the model on the 50000 test images: 79.9%
Max accuracy: 80.23%
Epoch: [158]  [   0/1251]  eta: 1:09:10  lr: 0.002045  min_lr: 0.002045  loss: 3.8544 (3.8544)  weight_decay: 0.0500 (0.0500)  time: 3.3178  data: 2.6735  max mem: 28503
Epoch: [158]  [ 200/1251]  eta: 0:06:22  lr: 0.002042  min_lr: 0.002042  loss: 3.0457 (3.1307)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7532 (0.8845)  time: 0.3482  data: 0.0005  max mem: 28503
Epoch: [158]  [ 400/1251]  eta: 0:05:04  lr: 0.002038  min_lr: 0.002038  loss: 3.4741 (3.1341)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8295 (0.8983)  time: 0.3562  data: 0.0004  max mem: 28503
Epoch: [158]  [ 600/1251]  eta: 0:03:50  lr: 0.002035  min_lr: 0.002035  loss: 3.1958 (3.1432)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9169 (0.8926)  time: 0.3457  data: 0.0004  max mem: 28503
Epoch: [158]  [ 800/1251]  eta: 0:02:38  lr: 0.002031  min_lr: 0.002031  loss: 3.3704 (3.1380)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8831 (0.8970)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [158]  [1000/1251]  eta: 0:01:28  lr: 0.002027  min_lr: 0.002027  loss: 3.2970 (3.1449)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1287 (0.9155)  time: 0.3604  data: 0.0004  max mem: 28503
Epoch: [158]  [1200/1251]  eta: 0:00:17  lr: 0.002024  min_lr: 0.002024  loss: 3.2694 (3.1592)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8261 (0.9046)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [158]  [1250/1251]  eta: 0:00:00  lr: 0.002023  min_lr: 0.002023  loss: 2.9768 (3.1553)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8481 (0.9066)  time: 0.2916  data: 0.0007  max mem: 28503
Epoch: [158] Total time: 0:07:18 (0.3508 s / it)
Averaged stats: lr: 0.002023  min_lr: 0.002023  loss: 2.9768 (3.1632)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8481 (0.9066)
Test:  [ 0/25]  eta: 0:02:11  loss: 0.6309 (0.6309)  acc1: 88.8000 (88.8000)  acc5: 99.2000 (99.2000)  time: 5.2733  data: 5.0390  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 0.8123 (0.8247)  acc1: 84.4000 (83.3091)  acc5: 97.6000 (97.2000)  time: 0.7428  data: 0.5664  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0083 (0.9785)  acc1: 78.0000 (79.7714)  acc5: 94.8000 (95.3524)  time: 0.2291  data: 0.0596  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1088 (0.9860)  acc1: 77.6000 (79.3760)  acc5: 94.8000 (95.2320)  time: 0.2278  data: 0.0596  max mem: 28503
Test: Total time: 0:00:10 (0.4244 s / it)
* Acc@1 80.110 Acc@5 95.272 loss 0.974
Accuracy of the model on the 50000 test images: 80.1%
Max accuracy: 80.23%
Epoch: [159]  [   0/1251]  eta: 1:10:38  lr: 0.002023  min_lr: 0.002023  loss: 3.4423 (3.4423)  weight_decay: 0.0500 (0.0500)  time: 3.3883  data: 1.5575  max mem: 28503
Epoch: [159]  [ 200/1251]  eta: 0:06:22  lr: 0.002019  min_lr: 0.002019  loss: 3.0987 (3.1727)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7157 (0.8910)  time: 0.3472  data: 0.0004  max mem: 28503
Epoch: [159]  [ 400/1251]  eta: 0:05:02  lr: 0.002016  min_lr: 0.002016  loss: 3.0358 (3.1851)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6933 (0.8722)  time: 0.3469  data: 0.0004  max mem: 28503
Epoch: [159]  [ 600/1251]  eta: 0:03:49  lr: 0.002012  min_lr: 0.002012  loss: 3.1722 (3.1739)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0460 (0.9256)  time: 0.3481  data: 0.0004  max mem: 28503
Epoch: [159]  [ 800/1251]  eta: 0:02:38  lr: 0.002009  min_lr: 0.002009  loss: 3.4445 (3.1850)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7344 (0.9058)  time: 0.3587  data: 0.0004  max mem: 28503
Epoch: [159]  [1000/1251]  eta: 0:01:28  lr: 0.002005  min_lr: 0.002005  loss: 2.8337 (3.1579)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8289 (0.9253)  time: 0.3466  data: 0.0004  max mem: 28503
Epoch: [159]  [1200/1251]  eta: 0:00:17  lr: 0.002001  min_lr: 0.002001  loss: 3.2374 (3.1616)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8287 (0.9148)  time: 0.3572  data: 0.0004  max mem: 28503
Epoch: [159]  [1250/1251]  eta: 0:00:00  lr: 0.002001  min_lr: 0.002001  loss: 3.2907 (3.1639)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8287 (0.9124)  time: 0.2919  data: 0.0007  max mem: 28503
Epoch: [159] Total time: 0:07:17 (0.3498 s / it)
Averaged stats: lr: 0.002001  min_lr: 0.002001  loss: 3.2907 (3.1555)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8287 (0.9124)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6142 (0.6142)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 5.5012  data: 5.3008  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8243 (0.7984)  acc1: 84.0000 (84.0000)  acc5: 97.6000 (97.0909)  time: 0.6930  data: 0.5213  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9865 (0.9568)  acc1: 78.4000 (79.9810)  acc5: 94.4000 (95.2000)  time: 0.1951  data: 0.0264  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0740 (0.9661)  acc1: 76.8000 (79.6320)  acc5: 94.0000 (95.0720)  time: 0.2071  data: 0.0385  max mem: 28503
Test: Total time: 0:00:10 (0.4162 s / it)
* Acc@1 80.252 Acc@5 95.570 loss 0.944
Accuracy of the model on the 50000 test images: 80.3%
Max accuracy: 80.25%
Epoch: [160]  [   0/1251]  eta: 1:07:33  lr: 0.002001  min_lr: 0.002001  loss: 3.4392 (3.4392)  weight_decay: 0.0500 (0.0500)  time: 3.2405  data: 2.8627  max mem: 28503
Epoch: [160]  [ 200/1251]  eta: 0:06:20  lr: 0.001997  min_lr: 0.001997  loss: 3.2859 (3.1283)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7694 (0.8575)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [160]  [ 400/1251]  eta: 0:05:03  lr: 0.001993  min_lr: 0.001993  loss: 3.3760 (3.1348)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8887 (0.9123)  time: 0.3591  data: 0.0004  max mem: 28503
Epoch: [160]  [ 600/1251]  eta: 0:03:49  lr: 0.001990  min_lr: 0.001990  loss: 2.9964 (3.1338)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0761 (0.9164)  time: 0.3475  data: 0.0004  max mem: 28503
Epoch: [160]  [ 800/1251]  eta: 0:02:38  lr: 0.001986  min_lr: 0.001986  loss: 3.3603 (3.1383)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7923 (0.9054)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [160]  [1000/1251]  eta: 0:01:27  lr: 0.001983  min_lr: 0.001983  loss: 3.2414 (3.1330)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7330 (0.8905)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [160]  [1200/1251]  eta: 0:00:17  lr: 0.001979  min_lr: 0.001979  loss: 3.0548 (3.1278)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0512 (0.9137)  time: 0.3446  data: 0.0004  max mem: 28503
Epoch: [160]  [1250/1251]  eta: 0:00:00  lr: 0.001978  min_lr: 0.001978  loss: 3.4524 (3.1298)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9545 (0.9131)  time: 0.2919  data: 0.0006  max mem: 28503
Epoch: [160] Total time: 0:07:17 (0.3497 s / it)
Averaged stats: lr: 0.001978  min_lr: 0.001978  loss: 3.4524 (3.1552)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9545 (0.9131)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6579 (0.6579)  acc1: 89.6000 (89.6000)  acc5: 98.4000 (98.4000)  time: 5.7158  data: 5.5301  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8750 (0.8739)  acc1: 83.6000 (82.7273)  acc5: 97.2000 (96.9455)  time: 0.7309  data: 0.5606  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0633 (1.0156)  acc1: 76.8000 (79.3714)  acc5: 95.2000 (95.2381)  time: 0.2003  data: 0.0319  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1199 (1.0245)  acc1: 76.4000 (79.1520)  acc5: 94.0000 (95.1200)  time: 0.1999  data: 0.0318  max mem: 28503
Test: Total time: 0:00:10 (0.4193 s / it)
* Acc@1 80.050 Acc@5 95.528 loss 1.003
Accuracy of the model on the 50000 test images: 80.1%
Max accuracy: 80.25%
Epoch: [161]  [   0/1251]  eta: 1:12:59  lr: 0.001978  min_lr: 0.001978  loss: 3.1651 (3.1651)  weight_decay: 0.0500 (0.0500)  time: 3.5010  data: 2.3389  max mem: 28503
Epoch: [161]  [ 200/1251]  eta: 0:06:21  lr: 0.001974  min_lr: 0.001974  loss: 3.2423 (3.0830)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8458 (0.9720)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [161]  [ 400/1251]  eta: 0:05:02  lr: 0.001971  min_lr: 0.001971  loss: 2.9140 (3.0834)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8532 (0.9617)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [161]  [ 600/1251]  eta: 0:03:49  lr: 0.001967  min_lr: 0.001967  loss: 3.2351 (3.0776)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8024 (0.9699)  time: 0.3465  data: 0.0004  max mem: 28503
Epoch: [161]  [ 800/1251]  eta: 0:02:38  lr: 0.001964  min_lr: 0.001964  loss: 3.2459 (3.0883)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7591 (0.9368)  time: 0.3532  data: 0.0004  max mem: 28503
Epoch: [161]  [1000/1251]  eta: 0:01:28  lr: 0.001960  min_lr: 0.001960  loss: 3.1123 (3.1049)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8482 (0.9176)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [161]  [1200/1251]  eta: 0:00:17  lr: 0.001956  min_lr: 0.001956  loss: 3.3271 (3.1097)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9264 (0.9410)  time: 0.3513  data: 0.0004  max mem: 28503
Epoch: [161]  [1250/1251]  eta: 0:00:00  lr: 0.001956  min_lr: 0.001956  loss: 3.1937 (3.1138)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0506 (0.9456)  time: 0.2915  data: 0.0005  max mem: 28503
Epoch: [161] Total time: 0:07:17 (0.3497 s / it)
Averaged stats: lr: 0.001956  min_lr: 0.001956  loss: 3.1937 (3.1445)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0506 (0.9456)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6359 (0.6359)  acc1: 88.4000 (88.4000)  acc5: 98.4000 (98.4000)  time: 5.4897  data: 5.2841  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8417 (0.8236)  acc1: 82.8000 (83.4182)  acc5: 97.6000 (97.1636)  time: 0.6816  data: 0.5084  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0429 (0.9611)  acc1: 78.0000 (80.0000)  acc5: 95.6000 (95.7524)  time: 0.1894  data: 0.0155  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0588 (0.9652)  acc1: 78.0000 (79.9840)  acc5: 95.2000 (95.6960)  time: 0.1899  data: 0.0154  max mem: 28503
Test: Total time: 0:00:10 (0.4018 s / it)
* Acc@1 80.496 Acc@5 95.600 loss 0.958
Accuracy of the model on the 50000 test images: 80.5%
Max accuracy: 80.50%
Epoch: [162]  [   0/1251]  eta: 0:57:47  lr: 0.001956  min_lr: 0.001956  loss: 2.4625 (2.4625)  weight_decay: 0.0500 (0.0500)  time: 2.7719  data: 2.2872  max mem: 28503
Epoch: [162]  [ 200/1251]  eta: 0:06:17  lr: 0.001952  min_lr: 0.001952  loss: 3.3924 (3.1579)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8432 (0.9137)  time: 0.3471  data: 0.0004  max mem: 28503
Epoch: [162]  [ 400/1251]  eta: 0:05:01  lr: 0.001948  min_lr: 0.001948  loss: 3.2162 (3.1590)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8996 (0.8901)  time: 0.3546  data: 0.0004  max mem: 28503
Epoch: [162]  [ 600/1251]  eta: 0:03:49  lr: 0.001945  min_lr: 0.001945  loss: 3.2861 (3.1462)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7881 (0.8761)  time: 0.3446  data: 0.0004  max mem: 28503
Epoch: [162]  [ 800/1251]  eta: 0:02:38  lr: 0.001941  min_lr: 0.001941  loss: 3.4334 (3.1360)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0043 (0.8957)  time: 0.3570  data: 0.0004  max mem: 28503
Epoch: [162]  [1000/1251]  eta: 0:01:27  lr: 0.001938  min_lr: 0.001938  loss: 3.4763 (3.1448)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [162]  [1200/1251]  eta: 0:00:17  lr: 0.001934  min_lr: 0.001934  loss: 3.4618 (3.1505)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9518 (nan)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [162]  [1250/1251]  eta: 0:00:00  lr: 0.001933  min_lr: 0.001933  loss: 3.2121 (3.1497)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9385 (nan)  time: 0.2974  data: 0.0006  max mem: 28503
Epoch: [162] Total time: 0:07:17 (0.3494 s / it)
Averaged stats: lr: 0.001933  min_lr: 0.001933  loss: 3.2121 (3.1430)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9385 (nan)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.6612 (0.6612)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 5.4717  data: 5.2493  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 0.8357 (0.8413)  acc1: 84.4000 (83.3091)  acc5: 97.2000 (97.2727)  time: 0.6532  data: 0.4776  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0165 (1.0028)  acc1: 77.6000 (79.6952)  acc5: 95.2000 (95.5810)  time: 0.1866  data: 0.0170  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1082 (1.0152)  acc1: 77.2000 (79.3280)  acc5: 94.4000 (95.4080)  time: 0.1857  data: 0.0174  max mem: 28503
Test: Total time: 0:00:09 (0.3990 s / it)
* Acc@1 80.278 Acc@5 95.530 loss 0.996
Accuracy of the model on the 50000 test images: 80.3%
Max accuracy: 80.50%
Epoch: [163]  [   0/1251]  eta: 1:10:02  lr: 0.001933  min_lr: 0.001933  loss: 3.3765 (3.3765)  weight_decay: 0.0500 (0.0500)  time: 3.3590  data: 2.8401  max mem: 28503
Epoch: [163]  [ 200/1251]  eta: 0:06:20  lr: 0.001930  min_lr: 0.001930  loss: 3.0621 (3.1417)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8798 (0.9014)  time: 0.3446  data: 0.0004  max mem: 28503
Epoch: [163]  [ 400/1251]  eta: 0:05:01  lr: 0.001926  min_lr: 0.001926  loss: 3.3728 (3.1542)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8020 (0.9269)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [163]  [ 600/1251]  eta: 0:03:49  lr: 0.001922  min_lr: 0.001922  loss: 3.2564 (3.1593)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9375 (0.9391)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [163]  [ 800/1251]  eta: 0:02:38  lr: 0.001919  min_lr: 0.001919  loss: 3.2952 (3.1503)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8318 (0.9221)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [163]  [1000/1251]  eta: 0:01:28  lr: 0.001915  min_lr: 0.001915  loss: 3.3348 (3.1489)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7239 (0.9147)  time: 0.3472  data: 0.0004  max mem: 28503
Epoch: [163]  [1200/1251]  eta: 0:00:17  lr: 0.001912  min_lr: 0.001912  loss: 3.1173 (3.1500)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9217 (0.9315)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [163]  [1250/1251]  eta: 0:00:00  lr: 0.001911  min_lr: 0.001911  loss: 3.2997 (3.1508)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0011 (0.9368)  time: 0.2920  data: 0.0006  max mem: 28503
Epoch: [163] Total time: 0:07:17 (0.3498 s / it)
Averaged stats: lr: 0.001911  min_lr: 0.001911  loss: 3.2997 (3.1377)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0011 (0.9368)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6234 (0.6234)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 5.5306  data: 5.3371  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8935 (0.8480)  acc1: 82.4000 (83.4545)  acc5: 97.2000 (97.2000)  time: 0.7178  data: 0.5439  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0305 (0.9989)  acc1: 79.2000 (79.9810)  acc5: 94.8000 (95.4095)  time: 0.2054  data: 0.0323  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0949 (1.0071)  acc1: 77.2000 (79.5360)  acc5: 94.4000 (95.3760)  time: 0.2041  data: 0.0323  max mem: 28503
Test: Total time: 0:00:10 (0.4159 s / it)
* Acc@1 80.246 Acc@5 95.618 loss 0.983
Accuracy of the model on the 50000 test images: 80.2%
Max accuracy: 80.50%
Epoch: [164]  [   0/1251]  eta: 1:11:18  lr: 0.001911  min_lr: 0.001911  loss: 3.4429 (3.4429)  weight_decay: 0.0500 (0.0500)  time: 3.4201  data: 1.5483  max mem: 28503
Epoch: [164]  [ 200/1251]  eta: 0:06:21  lr: 0.001907  min_lr: 0.001907  loss: 3.1285 (3.1336)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7666 (0.8553)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [164]  [ 400/1251]  eta: 0:05:03  lr: 0.001904  min_lr: 0.001904  loss: 2.8395 (3.1237)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8973 (0.9096)  time: 0.3662  data: 0.0005  max mem: 28503
Epoch: [164]  [ 600/1251]  eta: 0:03:50  lr: 0.001900  min_lr: 0.001900  loss: 3.1659 (3.1462)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7536 (0.9030)  time: 0.3474  data: 0.0005  max mem: 28503
Epoch: [164]  [ 800/1251]  eta: 0:02:39  lr: 0.001896  min_lr: 0.001896  loss: 3.0675 (3.1453)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8927 (0.9071)  time: 0.3462  data: 0.0004  max mem: 28503
Epoch: [164]  [1000/1251]  eta: 0:01:28  lr: 0.001893  min_lr: 0.001893  loss: 3.0146 (3.1427)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9951 (0.9271)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [164]  [1200/1251]  eta: 0:00:17  lr: 0.001889  min_lr: 0.001889  loss: 3.4489 (3.1458)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8705 (0.9386)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [164]  [1250/1251]  eta: 0:00:00  lr: 0.001888  min_lr: 0.001888  loss: 3.2883 (3.1513)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7413 (0.9295)  time: 0.3025  data: 0.0007  max mem: 28503
Epoch: [164] Total time: 0:07:19 (0.3510 s / it)
Averaged stats: lr: 0.001888  min_lr: 0.001888  loss: 3.2883 (3.1390)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7413 (0.9295)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6979 (0.6979)  acc1: 87.6000 (87.6000)  acc5: 98.8000 (98.8000)  time: 5.5724  data: 5.3838  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8800 (0.8961)  acc1: 84.8000 (83.2727)  acc5: 97.6000 (97.2364)  time: 0.6878  data: 0.5158  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0798 (1.0502)  acc1: 78.0000 (79.6952)  acc5: 94.8000 (95.3905)  time: 0.1866  data: 0.0173  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1425 (1.0581)  acc1: 78.0000 (79.3440)  acc5: 94.8000 (95.3280)  time: 0.1854  data: 0.0173  max mem: 28503
Test: Total time: 0:00:10 (0.4031 s / it)
* Acc@1 80.194 Acc@5 95.500 loss 1.049
Accuracy of the model on the 50000 test images: 80.2%
Max accuracy: 80.50%
Epoch: [165]  [   0/1251]  eta: 1:07:46  lr: 0.001888  min_lr: 0.001888  loss: 2.7393 (2.7393)  weight_decay: 0.0500 (0.0500)  time: 3.2507  data: 2.4191  max mem: 28503
Epoch: [165]  [ 200/1251]  eta: 0:06:21  lr: 0.001885  min_lr: 0.001885  loss: 3.0111 (3.1221)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8973 (0.9500)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [165]  [ 400/1251]  eta: 0:05:02  lr: 0.001881  min_lr: 0.001881  loss: 3.3917 (3.1306)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8817 (0.9356)  time: 0.3470  data: 0.0005  max mem: 28503
Epoch: [165]  [ 600/1251]  eta: 0:03:49  lr: 0.001878  min_lr: 0.001878  loss: 3.1975 (3.1475)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9503 (0.9535)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [165]  [ 800/1251]  eta: 0:02:38  lr: 0.001874  min_lr: 0.001874  loss: 3.3226 (3.1487)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8043 (0.9412)  time: 0.3490  data: 0.0005  max mem: 28503
Epoch: [165]  [1000/1251]  eta: 0:01:28  lr: 0.001870  min_lr: 0.001870  loss: 3.0926 (3.1463)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9413 (0.9413)  time: 0.3464  data: 0.0004  max mem: 28503
Epoch: [165]  [1200/1251]  eta: 0:00:17  lr: 0.001867  min_lr: 0.001867  loss: 3.3376 (3.1561)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0201 (0.9464)  time: 0.3445  data: 0.0004  max mem: 28503
Epoch: [165]  [1250/1251]  eta: 0:00:00  lr: 0.001866  min_lr: 0.001866  loss: 3.3029 (3.1575)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8387 (0.9433)  time: 0.2917  data: 0.0006  max mem: 28503
Epoch: [165] Total time: 0:07:17 (0.3497 s / it)
Averaged stats: lr: 0.001866  min_lr: 0.001866  loss: 3.3029 (3.1410)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8387 (0.9433)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6413 (0.6413)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 5.5898  data: 5.3865  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 0.8423 (0.8354)  acc1: 85.6000 (83.8909)  acc5: 97.6000 (97.6000)  time: 0.7415  data: 0.5691  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0475 (0.9881)  acc1: 77.6000 (80.1905)  acc5: 95.2000 (95.7524)  time: 0.2126  data: 0.0437  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0698 (0.9975)  acc1: 78.0000 (79.9680)  acc5: 94.4000 (95.5840)  time: 0.2121  data: 0.0436  max mem: 28503
Test: Total time: 0:00:10 (0.4239 s / it)
* Acc@1 80.448 Acc@5 95.676 loss 0.985
Accuracy of the model on the 50000 test images: 80.4%
Max accuracy: 80.50%
Epoch: [166]  [   0/1251]  eta: 1:06:10  lr: 0.001866  min_lr: 0.001866  loss: 2.9164 (2.9164)  weight_decay: 0.0500 (0.0500)  time: 3.1739  data: 1.7365  max mem: 28503
Epoch: [166]  [ 200/1251]  eta: 0:06:21  lr: 0.001862  min_lr: 0.001862  loss: 2.9562 (3.0897)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7942 (0.9308)  time: 0.3543  data: 0.0004  max mem: 28503
Epoch: [166]  [ 400/1251]  eta: 0:05:02  lr: 0.001859  min_lr: 0.001859  loss: 3.3051 (3.0979)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.3445  data: 0.0004  max mem: 28503
Epoch: [166]  [ 600/1251]  eta: 0:03:49  lr: 0.001855  min_lr: 0.001855  loss: 3.3602 (3.1108)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8226 (nan)  time: 0.3610  data: 0.0004  max mem: 28503
Epoch: [166]  [ 800/1251]  eta: 0:02:38  lr: 0.001852  min_lr: 0.001852  loss: 3.2821 (3.0999)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7766 (nan)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [166]  [1000/1251]  eta: 0:01:27  lr: 0.001848  min_lr: 0.001848  loss: 3.2279 (3.1025)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9239 (nan)  time: 0.3460  data: 0.0004  max mem: 28503
Epoch: [166]  [1200/1251]  eta: 0:00:17  lr: 0.001844  min_lr: 0.001844  loss: 3.3904 (3.1115)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7628 (nan)  time: 0.3469  data: 0.0005  max mem: 28503
Epoch: [166]  [1250/1251]  eta: 0:00:00  lr: 0.001844  min_lr: 0.001844  loss: 3.2285 (3.1133)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8475 (nan)  time: 0.2922  data: 0.0006  max mem: 28503
Epoch: [166] Total time: 0:07:17 (0.3496 s / it)
Averaged stats: lr: 0.001844  min_lr: 0.001844  loss: 3.2285 (3.1326)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8475 (nan)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.5844 (0.5844)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.4651  data: 5.2621  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8206 (0.8096)  acc1: 85.2000 (83.8182)  acc5: 97.6000 (97.2000)  time: 0.7330  data: 0.5600  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0089 (0.9658)  acc1: 78.4000 (80.4381)  acc5: 94.8000 (95.3905)  time: 0.2140  data: 0.0450  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0938 (0.9795)  acc1: 78.4000 (80.0480)  acc5: 94.0000 (95.2160)  time: 0.2138  data: 0.0449  max mem: 28503
Test: Total time: 0:00:10 (0.4202 s / it)
* Acc@1 80.544 Acc@5 95.584 loss 0.958
Accuracy of the model on the 50000 test images: 80.5%
Max accuracy: 80.54%
Epoch: [167]  [   0/1251]  eta: 1:05:53  lr: 0.001844  min_lr: 0.001844  loss: 3.6448 (3.6448)  weight_decay: 0.0500 (0.0500)  time: 3.1601  data: 2.7976  max mem: 28503
Epoch: [167]  [ 200/1251]  eta: 0:06:18  lr: 0.001840  min_lr: 0.001840  loss: 3.3335 (3.1170)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0425 (1.0322)  time: 0.3545  data: 0.0004  max mem: 28503
Epoch: [167]  [ 400/1251]  eta: 0:05:01  lr: 0.001836  min_lr: 0.001836  loss: 3.2887 (3.1131)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8661 (0.9980)  time: 0.3478  data: 0.0004  max mem: 28503
Epoch: [167]  [ 600/1251]  eta: 0:03:49  lr: 0.001833  min_lr: 0.001833  loss: 3.2494 (3.1330)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7684 (0.9635)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [167]  [ 800/1251]  eta: 0:02:38  lr: 0.001829  min_lr: 0.001829  loss: 3.2614 (3.1560)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8792 (0.9637)  time: 0.3470  data: 0.0004  max mem: 28503
Epoch: [167]  [1000/1251]  eta: 0:01:27  lr: 0.001826  min_lr: 0.001826  loss: 3.2553 (3.1503)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9562 (0.9531)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [167]  [1200/1251]  eta: 0:00:17  lr: 0.001822  min_lr: 0.001822  loss: 3.2633 (3.1542)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7964 (0.9484)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [167]  [1250/1251]  eta: 0:00:00  lr: 0.001821  min_lr: 0.001821  loss: 3.1160 (3.1489)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9901 (0.9527)  time: 0.2918  data: 0.0007  max mem: 28503
Epoch: [167] Total time: 0:07:17 (0.3495 s / it)
Averaged stats: lr: 0.001821  min_lr: 0.001821  loss: 3.1160 (3.1298)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9901 (0.9527)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.6533 (0.6533)  acc1: 87.6000 (87.6000)  acc5: 99.2000 (99.2000)  time: 5.7418  data: 5.5056  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 0.8911 (0.8291)  acc1: 84.0000 (83.7455)  acc5: 97.6000 (97.4545)  time: 0.7433  data: 0.5685  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0166 (0.9791)  acc1: 77.2000 (80.1524)  acc5: 95.6000 (95.7524)  time: 0.2076  data: 0.0374  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0983 (0.9846)  acc1: 76.8000 (79.9360)  acc5: 94.8000 (95.6160)  time: 0.2074  data: 0.0373  max mem: 28503
Test: Total time: 0:00:10 (0.4263 s / it)
* Acc@1 80.354 Acc@5 95.670 loss 0.974
Accuracy of the model on the 50000 test images: 80.4%
Max accuracy: 80.54%
Epoch: [168]  [   0/1251]  eta: 1:14:01  lr: 0.001821  min_lr: 0.001821  loss: 3.1394 (3.1394)  weight_decay: 0.0500 (0.0500)  time: 3.5501  data: 2.2799  max mem: 28503
Epoch: [168]  [ 200/1251]  eta: 0:06:23  lr: 0.001818  min_lr: 0.001818  loss: 2.9960 (3.1228)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9516 (0.9485)  time: 0.3539  data: 0.0004  max mem: 28503
Epoch: [168]  [ 400/1251]  eta: 0:05:03  lr: 0.001814  min_lr: 0.001814  loss: 3.3112 (3.1177)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9674 (0.9425)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [168]  [ 600/1251]  eta: 0:03:50  lr: 0.001811  min_lr: 0.001811  loss: 3.3022 (3.1190)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8316 (0.9303)  time: 0.3483  data: 0.0004  max mem: 28503
Epoch: [168]  [ 800/1251]  eta: 0:02:38  lr: 0.001807  min_lr: 0.001807  loss: 3.1395 (3.1098)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9052 (0.9582)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [168]  [1000/1251]  eta: 0:01:28  lr: 0.001803  min_lr: 0.001803  loss: 3.1997 (3.1134)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8644 (0.9446)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [168]  [1200/1251]  eta: 0:00:17  lr: 0.001800  min_lr: 0.001800  loss: 3.2315 (3.1073)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9859 (0.9550)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [168]  [1250/1251]  eta: 0:00:00  lr: 0.001799  min_lr: 0.001799  loss: 3.2461 (3.1080)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0172 (0.9579)  time: 0.2917  data: 0.0007  max mem: 28503
Epoch: [168] Total time: 0:07:18 (0.3501 s / it)
Averaged stats: lr: 0.001799  min_lr: 0.001799  loss: 3.2461 (3.1195)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0172 (0.9579)
Test:  [ 0/25]  eta: 0:01:47  loss: 0.6374 (0.6374)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 4.2852  data: 4.0879  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 0.8709 (0.8242)  acc1: 84.4000 (83.8909)  acc5: 97.2000 (97.0909)  time: 0.6339  data: 0.4536  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0431 (0.9906)  acc1: 78.4000 (80.1524)  acc5: 95.2000 (95.4857)  time: 0.2589  data: 0.0843  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0742 (0.9959)  acc1: 78.4000 (79.7920)  acc5: 94.8000 (95.4720)  time: 0.2202  data: 0.0505  max mem: 28503
Test: Total time: 0:00:10 (0.4170 s / it)
* Acc@1 80.690 Acc@5 95.560 loss 0.975
Accuracy of the model on the 50000 test images: 80.7%
Max accuracy: 80.69%
Epoch: [169]  [   0/1251]  eta: 1:08:14  lr: 0.001799  min_lr: 0.001799  loss: 3.1734 (3.1734)  weight_decay: 0.0500 (0.0500)  time: 3.2732  data: 2.8935  max mem: 28503
Epoch: [169]  [ 200/1251]  eta: 0:06:22  lr: 0.001795  min_lr: 0.001795  loss: 3.0057 (3.1489)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9034 (1.0012)  time: 0.3551  data: 0.0004  max mem: 28503
Epoch: [169]  [ 400/1251]  eta: 0:05:02  lr: 0.001792  min_lr: 0.001792  loss: 3.1267 (3.1298)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6896 (0.8987)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [169]  [ 600/1251]  eta: 0:03:49  lr: 0.001788  min_lr: 0.001788  loss: 3.1908 (3.1113)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7082 (0.8711)  time: 0.3460  data: 0.0004  max mem: 28503
Epoch: [169]  [ 800/1251]  eta: 0:02:38  lr: 0.001785  min_lr: 0.001785  loss: 3.2488 (3.1144)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8469 (0.8768)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [169]  [1000/1251]  eta: 0:01:28  lr: 0.001781  min_lr: 0.001781  loss: 3.1282 (3.1257)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1263 (0.9231)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [169]  [1200/1251]  eta: 0:00:17  lr: 0.001777  min_lr: 0.001777  loss: 3.2112 (3.1327)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7370 (0.9214)  time: 0.3464  data: 0.0004  max mem: 28503
Epoch: [169]  [1250/1251]  eta: 0:00:00  lr: 0.001777  min_lr: 0.001777  loss: 3.2778 (3.1337)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7233 (0.9205)  time: 0.2918  data: 0.0007  max mem: 28503
Epoch: [169] Total time: 0:07:17 (0.3498 s / it)
Averaged stats: lr: 0.001777  min_lr: 0.001777  loss: 3.2778 (3.1126)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7233 (0.9205)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6743 (0.6743)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 5.5357  data: 5.3363  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8746 (0.8578)  acc1: 84.8000 (83.3818)  acc5: 96.8000 (97.0182)  time: 0.7266  data: 0.5524  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0160 (1.0004)  acc1: 77.2000 (80.1524)  acc5: 95.2000 (95.3524)  time: 0.2102  data: 0.0371  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1052 (1.0059)  acc1: 77.2000 (79.7760)  acc5: 94.8000 (95.4080)  time: 0.2093  data: 0.0369  max mem: 28503
Test: Total time: 0:00:10 (0.4206 s / it)
* Acc@1 80.530 Acc@5 95.538 loss 0.984
Accuracy of the model on the 50000 test images: 80.5%
Max accuracy: 80.69%
Epoch: [170]  [   0/1251]  eta: 1:12:14  lr: 0.001777  min_lr: 0.001777  loss: 3.0234 (3.0234)  weight_decay: 0.0500 (0.0500)  time: 3.4646  data: 2.6318  max mem: 28503
Epoch: [170]  [ 200/1251]  eta: 0:06:22  lr: 0.001773  min_lr: 0.001773  loss: 3.0753 (3.0696)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7445 (0.9006)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [170]  [ 400/1251]  eta: 0:05:02  lr: 0.001769  min_lr: 0.001769  loss: 3.4462 (3.1197)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7562 (0.8702)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [170]  [ 600/1251]  eta: 0:03:50  lr: 0.001766  min_lr: 0.001766  loss: 3.2570 (3.1163)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0794 (0.9218)  time: 0.3464  data: 0.0004  max mem: 28503
Epoch: [170]  [ 800/1251]  eta: 0:02:38  lr: 0.001762  min_lr: 0.001762  loss: 3.2460 (3.1174)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8721 (0.9155)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [170]  [1000/1251]  eta: 0:01:28  lr: 0.001759  min_lr: 0.001759  loss: 3.0034 (3.1133)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9926 (0.9308)  time: 0.3465  data: 0.0005  max mem: 28503
Epoch: [170]  [1200/1251]  eta: 0:00:17  lr: 0.001755  min_lr: 0.001755  loss: 3.3618 (3.1149)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8498 (0.9143)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [170]  [1250/1251]  eta: 0:00:00  lr: 0.001754  min_lr: 0.001754  loss: 3.3867 (3.1163)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7890 (0.9193)  time: 0.2922  data: 0.0008  max mem: 28503
Epoch: [170] Total time: 0:07:17 (0.3500 s / it)
Averaged stats: lr: 0.001754  min_lr: 0.001754  loss: 3.3867 (3.1164)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7890 (0.9193)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.6997 (0.6997)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 5.8000  data: 5.5792  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8981 (0.8566)  acc1: 85.2000 (84.1818)  acc5: 97.6000 (97.2364)  time: 0.7130  data: 0.5393  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0289 (1.0110)  acc1: 76.8000 (80.0952)  acc5: 95.6000 (95.7714)  time: 0.1864  data: 0.0177  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1237 (1.0168)  acc1: 76.8000 (79.8400)  acc5: 94.8000 (95.6960)  time: 0.1860  data: 0.0176  max mem: 28503
Test: Total time: 0:00:10 (0.4141 s / it)
* Acc@1 80.662 Acc@5 95.620 loss 1.009
Accuracy of the model on the 50000 test images: 80.7%
Max accuracy: 80.69%
Epoch: [171]  [   0/1251]  eta: 1:11:51  lr: 0.001754  min_lr: 0.001754  loss: 3.4853 (3.4853)  weight_decay: 0.0500 (0.0500)  time: 3.4461  data: 2.3545  max mem: 28503
Epoch: [171]  [ 200/1251]  eta: 0:06:24  lr: 0.001751  min_lr: 0.001751  loss: 3.0910 (3.1313)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7578 (0.8515)  time: 0.3447  data: 0.0004  max mem: 28503
Epoch: [171]  [ 400/1251]  eta: 0:05:02  lr: 0.001747  min_lr: 0.001747  loss: 3.0896 (3.0813)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8353 (0.9188)  time: 0.3554  data: 0.0004  max mem: 28503
Epoch: [171]  [ 600/1251]  eta: 0:03:50  lr: 0.001744  min_lr: 0.001744  loss: 3.0582 (3.0847)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9157 (0.9268)  time: 0.3445  data: 0.0004  max mem: 28503
Epoch: [171]  [ 800/1251]  eta: 0:02:38  lr: 0.001740  min_lr: 0.001740  loss: 2.9202 (3.0887)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8364 (0.9401)  time: 0.3448  data: 0.0004  max mem: 28503
Epoch: [171]  [1000/1251]  eta: 0:01:28  lr: 0.001737  min_lr: 0.001737  loss: 3.3312 (3.0937)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7917 (0.9160)  time: 0.3474  data: 0.0004  max mem: 28503
Epoch: [171]  [1200/1251]  eta: 0:00:17  lr: 0.001733  min_lr: 0.001733  loss: 3.4111 (3.1051)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8689 (0.9144)  time: 0.3446  data: 0.0004  max mem: 28503
Epoch: [171]  [1250/1251]  eta: 0:00:00  lr: 0.001732  min_lr: 0.001732  loss: 3.0968 (3.1057)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9612 (0.9193)  time: 0.2921  data: 0.0005  max mem: 28503
Epoch: [171] Total time: 0:07:17 (0.3500 s / it)
Averaged stats: lr: 0.001732  min_lr: 0.001732  loss: 3.0968 (3.1153)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9612 (0.9193)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.6056 (0.6056)  acc1: 88.0000 (88.0000)  acc5: 98.4000 (98.4000)  time: 5.4583  data: 5.2537  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8246 (0.7922)  acc1: 83.2000 (83.6364)  acc5: 97.2000 (97.2364)  time: 0.7057  data: 0.5324  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9785 (0.9323)  acc1: 79.2000 (80.4000)  acc5: 94.8000 (95.7333)  time: 0.1995  data: 0.0302  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0194 (0.9403)  acc1: 79.2000 (80.0640)  acc5: 94.8000 (95.7280)  time: 0.1985  data: 0.0301  max mem: 28503
Test: Total time: 0:00:10 (0.4081 s / it)
* Acc@1 80.884 Acc@5 95.720 loss 0.930
Accuracy of the model on the 50000 test images: 80.9%
Max accuracy: 80.88%
Epoch: [172]  [   0/1251]  eta: 1:01:02  lr: 0.001732  min_lr: 0.001732  loss: 3.2977 (3.2977)  weight_decay: 0.0500 (0.0500)  time: 2.9280  data: 2.5375  max mem: 28503
Epoch: [172]  [ 200/1251]  eta: 0:06:18  lr: 0.001729  min_lr: 0.001729  loss: 3.0754 (3.0962)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9777 (0.9928)  time: 0.3460  data: 0.0004  max mem: 28503
Epoch: [172]  [ 400/1251]  eta: 0:05:01  lr: 0.001725  min_lr: 0.001725  loss: 3.0278 (3.1077)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9193 (0.9753)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [172]  [ 600/1251]  eta: 0:03:49  lr: 0.001721  min_lr: 0.001721  loss: 2.9198 (3.1031)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8849 (0.9433)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [172]  [ 800/1251]  eta: 0:02:38  lr: 0.001718  min_lr: 0.001718  loss: 2.9928 (3.0856)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8189 (0.9413)  time: 0.3493  data: 0.0004  max mem: 28503
Epoch: [172]  [1000/1251]  eta: 0:01:28  lr: 0.001714  min_lr: 0.001714  loss: 3.3018 (3.0884)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8132 (0.9381)  time: 0.3544  data: 0.0004  max mem: 28503
Epoch: [172]  [1200/1251]  eta: 0:00:17  lr: 0.001711  min_lr: 0.001711  loss: 3.1923 (3.0942)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0132 (0.9496)  time: 0.3482  data: 0.0004  max mem: 28503
Epoch: [172]  [1250/1251]  eta: 0:00:00  lr: 0.001710  min_lr: 0.001710  loss: 3.4132 (3.0959)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8691 (0.9442)  time: 0.2917  data: 0.0006  max mem: 28503
Epoch: [172] Total time: 0:07:18 (0.3503 s / it)
Averaged stats: lr: 0.001710  min_lr: 0.001710  loss: 3.4132 (3.1104)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8691 (0.9442)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.6886 (0.6886)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 5.4402  data: 5.2115  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8986 (0.8876)  acc1: 84.0000 (83.4909)  acc5: 98.0000 (97.4909)  time: 0.6916  data: 0.5156  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0355 (1.0309)  acc1: 77.2000 (79.8857)  acc5: 94.8000 (95.6571)  time: 0.1932  data: 0.0236  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1328 (1.0383)  acc1: 77.2000 (79.8880)  acc5: 94.8000 (95.6320)  time: 0.2068  data: 0.0384  max mem: 28503
Test: Total time: 0:00:10 (0.4149 s / it)
* Acc@1 80.564 Acc@5 95.648 loss 1.027
Accuracy of the model on the 50000 test images: 80.6%
Max accuracy: 80.88%
Epoch: [173]  [   0/1251]  eta: 1:12:10  lr: 0.001710  min_lr: 0.001710  loss: 3.4173 (3.4173)  weight_decay: 0.0500 (0.0500)  time: 3.4615  data: 2.1188  max mem: 28503
Epoch: [173]  [ 200/1251]  eta: 0:06:22  lr: 0.001706  min_lr: 0.001706  loss: 3.2049 (3.1006)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8598 (0.8981)  time: 0.3461  data: 0.0004  max mem: 28503
Epoch: [173]  [ 400/1251]  eta: 0:05:02  lr: 0.001703  min_lr: 0.001703  loss: 3.0821 (3.0912)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9442 (0.9829)  time: 0.3447  data: 0.0004  max mem: 28503
Epoch: [173]  [ 600/1251]  eta: 0:03:49  lr: 0.001699  min_lr: 0.001699  loss: 3.3524 (3.0857)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7884 (0.9525)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [173]  [ 800/1251]  eta: 0:02:38  lr: 0.001696  min_lr: 0.001696  loss: 3.2319 (3.0952)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0678 (0.9641)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [173]  [1000/1251]  eta: 0:01:28  lr: 0.001692  min_lr: 0.001692  loss: 3.0745 (3.0956)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1208 (0.9760)  time: 0.3472  data: 0.0004  max mem: 28503
Epoch: [173]  [1200/1251]  eta: 0:00:17  lr: 0.001689  min_lr: 0.001689  loss: 3.3799 (3.1098)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8072 (0.9798)  time: 0.3516  data: 0.0005  max mem: 28503
Epoch: [173]  [1250/1251]  eta: 0:00:00  lr: 0.001688  min_lr: 0.001688  loss: 3.0383 (3.1085)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6981 (0.9706)  time: 0.2919  data: 0.0007  max mem: 28503
Epoch: [173] Total time: 0:07:18 (0.3506 s / it)
Averaged stats: lr: 0.001688  min_lr: 0.001688  loss: 3.0383 (3.1040)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6981 (0.9706)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.5689 (0.5689)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 5.8366  data: 5.6168  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 0.8073 (0.7838)  acc1: 84.8000 (83.7818)  acc5: 97.2000 (97.3455)  time: 0.7616  data: 0.5884  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9666 (0.9309)  acc1: 77.2000 (80.2286)  acc5: 95.6000 (95.7905)  time: 0.2112  data: 0.0428  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0433 (0.9418)  acc1: 76.8000 (80.0160)  acc5: 95.2000 (95.6960)  time: 0.2109  data: 0.0427  max mem: 28503
Test: Total time: 0:00:10 (0.4325 s / it)
* Acc@1 80.764 Acc@5 95.682 loss 0.924
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 80.88%
Epoch: [174]  [   0/1251]  eta: 1:10:22  lr: 0.001688  min_lr: 0.001688  loss: 2.3477 (2.3477)  weight_decay: 0.0500 (0.0500)  time: 3.3754  data: 2.7186  max mem: 28503
Epoch: [174]  [ 200/1251]  eta: 0:06:23  lr: 0.001684  min_lr: 0.001684  loss: 3.1266 (3.1128)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8350 (0.9268)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [174]  [ 400/1251]  eta: 0:05:04  lr: 0.001681  min_lr: 0.001681  loss: 2.9270 (3.0979)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7774 (0.8976)  time: 0.3461  data: 0.0004  max mem: 28503
Epoch: [174]  [ 600/1251]  eta: 0:03:51  lr: 0.001677  min_lr: 0.001677  loss: 3.0485 (3.1018)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1475 (0.9675)  time: 0.3460  data: 0.0004  max mem: 28503
Epoch: [174]  [ 800/1251]  eta: 0:02:39  lr: 0.001674  min_lr: 0.001674  loss: 3.2735 (3.1126)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8066 (0.9409)  time: 0.3470  data: 0.0005  max mem: 28503
Epoch: [174]  [1000/1251]  eta: 0:01:28  lr: 0.001670  min_lr: 0.001670  loss: 3.2866 (3.1236)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8763 (0.9431)  time: 0.3453  data: 0.0005  max mem: 28503
Epoch: [174]  [1200/1251]  eta: 0:00:17  lr: 0.001666  min_lr: 0.001666  loss: 3.2546 (3.1136)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9316 (0.9441)  time: 0.3463  data: 0.0004  max mem: 28503
Epoch: [174]  [1250/1251]  eta: 0:00:00  lr: 0.001666  min_lr: 0.001666  loss: 2.8795 (3.1092)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9316 (0.9426)  time: 0.2921  data: 0.0007  max mem: 28503
Epoch: [174] Total time: 0:07:20 (0.3517 s / it)
Averaged stats: lr: 0.001666  min_lr: 0.001666  loss: 2.8795 (3.0987)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9316 (0.9426)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.5807 (0.5807)  acc1: 88.4000 (88.4000)  acc5: 98.8000 (98.8000)  time: 5.5767  data: 5.3913  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.7673 (0.7665)  acc1: 84.8000 (83.9636)  acc5: 97.6000 (97.3818)  time: 0.6732  data: 0.5024  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9060 (0.9188)  acc1: 79.2000 (80.6286)  acc5: 95.6000 (95.7333)  time: 0.1777  data: 0.0069  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9992 (0.9327)  acc1: 78.0000 (80.1280)  acc5: 94.8000 (95.5680)  time: 0.1773  data: 0.0068  max mem: 28503
Test: Total time: 0:00:10 (0.4005 s / it)
* Acc@1 80.774 Acc@5 95.714 loss 0.913
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 80.88%
Epoch: [175]  [   0/1251]  eta: 1:09:11  lr: 0.001666  min_lr: 0.001666  loss: 3.2054 (3.2054)  weight_decay: 0.0500 (0.0500)  time: 3.3183  data: 2.6147  max mem: 28503
Epoch: [175]  [ 200/1251]  eta: 0:06:21  lr: 0.001662  min_lr: 0.001662  loss: 3.0528 (3.1019)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9191 (0.9960)  time: 0.3476  data: 0.0005  max mem: 28503
Epoch: [175]  [ 400/1251]  eta: 0:05:02  lr: 0.001658  min_lr: 0.001658  loss: 3.3742 (3.1048)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8843 (0.9711)  time: 0.3448  data: 0.0004  max mem: 28503
Epoch: [175]  [ 600/1251]  eta: 0:03:49  lr: 0.001655  min_lr: 0.001655  loss: 3.1119 (3.1054)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7804 (0.9231)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [175]  [ 800/1251]  eta: 0:02:38  lr: 0.001651  min_lr: 0.001651  loss: 3.3571 (3.1020)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9422 (0.9237)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [175]  [1000/1251]  eta: 0:01:27  lr: 0.001648  min_lr: 0.001648  loss: 3.3511 (3.1069)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8857 (0.9194)  time: 0.3462  data: 0.0004  max mem: 28503
Epoch: [175]  [1200/1251]  eta: 0:00:17  lr: 0.001644  min_lr: 0.001644  loss: 3.2559 (3.1027)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9057 (0.9313)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [175]  [1250/1251]  eta: 0:00:00  lr: 0.001644  min_lr: 0.001644  loss: 3.1156 (3.1007)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9057 (0.9356)  time: 0.2921  data: 0.0007  max mem: 28503
Epoch: [175] Total time: 0:07:17 (0.3496 s / it)
Averaged stats: lr: 0.001644  min_lr: 0.001644  loss: 3.1156 (3.0912)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9057 (0.9356)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.5906 (0.5906)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 5.8085  data: 5.5956  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.7831 (0.7736)  acc1: 85.6000 (84.1455)  acc5: 97.2000 (97.1636)  time: 0.6822  data: 0.5090  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9564 (0.9264)  acc1: 79.2000 (80.6095)  acc5: 95.2000 (95.4667)  time: 0.1691  data: 0.0002  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9794 (0.9313)  acc1: 79.2000 (80.5120)  acc5: 95.2000 (95.3920)  time: 0.1686  data: 0.0001  max mem: 28503
Test: Total time: 0:00:09 (0.3982 s / it)
* Acc@1 80.904 Acc@5 95.760 loss 0.919
Accuracy of the model on the 50000 test images: 80.9%
Max accuracy: 80.90%
Epoch: [176]  [   0/1251]  eta: 0:54:30  lr: 0.001643  min_lr: 0.001643  loss: 3.4414 (3.4414)  weight_decay: 0.0500 (0.0500)  time: 2.6141  data: 2.2533  max mem: 28503
Epoch: [176]  [ 200/1251]  eta: 0:06:15  lr: 0.001640  min_lr: 0.001640  loss: 2.9916 (3.0615)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9578 (0.9273)  time: 0.3471  data: 0.0004  max mem: 28503
Epoch: [176]  [ 400/1251]  eta: 0:05:00  lr: 0.001636  min_lr: 0.001636  loss: 3.0912 (3.0586)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9758 (0.9469)  time: 0.3467  data: 0.0004  max mem: 28503
Epoch: [176]  [ 600/1251]  eta: 0:03:48  lr: 0.001633  min_lr: 0.001633  loss: 3.2611 (3.0640)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9438 (0.9544)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [176]  [ 800/1251]  eta: 0:02:37  lr: 0.001629  min_lr: 0.001629  loss: 3.3051 (3.0717)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8709 (0.9697)  time: 0.3448  data: 0.0004  max mem: 28503
Epoch: [176]  [1000/1251]  eta: 0:01:27  lr: 0.001626  min_lr: 0.001626  loss: 3.1904 (3.0885)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9990 (1.0066)  time: 0.3462  data: 0.0005  max mem: 28503
Epoch: [176]  [1200/1251]  eta: 0:00:17  lr: 0.001622  min_lr: 0.001622  loss: 3.2246 (3.0845)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8292 (1.0092)  time: 0.3462  data: 0.0005  max mem: 28503
Epoch: [176]  [1250/1251]  eta: 0:00:00  lr: 0.001621  min_lr: 0.001621  loss: 3.2694 (3.0862)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7889 (0.9994)  time: 0.2921  data: 0.0005  max mem: 28503
Epoch: [176] Total time: 0:07:16 (0.3488 s / it)
Averaged stats: lr: 0.001621  min_lr: 0.001621  loss: 3.2694 (3.0986)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7889 (0.9994)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6937 (0.6937)  acc1: 87.2000 (87.2000)  acc5: 98.8000 (98.8000)  time: 5.5592  data: 5.3565  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8552 (0.8500)  acc1: 84.0000 (83.6364)  acc5: 97.2000 (97.0909)  time: 0.7057  data: 0.5328  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0069 (0.9995)  acc1: 78.8000 (80.3810)  acc5: 94.8000 (95.6381)  time: 0.1944  data: 0.0253  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0750 (1.0074)  acc1: 77.6000 (80.0480)  acc5: 94.8000 (95.5840)  time: 0.1937  data: 0.0252  max mem: 28503
Test: Total time: 0:00:10 (0.4085 s / it)
* Acc@1 80.748 Acc@5 95.664 loss 0.995
Accuracy of the model on the 50000 test images: 80.7%
Max accuracy: 80.90%
Epoch: [177]  [   0/1251]  eta: 1:04:29  lr: 0.001621  min_lr: 0.001621  loss: 3.2982 (3.2982)  weight_decay: 0.0500 (0.0500)  time: 3.0930  data: 2.3189  max mem: 28503
Epoch: [177]  [ 200/1251]  eta: 0:06:20  lr: 0.001618  min_lr: 0.001618  loss: 3.2125 (3.0443)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8328 (0.9280)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [177]  [ 400/1251]  eta: 0:05:03  lr: 0.001614  min_lr: 0.001614  loss: 2.9549 (3.0645)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0532 (0.9906)  time: 0.3540  data: 0.0005  max mem: 28503
Epoch: [177]  [ 600/1251]  eta: 0:03:49  lr: 0.001611  min_lr: 0.001611  loss: 3.1318 (3.0587)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0183 (0.9988)  time: 0.3481  data: 0.0003  max mem: 28503
Epoch: [177]  [ 800/1251]  eta: 0:02:38  lr: 0.001607  min_lr: 0.001607  loss: 3.1469 (3.0774)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8900 (1.0081)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [177]  [1000/1251]  eta: 0:01:28  lr: 0.001604  min_lr: 0.001604  loss: 2.9202 (3.0648)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8739 (0.9882)  time: 0.3447  data: 0.0004  max mem: 28503
Epoch: [177]  [1200/1251]  eta: 0:00:17  lr: 0.001600  min_lr: 0.001600  loss: 3.1854 (3.0827)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9181 (0.9833)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [177]  [1250/1251]  eta: 0:00:00  lr: 0.001599  min_lr: 0.001599  loss: 3.3246 (3.0865)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9098 (0.9796)  time: 0.2916  data: 0.0006  max mem: 28503
Epoch: [177] Total time: 0:07:17 (0.3501 s / it)
Averaged stats: lr: 0.001599  min_lr: 0.001599  loss: 3.3246 (3.0871)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9098 (0.9796)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6634 (0.6634)  acc1: 88.0000 (88.0000)  acc5: 99.2000 (99.2000)  time: 5.6931  data: 5.5008  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9333 (0.8867)  acc1: 83.2000 (83.4909)  acc5: 97.2000 (97.3091)  time: 0.6919  data: 0.5202  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0483 (1.0360)  acc1: 78.4000 (80.3810)  acc5: 95.2000 (95.7905)  time: 0.1802  data: 0.0112  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1493 (1.0404)  acc1: 78.4000 (80.4160)  acc5: 95.2000 (95.7440)  time: 0.1795  data: 0.0111  max mem: 28503
Test: Total time: 0:00:10 (0.4069 s / it)
* Acc@1 80.752 Acc@5 95.768 loss 1.029
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 80.90%
Epoch: [178]  [   0/1251]  eta: 1:12:08  lr: 0.001599  min_lr: 0.001599  loss: 2.0915 (2.0915)  weight_decay: 0.0500 (0.0500)  time: 3.4599  data: 1.7582  max mem: 28503
Epoch: [178]  [ 200/1251]  eta: 0:06:21  lr: 0.001596  min_lr: 0.001596  loss: 3.2250 (3.0921)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9221 (0.9214)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [178]  [ 400/1251]  eta: 0:05:02  lr: 0.001592  min_lr: 0.001592  loss: 3.2896 (3.0830)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8064 (0.9561)  time: 0.3587  data: 0.0005  max mem: 28503
Epoch: [178]  [ 600/1251]  eta: 0:03:49  lr: 0.001589  min_lr: 0.001589  loss: 3.1868 (3.0746)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7897 (0.9302)  time: 0.3522  data: 0.0004  max mem: 28503
Epoch: [178]  [ 800/1251]  eta: 0:02:38  lr: 0.001585  min_lr: 0.001585  loss: 3.2012 (3.0778)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9116 (0.9688)  time: 0.3549  data: 0.0004  max mem: 28503
Epoch: [178]  [1000/1251]  eta: 0:01:27  lr: 0.001582  min_lr: 0.001582  loss: 3.3609 (3.0859)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8008 (0.9407)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [178]  [1200/1251]  eta: 0:00:17  lr: 0.001578  min_lr: 0.001578  loss: 3.1429 (3.0789)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9567 (0.9496)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [178]  [1250/1251]  eta: 0:00:00  lr: 0.001578  min_lr: 0.001578  loss: 3.2788 (3.0808)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8457 (0.9495)  time: 0.2915  data: 0.0007  max mem: 28503
Epoch: [178] Total time: 0:07:17 (0.3498 s / it)
Averaged stats: lr: 0.001578  min_lr: 0.001578  loss: 3.2788 (3.0877)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8457 (0.9495)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.7003 (0.7003)  acc1: 89.6000 (89.6000)  acc5: 98.0000 (98.0000)  time: 5.7189  data: 5.5164  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9033 (0.8786)  acc1: 85.6000 (84.0000)  acc5: 97.6000 (97.3091)  time: 0.6919  data: 0.5196  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0429 (1.0293)  acc1: 77.6000 (80.4381)  acc5: 95.6000 (95.8286)  time: 0.1788  data: 0.0100  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1595 (1.0411)  acc1: 77.6000 (80.1280)  acc5: 94.8000 (95.7440)  time: 0.1783  data: 0.0099  max mem: 28503
Test: Total time: 0:00:10 (0.4052 s / it)
* Acc@1 80.686 Acc@5 95.750 loss 1.031
Accuracy of the model on the 50000 test images: 80.7%
Max accuracy: 80.90%
Epoch: [179]  [   0/1251]  eta: 1:06:39  lr: 0.001577  min_lr: 0.001577  loss: 3.2097 (3.2097)  weight_decay: 0.0500 (0.0500)  time: 3.1969  data: 2.6663  max mem: 28503
Epoch: [179]  [ 200/1251]  eta: 0:06:20  lr: 0.001574  min_lr: 0.001574  loss: 3.0868 (3.1018)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9206 (0.8633)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [179]  [ 400/1251]  eta: 0:05:03  lr: 0.001570  min_lr: 0.001570  loss: 3.3504 (3.0966)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1374 (0.9727)  time: 0.3460  data: 0.0004  max mem: 28503
Epoch: [179]  [ 600/1251]  eta: 0:03:49  lr: 0.001567  min_lr: 0.001567  loss: 3.2006 (3.0937)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8282 (0.9943)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [179]  [ 800/1251]  eta: 0:02:38  lr: 0.001563  min_lr: 0.001563  loss: 3.2696 (3.0918)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8294 (0.9725)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [179]  [1000/1251]  eta: 0:01:28  lr: 0.001560  min_lr: 0.001560  loss: 3.1256 (3.0831)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8909 (nan)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [179]  [1200/1251]  eta: 0:00:17  lr: 0.001556  min_lr: 0.001556  loss: 3.1399 (3.0870)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9377 (nan)  time: 0.3458  data: 0.0005  max mem: 28503
Epoch: [179]  [1250/1251]  eta: 0:00:00  lr: 0.001556  min_lr: 0.001556  loss: 3.0031 (3.0854)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1660 (nan)  time: 0.2918  data: 0.0007  max mem: 28503
Epoch: [179] Total time: 0:07:18 (0.3504 s / it)
Averaged stats: lr: 0.001556  min_lr: 0.001556  loss: 3.0031 (3.0829)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1660 (nan)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.5913 (0.5913)  acc1: 88.8000 (88.8000)  acc5: 99.2000 (99.2000)  time: 5.6710  data: 5.4408  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.7951 (0.7614)  acc1: 85.2000 (84.1091)  acc5: 97.6000 (97.6000)  time: 0.6696  data: 0.4952  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9344 (0.9013)  acc1: 78.4000 (80.6286)  acc5: 96.0000 (96.1333)  time: 0.1804  data: 0.0117  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9543 (0.9106)  acc1: 78.8000 (80.4640)  acc5: 95.6000 (96.0000)  time: 0.1801  data: 0.0117  max mem: 28503
Test: Total time: 0:00:10 (0.4019 s / it)
* Acc@1 81.216 Acc@5 95.924 loss 0.897
Accuracy of the model on the 50000 test images: 81.2%
Max accuracy: 81.22%
Epoch: [180]  [   0/1251]  eta: 1:05:51  lr: 0.001556  min_lr: 0.001556  loss: 3.4246 (3.4246)  weight_decay: 0.0500 (0.0500)  time: 3.1591  data: 2.8067  max mem: 28503
Epoch: [180]  [ 200/1251]  eta: 0:06:19  lr: 0.001552  min_lr: 0.001552  loss: 2.9564 (3.0169)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0854 (1.1032)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [180]  [ 400/1251]  eta: 0:05:01  lr: 0.001549  min_lr: 0.001549  loss: 3.2050 (3.0416)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8544 (1.0526)  time: 0.3533  data: 0.0004  max mem: 28503
Epoch: [180]  [ 600/1251]  eta: 0:03:49  lr: 0.001545  min_lr: 0.001545  loss: 3.0088 (3.0419)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8554 (1.0022)  time: 0.3641  data: 0.0004  max mem: 28503
Epoch: [180]  [ 800/1251]  eta: 0:02:38  lr: 0.001542  min_lr: 0.001542  loss: 3.1369 (3.0460)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9005 (1.0177)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [180]  [1000/1251]  eta: 0:01:27  lr: 0.001538  min_lr: 0.001538  loss: 3.2537 (3.0529)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9220 (1.0154)  time: 0.3441  data: 0.0003  max mem: 28503
Epoch: [180]  [1200/1251]  eta: 0:00:17  lr: 0.001535  min_lr: 0.001535  loss: 3.3344 (3.0567)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9009 (1.0081)  time: 0.3471  data: 0.0003  max mem: 28503
Epoch: [180]  [1250/1251]  eta: 0:00:00  lr: 0.001534  min_lr: 0.001534  loss: 3.3583 (3.0606)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8889 (1.0072)  time: 0.2915  data: 0.0005  max mem: 28503
Epoch: [180] Total time: 0:07:16 (0.3491 s / it)
Averaged stats: lr: 0.001534  min_lr: 0.001534  loss: 3.3583 (3.0732)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8889 (1.0072)
Test:  [ 0/25]  eta: 0:01:55  loss: 0.5824 (0.5824)  acc1: 90.8000 (90.8000)  acc5: 98.0000 (98.0000)  time: 4.6069  data: 4.3990  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 0.8155 (0.8041)  acc1: 82.8000 (83.4909)  acc5: 97.6000 (97.6727)  time: 0.6571  data: 0.4827  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9874 (0.9552)  acc1: 77.6000 (80.3810)  acc5: 96.0000 (95.8095)  time: 0.2420  data: 0.0723  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0646 (0.9630)  acc1: 77.6000 (80.1120)  acc5: 95.2000 (95.7280)  time: 0.2134  data: 0.0450  max mem: 28503
Test: Total time: 0:00:10 (0.4115 s / it)
* Acc@1 81.000 Acc@5 95.850 loss 0.947
Accuracy of the model on the 50000 test images: 81.0%
Max accuracy: 81.22%
Epoch: [181]  [   0/1251]  eta: 1:12:00  lr: 0.001534  min_lr: 0.001534  loss: 3.2779 (3.2779)  weight_decay: 0.0500 (0.0500)  time: 3.4540  data: 1.7328  max mem: 28503
Epoch: [181]  [ 200/1251]  eta: 0:06:25  lr: 0.001530  min_lr: 0.001530  loss: 3.1033 (3.0882)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9165 (0.9980)  time: 0.3464  data: 0.0004  max mem: 28503
Epoch: [181]  [ 400/1251]  eta: 0:05:04  lr: 0.001527  min_lr: 0.001527  loss: 3.2429 (3.0675)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1295 (1.0273)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [181]  [ 600/1251]  eta: 0:03:50  lr: 0.001523  min_lr: 0.001523  loss: 3.3217 (3.0906)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8605 (1.0089)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [181]  [ 800/1251]  eta: 0:02:39  lr: 0.001520  min_lr: 0.001520  loss: 3.2507 (3.0970)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8651 (0.9863)  time: 0.3457  data: 0.0005  max mem: 28503
Epoch: [181]  [1000/1251]  eta: 0:01:28  lr: 0.001516  min_lr: 0.001516  loss: 3.2701 (3.0941)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8901 (0.9943)  time: 0.3552  data: 0.0004  max mem: 28503
Epoch: [181]  [1200/1251]  eta: 0:00:17  lr: 0.001513  min_lr: 0.001513  loss: 3.1853 (3.0921)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9801 (0.9927)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [181]  [1250/1251]  eta: 0:00:00  lr: 0.001512  min_lr: 0.001512  loss: 3.2177 (3.0917)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9801 (0.9966)  time: 0.2920  data: 0.0007  max mem: 28503
Epoch: [181] Total time: 0:07:18 (0.3509 s / it)
Averaged stats: lr: 0.001512  min_lr: 0.001512  loss: 3.2177 (3.0733)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9801 (0.9966)
Test:  [ 0/25]  eta: 0:01:51  loss: 0.6320 (0.6320)  acc1: 90.8000 (90.8000)  acc5: 98.8000 (98.8000)  time: 4.4593  data: 4.2419  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8452 (0.8433)  acc1: 84.0000 (84.0000)  acc5: 97.2000 (97.4545)  time: 0.6747  data: 0.5019  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0447 (0.9808)  acc1: 78.0000 (80.8762)  acc5: 95.6000 (95.8857)  time: 0.2364  data: 0.0677  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0633 (0.9936)  acc1: 77.6000 (80.3360)  acc5: 95.2000 (95.8240)  time: 0.2108  data: 0.0421  max mem: 28503
Test: Total time: 0:00:09 (0.3996 s / it)
* Acc@1 80.846 Acc@5 95.862 loss 0.977
Accuracy of the model on the 50000 test images: 80.8%
Max accuracy: 81.22%
Epoch: [182]  [   0/1251]  eta: 1:11:24  lr: 0.001512  min_lr: 0.001512  loss: 2.3962 (2.3962)  weight_decay: 0.0500 (0.0500)  time: 3.4248  data: 1.6060  max mem: 28503
Epoch: [182]  [ 200/1251]  eta: 0:06:20  lr: 0.001508  min_lr: 0.001508  loss: 2.9528 (3.0002)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8591 (0.9994)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [182]  [ 400/1251]  eta: 0:05:02  lr: 0.001505  min_lr: 0.001505  loss: 3.2328 (3.0312)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0109 (0.9534)  time: 0.3453  data: 0.0005  max mem: 28503
Epoch: [182]  [ 600/1251]  eta: 0:03:49  lr: 0.001501  min_lr: 0.001501  loss: 3.3175 (3.0474)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0953 (1.0112)  time: 0.3461  data: 0.0004  max mem: 28503
Epoch: [182]  [ 800/1251]  eta: 0:02:38  lr: 0.001498  min_lr: 0.001498  loss: 3.1924 (3.0536)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9683 (1.0058)  time: 0.3629  data: 0.0005  max mem: 28503
Epoch: [182]  [1000/1251]  eta: 0:01:28  lr: 0.001495  min_lr: 0.001495  loss: 3.1021 (3.0634)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9754 (1.0485)  time: 0.3448  data: 0.0003  max mem: 28503
Epoch: [182]  [1200/1251]  eta: 0:00:17  lr: 0.001491  min_lr: 0.001491  loss: 3.0878 (3.0691)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8745 (1.0265)  time: 0.3552  data: 0.0003  max mem: 28503
Epoch: [182]  [1250/1251]  eta: 0:00:00  lr: 0.001490  min_lr: 0.001490  loss: 3.0443 (3.0689)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8854 (1.0226)  time: 0.3053  data: 0.0006  max mem: 28503
Epoch: [182] Total time: 0:07:18 (0.3503 s / it)
Averaged stats: lr: 0.001490  min_lr: 0.001490  loss: 3.0443 (3.0775)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8854 (1.0226)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.6025 (0.6025)  acc1: 88.4000 (88.4000)  acc5: 98.8000 (98.8000)  time: 5.6514  data: 5.4444  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8182 (0.8096)  acc1: 85.6000 (83.6364)  acc5: 97.2000 (97.2364)  time: 0.6872  data: 0.5151  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9979 (0.9507)  acc1: 78.8000 (80.4381)  acc5: 95.2000 (95.7333)  time: 0.1797  data: 0.0111  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0631 (0.9589)  acc1: 78.8000 (80.3360)  acc5: 94.8000 (95.6800)  time: 0.1795  data: 0.0110  max mem: 28503
Test: Total time: 0:00:10 (0.4069 s / it)
* Acc@1 81.002 Acc@5 95.814 loss 0.944
Accuracy of the model on the 50000 test images: 81.0%
Max accuracy: 81.22%
Epoch: [183]  [   0/1251]  eta: 1:06:27  lr: 0.001490  min_lr: 0.001490  loss: 2.9436 (2.9436)  weight_decay: 0.0500 (0.0500)  time: 3.1873  data: 2.4418  max mem: 28503
Epoch: [183]  [ 200/1251]  eta: 0:06:21  lr: 0.001487  min_lr: 0.001487  loss: 3.2084 (2.9948)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0245 (0.9757)  time: 0.3461  data: 0.0005  max mem: 28503
Epoch: [183]  [ 400/1251]  eta: 0:05:02  lr: 0.001483  min_lr: 0.001483  loss: 3.1325 (3.0512)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8930 (0.9416)  time: 0.3458  data: 0.0005  max mem: 28503
Epoch: [183]  [ 600/1251]  eta: 0:03:49  lr: 0.001480  min_lr: 0.001480  loss: 3.1509 (3.0420)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9871 (0.9505)  time: 0.3474  data: 0.0005  max mem: 28503
Epoch: [183]  [ 800/1251]  eta: 0:02:38  lr: 0.001476  min_lr: 0.001476  loss: 3.4333 (3.0700)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9131 (0.9816)  time: 0.3590  data: 0.0004  max mem: 28503
Epoch: [183]  [1000/1251]  eta: 0:01:28  lr: 0.001473  min_lr: 0.001473  loss: 3.0810 (3.0578)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8144 (0.9699)  time: 0.3481  data: 0.0005  max mem: 28503
Epoch: [183]  [1200/1251]  eta: 0:00:17  lr: 0.001469  min_lr: 0.001469  loss: 2.9454 (3.0563)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0660 (1.0106)  time: 0.3465  data: 0.0005  max mem: 28503
Epoch: [183]  [1250/1251]  eta: 0:00:00  lr: 0.001469  min_lr: 0.001469  loss: 3.1586 (3.0629)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2918  data: 0.0007  max mem: 28503
Epoch: [183] Total time: 0:07:18 (0.3504 s / it)
Averaged stats: lr: 0.001469  min_lr: 0.001469  loss: 3.1586 (3.0634)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.6904 (0.6904)  acc1: 88.4000 (88.4000)  acc5: 98.8000 (98.8000)  time: 5.4539  data: 5.2580  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8704 (0.8993)  acc1: 84.0000 (83.6000)  acc5: 97.2000 (97.4909)  time: 0.7241  data: 0.5519  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0907 (1.0428)  acc1: 78.4000 (80.5524)  acc5: 95.6000 (95.7524)  time: 0.2113  data: 0.0407  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1335 (1.0507)  acc1: 78.4000 (80.0960)  acc5: 95.2000 (95.6800)  time: 0.2117  data: 0.0406  max mem: 28503
Test: Total time: 0:00:10 (0.4186 s / it)
* Acc@1 80.894 Acc@5 95.786 loss 1.031
Accuracy of the model on the 50000 test images: 80.9%
Max accuracy: 81.22%
Epoch: [184]  [   0/1251]  eta: 1:08:08  lr: 0.001469  min_lr: 0.001469  loss: 3.2841 (3.2841)  weight_decay: 0.0500 (0.0500)  time: 3.2678  data: 2.4989  max mem: 28503
Epoch: [184]  [ 200/1251]  eta: 0:06:20  lr: 0.001465  min_lr: 0.001465  loss: 3.2627 (3.0240)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9314 (0.9396)  time: 0.3450  data: 0.0003  max mem: 28503
Epoch: [184]  [ 400/1251]  eta: 0:05:02  lr: 0.001462  min_lr: 0.001462  loss: 3.1823 (3.0484)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8820 (0.9092)  time: 0.3543  data: 0.0004  max mem: 28503
Epoch: [184]  [ 600/1251]  eta: 0:03:50  lr: 0.001458  min_lr: 0.001458  loss: 3.0762 (3.0349)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.7924 (0.8947)  time: 0.3467  data: 0.0004  max mem: 28503
Epoch: [184]  [ 800/1251]  eta: 0:02:38  lr: 0.001455  min_lr: 0.001455  loss: 3.2460 (3.0491)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1012 (0.9352)  time: 0.3457  data: 0.0004  max mem: 28503
Epoch: [184]  [1000/1251]  eta: 0:01:28  lr: 0.001451  min_lr: 0.001451  loss: 3.3268 (3.0580)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9140 (0.9387)  time: 0.3554  data: 0.0005  max mem: 28503
Epoch: [184]  [1200/1251]  eta: 0:00:17  lr: 0.001448  min_lr: 0.001448  loss: 3.0280 (3.0712)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9934 (0.9499)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [184]  [1250/1251]  eta: 0:00:00  lr: 0.001447  min_lr: 0.001447  loss: 3.1404 (3.0718)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9321 (0.9561)  time: 0.3002  data: 0.0007  max mem: 28503
Epoch: [184] Total time: 0:07:17 (0.3499 s / it)
Averaged stats: lr: 0.001447  min_lr: 0.001447  loss: 3.1404 (3.0559)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9321 (0.9561)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6003 (0.6003)  acc1: 90.8000 (90.8000)  acc5: 98.8000 (98.8000)  time: 5.5468  data: 5.3079  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 0.7920 (0.8218)  acc1: 86.0000 (83.9636)  acc5: 98.0000 (97.4909)  time: 0.7385  data: 0.5634  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9953 (0.9677)  acc1: 78.4000 (80.4952)  acc5: 95.2000 (95.8286)  time: 0.2131  data: 0.0445  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0724 (0.9744)  acc1: 76.8000 (80.3200)  acc5: 95.2000 (95.7600)  time: 0.2129  data: 0.0444  max mem: 28503
Test: Total time: 0:00:10 (0.4225 s / it)
* Acc@1 80.952 Acc@5 95.820 loss 0.958
Accuracy of the model on the 50000 test images: 81.0%
Max accuracy: 81.22%
Epoch: [185]  [   0/1251]  eta: 1:15:16  lr: 0.001447  min_lr: 0.001447  loss: 3.1629 (3.1629)  weight_decay: 0.0500 (0.0500)  time: 3.6101  data: 2.7188  max mem: 28503
Epoch: [185]  [ 200/1251]  eta: 0:06:23  lr: 0.001444  min_lr: 0.001444  loss: 3.2395 (3.0048)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9858 (1.0564)  time: 0.3464  data: 0.0004  max mem: 28503
Epoch: [185]  [ 400/1251]  eta: 0:05:03  lr: 0.001440  min_lr: 0.001440  loss: 3.1082 (3.0113)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1086 (1.0554)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [185]  [ 600/1251]  eta: 0:03:50  lr: 0.001437  min_lr: 0.001437  loss: 3.0953 (3.0337)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0386 (1.0530)  time: 0.3473  data: 0.0004  max mem: 28503
Epoch: [185]  [ 800/1251]  eta: 0:02:38  lr: 0.001433  min_lr: 0.001433  loss: 3.2105 (3.0370)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8486 (1.0199)  time: 0.3588  data: 0.0004  max mem: 28503
Epoch: [185]  [1000/1251]  eta: 0:01:28  lr: 0.001430  min_lr: 0.001430  loss: 3.2230 (3.0416)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0136 (1.0142)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [185]  [1200/1251]  eta: 0:00:17  lr: 0.001426  min_lr: 0.001426  loss: 3.2984 (3.0460)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2858 (1.0251)  time: 0.3586  data: 0.0005  max mem: 28503
Epoch: [185]  [1250/1251]  eta: 0:00:00  lr: 0.001426  min_lr: 0.001426  loss: 3.3059 (3.0511)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3958 (1.0431)  time: 0.2918  data: 0.0006  max mem: 28503
Epoch: [185] Total time: 0:07:18 (0.3507 s / it)
Averaged stats: lr: 0.001426  min_lr: 0.001426  loss: 3.3059 (3.0527)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3958 (1.0431)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.5846 (0.5846)  acc1: 91.2000 (91.2000)  acc5: 99.2000 (99.2000)  time: 5.5437  data: 5.3208  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 0.8301 (0.8104)  acc1: 84.8000 (84.5091)  acc5: 97.6000 (97.3818)  time: 0.7424  data: 0.5686  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9590 (0.9419)  acc1: 78.8000 (80.9524)  acc5: 96.0000 (96.0381)  time: 0.2153  data: 0.0468  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0228 (0.9554)  acc1: 78.8000 (80.5760)  acc5: 95.6000 (95.8560)  time: 0.2246  data: 0.0563  max mem: 28503
Test: Total time: 0:00:10 (0.4326 s / it)
* Acc@1 81.222 Acc@5 95.936 loss 0.943
Accuracy of the model on the 50000 test images: 81.2%
Max accuracy: 81.22%
Epoch: [186]  [   0/1251]  eta: 1:01:17  lr: 0.001425  min_lr: 0.001425  loss: 3.1551 (3.1551)  weight_decay: 0.0500 (0.0500)  time: 2.9396  data: 2.5461  max mem: 28503
Epoch: [186]  [ 200/1251]  eta: 0:06:19  lr: 0.001422  min_lr: 0.001422  loss: 2.9370 (3.0675)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8673 (1.1086)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [186]  [ 400/1251]  eta: 0:05:02  lr: 0.001419  min_lr: 0.001419  loss: 2.9411 (3.0672)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9188 (1.0279)  time: 0.3544  data: 0.0004  max mem: 28503
Epoch: [186]  [ 600/1251]  eta: 0:03:49  lr: 0.001415  min_lr: 0.001415  loss: 2.9791 (3.0751)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8075 (0.9912)  time: 0.3447  data: 0.0004  max mem: 28503
Epoch: [186]  [ 800/1251]  eta: 0:02:38  lr: 0.001412  min_lr: 0.001412  loss: 3.2748 (3.0732)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9495 (1.0230)  time: 0.3462  data: 0.0004  max mem: 28503
Epoch: [186]  [1000/1251]  eta: 0:01:27  lr: 0.001408  min_lr: 0.001408  loss: 3.1020 (3.0777)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9055 (1.0130)  time: 0.3507  data: 0.0004  max mem: 28503
Epoch: [186]  [1200/1251]  eta: 0:00:17  lr: 0.001405  min_lr: 0.001405  loss: 2.9397 (3.0695)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0603 (1.0249)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [186]  [1250/1251]  eta: 0:00:00  lr: 0.001404  min_lr: 0.001404  loss: 3.2784 (3.0721)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9244 (1.0238)  time: 0.2920  data: 0.0007  max mem: 28503
Epoch: [186] Total time: 0:07:17 (0.3494 s / it)
Averaged stats: lr: 0.001404  min_lr: 0.001404  loss: 3.2784 (3.0539)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9244 (1.0238)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.7613 (0.7613)  acc1: 87.2000 (87.2000)  acc5: 99.2000 (99.2000)  time: 5.6978  data: 5.4964  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9357 (0.9577)  acc1: 85.2000 (83.8909)  acc5: 97.2000 (97.2727)  time: 0.7312  data: 0.5581  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1159 (1.0968)  acc1: 78.0000 (80.3429)  acc5: 95.2000 (95.5429)  time: 0.2058  data: 0.0322  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1829 (1.1045)  acc1: 78.0000 (80.1440)  acc5: 94.4000 (95.4560)  time: 0.2054  data: 0.0321  max mem: 28503
Test: Total time: 0:00:10 (0.4232 s / it)
* Acc@1 81.042 Acc@5 95.790 loss 1.087
Accuracy of the model on the 50000 test images: 81.0%
Max accuracy: 81.22%
Epoch: [187]  [   0/1251]  eta: 1:11:27  lr: 0.001404  min_lr: 0.001404  loss: 2.6746 (2.6746)  weight_decay: 0.0500 (0.0500)  time: 3.4276  data: 2.7702  max mem: 28503
Epoch: [187]  [ 200/1251]  eta: 0:06:21  lr: 0.001401  min_lr: 0.001401  loss: 3.2192 (3.0472)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8204 (0.9785)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [187]  [ 400/1251]  eta: 0:05:01  lr: 0.001397  min_lr: 0.001397  loss: 3.1105 (3.0136)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0982 (0.9899)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [187]  [ 600/1251]  eta: 0:03:49  lr: 0.001394  min_lr: 0.001394  loss: 3.2518 (3.0287)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8272 (1.0092)  time: 0.3580  data: 0.0004  max mem: 28503
Epoch: [187]  [ 800/1251]  eta: 0:02:38  lr: 0.001390  min_lr: 0.001390  loss: 3.2008 (3.0383)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9348 (1.0098)  time: 0.3648  data: 0.0004  max mem: 28503
Epoch: [187]  [1000/1251]  eta: 0:01:28  lr: 0.001387  min_lr: 0.001387  loss: 3.3317 (3.0451)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8734 (0.9930)  time: 0.3463  data: 0.0004  max mem: 28503
Epoch: [187]  [1200/1251]  eta: 0:00:17  lr: 0.001383  min_lr: 0.001383  loss: 2.8058 (3.0472)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8953 (0.9782)  time: 0.3460  data: 0.0004  max mem: 28503
Epoch: [187]  [1250/1251]  eta: 0:00:00  lr: 0.001383  min_lr: 0.001383  loss: 3.0396 (3.0467)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2086 (0.9948)  time: 0.2921  data: 0.0006  max mem: 28503
Epoch: [187] Total time: 0:07:17 (0.3498 s / it)
Averaged stats: lr: 0.001383  min_lr: 0.001383  loss: 3.0396 (3.0531)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2086 (0.9948)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.6449 (0.6449)  acc1: 90.0000 (90.0000)  acc5: 98.4000 (98.4000)  time: 5.6756  data: 5.4844  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9103 (0.8652)  acc1: 86.0000 (85.2364)  acc5: 97.2000 (97.1636)  time: 0.7216  data: 0.5450  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0541 (1.0084)  acc1: 78.8000 (81.3333)  acc5: 95.6000 (95.5429)  time: 0.1974  data: 0.0256  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1050 (1.0215)  acc1: 78.4000 (80.7040)  acc5: 94.8000 (95.4720)  time: 0.1955  data: 0.0255  max mem: 28503
Test: Total time: 0:00:10 (0.4150 s / it)
* Acc@1 81.238 Acc@5 95.780 loss 1.007
Accuracy of the model on the 50000 test images: 81.2%
Max accuracy: 81.24%
Epoch: [188]  [   0/1251]  eta: 1:04:49  lr: 0.001383  min_lr: 0.001383  loss: 3.3998 (3.3998)  weight_decay: 0.0500 (0.0500)  time: 3.1093  data: 2.7378  max mem: 28503
Epoch: [188]  [ 200/1251]  eta: 0:06:22  lr: 0.001379  min_lr: 0.001379  loss: 2.5492 (2.9479)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8790 (1.0745)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [188]  [ 400/1251]  eta: 0:05:02  lr: 0.001376  min_lr: 0.001376  loss: 3.2310 (2.9955)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0443 (1.0336)  time: 0.3460  data: 0.0004  max mem: 28503
Epoch: [188]  [ 600/1251]  eta: 0:03:49  lr: 0.001372  min_lr: 0.001372  loss: 3.1540 (3.0188)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9059 (1.0072)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [188]  [ 800/1251]  eta: 0:02:38  lr: 0.001369  min_lr: 0.001369  loss: 3.0079 (3.0312)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8981 (1.0218)  time: 0.3463  data: 0.0004  max mem: 28503
Epoch: [188]  [1000/1251]  eta: 0:01:28  lr: 0.001366  min_lr: 0.001366  loss: 3.2156 (3.0460)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9123 (1.0117)  time: 0.3566  data: 0.0004  max mem: 28503
Epoch: [188]  [1200/1251]  eta: 0:00:17  lr: 0.001362  min_lr: 0.001362  loss: 3.1860 (3.0369)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8593 (1.0107)  time: 0.3477  data: 0.0005  max mem: 28503
Epoch: [188]  [1250/1251]  eta: 0:00:00  lr: 0.001361  min_lr: 0.001361  loss: 3.1732 (3.0401)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9110 (1.0116)  time: 0.2921  data: 0.0009  max mem: 28503
Epoch: [188] Total time: 0:07:17 (0.3499 s / it)
Averaged stats: lr: 0.001361  min_lr: 0.001361  loss: 3.1732 (3.0484)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9110 (1.0116)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6371 (0.6371)  acc1: 88.4000 (88.4000)  acc5: 99.2000 (99.2000)  time: 5.7054  data: 5.5036  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8033 (0.8219)  acc1: 84.8000 (84.6182)  acc5: 97.6000 (97.5636)  time: 0.7054  data: 0.5339  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0067 (0.9752)  acc1: 78.8000 (81.2191)  acc5: 95.6000 (95.8667)  time: 0.1933  data: 0.0248  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0872 (0.9860)  acc1: 79.6000 (80.8960)  acc5: 95.6000 (95.8400)  time: 0.1931  data: 0.0248  max mem: 28503
Test: Total time: 0:00:10 (0.4132 s / it)
* Acc@1 81.186 Acc@5 95.878 loss 0.972
Accuracy of the model on the 50000 test images: 81.2%
Max accuracy: 81.24%
Epoch: [189]  [   0/1251]  eta: 1:08:37  lr: 0.001361  min_lr: 0.001361  loss: 2.6736 (2.6736)  weight_decay: 0.0500 (0.0500)  time: 3.2913  data: 2.7439  max mem: 28503
Epoch: [189]  [ 200/1251]  eta: 0:06:20  lr: 0.001358  min_lr: 0.001358  loss: 3.2682 (3.0592)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8999 (0.9354)  time: 0.3460  data: 0.0004  max mem: 28503
Epoch: [189]  [ 400/1251]  eta: 0:05:02  lr: 0.001355  min_lr: 0.001355  loss: 3.2001 (3.0259)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9829 (1.0444)  time: 0.3466  data: 0.0004  max mem: 28503
Epoch: [189]  [ 600/1251]  eta: 0:03:49  lr: 0.001351  min_lr: 0.001351  loss: 2.8904 (3.0381)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8790 (1.0485)  time: 0.3462  data: 0.0004  max mem: 28503
Epoch: [189]  [ 800/1251]  eta: 0:02:38  lr: 0.001348  min_lr: 0.001348  loss: 3.1110 (3.0486)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8297 (1.0072)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [189]  [1000/1251]  eta: 0:01:28  lr: 0.001344  min_lr: 0.001344  loss: 2.9068 (3.0367)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8396 (1.0047)  time: 0.3453  data: 0.0005  max mem: 28503
Epoch: [189]  [1200/1251]  eta: 0:00:17  lr: 0.001341  min_lr: 0.001341  loss: 2.9966 (3.0266)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9302 (1.0172)  time: 0.3577  data: 0.0005  max mem: 28503
Epoch: [189]  [1250/1251]  eta: 0:00:00  lr: 0.001340  min_lr: 0.001340  loss: 3.0865 (3.0245)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9033 (1.0134)  time: 0.2920  data: 0.0007  max mem: 28503
Epoch: [189] Total time: 0:07:17 (0.3497 s / it)
Averaged stats: lr: 0.001340  min_lr: 0.001340  loss: 3.0865 (3.0340)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9033 (1.0134)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.5649 (0.5649)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 5.4154  data: 5.2026  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.7845 (0.7680)  acc1: 85.6000 (84.5091)  acc5: 97.6000 (97.5636)  time: 0.7025  data: 0.5293  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9584 (0.9060)  acc1: 78.0000 (80.8381)  acc5: 95.6000 (95.8286)  time: 0.2002  data: 0.0311  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9584 (0.9128)  acc1: 77.6000 (80.6400)  acc5: 95.2000 (95.7440)  time: 0.1997  data: 0.0310  max mem: 28503
Test: Total time: 0:00:10 (0.4069 s / it)
* Acc@1 81.364 Acc@5 95.946 loss 0.904
Accuracy of the model on the 50000 test images: 81.4%
Max accuracy: 81.36%
Epoch: [190]  [   0/1251]  eta: 1:03:00  lr: 0.001340  min_lr: 0.001340  loss: 2.3651 (2.3651)  weight_decay: 0.0500 (0.0500)  time: 3.0217  data: 2.6359  max mem: 28503
Epoch: [190]  [ 200/1251]  eta: 0:06:21  lr: 0.001337  min_lr: 0.001337  loss: 3.0275 (3.0446)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9344 (1.1178)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [190]  [ 400/1251]  eta: 0:05:01  lr: 0.001333  min_lr: 0.001333  loss: 3.1014 (3.0226)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8019 (1.0350)  time: 0.3471  data: 0.0004  max mem: 28503
Epoch: [190]  [ 600/1251]  eta: 0:03:49  lr: 0.001330  min_lr: 0.001330  loss: 3.1613 (3.0321)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8298 (0.9987)  time: 0.3438  data: 0.0004  max mem: 28503
Epoch: [190]  [ 800/1251]  eta: 0:02:38  lr: 0.001327  min_lr: 0.001327  loss: 2.9028 (3.0363)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8661 (0.9846)  time: 0.3462  data: 0.0004  max mem: 28503
Epoch: [190]  [1000/1251]  eta: 0:01:27  lr: 0.001323  min_lr: 0.001323  loss: 3.1074 (3.0274)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9344 (0.9846)  time: 0.3464  data: 0.0004  max mem: 28503
Epoch: [190]  [1200/1251]  eta: 0:00:17  lr: 0.001320  min_lr: 0.001320  loss: 3.1002 (3.0322)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0804 (0.9932)  time: 0.3471  data: 0.0004  max mem: 28503
Epoch: [190]  [1250/1251]  eta: 0:00:00  lr: 0.001319  min_lr: 0.001319  loss: 3.1295 (3.0344)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0182 (0.9951)  time: 0.2912  data: 0.0006  max mem: 28503
Epoch: [190] Total time: 0:07:17 (0.3495 s / it)
Averaged stats: lr: 0.001319  min_lr: 0.001319  loss: 3.1295 (3.0377)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0182 (0.9951)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.6265 (0.6265)  acc1: 92.4000 (92.4000)  acc5: 98.8000 (98.8000)  time: 5.4634  data: 5.2684  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8491 (0.8200)  acc1: 85.6000 (85.0182)  acc5: 98.0000 (97.6727)  time: 0.6694  data: 0.4973  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0335 (0.9671)  acc1: 79.2000 (81.1429)  acc5: 95.6000 (95.8857)  time: 0.1941  data: 0.0250  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0285 (0.9761)  acc1: 79.6000 (80.9920)  acc5: 94.8000 (95.7920)  time: 0.1935  data: 0.0250  max mem: 28503
Test: Total time: 0:00:10 (0.4038 s / it)
* Acc@1 81.326 Acc@5 95.936 loss 0.968
Accuracy of the model on the 50000 test images: 81.3%
Max accuracy: 81.36%
Epoch: [191]  [   0/1251]  eta: 1:10:26  lr: 0.001319  min_lr: 0.001319  loss: 3.3809 (3.3809)  weight_decay: 0.0500 (0.0500)  time: 3.3785  data: 2.5056  max mem: 28503
Epoch: [191]  [ 200/1251]  eta: 0:06:22  lr: 0.001316  min_lr: 0.001316  loss: 3.0475 (3.0016)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0815 (1.1005)  time: 0.3548  data: 0.0004  max mem: 28503
Epoch: [191]  [ 400/1251]  eta: 0:05:02  lr: 0.001312  min_lr: 0.001312  loss: 3.0603 (3.0225)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8408 (1.0276)  time: 0.3546  data: 0.0004  max mem: 28503
Epoch: [191]  [ 600/1251]  eta: 0:03:50  lr: 0.001309  min_lr: 0.001309  loss: 2.8175 (3.0039)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0781 (1.0362)  time: 0.3636  data: 0.0004  max mem: 28503
Epoch: [191]  [ 800/1251]  eta: 0:02:38  lr: 0.001305  min_lr: 0.001305  loss: 3.0765 (3.0160)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9740 (1.0677)  time: 0.3450  data: 0.0005  max mem: 28503
Epoch: [191]  [1000/1251]  eta: 0:01:28  lr: 0.001302  min_lr: 0.001302  loss: 3.0277 (3.0332)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1209 (1.1015)  time: 0.3451  data: 0.0005  max mem: 28503
Epoch: [191]  [1200/1251]  eta: 0:00:17  lr: 0.001299  min_lr: 0.001299  loss: 3.1516 (3.0348)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0144 (1.0808)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [191]  [1250/1251]  eta: 0:00:00  lr: 0.001298  min_lr: 0.001298  loss: 3.1696 (3.0328)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0009 (1.0790)  time: 0.2914  data: 0.0007  max mem: 28503
Epoch: [191] Total time: 0:07:17 (0.3497 s / it)
Averaged stats: lr: 0.001298  min_lr: 0.001298  loss: 3.1696 (3.0295)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0009 (1.0790)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.5937 (0.5937)  acc1: 90.8000 (90.8000)  acc5: 98.4000 (98.4000)  time: 5.4133  data: 5.2235  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 0.7880 (0.7937)  acc1: 85.6000 (84.1818)  acc5: 97.6000 (97.4909)  time: 0.6496  data: 0.4778  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9577 (0.9284)  acc1: 77.6000 (80.8000)  acc5: 96.0000 (96.0571)  time: 0.1954  data: 0.0201  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0149 (0.9363)  acc1: 79.2000 (80.5760)  acc5: 95.2000 (95.9200)  time: 0.1957  data: 0.0200  max mem: 28503
Test: Total time: 0:00:10 (0.4043 s / it)
* Acc@1 81.386 Acc@5 96.068 loss 0.920
Accuracy of the model on the 50000 test images: 81.4%
Max accuracy: 81.39%
Epoch: [192]  [   0/1251]  eta: 1:05:57  lr: 0.001298  min_lr: 0.001298  loss: 3.2149 (3.2149)  weight_decay: 0.0500 (0.0500)  time: 3.1634  data: 2.7962  max mem: 28503
Epoch: [192]  [ 200/1251]  eta: 0:06:22  lr: 0.001295  min_lr: 0.001295  loss: 2.9734 (3.0327)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9083 (1.0307)  time: 0.3470  data: 0.0004  max mem: 28503
Epoch: [192]  [ 400/1251]  eta: 0:05:03  lr: 0.001291  min_lr: 0.001291  loss: 3.1142 (2.9934)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0308 (1.0462)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [192]  [ 600/1251]  eta: 0:03:50  lr: 0.001288  min_lr: 0.001288  loss: 3.2036 (3.0156)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9358 (1.0354)  time: 0.3495  data: 0.0004  max mem: 28503
Epoch: [192]  [ 800/1251]  eta: 0:02:38  lr: 0.001284  min_lr: 0.001284  loss: 3.2856 (3.0170)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0704 (1.0722)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [192]  [1000/1251]  eta: 0:01:28  lr: 0.001281  min_lr: 0.001281  loss: 3.1895 (3.0173)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8457 (1.0393)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [192]  [1200/1251]  eta: 0:00:17  lr: 0.001278  min_lr: 0.001278  loss: 3.0013 (3.0214)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0371 (1.0370)  time: 0.3490  data: 0.0004  max mem: 28503
Epoch: [192]  [1250/1251]  eta: 0:00:00  lr: 0.001277  min_lr: 0.001277  loss: 3.0387 (3.0223)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0600 (1.0382)  time: 0.3009  data: 0.0005  max mem: 28503
Epoch: [192] Total time: 0:07:18 (0.3508 s / it)
Averaged stats: lr: 0.001277  min_lr: 0.001277  loss: 3.0387 (3.0337)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0600 (1.0382)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6362 (0.6362)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.5424  data: 5.3412  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8787 (0.8311)  acc1: 85.2000 (84.3273)  acc5: 97.6000 (97.4909)  time: 0.7315  data: 0.5583  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9811 (0.9712)  acc1: 78.0000 (80.7048)  acc5: 95.6000 (95.9810)  time: 0.2178  data: 0.0460  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0621 (0.9816)  acc1: 77.6000 (80.3520)  acc5: 94.8000 (95.9040)  time: 0.2169  data: 0.0459  max mem: 28503
Test: Total time: 0:00:10 (0.4260 s / it)
* Acc@1 81.224 Acc@5 95.922 loss 0.966
Accuracy of the model on the 50000 test images: 81.2%
Max accuracy: 81.39%
Epoch: [193]  [   0/1251]  eta: 1:10:29  lr: 0.001277  min_lr: 0.001277  loss: 3.0938 (3.0938)  weight_decay: 0.0500 (0.0500)  time: 3.3812  data: 1.8844  max mem: 28503
Epoch: [193]  [ 200/1251]  eta: 0:06:20  lr: 0.001274  min_lr: 0.001274  loss: 3.3753 (3.0828)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9332 (1.0662)  time: 0.3457  data: 0.0004  max mem: 28503
Epoch: [193]  [ 400/1251]  eta: 0:05:02  lr: 0.001270  min_lr: 0.001270  loss: 3.0875 (3.0528)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0770 (1.0159)  time: 0.3545  data: 0.0004  max mem: 28503
Epoch: [193]  [ 600/1251]  eta: 0:03:49  lr: 0.001267  min_lr: 0.001267  loss: 3.2586 (3.0512)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9591 (1.0032)  time: 0.3444  data: 0.0004  max mem: 28503
Epoch: [193]  [ 800/1251]  eta: 0:02:38  lr: 0.001264  min_lr: 0.001264  loss: 3.0514 (3.0550)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9187 (0.9983)  time: 0.3551  data: 0.0004  max mem: 28503
Epoch: [193]  [1000/1251]  eta: 0:01:28  lr: 0.001260  min_lr: 0.001260  loss: 3.0500 (3.0453)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9063 (0.9967)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [193]  [1200/1251]  eta: 0:00:17  lr: 0.001257  min_lr: 0.001257  loss: 3.1308 (3.0383)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0088 (1.0047)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [193]  [1250/1251]  eta: 0:00:00  lr: 0.001256  min_lr: 0.001256  loss: 3.1710 (3.0367)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0196 (1.0091)  time: 0.2991  data: 0.0006  max mem: 28503
Epoch: [193] Total time: 0:07:17 (0.3497 s / it)
Averaged stats: lr: 0.001256  min_lr: 0.001256  loss: 3.1710 (3.0264)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0196 (1.0091)
Test:  [ 0/25]  eta: 0:02:06  loss: 0.7005 (0.7005)  acc1: 90.8000 (90.8000)  acc5: 98.8000 (98.8000)  time: 5.0639  data: 4.8634  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8883 (0.8749)  acc1: 85.6000 (84.9818)  acc5: 98.0000 (97.7818)  time: 0.7130  data: 0.5398  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0527 (1.0235)  acc1: 78.8000 (81.2762)  acc5: 95.2000 (96.0762)  time: 0.2232  data: 0.0538  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0997 (1.0327)  acc1: 78.8000 (80.9600)  acc5: 95.2000 (95.9840)  time: 0.2221  data: 0.0537  max mem: 28503
Test: Total time: 0:00:10 (0.4110 s / it)
* Acc@1 81.460 Acc@5 95.904 loss 1.025
Accuracy of the model on the 50000 test images: 81.5%
Max accuracy: 81.46%
Epoch: [194]  [   0/1251]  eta: 1:07:40  lr: 0.001256  min_lr: 0.001256  loss: 2.4126 (2.4126)  weight_decay: 0.0500 (0.0500)  time: 3.2455  data: 2.8767  max mem: 28503
Epoch: [194]  [ 200/1251]  eta: 0:06:21  lr: 0.001253  min_lr: 0.001253  loss: 2.8889 (3.0112)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0648 (1.1310)  time: 0.3563  data: 0.0004  max mem: 28503
Epoch: [194]  [ 400/1251]  eta: 0:05:02  lr: 0.001249  min_lr: 0.001249  loss: 3.1114 (3.0203)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8581 (1.0285)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [194]  [ 600/1251]  eta: 0:03:49  lr: 0.001246  min_lr: 0.001246  loss: 2.8916 (2.9936)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9604 (1.0323)  time: 0.3466  data: 0.0004  max mem: 28503
Epoch: [194]  [ 800/1251]  eta: 0:02:38  lr: 0.001243  min_lr: 0.001243  loss: 2.9688 (3.0031)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8632 (1.0223)  time: 0.3446  data: 0.0004  max mem: 28503
Epoch: [194]  [1000/1251]  eta: 0:01:28  lr: 0.001239  min_lr: 0.001239  loss: 3.0718 (3.0067)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0243 (1.0136)  time: 0.3468  data: 0.0004  max mem: 28503
Epoch: [194]  [1200/1251]  eta: 0:00:17  lr: 0.001236  min_lr: 0.001236  loss: 2.8416 (3.0101)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9256 (1.0231)  time: 0.3478  data: 0.0004  max mem: 28503
Epoch: [194]  [1250/1251]  eta: 0:00:00  lr: 0.001235  min_lr: 0.001235  loss: 3.0534 (3.0102)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9641 (1.0242)  time: 0.2920  data: 0.0007  max mem: 28503
Epoch: [194] Total time: 0:07:18 (0.3503 s / it)
Averaged stats: lr: 0.001235  min_lr: 0.001235  loss: 3.0534 (3.0176)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9641 (1.0242)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.6331 (0.6331)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 5.3409  data: 5.1559  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 0.8245 (0.8239)  acc1: 85.6000 (84.1818)  acc5: 98.0000 (97.4909)  time: 0.6539  data: 0.4838  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9487 (0.9583)  acc1: 78.8000 (80.9333)  acc5: 96.0000 (95.9810)  time: 0.1817  data: 0.0132  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0332 (0.9632)  acc1: 78.0000 (80.6720)  acc5: 95.2000 (95.9360)  time: 0.1901  data: 0.0216  max mem: 28503
Test: Total time: 0:00:09 (0.3972 s / it)
* Acc@1 81.304 Acc@5 95.944 loss 0.950
Accuracy of the model on the 50000 test images: 81.3%
Max accuracy: 81.46%
Epoch: [195]  [   0/1251]  eta: 1:08:29  lr: 0.001235  min_lr: 0.001235  loss: 2.2228 (2.2228)  weight_decay: 0.0500 (0.0500)  time: 3.2847  data: 1.5848  max mem: 28503
Epoch: [195]  [ 200/1251]  eta: 0:06:21  lr: 0.001232  min_lr: 0.001232  loss: 2.4584 (2.9374)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.3461  data: 0.0004  max mem: 28503
Epoch: [195]  [ 400/1251]  eta: 0:05:02  lr: 0.001229  min_lr: 0.001229  loss: 3.2676 (2.9757)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8918 (nan)  time: 0.3544  data: 0.0004  max mem: 28503
Epoch: [195]  [ 600/1251]  eta: 0:03:49  lr: 0.001225  min_lr: 0.001225  loss: 3.0677 (2.9633)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8683 (nan)  time: 0.3462  data: 0.0004  max mem: 28503
Epoch: [195]  [ 800/1251]  eta: 0:02:38  lr: 0.001222  min_lr: 0.001222  loss: 3.2516 (2.9865)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.3457  data: 0.0004  max mem: 28503
Epoch: [195]  [1000/1251]  eta: 0:01:28  lr: 0.001219  min_lr: 0.001219  loss: 3.1438 (2.9909)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8641 (nan)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [195]  [1200/1251]  eta: 0:00:17  lr: 0.001215  min_lr: 0.001215  loss: 2.9900 (2.9947)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9531 (nan)  time: 0.3604  data: 0.0004  max mem: 28503
Epoch: [195]  [1250/1251]  eta: 0:00:00  lr: 0.001215  min_lr: 0.001215  loss: 3.1406 (2.9982)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9169 (nan)  time: 0.2921  data: 0.0005  max mem: 28503
Epoch: [195] Total time: 0:07:17 (0.3501 s / it)
Averaged stats: lr: 0.001215  min_lr: 0.001215  loss: 3.1406 (3.0105)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9169 (nan)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.7351 (0.7351)  acc1: 87.2000 (87.2000)  acc5: 98.4000 (98.4000)  time: 5.3884  data: 5.1574  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9245 (0.8881)  acc1: 84.8000 (83.7455)  acc5: 97.6000 (97.5273)  time: 0.7279  data: 0.5516  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0595 (1.0168)  acc1: 79.6000 (81.0095)  acc5: 95.6000 (95.9429)  time: 0.2152  data: 0.0456  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0955 (1.0235)  acc1: 79.6000 (80.7680)  acc5: 94.8000 (95.8080)  time: 0.2139  data: 0.0455  max mem: 28503
Test: Total time: 0:00:10 (0.4181 s / it)
* Acc@1 81.358 Acc@5 96.008 loss 1.009
Accuracy of the model on the 50000 test images: 81.4%
Max accuracy: 81.46%
Epoch: [196]  [   0/1251]  eta: 1:11:30  lr: 0.001215  min_lr: 0.001215  loss: 3.1493 (3.1493)  weight_decay: 0.0500 (0.0500)  time: 3.4299  data: 2.2340  max mem: 28503
Epoch: [196]  [ 200/1251]  eta: 0:06:21  lr: 0.001211  min_lr: 0.001211  loss: 2.6871 (3.0026)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8707 (0.9401)  time: 0.3462  data: 0.0005  max mem: 28503
Epoch: [196]  [ 400/1251]  eta: 0:05:02  lr: 0.001208  min_lr: 0.001208  loss: 3.0492 (3.0255)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1316 (1.1220)  time: 0.3461  data: 0.0003  max mem: 28503
Epoch: [196]  [ 600/1251]  eta: 0:03:50  lr: 0.001205  min_lr: 0.001205  loss: 3.1648 (3.0074)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0627 (1.0877)  time: 0.3459  data: 0.0003  max mem: 28503
Epoch: [196]  [ 800/1251]  eta: 0:02:38  lr: 0.001201  min_lr: 0.001201  loss: 2.9170 (2.9946)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8677 (1.0761)  time: 0.3477  data: 0.0003  max mem: 28503
Epoch: [196]  [1000/1251]  eta: 0:01:28  lr: 0.001198  min_lr: 0.001198  loss: 3.1588 (2.9863)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9481 (1.0685)  time: 0.3474  data: 0.0004  max mem: 28503
Epoch: [196]  [1200/1251]  eta: 0:00:17  lr: 0.001195  min_lr: 0.001195  loss: 3.2612 (2.9989)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0202 (1.0614)  time: 0.3461  data: 0.0004  max mem: 28503
Epoch: [196]  [1250/1251]  eta: 0:00:00  lr: 0.001194  min_lr: 0.001194  loss: 3.1986 (3.0045)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1785 (1.0678)  time: 0.2919  data: 0.0005  max mem: 28503
Epoch: [196] Total time: 0:07:18 (0.3506 s / it)
Averaged stats: lr: 0.001194  min_lr: 0.001194  loss: 3.1986 (3.0058)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1785 (1.0678)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.6835 (0.6835)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 5.4674  data: 5.2613  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 0.9238 (0.8843)  acc1: 85.6000 (84.3636)  acc5: 97.6000 (97.4546)  time: 0.7366  data: 0.5634  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0734 (1.0210)  acc1: 79.2000 (81.0476)  acc5: 95.6000 (95.8476)  time: 0.2160  data: 0.0469  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0734 (1.0332)  acc1: 79.2000 (80.6400)  acc5: 94.8000 (95.6640)  time: 0.2152  data: 0.0467  max mem: 28503
Test: Total time: 0:00:10 (0.4218 s / it)
* Acc@1 81.350 Acc@5 96.014 loss 1.014
Accuracy of the model on the 50000 test images: 81.4%
Max accuracy: 81.46%
Epoch: [197]  [   0/1251]  eta: 1:13:11  lr: 0.001194  min_lr: 0.001194  loss: 3.3528 (3.3528)  weight_decay: 0.0500 (0.0500)  time: 3.5103  data: 1.6625  max mem: 28503
Epoch: [197]  [ 200/1251]  eta: 0:06:24  lr: 0.001191  min_lr: 0.001191  loss: 2.9669 (3.0039)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0010 (1.0377)  time: 0.3556  data: 0.0004  max mem: 28503
Epoch: [197]  [ 400/1251]  eta: 0:05:03  lr: 0.001187  min_lr: 0.001187  loss: 3.1583 (3.0183)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0281 (1.0256)  time: 0.3486  data: 0.0005  max mem: 28503
Epoch: [197]  [ 600/1251]  eta: 0:03:50  lr: 0.001184  min_lr: 0.001184  loss: 3.1869 (3.0203)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9538 (1.0436)  time: 0.3465  data: 0.0004  max mem: 28503
Epoch: [197]  [ 800/1251]  eta: 0:02:38  lr: 0.001181  min_lr: 0.001181  loss: 3.2426 (3.0247)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8884 (1.0291)  time: 0.3467  data: 0.0005  max mem: 28503
Epoch: [197]  [1000/1251]  eta: 0:01:28  lr: 0.001178  min_lr: 0.001178  loss: 2.9937 (3.0184)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8338 (1.0410)  time: 0.3448  data: 0.0004  max mem: 28503
Epoch: [197]  [1200/1251]  eta: 0:00:17  lr: 0.001174  min_lr: 0.001174  loss: 3.2035 (3.0095)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0478 (1.0400)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [197]  [1250/1251]  eta: 0:00:00  lr: 0.001174  min_lr: 0.001174  loss: 3.1029 (3.0116)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0761 (1.0435)  time: 0.2919  data: 0.0006  max mem: 28503
Epoch: [197] Total time: 0:07:18 (0.3507 s / it)
Averaged stats: lr: 0.001174  min_lr: 0.001174  loss: 3.1029 (3.0095)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0761 (1.0435)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.6427 (0.6427)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 5.7418  data: 5.5524  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8752 (0.8242)  acc1: 84.4000 (83.8545)  acc5: 97.6000 (97.3091)  time: 0.7115  data: 0.5404  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9890 (0.9620)  acc1: 79.2000 (80.9143)  acc5: 96.0000 (95.7905)  time: 0.1885  data: 0.0196  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0414 (0.9668)  acc1: 79.2000 (80.6880)  acc5: 96.0000 (95.8400)  time: 0.1881  data: 0.0196  max mem: 28503
Test: Total time: 0:00:10 (0.4112 s / it)
* Acc@1 81.568 Acc@5 96.036 loss 0.954
Accuracy of the model on the 50000 test images: 81.6%
Max accuracy: 81.57%
Epoch: [198]  [   0/1251]  eta: 1:06:42  lr: 0.001174  min_lr: 0.001174  loss: 2.1802 (2.1802)  weight_decay: 0.0500 (0.0500)  time: 3.1994  data: 2.7914  max mem: 28503
Epoch: [198]  [ 200/1251]  eta: 0:06:19  lr: 0.001170  min_lr: 0.001170  loss: 2.9677 (3.0189)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0992 (1.0472)  time: 0.3470  data: 0.0004  max mem: 28503
Epoch: [198]  [ 400/1251]  eta: 0:05:01  lr: 0.001167  min_lr: 0.001167  loss: 3.1848 (2.9964)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8850 (1.0036)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [198]  [ 600/1251]  eta: 0:03:49  lr: 0.001164  min_lr: 0.001164  loss: 3.1624 (3.0000)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9943 (1.0355)  time: 0.3465  data: 0.0004  max mem: 28503
Epoch: [198]  [ 800/1251]  eta: 0:02:38  lr: 0.001161  min_lr: 0.001161  loss: 3.2225 (3.0101)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2746 (1.0655)  time: 0.3476  data: 0.0004  max mem: 28503
Epoch: [198]  [1000/1251]  eta: 0:01:28  lr: 0.001157  min_lr: 0.001157  loss: 3.2106 (3.0116)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9653 (1.0647)  time: 0.3466  data: 0.0004  max mem: 28503
Epoch: [198]  [1200/1251]  eta: 0:00:17  lr: 0.001154  min_lr: 0.001154  loss: 2.7890 (3.0100)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9317 (1.0646)  time: 0.3575  data: 0.0004  max mem: 28503
Epoch: [198]  [1250/1251]  eta: 0:00:00  lr: 0.001153  min_lr: 0.001153  loss: 3.1206 (3.0087)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9317 (1.0648)  time: 0.2917  data: 0.0007  max mem: 28503
Epoch: [198] Total time: 0:07:17 (0.3500 s / it)
Averaged stats: lr: 0.001153  min_lr: 0.001153  loss: 3.1206 (3.0028)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9317 (1.0648)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.6101 (0.6101)  acc1: 89.2000 (89.2000)  acc5: 99.6000 (99.6000)  time: 5.6629  data: 5.4474  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8403 (0.8113)  acc1: 84.8000 (85.0545)  acc5: 97.2000 (97.4182)  time: 0.7206  data: 0.5473  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9849 (0.9484)  acc1: 80.8000 (81.6000)  acc5: 95.6000 (95.9238)  time: 0.1974  data: 0.0287  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0203 (0.9557)  acc1: 79.2000 (81.0880)  acc5: 95.6000 (96.0000)  time: 0.1970  data: 0.0286  max mem: 28503
Test: Total time: 0:00:10 (0.4146 s / it)
* Acc@1 81.582 Acc@5 96.002 loss 0.947
Accuracy of the model on the 50000 test images: 81.6%
Max accuracy: 81.58%
Epoch: [199]  [   0/1251]  eta: 1:00:42  lr: 0.001153  min_lr: 0.001153  loss: 3.0328 (3.0328)  weight_decay: 0.0500 (0.0500)  time: 2.9113  data: 2.5471  max mem: 28503
Epoch: [199]  [ 200/1251]  eta: 0:06:19  lr: 0.001150  min_lr: 0.001150  loss: 3.0018 (2.9669)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0463 (1.1573)  time: 0.3543  data: 0.0004  max mem: 28503
Epoch: [199]  [ 400/1251]  eta: 0:05:01  lr: 0.001147  min_lr: 0.001147  loss: 2.8653 (2.9574)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8769 (1.0996)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [199]  [ 600/1251]  eta: 0:03:49  lr: 0.001143  min_lr: 0.001143  loss: 3.0102 (2.9856)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1010 (1.0870)  time: 0.3447  data: 0.0004  max mem: 28503
Epoch: [199]  [ 800/1251]  eta: 0:02:38  lr: 0.001140  min_lr: 0.001140  loss: 3.0705 (2.9837)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1402 (1.0964)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [199]  [1000/1251]  eta: 0:01:27  lr: 0.001137  min_lr: 0.001137  loss: 2.9758 (2.9840)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9120 (1.0766)  time: 0.3471  data: 0.0004  max mem: 28503
Epoch: [199]  [1200/1251]  eta: 0:00:17  lr: 0.001134  min_lr: 0.001134  loss: 2.9903 (2.9830)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0530 (1.0699)  time: 0.3469  data: 0.0004  max mem: 28503
Epoch: [199]  [1250/1251]  eta: 0:00:00  lr: 0.001133  min_lr: 0.001133  loss: 3.3157 (2.9845)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1338 (1.0727)  time: 0.2916  data: 0.0006  max mem: 28503
Epoch: [199] Total time: 0:07:17 (0.3498 s / it)
Averaged stats: lr: 0.001133  min_lr: 0.001133  loss: 3.3157 (3.0018)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1338 (1.0727)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.7039 (0.7039)  acc1: 88.8000 (88.8000)  acc5: 98.4000 (98.4000)  time: 5.5656  data: 5.3481  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8674 (0.8335)  acc1: 84.8000 (85.0182)  acc5: 98.0000 (97.7455)  time: 0.6958  data: 0.5208  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9796 (0.9718)  acc1: 80.0000 (81.7714)  acc5: 95.6000 (96.0191)  time: 0.1886  data: 0.0191  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0432 (0.9790)  acc1: 78.8000 (81.3440)  acc5: 95.6000 (96.0480)  time: 0.1873  data: 0.0190  max mem: 28503
Test: Total time: 0:00:10 (0.4037 s / it)
* Acc@1 81.762 Acc@5 96.154 loss 0.971
Accuracy of the model on the 50000 test images: 81.8%
Max accuracy: 81.76%
Epoch: [200]  [   0/1251]  eta: 1:00:50  lr: 0.001133  min_lr: 0.001133  loss: 3.3590 (3.3590)  weight_decay: 0.0500 (0.0500)  time: 2.9177  data: 2.5408  max mem: 28503
Epoch: [200]  [ 200/1251]  eta: 0:06:18  lr: 0.001130  min_lr: 0.001130  loss: 3.2127 (3.0150)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0089 (1.0060)  time: 0.3546  data: 0.0005  max mem: 28503
Epoch: [200]  [ 400/1251]  eta: 0:05:02  lr: 0.001126  min_lr: 0.001126  loss: 3.0243 (2.9885)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0849 (1.0925)  time: 0.3541  data: 0.0004  max mem: 28503
Epoch: [200]  [ 600/1251]  eta: 0:03:49  lr: 0.001123  min_lr: 0.001123  loss: 3.0818 (2.9640)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9068 (1.0608)  time: 0.3457  data: 0.0004  max mem: 28503
Epoch: [200]  [ 800/1251]  eta: 0:02:38  lr: 0.001120  min_lr: 0.001120  loss: 3.1016 (2.9695)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0321 (1.1132)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [200]  [1000/1251]  eta: 0:01:28  lr: 0.001117  min_lr: 0.001117  loss: 2.9093 (2.9744)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9882 (1.1229)  time: 0.3457  data: 0.0005  max mem: 28503
Epoch: [200]  [1200/1251]  eta: 0:00:17  lr: 0.001114  min_lr: 0.001114  loss: 3.0126 (2.9774)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0968 (1.1109)  time: 0.3462  data: 0.0004  max mem: 28503
Epoch: [200]  [1250/1251]  eta: 0:00:00  lr: 0.001113  min_lr: 0.001113  loss: 2.8832 (2.9786)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1822 (1.1159)  time: 0.2918  data: 0.0005  max mem: 28503
Epoch: [200] Total time: 0:07:17 (0.3499 s / it)
Averaged stats: lr: 0.001113  min_lr: 0.001113  loss: 2.8832 (2.9881)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1822 (1.1159)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.5756 (0.5756)  acc1: 88.8000 (88.8000)  acc5: 99.2000 (99.2000)  time: 5.4788  data: 5.2716  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 0.8225 (0.7565)  acc1: 84.8000 (84.8727)  acc5: 96.8000 (97.2364)  time: 0.6553  data: 0.4831  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9465 (0.8959)  acc1: 80.0000 (81.5619)  acc5: 96.0000 (95.6000)  time: 0.1898  data: 0.0213  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9729 (0.9067)  acc1: 80.0000 (81.3120)  acc5: 95.2000 (95.6320)  time: 0.1930  data: 0.0246  max mem: 28503
Test: Total time: 0:00:10 (0.4040 s / it)
* Acc@1 81.748 Acc@5 96.066 loss 0.889
Accuracy of the model on the 50000 test images: 81.7%
Max accuracy: 81.76%
Epoch: [201]  [   0/1251]  eta: 1:11:12  lr: 0.001113  min_lr: 0.001113  loss: 3.7750 (3.7750)  weight_decay: 0.0500 (0.0500)  time: 3.4152  data: 2.1687  max mem: 28503
Epoch: [201]  [ 200/1251]  eta: 0:06:23  lr: 0.001110  min_lr: 0.001110  loss: 3.2191 (3.0178)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0087 (1.0965)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [201]  [ 400/1251]  eta: 0:05:03  lr: 0.001106  min_lr: 0.001106  loss: 3.1158 (3.0006)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0209 (1.0747)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [201]  [ 600/1251]  eta: 0:03:50  lr: 0.001103  min_lr: 0.001103  loss: 3.1200 (3.0099)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1089 (1.0966)  time: 0.3482  data: 0.0004  max mem: 28503
Epoch: [201]  [ 800/1251]  eta: 0:02:38  lr: 0.001100  min_lr: 0.001100  loss: 2.9975 (3.0254)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0634 (1.0875)  time: 0.3556  data: 0.0004  max mem: 28503
Epoch: [201]  [1000/1251]  eta: 0:01:28  lr: 0.001097  min_lr: 0.001097  loss: 2.8024 (3.0124)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0460 (1.0978)  time: 0.3552  data: 0.0004  max mem: 28503
Epoch: [201]  [1200/1251]  eta: 0:00:17  lr: 0.001094  min_lr: 0.001094  loss: 2.9651 (3.0114)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0687 (1.0961)  time: 0.3461  data: 0.0004  max mem: 28503
Epoch: [201]  [1250/1251]  eta: 0:00:00  lr: 0.001093  min_lr: 0.001093  loss: 3.0290 (3.0117)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9011 (1.0877)  time: 0.2966  data: 0.0005  max mem: 28503
Epoch: [201] Total time: 0:07:18 (0.3505 s / it)
Averaged stats: lr: 0.001093  min_lr: 0.001093  loss: 3.0290 (2.9957)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9011 (1.0877)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.6249 (0.6249)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 5.3995  data: 5.2043  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8521 (0.7994)  acc1: 84.8000 (84.7636)  acc5: 97.6000 (97.3818)  time: 0.7130  data: 0.5416  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9506 (0.9328)  acc1: 78.4000 (81.5429)  acc5: 96.0000 (96.0762)  time: 0.2064  data: 0.0377  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0202 (0.9393)  acc1: 78.8000 (81.3280)  acc5: 95.6000 (96.0800)  time: 0.2061  data: 0.0377  max mem: 28503
Test: Total time: 0:00:10 (0.4115 s / it)
* Acc@1 81.604 Acc@5 96.126 loss 0.931
Accuracy of the model on the 50000 test images: 81.6%
Max accuracy: 81.76%
Epoch: [202]  [   0/1251]  eta: 1:09:24  lr: 0.001093  min_lr: 0.001093  loss: 2.2414 (2.2414)  weight_decay: 0.0500 (0.0500)  time: 3.3287  data: 1.8678  max mem: 28503
Epoch: [202]  [ 200/1251]  eta: 0:06:20  lr: 0.001090  min_lr: 0.001090  loss: 3.0846 (3.0007)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9451 (1.0259)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [202]  [ 400/1251]  eta: 0:05:01  lr: 0.001086  min_lr: 0.001086  loss: 2.7611 (2.9949)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9342 (1.0686)  time: 0.3448  data: 0.0004  max mem: 28503
Epoch: [202]  [ 600/1251]  eta: 0:03:49  lr: 0.001083  min_lr: 0.001083  loss: 3.1358 (3.0009)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [202]  [ 800/1251]  eta: 0:02:38  lr: 0.001080  min_lr: 0.001080  loss: 3.3166 (3.0039)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0869 (nan)  time: 0.3442  data: 0.0004  max mem: 28503
Epoch: [202]  [1000/1251]  eta: 0:01:27  lr: 0.001077  min_lr: 0.001077  loss: 2.9183 (2.9958)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8379 (nan)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [202]  [1200/1251]  eta: 0:00:17  lr: 0.001074  min_lr: 0.001074  loss: 3.1400 (3.0030)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0542 (nan)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [202]  [1250/1251]  eta: 0:00:00  lr: 0.001073  min_lr: 0.001073  loss: 3.2043 (3.0063)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9792 (nan)  time: 0.2921  data: 0.0007  max mem: 28503
Epoch: [202] Total time: 0:07:16 (0.3490 s / it)
Averaged stats: lr: 0.001073  min_lr: 0.001073  loss: 3.2043 (2.9812)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9792 (nan)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6717 (0.6717)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 5.6894  data: 5.4835  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8873 (0.8567)  acc1: 86.0000 (84.9091)  acc5: 97.6000 (97.4909)  time: 0.6709  data: 0.4988  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0502 (0.9893)  acc1: 79.6000 (81.8667)  acc5: 96.0000 (96.1143)  time: 0.1689  data: 0.0002  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0617 (1.0021)  acc1: 79.6000 (81.4560)  acc5: 95.6000 (95.9680)  time: 0.1687  data: 0.0001  max mem: 28503
Test: Total time: 0:00:09 (0.3928 s / it)
* Acc@1 81.866 Acc@5 95.968 loss 0.991
Accuracy of the model on the 50000 test images: 81.9%
Max accuracy: 81.87%
Epoch: [203]  [   0/1251]  eta: 1:08:13  lr: 0.001073  min_lr: 0.001073  loss: 2.4664 (2.4664)  weight_decay: 0.0500 (0.0500)  time: 3.2726  data: 2.9076  max mem: 28503
Epoch: [203]  [ 200/1251]  eta: 0:06:22  lr: 0.001070  min_lr: 0.001070  loss: 3.0171 (2.9265)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9192 (0.9690)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [203]  [ 400/1251]  eta: 0:05:02  lr: 0.001066  min_lr: 0.001066  loss: 2.8233 (2.9261)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9655 (1.0258)  time: 0.3452  data: 0.0003  max mem: 28503
Epoch: [203]  [ 600/1251]  eta: 0:03:49  lr: 0.001063  min_lr: 0.001063  loss: 3.1150 (2.9505)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8614 (1.0367)  time: 0.3445  data: 0.0004  max mem: 28503
Epoch: [203]  [ 800/1251]  eta: 0:02:38  lr: 0.001060  min_lr: 0.001060  loss: 3.2622 (2.9732)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1179 (1.0611)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [203]  [1000/1251]  eta: 0:01:27  lr: 0.001057  min_lr: 0.001057  loss: 2.9980 (2.9671)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0001 (1.1049)  time: 0.3448  data: 0.0004  max mem: 28503
Epoch: [203]  [1200/1251]  eta: 0:00:17  lr: 0.001054  min_lr: 0.001054  loss: 3.2934 (2.9734)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9500 (1.0951)  time: 0.3448  data: 0.0004  max mem: 28503
Epoch: [203]  [1250/1251]  eta: 0:00:00  lr: 0.001053  min_lr: 0.001053  loss: 3.2017 (2.9757)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9422 (1.0879)  time: 0.2916  data: 0.0006  max mem: 28503
Epoch: [203] Total time: 0:07:17 (0.3494 s / it)
Averaged stats: lr: 0.001053  min_lr: 0.001053  loss: 3.2017 (2.9812)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9422 (1.0879)
Test:  [ 0/25]  eta: 0:02:10  loss: 0.5871 (0.5871)  acc1: 87.6000 (87.6000)  acc5: 98.4000 (98.4000)  time: 5.2005  data: 5.0031  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 0.7983 (0.7652)  acc1: 85.2000 (84.6545)  acc5: 97.6000 (97.4546)  time: 0.6429  data: 0.4677  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9820 (0.9157)  acc1: 80.0000 (81.4476)  acc5: 96.0000 (95.7524)  time: 0.2046  data: 0.0339  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0127 (0.9252)  acc1: 80.0000 (81.1360)  acc5: 95.2000 (95.7600)  time: 0.2082  data: 0.0381  max mem: 28503
Test: Total time: 0:00:10 (0.4055 s / it)
* Acc@1 81.712 Acc@5 96.060 loss 0.914
Accuracy of the model on the 50000 test images: 81.7%
Max accuracy: 81.87%
Epoch: [204]  [   0/1251]  eta: 1:08:57  lr: 0.001053  min_lr: 0.001053  loss: 1.8258 (1.8258)  weight_decay: 0.0500 (0.0500)  time: 3.3071  data: 2.4545  max mem: 28503
Epoch: [204]  [ 200/1251]  eta: 0:06:25  lr: 0.001050  min_lr: 0.001050  loss: 3.2019 (2.9837)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0132 (1.2010)  time: 0.3478  data: 0.0004  max mem: 28503
Epoch: [204]  [ 400/1251]  eta: 0:05:03  lr: 0.001047  min_lr: 0.001047  loss: 2.8760 (3.0035)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9281 (1.0979)  time: 0.3447  data: 0.0004  max mem: 28503
Epoch: [204]  [ 600/1251]  eta: 0:03:50  lr: 0.001044  min_lr: 0.001044  loss: 2.8909 (3.0023)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2091 (1.1312)  time: 0.3509  data: 0.0004  max mem: 28503
Epoch: [204]  [ 800/1251]  eta: 0:02:38  lr: 0.001040  min_lr: 0.001040  loss: 2.9542 (2.9937)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9625 (1.1177)  time: 0.3448  data: 0.0004  max mem: 28503
Epoch: [204]  [1000/1251]  eta: 0:01:28  lr: 0.001037  min_lr: 0.001037  loss: 3.1328 (3.0091)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0203 (1.1109)  time: 0.3444  data: 0.0004  max mem: 28503
Epoch: [204]  [1200/1251]  eta: 0:00:17  lr: 0.001034  min_lr: 0.001034  loss: 3.0337 (3.0103)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9456 (1.1028)  time: 0.3462  data: 0.0003  max mem: 28503
Epoch: [204]  [1250/1251]  eta: 0:00:00  lr: 0.001033  min_lr: 0.001033  loss: 3.1383 (3.0088)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1974 (1.1059)  time: 0.2918  data: 0.0005  max mem: 28503
Epoch: [204] Total time: 0:07:17 (0.3498 s / it)
Averaged stats: lr: 0.001033  min_lr: 0.001033  loss: 3.1383 (2.9769)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1974 (1.1059)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.6023 (0.6023)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 5.4363  data: 5.1987  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 0.8490 (0.8145)  acc1: 85.6000 (84.8000)  acc5: 97.6000 (97.4182)  time: 0.6498  data: 0.4730  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9992 (0.9492)  acc1: 79.6000 (81.4857)  acc5: 95.6000 (95.8667)  time: 0.1842  data: 0.0146  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9992 (0.9600)  acc1: 79.6000 (81.2800)  acc5: 95.2000 (95.8240)  time: 0.1830  data: 0.0145  max mem: 28503
Test: Total time: 0:00:09 (0.3974 s / it)
* Acc@1 81.610 Acc@5 96.064 loss 0.946
Accuracy of the model on the 50000 test images: 81.6%
Max accuracy: 81.87%
Epoch: [205]  [   0/1251]  eta: 1:05:55  lr: 0.001033  min_lr: 0.001033  loss: 2.7043 (2.7043)  weight_decay: 0.0500 (0.0500)  time: 3.1617  data: 2.5601  max mem: 28503
Epoch: [205]  [ 200/1251]  eta: 0:06:23  lr: 0.001030  min_lr: 0.001030  loss: 3.1892 (3.0280)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9115 (1.1208)  time: 0.3463  data: 0.0004  max mem: 28503
Epoch: [205]  [ 400/1251]  eta: 0:05:03  lr: 0.001027  min_lr: 0.001027  loss: 3.2026 (2.9866)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0995 (1.0750)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [205]  [ 600/1251]  eta: 0:03:50  lr: 0.001024  min_lr: 0.001024  loss: 2.8494 (2.9919)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1694 (1.1237)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [205]  [ 800/1251]  eta: 0:02:38  lr: 0.001021  min_lr: 0.001021  loss: 3.1080 (2.9958)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9159 (1.0866)  time: 0.3466  data: 0.0004  max mem: 28503
Epoch: [205]  [1000/1251]  eta: 0:01:28  lr: 0.001018  min_lr: 0.001018  loss: 2.9155 (2.9816)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0699 (1.0866)  time: 0.3461  data: 0.0004  max mem: 28503
Epoch: [205]  [1200/1251]  eta: 0:00:17  lr: 0.001014  min_lr: 0.001014  loss: 2.9349 (2.9867)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8750 (1.0694)  time: 0.3457  data: 0.0004  max mem: 28503
Epoch: [205]  [1250/1251]  eta: 0:00:00  lr: 0.001014  min_lr: 0.001014  loss: 3.0358 (2.9867)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8750 (1.0639)  time: 0.2916  data: 0.0005  max mem: 28503
Epoch: [205] Total time: 0:07:18 (0.3503 s / it)
Averaged stats: lr: 0.001014  min_lr: 0.001014  loss: 3.0358 (2.9750)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8750 (1.0639)
Test:  [ 0/25]  eta: 0:01:55  loss: 0.6084 (0.6084)  acc1: 88.4000 (88.4000)  acc5: 99.2000 (99.2000)  time: 4.6139  data: 4.4087  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8157 (0.7753)  acc1: 85.2000 (84.9455)  acc5: 98.0000 (97.6000)  time: 0.6725  data: 0.4984  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9444 (0.9115)  acc1: 79.2000 (81.9048)  acc5: 96.0000 (96.3238)  time: 0.2370  data: 0.0670  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0095 (0.9231)  acc1: 79.2000 (81.4720)  acc5: 95.6000 (96.1600)  time: 0.1823  data: 0.0135  max mem: 28503
Test: Total time: 0:00:10 (0.4044 s / it)
* Acc@1 81.824 Acc@5 96.210 loss 0.907
Accuracy of the model on the 50000 test images: 81.8%
Max accuracy: 81.87%
Epoch: [206]  [   0/1251]  eta: 1:09:12  lr: 0.001014  min_lr: 0.001014  loss: 2.9704 (2.9704)  weight_decay: 0.0500 (0.0500)  time: 3.3190  data: 2.5843  max mem: 28503
Epoch: [206]  [ 200/1251]  eta: 0:06:23  lr: 0.001011  min_lr: 0.001011  loss: 3.1038 (2.9266)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8864 (0.9676)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [206]  [ 400/1251]  eta: 0:05:02  lr: 0.001007  min_lr: 0.001007  loss: 2.9228 (2.9367)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8837 (1.0121)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [206]  [ 600/1251]  eta: 0:03:49  lr: 0.001004  min_lr: 0.001004  loss: 3.1453 (2.9550)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9564 (1.0168)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [206]  [ 800/1251]  eta: 0:02:38  lr: 0.001001  min_lr: 0.001001  loss: 3.2228 (2.9582)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8890 (1.0422)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [206]  [1000/1251]  eta: 0:01:28  lr: 0.000998  min_lr: 0.000998  loss: 2.9216 (2.9630)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0951 (1.0619)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [206]  [1200/1251]  eta: 0:00:17  lr: 0.000995  min_lr: 0.000995  loss: 2.6827 (2.9556)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1253 (1.0722)  time: 0.3464  data: 0.0004  max mem: 28503
Epoch: [206]  [1250/1251]  eta: 0:00:00  lr: 0.000994  min_lr: 0.000994  loss: 3.0147 (2.9563)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0843 (1.0734)  time: 0.2921  data: 0.0007  max mem: 28503
Epoch: [206] Total time: 0:07:17 (0.3499 s / it)
Averaged stats: lr: 0.000994  min_lr: 0.000994  loss: 3.0147 (2.9650)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0843 (1.0734)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.5467 (0.5467)  acc1: 89.6000 (89.6000)  acc5: 98.8000 (98.8000)  time: 5.4725  data: 5.2815  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.7837 (0.7431)  acc1: 84.8000 (84.6909)  acc5: 98.0000 (97.6364)  time: 0.7004  data: 0.5299  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9134 (0.8776)  acc1: 78.8000 (81.5429)  acc5: 95.6000 (96.1143)  time: 0.1957  data: 0.0274  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9284 (0.8893)  acc1: 78.8000 (81.1840)  acc5: 95.2000 (96.1280)  time: 0.1955  data: 0.0273  max mem: 28503
Test: Total time: 0:00:10 (0.4061 s / it)
* Acc@1 81.800 Acc@5 96.210 loss 0.874
Accuracy of the model on the 50000 test images: 81.8%
Max accuracy: 81.87%
Epoch: [207]  [   0/1251]  eta: 1:13:46  lr: 0.000994  min_lr: 0.000994  loss: 3.0601 (3.0601)  weight_decay: 0.0500 (0.0500)  time: 3.5383  data: 1.8837  max mem: 28503
Epoch: [207]  [ 200/1251]  eta: 0:06:22  lr: 0.000991  min_lr: 0.000991  loss: 3.2410 (2.9320)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9223 (0.9479)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [207]  [ 400/1251]  eta: 0:05:03  lr: 0.000988  min_lr: 0.000988  loss: 3.1070 (2.9553)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9932 (1.0220)  time: 0.3479  data: 0.0004  max mem: 28503
Epoch: [207]  [ 600/1251]  eta: 0:03:50  lr: 0.000985  min_lr: 0.000985  loss: 2.9815 (2.9690)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9370 (1.0530)  time: 0.3448  data: 0.0004  max mem: 28503
Epoch: [207]  [ 800/1251]  eta: 0:02:38  lr: 0.000982  min_lr: 0.000982  loss: 3.0594 (2.9601)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2180 (1.0696)  time: 0.3461  data: 0.0004  max mem: 28503
Epoch: [207]  [1000/1251]  eta: 0:01:28  lr: 0.000979  min_lr: 0.000979  loss: 3.1491 (2.9679)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0893 (1.1246)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [207]  [1200/1251]  eta: 0:00:17  lr: 0.000976  min_lr: 0.000976  loss: 2.7078 (2.9679)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8954 (1.1078)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [207]  [1250/1251]  eta: 0:00:00  lr: 0.000975  min_lr: 0.000975  loss: 2.9691 (2.9679)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9463 (1.1021)  time: 0.2920  data: 0.0007  max mem: 28503
Epoch: [207] Total time: 0:07:18 (0.3503 s / it)
Averaged stats: lr: 0.000975  min_lr: 0.000975  loss: 2.9691 (2.9584)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9463 (1.1021)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6098 (0.6098)  acc1: 88.4000 (88.4000)  acc5: 98.8000 (98.8000)  time: 5.5162  data: 5.3086  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8258 (0.7857)  acc1: 85.2000 (84.5455)  acc5: 97.6000 (97.6000)  time: 0.7099  data: 0.5372  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9451 (0.9114)  acc1: 80.0000 (81.5048)  acc5: 95.6000 (96.2476)  time: 0.1988  data: 0.0301  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9967 (0.9228)  acc1: 79.6000 (81.1680)  acc5: 95.6000 (96.1280)  time: 0.1983  data: 0.0300  max mem: 28503
Test: Total time: 0:00:10 (0.4104 s / it)
* Acc@1 81.860 Acc@5 96.292 loss 0.905
Accuracy of the model on the 50000 test images: 81.9%
Max accuracy: 81.87%
Epoch: [208]  [   0/1251]  eta: 1:12:19  lr: 0.000975  min_lr: 0.000975  loss: 3.3757 (3.3757)  weight_decay: 0.0500 (0.0500)  time: 3.4685  data: 1.7306  max mem: 28503
Epoch: [208]  [ 200/1251]  eta: 0:06:23  lr: 0.000972  min_lr: 0.000972  loss: 3.0533 (2.9243)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0581 (1.1529)  time: 0.3464  data: 0.0004  max mem: 28503
Epoch: [208]  [ 400/1251]  eta: 0:05:02  lr: 0.000969  min_lr: 0.000969  loss: 3.0972 (2.9428)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0227 (1.1111)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [208]  [ 600/1251]  eta: 0:03:50  lr: 0.000966  min_lr: 0.000966  loss: 2.9288 (2.9394)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9853 (1.0736)  time: 0.3469  data: 0.0005  max mem: 28503
Epoch: [208]  [ 800/1251]  eta: 0:02:38  lr: 0.000963  min_lr: 0.000963  loss: 3.0972 (2.9439)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0816 (1.0843)  time: 0.3472  data: 0.0004  max mem: 28503
Epoch: [208]  [1000/1251]  eta: 0:01:28  lr: 0.000960  min_lr: 0.000960  loss: 3.1285 (2.9384)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9839 (1.0895)  time: 0.3545  data: 0.0004  max mem: 28503
Epoch: [208]  [1200/1251]  eta: 0:00:17  lr: 0.000956  min_lr: 0.000956  loss: 3.2036 (2.9407)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1123 (1.0867)  time: 0.3461  data: 0.0004  max mem: 28503
Epoch: [208]  [1250/1251]  eta: 0:00:00  lr: 0.000956  min_lr: 0.000956  loss: 2.8958 (2.9439)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1364 (1.0921)  time: 0.2920  data: 0.0006  max mem: 28503
Epoch: [208] Total time: 0:07:18 (0.3506 s / it)
Averaged stats: lr: 0.000956  min_lr: 0.000956  loss: 2.8958 (2.9616)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1364 (1.0921)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.6809 (0.6809)  acc1: 88.4000 (88.4000)  acc5: 98.4000 (98.4000)  time: 5.3381  data: 5.1519  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8514 (0.8432)  acc1: 84.4000 (84.2545)  acc5: 97.6000 (97.4909)  time: 0.6846  data: 0.5137  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9968 (0.9895)  acc1: 78.4000 (81.2571)  acc5: 95.6000 (95.9810)  time: 0.1939  data: 0.0250  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1078 (0.9984)  acc1: 78.8000 (81.0560)  acc5: 95.2000 (95.7760)  time: 0.1934  data: 0.0249  max mem: 28503
Test: Total time: 0:00:09 (0.3993 s / it)
* Acc@1 81.784 Acc@5 95.972 loss 0.980
Accuracy of the model on the 50000 test images: 81.8%
Max accuracy: 81.87%
Epoch: [209]  [   0/1251]  eta: 1:12:27  lr: 0.000956  min_lr: 0.000956  loss: 1.9779 (1.9779)  weight_decay: 0.0500 (0.0500)  time: 3.4754  data: 3.0371  max mem: 28503
Epoch: [209]  [ 200/1251]  eta: 0:06:21  lr: 0.000953  min_lr: 0.000953  loss: 2.9322 (2.8862)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1578 (1.1402)  time: 0.3548  data: 0.0004  max mem: 28503
Epoch: [209]  [ 400/1251]  eta: 0:05:02  lr: 0.000950  min_lr: 0.000950  loss: 2.6704 (2.9074)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9378 (1.1131)  time: 0.3479  data: 0.0004  max mem: 28503
Epoch: [209]  [ 600/1251]  eta: 0:03:49  lr: 0.000947  min_lr: 0.000947  loss: 3.1449 (2.9299)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0593 (1.1262)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [209]  [ 800/1251]  eta: 0:02:38  lr: 0.000944  min_lr: 0.000944  loss: 2.9929 (2.9410)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0823 (1.1199)  time: 0.3464  data: 0.0004  max mem: 28503
Epoch: [209]  [1000/1251]  eta: 0:01:28  lr: 0.000940  min_lr: 0.000940  loss: 3.0983 (2.9371)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9043 (1.0945)  time: 0.3469  data: 0.0004  max mem: 28503
Epoch: [209]  [1200/1251]  eta: 0:00:17  lr: 0.000937  min_lr: 0.000937  loss: 3.0353 (2.9346)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1982 (1.1290)  time: 0.3463  data: 0.0004  max mem: 28503
Epoch: [209]  [1250/1251]  eta: 0:00:00  lr: 0.000937  min_lr: 0.000937  loss: 3.1767 (2.9343)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3492 (1.1335)  time: 0.2918  data: 0.0007  max mem: 28503
Epoch: [209] Total time: 0:07:18 (0.3505 s / it)
Averaged stats: lr: 0.000937  min_lr: 0.000937  loss: 3.1767 (2.9457)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3492 (1.1335)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6174 (0.6174)  acc1: 89.2000 (89.2000)  acc5: 98.4000 (98.4000)  time: 5.5378  data: 5.3083  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8237 (0.7812)  acc1: 85.2000 (84.9455)  acc5: 97.6000 (97.6727)  time: 0.7297  data: 0.5541  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9730 (0.9060)  acc1: 80.0000 (81.9048)  acc5: 95.6000 (96.4000)  time: 0.2087  data: 0.0394  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9963 (0.9144)  acc1: 80.0000 (81.6160)  acc5: 95.6000 (96.2880)  time: 0.2085  data: 0.0394  max mem: 28503
Test: Total time: 0:00:10 (0.4189 s / it)
* Acc@1 81.886 Acc@5 96.222 loss 0.902
Accuracy of the model on the 50000 test images: 81.9%
Max accuracy: 81.89%
Epoch: [210]  [   0/1251]  eta: 1:09:40  lr: 0.000937  min_lr: 0.000937  loss: 3.6326 (3.6326)  weight_decay: 0.0500 (0.0500)  time: 3.3417  data: 2.9852  max mem: 28503
Epoch: [210]  [ 200/1251]  eta: 0:06:21  lr: 0.000934  min_lr: 0.000934  loss: 3.1670 (2.9771)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0520 (1.0589)  time: 0.3468  data: 0.0004  max mem: 28503
Epoch: [210]  [ 400/1251]  eta: 0:05:03  lr: 0.000931  min_lr: 0.000931  loss: 3.1014 (2.9825)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2216 (1.0808)  time: 0.3479  data: 0.0004  max mem: 28503
Epoch: [210]  [ 600/1251]  eta: 0:03:49  lr: 0.000928  min_lr: 0.000928  loss: 2.8575 (2.9698)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0377 (1.1215)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [210]  [ 800/1251]  eta: 0:02:38  lr: 0.000925  min_lr: 0.000925  loss: 2.9865 (2.9563)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9659 (1.1088)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [210]  [1000/1251]  eta: 0:01:27  lr: 0.000922  min_lr: 0.000922  loss: 3.1917 (2.9707)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9463 (1.1067)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [210]  [1200/1251]  eta: 0:00:17  lr: 0.000918  min_lr: 0.000918  loss: 3.1278 (2.9646)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2102 (1.1177)  time: 0.3531  data: 0.0004  max mem: 28503
Epoch: [210]  [1250/1251]  eta: 0:00:00  lr: 0.000918  min_lr: 0.000918  loss: 3.1094 (2.9640)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0625 (1.1157)  time: 0.2920  data: 0.0006  max mem: 28503
Epoch: [210] Total time: 0:07:17 (0.3500 s / it)
Averaged stats: lr: 0.000918  min_lr: 0.000918  loss: 3.1094 (2.9522)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0625 (1.1157)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6125 (0.6125)  acc1: 90.8000 (90.8000)  acc5: 98.8000 (98.8000)  time: 5.5020  data: 5.2997  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8323 (0.7855)  acc1: 86.0000 (85.0545)  acc5: 97.6000 (97.6364)  time: 0.7245  data: 0.5507  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9620 (0.9233)  acc1: 79.2000 (81.9810)  acc5: 95.6000 (96.2095)  time: 0.2097  data: 0.0380  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0294 (0.9308)  acc1: 79.2000 (81.5520)  acc5: 95.2000 (96.1440)  time: 0.2084  data: 0.0379  max mem: 28503
Test: Total time: 0:00:10 (0.4187 s / it)
* Acc@1 82.042 Acc@5 96.258 loss 0.919
Accuracy of the model on the 50000 test images: 82.0%
Max accuracy: 82.04%
Epoch: [211]  [   0/1251]  eta: 1:07:50  lr: 0.000918  min_lr: 0.000918  loss: 3.3115 (3.3115)  weight_decay: 0.0500 (0.0500)  time: 3.2540  data: 2.8713  max mem: 28503
Epoch: [211]  [ 200/1251]  eta: 0:06:19  lr: 0.000915  min_lr: 0.000915  loss: 2.8750 (2.9494)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9875 (0.9744)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [211]  [ 400/1251]  eta: 0:05:01  lr: 0.000912  min_lr: 0.000912  loss: 3.0822 (2.9646)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1965 (1.0647)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [211]  [ 600/1251]  eta: 0:03:49  lr: 0.000909  min_lr: 0.000909  loss: 2.7495 (2.9483)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0401 (1.0494)  time: 0.3448  data: 0.0004  max mem: 28503
Epoch: [211]  [ 800/1251]  eta: 0:02:38  lr: 0.000906  min_lr: 0.000906  loss: 2.8035 (2.9306)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8935 (1.0643)  time: 0.3458  data: 0.0005  max mem: 28503
Epoch: [211]  [1000/1251]  eta: 0:01:28  lr: 0.000903  min_lr: 0.000903  loss: 2.7101 (2.9294)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3251 (1.1255)  time: 0.3450  data: 0.0005  max mem: 28503
Epoch: [211]  [1200/1251]  eta: 0:00:17  lr: 0.000900  min_lr: 0.000900  loss: 2.8391 (2.9361)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8903 (1.1017)  time: 0.3467  data: 0.0005  max mem: 28503
Epoch: [211]  [1250/1251]  eta: 0:00:00  lr: 0.000899  min_lr: 0.000899  loss: 3.2054 (2.9375)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9603 (1.0981)  time: 0.2926  data: 0.0006  max mem: 28503
Epoch: [211] Total time: 0:07:17 (0.3499 s / it)
Averaged stats: lr: 0.000899  min_lr: 0.000899  loss: 3.2054 (2.9473)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9603 (1.0981)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6623 (0.6623)  acc1: 90.4000 (90.4000)  acc5: 98.8000 (98.8000)  time: 5.6017  data: 5.3982  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8554 (0.8352)  acc1: 85.2000 (85.2727)  acc5: 97.6000 (97.5273)  time: 0.7234  data: 0.5506  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0159 (0.9717)  acc1: 80.0000 (81.8476)  acc5: 95.6000 (96.1524)  time: 0.2176  data: 0.0487  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0854 (0.9838)  acc1: 79.6000 (81.3600)  acc5: 94.8000 (95.9680)  time: 0.2167  data: 0.0486  max mem: 28503
Test: Total time: 0:00:10 (0.4283 s / it)
* Acc@1 81.964 Acc@5 96.196 loss 0.973
Accuracy of the model on the 50000 test images: 82.0%
Max accuracy: 82.04%
Epoch: [212]  [   0/1251]  eta: 1:02:39  lr: 0.000899  min_lr: 0.000899  loss: 3.3153 (3.3153)  weight_decay: 0.0500 (0.0500)  time: 3.0051  data: 1.7231  max mem: 28503
Epoch: [212]  [ 200/1251]  eta: 0:06:24  lr: 0.000896  min_lr: 0.000896  loss: 3.1073 (2.9497)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9780 (1.1274)  time: 0.3543  data: 0.0004  max mem: 28503
Epoch: [212]  [ 400/1251]  eta: 0:05:03  lr: 0.000893  min_lr: 0.000893  loss: 3.1531 (2.9452)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0217 (1.0682)  time: 0.3484  data: 0.0005  max mem: 28503
Epoch: [212]  [ 600/1251]  eta: 0:03:50  lr: 0.000890  min_lr: 0.000890  loss: 2.7779 (2.9201)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0568 (1.0593)  time: 0.3471  data: 0.0004  max mem: 28503
Epoch: [212]  [ 800/1251]  eta: 0:02:38  lr: 0.000887  min_lr: 0.000887  loss: 3.1657 (2.9309)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1848 (1.0727)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [212]  [1000/1251]  eta: 0:01:28  lr: 0.000884  min_lr: 0.000884  loss: 3.2231 (2.9231)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5117 (1.1209)  time: 0.3579  data: 0.0004  max mem: 28503
Epoch: [212]  [1200/1251]  eta: 0:00:17  lr: 0.000881  min_lr: 0.000881  loss: 2.9711 (2.9164)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1463 (1.1210)  time: 0.3590  data: 0.0004  max mem: 28503
Epoch: [212]  [1250/1251]  eta: 0:00:00  lr: 0.000880  min_lr: 0.000880  loss: 3.0557 (2.9192)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1822 (1.1213)  time: 0.2917  data: 0.0005  max mem: 28503
Epoch: [212] Total time: 0:07:19 (0.3510 s / it)
Averaged stats: lr: 0.000880  min_lr: 0.000880  loss: 3.0557 (2.9372)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1822 (1.1213)
Test:  [ 0/25]  eta: 0:02:10  loss: 0.6152 (0.6152)  acc1: 90.4000 (90.4000)  acc5: 99.6000 (99.6000)  time: 5.2034  data: 5.0075  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8347 (0.8103)  acc1: 87.2000 (85.4182)  acc5: 98.0000 (97.6727)  time: 0.6802  data: 0.5091  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0307 (0.9378)  acc1: 79.2000 (82.1905)  acc5: 96.0000 (96.1905)  time: 0.2127  data: 0.0437  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0344 (0.9495)  acc1: 79.2000 (81.7920)  acc5: 95.2000 (96.0480)  time: 0.2133  data: 0.0444  max mem: 28503
Test: Total time: 0:00:10 (0.4133 s / it)
* Acc@1 82.136 Acc@5 96.182 loss 0.941
Accuracy of the model on the 50000 test images: 82.1%
Max accuracy: 82.14%
Epoch: [213]  [   0/1251]  eta: 1:08:56  lr: 0.000880  min_lr: 0.000880  loss: 3.2021 (3.2021)  weight_decay: 0.0500 (0.0500)  time: 3.3065  data: 2.9447  max mem: 28503
Epoch: [213]  [ 200/1251]  eta: 0:06:19  lr: 0.000877  min_lr: 0.000877  loss: 2.8646 (2.9066)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9744 (1.0192)  time: 0.3462  data: 0.0004  max mem: 28503
Epoch: [213]  [ 400/1251]  eta: 0:05:01  lr: 0.000874  min_lr: 0.000874  loss: 2.9938 (2.9038)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8877 (0.9953)  time: 0.3448  data: 0.0004  max mem: 28503
Epoch: [213]  [ 600/1251]  eta: 0:03:49  lr: 0.000871  min_lr: 0.000871  loss: 2.9930 (2.9055)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0344 (1.0601)  time: 0.3466  data: 0.0004  max mem: 28503
Epoch: [213]  [ 800/1251]  eta: 0:02:38  lr: 0.000868  min_lr: 0.000868  loss: 3.0646 (2.9095)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2984 (1.1474)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [213]  [1000/1251]  eta: 0:01:27  lr: 0.000865  min_lr: 0.000865  loss: 3.0599 (2.9192)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9732 (1.1196)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [213]  [1200/1251]  eta: 0:00:17  lr: 0.000863  min_lr: 0.000863  loss: 2.9181 (2.9116)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0375 (1.1054)  time: 0.3461  data: 0.0005  max mem: 28503
Epoch: [213]  [1250/1251]  eta: 0:00:00  lr: 0.000862  min_lr: 0.000862  loss: 3.0238 (2.9181)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0655 (1.1155)  time: 0.2983  data: 0.0007  max mem: 28503
Epoch: [213] Total time: 0:07:17 (0.3495 s / it)
Averaged stats: lr: 0.000862  min_lr: 0.000862  loss: 3.0238 (2.9353)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0655 (1.1155)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.6763 (0.6763)  acc1: 88.8000 (88.8000)  acc5: 98.8000 (98.8000)  time: 5.8293  data: 5.6240  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 0.8593 (0.8331)  acc1: 85.6000 (85.7091)  acc5: 97.6000 (97.4909)  time: 0.7397  data: 0.5666  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9750 (0.9609)  acc1: 80.8000 (82.0952)  acc5: 96.0000 (96.3238)  time: 0.1996  data: 0.0305  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0703 (0.9712)  acc1: 79.2000 (81.6320)  acc5: 95.6000 (96.1920)  time: 0.1988  data: 0.0304  max mem: 28503
Test: Total time: 0:00:10 (0.4237 s / it)
* Acc@1 82.148 Acc@5 96.144 loss 0.965
Accuracy of the model on the 50000 test images: 82.1%
Max accuracy: 82.15%
Epoch: [214]  [   0/1251]  eta: 1:04:04  lr: 0.000862  min_lr: 0.000862  loss: 2.6474 (2.6474)  weight_decay: 0.0500 (0.0500)  time: 3.0733  data: 2.7157  max mem: 28503
Epoch: [214]  [ 200/1251]  eta: 0:06:18  lr: 0.000859  min_lr: 0.000859  loss: 3.0185 (2.9241)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9627 (1.0500)  time: 0.3440  data: 0.0004  max mem: 28503
Epoch: [214]  [ 400/1251]  eta: 0:05:01  lr: 0.000856  min_lr: 0.000856  loss: 3.0681 (2.9221)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0077 (1.1205)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [214]  [ 600/1251]  eta: 0:03:48  lr: 0.000853  min_lr: 0.000853  loss: 3.1099 (2.9236)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9504 (1.1018)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [214]  [ 800/1251]  eta: 0:02:38  lr: 0.000850  min_lr: 0.000850  loss: 3.0555 (2.9331)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1451 (1.1073)  time: 0.3555  data: 0.0004  max mem: 28503
Epoch: [214]  [1000/1251]  eta: 0:01:27  lr: 0.000847  min_lr: 0.000847  loss: 2.9380 (2.9341)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0793 (1.1082)  time: 0.3534  data: 0.0004  max mem: 28503
Epoch: [214]  [1200/1251]  eta: 0:00:17  lr: 0.000844  min_lr: 0.000844  loss: 3.0997 (2.9380)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3832 (1.1134)  time: 0.3561  data: 0.0004  max mem: 28503
Epoch: [214]  [1250/1251]  eta: 0:00:00  lr: 0.000844  min_lr: 0.000844  loss: 3.0772 (2.9423)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3616 (1.1187)  time: 0.2915  data: 0.0006  max mem: 28503
Epoch: [214] Total time: 0:07:16 (0.3492 s / it)
Averaged stats: lr: 0.000844  min_lr: 0.000844  loss: 3.0772 (2.9323)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3616 (1.1187)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.7105 (0.7105)  acc1: 89.6000 (89.6000)  acc5: 99.2000 (99.2000)  time: 5.4480  data: 5.2526  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8982 (0.8942)  acc1: 86.4000 (85.2000)  acc5: 97.6000 (97.4546)  time: 0.7088  data: 0.5377  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.1112 (1.0235)  acc1: 80.0000 (82.0000)  acc5: 96.0000 (96.2667)  time: 0.2016  data: 0.0332  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.1241 (1.0309)  acc1: 80.0000 (81.8560)  acc5: 96.0000 (96.2400)  time: 0.2013  data: 0.0331  max mem: 28503
Test: Total time: 0:00:10 (0.4101 s / it)
* Acc@1 82.264 Acc@5 96.294 loss 1.018
Accuracy of the model on the 50000 test images: 82.3%
Max accuracy: 82.26%
Epoch: [215]  [   0/1251]  eta: 0:58:20  lr: 0.000843  min_lr: 0.000843  loss: 3.3748 (3.3748)  weight_decay: 0.0500 (0.0500)  time: 2.7986  data: 2.4387  max mem: 28503
Epoch: [215]  [ 200/1251]  eta: 0:06:17  lr: 0.000841  min_lr: 0.000841  loss: 3.1167 (2.8985)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1165 (1.3394)  time: 0.3446  data: 0.0004  max mem: 28503
Epoch: [215]  [ 400/1251]  eta: 0:05:00  lr: 0.000838  min_lr: 0.000838  loss: 3.0287 (2.9323)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9833 (1.2145)  time: 0.3454  data: 0.0005  max mem: 28503
Epoch: [215]  [ 600/1251]  eta: 0:03:48  lr: 0.000835  min_lr: 0.000835  loss: 2.9966 (2.9341)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9600 (1.1420)  time: 0.3474  data: 0.0004  max mem: 28503
Epoch: [215]  [ 800/1251]  eta: 0:02:38  lr: 0.000832  min_lr: 0.000832  loss: 2.9346 (2.9366)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9661 (1.1295)  time: 0.3446  data: 0.0004  max mem: 28503
Epoch: [215]  [1000/1251]  eta: 0:01:27  lr: 0.000829  min_lr: 0.000829  loss: 3.1559 (2.9351)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0854 (1.1432)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [215]  [1200/1251]  eta: 0:00:17  lr: 0.000826  min_lr: 0.000826  loss: 3.1431 (2.9414)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0285 (1.1440)  time: 0.3567  data: 0.0004  max mem: 28503
Epoch: [215]  [1250/1251]  eta: 0:00:00  lr: 0.000825  min_lr: 0.000825  loss: 3.0429 (2.9413)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0198 (1.1433)  time: 0.2921  data: 0.0006  max mem: 28503
Epoch: [215] Total time: 0:07:16 (0.3493 s / it)
Averaged stats: lr: 0.000825  min_lr: 0.000825  loss: 3.0429 (2.9278)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0198 (1.1433)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.5367 (0.5367)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.5907  data: 5.3973  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.7738 (0.7494)  acc1: 85.6000 (85.0909)  acc5: 98.0000 (97.5636)  time: 0.7114  data: 0.5397  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9354 (0.8687)  acc1: 80.4000 (81.8095)  acc5: 96.0000 (96.4000)  time: 0.1960  data: 0.0271  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9354 (0.8783)  acc1: 79.2000 (81.2960)  acc5: 95.6000 (96.2880)  time: 0.1954  data: 0.0270  max mem: 28503
Test: Total time: 0:00:10 (0.4110 s / it)
* Acc@1 82.354 Acc@5 96.288 loss 0.863
Accuracy of the model on the 50000 test images: 82.4%
Max accuracy: 82.35%
Epoch: [216]  [   0/1251]  eta: 1:02:55  lr: 0.000825  min_lr: 0.000825  loss: 2.8021 (2.8021)  weight_decay: 0.0500 (0.0500)  time: 3.0178  data: 2.6541  max mem: 28503
Epoch: [216]  [ 200/1251]  eta: 0:06:19  lr: 0.000822  min_lr: 0.000822  loss: 3.0428 (2.9091)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0935 (1.1898)  time: 0.3459  data: 0.0005  max mem: 28503
Epoch: [216]  [ 400/1251]  eta: 0:05:01  lr: 0.000819  min_lr: 0.000819  loss: 2.9999 (2.9153)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2446 (1.2674)  time: 0.3447  data: 0.0004  max mem: 28503
Epoch: [216]  [ 600/1251]  eta: 0:03:49  lr: 0.000817  min_lr: 0.000817  loss: 2.9615 (2.9174)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1270 (1.2381)  time: 0.3455  data: 0.0005  max mem: 28503
Epoch: [216]  [ 800/1251]  eta: 0:02:38  lr: 0.000814  min_lr: 0.000814  loss: 2.6466 (2.9087)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9367 (1.2061)  time: 0.3465  data: 0.0005  max mem: 28503
Epoch: [216]  [1000/1251]  eta: 0:01:27  lr: 0.000811  min_lr: 0.000811  loss: 3.0026 (2.9054)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.3457  data: 0.0004  max mem: 28503
Epoch: [216]  [1200/1251]  eta: 0:00:17  lr: 0.000808  min_lr: 0.000808  loss: 3.1372 (2.9043)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0524 (nan)  time: 0.3539  data: 0.0004  max mem: 28503
Epoch: [216]  [1250/1251]  eta: 0:00:00  lr: 0.000807  min_lr: 0.000807  loss: 2.7278 (2.8981)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1352 (nan)  time: 0.2917  data: 0.0006  max mem: 28503
Epoch: [216] Total time: 0:07:17 (0.3497 s / it)
Averaged stats: lr: 0.000807  min_lr: 0.000807  loss: 2.7278 (2.9232)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1352 (nan)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.5256 (0.5256)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 5.4876  data: 5.2880  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 0.7302 (0.7136)  acc1: 86.4000 (85.4182)  acc5: 98.0000 (97.5273)  time: 0.7414  data: 0.5683  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9126 (0.8453)  acc1: 80.0000 (82.3429)  acc5: 96.4000 (96.3048)  time: 0.2176  data: 0.0482  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9463 (0.8584)  acc1: 79.2000 (82.0320)  acc5: 96.0000 (96.1760)  time: 0.2164  data: 0.0481  max mem: 28503
Test: Total time: 0:00:10 (0.4237 s / it)
* Acc@1 82.394 Acc@5 96.250 loss 0.842
Accuracy of the model on the 50000 test images: 82.4%
Max accuracy: 82.39%
Epoch: [217]  [   0/1251]  eta: 1:00:24  lr: 0.000807  min_lr: 0.000807  loss: 2.8395 (2.8395)  weight_decay: 0.0500 (0.0500)  time: 2.8973  data: 2.5164  max mem: 28503
Epoch: [217]  [ 200/1251]  eta: 0:06:18  lr: 0.000804  min_lr: 0.000804  loss: 2.7945 (2.8660)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0030 (1.1026)  time: 0.3477  data: 0.0006  max mem: 28503
Epoch: [217]  [ 400/1251]  eta: 0:05:01  lr: 0.000801  min_lr: 0.000801  loss: 2.6981 (2.8908)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0766 (1.1144)  time: 0.3457  data: 0.0004  max mem: 28503
Epoch: [217]  [ 600/1251]  eta: 0:03:49  lr: 0.000799  min_lr: 0.000799  loss: 2.9609 (2.9191)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9571 (1.1428)  time: 0.3555  data: 0.0005  max mem: 28503
Epoch: [217]  [ 800/1251]  eta: 0:02:38  lr: 0.000796  min_lr: 0.000796  loss: 3.1306 (2.9214)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0614 (1.1787)  time: 0.3453  data: 0.0005  max mem: 28503
Epoch: [217]  [1000/1251]  eta: 0:01:27  lr: 0.000793  min_lr: 0.000793  loss: 3.0808 (2.9198)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9366 (1.1517)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [217]  [1200/1251]  eta: 0:00:17  lr: 0.000790  min_lr: 0.000790  loss: 3.0237 (2.9232)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0596 (1.1332)  time: 0.3547  data: 0.0004  max mem: 28503
Epoch: [217]  [1250/1251]  eta: 0:00:00  lr: 0.000789  min_lr: 0.000789  loss: 3.1920 (2.9252)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1007 (1.1309)  time: 0.2917  data: 0.0006  max mem: 28503
Epoch: [217] Total time: 0:07:17 (0.3495 s / it)
Averaged stats: lr: 0.000789  min_lr: 0.000789  loss: 3.1920 (2.9220)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1007 (1.1309)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.5986 (0.5986)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.6980  data: 5.4991  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.7939 (0.7773)  acc1: 86.8000 (85.4182)  acc5: 97.6000 (97.8182)  time: 0.6716  data: 0.5003  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9891 (0.9054)  acc1: 80.0000 (82.0000)  acc5: 96.8000 (96.5143)  time: 0.1710  data: 0.0025  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0124 (0.9164)  acc1: 79.6000 (81.6320)  acc5: 96.0000 (96.3680)  time: 0.1708  data: 0.0024  max mem: 28503
Test: Total time: 0:00:09 (0.3947 s / it)
* Acc@1 82.202 Acc@5 96.304 loss 0.908
Accuracy of the model on the 50000 test images: 82.2%
Max accuracy: 82.39%
Epoch: [218]  [   0/1251]  eta: 1:07:38  lr: 0.000789  min_lr: 0.000789  loss: 2.9849 (2.9849)  weight_decay: 0.0500 (0.0500)  time: 3.2441  data: 2.2614  max mem: 28503
Epoch: [218]  [ 200/1251]  eta: 0:06:20  lr: 0.000786  min_lr: 0.000786  loss: 2.8899 (2.9054)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0580 (1.0879)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [218]  [ 400/1251]  eta: 0:05:02  lr: 0.000784  min_lr: 0.000784  loss: 2.8809 (2.8870)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9703 (1.0686)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [218]  [ 600/1251]  eta: 0:03:49  lr: 0.000781  min_lr: 0.000781  loss: 3.0187 (2.8646)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4944 (1.1086)  time: 0.3466  data: 0.0004  max mem: 28503
Epoch: [218]  [ 800/1251]  eta: 0:02:38  lr: 0.000778  min_lr: 0.000778  loss: 2.8987 (2.8832)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0617 (1.0929)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [218]  [1000/1251]  eta: 0:01:27  lr: 0.000775  min_lr: 0.000775  loss: 3.1081 (2.8925)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2689 (1.1192)  time: 0.3469  data: 0.0004  max mem: 28503
Epoch: [218]  [1200/1251]  eta: 0:00:17  lr: 0.000772  min_lr: 0.000772  loss: 3.0906 (2.8978)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9319 (1.1183)  time: 0.3533  data: 0.0004  max mem: 28503
Epoch: [218]  [1250/1251]  eta: 0:00:00  lr: 0.000772  min_lr: 0.000772  loss: 3.1641 (2.8971)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0389 (1.1153)  time: 0.2915  data: 0.0005  max mem: 28503
Epoch: [218] Total time: 0:07:17 (0.3497 s / it)
Averaged stats: lr: 0.000772  min_lr: 0.000772  loss: 3.1641 (2.9094)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0389 (1.1153)
Test:  [ 0/25]  eta: 0:01:24  loss: 0.6294 (0.6294)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 3.3719  data: 3.1698  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 0.8299 (0.8267)  acc1: 85.6000 (85.5273)  acc5: 97.6000 (97.3818)  time: 0.6247  data: 0.4508  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0131 (0.9466)  acc1: 80.4000 (82.2095)  acc5: 96.0000 (96.3048)  time: 0.3017  data: 0.1320  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0131 (0.9564)  acc1: 80.4000 (81.8240)  acc5: 95.6000 (96.1280)  time: 0.2199  data: 0.0516  max mem: 28503
Test: Total time: 0:00:10 (0.4115 s / it)
* Acc@1 82.332 Acc@5 96.316 loss 0.943
Accuracy of the model on the 50000 test images: 82.3%
Max accuracy: 82.39%
Epoch: [219]  [   0/1251]  eta: 1:11:55  lr: 0.000771  min_lr: 0.000771  loss: 2.6535 (2.6535)  weight_decay: 0.0500 (0.0500)  time: 3.4496  data: 2.7914  max mem: 28503
Epoch: [219]  [ 200/1251]  eta: 0:06:24  lr: 0.000769  min_lr: 0.000769  loss: 3.1127 (2.9052)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2350 (1.2861)  time: 0.3539  data: 0.0004  max mem: 28503
Epoch: [219]  [ 400/1251]  eta: 0:05:03  lr: 0.000766  min_lr: 0.000766  loss: 3.0321 (2.9069)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0185 (1.2285)  time: 0.3565  data: 0.0004  max mem: 28503
Epoch: [219]  [ 600/1251]  eta: 0:03:50  lr: 0.000763  min_lr: 0.000763  loss: 2.8401 (2.9055)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0160 (1.2109)  time: 0.3478  data: 0.0004  max mem: 28503
Epoch: [219]  [ 800/1251]  eta: 0:02:38  lr: 0.000760  min_lr: 0.000760  loss: 2.9688 (2.9038)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0730 (1.2035)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [219]  [1000/1251]  eta: 0:01:28  lr: 0.000757  min_lr: 0.000757  loss: 2.8924 (2.9044)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0958 (1.1993)  time: 0.3470  data: 0.0004  max mem: 28503
Epoch: [219]  [1200/1251]  eta: 0:00:17  lr: 0.000755  min_lr: 0.000755  loss: 2.9527 (2.9031)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9957 (1.1799)  time: 0.3531  data: 0.0004  max mem: 28503
Epoch: [219]  [1250/1251]  eta: 0:00:00  lr: 0.000754  min_lr: 0.000754  loss: 2.8522 (2.9055)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9436 (1.1718)  time: 0.2916  data: 0.0007  max mem: 28503
Epoch: [219] Total time: 0:07:18 (0.3502 s / it)
Averaged stats: lr: 0.000754  min_lr: 0.000754  loss: 2.8522 (2.9039)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9436 (1.1718)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.5635 (0.5635)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 5.7354  data: 5.5387  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 0.7616 (0.7330)  acc1: 85.2000 (85.0909)  acc5: 97.6000 (97.6364)  time: 0.7391  data: 0.5637  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9176 (0.8618)  acc1: 79.2000 (81.7333)  acc5: 95.6000 (96.2286)  time: 0.2101  data: 0.0331  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9385 (0.8686)  acc1: 80.0000 (81.6640)  acc5: 95.6000 (96.1440)  time: 0.2097  data: 0.0330  max mem: 28503
Test: Total time: 0:00:10 (0.4284 s / it)
* Acc@1 82.330 Acc@5 96.368 loss 0.854
Accuracy of the model on the 50000 test images: 82.3%
Max accuracy: 82.39%
Epoch: [220]  [   0/1251]  eta: 1:11:57  lr: 0.000754  min_lr: 0.000754  loss: 2.7062 (2.7062)  weight_decay: 0.0500 (0.0500)  time: 3.4513  data: 2.5199  max mem: 28503
Epoch: [220]  [ 200/1251]  eta: 0:06:23  lr: 0.000751  min_lr: 0.000751  loss: 3.0710 (2.8748)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0929 (1.1377)  time: 0.3584  data: 0.0004  max mem: 28503
Epoch: [220]  [ 400/1251]  eta: 0:05:03  lr: 0.000748  min_lr: 0.000748  loss: 3.0029 (2.8959)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9121 (1.0792)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [220]  [ 600/1251]  eta: 0:03:50  lr: 0.000745  min_lr: 0.000745  loss: 2.8801 (2.8890)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0426 (1.0825)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [220]  [ 800/1251]  eta: 0:02:39  lr: 0.000743  min_lr: 0.000743  loss: 2.9990 (2.8900)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0289 (1.0793)  time: 0.3471  data: 0.0004  max mem: 28503
Epoch: [220]  [1000/1251]  eta: 0:01:28  lr: 0.000740  min_lr: 0.000740  loss: 2.8180 (2.8939)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2449 (1.1061)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [220]  [1200/1251]  eta: 0:00:17  lr: 0.000737  min_lr: 0.000737  loss: 2.9202 (2.8932)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9612 (1.0947)  time: 0.3593  data: 0.0004  max mem: 28503
Epoch: [220]  [1250/1251]  eta: 0:00:00  lr: 0.000736  min_lr: 0.000736  loss: 2.8915 (2.8922)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9544 (1.0943)  time: 0.2918  data: 0.0005  max mem: 28503
Epoch: [220] Total time: 0:07:18 (0.3508 s / it)
Averaged stats: lr: 0.000736  min_lr: 0.000736  loss: 2.8915 (2.9028)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9544 (1.0943)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6018 (0.6018)  acc1: 89.6000 (89.6000)  acc5: 99.2000 (99.2000)  time: 5.5126  data: 5.3114  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.7554 (0.7662)  acc1: 84.4000 (85.1273)  acc5: 97.6000 (97.6364)  time: 0.6763  data: 0.5047  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9069 (0.8888)  acc1: 80.0000 (81.9810)  acc5: 96.8000 (96.3048)  time: 0.1806  data: 0.0121  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9131 (0.9006)  acc1: 80.0000 (81.6640)  acc5: 95.6000 (96.2240)  time: 0.1804  data: 0.0120  max mem: 28503
Test: Total time: 0:00:09 (0.3986 s / it)
* Acc@1 82.568 Acc@5 96.382 loss 0.883
Accuracy of the model on the 50000 test images: 82.6%
Max accuracy: 82.57%
Epoch: [221]  [   0/1251]  eta: 1:07:35  lr: 0.000736  min_lr: 0.000736  loss: 3.1344 (3.1344)  weight_decay: 0.0500 (0.0500)  time: 3.2416  data: 2.8460  max mem: 28503
Epoch: [221]  [ 200/1251]  eta: 0:06:22  lr: 0.000734  min_lr: 0.000734  loss: 3.0197 (2.9167)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1068 (1.2283)  time: 0.3541  data: 0.0004  max mem: 28503
Epoch: [221]  [ 400/1251]  eta: 0:05:02  lr: 0.000731  min_lr: 0.000731  loss: 3.0467 (2.9237)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2273 (1.1884)  time: 0.3564  data: 0.0004  max mem: 28503
Epoch: [221]  [ 600/1251]  eta: 0:03:49  lr: 0.000728  min_lr: 0.000728  loss: 2.7999 (2.8996)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3522 (1.2246)  time: 0.3468  data: 0.0004  max mem: 28503
Epoch: [221]  [ 800/1251]  eta: 0:02:38  lr: 0.000725  min_lr: 0.000725  loss: 3.1307 (2.9021)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0845 (1.1864)  time: 0.3466  data: 0.0004  max mem: 28503
Epoch: [221]  [1000/1251]  eta: 0:01:28  lr: 0.000722  min_lr: 0.000722  loss: 3.1521 (2.9070)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1277 (1.1879)  time: 0.3466  data: 0.0004  max mem: 28503
Epoch: [221]  [1200/1251]  eta: 0:00:17  lr: 0.000720  min_lr: 0.000720  loss: 2.9358 (2.9004)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0987 (1.1830)  time: 0.3464  data: 0.0004  max mem: 28503
Epoch: [221]  [1250/1251]  eta: 0:00:00  lr: 0.000719  min_lr: 0.000719  loss: 2.9313 (2.9022)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1243 (1.1818)  time: 0.2926  data: 0.0005  max mem: 28503
Epoch: [221] Total time: 0:07:18 (0.3502 s / it)
Averaged stats: lr: 0.000719  min_lr: 0.000719  loss: 2.9313 (2.8986)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1243 (1.1818)
Test:  [ 0/25]  eta: 0:01:33  loss: 0.5872 (0.5872)  acc1: 89.6000 (89.6000)  acc5: 99.2000 (99.2000)  time: 3.7584  data: 3.5559  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 0.7860 (0.7699)  acc1: 86.4000 (85.3818)  acc5: 97.6000 (97.6000)  time: 0.6587  data: 0.4739  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9424 (0.9012)  acc1: 80.0000 (82.0952)  acc5: 96.0000 (96.1905)  time: 0.2739  data: 0.0982  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0009 (0.9143)  acc1: 79.6000 (81.7280)  acc5: 95.6000 (96.1120)  time: 0.2187  data: 0.0472  max mem: 28503
Test: Total time: 0:00:09 (0.3995 s / it)
* Acc@1 82.586 Acc@5 96.430 loss 0.894
Accuracy of the model on the 50000 test images: 82.6%
Max accuracy: 82.59%
Epoch: [222]  [   0/1251]  eta: 1:00:00  lr: 0.000719  min_lr: 0.000719  loss: 3.2444 (3.2444)  weight_decay: 0.0500 (0.0500)  time: 2.8782  data: 2.5206  max mem: 28503
Epoch: [222]  [ 200/1251]  eta: 0:06:20  lr: 0.000716  min_lr: 0.000716  loss: 2.9069 (2.9052)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2335 (1.2377)  time: 0.3457  data: 0.0004  max mem: 28503
Epoch: [222]  [ 400/1251]  eta: 0:05:01  lr: 0.000714  min_lr: 0.000714  loss: 2.8718 (2.8748)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0461 (1.2726)  time: 0.3448  data: 0.0004  max mem: 28503
Epoch: [222]  [ 600/1251]  eta: 0:03:49  lr: 0.000711  min_lr: 0.000711  loss: 2.5367 (2.8703)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8781 (1.2158)  time: 0.3631  data: 0.0004  max mem: 28503
Epoch: [222]  [ 800/1251]  eta: 0:02:38  lr: 0.000708  min_lr: 0.000708  loss: 3.0032 (2.8742)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0923 (1.2016)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [222]  [1000/1251]  eta: 0:01:27  lr: 0.000705  min_lr: 0.000705  loss: 2.8521 (2.8731)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9896 (1.1819)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [222]  [1200/1251]  eta: 0:00:17  lr: 0.000703  min_lr: 0.000703  loss: 2.8298 (2.8704)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3317 (1.2097)  time: 0.3569  data: 0.0005  max mem: 28503
Epoch: [222]  [1250/1251]  eta: 0:00:00  lr: 0.000702  min_lr: 0.000702  loss: 2.9091 (2.8703)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1103 (1.2066)  time: 0.2920  data: 0.0007  max mem: 28503
Epoch: [222] Total time: 0:07:17 (0.3497 s / it)
Averaged stats: lr: 0.000702  min_lr: 0.000702  loss: 2.9091 (2.8888)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1103 (1.2066)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.5270 (0.5270)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 5.4277  data: 5.2041  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.7625 (0.7362)  acc1: 85.6000 (85.2000)  acc5: 97.6000 (97.6364)  time: 0.6867  data: 0.5124  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9390 (0.8636)  acc1: 80.8000 (82.5524)  acc5: 96.0000 (96.2667)  time: 0.1906  data: 0.0217  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9390 (0.8773)  acc1: 80.8000 (82.2240)  acc5: 95.6000 (96.2080)  time: 0.1900  data: 0.0216  max mem: 28503
Test: Total time: 0:00:10 (0.4009 s / it)
* Acc@1 82.636 Acc@5 96.410 loss 0.858
Accuracy of the model on the 50000 test images: 82.6%
Max accuracy: 82.64%
Epoch: [223]  [   0/1251]  eta: 1:00:40  lr: 0.000702  min_lr: 0.000702  loss: 3.1378 (3.1378)  weight_decay: 0.0500 (0.0500)  time: 2.9100  data: 2.5053  max mem: 28503
Epoch: [223]  [ 200/1251]  eta: 0:06:17  lr: 0.000699  min_lr: 0.000699  loss: 2.8329 (2.8185)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0221 (1.1591)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [223]  [ 400/1251]  eta: 0:05:00  lr: 0.000696  min_lr: 0.000696  loss: 3.0128 (2.8641)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1058 (1.1728)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [223]  [ 600/1251]  eta: 0:03:49  lr: 0.000694  min_lr: 0.000694  loss: 2.7863 (2.8531)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3919 (1.2326)  time: 0.3593  data: 0.0004  max mem: 28503
Epoch: [223]  [ 800/1251]  eta: 0:02:38  lr: 0.000691  min_lr: 0.000691  loss: 2.9865 (2.8592)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1277 (1.2144)  time: 0.3470  data: 0.0004  max mem: 28503
Epoch: [223]  [1000/1251]  eta: 0:01:27  lr: 0.000688  min_lr: 0.000688  loss: 2.9327 (2.8648)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0905 (1.1931)  time: 0.3472  data: 0.0004  max mem: 28503
Epoch: [223]  [1200/1251]  eta: 0:00:17  lr: 0.000686  min_lr: 0.000686  loss: 3.0845 (2.8828)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1200 (1.2040)  time: 0.3463  data: 0.0005  max mem: 28503
Epoch: [223]  [1250/1251]  eta: 0:00:00  lr: 0.000685  min_lr: 0.000685  loss: 3.1243 (2.8848)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0594 (1.2010)  time: 0.2918  data: 0.0007  max mem: 28503
Epoch: [223] Total time: 0:07:17 (0.3500 s / it)
Averaged stats: lr: 0.000685  min_lr: 0.000685  loss: 3.1243 (2.8916)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0594 (1.2010)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6318 (0.6318)  acc1: 90.8000 (90.8000)  acc5: 99.6000 (99.6000)  time: 5.5156  data: 5.3169  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 0.8239 (0.8189)  acc1: 86.4000 (85.3091)  acc5: 98.0000 (97.9273)  time: 0.6557  data: 0.4837  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9987 (0.9465)  acc1: 80.0000 (82.2667)  acc5: 96.4000 (96.4952)  time: 0.1691  data: 0.0002  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0552 (0.9582)  acc1: 79.6000 (81.9040)  acc5: 95.6000 (96.3840)  time: 0.1686  data: 0.0002  max mem: 28503
Test: Total time: 0:00:09 (0.3864 s / it)
* Acc@1 82.508 Acc@5 96.376 loss 0.943
Accuracy of the model on the 50000 test images: 82.5%
Max accuracy: 82.64%
Epoch: [224]  [   0/1251]  eta: 1:09:13  lr: 0.000685  min_lr: 0.000685  loss: 3.4169 (3.4169)  weight_decay: 0.0500 (0.0500)  time: 3.3203  data: 1.9115  max mem: 28503
Epoch: [224]  [ 200/1251]  eta: 0:06:23  lr: 0.000682  min_lr: 0.000682  loss: 2.9788 (2.9123)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2329 (1.1750)  time: 0.3479  data: 0.0004  max mem: 28503
Epoch: [224]  [ 400/1251]  eta: 0:05:02  lr: 0.000680  min_lr: 0.000680  loss: 3.0329 (2.8744)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0961 (1.1724)  time: 0.3461  data: 0.0004  max mem: 28503
Epoch: [224]  [ 600/1251]  eta: 0:03:49  lr: 0.000677  min_lr: 0.000677  loss: 2.9327 (2.8785)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1443 (1.2030)  time: 0.3448  data: 0.0004  max mem: 28503
Epoch: [224]  [ 800/1251]  eta: 0:02:38  lr: 0.000674  min_lr: 0.000674  loss: 3.0533 (2.8885)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0785 (1.1613)  time: 0.3444  data: 0.0005  max mem: 28503
Epoch: [224]  [1000/1251]  eta: 0:01:28  lr: 0.000671  min_lr: 0.000671  loss: 3.1430 (2.8833)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0906 (1.1628)  time: 0.3462  data: 0.0004  max mem: 28503
Epoch: [224]  [1200/1251]  eta: 0:00:17  lr: 0.000669  min_lr: 0.000669  loss: 2.7312 (2.8883)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2212 (1.1877)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [224]  [1250/1251]  eta: 0:00:00  lr: 0.000668  min_lr: 0.000668  loss: 2.9303 (2.8899)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4338 (1.1980)  time: 0.2918  data: 0.0007  max mem: 28503
Epoch: [224] Total time: 0:07:17 (0.3497 s / it)
Averaged stats: lr: 0.000668  min_lr: 0.000668  loss: 2.9303 (2.8794)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4338 (1.1980)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.5838 (0.5838)  acc1: 90.8000 (90.8000)  acc5: 99.6000 (99.6000)  time: 5.5010  data: 5.2978  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8204 (0.7859)  acc1: 86.4000 (85.6364)  acc5: 97.6000 (97.7818)  time: 0.6887  data: 0.5168  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9531 (0.9028)  acc1: 80.4000 (82.7238)  acc5: 96.4000 (96.5714)  time: 0.1995  data: 0.0303  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9531 (0.9148)  acc1: 80.4000 (82.2720)  acc5: 96.0000 (96.4320)  time: 0.2001  data: 0.0302  max mem: 28503
Test: Total time: 0:00:10 (0.4106 s / it)
* Acc@1 82.502 Acc@5 96.354 loss 0.911
Accuracy of the model on the 50000 test images: 82.5%
Max accuracy: 82.64%
Epoch: [225]  [   0/1251]  eta: 1:06:43  lr: 0.000668  min_lr: 0.000668  loss: 2.1939 (2.1939)  weight_decay: 0.0500 (0.0500)  time: 3.2006  data: 2.7484  max mem: 28503
Epoch: [225]  [ 200/1251]  eta: 0:06:21  lr: 0.000665  min_lr: 0.000665  loss: 2.6675 (2.8515)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2797 (1.2219)  time: 0.3552  data: 0.0005  max mem: 28503
Epoch: [225]  [ 400/1251]  eta: 0:05:02  lr: 0.000663  min_lr: 0.000663  loss: 3.0575 (2.8630)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9640 (1.2017)  time: 0.3443  data: 0.0004  max mem: 28503
Epoch: [225]  [ 600/1251]  eta: 0:03:49  lr: 0.000660  min_lr: 0.000660  loss: 3.1403 (2.8625)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1328 (1.1702)  time: 0.3442  data: 0.0004  max mem: 28503
Epoch: [225]  [ 800/1251]  eta: 0:02:38  lr: 0.000657  min_lr: 0.000657  loss: 2.8522 (2.8523)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0107 (1.1739)  time: 0.3552  data: 0.0004  max mem: 28503
Epoch: [225]  [1000/1251]  eta: 0:01:27  lr: 0.000655  min_lr: 0.000655  loss: 2.7924 (2.8440)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1024 (1.1758)  time: 0.3518  data: 0.0005  max mem: 28503
Epoch: [225]  [1200/1251]  eta: 0:00:17  lr: 0.000652  min_lr: 0.000652  loss: 2.9467 (2.8462)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3557 (1.2183)  time: 0.3461  data: 0.0004  max mem: 28503
Epoch: [225]  [1250/1251]  eta: 0:00:00  lr: 0.000652  min_lr: 0.000652  loss: 2.8550 (2.8464)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4112 (1.2260)  time: 0.2919  data: 0.0007  max mem: 28503
Epoch: [225] Total time: 0:07:17 (0.3495 s / it)
Averaged stats: lr: 0.000652  min_lr: 0.000652  loss: 2.8550 (2.8786)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4112 (1.2260)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.5255 (0.5255)  acc1: 90.8000 (90.8000)  acc5: 99.6000 (99.6000)  time: 5.6893  data: 5.4843  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.7658 (0.7267)  acc1: 86.8000 (85.3818)  acc5: 97.6000 (97.6364)  time: 0.6786  data: 0.5054  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.8806 (0.8581)  acc1: 80.8000 (82.4952)  acc5: 96.4000 (96.2857)  time: 0.1931  data: 0.0238  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9342 (0.8693)  acc1: 80.0000 (82.1920)  acc5: 96.0000 (96.3040)  time: 0.1927  data: 0.0242  max mem: 28503
Test: Total time: 0:00:10 (0.4130 s / it)
* Acc@1 82.622 Acc@5 96.434 loss 0.860
Accuracy of the model on the 50000 test images: 82.6%
Max accuracy: 82.64%
Epoch: [226]  [   0/1251]  eta: 1:11:12  lr: 0.000651  min_lr: 0.000651  loss: 3.0606 (3.0606)  weight_decay: 0.0500 (0.0500)  time: 3.4150  data: 1.6124  max mem: 28503
Epoch: [226]  [ 200/1251]  eta: 0:06:21  lr: 0.000649  min_lr: 0.000649  loss: 2.8552 (2.8323)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1117 (1.2547)  time: 0.3476  data: 0.0004  max mem: 28503
Epoch: [226]  [ 400/1251]  eta: 0:05:03  lr: 0.000646  min_lr: 0.000646  loss: 3.0542 (2.8568)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1397 (1.2225)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [226]  [ 600/1251]  eta: 0:03:49  lr: 0.000644  min_lr: 0.000644  loss: 3.0272 (2.8726)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0415 (nan)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [226]  [ 800/1251]  eta: 0:02:38  lr: 0.000641  min_lr: 0.000641  loss: 3.0863 (2.8700)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1336 (nan)  time: 0.3482  data: 0.0004  max mem: 28503
Epoch: [226]  [1000/1251]  eta: 0:01:28  lr: 0.000638  min_lr: 0.000638  loss: 3.0405 (2.8730)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3319 (nan)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [226]  [1200/1251]  eta: 0:00:17  lr: 0.000636  min_lr: 0.000636  loss: 2.9485 (2.8666)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9878 (nan)  time: 0.3542  data: 0.0005  max mem: 28503
Epoch: [226]  [1250/1251]  eta: 0:00:00  lr: 0.000635  min_lr: 0.000635  loss: 3.1208 (2.8666)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0631 (nan)  time: 0.2916  data: 0.0007  max mem: 28503
Epoch: [226] Total time: 0:07:18 (0.3502 s / it)
Averaged stats: lr: 0.000635  min_lr: 0.000635  loss: 3.1208 (2.8792)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0631 (nan)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.6588 (0.6588)  acc1: 91.2000 (91.2000)  acc5: 98.4000 (98.4000)  time: 5.2864  data: 5.0852  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8289 (0.8267)  acc1: 84.8000 (85.9636)  acc5: 98.0000 (97.6364)  time: 0.6724  data: 0.5007  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9699 (0.9632)  acc1: 80.8000 (82.4762)  acc5: 95.2000 (96.2286)  time: 0.1898  data: 0.0212  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0463 (0.9744)  acc1: 79.6000 (82.0480)  acc5: 95.2000 (96.1120)  time: 0.1895  data: 0.0211  max mem: 28503
Test: Total time: 0:00:09 (0.3940 s / it)
* Acc@1 82.562 Acc@5 96.326 loss 0.962
Accuracy of the model on the 50000 test images: 82.6%
Max accuracy: 82.64%
Epoch: [227]  [   0/1251]  eta: 1:12:03  lr: 0.000635  min_lr: 0.000635  loss: 2.3815 (2.3815)  weight_decay: 0.0500 (0.0500)  time: 3.4559  data: 2.9123  max mem: 28503
Epoch: [227]  [ 200/1251]  eta: 0:06:21  lr: 0.000632  min_lr: 0.000632  loss: 3.1588 (2.9107)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2184 (1.2963)  time: 0.3457  data: 0.0004  max mem: 28503
Epoch: [227]  [ 400/1251]  eta: 0:05:02  lr: 0.000630  min_lr: 0.000630  loss: 2.8948 (2.9110)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0587 (1.2820)  time: 0.3564  data: 0.0004  max mem: 28503
Epoch: [227]  [ 600/1251]  eta: 0:03:49  lr: 0.000627  min_lr: 0.000627  loss: 2.8607 (2.8987)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0729 (1.2420)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [227]  [ 800/1251]  eta: 0:02:38  lr: 0.000625  min_lr: 0.000625  loss: 3.0823 (2.9029)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2089 (1.2400)  time: 0.3469  data: 0.0005  max mem: 28503
Epoch: [227]  [1000/1251]  eta: 0:01:28  lr: 0.000622  min_lr: 0.000622  loss: 2.8411 (2.8949)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1150 (1.2616)  time: 0.3651  data: 0.0004  max mem: 28503
Epoch: [227]  [1200/1251]  eta: 0:00:17  lr: 0.000619  min_lr: 0.000619  loss: 3.0598 (2.8912)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0673 (1.2384)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [227]  [1250/1251]  eta: 0:00:00  lr: 0.000619  min_lr: 0.000619  loss: 2.9883 (2.8896)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0888 (1.2356)  time: 0.2915  data: 0.0007  max mem: 28503
Epoch: [227] Total time: 0:07:17 (0.3498 s / it)
Averaged stats: lr: 0.000619  min_lr: 0.000619  loss: 2.9883 (2.8792)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0888 (1.2356)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.5734 (0.5734)  acc1: 91.6000 (91.6000)  acc5: 99.2000 (99.2000)  time: 5.6479  data: 5.4454  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 0.7827 (0.7547)  acc1: 86.8000 (86.2182)  acc5: 98.0000 (97.7091)  time: 0.7448  data: 0.5731  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9193 (0.8819)  acc1: 80.4000 (82.7619)  acc5: 96.4000 (96.5143)  time: 0.2115  data: 0.0430  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9505 (0.8954)  acc1: 80.0000 (82.3520)  acc5: 96.0000 (96.4320)  time: 0.2113  data: 0.0429  max mem: 28503
Test: Total time: 0:00:10 (0.4254 s / it)
* Acc@1 82.728 Acc@5 96.450 loss 0.883
Accuracy of the model on the 50000 test images: 82.7%
Max accuracy: 82.73%
Epoch: [228]  [   0/1251]  eta: 1:06:28  lr: 0.000619  min_lr: 0.000619  loss: 3.1935 (3.1935)  weight_decay: 0.0500 (0.0500)  time: 3.1880  data: 2.8127  max mem: 28503
Epoch: [228]  [ 200/1251]  eta: 0:06:22  lr: 0.000616  min_lr: 0.000616  loss: 2.9899 (2.8857)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0829 (1.2756)  time: 0.3550  data: 0.0005  max mem: 28503
Epoch: [228]  [ 400/1251]  eta: 0:05:03  lr: 0.000614  min_lr: 0.000614  loss: 2.5942 (2.8632)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2890 (1.3369)  time: 0.3513  data: 0.0005  max mem: 28503
Epoch: [228]  [ 600/1251]  eta: 0:03:50  lr: 0.000611  min_lr: 0.000611  loss: 3.0232 (2.8600)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4302 (1.3719)  time: 0.3511  data: 0.0005  max mem: 28503
Epoch: [228]  [ 800/1251]  eta: 0:02:38  lr: 0.000608  min_lr: 0.000608  loss: 3.0273 (2.8638)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1620 (1.3245)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [228]  [1000/1251]  eta: 0:01:28  lr: 0.000606  min_lr: 0.000606  loss: 2.9266 (2.8688)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2689 (1.3092)  time: 0.3460  data: 0.0004  max mem: 28503
Epoch: [228]  [1200/1251]  eta: 0:00:17  lr: 0.000603  min_lr: 0.000603  loss: 3.0849 (2.8674)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0518 (1.2901)  time: 0.3472  data: 0.0005  max mem: 28503
Epoch: [228]  [1250/1251]  eta: 0:00:00  lr: 0.000603  min_lr: 0.000603  loss: 3.0320 (2.8672)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0772 (1.2819)  time: 0.2977  data: 0.0006  max mem: 28503
Epoch: [228] Total time: 0:07:18 (0.3506 s / it)
Averaged stats: lr: 0.000603  min_lr: 0.000603  loss: 3.0320 (2.8675)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0772 (1.2819)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.5984 (0.5984)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.5007  data: 5.2935  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.7779 (0.7490)  acc1: 85.6000 (85.6000)  acc5: 98.0000 (97.7091)  time: 0.7276  data: 0.5530  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.8911 (0.8762)  acc1: 80.0000 (82.4381)  acc5: 96.4000 (96.4381)  time: 0.2114  data: 0.0395  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9444 (0.8899)  acc1: 80.0000 (82.0320)  acc5: 96.0000 (96.3200)  time: 0.2102  data: 0.0394  max mem: 28503
Test: Total time: 0:00:10 (0.4194 s / it)
* Acc@1 82.688 Acc@5 96.494 loss 0.879
Accuracy of the model on the 50000 test images: 82.7%
Max accuracy: 82.73%
Epoch: [229]  [   0/1251]  eta: 1:12:20  lr: 0.000603  min_lr: 0.000603  loss: 2.3462 (2.3462)  weight_decay: 0.0500 (0.0500)  time: 3.4694  data: 2.3905  max mem: 28503
Epoch: [229]  [ 200/1251]  eta: 0:06:20  lr: 0.000600  min_lr: 0.000600  loss: 2.9654 (2.8816)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0569 (1.1054)  time: 0.3558  data: 0.0004  max mem: 28503
Epoch: [229]  [ 400/1251]  eta: 0:05:01  lr: 0.000597  min_lr: 0.000597  loss: 3.0380 (2.8724)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0696 (1.1441)  time: 0.3483  data: 0.0004  max mem: 28503
Epoch: [229]  [ 600/1251]  eta: 0:03:49  lr: 0.000595  min_lr: 0.000595  loss: 2.9920 (2.8768)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3242 (1.1858)  time: 0.3529  data: 0.0004  max mem: 28503
Epoch: [229]  [ 800/1251]  eta: 0:02:38  lr: 0.000592  min_lr: 0.000592  loss: 2.8181 (2.8650)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2441 (1.2500)  time: 0.3559  data: 0.0004  max mem: 28503
Epoch: [229]  [1000/1251]  eta: 0:01:27  lr: 0.000590  min_lr: 0.000590  loss: 2.9832 (2.8647)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0903 (1.2361)  time: 0.3458  data: 0.0005  max mem: 28503
Epoch: [229]  [1200/1251]  eta: 0:00:17  lr: 0.000587  min_lr: 0.000587  loss: 2.9084 (2.8567)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2124 (1.2286)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [229]  [1250/1251]  eta: 0:00:00  lr: 0.000587  min_lr: 0.000587  loss: 2.9921 (2.8606)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0437 (1.2236)  time: 0.2914  data: 0.0007  max mem: 28503
Epoch: [229] Total time: 0:07:17 (0.3496 s / it)
Averaged stats: lr: 0.000587  min_lr: 0.000587  loss: 2.9921 (2.8620)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0437 (1.2236)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.6269 (0.6269)  acc1: 91.2000 (91.2000)  acc5: 99.6000 (99.6000)  time: 5.4597  data: 5.2697  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8535 (0.8107)  acc1: 87.2000 (86.1091)  acc5: 97.6000 (97.7091)  time: 0.6713  data: 0.5003  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9866 (0.9367)  acc1: 80.4000 (82.8952)  acc5: 96.4000 (96.5143)  time: 0.1821  data: 0.0133  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0021 (0.9474)  acc1: 80.4000 (82.5120)  acc5: 95.6000 (96.3840)  time: 0.1816  data: 0.0132  max mem: 28503
Test: Total time: 0:00:10 (0.4023 s / it)
* Acc@1 82.724 Acc@5 96.464 loss 0.937
Accuracy of the model on the 50000 test images: 82.7%
Max accuracy: 82.73%
Epoch: [230]  [   0/1251]  eta: 1:10:17  lr: 0.000587  min_lr: 0.000587  loss: 3.5170 (3.5170)  weight_decay: 0.0500 (0.0500)  time: 3.3712  data: 2.8848  max mem: 28503
Epoch: [230]  [ 200/1251]  eta: 0:06:23  lr: 0.000584  min_lr: 0.000584  loss: 2.7785 (2.8090)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0094 (1.0852)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [230]  [ 400/1251]  eta: 0:05:02  lr: 0.000582  min_lr: 0.000582  loss: 2.9448 (2.8034)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0631 (1.1532)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [230]  [ 600/1251]  eta: 0:03:50  lr: 0.000579  min_lr: 0.000579  loss: 3.0346 (2.8258)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0553 (1.1653)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [230]  [ 800/1251]  eta: 0:02:38  lr: 0.000577  min_lr: 0.000577  loss: 2.9746 (2.8299)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0811 (1.1752)  time: 0.3468  data: 0.0004  max mem: 28503
Epoch: [230]  [1000/1251]  eta: 0:01:28  lr: 0.000574  min_lr: 0.000574  loss: 2.8978 (2.8218)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6259 (1.2457)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [230]  [1200/1251]  eta: 0:00:17  lr: 0.000571  min_lr: 0.000571  loss: 3.0185 (2.8228)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1061 (1.2692)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [230]  [1250/1251]  eta: 0:00:00  lr: 0.000571  min_lr: 0.000571  loss: 3.1577 (2.8264)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0907 (1.2643)  time: 0.2922  data: 0.0007  max mem: 28503
Epoch: [230] Total time: 0:07:17 (0.3500 s / it)
Averaged stats: lr: 0.000571  min_lr: 0.000571  loss: 3.1577 (2.8570)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0907 (1.2643)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6444 (0.6444)  acc1: 89.2000 (89.2000)  acc5: 98.8000 (98.8000)  time: 5.5453  data: 5.3228  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8604 (0.8230)  acc1: 86.0000 (85.0182)  acc5: 97.6000 (97.4909)  time: 0.6695  data: 0.4939  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9761 (0.9455)  acc1: 80.8000 (82.3810)  acc5: 96.0000 (96.3429)  time: 0.1781  data: 0.0082  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0152 (0.9594)  acc1: 80.8000 (82.0800)  acc5: 95.6000 (96.1600)  time: 0.1839  data: 0.0147  max mem: 28503
Test: Total time: 0:00:10 (0.4015 s / it)
* Acc@1 82.702 Acc@5 96.374 loss 0.946
Accuracy of the model on the 50000 test images: 82.7%
Max accuracy: 82.73%
Epoch: [231]  [   0/1251]  eta: 1:04:00  lr: 0.000571  min_lr: 0.000571  loss: 3.0782 (3.0782)  weight_decay: 0.0500 (0.0500)  time: 3.0698  data: 2.5424  max mem: 28503
Epoch: [231]  [ 200/1251]  eta: 0:06:19  lr: 0.000568  min_lr: 0.000568  loss: 2.9814 (2.8451)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9274 (1.0570)  time: 0.3563  data: 0.0004  max mem: 28503
Epoch: [231]  [ 400/1251]  eta: 0:05:01  lr: 0.000566  min_lr: 0.000566  loss: 2.8803 (2.8163)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1014 (1.1531)  time: 0.3568  data: 0.0004  max mem: 28503
Epoch: [231]  [ 600/1251]  eta: 0:03:49  lr: 0.000563  min_lr: 0.000563  loss: 3.0102 (2.8323)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1678 (1.1582)  time: 0.3457  data: 0.0004  max mem: 28503
Epoch: [231]  [ 800/1251]  eta: 0:02:38  lr: 0.000561  min_lr: 0.000561  loss: 2.9145 (2.8494)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0274 (1.2033)  time: 0.3442  data: 0.0004  max mem: 28503
Epoch: [231]  [1000/1251]  eta: 0:01:27  lr: 0.000558  min_lr: 0.000558  loss: 2.7903 (2.8501)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2043 (1.2072)  time: 0.3441  data: 0.0004  max mem: 28503
Epoch: [231]  [1200/1251]  eta: 0:00:17  lr: 0.000556  min_lr: 0.000556  loss: 2.8632 (2.8531)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0884 (1.2262)  time: 0.3444  data: 0.0004  max mem: 28503
Epoch: [231]  [1250/1251]  eta: 0:00:00  lr: 0.000555  min_lr: 0.000555  loss: 2.7711 (2.8533)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1534 (1.2291)  time: 0.2914  data: 0.0007  max mem: 28503
Epoch: [231] Total time: 0:07:16 (0.3488 s / it)
Averaged stats: lr: 0.000555  min_lr: 0.000555  loss: 2.7711 (2.8520)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1534 (1.2291)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.5579 (0.5579)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 5.5928  data: 5.3775  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 0.7879 (0.7470)  acc1: 87.2000 (85.6364)  acc5: 97.6000 (97.6000)  time: 0.6619  data: 0.4892  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9074 (0.8679)  acc1: 81.2000 (82.6476)  acc5: 96.0000 (96.3238)  time: 0.1806  data: 0.0112  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9501 (0.8807)  acc1: 81.2000 (82.2880)  acc5: 96.0000 (96.2240)  time: 0.1856  data: 0.0163  max mem: 28503
Test: Total time: 0:00:10 (0.4062 s / it)
* Acc@1 82.890 Acc@5 96.504 loss 0.864
Accuracy of the model on the 50000 test images: 82.9%
Max accuracy: 82.89%
Epoch: [232]  [   0/1251]  eta: 1:00:25  lr: 0.000555  min_lr: 0.000555  loss: 3.1769 (3.1769)  weight_decay: 0.0500 (0.0500)  time: 2.8982  data: 2.4445  max mem: 28503
Epoch: [232]  [ 200/1251]  eta: 0:06:18  lr: 0.000553  min_lr: 0.000553  loss: 3.0200 (2.8572)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0808 (1.1642)  time: 0.3463  data: 0.0004  max mem: 28503
Epoch: [232]  [ 400/1251]  eta: 0:05:01  lr: 0.000550  min_lr: 0.000550  loss: 2.9957 (2.8330)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0451 (1.1689)  time: 0.3488  data: 0.0004  max mem: 28503
Epoch: [232]  [ 600/1251]  eta: 0:03:49  lr: 0.000548  min_lr: 0.000548  loss: 3.1059 (2.8259)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1532 (1.1776)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [232]  [ 800/1251]  eta: 0:02:38  lr: 0.000545  min_lr: 0.000545  loss: 2.8598 (2.8409)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2307 (1.2078)  time: 0.3587  data: 0.0004  max mem: 28503
Epoch: [232]  [1000/1251]  eta: 0:01:27  lr: 0.000543  min_lr: 0.000543  loss: 2.6106 (2.8331)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0283 (1.2118)  time: 0.3463  data: 0.0004  max mem: 28503
Epoch: [232]  [1200/1251]  eta: 0:00:17  lr: 0.000540  min_lr: 0.000540  loss: 3.0309 (2.8364)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2015 (1.2273)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [232]  [1250/1251]  eta: 0:00:00  lr: 0.000540  min_lr: 0.000540  loss: 2.9979 (2.8390)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7078 (1.2725)  time: 0.2917  data: 0.0006  max mem: 28503
Epoch: [232] Total time: 0:07:17 (0.3498 s / it)
Averaged stats: lr: 0.000540  min_lr: 0.000540  loss: 2.9979 (2.8410)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7078 (1.2725)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6582 (0.6582)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 5.5592  data: 5.3660  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8448 (0.8156)  acc1: 87.2000 (85.6727)  acc5: 98.0000 (97.8909)  time: 0.7164  data: 0.5454  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9999 (0.9485)  acc1: 80.4000 (82.8381)  acc5: 95.6000 (96.2857)  time: 0.2003  data: 0.0317  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0607 (0.9645)  acc1: 80.0000 (82.3360)  acc5: 95.2000 (96.1440)  time: 0.2001  data: 0.0317  max mem: 28503
Test: Total time: 0:00:10 (0.4131 s / it)
* Acc@1 82.674 Acc@5 96.380 loss 0.951
Accuracy of the model on the 50000 test images: 82.7%
Max accuracy: 82.89%
Epoch: [233]  [   0/1251]  eta: 1:10:21  lr: 0.000540  min_lr: 0.000540  loss: 1.6803 (1.6803)  weight_decay: 0.0500 (0.0500)  time: 3.3748  data: 2.7201  max mem: 28503
Epoch: [233]  [ 200/1251]  eta: 0:06:20  lr: 0.000537  min_lr: 0.000537  loss: 3.0611 (2.8519)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1494 (1.2953)  time: 0.3532  data: 0.0004  max mem: 28503
Epoch: [233]  [ 400/1251]  eta: 0:05:02  lr: 0.000535  min_lr: 0.000535  loss: 3.0187 (2.8172)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2478 (1.2831)  time: 0.3552  data: 0.0004  max mem: 28503
Epoch: [233]  [ 600/1251]  eta: 0:03:49  lr: 0.000533  min_lr: 0.000533  loss: 3.0004 (2.8211)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1421 (1.2477)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [233]  [ 800/1251]  eta: 0:02:38  lr: 0.000530  min_lr: 0.000530  loss: 2.7879 (2.8392)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0655 (1.2604)  time: 0.3463  data: 0.0004  max mem: 28503
Epoch: [233]  [1000/1251]  eta: 0:01:27  lr: 0.000528  min_lr: 0.000528  loss: 2.8552 (2.8471)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3588 (1.2520)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [233]  [1200/1251]  eta: 0:00:17  lr: 0.000525  min_lr: 0.000525  loss: 2.9345 (2.8586)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1638 (1.2538)  time: 0.3522  data: 0.0004  max mem: 28503
Epoch: [233]  [1250/1251]  eta: 0:00:00  lr: 0.000525  min_lr: 0.000525  loss: 3.1147 (2.8619)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2089 (1.2522)  time: 0.2961  data: 0.0006  max mem: 28503
Epoch: [233] Total time: 0:07:17 (0.3493 s / it)
Averaged stats: lr: 0.000525  min_lr: 0.000525  loss: 3.1147 (2.8471)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2089 (1.2522)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.6933 (0.6933)  acc1: 90.8000 (90.8000)  acc5: 99.6000 (99.6000)  time: 5.5129  data: 5.3101  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.9091 (0.8723)  acc1: 87.2000 (86.5455)  acc5: 98.0000 (97.9636)  time: 0.6838  data: 0.5121  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0505 (1.0002)  acc1: 80.4000 (82.7619)  acc5: 96.0000 (96.4952)  time: 0.1925  data: 0.0240  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0716 (1.0144)  acc1: 80.4000 (82.2720)  acc5: 95.2000 (96.2560)  time: 0.1924  data: 0.0240  max mem: 28503
Test: Total time: 0:00:10 (0.4048 s / it)
* Acc@1 82.720 Acc@5 96.400 loss 0.996
Accuracy of the model on the 50000 test images: 82.7%
Max accuracy: 82.89%
Epoch: [234]  [   0/1251]  eta: 1:09:11  lr: 0.000525  min_lr: 0.000525  loss: 1.9717 (1.9717)  weight_decay: 0.0500 (0.0500)  time: 3.3182  data: 2.3200  max mem: 28503
Epoch: [234]  [ 200/1251]  eta: 0:06:22  lr: 0.000522  min_lr: 0.000522  loss: 2.8499 (2.7824)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1808 (1.3723)  time: 0.3493  data: 0.0004  max mem: 28503
Epoch: [234]  [ 400/1251]  eta: 0:05:03  lr: 0.000520  min_lr: 0.000520  loss: 2.9803 (2.8095)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2979 (1.3308)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [234]  [ 600/1251]  eta: 0:03:49  lr: 0.000517  min_lr: 0.000517  loss: 2.9712 (2.8428)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1487 (1.2991)  time: 0.3466  data: 0.0004  max mem: 28503
Epoch: [234]  [ 800/1251]  eta: 0:02:38  lr: 0.000515  min_lr: 0.000515  loss: 3.0382 (2.8642)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1998 (1.3185)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [234]  [1000/1251]  eta: 0:01:28  lr: 0.000513  min_lr: 0.000513  loss: 2.7726 (2.8520)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2214 (1.2992)  time: 0.3464  data: 0.0004  max mem: 28503
Epoch: [234]  [1200/1251]  eta: 0:00:17  lr: 0.000510  min_lr: 0.000510  loss: 2.9316 (2.8515)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3903 (1.2908)  time: 0.3469  data: 0.0004  max mem: 28503
Epoch: [234]  [1250/1251]  eta: 0:00:00  lr: 0.000510  min_lr: 0.000510  loss: 2.8807 (2.8539)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3903 (1.2918)  time: 0.2921  data: 0.0006  max mem: 28503
Epoch: [234] Total time: 0:07:18 (0.3504 s / it)
Averaged stats: lr: 0.000510  min_lr: 0.000510  loss: 2.8807 (2.8407)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3903 (1.2918)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6435 (0.6435)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 5.5698  data: 5.3859  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8510 (0.8276)  acc1: 85.6000 (85.5273)  acc5: 97.6000 (97.7455)  time: 0.6730  data: 0.5026  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0038 (0.9567)  acc1: 81.2000 (82.7238)  acc5: 96.4000 (96.3619)  time: 0.1781  data: 0.0095  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0527 (0.9688)  acc1: 81.2000 (82.3040)  acc5: 95.6000 (96.2560)  time: 0.1793  data: 0.0110  max mem: 28503
Test: Total time: 0:00:09 (0.3969 s / it)
* Acc@1 82.756 Acc@5 96.418 loss 0.953
Accuracy of the model on the 50000 test images: 82.8%
Max accuracy: 82.89%
Epoch: [235]  [   0/1251]  eta: 1:07:31  lr: 0.000510  min_lr: 0.000510  loss: 2.9965 (2.9965)  weight_decay: 0.0500 (0.0500)  time: 3.2389  data: 1.7750  max mem: 28503
Epoch: [235]  [ 200/1251]  eta: 0:06:21  lr: 0.000507  min_lr: 0.000507  loss: 2.9175 (2.8205)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0735 (1.2247)  time: 0.3473  data: 0.0004  max mem: 28503
Epoch: [235]  [ 400/1251]  eta: 0:05:02  lr: 0.000505  min_lr: 0.000505  loss: 2.8488 (2.8393)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1018 (1.2443)  time: 0.3453  data: 0.0005  max mem: 28503
Epoch: [235]  [ 600/1251]  eta: 0:03:49  lr: 0.000502  min_lr: 0.000502  loss: 2.9444 (2.8311)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3353 (1.2649)  time: 0.3461  data: 0.0004  max mem: 28503
Epoch: [235]  [ 800/1251]  eta: 0:02:38  lr: 0.000500  min_lr: 0.000500  loss: 2.7529 (2.8226)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2396 (1.2567)  time: 0.3464  data: 0.0004  max mem: 28503
Epoch: [235]  [1000/1251]  eta: 0:01:28  lr: 0.000498  min_lr: 0.000498  loss: 3.0949 (2.8329)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1655 (1.2513)  time: 0.3463  data: 0.0004  max mem: 28503
Epoch: [235]  [1200/1251]  eta: 0:00:17  lr: 0.000495  min_lr: 0.000495  loss: 2.7191 (2.8273)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1905 (1.2448)  time: 0.3527  data: 0.0004  max mem: 28503
Epoch: [235]  [1250/1251]  eta: 0:00:00  lr: 0.000495  min_lr: 0.000495  loss: 3.0750 (2.8289)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2103 (1.2529)  time: 0.2915  data: 0.0007  max mem: 28503
Epoch: [235] Total time: 0:07:17 (0.3500 s / it)
Averaged stats: lr: 0.000495  min_lr: 0.000495  loss: 3.0750 (2.8326)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2103 (1.2529)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6194 (0.6194)  acc1: 89.6000 (89.6000)  acc5: 99.6000 (99.6000)  time: 5.5503  data: 5.3226  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8365 (0.7977)  acc1: 86.8000 (85.9636)  acc5: 97.6000 (97.8182)  time: 0.7131  data: 0.5389  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9340 (0.9300)  acc1: 80.4000 (82.8000)  acc5: 96.4000 (96.4571)  time: 0.2024  data: 0.0309  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0190 (0.9430)  acc1: 80.4000 (82.5440)  acc5: 95.6000 (96.3360)  time: 0.2023  data: 0.0309  max mem: 28503
Test: Total time: 0:00:10 (0.4149 s / it)
* Acc@1 82.910 Acc@5 96.432 loss 0.928
Accuracy of the model on the 50000 test images: 82.9%
Max accuracy: 82.91%
Epoch: [236]  [   0/1251]  eta: 1:03:46  lr: 0.000495  min_lr: 0.000495  loss: 2.8478 (2.8478)  weight_decay: 0.0500 (0.0500)  time: 3.0589  data: 2.7081  max mem: 28503
Epoch: [236]  [ 200/1251]  eta: 0:06:20  lr: 0.000492  min_lr: 0.000492  loss: 2.6474 (2.8347)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2075 (1.2039)  time: 0.3468  data: 0.0004  max mem: 28503
Epoch: [236]  [ 400/1251]  eta: 0:05:02  lr: 0.000490  min_lr: 0.000490  loss: 2.5579 (2.8361)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2055 (1.2898)  time: 0.3463  data: 0.0004  max mem: 28503
Epoch: [236]  [ 600/1251]  eta: 0:03:49  lr: 0.000488  min_lr: 0.000488  loss: 2.8751 (2.8461)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2339 (1.2829)  time: 0.3549  data: 0.0004  max mem: 28503
Epoch: [236]  [ 800/1251]  eta: 0:02:38  lr: 0.000485  min_lr: 0.000485  loss: 2.8052 (2.8390)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1278 (1.2835)  time: 0.3472  data: 0.0004  max mem: 28503
Epoch: [236]  [1000/1251]  eta: 0:01:28  lr: 0.000483  min_lr: 0.000483  loss: 3.0221 (2.8292)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0398 (1.2599)  time: 0.3460  data: 0.0005  max mem: 28503
Epoch: [236]  [1200/1251]  eta: 0:00:17  lr: 0.000481  min_lr: 0.000481  loss: 2.9562 (2.8334)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0614 (1.2502)  time: 0.3497  data: 0.0005  max mem: 28503
Epoch: [236]  [1250/1251]  eta: 0:00:00  lr: 0.000480  min_lr: 0.000480  loss: 3.1314 (2.8333)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3183 (1.2593)  time: 0.2919  data: 0.0007  max mem: 28503
Epoch: [236] Total time: 0:07:18 (0.3506 s / it)
Averaged stats: lr: 0.000480  min_lr: 0.000480  loss: 3.1314 (2.8288)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3183 (1.2593)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6265 (0.6265)  acc1: 89.6000 (89.6000)  acc5: 99.6000 (99.6000)  time: 5.5536  data: 5.3578  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8385 (0.8198)  acc1: 85.6000 (85.5636)  acc5: 97.6000 (97.6000)  time: 0.6862  data: 0.5143  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9798 (0.9417)  acc1: 80.8000 (82.4191)  acc5: 96.4000 (96.3429)  time: 0.1855  data: 0.0165  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9798 (0.9508)  acc1: 80.4000 (82.1600)  acc5: 96.4000 (96.2720)  time: 0.1853  data: 0.0164  max mem: 28503
Test: Total time: 0:00:10 (0.4029 s / it)
* Acc@1 82.930 Acc@5 96.464 loss 0.933
Accuracy of the model on the 50000 test images: 82.9%
Max accuracy: 82.93%
Epoch: [237]  [   0/1251]  eta: 1:10:00  lr: 0.000480  min_lr: 0.000480  loss: 3.0537 (3.0537)  weight_decay: 0.0500 (0.0500)  time: 3.3577  data: 3.0058  max mem: 28503
Epoch: [237]  [ 200/1251]  eta: 0:06:22  lr: 0.000478  min_lr: 0.000478  loss: 3.0482 (2.8040)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1023 (1.2968)  time: 0.3560  data: 0.0004  max mem: 28503
Epoch: [237]  [ 400/1251]  eta: 0:05:03  lr: 0.000475  min_lr: 0.000475  loss: 2.9629 (2.7999)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3860 (1.3315)  time: 0.3460  data: 0.0004  max mem: 28503
Epoch: [237]  [ 600/1251]  eta: 0:03:50  lr: 0.000473  min_lr: 0.000473  loss: 2.7180 (2.8031)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2419 (1.3022)  time: 0.3530  data: 0.0004  max mem: 28503
Epoch: [237]  [ 800/1251]  eta: 0:02:38  lr: 0.000471  min_lr: 0.000471  loss: 2.8169 (2.8180)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1470 (nan)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [237]  [1000/1251]  eta: 0:01:28  lr: 0.000468  min_lr: 0.000468  loss: 2.9799 (2.8154)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3146 (nan)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [237]  [1200/1251]  eta: 0:00:17  lr: 0.000466  min_lr: 0.000466  loss: 2.9803 (2.8234)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1777 (nan)  time: 0.3551  data: 0.0004  max mem: 28503
Epoch: [237]  [1250/1251]  eta: 0:00:00  lr: 0.000466  min_lr: 0.000466  loss: 2.9380 (2.8223)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1390 (nan)  time: 0.2923  data: 0.0005  max mem: 28503
Epoch: [237] Total time: 0:07:18 (0.3507 s / it)
Averaged stats: lr: 0.000466  min_lr: 0.000466  loss: 2.9380 (2.8225)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1390 (nan)
Test:  [ 0/25]  eta: 0:02:09  loss: 0.5854 (0.5854)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 5.1831  data: 4.9773  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.7973 (0.7451)  acc1: 87.2000 (86.2182)  acc5: 98.0000 (97.8909)  time: 0.6953  data: 0.5207  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9203 (0.8688)  acc1: 81.2000 (82.9524)  acc5: 96.4000 (96.4952)  time: 0.2190  data: 0.0490  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9409 (0.8795)  acc1: 80.4000 (82.6400)  acc5: 95.6000 (96.3680)  time: 0.2300  data: 0.0604  max mem: 28503
Test: Total time: 0:00:10 (0.4220 s / it)
* Acc@1 83.036 Acc@5 96.536 loss 0.864
Accuracy of the model on the 50000 test images: 83.0%
Max accuracy: 83.04%
Epoch: [238]  [   0/1251]  eta: 0:57:29  lr: 0.000466  min_lr: 0.000466  loss: 2.7171 (2.7171)  weight_decay: 0.0500 (0.0500)  time: 2.7574  data: 2.3216  max mem: 28503
Epoch: [238]  [ 200/1251]  eta: 0:06:17  lr: 0.000463  min_lr: 0.000463  loss: 2.9101 (2.7986)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1460 (1.2443)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [238]  [ 400/1251]  eta: 0:05:01  lr: 0.000461  min_lr: 0.000461  loss: 2.8242 (2.8079)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0333 (1.2203)  time: 0.3557  data: 0.0004  max mem: 28503
Epoch: [238]  [ 600/1251]  eta: 0:03:49  lr: 0.000459  min_lr: 0.000459  loss: 2.8802 (2.8110)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2563 (1.2653)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [238]  [ 800/1251]  eta: 0:02:38  lr: 0.000456  min_lr: 0.000456  loss: 2.9815 (2.8253)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1807 (1.2874)  time: 0.3484  data: 0.0004  max mem: 28503
Epoch: [238]  [1000/1251]  eta: 0:01:28  lr: 0.000454  min_lr: 0.000454  loss: 2.7950 (2.8267)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0313 (1.2443)  time: 0.3554  data: 0.0004  max mem: 28503
Epoch: [238]  [1200/1251]  eta: 0:00:17  lr: 0.000452  min_lr: 0.000452  loss: 2.7158 (2.8275)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1193 (1.2491)  time: 0.3465  data: 0.0005  max mem: 28503
Epoch: [238]  [1250/1251]  eta: 0:00:00  lr: 0.000451  min_lr: 0.000451  loss: 2.9618 (2.8272)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3544 (1.2550)  time: 0.2920  data: 0.0006  max mem: 28503
Epoch: [238] Total time: 0:07:18 (0.3501 s / it)
Averaged stats: lr: 0.000451  min_lr: 0.000451  loss: 2.9618 (2.8250)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3544 (1.2550)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.5709 (0.5709)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 5.2822  data: 5.0809  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.7631 (0.7559)  acc1: 86.4000 (85.6364)  acc5: 98.0000 (97.7818)  time: 0.6994  data: 0.5233  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9337 (0.8805)  acc1: 80.0000 (82.8952)  acc5: 96.4000 (96.6476)  time: 0.2068  data: 0.0338  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9337 (0.8932)  acc1: 80.0000 (82.5440)  acc5: 96.4000 (96.5440)  time: 0.2061  data: 0.0338  max mem: 28503
Test: Total time: 0:00:10 (0.4070 s / it)
* Acc@1 82.896 Acc@5 96.550 loss 0.883
Accuracy of the model on the 50000 test images: 82.9%
Max accuracy: 83.04%
Epoch: [239]  [   0/1251]  eta: 1:07:31  lr: 0.000451  min_lr: 0.000451  loss: 2.5213 (2.5213)  weight_decay: 0.0500 (0.0500)  time: 3.2384  data: 2.6194  max mem: 28503
Epoch: [239]  [ 200/1251]  eta: 0:06:22  lr: 0.000449  min_lr: 0.000449  loss: 2.9266 (2.7899)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2408 (1.2895)  time: 0.3559  data: 0.0004  max mem: 28503
Epoch: [239]  [ 400/1251]  eta: 0:05:02  lr: 0.000447  min_lr: 0.000447  loss: 2.9188 (2.8078)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1438 (1.2347)  time: 0.3445  data: 0.0004  max mem: 28503
Epoch: [239]  [ 600/1251]  eta: 0:03:49  lr: 0.000445  min_lr: 0.000445  loss: 2.9685 (2.8118)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2451 (1.2636)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [239]  [ 800/1251]  eta: 0:02:38  lr: 0.000442  min_lr: 0.000442  loss: 2.9983 (2.8088)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1218 (1.2703)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [239]  [1000/1251]  eta: 0:01:28  lr: 0.000440  min_lr: 0.000440  loss: 3.0082 (2.8208)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0439 (1.2505)  time: 0.3464  data: 0.0004  max mem: 28503
Epoch: [239]  [1200/1251]  eta: 0:00:17  lr: 0.000438  min_lr: 0.000438  loss: 2.7683 (2.8215)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2824 (1.2726)  time: 0.3544  data: 0.0004  max mem: 28503
Epoch: [239]  [1250/1251]  eta: 0:00:00  lr: 0.000437  min_lr: 0.000437  loss: 2.8214 (2.8215)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2882 (1.2728)  time: 0.2920  data: 0.0006  max mem: 28503
Epoch: [239] Total time: 0:07:17 (0.3500 s / it)
Averaged stats: lr: 0.000437  min_lr: 0.000437  loss: 2.8214 (2.8212)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2882 (1.2728)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.5886 (0.5886)  acc1: 90.0000 (90.0000)  acc5: 99.6000 (99.6000)  time: 5.8012  data: 5.5843  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8017 (0.7681)  acc1: 87.2000 (86.2182)  acc5: 98.0000 (97.7455)  time: 0.6809  data: 0.5080  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9298 (0.8898)  acc1: 81.2000 (83.1429)  acc5: 96.4000 (96.6476)  time: 0.1687  data: 0.0002  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9407 (0.8995)  acc1: 81.2000 (82.8800)  acc5: 96.4000 (96.5120)  time: 0.1686  data: 0.0001  max mem: 28503
Test: Total time: 0:00:09 (0.3977 s / it)
* Acc@1 83.064 Acc@5 96.554 loss 0.890
Accuracy of the model on the 50000 test images: 83.1%
Max accuracy: 83.06%
Epoch: [240]  [   0/1251]  eta: 1:02:29  lr: 0.000437  min_lr: 0.000437  loss: 2.7350 (2.7350)  weight_decay: 0.0500 (0.0500)  time: 2.9976  data: 2.5879  max mem: 28503
Epoch: [240]  [ 200/1251]  eta: 0:06:22  lr: 0.000435  min_lr: 0.000435  loss: 2.8755 (2.7986)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3125 (1.3428)  time: 0.3450  data: 0.0005  max mem: 28503
Epoch: [240]  [ 400/1251]  eta: 0:05:02  lr: 0.000433  min_lr: 0.000433  loss: 2.8517 (2.8000)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0204 (1.2264)  time: 0.3489  data: 0.0004  max mem: 28503
Epoch: [240]  [ 600/1251]  eta: 0:03:49  lr: 0.000431  min_lr: 0.000431  loss: 2.9537 (2.7944)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0876 (1.2550)  time: 0.3495  data: 0.0004  max mem: 28503
Epoch: [240]  [ 800/1251]  eta: 0:02:38  lr: 0.000428  min_lr: 0.000428  loss: 2.9927 (2.8030)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2971 (1.2721)  time: 0.3544  data: 0.0005  max mem: 28503
Epoch: [240]  [1000/1251]  eta: 0:01:28  lr: 0.000426  min_lr: 0.000426  loss: 2.8164 (2.8026)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2335 (1.2671)  time: 0.3533  data: 0.0005  max mem: 28503
Epoch: [240]  [1200/1251]  eta: 0:00:17  lr: 0.000424  min_lr: 0.000424  loss: 2.9481 (2.8058)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9590 (1.2375)  time: 0.3516  data: 0.0004  max mem: 28503
Epoch: [240]  [1250/1251]  eta: 0:00:00  lr: 0.000423  min_lr: 0.000423  loss: 2.7686 (2.8089)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1287 (1.2372)  time: 0.2921  data: 0.0006  max mem: 28503
Epoch: [240] Total time: 0:07:17 (0.3500 s / it)
Averaged stats: lr: 0.000423  min_lr: 0.000423  loss: 2.7686 (2.8108)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1287 (1.2372)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.5526 (0.5526)  acc1: 91.6000 (91.6000)  acc5: 99.6000 (99.6000)  time: 5.6688  data: 5.4747  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.7466 (0.7244)  acc1: 87.2000 (86.3273)  acc5: 97.6000 (97.8546)  time: 0.7030  data: 0.5316  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9164 (0.8501)  acc1: 80.0000 (82.9524)  acc5: 96.4000 (96.5524)  time: 0.1875  data: 0.0187  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9278 (0.8604)  acc1: 80.0000 (82.5600)  acc5: 96.0000 (96.4960)  time: 0.1871  data: 0.0186  max mem: 28503
Test: Total time: 0:00:10 (0.4096 s / it)
* Acc@1 83.204 Acc@5 96.578 loss 0.847
Accuracy of the model on the 50000 test images: 83.2%
Max accuracy: 83.20%
Epoch: [241]  [   0/1251]  eta: 1:03:01  lr: 0.000423  min_lr: 0.000423  loss: 3.0171 (3.0171)  weight_decay: 0.0500 (0.0500)  time: 3.0228  data: 2.6361  max mem: 28503
Epoch: [241]  [ 200/1251]  eta: 0:06:22  lr: 0.000421  min_lr: 0.000421  loss: 2.9305 (2.7898)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3221 (1.2590)  time: 0.3678  data: 0.0004  max mem: 28503
Epoch: [241]  [ 400/1251]  eta: 0:05:01  lr: 0.000419  min_lr: 0.000419  loss: 2.6347 (2.8074)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2628 (1.2177)  time: 0.3443  data: 0.0004  max mem: 28503
Epoch: [241]  [ 600/1251]  eta: 0:03:49  lr: 0.000417  min_lr: 0.000417  loss: 2.6749 (2.7963)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2977 (1.2367)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [241]  [ 800/1251]  eta: 0:02:38  lr: 0.000415  min_lr: 0.000415  loss: 2.8458 (2.7969)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1467 (1.2405)  time: 0.3463  data: 0.0004  max mem: 28503
Epoch: [241]  [1000/1251]  eta: 0:01:27  lr: 0.000412  min_lr: 0.000412  loss: 2.8724 (2.8122)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2058 (1.2504)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [241]  [1200/1251]  eta: 0:00:17  lr: 0.000410  min_lr: 0.000410  loss: 2.9946 (2.8126)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2484 (1.2693)  time: 0.3462  data: 0.0004  max mem: 28503
Epoch: [241]  [1250/1251]  eta: 0:00:00  lr: 0.000410  min_lr: 0.000410  loss: 2.8349 (2.8121)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1191 (1.2625)  time: 0.2919  data: 0.0007  max mem: 28503
Epoch: [241] Total time: 0:07:17 (0.3498 s / it)
Averaged stats: lr: 0.000410  min_lr: 0.000410  loss: 2.8349 (2.8072)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1191 (1.2625)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.5544 (0.5544)  acc1: 90.4000 (90.4000)  acc5: 99.6000 (99.6000)  time: 5.4239  data: 5.2325  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.7692 (0.7423)  acc1: 86.4000 (86.1455)  acc5: 97.6000 (97.7091)  time: 0.7083  data: 0.5362  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9125 (0.8633)  acc1: 81.6000 (83.0476)  acc5: 96.0000 (96.5333)  time: 0.2025  data: 0.0334  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9426 (0.8728)  acc1: 81.2000 (82.6880)  acc5: 96.0000 (96.4800)  time: 0.2016  data: 0.0333  max mem: 28503
Test: Total time: 0:00:10 (0.4091 s / it)
* Acc@1 83.066 Acc@5 96.524 loss 0.863
Accuracy of the model on the 50000 test images: 83.1%
Max accuracy: 83.20%
Epoch: [242]  [   0/1251]  eta: 1:12:53  lr: 0.000410  min_lr: 0.000410  loss: 2.9253 (2.9253)  weight_decay: 0.0500 (0.0500)  time: 3.4963  data: 2.7223  max mem: 28503
Epoch: [242]  [ 200/1251]  eta: 0:06:23  lr: 0.000407  min_lr: 0.000407  loss: 2.7471 (2.7636)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1546 (1.2986)  time: 0.3448  data: 0.0004  max mem: 28503
Epoch: [242]  [ 400/1251]  eta: 0:05:02  lr: 0.000405  min_lr: 0.000405  loss: 3.0128 (2.8043)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2806 (1.3622)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [242]  [ 600/1251]  eta: 0:03:49  lr: 0.000403  min_lr: 0.000403  loss: 2.7730 (2.7957)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2623 (1.3972)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [242]  [ 800/1251]  eta: 0:02:38  lr: 0.000401  min_lr: 0.000401  loss: 2.9110 (2.7999)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2946 (1.4179)  time: 0.3468  data: 0.0004  max mem: 28503
Epoch: [242]  [1000/1251]  eta: 0:01:28  lr: 0.000399  min_lr: 0.000399  loss: 2.8646 (2.7984)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2692 (1.3918)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [242]  [1200/1251]  eta: 0:00:17  lr: 0.000397  min_lr: 0.000397  loss: 2.7997 (2.7990)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3491 (1.3779)  time: 0.3457  data: 0.0004  max mem: 28503
Epoch: [242]  [1250/1251]  eta: 0:00:00  lr: 0.000396  min_lr: 0.000396  loss: 2.9137 (2.8010)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3491 (1.3744)  time: 0.2979  data: 0.0007  max mem: 28503
Epoch: [242] Total time: 0:07:18 (0.3504 s / it)
Averaged stats: lr: 0.000396  min_lr: 0.000396  loss: 2.9137 (2.8002)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3491 (1.3744)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6586 (0.6586)  acc1: 89.6000 (89.6000)  acc5: 99.6000 (99.6000)  time: 5.5813  data: 5.3834  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 0.8436 (0.8154)  acc1: 86.4000 (85.9636)  acc5: 97.6000 (97.8909)  time: 0.6611  data: 0.4898  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9649 (0.9337)  acc1: 80.8000 (83.0286)  acc5: 96.4000 (96.6286)  time: 0.1710  data: 0.0024  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0248 (0.9443)  acc1: 80.8000 (82.6240)  acc5: 96.0000 (96.5440)  time: 0.1728  data: 0.0023  max mem: 28503
Test: Total time: 0:00:09 (0.3925 s / it)
* Acc@1 83.048 Acc@5 96.532 loss 0.937
Accuracy of the model on the 50000 test images: 83.0%
Max accuracy: 83.20%
Epoch: [243]  [   0/1251]  eta: 1:11:11  lr: 0.000396  min_lr: 0.000396  loss: 3.1525 (3.1525)  weight_decay: 0.0500 (0.0500)  time: 3.4144  data: 2.2732  max mem: 28503
Epoch: [243]  [ 200/1251]  eta: 0:06:22  lr: 0.000394  min_lr: 0.000394  loss: 2.5531 (2.7594)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3401 (1.5826)  time: 0.3457  data: 0.0004  max mem: 28503
Epoch: [243]  [ 400/1251]  eta: 0:05:02  lr: 0.000392  min_lr: 0.000392  loss: 2.8724 (2.7790)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3081 (1.4349)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [243]  [ 600/1251]  eta: 0:03:49  lr: 0.000390  min_lr: 0.000390  loss: 2.7020 (2.7644)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1408 (1.3776)  time: 0.3466  data: 0.0005  max mem: 28503
Epoch: [243]  [ 800/1251]  eta: 0:02:38  lr: 0.000388  min_lr: 0.000388  loss: 2.7127 (2.7638)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2829 (1.3849)  time: 0.3468  data: 0.0005  max mem: 28503
Epoch: [243]  [1000/1251]  eta: 0:01:28  lr: 0.000385  min_lr: 0.000385  loss: 3.0023 (2.7733)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0801 (1.3533)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [243]  [1200/1251]  eta: 0:00:17  lr: 0.000383  min_lr: 0.000383  loss: 2.9588 (2.7803)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1053 (1.3334)  time: 0.3529  data: 0.0004  max mem: 28503
Epoch: [243]  [1250/1251]  eta: 0:00:00  lr: 0.000383  min_lr: 0.000383  loss: 2.9182 (2.7833)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1003 (1.3238)  time: 0.2916  data: 0.0007  max mem: 28503
Epoch: [243] Total time: 0:07:18 (0.3502 s / it)
Averaged stats: lr: 0.000383  min_lr: 0.000383  loss: 2.9182 (2.7949)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1003 (1.3238)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6258 (0.6258)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 5.6144  data: 5.3831  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8066 (0.7956)  acc1: 86.4000 (85.6727)  acc5: 98.0000 (97.8546)  time: 0.6989  data: 0.5240  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9601 (0.9243)  acc1: 80.4000 (82.8000)  acc5: 96.0000 (96.5333)  time: 0.1879  data: 0.0191  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0062 (0.9377)  acc1: 80.0000 (82.4960)  acc5: 96.0000 (96.5120)  time: 0.1873  data: 0.0190  max mem: 28503
Test: Total time: 0:00:10 (0.4052 s / it)
* Acc@1 83.120 Acc@5 96.502 loss 0.925
Accuracy of the model on the 50000 test images: 83.1%
Max accuracy: 83.20%
Epoch: [244]  [   0/1251]  eta: 1:12:19  lr: 0.000383  min_lr: 0.000383  loss: 2.5592 (2.5592)  weight_decay: 0.0500 (0.0500)  time: 3.4684  data: 3.0275  max mem: 28503
Epoch: [244]  [ 200/1251]  eta: 0:06:21  lr: 0.000381  min_lr: 0.000381  loss: 2.6950 (2.7765)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0411 (1.2149)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [244]  [ 400/1251]  eta: 0:05:03  lr: 0.000379  min_lr: 0.000379  loss: 2.9719 (2.7793)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2489 (nan)  time: 0.3536  data: 0.0004  max mem: 28503
Epoch: [244]  [ 600/1251]  eta: 0:03:50  lr: 0.000377  min_lr: 0.000377  loss: 2.8458 (2.7982)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2440 (nan)  time: 0.3559  data: 0.0004  max mem: 28503
Epoch: [244]  [ 800/1251]  eta: 0:02:38  lr: 0.000374  min_lr: 0.000374  loss: 3.0701 (2.7859)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1833 (nan)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [244]  [1000/1251]  eta: 0:01:28  lr: 0.000372  min_lr: 0.000372  loss: 2.7252 (2.7866)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3242 (nan)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [244]  [1200/1251]  eta: 0:00:17  lr: 0.000370  min_lr: 0.000370  loss: 2.7364 (2.7878)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1499 (nan)  time: 0.3540  data: 0.0004  max mem: 28503
Epoch: [244]  [1250/1251]  eta: 0:00:00  lr: 0.000370  min_lr: 0.000370  loss: 2.9759 (2.7886)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2643 (nan)  time: 0.2917  data: 0.0007  max mem: 28503
Epoch: [244] Total time: 0:07:17 (0.3500 s / it)
Averaged stats: lr: 0.000370  min_lr: 0.000370  loss: 2.9759 (2.7836)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2643 (nan)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.5602 (0.5602)  acc1: 90.0000 (90.0000)  acc5: 98.8000 (98.8000)  time: 5.6909  data: 5.4955  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.7392 (0.7314)  acc1: 86.4000 (86.0000)  acc5: 98.4000 (97.8182)  time: 0.6717  data: 0.4999  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9300 (0.8610)  acc1: 80.4000 (82.8191)  acc5: 96.4000 (96.5714)  time: 0.1692  data: 0.0003  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9509 (0.8711)  acc1: 80.4000 (82.7040)  acc5: 95.2000 (96.4320)  time: 0.1686  data: 0.0002  max mem: 28503
Test: Total time: 0:00:09 (0.3944 s / it)
* Acc@1 83.168 Acc@5 96.528 loss 0.863
Accuracy of the model on the 50000 test images: 83.2%
Max accuracy: 83.20%
Epoch: [245]  [   0/1251]  eta: 1:05:14  lr: 0.000370  min_lr: 0.000370  loss: 2.9799 (2.9799)  weight_decay: 0.0500 (0.0500)  time: 3.1291  data: 2.5844  max mem: 28503
Epoch: [245]  [ 200/1251]  eta: 0:06:21  lr: 0.000368  min_lr: 0.000368  loss: 2.9351 (2.7356)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1137 (1.1256)  time: 0.3455  data: 0.0003  max mem: 28503
Epoch: [245]  [ 400/1251]  eta: 0:05:01  lr: 0.000366  min_lr: 0.000366  loss: 3.0353 (2.7605)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3699 (1.3077)  time: 0.3457  data: 0.0004  max mem: 28503
Epoch: [245]  [ 600/1251]  eta: 0:03:49  lr: 0.000364  min_lr: 0.000364  loss: 3.0023 (2.7715)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1215 (1.2809)  time: 0.3460  data: 0.0005  max mem: 28503
Epoch: [245]  [ 800/1251]  eta: 0:02:38  lr: 0.000362  min_lr: 0.000362  loss: 2.9088 (2.7852)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2391 (1.3123)  time: 0.3541  data: 0.0005  max mem: 28503
Epoch: [245]  [1000/1251]  eta: 0:01:27  lr: 0.000359  min_lr: 0.000359  loss: 2.7377 (2.7793)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2593 (1.3223)  time: 0.3515  data: 0.0004  max mem: 28503
Epoch: [245]  [1200/1251]  eta: 0:00:17  lr: 0.000357  min_lr: 0.000357  loss: 2.7486 (2.7808)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2693 (1.3134)  time: 0.3474  data: 0.0004  max mem: 28503
Epoch: [245]  [1250/1251]  eta: 0:00:00  lr: 0.000357  min_lr: 0.000357  loss: 2.9243 (2.7801)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3209 (1.3210)  time: 0.2920  data: 0.0007  max mem: 28503
Epoch: [245] Total time: 0:07:17 (0.3496 s / it)
Averaged stats: lr: 0.000357  min_lr: 0.000357  loss: 2.9243 (2.7879)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3209 (1.3210)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.5890 (0.5890)  acc1: 89.6000 (89.6000)  acc5: 99.2000 (99.2000)  time: 5.5282  data: 5.3225  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.7939 (0.7598)  acc1: 86.8000 (86.0364)  acc5: 98.0000 (97.7455)  time: 0.6792  data: 0.5053  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9444 (0.8847)  acc1: 81.2000 (82.9905)  acc5: 95.6000 (96.4191)  time: 0.1837  data: 0.0141  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9733 (0.8964)  acc1: 80.8000 (82.4800)  acc5: 95.6000 (96.4160)  time: 0.1826  data: 0.0140  max mem: 28503
Test: Total time: 0:00:10 (0.4027 s / it)
* Acc@1 83.092 Acc@5 96.542 loss 0.882
Accuracy of the model on the 50000 test images: 83.1%
Max accuracy: 83.20%
Epoch: [246]  [   0/1251]  eta: 1:09:31  lr: 0.000357  min_lr: 0.000357  loss: 2.9604 (2.9604)  weight_decay: 0.0500 (0.0500)  time: 3.3343  data: 2.2715  max mem: 28503
Epoch: [246]  [ 200/1251]  eta: 0:06:22  lr: 0.000355  min_lr: 0.000355  loss: 2.8971 (2.7923)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1383 (1.1791)  time: 0.3464  data: 0.0004  max mem: 28503
Epoch: [246]  [ 400/1251]  eta: 0:05:03  lr: 0.000353  min_lr: 0.000353  loss: 2.8235 (2.7822)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2006 (1.2707)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [246]  [ 600/1251]  eta: 0:03:50  lr: 0.000351  min_lr: 0.000351  loss: 2.8709 (2.7822)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1368 (1.2781)  time: 0.3573  data: 0.0004  max mem: 28503
Epoch: [246]  [ 800/1251]  eta: 0:02:38  lr: 0.000349  min_lr: 0.000349  loss: 2.8097 (2.7845)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2404 (1.3026)  time: 0.3463  data: 0.0004  max mem: 28503
Epoch: [246]  [1000/1251]  eta: 0:01:28  lr: 0.000347  min_lr: 0.000347  loss: 2.9471 (2.7778)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3389 (1.3458)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [246]  [1200/1251]  eta: 0:00:17  lr: 0.000345  min_lr: 0.000345  loss: 2.9926 (2.7888)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1034 (1.3403)  time: 0.3465  data: 0.0004  max mem: 28503
Epoch: [246]  [1250/1251]  eta: 0:00:00  lr: 0.000344  min_lr: 0.000344  loss: 2.8521 (2.7921)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1570 (1.3386)  time: 0.2920  data: 0.0006  max mem: 28503
Epoch: [246] Total time: 0:07:18 (0.3507 s / it)
Averaged stats: lr: 0.000344  min_lr: 0.000344  loss: 2.8521 (2.7764)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1570 (1.3386)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6567 (0.6567)  acc1: 89.2000 (89.2000)  acc5: 98.4000 (98.4000)  time: 5.6849  data: 5.4936  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 0.8283 (0.8026)  acc1: 87.2000 (85.6000)  acc5: 97.6000 (97.8182)  time: 0.7459  data: 0.5709  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9670 (0.9341)  acc1: 81.2000 (82.3810)  acc5: 96.4000 (96.4000)  time: 0.2102  data: 0.0394  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0106 (0.9471)  acc1: 81.2000 (82.0640)  acc5: 96.0000 (96.3520)  time: 0.2086  data: 0.0393  max mem: 28503
Test: Total time: 0:00:10 (0.4261 s / it)
* Acc@1 83.176 Acc@5 96.556 loss 0.931
Accuracy of the model on the 50000 test images: 83.2%
Max accuracy: 83.20%
Epoch: [247]  [   0/1251]  eta: 1:07:08  lr: 0.000344  min_lr: 0.000344  loss: 2.5939 (2.5939)  weight_decay: 0.0500 (0.0500)  time: 3.2206  data: 2.7224  max mem: 28503
Epoch: [247]  [ 200/1251]  eta: 0:06:21  lr: 0.000342  min_lr: 0.000342  loss: 2.2577 (2.6660)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1779 (1.3231)  time: 0.3464  data: 0.0004  max mem: 28503
Epoch: [247]  [ 400/1251]  eta: 0:05:02  lr: 0.000340  min_lr: 0.000340  loss: 2.8459 (2.7205)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1628 (1.3112)  time: 0.3463  data: 0.0004  max mem: 28503
Epoch: [247]  [ 600/1251]  eta: 0:03:49  lr: 0.000338  min_lr: 0.000338  loss: 2.7090 (2.7392)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3618 (1.3385)  time: 0.3465  data: 0.0004  max mem: 28503
Epoch: [247]  [ 800/1251]  eta: 0:02:38  lr: 0.000336  min_lr: 0.000336  loss: 2.7797 (2.7510)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3737 (1.3587)  time: 0.3565  data: 0.0004  max mem: 28503
Epoch: [247]  [1000/1251]  eta: 0:01:28  lr: 0.000334  min_lr: 0.000334  loss: 2.9216 (2.7528)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1225 (1.3414)  time: 0.3469  data: 0.0005  max mem: 28503
Epoch: [247]  [1200/1251]  eta: 0:00:17  lr: 0.000332  min_lr: 0.000332  loss: 2.7077 (2.7558)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4196 (1.3635)  time: 0.3473  data: 0.0005  max mem: 28503
Epoch: [247]  [1250/1251]  eta: 0:00:00  lr: 0.000332  min_lr: 0.000332  loss: 2.9519 (2.7568)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2551 (1.3649)  time: 0.2988  data: 0.0007  max mem: 28503
Epoch: [247] Total time: 0:07:18 (0.3507 s / it)
Averaged stats: lr: 0.000332  min_lr: 0.000332  loss: 2.9519 (2.7724)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2551 (1.3649)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.5540 (0.5540)  acc1: 89.2000 (89.2000)  acc5: 99.2000 (99.2000)  time: 5.4510  data: 5.2546  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 0.7662 (0.7242)  acc1: 86.8000 (86.0727)  acc5: 98.0000 (98.0364)  time: 0.7500  data: 0.5787  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.8857 (0.8486)  acc1: 82.0000 (82.9143)  acc5: 96.8000 (96.7048)  time: 0.2282  data: 0.0596  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9294 (0.8614)  acc1: 81.2000 (82.5280)  acc5: 96.0000 (96.6400)  time: 0.2279  data: 0.0595  max mem: 28503
Test: Total time: 0:00:10 (0.4313 s / it)
* Acc@1 83.166 Acc@5 96.670 loss 0.848
Accuracy of the model on the 50000 test images: 83.2%
Max accuracy: 83.20%
Epoch: [248]  [   0/1251]  eta: 1:09:59  lr: 0.000332  min_lr: 0.000332  loss: 3.0806 (3.0806)  weight_decay: 0.0500 (0.0500)  time: 3.3570  data: 1.7042  max mem: 28503
Epoch: [248]  [ 200/1251]  eta: 0:06:20  lr: 0.000330  min_lr: 0.000330  loss: 2.8669 (2.7913)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1662 (1.2329)  time: 0.3448  data: 0.0004  max mem: 28503
Epoch: [248]  [ 400/1251]  eta: 0:05:01  lr: 0.000328  min_lr: 0.000328  loss: 2.7393 (2.8176)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2617 (1.2320)  time: 0.3565  data: 0.0004  max mem: 28503
Epoch: [248]  [ 600/1251]  eta: 0:03:49  lr: 0.000326  min_lr: 0.000326  loss: 2.9926 (2.8095)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3623 (1.2943)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [248]  [ 800/1251]  eta: 0:02:38  lr: 0.000324  min_lr: 0.000324  loss: 2.4139 (2.7839)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2085 (1.2766)  time: 0.3473  data: 0.0004  max mem: 28503
Epoch: [248]  [1000/1251]  eta: 0:01:27  lr: 0.000322  min_lr: 0.000322  loss: 2.8065 (2.7867)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3172 (1.2905)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [248]  [1200/1251]  eta: 0:00:17  lr: 0.000320  min_lr: 0.000320  loss: 2.7953 (2.7888)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1715 (1.2959)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [248]  [1250/1251]  eta: 0:00:00  lr: 0.000320  min_lr: 0.000320  loss: 2.9557 (2.7901)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1729 (1.2957)  time: 0.2915  data: 0.0005  max mem: 28503
Epoch: [248] Total time: 0:07:16 (0.3492 s / it)
Averaged stats: lr: 0.000320  min_lr: 0.000320  loss: 2.9557 (2.7723)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1729 (1.2957)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6304 (0.6304)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.6050  data: 5.4124  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8430 (0.8035)  acc1: 86.8000 (86.0364)  acc5: 98.4000 (97.8545)  time: 0.6781  data: 0.5072  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9984 (0.9334)  acc1: 81.6000 (83.1429)  acc5: 96.4000 (96.4762)  time: 0.1972  data: 0.0278  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0182 (0.9469)  acc1: 81.2000 (82.6080)  acc5: 95.6000 (96.3840)  time: 0.2013  data: 0.0322  max mem: 28503
Test: Total time: 0:00:10 (0.4157 s / it)
* Acc@1 83.244 Acc@5 96.538 loss 0.932
Accuracy of the model on the 50000 test images: 83.2%
Max accuracy: 83.24%
Epoch: [249]  [   0/1251]  eta: 1:03:34  lr: 0.000320  min_lr: 0.000320  loss: 2.5263 (2.5263)  weight_decay: 0.0500 (0.0500)  time: 3.0494  data: 2.6814  max mem: 28503
Epoch: [249]  [ 200/1251]  eta: 0:06:20  lr: 0.000318  min_lr: 0.000318  loss: 2.8336 (2.7810)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1707 (1.3194)  time: 0.3518  data: 0.0004  max mem: 28503
Epoch: [249]  [ 400/1251]  eta: 0:05:01  lr: 0.000316  min_lr: 0.000316  loss: 2.7370 (2.7721)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5313 (1.3701)  time: 0.3457  data: 0.0004  max mem: 28503
Epoch: [249]  [ 600/1251]  eta: 0:03:49  lr: 0.000314  min_lr: 0.000314  loss: 3.0537 (2.7968)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4762 (1.4019)  time: 0.3541  data: 0.0004  max mem: 28503
Epoch: [249]  [ 800/1251]  eta: 0:02:38  lr: 0.000312  min_lr: 0.000312  loss: 2.5373 (2.7732)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1642 (1.3666)  time: 0.3546  data: 0.0004  max mem: 28503
Epoch: [249]  [1000/1251]  eta: 0:01:27  lr: 0.000310  min_lr: 0.000310  loss: 2.9734 (2.7660)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3028 (1.3472)  time: 0.3463  data: 0.0004  max mem: 28503
Epoch: [249]  [1200/1251]  eta: 0:00:17  lr: 0.000308  min_lr: 0.000308  loss: 2.8558 (2.7596)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1941 (1.3203)  time: 0.3452  data: 0.0005  max mem: 28503
Epoch: [249]  [1250/1251]  eta: 0:00:00  lr: 0.000308  min_lr: 0.000308  loss: 2.9149 (2.7591)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1746 (1.3165)  time: 0.2918  data: 0.0005  max mem: 28503
Epoch: [249] Total time: 0:07:17 (0.3496 s / it)
Averaged stats: lr: 0.000308  min_lr: 0.000308  loss: 2.9149 (2.7689)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1746 (1.3165)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.5286 (0.5286)  acc1: 89.6000 (89.6000)  acc5: 99.2000 (99.2000)  time: 5.7831  data: 5.5911  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 0.7365 (0.6968)  acc1: 86.4000 (85.8545)  acc5: 98.0000 (97.9273)  time: 0.7384  data: 0.5668  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.8572 (0.8155)  acc1: 81.2000 (83.1048)  acc5: 96.8000 (96.7619)  time: 0.2065  data: 0.0322  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.8572 (0.8285)  acc1: 81.2000 (82.6400)  acc5: 96.4000 (96.7040)  time: 0.2063  data: 0.0322  max mem: 28503
Test: Total time: 0:00:10 (0.4268 s / it)
* Acc@1 83.278 Acc@5 96.610 loss 0.813
Accuracy of the model on the 50000 test images: 83.3%
Max accuracy: 83.28%
Epoch: [250]  [   0/1251]  eta: 1:05:27  lr: 0.000307  min_lr: 0.000307  loss: 2.9102 (2.9102)  weight_decay: 0.0500 (0.0500)  time: 3.1391  data: 2.7635  max mem: 28503
Epoch: [250]  [ 200/1251]  eta: 0:06:19  lr: 0.000306  min_lr: 0.000306  loss: 2.8724 (2.8038)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3610 (1.2844)  time: 0.3447  data: 0.0004  max mem: 28503
Epoch: [250]  [ 400/1251]  eta: 0:05:01  lr: 0.000304  min_lr: 0.000304  loss: 2.6595 (2.7947)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3152 (1.3328)  time: 0.3460  data: 0.0003  max mem: 28503
Epoch: [250]  [ 600/1251]  eta: 0:03:48  lr: 0.000302  min_lr: 0.000302  loss: 2.7664 (2.7838)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4285 (1.3530)  time: 0.3446  data: 0.0004  max mem: 28503
Epoch: [250]  [ 800/1251]  eta: 0:02:38  lr: 0.000300  min_lr: 0.000300  loss: 2.8027 (2.7718)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.3466  data: 0.0004  max mem: 28503
Epoch: [250]  [1000/1251]  eta: 0:01:27  lr: 0.000298  min_lr: 0.000298  loss: 2.6542 (2.7727)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1314 (nan)  time: 0.3442  data: 0.0004  max mem: 28503
Epoch: [250]  [1200/1251]  eta: 0:00:17  lr: 0.000296  min_lr: 0.000296  loss: 2.9434 (2.7731)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2037 (nan)  time: 0.3460  data: 0.0004  max mem: 28503
Epoch: [250]  [1250/1251]  eta: 0:00:00  lr: 0.000296  min_lr: 0.000296  loss: 2.7421 (2.7715)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1446 (nan)  time: 0.2918  data: 0.0005  max mem: 28503
Epoch: [250] Total time: 0:07:16 (0.3491 s / it)
Averaged stats: lr: 0.000296  min_lr: 0.000296  loss: 2.7421 (2.7654)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1446 (nan)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.5817 (0.5817)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 5.5331  data: 5.3280  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.7709 (0.7602)  acc1: 87.2000 (86.3636)  acc5: 97.6000 (97.7455)  time: 0.7115  data: 0.5393  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9393 (0.8804)  acc1: 82.0000 (83.3333)  acc5: 96.4000 (96.4571)  time: 0.1990  data: 0.0303  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9393 (0.8911)  acc1: 80.8000 (82.8960)  acc5: 95.6000 (96.3840)  time: 0.1982  data: 0.0297  max mem: 28503
Test: Total time: 0:00:10 (0.4107 s / it)
* Acc@1 83.200 Acc@5 96.524 loss 0.879
Accuracy of the model on the 50000 test images: 83.2%
Max accuracy: 83.28%
Epoch: [251]  [   0/1251]  eta: 1:11:38  lr: 0.000296  min_lr: 0.000296  loss: 3.0784 (3.0784)  weight_decay: 0.0500 (0.0500)  time: 3.4357  data: 2.9416  max mem: 28503
Epoch: [251]  [ 200/1251]  eta: 0:06:22  lr: 0.000294  min_lr: 0.000294  loss: 2.8827 (2.7720)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1531 (1.2604)  time: 0.3465  data: 0.0004  max mem: 28503
Epoch: [251]  [ 400/1251]  eta: 0:05:02  lr: 0.000292  min_lr: 0.000292  loss: 2.8530 (2.7610)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1492 (1.2304)  time: 0.3468  data: 0.0004  max mem: 28503
Epoch: [251]  [ 600/1251]  eta: 0:03:50  lr: 0.000290  min_lr: 0.000290  loss: 2.9945 (2.7587)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2368 (1.2299)  time: 0.3476  data: 0.0004  max mem: 28503
Epoch: [251]  [ 800/1251]  eta: 0:02:39  lr: 0.000288  min_lr: 0.000288  loss: 2.9105 (2.7564)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3240 (1.2620)  time: 0.3600  data: 0.0004  max mem: 28503
Epoch: [251]  [1000/1251]  eta: 0:01:28  lr: 0.000286  min_lr: 0.000286  loss: 2.5964 (2.7561)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3483 (1.3130)  time: 0.3461  data: 0.0004  max mem: 28503
Epoch: [251]  [1200/1251]  eta: 0:00:17  lr: 0.000284  min_lr: 0.000284  loss: 2.9325 (2.7602)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2274 (1.3480)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [251]  [1250/1251]  eta: 0:00:00  lr: 0.000284  min_lr: 0.000284  loss: 2.7054 (2.7561)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5378 (1.3576)  time: 0.2918  data: 0.0007  max mem: 28503
Epoch: [251] Total time: 0:07:18 (0.3506 s / it)
Averaged stats: lr: 0.000284  min_lr: 0.000284  loss: 2.7054 (2.7588)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5378 (1.3576)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.5186 (0.5186)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 5.6056  data: 5.4051  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 0.7416 (0.6987)  acc1: 87.2000 (86.3273)  acc5: 98.0000 (98.0364)  time: 0.6635  data: 0.4917  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.8817 (0.8197)  acc1: 81.2000 (83.3143)  acc5: 96.8000 (96.7238)  time: 0.1689  data: 0.0002  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9031 (0.8331)  acc1: 80.8000 (82.9920)  acc5: 96.4000 (96.5760)  time: 0.1686  data: 0.0001  max mem: 28503
Test: Total time: 0:00:09 (0.3894 s / it)
* Acc@1 83.404 Acc@5 96.634 loss 0.821
Accuracy of the model on the 50000 test images: 83.4%
Max accuracy: 83.40%
Epoch: [252]  [   0/1251]  eta: 1:06:15  lr: 0.000284  min_lr: 0.000284  loss: 3.4989 (3.4989)  weight_decay: 0.0500 (0.0500)  time: 3.1781  data: 2.8026  max mem: 28503
Epoch: [252]  [ 200/1251]  eta: 0:06:20  lr: 0.000282  min_lr: 0.000282  loss: 2.7510 (2.7456)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1972 (1.2787)  time: 0.3510  data: 0.0004  max mem: 28503
Epoch: [252]  [ 400/1251]  eta: 0:05:03  lr: 0.000280  min_lr: 0.000280  loss: 2.9806 (2.7606)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1520 (1.3289)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [252]  [ 600/1251]  eta: 0:03:49  lr: 0.000279  min_lr: 0.000279  loss: 2.8366 (2.7500)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2012 (1.3249)  time: 0.3454  data: 0.0004  max mem: 28503
Epoch: [252]  [ 800/1251]  eta: 0:02:38  lr: 0.000277  min_lr: 0.000277  loss: 2.7288 (2.7383)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2777 (1.3448)  time: 0.3467  data: 0.0005  max mem: 28503
Epoch: [252]  [1000/1251]  eta: 0:01:28  lr: 0.000275  min_lr: 0.000275  loss: 2.7959 (2.7417)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3913 (1.3452)  time: 0.3499  data: 0.0006  max mem: 28503
Epoch: [252]  [1200/1251]  eta: 0:00:17  lr: 0.000273  min_lr: 0.000273  loss: 2.8889 (2.7467)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2776 (1.3501)  time: 0.3465  data: 0.0005  max mem: 28503
Epoch: [252]  [1250/1251]  eta: 0:00:00  lr: 0.000273  min_lr: 0.000273  loss: 2.7198 (2.7440)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1467 (1.3421)  time: 0.2917  data: 0.0006  max mem: 28503
Epoch: [252] Total time: 0:07:18 (0.3504 s / it)
Averaged stats: lr: 0.000273  min_lr: 0.000273  loss: 2.7198 (2.7536)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1467 (1.3421)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.5854 (0.5854)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 5.7946  data: 5.6080  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8028 (0.7600)  acc1: 86.8000 (86.6182)  acc5: 97.6000 (97.6727)  time: 0.6817  data: 0.5115  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9336 (0.8833)  acc1: 81.2000 (83.3905)  acc5: 96.4000 (96.7048)  time: 0.1695  data: 0.0010  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9636 (0.8966)  acc1: 81.2000 (82.8800)  acc5: 96.4000 (96.5920)  time: 0.1877  data: 0.0193  max mem: 28503
Test: Total time: 0:00:10 (0.4127 s / it)
* Acc@1 83.254 Acc@5 96.584 loss 0.885
Accuracy of the model on the 50000 test images: 83.3%
Max accuracy: 83.40%
Epoch: [253]  [   0/1251]  eta: 1:10:59  lr: 0.000273  min_lr: 0.000273  loss: 3.0151 (3.0151)  weight_decay: 0.0500 (0.0500)  time: 3.4051  data: 2.6375  max mem: 28503
Epoch: [253]  [ 200/1251]  eta: 0:06:22  lr: 0.000271  min_lr: 0.000271  loss: 2.8234 (2.7410)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2277 (1.2846)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [253]  [ 400/1251]  eta: 0:05:02  lr: 0.000269  min_lr: 0.000269  loss: 2.7174 (2.7716)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1358 (1.3386)  time: 0.3464  data: 0.0004  max mem: 28503
Epoch: [253]  [ 600/1251]  eta: 0:03:50  lr: 0.000267  min_lr: 0.000267  loss: 2.9284 (2.7781)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2070 (1.3244)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [253]  [ 800/1251]  eta: 0:02:38  lr: 0.000265  min_lr: 0.000265  loss: 2.6859 (2.7601)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4229 (1.3490)  time: 0.3553  data: 0.0004  max mem: 28503
Epoch: [253]  [1000/1251]  eta: 0:01:28  lr: 0.000264  min_lr: 0.000264  loss: 2.8752 (2.7590)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2142 (1.3425)  time: 0.3462  data: 0.0004  max mem: 28503
Epoch: [253]  [1200/1251]  eta: 0:00:17  lr: 0.000262  min_lr: 0.000262  loss: 2.8266 (2.7590)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1551 (1.3366)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [253]  [1250/1251]  eta: 0:00:00  lr: 0.000261  min_lr: 0.000261  loss: 2.7630 (2.7627)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1551 (1.3363)  time: 0.2916  data: 0.0005  max mem: 28503
Epoch: [253] Total time: 0:07:18 (0.3504 s / it)
Averaged stats: lr: 0.000261  min_lr: 0.000261  loss: 2.7630 (2.7485)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1551 (1.3363)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6525 (0.6525)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 5.7038  data: 5.4896  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8779 (0.8391)  acc1: 86.4000 (86.1091)  acc5: 97.6000 (97.5636)  time: 0.7050  data: 0.5308  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0136 (0.9547)  acc1: 82.0000 (83.3524)  acc5: 96.0000 (96.4952)  time: 0.1967  data: 0.0274  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0219 (0.9656)  acc1: 81.6000 (82.8640)  acc5: 96.0000 (96.3680)  time: 0.1981  data: 0.0297  max mem: 28503
Test: Total time: 0:00:10 (0.4182 s / it)
* Acc@1 83.246 Acc@5 96.530 loss 0.955
Accuracy of the model on the 50000 test images: 83.2%
Max accuracy: 83.40%
Epoch: [254]  [   0/1251]  eta: 1:06:03  lr: 0.000261  min_lr: 0.000261  loss: 2.5247 (2.5247)  weight_decay: 0.0500 (0.0500)  time: 3.1683  data: 1.8057  max mem: 28503
Epoch: [254]  [ 200/1251]  eta: 0:06:22  lr: 0.000260  min_lr: 0.000260  loss: 3.0259 (2.7972)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1871 (1.3901)  time: 0.3495  data: 0.0004  max mem: 28503
Epoch: [254]  [ 400/1251]  eta: 0:05:04  lr: 0.000258  min_lr: 0.000258  loss: 2.7279 (2.7546)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2030 (1.3308)  time: 0.3465  data: 0.0004  max mem: 28503
Epoch: [254]  [ 600/1251]  eta: 0:03:50  lr: 0.000256  min_lr: 0.000256  loss: 2.6722 (2.7412)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3164 (1.3124)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [254]  [ 800/1251]  eta: 0:02:38  lr: 0.000254  min_lr: 0.000254  loss: 2.7803 (2.7322)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2655 (1.3304)  time: 0.3478  data: 0.0005  max mem: 28503
Epoch: [254]  [1000/1251]  eta: 0:01:28  lr: 0.000253  min_lr: 0.000253  loss: 2.6914 (2.7287)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2802 (1.3366)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [254]  [1200/1251]  eta: 0:00:17  lr: 0.000251  min_lr: 0.000251  loss: 2.9286 (2.7263)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2246 (1.3257)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [254]  [1250/1251]  eta: 0:00:00  lr: 0.000251  min_lr: 0.000251  loss: 2.9181 (2.7248)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2246 (1.3244)  time: 0.2918  data: 0.0006  max mem: 28503
Epoch: [254] Total time: 0:07:18 (0.3505 s / it)
Averaged stats: lr: 0.000251  min_lr: 0.000251  loss: 2.9181 (2.7467)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2246 (1.3244)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.6210 (0.6210)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 5.4136  data: 5.2150  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8170 (0.7845)  acc1: 86.8000 (86.3273)  acc5: 97.6000 (97.6727)  time: 0.7279  data: 0.5537  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9440 (0.9050)  acc1: 81.6000 (83.2952)  acc5: 96.4000 (96.5905)  time: 0.2161  data: 0.0439  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9648 (0.9163)  acc1: 81.6000 (82.9120)  acc5: 96.0000 (96.4160)  time: 0.2158  data: 0.0438  max mem: 28503
Test: Total time: 0:00:10 (0.4195 s / it)
* Acc@1 83.364 Acc@5 96.578 loss 0.904
Accuracy of the model on the 50000 test images: 83.4%
Max accuracy: 83.40%
Epoch: [255]  [   0/1251]  eta: 1:09:33  lr: 0.000250  min_lr: 0.000250  loss: 3.0833 (3.0833)  weight_decay: 0.0500 (0.0500)  time: 3.3362  data: 2.4538  max mem: 28503
Epoch: [255]  [ 200/1251]  eta: 0:06:24  lr: 0.000249  min_lr: 0.000249  loss: 2.8490 (2.6818)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1790 (1.2309)  time: 0.3480  data: 0.0004  max mem: 28503
Epoch: [255]  [ 400/1251]  eta: 0:05:03  lr: 0.000247  min_lr: 0.000247  loss: 2.8595 (2.6997)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3572 (1.3017)  time: 0.3457  data: 0.0004  max mem: 28503
Epoch: [255]  [ 600/1251]  eta: 0:03:50  lr: 0.000245  min_lr: 0.000245  loss: 2.8859 (2.7310)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2291 (1.3400)  time: 0.3553  data: 0.0004  max mem: 28503
Epoch: [255]  [ 800/1251]  eta: 0:02:38  lr: 0.000244  min_lr: 0.000244  loss: 2.9187 (2.7299)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2813 (1.3267)  time: 0.3450  data: 0.0004  max mem: 28503
Epoch: [255]  [1000/1251]  eta: 0:01:28  lr: 0.000242  min_lr: 0.000242  loss: 2.7454 (2.7345)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1665 (1.3298)  time: 0.3575  data: 0.0004  max mem: 28503
Epoch: [255]  [1200/1251]  eta: 0:00:17  lr: 0.000240  min_lr: 0.000240  loss: 2.2449 (2.7343)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2712 (1.3274)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [255]  [1250/1251]  eta: 0:00:00  lr: 0.000240  min_lr: 0.000240  loss: 2.6412 (2.7304)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2585 (1.3296)  time: 0.2920  data: 0.0007  max mem: 28503
Epoch: [255] Total time: 0:07:18 (0.3503 s / it)
Averaged stats: lr: 0.000240  min_lr: 0.000240  loss: 2.6412 (2.7369)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2585 (1.3296)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.5310 (0.5310)  acc1: 90.8000 (90.8000)  acc5: 99.6000 (99.6000)  time: 5.7636  data: 5.5570  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.7658 (0.7161)  acc1: 86.8000 (86.5818)  acc5: 98.0000 (97.8909)  time: 0.7256  data: 0.5534  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.8879 (0.8396)  acc1: 81.2000 (83.2000)  acc5: 96.8000 (96.5524)  time: 0.1952  data: 0.0266  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9233 (0.8511)  acc1: 80.8000 (82.8160)  acc5: 96.0000 (96.4320)  time: 0.1950  data: 0.0265  max mem: 28503
Test: Total time: 0:00:10 (0.4179 s / it)
* Acc@1 83.350 Acc@5 96.576 loss 0.836
Accuracy of the model on the 50000 test images: 83.4%
Max accuracy: 83.40%
Epoch: [256]  [   0/1251]  eta: 1:11:29  lr: 0.000240  min_lr: 0.000240  loss: 2.5079 (2.5079)  weight_decay: 0.0500 (0.0500)  time: 3.4285  data: 2.3336  max mem: 28503
Epoch: [256]  [ 200/1251]  eta: 0:06:24  lr: 0.000238  min_lr: 0.000238  loss: 2.7203 (2.7359)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3819 (nan)  time: 0.3565  data: 0.0004  max mem: 28503
Epoch: [256]  [ 400/1251]  eta: 0:05:03  lr: 0.000236  min_lr: 0.000236  loss: 2.6564 (2.7388)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3932 (nan)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [256]  [ 600/1251]  eta: 0:03:50  lr: 0.000235  min_lr: 0.000235  loss: 2.8178 (2.7484)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3306 (nan)  time: 0.3477  data: 0.0004  max mem: 28503
Epoch: [256]  [ 800/1251]  eta: 0:02:38  lr: 0.000233  min_lr: 0.000233  loss: 2.8109 (2.7608)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3262 (nan)  time: 0.3476  data: 0.0004  max mem: 28503
Epoch: [256]  [1000/1251]  eta: 0:01:28  lr: 0.000231  min_lr: 0.000231  loss: 2.8089 (2.7505)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4121 (nan)  time: 0.3601  data: 0.0004  max mem: 28503
Epoch: [256]  [1200/1251]  eta: 0:00:17  lr: 0.000230  min_lr: 0.000230  loss: 3.0245 (2.7561)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3970 (nan)  time: 0.3455  data: 0.0003  max mem: 28503
Epoch: [256]  [1250/1251]  eta: 0:00:00  lr: 0.000229  min_lr: 0.000229  loss: 2.9836 (2.7578)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1576 (nan)  time: 0.2916  data: 0.0005  max mem: 28503
Epoch: [256] Total time: 0:07:18 (0.3503 s / it)
Averaged stats: lr: 0.000229  min_lr: 0.000229  loss: 2.9836 (2.7449)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1576 (nan)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.6013 (0.6013)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 5.7256  data: 5.5295  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8009 (0.7697)  acc1: 85.6000 (86.6182)  acc5: 98.0000 (97.9273)  time: 0.7195  data: 0.5481  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9336 (0.8884)  acc1: 82.0000 (83.6000)  acc5: 96.4000 (96.6857)  time: 0.1937  data: 0.0250  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9609 (0.9015)  acc1: 81.6000 (83.1840)  acc5: 96.0000 (96.5440)  time: 0.1934  data: 0.0250  max mem: 28503
Test: Total time: 0:00:10 (0.4146 s / it)
* Acc@1 83.448 Acc@5 96.694 loss 0.891
Accuracy of the model on the 50000 test images: 83.4%
Max accuracy: 83.45%
Epoch: [257]  [   0/1251]  eta: 0:58:30  lr: 0.000229  min_lr: 0.000229  loss: 2.7920 (2.7920)  weight_decay: 0.0500 (0.0500)  time: 2.8065  data: 2.3828  max mem: 28503
Epoch: [257]  [ 200/1251]  eta: 0:06:20  lr: 0.000228  min_lr: 0.000228  loss: 2.8998 (2.7226)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1847 (1.3332)  time: 0.3446  data: 0.0004  max mem: 28503
Epoch: [257]  [ 400/1251]  eta: 0:05:01  lr: 0.000226  min_lr: 0.000226  loss: 2.8153 (2.7281)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3813 (1.4212)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [257]  [ 600/1251]  eta: 0:03:49  lr: 0.000224  min_lr: 0.000224  loss: 2.9677 (2.7348)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1617 (1.3690)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [257]  [ 800/1251]  eta: 0:02:38  lr: 0.000223  min_lr: 0.000223  loss: 2.6989 (2.7392)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4240 (1.4224)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [257]  [1000/1251]  eta: 0:01:27  lr: 0.000221  min_lr: 0.000221  loss: 2.7554 (2.7380)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2561 (1.4184)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [257]  [1200/1251]  eta: 0:00:17  lr: 0.000219  min_lr: 0.000219  loss: 2.7676 (2.7298)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1935 (1.3803)  time: 0.3448  data: 0.0004  max mem: 28503
Epoch: [257]  [1250/1251]  eta: 0:00:00  lr: 0.000219  min_lr: 0.000219  loss: 2.9671 (2.7341)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3291 (1.3812)  time: 0.2924  data: 0.0006  max mem: 28503
Epoch: [257] Total time: 0:07:17 (0.3493 s / it)
Averaged stats: lr: 0.000219  min_lr: 0.000219  loss: 2.9671 (2.7398)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3291 (1.3812)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6248 (0.6248)  acc1: 91.2000 (91.2000)  acc5: 98.8000 (98.8000)  time: 5.6141  data: 5.4141  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8343 (0.7951)  acc1: 86.4000 (86.5091)  acc5: 97.6000 (97.7818)  time: 0.7130  data: 0.5408  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9740 (0.9178)  acc1: 81.2000 (83.2762)  acc5: 96.4000 (96.6095)  time: 0.1957  data: 0.0268  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9911 (0.9306)  acc1: 80.4000 (82.8160)  acc5: 96.4000 (96.5760)  time: 0.1952  data: 0.0267  max mem: 28503
Test: Total time: 0:00:10 (0.4117 s / it)
* Acc@1 83.322 Acc@5 96.604 loss 0.915
Accuracy of the model on the 50000 test images: 83.3%
Max accuracy: 83.45%
Epoch: [258]  [   0/1251]  eta: 1:05:43  lr: 0.000219  min_lr: 0.000219  loss: 2.7756 (2.7756)  weight_decay: 0.0500 (0.0500)  time: 3.1526  data: 2.3670  max mem: 28503
Epoch: [258]  [ 200/1251]  eta: 0:06:23  lr: 0.000217  min_lr: 0.000217  loss: 2.8718 (2.7436)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1111 (1.3326)  time: 0.3476  data: 0.0005  max mem: 28503
Epoch: [258]  [ 400/1251]  eta: 0:05:03  lr: 0.000216  min_lr: 0.000216  loss: 3.0101 (2.7443)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2545 (1.3485)  time: 0.3464  data: 0.0005  max mem: 28503
Epoch: [258]  [ 600/1251]  eta: 0:03:50  lr: 0.000214  min_lr: 0.000214  loss: 2.6104 (2.7390)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1076 (1.3519)  time: 0.3456  data: 0.0005  max mem: 28503
Epoch: [258]  [ 800/1251]  eta: 0:02:39  lr: 0.000212  min_lr: 0.000212  loss: 2.8516 (2.7475)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3421 (1.3351)  time: 0.3484  data: 0.0005  max mem: 28503
Epoch: [258]  [1000/1251]  eta: 0:01:28  lr: 0.000211  min_lr: 0.000211  loss: 2.7673 (2.7450)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2134 (1.3308)  time: 0.3498  data: 0.0005  max mem: 28503
Epoch: [258]  [1200/1251]  eta: 0:00:17  lr: 0.000209  min_lr: 0.000209  loss: 2.7382 (2.7368)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2862 (1.3324)  time: 0.3448  data: 0.0005  max mem: 28503
Epoch: [258]  [1250/1251]  eta: 0:00:00  lr: 0.000209  min_lr: 0.000209  loss: 2.6350 (2.7370)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2769 (1.3373)  time: 0.2917  data: 0.0007  max mem: 28503
Epoch: [258] Total time: 0:07:19 (0.3510 s / it)
Averaged stats: lr: 0.000209  min_lr: 0.000209  loss: 2.6350 (2.7377)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2769 (1.3373)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.5745 (0.5745)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 5.7868  data: 5.5918  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8083 (0.7569)  acc1: 86.0000 (86.3636)  acc5: 98.0000 (97.8909)  time: 0.6910  data: 0.5195  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9149 (0.8713)  acc1: 82.4000 (83.4476)  acc5: 96.4000 (96.6476)  time: 0.1759  data: 0.0062  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9190 (0.8855)  acc1: 81.2000 (82.9920)  acc5: 96.4000 (96.5600)  time: 0.1761  data: 0.0061  max mem: 28503
Test: Total time: 0:00:10 (0.4051 s / it)
* Acc@1 83.450 Acc@5 96.686 loss 0.868
Accuracy of the model on the 50000 test images: 83.5%
Max accuracy: 83.45%
Epoch: [259]  [   0/1251]  eta: 1:06:16  lr: 0.000209  min_lr: 0.000209  loss: 2.9220 (2.9220)  weight_decay: 0.0500 (0.0500)  time: 3.1787  data: 2.7620  max mem: 28503
Epoch: [259]  [ 200/1251]  eta: 0:06:20  lr: 0.000207  min_lr: 0.000207  loss: 2.7954 (2.7534)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2646 (1.3180)  time: 0.3464  data: 0.0004  max mem: 28503
Epoch: [259]  [ 400/1251]  eta: 0:05:04  lr: 0.000206  min_lr: 0.000206  loss: 2.8765 (2.7315)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4246 (1.3688)  time: 0.3474  data: 0.0004  max mem: 28503
Epoch: [259]  [ 600/1251]  eta: 0:03:50  lr: 0.000204  min_lr: 0.000204  loss: 2.8645 (2.7330)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3095 (1.3791)  time: 0.3473  data: 0.0004  max mem: 28503
Epoch: [259]  [ 800/1251]  eta: 0:02:39  lr: 0.000203  min_lr: 0.000203  loss: 2.8240 (2.7289)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4499 (1.4043)  time: 0.3479  data: 0.0004  max mem: 28503
Epoch: [259]  [1000/1251]  eta: 0:01:28  lr: 0.000201  min_lr: 0.000201  loss: 2.4735 (2.7299)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3193 (1.4010)  time: 0.3485  data: 0.0005  max mem: 28503
Epoch: [259]  [1200/1251]  eta: 0:00:17  lr: 0.000199  min_lr: 0.000199  loss: 2.7218 (2.7260)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2820 (1.3913)  time: 0.3621  data: 0.0005  max mem: 28503
Epoch: [259]  [1250/1251]  eta: 0:00:00  lr: 0.000199  min_lr: 0.000199  loss: 2.8599 (2.7281)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4156 (1.3894)  time: 0.2925  data: 0.0006  max mem: 28503
Epoch: [259] Total time: 0:07:20 (0.3519 s / it)
Averaged stats: lr: 0.000199  min_lr: 0.000199  loss: 2.8599 (2.7284)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4156 (1.3894)
Test:  [ 0/25]  eta: 0:01:47  loss: 0.6015 (0.6015)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 4.3026  data: 4.0969  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 0.8112 (0.7644)  acc1: 86.8000 (86.2909)  acc5: 98.0000 (97.9636)  time: 0.6623  data: 0.4903  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9200 (0.8851)  acc1: 81.2000 (83.2571)  acc5: 96.8000 (96.8571)  time: 0.2550  data: 0.0846  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9594 (0.8997)  acc1: 81.2000 (82.8160)  acc5: 96.4000 (96.6880)  time: 0.2073  data: 0.0370  max mem: 28503
Test: Total time: 0:00:10 (0.4069 s / it)
* Acc@1 83.570 Acc@5 96.720 loss 0.885
Accuracy of the model on the 50000 test images: 83.6%
Max accuracy: 83.57%
Epoch: [260]  [   0/1251]  eta: 0:57:09  lr: 0.000199  min_lr: 0.000199  loss: 3.0957 (3.0957)  weight_decay: 0.0500 (0.0500)  time: 2.7412  data: 2.3324  max mem: 28503
Epoch: [260]  [ 200/1251]  eta: 0:06:18  lr: 0.000197  min_lr: 0.000197  loss: 2.8456 (2.6395)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3620 (1.3326)  time: 0.3465  data: 0.0004  max mem: 28503
Epoch: [260]  [ 400/1251]  eta: 0:05:01  lr: 0.000196  min_lr: 0.000196  loss: 2.3960 (2.6591)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2187 (1.3849)  time: 0.3586  data: 0.0004  max mem: 28503
Epoch: [260]  [ 600/1251]  eta: 0:03:50  lr: 0.000194  min_lr: 0.000194  loss: 2.7357 (2.6887)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3231 (1.3802)  time: 0.3465  data: 0.0004  max mem: 28503
Epoch: [260]  [ 800/1251]  eta: 0:02:38  lr: 0.000193  min_lr: 0.000193  loss: 2.9508 (2.7012)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2948 (1.3745)  time: 0.3463  data: 0.0004  max mem: 28503
Epoch: [260]  [1000/1251]  eta: 0:01:28  lr: 0.000191  min_lr: 0.000191  loss: 2.8982 (2.7109)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2802 (1.4034)  time: 0.3640  data: 0.0004  max mem: 28503
Epoch: [260]  [1200/1251]  eta: 0:00:17  lr: 0.000190  min_lr: 0.000190  loss: 2.8333 (2.7176)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.6151 (1.4461)  time: 0.3493  data: 0.0004  max mem: 28503
Epoch: [260]  [1250/1251]  eta: 0:00:00  lr: 0.000189  min_lr: 0.000189  loss: 2.6721 (2.7163)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7111 (1.4573)  time: 0.2939  data: 0.0006  max mem: 28503
Epoch: [260] Total time: 0:07:19 (0.3510 s / it)
Averaged stats: lr: 0.000189  min_lr: 0.000189  loss: 2.6721 (2.7246)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.7111 (1.4573)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.6005 (0.6005)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 5.8075  data: 5.5992  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8002 (0.7660)  acc1: 86.8000 (86.4364)  acc5: 98.0000 (97.9636)  time: 0.7123  data: 0.5393  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9165 (0.8872)  acc1: 81.2000 (83.3905)  acc5: 96.8000 (96.8191)  time: 0.1856  data: 0.0167  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9631 (0.9001)  acc1: 81.2000 (82.9280)  acc5: 96.4000 (96.6880)  time: 0.1845  data: 0.0161  max mem: 28503
Test: Total time: 0:00:10 (0.4117 s / it)
* Acc@1 83.394 Acc@5 96.718 loss 0.885
Accuracy of the model on the 50000 test images: 83.4%
Max accuracy: 83.57%
Epoch: [261]  [   0/1251]  eta: 1:13:25  lr: 0.000189  min_lr: 0.000189  loss: 2.6036 (2.6036)  weight_decay: 0.0500 (0.0500)  time: 3.5215  data: 2.9681  max mem: 28503
Epoch: [261]  [ 200/1251]  eta: 0:06:25  lr: 0.000188  min_lr: 0.000188  loss: 2.9157 (2.7258)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2259 (1.3310)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [261]  [ 400/1251]  eta: 0:05:05  lr: 0.000186  min_lr: 0.000186  loss: 2.6881 (2.7393)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2820 (1.3509)  time: 0.3602  data: 0.0004  max mem: 28503
Epoch: [261]  [ 600/1251]  eta: 0:03:51  lr: 0.000185  min_lr: 0.000185  loss: 2.7528 (2.7289)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3103 (1.3599)  time: 0.3463  data: 0.0004  max mem: 28503
Epoch: [261]  [ 800/1251]  eta: 0:02:39  lr: 0.000183  min_lr: 0.000183  loss: 2.7702 (2.7226)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2837 (1.3787)  time: 0.3469  data: 0.0004  max mem: 28503
Epoch: [261]  [1000/1251]  eta: 0:01:28  lr: 0.000182  min_lr: 0.000182  loss: 2.9058 (2.7229)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2806 (1.3717)  time: 0.3474  data: 0.0004  max mem: 28503
Epoch: [261]  [1200/1251]  eta: 0:00:17  lr: 0.000180  min_lr: 0.000180  loss: 2.8989 (2.7198)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2734 (1.3700)  time: 0.3571  data: 0.0004  max mem: 28503
Epoch: [261]  [1250/1251]  eta: 0:00:00  lr: 0.000180  min_lr: 0.000180  loss: 2.8055 (2.7194)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2060 (1.3632)  time: 0.2941  data: 0.0007  max mem: 28503
Epoch: [261] Total time: 0:07:20 (0.3522 s / it)
Averaged stats: lr: 0.000180  min_lr: 0.000180  loss: 2.8055 (2.7240)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2060 (1.3632)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.5397 (0.5397)  acc1: 90.4000 (90.4000)  acc5: 99.6000 (99.6000)  time: 5.4735  data: 5.2552  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 0.7570 (0.7050)  acc1: 86.4000 (86.5455)  acc5: 97.6000 (97.8909)  time: 0.7350  data: 0.5618  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.8475 (0.8232)  acc1: 81.6000 (83.6191)  acc5: 96.4000 (96.7619)  time: 0.2148  data: 0.0463  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9170 (0.8369)  acc1: 81.6000 (83.1360)  acc5: 96.0000 (96.6240)  time: 0.2146  data: 0.0462  max mem: 28503
Test: Total time: 0:00:10 (0.4217 s / it)
* Acc@1 83.690 Acc@5 96.700 loss 0.825
Accuracy of the model on the 50000 test images: 83.7%
Max accuracy: 83.69%
Epoch: [262]  [   0/1251]  eta: 0:57:57  lr: 0.000180  min_lr: 0.000180  loss: 3.1600 (3.1600)  weight_decay: 0.0500 (0.0500)  time: 2.7798  data: 2.3873  max mem: 28503
Epoch: [262]  [ 200/1251]  eta: 0:06:17  lr: 0.000179  min_lr: 0.000179  loss: 2.6385 (2.6948)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3632 (1.4933)  time: 0.3449  data: 0.0004  max mem: 28503
Epoch: [262]  [ 400/1251]  eta: 0:05:00  lr: 0.000177  min_lr: 0.000177  loss: 2.6778 (2.7260)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2965 (1.4201)  time: 0.3462  data: 0.0004  max mem: 28503
Epoch: [262]  [ 600/1251]  eta: 0:03:48  lr: 0.000176  min_lr: 0.000176  loss: 2.9231 (2.7319)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1328 (1.4000)  time: 0.3464  data: 0.0004  max mem: 28503
Epoch: [262]  [ 800/1251]  eta: 0:02:38  lr: 0.000174  min_lr: 0.000174  loss: 2.8053 (2.7373)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2599 (1.4082)  time: 0.3455  data: 0.0004  max mem: 28503
Epoch: [262]  [1000/1251]  eta: 0:01:27  lr: 0.000173  min_lr: 0.000173  loss: 2.4608 (2.7255)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2830 (1.3980)  time: 0.3491  data: 0.0004  max mem: 28503
Epoch: [262]  [1200/1251]  eta: 0:00:17  lr: 0.000171  min_lr: 0.000171  loss: 2.9950 (2.7243)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3320 (1.3940)  time: 0.3465  data: 0.0004  max mem: 28503
Epoch: [262]  [1250/1251]  eta: 0:00:00  lr: 0.000171  min_lr: 0.000171  loss: 2.8963 (2.7278)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2499 (1.3878)  time: 0.2934  data: 0.0006  max mem: 28503
Epoch: [262] Total time: 0:07:17 (0.3494 s / it)
Averaged stats: lr: 0.000171  min_lr: 0.000171  loss: 2.8963 (2.7284)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2499 (1.3878)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.6625 (0.6625)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.8101  data: 5.6132  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 0.8409 (0.8055)  acc1: 86.8000 (86.7273)  acc5: 98.4000 (97.9636)  time: 0.7388  data: 0.5677  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9400 (0.9316)  acc1: 81.6000 (83.5429)  acc5: 96.0000 (96.7238)  time: 0.2009  data: 0.0316  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9970 (0.9459)  acc1: 81.2000 (82.9600)  acc5: 95.6000 (96.5600)  time: 0.2008  data: 0.0315  max mem: 28503
Test: Total time: 0:00:10 (0.4233 s / it)
* Acc@1 83.510 Acc@5 96.668 loss 0.936
Accuracy of the model on the 50000 test images: 83.5%
Max accuracy: 83.69%
Epoch: [263]  [   0/1251]  eta: 1:09:40  lr: 0.000171  min_lr: 0.000171  loss: 2.7406 (2.7406)  weight_decay: 0.0500 (0.0500)  time: 3.3418  data: 2.4976  max mem: 28503
Epoch: [263]  [ 200/1251]  eta: 0:06:25  lr: 0.000169  min_lr: 0.000169  loss: 2.5603 (2.7336)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2865 (1.3258)  time: 0.3464  data: 0.0004  max mem: 28503
Epoch: [263]  [ 400/1251]  eta: 0:05:04  lr: 0.000168  min_lr: 0.000168  loss: 2.8790 (2.7268)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2253 (1.3256)  time: 0.3469  data: 0.0004  max mem: 28503
Epoch: [263]  [ 600/1251]  eta: 0:03:51  lr: 0.000167  min_lr: 0.000167  loss: 2.5191 (2.7420)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5622 (1.4352)  time: 0.3463  data: 0.0004  max mem: 28503
Epoch: [263]  [ 800/1251]  eta: 0:02:39  lr: 0.000165  min_lr: 0.000165  loss: 2.6522 (2.7310)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1437 (1.3832)  time: 0.3476  data: 0.0004  max mem: 28503
Epoch: [263]  [1000/1251]  eta: 0:01:28  lr: 0.000164  min_lr: 0.000164  loss: 2.6784 (2.7267)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5432 (1.3866)  time: 0.3591  data: 0.0004  max mem: 28503
Epoch: [263]  [1200/1251]  eta: 0:00:17  lr: 0.000162  min_lr: 0.000162  loss: 2.6874 (2.7251)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1697 (1.3867)  time: 0.3474  data: 0.0004  max mem: 28503
Epoch: [263]  [1250/1251]  eta: 0:00:00  lr: 0.000162  min_lr: 0.000162  loss: 2.8427 (2.7246)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1756 (1.3872)  time: 0.2939  data: 0.0005  max mem: 28503
Epoch: [263] Total time: 0:07:20 (0.3518 s / it)
Averaged stats: lr: 0.000162  min_lr: 0.000162  loss: 2.8427 (2.7111)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1756 (1.3872)
Test:  [ 0/25]  eta: 0:02:28  loss: 0.5155 (0.5155)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 5.9289  data: 5.7439  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 0.7167 (0.6861)  acc1: 87.2000 (86.6545)  acc5: 98.0000 (97.9273)  time: 0.7444  data: 0.5744  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.8668 (0.8084)  acc1: 82.0000 (83.5429)  acc5: 96.8000 (96.8762)  time: 0.1995  data: 0.0311  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.8769 (0.8222)  acc1: 80.8000 (83.0880)  acc5: 96.4000 (96.7680)  time: 0.1994  data: 0.0311  max mem: 28503
Test: Total time: 0:00:10 (0.4273 s / it)
* Acc@1 83.774 Acc@5 96.746 loss 0.811
Accuracy of the model on the 50000 test images: 83.8%
Max accuracy: 83.77%
Epoch: [264]  [   0/1251]  eta: 1:02:25  lr: 0.000162  min_lr: 0.000162  loss: 2.8331 (2.8331)  weight_decay: 0.0500 (0.0500)  time: 2.9937  data: 2.5611  max mem: 28503
Epoch: [264]  [ 200/1251]  eta: 0:06:20  lr: 0.000160  min_lr: 0.000160  loss: 2.8524 (2.7266)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2297 (1.3238)  time: 0.3565  data: 0.0004  max mem: 28503
Epoch: [264]  [ 400/1251]  eta: 0:05:02  lr: 0.000159  min_lr: 0.000159  loss: 2.9231 (2.7152)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2517 (1.2948)  time: 0.3459  data: 0.0004  max mem: 28503
Epoch: [264]  [ 600/1251]  eta: 0:03:50  lr: 0.000158  min_lr: 0.000158  loss: 2.6162 (2.7092)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1486 (1.3116)  time: 0.3554  data: 0.0004  max mem: 28503
Epoch: [264]  [ 800/1251]  eta: 0:02:39  lr: 0.000156  min_lr: 0.000156  loss: 2.7933 (2.7133)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1796 (1.3205)  time: 0.3586  data: 0.0004  max mem: 28503
Epoch: [264]  [1000/1251]  eta: 0:01:28  lr: 0.000155  min_lr: 0.000155  loss: 2.6086 (2.7109)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2856 (1.3185)  time: 0.3501  data: 0.0004  max mem: 28503
Epoch: [264]  [1200/1251]  eta: 0:00:17  lr: 0.000154  min_lr: 0.000154  loss: 2.7010 (2.7113)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1771 (1.3344)  time: 0.3569  data: 0.0004  max mem: 28503
Epoch: [264]  [1250/1251]  eta: 0:00:00  lr: 0.000153  min_lr: 0.000153  loss: 2.8394 (2.7144)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2685 (1.3395)  time: 0.2946  data: 0.0007  max mem: 28503
Epoch: [264] Total time: 0:07:19 (0.3513 s / it)
Averaged stats: lr: 0.000153  min_lr: 0.000153  loss: 2.8394 (2.7189)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2685 (1.3395)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.5816 (0.5816)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.6522  data: 5.4497  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8093 (0.7609)  acc1: 87.2000 (86.5455)  acc5: 98.4000 (98.0000)  time: 0.6824  data: 0.5091  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9174 (0.8839)  acc1: 81.2000 (83.3905)  acc5: 96.4000 (96.7810)  time: 0.1769  data: 0.0076  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9674 (0.8986)  acc1: 80.8000 (82.9120)  acc5: 96.0000 (96.5440)  time: 0.1759  data: 0.0075  max mem: 28503
Test: Total time: 0:00:09 (0.3989 s / it)
* Acc@1 83.620 Acc@5 96.708 loss 0.885
Accuracy of the model on the 50000 test images: 83.6%
Max accuracy: 83.77%
Epoch: [265]  [   0/1251]  eta: 1:09:26  lr: 0.000153  min_lr: 0.000153  loss: 2.7765 (2.7765)  weight_decay: 0.0500 (0.0500)  time: 3.3306  data: 2.5389  max mem: 28503
Epoch: [265]  [ 200/1251]  eta: 0:06:23  lr: 0.000152  min_lr: 0.000152  loss: 2.5748 (2.6614)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2984 (1.4080)  time: 0.3463  data: 0.0004  max mem: 28503
Epoch: [265]  [ 400/1251]  eta: 0:05:04  lr: 0.000150  min_lr: 0.000150  loss: 2.8950 (2.6758)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1748 (1.3629)  time: 0.3471  data: 0.0004  max mem: 28503
Epoch: [265]  [ 600/1251]  eta: 0:03:51  lr: 0.000149  min_lr: 0.000149  loss: 2.9805 (2.6982)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1325 (1.3348)  time: 0.3475  data: 0.0004  max mem: 28503
Epoch: [265]  [ 800/1251]  eta: 0:02:39  lr: 0.000148  min_lr: 0.000148  loss: 2.9451 (2.7094)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3670 (1.3604)  time: 0.3465  data: 0.0004  max mem: 28503
Epoch: [265]  [1000/1251]  eta: 0:01:28  lr: 0.000146  min_lr: 0.000146  loss: 2.8226 (2.7112)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1700 (1.3580)  time: 0.3477  data: 0.0004  max mem: 28503
Epoch: [265]  [1200/1251]  eta: 0:00:17  lr: 0.000145  min_lr: 0.000145  loss: 2.8123 (2.7149)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2420 (1.3578)  time: 0.3486  data: 0.0004  max mem: 28503
Epoch: [265]  [1250/1251]  eta: 0:00:00  lr: 0.000145  min_lr: 0.000145  loss: 2.9199 (2.7121)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3134 (1.3626)  time: 0.2944  data: 0.0007  max mem: 28503
Epoch: [265] Total time: 0:07:20 (0.3518 s / it)
Averaged stats: lr: 0.000145  min_lr: 0.000145  loss: 2.9199 (2.7077)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3134 (1.3626)
Test:  [ 0/25]  eta: 0:02:29  loss: 0.6053 (0.6053)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 5.9747  data: 5.7783  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.7977 (0.7704)  acc1: 87.6000 (86.5091)  acc5: 98.4000 (98.0364)  time: 0.7113  data: 0.5394  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9385 (0.8960)  acc1: 82.0000 (83.5238)  acc5: 96.4000 (96.7238)  time: 0.1767  data: 0.0078  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9570 (0.9112)  acc1: 81.2000 (83.0720)  acc5: 96.0000 (96.5440)  time: 0.1761  data: 0.0077  max mem: 28503
Test: Total time: 0:00:10 (0.4136 s / it)
* Acc@1 83.636 Acc@5 96.668 loss 0.900
Accuracy of the model on the 50000 test images: 83.6%
Max accuracy: 83.77%
Epoch: [266]  [   0/1251]  eta: 1:08:52  lr: 0.000145  min_lr: 0.000145  loss: 2.9046 (2.9046)  weight_decay: 0.0500 (0.0500)  time: 3.3035  data: 2.7764  max mem: 28503
Epoch: [266]  [ 200/1251]  eta: 0:06:22  lr: 0.000143  min_lr: 0.000143  loss: 2.7828 (2.6553)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4019 (1.5571)  time: 0.3471  data: 0.0004  max mem: 28503
Epoch: [266]  [ 400/1251]  eta: 0:05:04  lr: 0.000142  min_lr: 0.000142  loss: 2.6715 (2.6605)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3089 (1.4567)  time: 0.3576  data: 0.0004  max mem: 28503
Epoch: [266]  [ 600/1251]  eta: 0:03:50  lr: 0.000141  min_lr: 0.000141  loss: 2.9160 (2.6816)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2386 (1.4395)  time: 0.3488  data: 0.0004  max mem: 28503
Epoch: [266]  [ 800/1251]  eta: 0:02:39  lr: 0.000139  min_lr: 0.000139  loss: 2.6806 (2.6909)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2660 (1.4343)  time: 0.3578  data: 0.0004  max mem: 28503
Epoch: [266]  [1000/1251]  eta: 0:01:28  lr: 0.000138  min_lr: 0.000138  loss: 2.7290 (2.6954)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2886 (1.4374)  time: 0.3569  data: 0.0005  max mem: 28503
Epoch: [266]  [1200/1251]  eta: 0:00:17  lr: 0.000137  min_lr: 0.000137  loss: 2.8256 (2.6979)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2345 (1.4392)  time: 0.3475  data: 0.0004  max mem: 28503
Epoch: [266]  [1250/1251]  eta: 0:00:00  lr: 0.000137  min_lr: 0.000137  loss: 2.8728 (2.7014)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2882 (1.4455)  time: 0.2950  data: 0.0007  max mem: 28503
Epoch: [266] Total time: 0:07:20 (0.3522 s / it)
Averaged stats: lr: 0.000137  min_lr: 0.000137  loss: 2.8728 (2.7139)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2882 (1.4455)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.6233 (0.6233)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.4693  data: 5.2704  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8229 (0.7818)  acc1: 87.2000 (86.6546)  acc5: 98.4000 (98.1091)  time: 0.7138  data: 0.5403  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9414 (0.9041)  acc1: 81.6000 (83.4476)  acc5: 96.4000 (96.8571)  time: 0.2033  data: 0.0337  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9731 (0.9196)  acc1: 81.2000 (82.9920)  acc5: 96.0000 (96.6720)  time: 0.2020  data: 0.0336  max mem: 28503
Test: Total time: 0:00:10 (0.4118 s / it)
* Acc@1 83.592 Acc@5 96.686 loss 0.908
Accuracy of the model on the 50000 test images: 83.6%
Max accuracy: 83.77%
Epoch: [267]  [   0/1251]  eta: 1:16:21  lr: 0.000136  min_lr: 0.000136  loss: 3.0425 (3.0425)  weight_decay: 0.0500 (0.0500)  time: 3.6622  data: 1.5772  max mem: 28503
Epoch: [267]  [ 200/1251]  eta: 0:06:24  lr: 0.000135  min_lr: 0.000135  loss: 2.7680 (2.6028)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4273 (1.4745)  time: 0.3469  data: 0.0004  max mem: 28503
Epoch: [267]  [ 400/1251]  eta: 0:05:04  lr: 0.000134  min_lr: 0.000134  loss: 2.8828 (2.6361)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2438 (1.4208)  time: 0.3479  data: 0.0004  max mem: 28503
Epoch: [267]  [ 600/1251]  eta: 0:03:51  lr: 0.000133  min_lr: 0.000133  loss: 2.9076 (2.6660)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2092 (1.4498)  time: 0.3586  data: 0.0004  max mem: 28503
Epoch: [267]  [ 800/1251]  eta: 0:02:39  lr: 0.000131  min_lr: 0.000131  loss: 2.8154 (2.6927)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2397 (1.4113)  time: 0.3596  data: 0.0004  max mem: 28503
Epoch: [267]  [1000/1251]  eta: 0:01:28  lr: 0.000130  min_lr: 0.000130  loss: 2.7940 (2.7012)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1236 (1.3819)  time: 0.3485  data: 0.0004  max mem: 28503
Epoch: [267]  [1200/1251]  eta: 0:00:17  lr: 0.000129  min_lr: 0.000129  loss: 2.7264 (2.6969)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2500 (1.3937)  time: 0.3466  data: 0.0004  max mem: 28503
Epoch: [267]  [1250/1251]  eta: 0:00:00  lr: 0.000129  min_lr: 0.000129  loss: 2.7632 (2.6952)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2226 (1.3949)  time: 0.2923  data: 0.0008  max mem: 28503
Epoch: [267] Total time: 0:07:20 (0.3520 s / it)
Averaged stats: lr: 0.000129  min_lr: 0.000129  loss: 2.7632 (2.7043)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2226 (1.3949)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.5624 (0.5624)  acc1: 90.0000 (90.0000)  acc5: 98.8000 (98.8000)  time: 5.3002  data: 5.0903  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.7596 (0.7164)  acc1: 86.8000 (86.2546)  acc5: 98.0000 (98.0000)  time: 0.7120  data: 0.5398  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.8781 (0.8405)  acc1: 81.2000 (83.0857)  acc5: 96.4000 (96.7429)  time: 0.2108  data: 0.0424  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9282 (0.8555)  acc1: 80.4000 (82.5600)  acc5: 95.6000 (96.5760)  time: 0.2106  data: 0.0423  max mem: 28503
Test: Total time: 0:00:10 (0.4118 s / it)
* Acc@1 83.602 Acc@5 96.664 loss 0.841
Accuracy of the model on the 50000 test images: 83.6%
Max accuracy: 83.77%
Epoch: [268]  [   0/1251]  eta: 1:09:34  lr: 0.000128  min_lr: 0.000128  loss: 2.9992 (2.9992)  weight_decay: 0.0500 (0.0500)  time: 3.3365  data: 2.2851  max mem: 28503
Epoch: [268]  [ 200/1251]  eta: 0:06:22  lr: 0.000127  min_lr: 0.000127  loss: 2.7061 (2.7601)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3448 (1.3412)  time: 0.3462  data: 0.0004  max mem: 28503
Epoch: [268]  [ 400/1251]  eta: 0:05:04  lr: 0.000126  min_lr: 0.000126  loss: 2.7753 (2.7313)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3329 (1.3530)  time: 0.3448  data: 0.0005  max mem: 28503
Epoch: [268]  [ 600/1251]  eta: 0:03:50  lr: 0.000125  min_lr: 0.000125  loss: 3.0228 (2.7341)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3303 (1.3707)  time: 0.3600  data: 0.0005  max mem: 28503
Epoch: [268]  [ 800/1251]  eta: 0:02:39  lr: 0.000123  min_lr: 0.000123  loss: 2.5889 (2.7200)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3269 (1.3845)  time: 0.3492  data: 0.0004  max mem: 28503
Epoch: [268]  [1000/1251]  eta: 0:01:28  lr: 0.000122  min_lr: 0.000122  loss: 2.6458 (2.7134)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3030 (1.4032)  time: 0.3481  data: 0.0005  max mem: 28503
Epoch: [268]  [1200/1251]  eta: 0:00:17  lr: 0.000121  min_lr: 0.000121  loss: 2.8342 (2.7136)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2977 (1.3769)  time: 0.3722  data: 0.0004  max mem: 28503
Epoch: [268]  [1250/1251]  eta: 0:00:00  lr: 0.000121  min_lr: 0.000121  loss: 2.6235 (2.7118)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4628 (1.3813)  time: 0.2949  data: 0.0007  max mem: 28503
Epoch: [268] Total time: 0:07:21 (0.3526 s / it)
Averaged stats: lr: 0.000121  min_lr: 0.000121  loss: 2.6235 (2.6999)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4628 (1.3813)
Test:  [ 0/25]  eta: 0:02:24  loss: 0.5579 (0.5579)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 5.7980  data: 5.6022  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.7600 (0.7123)  acc1: 87.2000 (86.8000)  acc5: 98.0000 (97.8909)  time: 0.6819  data: 0.5108  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.8427 (0.8351)  acc1: 81.2000 (83.7905)  acc5: 96.4000 (96.7238)  time: 0.1775  data: 0.0090  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9230 (0.8515)  acc1: 81.2000 (83.2000)  acc5: 95.6000 (96.4640)  time: 0.1795  data: 0.0111  max mem: 28503
Test: Total time: 0:00:10 (0.4110 s / it)
* Acc@1 83.746 Acc@5 96.714 loss 0.839
Accuracy of the model on the 50000 test images: 83.7%
Max accuracy: 83.77%
Epoch: [269]  [   0/1251]  eta: 1:13:10  lr: 0.000121  min_lr: 0.000121  loss: 2.1377 (2.1377)  weight_decay: 0.0500 (0.0500)  time: 3.5097  data: 2.3440  max mem: 28503
Epoch: [269]  [ 200/1251]  eta: 0:06:23  lr: 0.000120  min_lr: 0.000120  loss: 2.8804 (2.6958)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3552 (1.3471)  time: 0.3472  data: 0.0004  max mem: 28503
Epoch: [269]  [ 400/1251]  eta: 0:05:02  lr: 0.000118  min_lr: 0.000118  loss: 2.7656 (2.6959)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2852 (1.3469)  time: 0.3471  data: 0.0004  max mem: 28503
Epoch: [269]  [ 600/1251]  eta: 0:03:50  lr: 0.000117  min_lr: 0.000117  loss: 2.8135 (2.6890)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2322 (1.3597)  time: 0.3588  data: 0.0004  max mem: 28503
Epoch: [269]  [ 800/1251]  eta: 0:02:39  lr: 0.000116  min_lr: 0.000116  loss: 2.7630 (2.6937)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2271 (1.3725)  time: 0.3478  data: 0.0004  max mem: 28503
Epoch: [269]  [1000/1251]  eta: 0:01:28  lr: 0.000115  min_lr: 0.000115  loss: 2.8463 (2.6954)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2743 (1.3623)  time: 0.3488  data: 0.0004  max mem: 28503
Epoch: [269]  [1200/1251]  eta: 0:00:17  lr: 0.000113  min_lr: 0.000113  loss: 2.5093 (2.6887)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2994 (1.3605)  time: 0.3483  data: 0.0004  max mem: 28503
Epoch: [269]  [1250/1251]  eta: 0:00:00  lr: 0.000113  min_lr: 0.000113  loss: 2.9252 (2.6874)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3482 (1.3644)  time: 0.2952  data: 0.0005  max mem: 28503
Epoch: [269] Total time: 0:07:20 (0.3521 s / it)
Averaged stats: lr: 0.000113  min_lr: 0.000113  loss: 2.9252 (2.6925)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3482 (1.3644)
Test:  [ 0/25]  eta: 0:02:15  loss: 0.5875 (0.5875)  acc1: 90.8000 (90.8000)  acc5: 99.2000 (99.2000)  time: 5.4073  data: 5.2158  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.7790 (0.7413)  acc1: 87.2000 (86.8727)  acc5: 98.0000 (98.0364)  time: 0.7074  data: 0.5365  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.8929 (0.8608)  acc1: 82.4000 (83.9048)  acc5: 96.4000 (96.8000)  time: 0.2030  data: 0.0343  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9309 (0.8755)  acc1: 82.0000 (83.3280)  acc5: 96.0000 (96.6560)  time: 0.2027  data: 0.0342  max mem: 28503
Test: Total time: 0:00:10 (0.4092 s / it)
* Acc@1 83.696 Acc@5 96.706 loss 0.864
Accuracy of the model on the 50000 test images: 83.7%
Max accuracy: 83.77%
Epoch: [270]  [   0/1251]  eta: 1:10:52  lr: 0.000113  min_lr: 0.000113  loss: 2.7118 (2.7118)  weight_decay: 0.0500 (0.0500)  time: 3.3995  data: 1.7692  max mem: 28503
Epoch: [270]  [ 200/1251]  eta: 0:06:25  lr: 0.000112  min_lr: 0.000112  loss: 2.8777 (2.7154)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4264 (1.4007)  time: 0.3473  data: 0.0004  max mem: 28503
Epoch: [270]  [ 400/1251]  eta: 0:05:04  lr: 0.000111  min_lr: 0.000111  loss: 2.7728 (2.7317)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2512 (1.4569)  time: 0.3485  data: 0.0004  max mem: 28503
Epoch: [270]  [ 600/1251]  eta: 0:03:51  lr: 0.000110  min_lr: 0.000110  loss: 2.7402 (2.7119)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3171 (1.4153)  time: 0.3571  data: 0.0004  max mem: 28503
Epoch: [270]  [ 800/1251]  eta: 0:02:39  lr: 0.000109  min_lr: 0.000109  loss: 2.7226 (2.7076)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2544 (1.3891)  time: 0.3500  data: 0.0008  max mem: 28503
Epoch: [270]  [1000/1251]  eta: 0:01:28  lr: 0.000107  min_lr: 0.000107  loss: 2.8549 (2.7105)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4056 (nan)  time: 0.3491  data: 0.0004  max mem: 28503
Epoch: [270]  [1200/1251]  eta: 0:00:18  lr: 0.000106  min_lr: 0.000106  loss: 2.7485 (2.7057)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3025 (nan)  time: 0.3501  data: 0.0004  max mem: 28503
Epoch: [270]  [1250/1251]  eta: 0:00:00  lr: 0.000106  min_lr: 0.000106  loss: 2.8910 (2.7078)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2885 (nan)  time: 0.2953  data: 0.0010  max mem: 28503
Epoch: [270] Total time: 0:07:21 (0.3527 s / it)
Averaged stats: lr: 0.000106  min_lr: 0.000106  loss: 2.8910 (2.6974)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2885 (nan)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6352 (0.6352)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.5780  data: 5.3766  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 0.8165 (0.7854)  acc1: 87.2000 (86.5091)  acc5: 98.0000 (97.8909)  time: 0.7470  data: 0.5741  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9321 (0.9080)  acc1: 81.6000 (83.5429)  acc5: 96.4000 (96.6857)  time: 0.2238  data: 0.0546  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9936 (0.9221)  acc1: 81.6000 (83.0240)  acc5: 96.0000 (96.4960)  time: 0.2230  data: 0.0545  max mem: 28503
Test: Total time: 0:00:10 (0.4321 s / it)
* Acc@1 83.646 Acc@5 96.744 loss 0.911
Accuracy of the model on the 50000 test images: 83.6%
Max accuracy: 83.77%
Epoch: [271]  [   0/1251]  eta: 1:10:17  lr: 0.000106  min_lr: 0.000106  loss: 3.0551 (3.0551)  weight_decay: 0.0500 (0.0500)  time: 3.3711  data: 1.5855  max mem: 28503
Epoch: [271]  [ 200/1251]  eta: 0:06:23  lr: 0.000105  min_lr: 0.000105  loss: 2.8275 (2.6795)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3846 (1.3748)  time: 0.3473  data: 0.0004  max mem: 28503
Epoch: [271]  [ 400/1251]  eta: 0:05:03  lr: 0.000104  min_lr: 0.000104  loss: 2.8557 (2.6981)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3422 (1.4616)  time: 0.3452  data: 0.0004  max mem: 28503
Epoch: [271]  [ 600/1251]  eta: 0:03:50  lr: 0.000102  min_lr: 0.000102  loss: 2.8314 (2.6954)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5636 (1.4579)  time: 0.3457  data: 0.0004  max mem: 28503
Epoch: [271]  [ 800/1251]  eta: 0:02:39  lr: 0.000101  min_lr: 0.000101  loss: 2.9228 (2.6977)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3415 (1.4373)  time: 0.3563  data: 0.0004  max mem: 28503
Epoch: [271]  [1000/1251]  eta: 0:01:28  lr: 0.000100  min_lr: 0.000100  loss: 2.6137 (2.7013)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2458 (1.4418)  time: 0.3473  data: 0.0004  max mem: 28503
Epoch: [271]  [1200/1251]  eta: 0:00:17  lr: 0.000099  min_lr: 0.000099  loss: 2.7558 (2.6962)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2104 (1.4176)  time: 0.3479  data: 0.0004  max mem: 28503
Epoch: [271]  [1250/1251]  eta: 0:00:00  lr: 0.000099  min_lr: 0.000099  loss: 2.8397 (2.6970)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1393 (1.4096)  time: 0.3023  data: 0.0005  max mem: 28503
Epoch: [271] Total time: 0:07:19 (0.3512 s / it)
Averaged stats: lr: 0.000099  min_lr: 0.000099  loss: 2.8397 (2.6956)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1393 (1.4096)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.6081 (0.6081)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 5.7435  data: 5.5251  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8003 (0.7653)  acc1: 87.2000 (86.4727)  acc5: 98.0000 (97.9273)  time: 0.7014  data: 0.5283  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9217 (0.8910)  acc1: 81.2000 (83.5048)  acc5: 96.4000 (96.7429)  time: 0.1890  data: 0.0192  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9749 (0.9044)  acc1: 81.2000 (83.0080)  acc5: 95.6000 (96.6240)  time: 0.1888  data: 0.0190  max mem: 28503
Test: Total time: 0:00:10 (0.4110 s / it)
* Acc@1 83.654 Acc@5 96.696 loss 0.894
Accuracy of the model on the 50000 test images: 83.7%
Max accuracy: 83.77%
Epoch: [272]  [   0/1251]  eta: 1:09:26  lr: 0.000099  min_lr: 0.000099  loss: 2.4400 (2.4400)  weight_decay: 0.0500 (0.0500)  time: 3.3303  data: 2.7767  max mem: 28503
Epoch: [272]  [ 200/1251]  eta: 0:06:24  lr: 0.000098  min_lr: 0.000098  loss: 2.7186 (2.6545)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3641 (1.3618)  time: 0.3576  data: 0.0004  max mem: 28503
Epoch: [272]  [ 400/1251]  eta: 0:05:04  lr: 0.000097  min_lr: 0.000097  loss: 2.8802 (2.6903)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1525 (1.3810)  time: 0.3477  data: 0.0004  max mem: 28503
Epoch: [272]  [ 600/1251]  eta: 0:03:51  lr: 0.000096  min_lr: 0.000096  loss: 2.3152 (2.6634)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3669 (1.3537)  time: 0.3580  data: 0.0004  max mem: 28503
Epoch: [272]  [ 800/1251]  eta: 0:02:39  lr: 0.000094  min_lr: 0.000094  loss: 2.5277 (2.6624)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4482 (1.4133)  time: 0.3559  data: 0.0004  max mem: 28503
Epoch: [272]  [1000/1251]  eta: 0:01:28  lr: 0.000093  min_lr: 0.000093  loss: 2.8067 (2.6731)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1521 (1.3941)  time: 0.3456  data: 0.0004  max mem: 28503
Epoch: [272]  [1200/1251]  eta: 0:00:17  lr: 0.000092  min_lr: 0.000092  loss: 2.7585 (2.6838)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.3479  data: 0.0004  max mem: 28503
Epoch: [272]  [1250/1251]  eta: 0:00:00  lr: 0.000092  min_lr: 0.000092  loss: 2.6494 (2.6810)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.2980  data: 0.0007  max mem: 28503
Epoch: [272] Total time: 0:07:20 (0.3519 s / it)
Averaged stats: lr: 0.000092  min_lr: 0.000092  loss: 2.6494 (2.6806)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.5500 (0.5500)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.5449  data: 5.3450  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.7390 (0.7034)  acc1: 87.2000 (87.0909)  acc5: 98.0000 (98.0000)  time: 0.7229  data: 0.5509  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.8565 (0.8276)  acc1: 82.0000 (83.7905)  acc5: 96.8000 (96.7619)  time: 0.2045  data: 0.0358  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9169 (0.8421)  acc1: 81.6000 (83.3440)  acc5: 96.0000 (96.5600)  time: 0.2040  data: 0.0357  max mem: 28503
Test: Total time: 0:00:10 (0.4154 s / it)
* Acc@1 83.792 Acc@5 96.714 loss 0.832
Accuracy of the model on the 50000 test images: 83.8%
Max accuracy: 83.79%
Epoch: [273]  [   0/1251]  eta: 1:06:49  lr: 0.000092  min_lr: 0.000092  loss: 2.7317 (2.7317)  weight_decay: 0.0500 (0.0500)  time: 3.2047  data: 2.8466  max mem: 28503
Epoch: [273]  [ 200/1251]  eta: 0:06:21  lr: 0.000091  min_lr: 0.000091  loss: 2.8577 (2.6975)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3383 (1.4505)  time: 0.3550  data: 0.0004  max mem: 28503
Epoch: [273]  [ 400/1251]  eta: 0:05:02  lr: 0.000090  min_lr: 0.000090  loss: 2.6782 (2.6674)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2029 (1.3906)  time: 0.3477  data: 0.0004  max mem: 28503
Epoch: [273]  [ 600/1251]  eta: 0:03:50  lr: 0.000089  min_lr: 0.000089  loss: 2.5734 (2.6758)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1670 (1.3624)  time: 0.3479  data: 0.0004  max mem: 28503
Epoch: [273]  [ 800/1251]  eta: 0:02:39  lr: 0.000088  min_lr: 0.000088  loss: 2.7366 (2.6688)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2820 (1.3732)  time: 0.3477  data: 0.0004  max mem: 28503
Epoch: [273]  [1000/1251]  eta: 0:01:28  lr: 0.000087  min_lr: 0.000087  loss: 2.4703 (2.6657)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1003 (1.3505)  time: 0.3484  data: 0.0005  max mem: 28503
Epoch: [273]  [1200/1251]  eta: 0:00:17  lr: 0.000086  min_lr: 0.000086  loss: 2.7761 (2.6664)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2224 (1.3501)  time: 0.3481  data: 0.0004  max mem: 28503
Epoch: [273]  [1250/1251]  eta: 0:00:00  lr: 0.000085  min_lr: 0.000085  loss: 2.4862 (2.6656)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2224 (1.3475)  time: 0.2947  data: 0.0006  max mem: 28503
Epoch: [273] Total time: 0:07:20 (0.3517 s / it)
Averaged stats: lr: 0.000085  min_lr: 0.000085  loss: 2.4862 (2.6898)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2224 (1.3475)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.5393 (0.5393)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.6329  data: 5.4349  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 0.7289 (0.6988)  acc1: 87.6000 (86.9091)  acc5: 98.0000 (98.0727)  time: 0.7698  data: 0.5967  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.8494 (0.8216)  acc1: 81.6000 (83.7905)  acc5: 96.8000 (96.8191)  time: 0.2260  data: 0.0565  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9208 (0.8369)  acc1: 81.2000 (83.2640)  acc5: 96.0000 (96.6560)  time: 0.2248  data: 0.0564  max mem: 28503
Test: Total time: 0:00:10 (0.4370 s / it)
* Acc@1 83.696 Acc@5 96.740 loss 0.827
Accuracy of the model on the 50000 test images: 83.7%
Max accuracy: 83.79%
Epoch: [274]  [   0/1251]  eta: 1:08:17  lr: 0.000085  min_lr: 0.000085  loss: 3.0894 (3.0894)  weight_decay: 0.0500 (0.0500)  time: 3.2751  data: 2.1067  max mem: 28503
Epoch: [274]  [ 200/1251]  eta: 0:06:23  lr: 0.000084  min_lr: 0.000084  loss: 2.7301 (2.6983)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4154 (1.5871)  time: 0.3477  data: 0.0004  max mem: 28503
Epoch: [274]  [ 400/1251]  eta: 0:05:04  lr: 0.000083  min_lr: 0.000083  loss: 2.6609 (2.6817)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1571 (1.4516)  time: 0.3466  data: 0.0004  max mem: 28503
Epoch: [274]  [ 600/1251]  eta: 0:03:50  lr: 0.000082  min_lr: 0.000082  loss: 2.5258 (2.6894)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1989 (1.4416)  time: 0.3452  data: 0.0005  max mem: 28503
Epoch: [274]  [ 800/1251]  eta: 0:02:39  lr: 0.000081  min_lr: 0.000081  loss: 2.7596 (2.6906)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3205 (1.4297)  time: 0.3458  data: 0.0004  max mem: 28503
Epoch: [274]  [1000/1251]  eta: 0:01:28  lr: 0.000080  min_lr: 0.000080  loss: 2.3902 (2.6760)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3095 (1.4181)  time: 0.3471  data: 0.0004  max mem: 28503
Epoch: [274]  [1200/1251]  eta: 0:00:17  lr: 0.000079  min_lr: 0.000079  loss: 2.6469 (2.6801)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4110 (1.4266)  time: 0.3462  data: 0.0004  max mem: 28503
Epoch: [274]  [1250/1251]  eta: 0:00:00  lr: 0.000079  min_lr: 0.000079  loss: 2.8239 (2.6817)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4110 (1.4248)  time: 0.2922  data: 0.0007  max mem: 28503
Epoch: [274] Total time: 0:07:19 (0.3513 s / it)
Averaged stats: lr: 0.000079  min_lr: 0.000079  loss: 2.8239 (2.6840)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4110 (1.4248)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.5740 (0.5740)  acc1: 90.4000 (90.4000)  acc5: 99.2000 (99.2000)  time: 5.5549  data: 5.3523  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 0.7747 (0.7305)  acc1: 87.2000 (86.9818)  acc5: 98.0000 (98.0000)  time: 0.6592  data: 0.4869  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.8858 (0.8522)  acc1: 82.4000 (83.6571)  acc5: 96.8000 (96.7619)  time: 0.1728  data: 0.0040  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9338 (0.8681)  acc1: 80.8000 (83.2000)  acc5: 96.8000 (96.6560)  time: 0.1723  data: 0.0039  max mem: 28503
Test: Total time: 0:00:09 (0.3912 s / it)
* Acc@1 83.652 Acc@5 96.736 loss 0.858
Accuracy of the model on the 50000 test images: 83.7%
Max accuracy: 83.79%
Epoch: [275]  [   0/1251]  eta: 1:08:57  lr: 0.000079  min_lr: 0.000079  loss: 2.6851 (2.6851)  weight_decay: 0.0500 (0.0500)  time: 3.3072  data: 1.7923  max mem: 28503
Epoch: [275]  [ 200/1251]  eta: 0:06:24  lr: 0.000078  min_lr: 0.000078  loss: 2.7005 (2.6823)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2795 (1.4042)  time: 0.3573  data: 0.0004  max mem: 28503
Epoch: [275]  [ 400/1251]  eta: 0:05:03  lr: 0.000077  min_lr: 0.000077  loss: 2.8256 (2.7040)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4444 (1.4306)  time: 0.3474  data: 0.0004  max mem: 28503
Epoch: [275]  [ 600/1251]  eta: 0:03:50  lr: 0.000076  min_lr: 0.000076  loss: 2.7936 (2.7028)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2031 (1.4223)  time: 0.3460  data: 0.0004  max mem: 28503
Epoch: [275]  [ 800/1251]  eta: 0:02:39  lr: 0.000075  min_lr: 0.000075  loss: 2.8688 (2.6955)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2638 (1.4104)  time: 0.3473  data: 0.0004  max mem: 28503
Epoch: [275]  [1000/1251]  eta: 0:01:28  lr: 0.000074  min_lr: 0.000074  loss: 2.8484 (2.6965)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3058 (1.4261)  time: 0.3480  data: 0.0004  max mem: 28503
Epoch: [275]  [1200/1251]  eta: 0:00:17  lr: 0.000073  min_lr: 0.000073  loss: 2.9142 (2.6944)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2896 (1.4274)  time: 0.3488  data: 0.0004  max mem: 28503
Epoch: [275]  [1250/1251]  eta: 0:00:00  lr: 0.000073  min_lr: 0.000073  loss: 2.8434 (2.6952)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3080 (1.4297)  time: 0.2944  data: 0.0007  max mem: 28503
Epoch: [275] Total time: 0:07:20 (0.3520 s / it)
Averaged stats: lr: 0.000073  min_lr: 0.000073  loss: 2.8434 (2.6876)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3080 (1.4297)
Test:  [ 0/25]  eta: 0:02:21  loss: 0.5965 (0.5965)  acc1: 90.4000 (90.4000)  acc5: 98.8000 (98.8000)  time: 5.6548  data: 5.4589  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.7718 (0.7395)  acc1: 87.6000 (87.0909)  acc5: 98.0000 (97.9273)  time: 0.7011  data: 0.5286  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.8956 (0.8652)  acc1: 81.6000 (83.6191)  acc5: 96.4000 (96.7048)  time: 0.1871  data: 0.0178  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9506 (0.8821)  acc1: 80.0000 (83.0400)  acc5: 96.0000 (96.6080)  time: 0.1863  data: 0.0180  max mem: 28503
Test: Total time: 0:00:10 (0.4061 s / it)
* Acc@1 83.650 Acc@5 96.692 loss 0.872
Accuracy of the model on the 50000 test images: 83.7%
Max accuracy: 83.79%
Epoch: [276]  [   0/1251]  eta: 1:04:51  lr: 0.000073  min_lr: 0.000073  loss: 3.0551 (3.0551)  weight_decay: 0.0500 (0.0500)  time: 3.1110  data: 2.3017  max mem: 28503
Epoch: [276]  [ 200/1251]  eta: 0:06:23  lr: 0.000072  min_lr: 0.000072  loss: 2.6387 (2.6369)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3651 (1.5061)  time: 0.3477  data: 0.0004  max mem: 28503
Epoch: [276]  [ 400/1251]  eta: 0:05:04  lr: 0.000071  min_lr: 0.000071  loss: 2.7300 (2.6575)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2495 (1.4608)  time: 0.3478  data: 0.0004  max mem: 28503
Epoch: [276]  [ 600/1251]  eta: 0:03:51  lr: 0.000070  min_lr: 0.000070  loss: 2.7939 (2.6634)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3348 (1.4885)  time: 0.3569  data: 0.0004  max mem: 28503
Epoch: [276]  [ 800/1251]  eta: 0:02:39  lr: 0.000069  min_lr: 0.000069  loss: 2.4504 (2.6682)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1874 (1.4509)  time: 0.3484  data: 0.0004  max mem: 28503
Epoch: [276]  [1000/1251]  eta: 0:01:28  lr: 0.000068  min_lr: 0.000068  loss: 2.7475 (2.6685)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1799 (1.4236)  time: 0.3482  data: 0.0004  max mem: 28503
Epoch: [276]  [1200/1251]  eta: 0:00:17  lr: 0.000067  min_lr: 0.000067  loss: 2.8768 (2.6778)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5443 (1.4530)  time: 0.3470  data: 0.0005  max mem: 28503
Epoch: [276]  [1250/1251]  eta: 0:00:00  lr: 0.000067  min_lr: 0.000067  loss: 2.7941 (2.6779)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1932 (1.4466)  time: 0.2944  data: 0.0007  max mem: 28503
Epoch: [276] Total time: 0:07:20 (0.3522 s / it)
Averaged stats: lr: 0.000067  min_lr: 0.000067  loss: 2.7941 (2.6845)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1932 (1.4466)
Test:  [ 0/25]  eta: 0:01:37  loss: 0.5671 (0.5671)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 3.9018  data: 3.7032  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.7530 (0.7121)  acc1: 87.2000 (86.7636)  acc5: 98.0000 (97.9273)  time: 0.6722  data: 0.4992  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.8670 (0.8346)  acc1: 82.0000 (83.6381)  acc5: 96.4000 (96.7238)  time: 0.2796  data: 0.1101  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9225 (0.8500)  acc1: 81.6000 (83.1200)  acc5: 96.0000 (96.5920)  time: 0.2370  data: 0.0680  max mem: 28503
Test: Total time: 0:00:10 (0.4105 s / it)
* Acc@1 83.724 Acc@5 96.736 loss 0.842
Accuracy of the model on the 50000 test images: 83.7%
Max accuracy: 83.79%
Epoch: [277]  [   0/1251]  eta: 1:13:10  lr: 0.000067  min_lr: 0.000067  loss: 2.1269 (2.1269)  weight_decay: 0.0500 (0.0500)  time: 3.5092  data: 1.6882  max mem: 28503
Epoch: [277]  [ 200/1251]  eta: 0:06:24  lr: 0.000066  min_lr: 0.000066  loss: 2.6278 (2.6858)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2598 (1.3963)  time: 0.3464  data: 0.0004  max mem: 28503
Epoch: [277]  [ 400/1251]  eta: 0:05:04  lr: 0.000065  min_lr: 0.000065  loss: 2.5005 (2.6916)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3913 (1.4126)  time: 0.3479  data: 0.0004  max mem: 28503
Epoch: [277]  [ 600/1251]  eta: 0:03:51  lr: 0.000064  min_lr: 0.000064  loss: 2.8433 (2.6975)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2699 (1.4204)  time: 0.3482  data: 0.0004  max mem: 28503
Epoch: [277]  [ 800/1251]  eta: 0:02:39  lr: 0.000064  min_lr: 0.000064  loss: 2.8458 (2.6929)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3298 (1.4041)  time: 0.3491  data: 0.0004  max mem: 28503
Epoch: [277]  [1000/1251]  eta: 0:01:28  lr: 0.000063  min_lr: 0.000063  loss: 2.7319 (2.6835)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4035 (1.4478)  time: 0.3561  data: 0.0004  max mem: 28503
Epoch: [277]  [1200/1251]  eta: 0:00:17  lr: 0.000062  min_lr: 0.000062  loss: 2.8931 (2.6853)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4170 (1.4553)  time: 0.3483  data: 0.0004  max mem: 28503
Epoch: [277]  [1250/1251]  eta: 0:00:00  lr: 0.000062  min_lr: 0.000062  loss: 2.7284 (2.6846)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3470 (1.4485)  time: 0.2949  data: 0.0006  max mem: 28503
Epoch: [277] Total time: 0:07:20 (0.3520 s / it)
Averaged stats: lr: 0.000062  min_lr: 0.000062  loss: 2.7284 (2.6882)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3470 (1.4485)
Test:  [ 0/25]  eta: 0:02:25  loss: 0.5970 (0.5970)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.8201  data: 5.6273  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.7833 (0.7426)  acc1: 86.8000 (86.9091)  acc5: 98.0000 (97.9636)  time: 0.6826  data: 0.5118  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.8949 (0.8657)  acc1: 82.0000 (83.7524)  acc5: 96.4000 (96.7619)  time: 0.1694  data: 0.0009  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9507 (0.8803)  acc1: 81.2000 (83.2480)  acc5: 96.0000 (96.6560)  time: 0.1714  data: 0.0030  max mem: 28503
Test: Total time: 0:00:10 (0.4001 s / it)
* Acc@1 83.734 Acc@5 96.700 loss 0.870
Accuracy of the model on the 50000 test images: 83.7%
Max accuracy: 83.79%
Epoch: [278]  [   0/1251]  eta: 1:10:04  lr: 0.000062  min_lr: 0.000062  loss: 2.8721 (2.8721)  weight_decay: 0.0500 (0.0500)  time: 3.3608  data: 2.4084  max mem: 28503
Epoch: [278]  [ 200/1251]  eta: 0:06:24  lr: 0.000061  min_lr: 0.000061  loss: 2.8022 (2.6908)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3454 (1.5334)  time: 0.3485  data: 0.0004  max mem: 28503
Epoch: [278]  [ 400/1251]  eta: 0:05:04  lr: 0.000060  min_lr: 0.000060  loss: 2.5713 (2.6909)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4317 (1.5687)  time: 0.3484  data: 0.0004  max mem: 28503
Epoch: [278]  [ 600/1251]  eta: 0:03:51  lr: 0.000059  min_lr: 0.000059  loss: 2.7489 (2.6824)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2409 (1.4830)  time: 0.3574  data: 0.0004  max mem: 28503
Epoch: [278]  [ 800/1251]  eta: 0:02:40  lr: 0.000058  min_lr: 0.000058  loss: 2.8260 (2.6777)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2483 (1.4691)  time: 0.3609  data: 0.0004  max mem: 28503
Epoch: [278]  [1000/1251]  eta: 0:01:28  lr: 0.000057  min_lr: 0.000057  loss: 2.9010 (2.6826)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1587 (1.4448)  time: 0.3513  data: 0.0004  max mem: 28503
Epoch: [278]  [1200/1251]  eta: 0:00:18  lr: 0.000056  min_lr: 0.000056  loss: 2.8012 (2.6876)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2449 (1.4386)  time: 0.3491  data: 0.0004  max mem: 28503
Epoch: [278]  [1250/1251]  eta: 0:00:00  lr: 0.000056  min_lr: 0.000056  loss: 2.7745 (2.6835)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1789 (1.4368)  time: 0.2952  data: 0.0006  max mem: 28503
Epoch: [278] Total time: 0:07:21 (0.3532 s / it)
Averaged stats: lr: 0.000056  min_lr: 0.000056  loss: 2.7745 (2.6796)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1789 (1.4368)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.5530 (0.5530)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.5691  data: 5.3666  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 0.7308 (0.6969)  acc1: 87.2000 (86.8364)  acc5: 98.0000 (97.9636)  time: 0.7361  data: 0.5629  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.8472 (0.8222)  acc1: 82.0000 (83.7905)  acc5: 96.0000 (96.7429)  time: 0.2106  data: 0.0413  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9131 (0.8369)  acc1: 81.6000 (83.2960)  acc5: 96.0000 (96.6240)  time: 0.2177  data: 0.0494  max mem: 28503
Test: Total time: 0:00:10 (0.4276 s / it)
* Acc@1 83.752 Acc@5 96.804 loss 0.826
Accuracy of the model on the 50000 test images: 83.8%
Max accuracy: 83.79%
Epoch: [279]  [   0/1251]  eta: 1:06:27  lr: 0.000056  min_lr: 0.000056  loss: 3.1525 (3.1525)  weight_decay: 0.0500 (0.0500)  time: 3.1877  data: 2.6475  max mem: 28503
Epoch: [279]  [ 200/1251]  eta: 0:06:28  lr: 0.000055  min_lr: 0.000055  loss: 2.9410 (2.7051)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2670 (1.3717)  time: 0.3484  data: 0.0004  max mem: 28503
Epoch: [279]  [ 400/1251]  eta: 0:05:05  lr: 0.000055  min_lr: 0.000055  loss: 2.8780 (2.6836)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4790 (1.3964)  time: 0.3494  data: 0.0009  max mem: 28503
Epoch: [279]  [ 600/1251]  eta: 0:03:51  lr: 0.000054  min_lr: 0.000054  loss: 2.8399 (2.6923)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4020 (1.4285)  time: 0.3490  data: 0.0004  max mem: 28503
Epoch: [279]  [ 800/1251]  eta: 0:02:40  lr: 0.000053  min_lr: 0.000053  loss: 2.7784 (2.6871)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2604 (1.4347)  time: 0.3592  data: 0.0004  max mem: 28503
Epoch: [279]  [1000/1251]  eta: 0:01:28  lr: 0.000052  min_lr: 0.000052  loss: 2.6791 (2.6772)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2630 (1.4126)  time: 0.3490  data: 0.0004  max mem: 28503
Epoch: [279]  [1200/1251]  eta: 0:00:18  lr: 0.000051  min_lr: 0.000051  loss: 2.6366 (2.6761)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2453 (1.4001)  time: 0.3492  data: 0.0004  max mem: 28503
Epoch: [279]  [1250/1251]  eta: 0:00:00  lr: 0.000051  min_lr: 0.000051  loss: 2.5777 (2.6732)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1527 (1.3909)  time: 0.2952  data: 0.0007  max mem: 28503
Epoch: [279] Total time: 0:07:22 (0.3536 s / it)
Averaged stats: lr: 0.000051  min_lr: 0.000051  loss: 2.5777 (2.6758)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1527 (1.3909)
Test:  [ 0/25]  eta: 0:02:17  loss: 0.5238 (0.5238)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.5145  data: 5.3190  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.7054 (0.6727)  acc1: 87.2000 (86.8000)  acc5: 98.0000 (98.0364)  time: 0.7250  data: 0.5526  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.8310 (0.7943)  acc1: 81.6000 (83.7333)  acc5: 96.4000 (96.8952)  time: 0.2073  data: 0.0380  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.8691 (0.8087)  acc1: 81.2000 (83.2640)  acc5: 96.4000 (96.7520)  time: 0.2064  data: 0.0380  max mem: 28503
Test: Total time: 0:00:10 (0.4174 s / it)
* Acc@1 83.808 Acc@5 96.784 loss 0.798
Accuracy of the model on the 50000 test images: 83.8%
Max accuracy: 83.81%
Epoch: [280]  [   0/1251]  eta: 1:00:52  lr: 0.000051  min_lr: 0.000051  loss: 3.3456 (3.3456)  weight_decay: 0.0500 (0.0500)  time: 2.9194  data: 2.4945  max mem: 28503
Epoch: [280]  [ 200/1251]  eta: 0:06:19  lr: 0.000050  min_lr: 0.000050  loss: 2.8530 (2.6930)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2542 (1.3939)  time: 0.3473  data: 0.0004  max mem: 28503
Epoch: [280]  [ 400/1251]  eta: 0:05:02  lr: 0.000050  min_lr: 0.000050  loss: 2.9062 (2.7016)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3657 (1.4102)  time: 0.3486  data: 0.0004  max mem: 28503
Epoch: [280]  [ 600/1251]  eta: 0:03:50  lr: 0.000049  min_lr: 0.000049  loss: 2.6433 (2.6894)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1680 (1.4030)  time: 0.3555  data: 0.0004  max mem: 28503
Epoch: [280]  [ 800/1251]  eta: 0:02:39  lr: 0.000048  min_lr: 0.000048  loss: 2.7887 (2.6888)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2829 (1.4160)  time: 0.3479  data: 0.0004  max mem: 28503
Epoch: [280]  [1000/1251]  eta: 0:01:28  lr: 0.000047  min_lr: 0.000047  loss: 2.8802 (2.6872)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3869 (1.4071)  time: 0.3464  data: 0.0004  max mem: 28503
Epoch: [280]  [1200/1251]  eta: 0:00:17  lr: 0.000046  min_lr: 0.000046  loss: 2.8808 (2.6902)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3771 (1.4101)  time: 0.3481  data: 0.0004  max mem: 28503
Epoch: [280]  [1250/1251]  eta: 0:00:00  lr: 0.000046  min_lr: 0.000046  loss: 2.7595 (2.6874)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2372 (1.4049)  time: 0.2945  data: 0.0006  max mem: 28503
Epoch: [280] Total time: 0:07:19 (0.3510 s / it)
Averaged stats: lr: 0.000046  min_lr: 0.000046  loss: 2.7595 (2.6765)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2372 (1.4049)
Test:  [ 0/25]  eta: 0:02:12  loss: 0.5670 (0.5670)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.3023  data: 5.1048  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.7514 (0.7155)  acc1: 86.8000 (86.7273)  acc5: 98.4000 (98.1818)  time: 0.6758  data: 0.5044  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.8812 (0.8377)  acc1: 81.6000 (83.7143)  acc5: 96.4000 (96.9905)  time: 0.1991  data: 0.0305  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9274 (0.8541)  acc1: 81.6000 (83.0880)  acc5: 96.4000 (96.8480)  time: 0.2046  data: 0.0362  max mem: 28503
Test: Total time: 0:00:10 (0.4077 s / it)
* Acc@1 83.766 Acc@5 96.772 loss 0.844
Accuracy of the model on the 50000 test images: 83.8%
Max accuracy: 83.81%
Epoch: [281]  [   0/1251]  eta: 1:11:42  lr: 0.000046  min_lr: 0.000046  loss: 2.9443 (2.9443)  weight_decay: 0.0500 (0.0500)  time: 3.4391  data: 3.0003  max mem: 28503
Epoch: [281]  [ 200/1251]  eta: 0:06:24  lr: 0.000046  min_lr: 0.000046  loss: 2.7665 (2.6617)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3709 (nan)  time: 0.3515  data: 0.0004  max mem: 28503
Epoch: [281]  [ 400/1251]  eta: 0:05:05  lr: 0.000045  min_lr: 0.000045  loss: 2.6408 (2.6549)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4210 (nan)  time: 0.3510  data: 0.0004  max mem: 28503
Epoch: [281]  [ 600/1251]  eta: 0:03:52  lr: 0.000044  min_lr: 0.000044  loss: 2.8009 (2.6654)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2071 (nan)  time: 0.3488  data: 0.0004  max mem: 28503
Epoch: [281]  [ 800/1251]  eta: 0:02:40  lr: 0.000043  min_lr: 0.000043  loss: 2.6586 (2.6635)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2289 (nan)  time: 0.3523  data: 0.0004  max mem: 28503
Epoch: [281]  [1000/1251]  eta: 0:01:28  lr: 0.000043  min_lr: 0.000043  loss: 2.7820 (2.6704)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2852 (nan)  time: 0.3467  data: 0.0004  max mem: 28503
Epoch: [281]  [1200/1251]  eta: 0:00:18  lr: 0.000042  min_lr: 0.000042  loss: 2.8033 (2.6726)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3144 (nan)  time: 0.3497  data: 0.0004  max mem: 28503
Epoch: [281]  [1250/1251]  eta: 0:00:00  lr: 0.000042  min_lr: 0.000042  loss: 2.6395 (2.6714)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3507 (nan)  time: 0.2929  data: 0.0007  max mem: 28503
Epoch: [281] Total time: 0:07:21 (0.3526 s / it)
Averaged stats: lr: 0.000042  min_lr: 0.000042  loss: 2.6395 (2.6662)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3507 (nan)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.5537 (0.5537)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.4768  data: 5.2775  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.7341 (0.6958)  acc1: 87.2000 (86.6546)  acc5: 98.4000 (98.0727)  time: 0.6954  data: 0.5223  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.8480 (0.8195)  acc1: 81.6000 (83.5619)  acc5: 96.4000 (96.9524)  time: 0.1992  data: 0.0234  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9018 (0.8348)  acc1: 81.2000 (83.0880)  acc5: 96.4000 (96.7360)  time: 0.1992  data: 0.0233  max mem: 28503
Test: Total time: 0:00:10 (0.4089 s / it)
* Acc@1 83.828 Acc@5 96.752 loss 0.824
Accuracy of the model on the 50000 test images: 83.8%
Max accuracy: 83.83%
Epoch: [282]  [   0/1251]  eta: 1:00:26  lr: 0.000042  min_lr: 0.000042  loss: 2.6151 (2.6151)  weight_decay: 0.0500 (0.0500)  time: 2.8993  data: 2.4925  max mem: 28503
Epoch: [282]  [ 200/1251]  eta: 0:06:21  lr: 0.000041  min_lr: 0.000041  loss: 2.7890 (2.6230)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3310 (1.4254)  time: 0.3482  data: 0.0004  max mem: 28503
Epoch: [282]  [ 400/1251]  eta: 0:05:03  lr: 0.000040  min_lr: 0.000040  loss: 2.8499 (2.6626)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1998 (1.4149)  time: 0.3481  data: 0.0004  max mem: 28503
Epoch: [282]  [ 600/1251]  eta: 0:03:51  lr: 0.000040  min_lr: 0.000040  loss: 2.7110 (2.6682)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2353 (1.4256)  time: 0.3506  data: 0.0004  max mem: 28503
Epoch: [282]  [ 800/1251]  eta: 0:02:39  lr: 0.000039  min_lr: 0.000039  loss: 2.4929 (2.6669)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3336 (1.4536)  time: 0.3502  data: 0.0004  max mem: 28503
Epoch: [282]  [1000/1251]  eta: 0:01:28  lr: 0.000038  min_lr: 0.000038  loss: 2.5578 (2.6687)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3306 (1.4322)  time: 0.3474  data: 0.0004  max mem: 28503
Epoch: [282]  [1200/1251]  eta: 0:00:17  lr: 0.000037  min_lr: 0.000037  loss: 2.8363 (2.6689)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1681 (1.4202)  time: 0.3468  data: 0.0004  max mem: 28503
Epoch: [282]  [1250/1251]  eta: 0:00:00  lr: 0.000037  min_lr: 0.000037  loss: 2.7252 (2.6671)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2641 (1.4181)  time: 0.3028  data: 0.0007  max mem: 28503
Epoch: [282] Total time: 0:07:20 (0.3523 s / it)
Averaged stats: lr: 0.000037  min_lr: 0.000037  loss: 2.7252 (2.6739)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2641 (1.4181)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.6086 (0.6086)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.5941  data: 5.3962  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.7991 (0.7541)  acc1: 86.8000 (86.9091)  acc5: 98.4000 (98.1818)  time: 0.7178  data: 0.5465  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9099 (0.8806)  acc1: 81.2000 (83.7143)  acc5: 96.4000 (96.9143)  time: 0.1992  data: 0.0308  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9637 (0.8954)  acc1: 80.8000 (83.2800)  acc5: 96.4000 (96.7360)  time: 0.1990  data: 0.0307  max mem: 28503
Test: Total time: 0:00:10 (0.4135 s / it)
* Acc@1 83.828 Acc@5 96.780 loss 0.885
Accuracy of the model on the 50000 test images: 83.8%
Max accuracy: 83.83%
Epoch: [283]  [   0/1251]  eta: 1:11:30  lr: 0.000037  min_lr: 0.000037  loss: 2.3283 (2.3283)  weight_decay: 0.0500 (0.0500)  time: 3.4300  data: 2.2798  max mem: 28503
Epoch: [283]  [ 200/1251]  eta: 0:06:22  lr: 0.000037  min_lr: 0.000037  loss: 2.8433 (2.6667)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2426 (1.4195)  time: 0.3477  data: 0.0004  max mem: 28503
Epoch: [283]  [ 400/1251]  eta: 0:05:04  lr: 0.000036  min_lr: 0.000036  loss: 2.6874 (2.6661)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4716 (1.4703)  time: 0.3477  data: 0.0004  max mem: 28503
Epoch: [283]  [ 600/1251]  eta: 0:03:51  lr: 0.000035  min_lr: 0.000035  loss: 2.8851 (2.6748)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2091 (1.4487)  time: 0.3488  data: 0.0004  max mem: 28503
Epoch: [283]  [ 800/1251]  eta: 0:02:39  lr: 0.000035  min_lr: 0.000035  loss: 2.7659 (2.6813)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2420 (1.4706)  time: 0.3453  data: 0.0004  max mem: 28503
Epoch: [283]  [1000/1251]  eta: 0:01:28  lr: 0.000034  min_lr: 0.000034  loss: 2.8514 (2.6690)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2910 (1.4731)  time: 0.3466  data: 0.0004  max mem: 28503
Epoch: [283]  [1200/1251]  eta: 0:00:17  lr: 0.000033  min_lr: 0.000033  loss: 2.7266 (2.6764)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3497 (1.4544)  time: 0.3476  data: 0.0004  max mem: 28503
Epoch: [283]  [1250/1251]  eta: 0:00:00  lr: 0.000033  min_lr: 0.000033  loss: 2.7275 (2.6772)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3229 (1.4518)  time: 0.2943  data: 0.0007  max mem: 28503
Epoch: [283] Total time: 0:07:20 (0.3520 s / it)
Averaged stats: lr: 0.000033  min_lr: 0.000033  loss: 2.7275 (2.6730)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3229 (1.4518)
Test:  [ 0/25]  eta: 0:01:53  loss: 0.6034 (0.6034)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 4.5549  data: 4.3172  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.7920 (0.7547)  acc1: 86.8000 (86.8000)  acc5: 98.0000 (98.0727)  time: 0.6849  data: 0.5100  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9070 (0.8799)  acc1: 81.6000 (83.6381)  acc5: 96.4000 (96.8191)  time: 0.2332  data: 0.0647  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9635 (0.8957)  acc1: 81.6000 (83.0880)  acc5: 96.0000 (96.6400)  time: 0.1964  data: 0.0281  max mem: 28503
Test: Total time: 0:00:09 (0.3991 s / it)
* Acc@1 83.798 Acc@5 96.748 loss 0.886
Accuracy of the model on the 50000 test images: 83.8%
Max accuracy: 83.83%
Epoch: [284]  [   0/1251]  eta: 1:08:10  lr: 0.000033  min_lr: 0.000033  loss: 3.1508 (3.1508)  weight_decay: 0.0500 (0.0500)  time: 3.2696  data: 2.6791  max mem: 28503
Epoch: [284]  [ 200/1251]  eta: 0:06:25  lr: 0.000032  min_lr: 0.000032  loss: 2.7892 (2.6552)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4965 (1.4777)  time: 0.3566  data: 0.0005  max mem: 28503
Epoch: [284]  [ 400/1251]  eta: 0:05:05  lr: 0.000032  min_lr: 0.000032  loss: 2.8442 (2.6660)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1306 (1.3730)  time: 0.3590  data: 0.0003  max mem: 28503
Epoch: [284]  [ 600/1251]  eta: 0:03:51  lr: 0.000031  min_lr: 0.000031  loss: 2.6932 (2.6519)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2381 (1.3512)  time: 0.3477  data: 0.0004  max mem: 28503
Epoch: [284]  [ 800/1251]  eta: 0:02:39  lr: 0.000031  min_lr: 0.000031  loss: 2.6934 (2.6558)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2926 (1.3405)  time: 0.3477  data: 0.0004  max mem: 28503
Epoch: [284]  [1000/1251]  eta: 0:01:28  lr: 0.000030  min_lr: 0.000030  loss: 2.7832 (2.6620)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1501 (1.3385)  time: 0.3633  data: 0.0004  max mem: 28503
Epoch: [284]  [1200/1251]  eta: 0:00:18  lr: 0.000029  min_lr: 0.000029  loss: 2.6673 (2.6581)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2760 (1.3915)  time: 0.3482  data: 0.0004  max mem: 28503
Epoch: [284]  [1250/1251]  eta: 0:00:00  lr: 0.000029  min_lr: 0.000029  loss: 2.7943 (2.6587)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3524 (1.3981)  time: 0.2950  data: 0.0005  max mem: 28503
Epoch: [284] Total time: 0:07:21 (0.3525 s / it)
Averaged stats: lr: 0.000029  min_lr: 0.000029  loss: 2.7943 (2.6626)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3524 (1.3981)
Test:  [ 0/25]  eta: 0:02:16  loss: 0.5572 (0.5572)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.4486  data: 5.2466  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 0.7345 (0.7013)  acc1: 87.2000 (86.8727)  acc5: 98.0000 (98.0727)  time: 0.6504  data: 0.4773  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.8597 (0.8248)  acc1: 82.0000 (83.8286)  acc5: 96.4000 (96.9333)  time: 0.1716  data: 0.0003  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9067 (0.8401)  acc1: 81.2000 (83.2640)  acc5: 96.4000 (96.7840)  time: 0.1933  data: 0.0225  max mem: 28503
Test: Total time: 0:00:10 (0.4040 s / it)
* Acc@1 83.804 Acc@5 96.734 loss 0.831
Accuracy of the model on the 50000 test images: 83.8%
Max accuracy: 83.83%
Epoch: [285]  [   0/1251]  eta: 1:09:54  lr: 0.000029  min_lr: 0.000029  loss: 2.8069 (2.8069)  weight_decay: 0.0500 (0.0500)  time: 3.3531  data: 2.3870  max mem: 28503
Epoch: [285]  [ 200/1251]  eta: 0:06:22  lr: 0.000029  min_lr: 0.000029  loss: 2.7920 (2.7095)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2237 (1.3305)  time: 0.3477  data: 0.0004  max mem: 28503
Epoch: [285]  [ 400/1251]  eta: 0:05:04  lr: 0.000028  min_lr: 0.000028  loss: 2.4219 (2.6903)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4005 (1.3897)  time: 0.3509  data: 0.0005  max mem: 28503
Epoch: [285]  [ 600/1251]  eta: 0:03:51  lr: 0.000027  min_lr: 0.000027  loss: 2.5685 (2.6922)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3914 (1.4002)  time: 0.3516  data: 0.0004  max mem: 28503
Epoch: [285]  [ 800/1251]  eta: 0:02:39  lr: 0.000027  min_lr: 0.000027  loss: 2.7950 (2.6771)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2188 (1.4064)  time: 0.3489  data: 0.0004  max mem: 28503
Epoch: [285]  [1000/1251]  eta: 0:01:28  lr: 0.000026  min_lr: 0.000026  loss: 2.7200 (2.6734)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2049 (1.4057)  time: 0.3482  data: 0.0004  max mem: 28503
Epoch: [285]  [1200/1251]  eta: 0:00:18  lr: 0.000026  min_lr: 0.000026  loss: 2.8957 (2.6741)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2459 (1.4094)  time: 0.3601  data: 0.0004  max mem: 28503
Epoch: [285]  [1250/1251]  eta: 0:00:00  lr: 0.000026  min_lr: 0.000026  loss: 2.7646 (2.6756)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2381 (1.4073)  time: 0.2947  data: 0.0006  max mem: 28503
Epoch: [285] Total time: 0:07:21 (0.3530 s / it)
Averaged stats: lr: 0.000026  min_lr: 0.000026  loss: 2.7646 (2.6632)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2381 (1.4073)
Test:  [ 0/25]  eta: 0:01:28  loss: 0.5750 (0.5750)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 3.5476  data: 3.2983  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 0.7559 (0.7160)  acc1: 86.8000 (86.8727)  acc5: 98.0000 (98.0727)  time: 0.6588  data: 0.4828  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.8785 (0.8400)  acc1: 81.6000 (83.7143)  acc5: 96.8000 (96.8191)  time: 0.2985  data: 0.1300  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9168 (0.8554)  acc1: 81.2000 (83.1360)  acc5: 96.4000 (96.6400)  time: 0.2155  data: 0.0471  max mem: 28503
Test: Total time: 0:00:10 (0.4110 s / it)
* Acc@1 83.784 Acc@5 96.752 loss 0.846
Accuracy of the model on the 50000 test images: 83.8%
Max accuracy: 83.83%
Epoch: [286]  [   0/1251]  eta: 1:08:06  lr: 0.000026  min_lr: 0.000026  loss: 3.0314 (3.0314)  weight_decay: 0.0500 (0.0500)  time: 3.2664  data: 1.6185  max mem: 28503
Epoch: [286]  [ 200/1251]  eta: 0:06:25  lr: 0.000025  min_lr: 0.000025  loss: 2.3629 (2.6180)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2485 (1.4673)  time: 0.3479  data: 0.0004  max mem: 28503
Epoch: [286]  [ 400/1251]  eta: 0:05:04  lr: 0.000025  min_lr: 0.000025  loss: 2.9719 (2.6581)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2008 (1.4564)  time: 0.3469  data: 0.0004  max mem: 28503
Epoch: [286]  [ 600/1251]  eta: 0:03:51  lr: 0.000024  min_lr: 0.000024  loss: 2.9022 (2.6628)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4695 (1.4485)  time: 0.3509  data: 0.0004  max mem: 28503
Epoch: [286]  [ 800/1251]  eta: 0:02:39  lr: 0.000023  min_lr: 0.000023  loss: 2.6882 (2.6653)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3178 (1.4662)  time: 0.3469  data: 0.0004  max mem: 28503
Epoch: [286]  [1000/1251]  eta: 0:01:28  lr: 0.000023  min_lr: 0.000023  loss: 2.7806 (2.6681)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2601 (1.4437)  time: 0.3471  data: 0.0005  max mem: 28503
Epoch: [286]  [1200/1251]  eta: 0:00:17  lr: 0.000022  min_lr: 0.000022  loss: 2.6826 (2.6689)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2274 (1.4371)  time: 0.3522  data: 0.0005  max mem: 28503
Epoch: [286]  [1250/1251]  eta: 0:00:00  lr: 0.000022  min_lr: 0.000022  loss: 2.6297 (2.6681)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3340 (1.4364)  time: 0.2941  data: 0.0007  max mem: 28503
Epoch: [286] Total time: 0:07:20 (0.3519 s / it)
Averaged stats: lr: 0.000022  min_lr: 0.000022  loss: 2.6297 (2.6628)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3340 (1.4364)
Test:  [ 0/25]  eta: 0:02:23  loss: 0.5722 (0.5722)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.7511  data: 5.5493  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.7591 (0.7160)  acc1: 87.6000 (86.8727)  acc5: 98.0000 (97.9636)  time: 0.6771  data: 0.5051  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.8720 (0.8424)  acc1: 82.0000 (83.7143)  acc5: 96.4000 (96.7238)  time: 0.1894  data: 0.0207  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9221 (0.8583)  acc1: 81.6000 (83.1840)  acc5: 96.0000 (96.5440)  time: 0.1974  data: 0.0290  max mem: 28503
Test: Total time: 0:00:10 (0.4184 s / it)
* Acc@1 83.816 Acc@5 96.740 loss 0.848
Accuracy of the model on the 50000 test images: 83.8%
Max accuracy: 83.83%
Epoch: [287]  [   0/1251]  eta: 1:14:53  lr: 0.000022  min_lr: 0.000022  loss: 3.0524 (3.0524)  weight_decay: 0.0500 (0.0500)  time: 3.5918  data: 2.1986  max mem: 28503
Epoch: [287]  [ 200/1251]  eta: 0:06:25  lr: 0.000022  min_lr: 0.000022  loss: 2.6570 (2.6834)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2456 (1.4103)  time: 0.3593  data: 0.0004  max mem: 28503
Epoch: [287]  [ 400/1251]  eta: 0:05:05  lr: 0.000021  min_lr: 0.000021  loss: 2.8791 (2.6980)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1810 (1.4366)  time: 0.3493  data: 0.0004  max mem: 28503
Epoch: [287]  [ 600/1251]  eta: 0:03:51  lr: 0.000021  min_lr: 0.000021  loss: 2.7984 (2.7085)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3294 (1.5115)  time: 0.3493  data: 0.0005  max mem: 28503
Epoch: [287]  [ 800/1251]  eta: 0:02:39  lr: 0.000020  min_lr: 0.000020  loss: 2.6739 (2.6943)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2787 (1.5181)  time: 0.3506  data: 0.0005  max mem: 28503
Epoch: [287]  [1000/1251]  eta: 0:01:28  lr: 0.000020  min_lr: 0.000020  loss: 2.5986 (2.6906)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3516 (1.4930)  time: 0.3480  data: 0.0004  max mem: 28503
Epoch: [287]  [1200/1251]  eta: 0:00:18  lr: 0.000019  min_lr: 0.000019  loss: 2.8144 (2.6878)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2423 (1.4708)  time: 0.3540  data: 0.0004  max mem: 28503
Epoch: [287]  [1250/1251]  eta: 0:00:00  lr: 0.000019  min_lr: 0.000019  loss: 2.9069 (2.6907)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2152 (1.4693)  time: 0.2945  data: 0.0005  max mem: 28503
Epoch: [287] Total time: 0:07:21 (0.3530 s / it)
Averaged stats: lr: 0.000019  min_lr: 0.000019  loss: 2.9069 (2.6598)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2152 (1.4693)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.7039 (0.7039)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.6318  data: 5.4202  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8808 (0.8507)  acc1: 86.8000 (86.8364)  acc5: 98.4000 (98.0364)  time: 0.7065  data: 0.5326  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 1.0051 (0.9781)  acc1: 81.6000 (83.6381)  acc5: 96.8000 (96.7619)  time: 0.1930  data: 0.0220  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0563 (0.9936)  acc1: 81.2000 (83.1200)  acc5: 96.4000 (96.5920)  time: 0.1933  data: 0.0219  max mem: 28503
Test: Total time: 0:00:10 (0.4111 s / it)
* Acc@1 83.696 Acc@5 96.670 loss 0.985
Accuracy of the model on the 50000 test images: 83.7%
Max accuracy: 83.83%
Epoch: [288]  [   0/1251]  eta: 1:10:33  lr: 0.000019  min_lr: 0.000019  loss: 2.7582 (2.7582)  weight_decay: 0.0500 (0.0500)  time: 3.3838  data: 2.4810  max mem: 28503
Epoch: [288]  [ 200/1251]  eta: 0:06:24  lr: 0.000019  min_lr: 0.000019  loss: 2.7247 (2.6791)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2000 (1.3147)  time: 0.3489  data: 0.0004  max mem: 28503
Epoch: [288]  [ 400/1251]  eta: 0:05:04  lr: 0.000018  min_lr: 0.000018  loss: 2.7832 (2.7118)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2424 (1.3080)  time: 0.3577  data: 0.0004  max mem: 28503
Epoch: [288]  [ 600/1251]  eta: 0:03:51  lr: 0.000018  min_lr: 0.000018  loss: 2.7905 (2.6834)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1690 (1.3152)  time: 0.3464  data: 0.0004  max mem: 28503
Epoch: [288]  [ 800/1251]  eta: 0:02:39  lr: 0.000017  min_lr: 0.000017  loss: 2.7503 (2.6745)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1592 (1.3085)  time: 0.3451  data: 0.0004  max mem: 28503
Epoch: [288]  [1000/1251]  eta: 0:01:28  lr: 0.000017  min_lr: 0.000017  loss: 2.7653 (2.6721)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2858 (1.3363)  time: 0.3463  data: 0.0004  max mem: 28503
Epoch: [288]  [1200/1251]  eta: 0:00:17  lr: 0.000016  min_lr: 0.000016  loss: 2.7648 (2.6715)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2155 (nan)  time: 0.3466  data: 0.0004  max mem: 28503
Epoch: [288]  [1250/1251]  eta: 0:00:00  lr: 0.000016  min_lr: 0.000016  loss: 2.7508 (2.6751)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2366 (nan)  time: 0.2969  data: 0.0006  max mem: 28503
Epoch: [288] Total time: 0:07:19 (0.3510 s / it)
Averaged stats: lr: 0.000016  min_lr: 0.000016  loss: 2.7508 (2.6624)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2366 (nan)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6000 (0.6000)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.6336  data: 5.4401  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 0.7866 (0.7456)  acc1: 86.8000 (86.8727)  acc5: 98.0000 (98.0727)  time: 0.7487  data: 0.5674  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.8941 (0.8695)  acc1: 81.6000 (83.6952)  acc5: 96.8000 (96.9333)  time: 0.2172  data: 0.0431  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9407 (0.8856)  acc1: 81.2000 (83.2160)  acc5: 96.4000 (96.7360)  time: 0.2139  data: 0.0430  max mem: 28503
Test: Total time: 0:00:10 (0.4290 s / it)
* Acc@1 83.804 Acc@5 96.798 loss 0.876
Accuracy of the model on the 50000 test images: 83.8%
Max accuracy: 83.83%
Epoch: [289]  [   0/1251]  eta: 1:06:44  lr: 0.000016  min_lr: 0.000016  loss: 2.9817 (2.9817)  weight_decay: 0.0500 (0.0500)  time: 3.2012  data: 2.1336  max mem: 28503
Epoch: [289]  [ 200/1251]  eta: 0:06:24  lr: 0.000016  min_lr: 0.000016  loss: 2.9306 (2.6972)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1977 (1.4263)  time: 0.3462  data: 0.0004  max mem: 28503
Epoch: [289]  [ 400/1251]  eta: 0:05:04  lr: 0.000015  min_lr: 0.000015  loss: 2.8942 (2.6890)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1236 (1.3827)  time: 0.3489  data: 0.0004  max mem: 28503
Epoch: [289]  [ 600/1251]  eta: 0:03:51  lr: 0.000015  min_lr: 0.000015  loss: 2.6236 (2.6849)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2960 (1.4086)  time: 0.3478  data: 0.0004  max mem: 28503
Epoch: [289]  [ 800/1251]  eta: 0:02:39  lr: 0.000014  min_lr: 0.000014  loss: 2.7000 (2.6706)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3105 (1.4463)  time: 0.3493  data: 0.0004  max mem: 28503
Epoch: [289]  [1000/1251]  eta: 0:01:28  lr: 0.000014  min_lr: 0.000014  loss: 2.7379 (2.6627)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2057 (1.4176)  time: 0.3665  data: 0.0004  max mem: 28503
Epoch: [289]  [1200/1251]  eta: 0:00:18  lr: 0.000014  min_lr: 0.000014  loss: 2.6582 (2.6614)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1180 (1.4061)  time: 0.3481  data: 0.0004  max mem: 28503
Epoch: [289]  [1250/1251]  eta: 0:00:00  lr: 0.000014  min_lr: 0.000014  loss: 2.8130 (2.6613)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1180 (1.4051)  time: 0.2950  data: 0.0007  max mem: 28503
Epoch: [289] Total time: 0:07:21 (0.3530 s / it)
Averaged stats: lr: 0.000014  min_lr: 0.000014  loss: 2.8130 (2.6607)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1180 (1.4051)
Test:  [ 0/25]  eta: 0:02:13  loss: 0.5455 (0.5455)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.3488  data: 5.1457  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 0.7211 (0.6856)  acc1: 86.8000 (86.8727)  acc5: 98.4000 (98.1091)  time: 0.7442  data: 0.5711  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.8373 (0.8081)  acc1: 81.6000 (83.6191)  acc5: 96.8000 (96.8762)  time: 0.2267  data: 0.0569  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.8881 (0.8242)  acc1: 81.2000 (83.1040)  acc5: 96.4000 (96.7200)  time: 0.2261  data: 0.0568  max mem: 28503
Test: Total time: 0:00:10 (0.4259 s / it)
* Acc@1 83.850 Acc@5 96.796 loss 0.813
Accuracy of the model on the 50000 test images: 83.9%
Max accuracy: 83.85%
Epoch: [290]  [   0/1251]  eta: 1:08:33  lr: 0.000014  min_lr: 0.000014  loss: 3.0529 (3.0529)  weight_decay: 0.0500 (0.0500)  time: 3.2879  data: 2.9137  max mem: 28503
Epoch: [290]  [ 200/1251]  eta: 0:06:24  lr: 0.000013  min_lr: 0.000013  loss: 2.6548 (2.6411)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2102 (1.2816)  time: 0.3569  data: 0.0004  max mem: 28503
Epoch: [290]  [ 400/1251]  eta: 0:05:04  lr: 0.000013  min_lr: 0.000013  loss: 2.7151 (2.6428)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5260 (1.3958)  time: 0.3477  data: 0.0004  max mem: 28503
Epoch: [290]  [ 600/1251]  eta: 0:03:51  lr: 0.000012  min_lr: 0.000012  loss: 2.7793 (2.6637)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2337 (1.3796)  time: 0.3475  data: 0.0004  max mem: 28503
Epoch: [290]  [ 800/1251]  eta: 0:02:39  lr: 0.000012  min_lr: 0.000012  loss: 2.7810 (2.6641)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3074 (1.3741)  time: 0.3477  data: 0.0004  max mem: 28503
Epoch: [290]  [1000/1251]  eta: 0:01:28  lr: 0.000012  min_lr: 0.000012  loss: 2.8452 (2.6649)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1417 (1.3676)  time: 0.3573  data: 0.0004  max mem: 28503
Epoch: [290]  [1200/1251]  eta: 0:00:17  lr: 0.000011  min_lr: 0.000011  loss: 2.8417 (2.6625)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2568 (1.3845)  time: 0.3499  data: 0.0004  max mem: 28503
Epoch: [290]  [1250/1251]  eta: 0:00:00  lr: 0.000011  min_lr: 0.000011  loss: 2.8385 (2.6619)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3529 (1.3914)  time: 0.2944  data: 0.0006  max mem: 28503
Epoch: [290] Total time: 0:07:20 (0.3525 s / it)
Averaged stats: lr: 0.000011  min_lr: 0.000011  loss: 2.8385 (2.6623)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3529 (1.3914)
Test:  [ 0/25]  eta: 0:02:14  loss: 0.6006 (0.6006)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.3627  data: 5.1528  max mem: 28503
Test:  [10/25]  eta: 0:00:09  loss: 0.7975 (0.7512)  acc1: 86.8000 (86.9455)  acc5: 98.0000 (98.0364)  time: 0.6411  data: 0.4687  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9071 (0.8754)  acc1: 82.0000 (83.7143)  acc5: 96.8000 (96.8762)  time: 0.1722  data: 0.0037  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9549 (0.8905)  acc1: 81.2000 (83.2000)  acc5: 96.4000 (96.6720)  time: 0.1721  data: 0.0037  max mem: 28503
Test: Total time: 0:00:09 (0.3824 s / it)
* Acc@1 83.782 Acc@5 96.756 loss 0.881
Accuracy of the model on the 50000 test images: 83.8%
Max accuracy: 83.85%
Epoch: [291]  [   0/1251]  eta: 1:09:11  lr: 0.000011  min_lr: 0.000011  loss: 2.8698 (2.8698)  weight_decay: 0.0500 (0.0500)  time: 3.3186  data: 2.2391  max mem: 28503
Epoch: [291]  [ 200/1251]  eta: 0:06:24  lr: 0.000011  min_lr: 0.000011  loss: 2.7134 (2.6837)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2286 (1.6478)  time: 0.3483  data: 0.0004  max mem: 28503
Epoch: [291]  [ 400/1251]  eta: 0:05:05  lr: 0.000010  min_lr: 0.000010  loss: 2.7135 (2.6646)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2602 (1.5353)  time: 0.3485  data: 0.0004  max mem: 28503
Epoch: [291]  [ 600/1251]  eta: 0:03:51  lr: 0.000010  min_lr: 0.000010  loss: 2.9029 (2.6761)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2245 (1.4389)  time: 0.3509  data: 0.0004  max mem: 28503
Epoch: [291]  [ 800/1251]  eta: 0:02:40  lr: 0.000010  min_lr: 0.000010  loss: 2.8014 (2.6601)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3130 (1.4334)  time: 0.3574  data: 0.0004  max mem: 28503
Epoch: [291]  [1000/1251]  eta: 0:01:28  lr: 0.000009  min_lr: 0.000009  loss: 2.7765 (2.6595)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2451 (1.4351)  time: 0.3490  data: 0.0004  max mem: 28503
Epoch: [291]  [1200/1251]  eta: 0:00:18  lr: 0.000009  min_lr: 0.000009  loss: 2.7415 (2.6692)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2849 (1.4266)  time: 0.3484  data: 0.0004  max mem: 28503
Epoch: [291]  [1250/1251]  eta: 0:00:00  lr: 0.000009  min_lr: 0.000009  loss: 2.8261 (2.6685)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2733 (1.4235)  time: 0.2946  data: 0.0005  max mem: 28503
Epoch: [291] Total time: 0:07:22 (0.3536 s / it)
Averaged stats: lr: 0.000009  min_lr: 0.000009  loss: 2.8261 (2.6601)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2733 (1.4235)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6124 (0.6124)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.5262  data: 5.3308  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 0.8071 (0.7605)  acc1: 86.8000 (86.7636)  acc5: 98.4000 (98.0727)  time: 0.7437  data: 0.5722  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9114 (0.8852)  acc1: 81.6000 (83.5048)  acc5: 96.8000 (96.7619)  time: 0.2186  data: 0.0482  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9635 (0.9001)  acc1: 81.2000 (83.0080)  acc5: 96.4000 (96.5760)  time: 0.2182  data: 0.0482  max mem: 28503
Test: Total time: 0:00:10 (0.4261 s / it)
* Acc@1 83.796 Acc@5 96.744 loss 0.890
Accuracy of the model on the 50000 test images: 83.8%
Max accuracy: 83.85%
Epoch: [292]  [   0/1251]  eta: 1:14:17  lr: 0.000009  min_lr: 0.000009  loss: 2.9291 (2.9291)  weight_decay: 0.0500 (0.0500)  time: 3.5635  data: 2.4727  max mem: 28503
Epoch: [292]  [ 200/1251]  eta: 0:06:27  lr: 0.000009  min_lr: 0.000009  loss: 2.5620 (2.6104)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2496 (1.3598)  time: 0.3594  data: 0.0004  max mem: 28503
Epoch: [292]  [ 400/1251]  eta: 0:05:05  lr: 0.000008  min_lr: 0.000008  loss: 2.8556 (2.6476)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2218 (1.3756)  time: 0.3485  data: 0.0004  max mem: 28503
Epoch: [292]  [ 600/1251]  eta: 0:03:52  lr: 0.000008  min_lr: 0.000008  loss: 2.5171 (2.6666)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3174 (1.3976)  time: 0.3488  data: 0.0004  max mem: 28503
Epoch: [292]  [ 800/1251]  eta: 0:02:40  lr: 0.000008  min_lr: 0.000008  loss: 2.7508 (2.6786)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2755 (1.3873)  time: 0.3484  data: 0.0004  max mem: 28503
Epoch: [292]  [1000/1251]  eta: 0:01:28  lr: 0.000008  min_lr: 0.000008  loss: 2.8483 (2.6795)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2509 (1.3753)  time: 0.3578  data: 0.0004  max mem: 28503
Epoch: [292]  [1200/1251]  eta: 0:00:18  lr: 0.000007  min_lr: 0.000007  loss: 2.5500 (2.6864)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4396 (1.3904)  time: 0.3482  data: 0.0004  max mem: 28503
Epoch: [292]  [1250/1251]  eta: 0:00:00  lr: 0.000007  min_lr: 0.000007  loss: 2.9320 (2.6880)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1647 (1.3827)  time: 0.2950  data: 0.0007  max mem: 28503
Epoch: [292] Total time: 0:07:22 (0.3538 s / it)
Averaged stats: lr: 0.000007  min_lr: 0.000007  loss: 2.9320 (2.6570)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1647 (1.3827)
Test:  [ 0/25]  eta: 0:02:11  loss: 0.6851 (0.6851)  acc1: 89.6000 (89.6000)  acc5: 99.2000 (99.2000)  time: 5.2792  data: 5.0785  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8641 (0.8310)  acc1: 87.2000 (86.8000)  acc5: 98.0000 (97.9636)  time: 0.6728  data: 0.4984  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9900 (0.9573)  acc1: 82.0000 (83.5619)  acc5: 96.4000 (96.7810)  time: 0.2084  data: 0.0384  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 1.0333 (0.9729)  acc1: 80.8000 (83.0560)  acc5: 96.4000 (96.6560)  time: 0.1968  data: 0.0266  max mem: 28503
Test: Total time: 0:00:10 (0.4107 s / it)
* Acc@1 83.736 Acc@5 96.684 loss 0.963
Accuracy of the model on the 50000 test images: 83.7%
Max accuracy: 83.85%
Epoch: [293]  [   0/1251]  eta: 1:11:43  lr: 0.000007  min_lr: 0.000007  loss: 2.7695 (2.7695)  weight_decay: 0.0500 (0.0500)  time: 3.4398  data: 2.6137  max mem: 28503
Epoch: [293]  [ 200/1251]  eta: 0:06:26  lr: 0.000007  min_lr: 0.000007  loss: 2.8100 (2.7356)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.4184 (1.4819)  time: 0.3483  data: 0.0004  max mem: 28503
Epoch: [293]  [ 400/1251]  eta: 0:05:05  lr: 0.000007  min_lr: 0.000007  loss: 2.4553 (2.6631)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2734 (1.4630)  time: 0.3502  data: 0.0004  max mem: 28503
Epoch: [293]  [ 600/1251]  eta: 0:03:52  lr: 0.000006  min_lr: 0.000006  loss: 2.7305 (2.6538)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3470 (1.4656)  time: 0.3497  data: 0.0004  max mem: 28503
Epoch: [293]  [ 800/1251]  eta: 0:02:40  lr: 0.000006  min_lr: 0.000006  loss: 2.5145 (2.6486)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2250 (1.4260)  time: 0.3561  data: 0.0004  max mem: 28503
Epoch: [293]  [1000/1251]  eta: 0:01:28  lr: 0.000006  min_lr: 0.000006  loss: 2.5879 (2.6462)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2923 (1.4130)  time: 0.3508  data: 0.0004  max mem: 28503
Epoch: [293]  [1200/1251]  eta: 0:00:18  lr: 0.000006  min_lr: 0.000006  loss: 2.8112 (2.6482)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1413 (1.3938)  time: 0.3491  data: 0.0004  max mem: 28503
Epoch: [293]  [1250/1251]  eta: 0:00:00  lr: 0.000006  min_lr: 0.000006  loss: 2.8713 (2.6519)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2046 (1.3886)  time: 0.2960  data: 0.0006  max mem: 28503
Epoch: [293] Total time: 0:07:22 (0.3537 s / it)
Averaged stats: lr: 0.000006  min_lr: 0.000006  loss: 2.8713 (2.6519)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2046 (1.3886)
Test:  [ 0/25]  eta: 0:01:53  loss: 0.5902 (0.5902)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 4.5285  data: 4.3231  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.7780 (0.7350)  acc1: 86.8000 (86.8364)  acc5: 98.4000 (98.0727)  time: 0.6870  data: 0.5110  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.8886 (0.8599)  acc1: 81.6000 (83.5429)  acc5: 96.8000 (96.8191)  time: 0.2384  data: 0.0650  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9347 (0.8757)  acc1: 81.2000 (83.0880)  acc5: 96.0000 (96.6240)  time: 0.1925  data: 0.0207  max mem: 28503
Test: Total time: 0:00:10 (0.4019 s / it)
* Acc@1 83.812 Acc@5 96.742 loss 0.865
Accuracy of the model on the 50000 test images: 83.8%
Max accuracy: 83.85%
Epoch: [294]  [   0/1251]  eta: 1:05:52  lr: 0.000006  min_lr: 0.000006  loss: 2.9772 (2.9772)  weight_decay: 0.0500 (0.0500)  time: 3.1596  data: 2.5782  max mem: 28503
Epoch: [294]  [ 200/1251]  eta: 0:06:25  lr: 0.000005  min_lr: 0.000005  loss: 2.7423 (2.6886)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3380 (1.4364)  time: 0.3601  data: 0.0004  max mem: 28503
Epoch: [294]  [ 400/1251]  eta: 0:05:05  lr: 0.000005  min_lr: 0.000005  loss: 2.6952 (2.6510)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2212 (1.3639)  time: 0.3486  data: 0.0004  max mem: 28503
Epoch: [294]  [ 600/1251]  eta: 0:03:51  lr: 0.000005  min_lr: 0.000005  loss: 2.6627 (2.6568)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2961 (1.3544)  time: 0.3483  data: 0.0006  max mem: 28503
Epoch: [294]  [ 800/1251]  eta: 0:02:39  lr: 0.000005  min_lr: 0.000005  loss: 2.7697 (2.6438)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2008 (1.3549)  time: 0.3466  data: 0.0004  max mem: 28503
Epoch: [294]  [1000/1251]  eta: 0:01:28  lr: 0.000004  min_lr: 0.000004  loss: 2.8784 (2.6425)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1117 (1.3289)  time: 0.3542  data: 0.0004  max mem: 28503
Epoch: [294]  [1200/1251]  eta: 0:00:17  lr: 0.000004  min_lr: 0.000004  loss: 2.8179 (2.6441)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2277 (1.3212)  time: 0.3487  data: 0.0004  max mem: 28503
Epoch: [294]  [1250/1251]  eta: 0:00:00  lr: 0.000004  min_lr: 0.000004  loss: 2.7663 (2.6446)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2874 (1.3214)  time: 0.2939  data: 0.0007  max mem: 28503
Epoch: [294] Total time: 0:07:20 (0.3520 s / it)
Averaged stats: lr: 0.000004  min_lr: 0.000004  loss: 2.7663 (2.6591)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2874 (1.3214)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.6125 (0.6125)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.6180  data: 5.4250  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8040 (0.7587)  acc1: 86.8000 (86.8000)  acc5: 98.4000 (98.0364)  time: 0.7099  data: 0.5372  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9127 (0.8832)  acc1: 81.6000 (83.4667)  acc5: 96.8000 (96.7810)  time: 0.1942  data: 0.0243  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9618 (0.8984)  acc1: 80.8000 (82.9920)  acc5: 96.0000 (96.6080)  time: 0.1938  data: 0.0242  max mem: 28503
Test: Total time: 0:00:10 (0.4111 s / it)
* Acc@1 83.760 Acc@5 96.748 loss 0.889
Accuracy of the model on the 50000 test images: 83.8%
Max accuracy: 83.85%
Epoch: [295]  [   0/1251]  eta: 1:12:02  lr: 0.000004  min_lr: 0.000004  loss: 1.7251 (1.7251)  weight_decay: 0.0500 (0.0500)  time: 3.4554  data: 2.7613  max mem: 28503
Epoch: [295]  [ 200/1251]  eta: 0:06:26  lr: 0.000004  min_lr: 0.000004  loss: 2.8451 (2.6380)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2637 (1.3495)  time: 0.3482  data: 0.0004  max mem: 28503
Epoch: [295]  [ 400/1251]  eta: 0:05:05  lr: 0.000004  min_lr: 0.000004  loss: 2.6503 (2.6390)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.0801 (1.3314)  time: 0.3480  data: 0.0004  max mem: 28503
Epoch: [295]  [ 600/1251]  eta: 0:03:52  lr: 0.000004  min_lr: 0.000004  loss: 2.7537 (2.6521)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2241 (1.3110)  time: 0.3504  data: 0.0004  max mem: 28503
Epoch: [295]  [ 800/1251]  eta: 0:02:40  lr: 0.000003  min_lr: 0.000003  loss: 2.8758 (2.6539)  weight_decay: 0.0500 (0.0500)  grad_norm: nan (nan)  time: 0.3496  data: 0.0004  max mem: 28503
Epoch: [295]  [1000/1251]  eta: 0:01:29  lr: 0.000003  min_lr: 0.000003  loss: 2.7732 (2.6593)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1810 (nan)  time: 0.3518  data: 0.0004  max mem: 28503
Epoch: [295]  [1200/1251]  eta: 0:00:18  lr: 0.000003  min_lr: 0.000003  loss: 2.8250 (2.6631)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2676 (nan)  time: 0.3586  data: 0.0004  max mem: 28503
Epoch: [295]  [1250/1251]  eta: 0:00:00  lr: 0.000003  min_lr: 0.000003  loss: 2.3834 (2.6630)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1539 (nan)  time: 0.2957  data: 0.0007  max mem: 28503
Epoch: [295] Total time: 0:07:23 (0.3543 s / it)
Averaged stats: lr: 0.000003  min_lr: 0.000003  loss: 2.3834 (2.6546)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1539 (nan)
Test:  [ 0/25]  eta: 0:02:20  loss: 0.5438 (0.5438)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.6194  data: 5.4131  max mem: 28503
Test:  [10/25]  eta: 0:00:11  loss: 0.7242 (0.6905)  acc1: 86.8000 (86.8000)  acc5: 98.0000 (98.0364)  time: 0.7517  data: 0.5775  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.8445 (0.8135)  acc1: 82.0000 (83.6381)  acc5: 96.8000 (96.9143)  time: 0.2167  data: 0.0471  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.8919 (0.8288)  acc1: 81.2000 (83.1360)  acc5: 96.4000 (96.7360)  time: 0.2162  data: 0.0468  max mem: 28503
Test: Total time: 0:00:10 (0.4287 s / it)
* Acc@1 83.830 Acc@5 96.794 loss 0.817
Accuracy of the model on the 50000 test images: 83.8%
Max accuracy: 83.85%
Epoch: [296]  [   0/1251]  eta: 1:06:31  lr: 0.000003  min_lr: 0.000003  loss: 2.4552 (2.4552)  weight_decay: 0.0500 (0.0500)  time: 3.1904  data: 2.6995  max mem: 28503
Epoch: [296]  [ 200/1251]  eta: 0:06:24  lr: 0.000003  min_lr: 0.000003  loss: 2.7749 (2.6552)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2124 (1.4922)  time: 0.3482  data: 0.0004  max mem: 28503
Epoch: [296]  [ 400/1251]  eta: 0:05:04  lr: 0.000003  min_lr: 0.000003  loss: 2.5677 (2.6357)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2641 (1.4248)  time: 0.3478  data: 0.0004  max mem: 28503
Epoch: [296]  [ 600/1251]  eta: 0:03:51  lr: 0.000003  min_lr: 0.000003  loss: 2.8879 (2.6442)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.5130 (1.4276)  time: 0.3505  data: 0.0004  max mem: 28503
Epoch: [296]  [ 800/1251]  eta: 0:02:39  lr: 0.000002  min_lr: 0.000002  loss: 2.7272 (2.6531)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3584 (1.4743)  time: 0.3489  data: 0.0004  max mem: 28503
Epoch: [296]  [1000/1251]  eta: 0:01:28  lr: 0.000002  min_lr: 0.000002  loss: 2.6481 (2.6557)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3774 (1.4603)  time: 0.3473  data: 0.0004  max mem: 28503
Epoch: [296]  [1200/1251]  eta: 0:00:18  lr: 0.000002  min_lr: 0.000002  loss: 2.6467 (2.6602)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3851 (1.4448)  time: 0.3492  data: 0.0004  max mem: 28503
Epoch: [296]  [1250/1251]  eta: 0:00:00  lr: 0.000002  min_lr: 0.000002  loss: 2.7508 (2.6585)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2846 (1.4434)  time: 0.3017  data: 0.0005  max mem: 28503
Epoch: [296] Total time: 0:07:21 (0.3526 s / it)
Averaged stats: lr: 0.000002  min_lr: 0.000002  loss: 2.7508 (2.6593)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2846 (1.4434)
Test:  [ 0/25]  eta: 0:02:19  loss: 0.5813 (0.5813)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.5950  data: 5.3965  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.7707 (0.7281)  acc1: 87.2000 (86.8727)  acc5: 98.0000 (98.0364)  time: 0.6827  data: 0.5095  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.8826 (0.8523)  acc1: 81.2000 (83.5429)  acc5: 96.8000 (96.8762)  time: 0.1865  data: 0.0169  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9287 (0.8677)  acc1: 81.2000 (83.0720)  acc5: 96.4000 (96.7200)  time: 0.1852  data: 0.0168  max mem: 28503
Test: Total time: 0:00:10 (0.4042 s / it)
* Acc@1 83.786 Acc@5 96.756 loss 0.858
Accuracy of the model on the 50000 test images: 83.8%
Max accuracy: 83.85%
Epoch: [297]  [   0/1251]  eta: 1:09:24  lr: 0.000002  min_lr: 0.000002  loss: 2.9374 (2.9374)  weight_decay: 0.0500 (0.0500)  time: 3.3293  data: 2.5328  max mem: 28503
Epoch: [297]  [ 200/1251]  eta: 0:06:25  lr: 0.000002  min_lr: 0.000002  loss: 2.8170 (2.6491)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1899 (1.3768)  time: 0.3488  data: 0.0004  max mem: 28503
Epoch: [297]  [ 400/1251]  eta: 0:05:04  lr: 0.000002  min_lr: 0.000002  loss: 2.8867 (2.6531)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1743 (1.3891)  time: 0.3487  data: 0.0004  max mem: 28503
Epoch: [297]  [ 600/1251]  eta: 0:03:51  lr: 0.000002  min_lr: 0.000002  loss: 2.7077 (2.6463)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1491 (1.3570)  time: 0.3508  data: 0.0004  max mem: 28503
Epoch: [297]  [ 800/1251]  eta: 0:02:40  lr: 0.000002  min_lr: 0.000002  loss: 2.5744 (2.6421)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1140 (1.3164)  time: 0.3604  data: 0.0004  max mem: 28503
Epoch: [297]  [1000/1251]  eta: 0:01:28  lr: 0.000002  min_lr: 0.000002  loss: 2.7084 (2.6578)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3343 (1.3204)  time: 0.3547  data: 0.0004  max mem: 28503
Epoch: [297]  [1200/1251]  eta: 0:00:18  lr: 0.000002  min_lr: 0.000002  loss: 2.6661 (2.6538)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2668 (1.3315)  time: 0.3585  data: 0.0004  max mem: 28503
Epoch: [297]  [1250/1251]  eta: 0:00:00  lr: 0.000002  min_lr: 0.000002  loss: 2.7777 (2.6540)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2355 (1.3272)  time: 0.2952  data: 0.0007  max mem: 28503
Epoch: [297] Total time: 0:07:21 (0.3532 s / it)
Averaged stats: lr: 0.000002  min_lr: 0.000002  loss: 2.7777 (2.6576)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2355 (1.3272)
Test:  [ 0/25]  eta: 0:02:22  loss: 0.6108 (0.6108)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.6812  data: 5.4653  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.8038 (0.7586)  acc1: 86.8000 (86.8727)  acc5: 98.4000 (98.0364)  time: 0.6706  data: 0.4971  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9108 (0.8817)  acc1: 81.6000 (83.5429)  acc5: 96.4000 (96.7810)  time: 0.1900  data: 0.0212  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9595 (0.8962)  acc1: 81.2000 (83.0400)  acc5: 96.0000 (96.5760)  time: 0.1895  data: 0.0211  max mem: 28503
Test: Total time: 0:00:10 (0.4104 s / it)
* Acc@1 83.790 Acc@5 96.780 loss 0.885
Accuracy of the model on the 50000 test images: 83.8%
Max accuracy: 83.85%
Epoch: [298]  [   0/1251]  eta: 1:06:21  lr: 0.000002  min_lr: 0.000002  loss: 2.6668 (2.6668)  weight_decay: 0.0500 (0.0500)  time: 3.1824  data: 2.1695  max mem: 28503
Epoch: [298]  [ 200/1251]  eta: 0:06:24  lr: 0.000001  min_lr: 0.000001  loss: 2.8842 (2.6910)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2895 (1.3611)  time: 0.3471  data: 0.0004  max mem: 28503
Epoch: [298]  [ 400/1251]  eta: 0:05:05  lr: 0.000001  min_lr: 0.000001  loss: 2.7330 (2.6669)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1982 (1.4127)  time: 0.3511  data: 0.0004  max mem: 28503
Epoch: [298]  [ 600/1251]  eta: 0:03:52  lr: 0.000001  min_lr: 0.000001  loss: 2.6931 (2.6411)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2311 (1.4117)  time: 0.3492  data: 0.0004  max mem: 28503
Epoch: [298]  [ 800/1251]  eta: 0:02:39  lr: 0.000001  min_lr: 0.000001  loss: 2.6920 (2.6462)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3007 (1.4328)  time: 0.3477  data: 0.0004  max mem: 28503
Epoch: [298]  [1000/1251]  eta: 0:01:28  lr: 0.000001  min_lr: 0.000001  loss: 2.7761 (2.6447)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3592 (1.4193)  time: 0.3481  data: 0.0004  max mem: 28503
Epoch: [298]  [1200/1251]  eta: 0:00:18  lr: 0.000001  min_lr: 0.000001  loss: 2.7146 (2.6513)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3183 (1.4136)  time: 0.3491  data: 0.0004  max mem: 28503
Epoch: [298]  [1250/1251]  eta: 0:00:00  lr: 0.000001  min_lr: 0.000001  loss: 2.6614 (2.6540)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2894 (1.4093)  time: 0.2947  data: 0.0005  max mem: 28503
Epoch: [298] Total time: 0:07:21 (0.3531 s / it)
Averaged stats: lr: 0.000001  min_lr: 0.000001  loss: 2.6614 (2.6580)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2894 (1.4093)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6123 (0.6123)  acc1: 89.6000 (89.6000)  acc5: 99.2000 (99.2000)  time: 5.5588  data: 5.3631  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.7971 (0.7539)  acc1: 86.4000 (86.5455)  acc5: 98.0000 (98.0000)  time: 0.6767  data: 0.5047  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9171 (0.8772)  acc1: 81.6000 (83.4476)  acc5: 96.8000 (96.8381)  time: 0.1976  data: 0.0286  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9531 (0.8928)  acc1: 80.8000 (82.9280)  acc5: 96.4000 (96.6560)  time: 0.1974  data: 0.0285  max mem: 28503
Test: Total time: 0:00:10 (0.4109 s / it)
* Acc@1 83.726 Acc@5 96.712 loss 0.883
Accuracy of the model on the 50000 test images: 83.7%
Max accuracy: 83.85%
Epoch: [299]  [   0/1251]  eta: 1:08:14  lr: 0.000001  min_lr: 0.000001  loss: 2.9597 (2.9597)  weight_decay: 0.0500 (0.0500)  time: 3.2728  data: 2.2742  max mem: 28503
Epoch: [299]  [ 200/1251]  eta: 0:06:24  lr: 0.000001  min_lr: 0.000001  loss: 2.6956 (2.6429)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2665 (1.3502)  time: 0.3487  data: 0.0004  max mem: 28503
Epoch: [299]  [ 400/1251]  eta: 0:05:04  lr: 0.000001  min_lr: 0.000001  loss: 2.7662 (2.6306)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2818 (1.3854)  time: 0.3482  data: 0.0004  max mem: 28503
Epoch: [299]  [ 600/1251]  eta: 0:03:51  lr: 0.000001  min_lr: 0.000001  loss: 2.7116 (2.6248)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1922 (1.3766)  time: 0.3483  data: 0.0004  max mem: 28503
Epoch: [299]  [ 800/1251]  eta: 0:02:39  lr: 0.000001  min_lr: 0.000001  loss: 2.5991 (2.6352)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3534 (1.4400)  time: 0.3478  data: 0.0005  max mem: 28503
Epoch: [299]  [1000/1251]  eta: 0:01:28  lr: 0.000001  min_lr: 0.000001  loss: 2.8152 (2.6483)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.1518 (1.4279)  time: 0.3533  data: 0.0005  max mem: 28503
Epoch: [299]  [1200/1251]  eta: 0:00:17  lr: 0.000001  min_lr: 0.000001  loss: 2.6156 (2.6474)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.3776 (1.4273)  time: 0.3522  data: 0.0005  max mem: 28503
Epoch: [299]  [1250/1251]  eta: 0:00:00  lr: 0.000001  min_lr: 0.000001  loss: 2.7802 (2.6479)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2734 (1.4304)  time: 0.2921  data: 0.0007  max mem: 28503
Epoch: [299] Total time: 0:07:20 (0.3521 s / it)
Averaged stats: lr: 0.000001  min_lr: 0.000001  loss: 2.7802 (2.6530)  weight_decay: 0.0500 (0.0500)  grad_norm: 1.2734 (1.4304)
Test:  [ 0/25]  eta: 0:02:18  loss: 0.6064 (0.6064)  acc1: 90.0000 (90.0000)  acc5: 99.2000 (99.2000)  time: 5.5458  data: 5.3515  max mem: 28503
Test:  [10/25]  eta: 0:00:10  loss: 0.7952 (0.7520)  acc1: 86.8000 (86.8000)  acc5: 98.0000 (98.0000)  time: 0.6773  data: 0.5058  max mem: 28503
Test:  [20/25]  eta: 0:00:02  loss: 0.9152 (0.8766)  acc1: 81.6000 (83.4857)  acc5: 96.8000 (96.8000)  time: 0.1855  data: 0.0168  max mem: 28503
Test:  [24/25]  eta: 0:00:00  loss: 0.9513 (0.8920)  acc1: 81.2000 (83.0080)  acc5: 96.4000 (96.6720)  time: 0.1850  data: 0.0168  max mem: 28503
Test: Total time: 0:00:10 (0.4024 s / it)
* Acc@1 83.752 Acc@5 96.736 loss 0.881
Accuracy of the model on the 50000 test images: 83.8%
Max accuracy: 83.85%
Training time 1 day, 13:30:35
